optimize(uvr5): apply jit to spec_utils & fix flac save

also fix #85
2026-06-05 17:20:25 +08:00 · 2024-11-28 23:19:05 +09:00
parent 4582d4b49a
commit 5969314e8d
11 changed files with 104 additions and 581 deletions
--- a/infer/modules/uvr5/mdxnet.py
+++ b/infer/modules/uvr5/mdxnet.py
@@ -3,12 +3,11 @@ import logging

 logger = logging.getLogger(__name__)

-import librosa
 import numpy as np
 import torch
 from tqdm import tqdm

-from infer.lib.audio import downsample_audio, save_audio
+from infer.lib.audio import load_audio, save_audio

 cpu = torch.device("cpu")

@@ -201,29 +200,18 @@ class Predictor:
        os.makedirs(vocal_root, exist_ok=True)
        os.makedirs(others_root, exist_ok=True)
        basename = os.path.basename(m)
-        mix, rate = librosa.load(m, mono=False, sr=44100)
+        mix, rate = load_audio(m, mono=False, sr=44100)
        if mix.ndim == 1:
            mix = np.asfortranarray([mix, mix])
        mix = mix.T
        sources = self.demix(mix.T)
        opt = sources[0].T
-        if format in ["wav", "flac"]:
-            save_audio(
-                "%s/vocal_%s.%s" % (vocal_root, basename, format), mix - opt, rate
-            )
-            save_audio(
-                "%s/instrument_%s.%s" % (others_root, basename, format), opt, rate
-            )
-        else:
-            path_vocal = "%s/vocal_%s.wav" % (vocal_root, basename)
-            path_other = "%s/instrument_%s.wav" % (others_root, basename)
-            save_audio(path_vocal, opt, rate)
-            save_audio(path_other, opt, rate)
-            opt_path_vocal = path_vocal[:-4] + ".%s" % format
-            opt_path_other = path_other[:-4] + ".%s" % format
-            downsample_audio(path_vocal, opt_path_vocal, format)
-            downsample_audio(path_other, opt_path_other, format)
-
+        save_audio(
+            "%s/vocal_%s.%s" % (vocal_root, basename, format), mix - opt, rate, True, format=format,
+        )
+        save_audio(
+            "%s/instrument_%s.%s" % (others_root, basename, format), opt, rate, True, format=format,
+        )

 class MDXNetDereverb:
    def __init__(self, chunks, device):
--- a/infer/modules/uvr5/modules.py
+++ b/infer/modules/uvr5/modules.py
@@ -55,13 +55,17 @@ def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format
                    done = 1
            except Exception as e:
                need_reformat = 1
-                print(f"Exception {e} occured. Will reformat")
+                logger.warning(f"Exception {e} occured. Will reformat")
            if need_reformat == 1:
                tmp_path = "%s/%s.reformatted.wav" % (
                    os.path.join(os.environ["TEMP"]),
                    os.path.basename(inp_path),
                )
                resample_audio(inp_path, tmp_path, "pcm_s16le", "s16", 44100, "stereo")
+                try:  # Remove the original file
+                    os.remove(inp_path)
+                except Exception as e:
+                    print(f"Failed to remove the original file: {e}")
                inp_path = tmp_path
            try:
                if done == 0:
--- a/infer/modules/uvr5/vr.py
+++ b/infer/modules/uvr5/vr.py
@@ -5,7 +5,7 @@ logger = logging.getLogger(__name__)

 import librosa
 import numpy as np
-from infer.lib.audio import downsample_audio, save_audio
+from infer.lib.audio import save_audio
 import torch

 from infer.lib.uvr5_pack.lib_v5 import nets_123821KB as Nets
@@ -119,7 +119,7 @@ class AudioPre:
        if ins_root is not None:
            if self.data["high_end_process"].startswith("mirroring"):
                input_high_end_ = spec_utils.mirroring(
-                    self.data["high_end_process"], y_spec_m, input_high_end, self.mp
+                    self.data["high_end_process"], y_spec_m, input_high_end, self.mp.param["pre_filter_start"]
                )
                wav_instrument = spec_utils.cmb_spectrogram_to_wave(
                    y_spec_m, self.mp, input_high_end_h, input_high_end_
@@ -131,23 +131,16 @@ class AudioPre:
                head = "vocal_"
            else:
                head = "instrument_"
-            if format in ["wav", "flac"]:
-                save_audio(
-                    os.path.join(
-                        ins_root,
-                        head + "{}_{}.{}".format(name, self.data["agg"], format),
-                    ),
-                    wav_instrument,
-                    self.mp.param["sr"],
-                )
-            else:
-                path = os.path.join(
-                    ins_root, head + "{}_{}.wav".format(name, self.data["agg"])
-                )
-                save_audio(path, wav_instrument, self.mp.param["sr"])
-                if os.path.exists(path):
-                    opt_format_path = path[:-4] + ".%s" % format
-                    downsample_audio(path, opt_format_path, format)
+            save_audio(
+                os.path.join(
+                    ins_root,
+                    head + "{}_{}.{}".format(name, self.data["agg"], format),
+                ),
+                wav_instrument,
+                self.mp.param["sr"],
+                f32=True,
+                format=format
+            )
        if vocal_root is not None:
            if self.is_reverse:
                head = "instrument_"
@@ -155,7 +148,7 @@ class AudioPre:
                head = "vocal_"
            if self.data["high_end_process"].startswith("mirroring"):
                input_high_end_ = spec_utils.mirroring(
-                    self.data["high_end_process"], v_spec_m, input_high_end, self.mp
+                    self.data["high_end_process"], v_spec_m, input_high_end, self.mp.param["pre_filter_start"]
                )
                wav_vocals = spec_utils.cmb_spectrogram_to_wave(
                    v_spec_m, self.mp, input_high_end_h, input_high_end_
@@ -163,20 +156,13 @@ class AudioPre:
            else:
                wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp)
            logger.info("%s vocals done" % name)
-            if format in ["wav", "flac"]:
-                save_audio(
-                    os.path.join(
-                        vocal_root,
-                        head + "{}_{}.{}".format(name, self.data["agg"], format),
-                    ),
-                    wav_vocals,
-                    self.mp.param["sr"],
-                )
-            else:
-                path = os.path.join(
-                    vocal_root, head + "{}_{}.wav".format(name, self.data["agg"])
-                )
-                save_audio(path, wav_vocals, self.mp.param["sr"])
-                if os.path.exists(path):
-                    opt_format_path = path[:-4] + ".%s" % format
-                    downsample_audio(path, opt_format_path, format)
+            save_audio(
+                os.path.join(
+                    vocal_root,
+                    head + "{}_{}.{}".format(name, self.data["agg"], format),
+                ),
+                wav_vocals,
+                self.mp.param["sr"],
+                f32=True,
+                format=format
+            )
--- a/infer/modules/vc/modules.py
+++ b/infer/modules/vc/modules.py
@@ -251,25 +251,13 @@ class VC:
                if "Success" in info:
                    try:
                        tgt_sr, audio_opt = opt
-                        if format1 in ["wav", "flac"]:
-                            save_audio(
-                                "%s/%s.%s"
-                                % (opt_root, os.path.basename(path), format1),
-                                audio_opt,
-                                tgt_sr,
-                            )
-                        else:
-                            path = "%s/%s.%s" % (
-                                opt_root,
-                                os.path.basename(path),
-                                format1,
-                            )
-                            with open(path, "wb") as outf:
-                                wav2(
-                                    float_np_array_to_wav_buf(audio_opt, tgt_sr),
-                                    outf,
-                                    format1,
-                                )
+                        save_audio(
+                            "%s/%s.%s"
+                            % (opt_root, os.path.basename(path), format1),
+                            audio_opt,
+                            tgt_sr,
+                            f32=True,
+                        )
                    except:
                        info += traceback.format_exc()
                infos.append("%s->%s" % (os.path.basename(path), info))