diff --git a/infer/lib/audio.py b/infer/lib/audio.py index f55e47a..f4f1547 100644 --- a/infer/lib/audio.py +++ b/infer/lib/audio.py @@ -28,6 +28,7 @@ def float_to_int16(audio: np.ndarray) -> np.ndarray: am = 32767 * 32768 // am return np.multiply(audio, am).astype(np.int16) + def float_np_array_to_wav_buf(wav: np.ndarray, sr: int, f32=False) -> BytesIO: buf = BytesIO() if f32: @@ -41,10 +42,12 @@ def float_np_array_to_wav_buf(wav: np.ndarray, sr: int, f32=False) -> BytesIO: buf.seek(0, 0) return buf + def save_audio(path: str, audio: np.ndarray, sr: int, f32=False): with open(path, "wb") as f: f.write(float_np_array_to_wav_buf(audio, sr, f32).getbuffer()) + def wav2(i: BytesIO, o: BufferedWriter, format: str): inp = av.open(i, "r") format = video_format_dict.get(format, format) @@ -65,12 +68,14 @@ def wav2(i: BytesIO, o: BufferedWriter, format: str): def load_audio( - file: Union[str, BytesIO, Path], - sr: Optional[int]=None, - format: Optional[str]=None, - mono=True - ) -> Union[np.ndarray, Tuple[np.ndarray, int]]: - if (isinstance(file, str) and not Path(file).exists()) or (isinstance(file, Path) and not file.exists()): + file: Union[str, BytesIO, Path], + sr: Optional[int] = None, + format: Optional[str] = None, + mono=True, +) -> Union[np.ndarray, Tuple[np.ndarray, int]]: + if (isinstance(file, str) and not Path(file).exists()) or ( + isinstance(file, Path) and not file.exists() + ): raise FileNotFoundError(f"File not found: {file}") rate = 0 @@ -78,12 +83,25 @@ def load_audio( audio_stream = next(s for s in container.streams if s.type == "audio") channels = 1 if audio_stream.layout == "mono" else 2 container.seek(0) - resampler = AudioResampler(format="fltp", layout=audio_stream.layout, rate=sr) if sr is not None else None + resampler = ( + AudioResampler(format="fltp", layout=audio_stream.layout, rate=sr) + if sr is not None + else None + ) # Estimated maximum total number of samples to pre-allocate the array # AV stores length in microseconds by default - estimated_total_samples = int(container.duration * sr // 1_000_000) if sr is not None else 48000 - decoded_audio = np.zeros(estimated_total_samples + 1 if channels == 1 else (channels, estimated_total_samples + 1), dtype=np.float32) + estimated_total_samples = ( + int(container.duration * sr // 1_000_000) if sr is not None else 48000 + ) + decoded_audio = np.zeros( + ( + estimated_total_samples + 1 + if channels == 1 + else (channels, estimated_total_samples + 1) + ), + dtype=np.float32, + ) offset = 0 @@ -92,7 +110,9 @@ def load_audio( rate = 0 for frame in packet: frame.pts = None # 清除时间戳,避免重新采样问题 - resampled_frames = resampler.resample(frame) if resampler is not None else [frame] + resampled_frames = ( + resampler.resample(frame) if resampler is not None else [frame] + ) for resampled_frame in resampled_frames: frame_data = resampled_frame.to_ndarray() rate = resampled_frame.rate @@ -104,13 +124,16 @@ def load_audio( yield p.decode() for r, frames_data in map(process_packet, frame_iter(container)): - if not rate: rate = r + if not rate: + rate = r for frame_data in frames_data: end_index = offset + len(frame_data[0]) # 检查 decoded_audio 是否有足够的空间,并在必要时调整大小 if end_index > decoded_audio.shape[1]: - decoded_audio = np.resize(decoded_audio, (decoded_audio.shape[0], end_index*4)) + decoded_audio = np.resize( + decoded_audio, (decoded_audio.shape[0], end_index * 4) + ) np.copyto(decoded_audio[..., offset:end_index], frame_data) offset += len(frame_data[0]) @@ -126,7 +149,9 @@ def load_audio( return decoded_audio, rate -def downsample_audio(input_path: str, output_path: str, format: str, br=128_000) -> None: +def downsample_audio( + input_path: str, output_path: str, format: str, br=128_000 +) -> None: """ default to 128kb/s (equivalent to -q:a 2) """ diff --git a/infer/lib/slicer2.py b/infer/lib/slicer2.py index ba751cd..f47bbdd 100644 --- a/infer/lib/slicer2.py +++ b/infer/lib/slicer2.py @@ -244,11 +244,15 @@ def main(): for i, chunk in enumerate(chunks): if len(chunk.shape) > 1: chunk = chunk.T - save_audio(os.path.join( - out, - f"%s_%d.wav" - % (os.path.basename(args.audio).rsplit(".", maxsplit=1)[0], i), - ), chunk, sr) + save_audio( + os.path.join( + out, + f"%s_%d.wav" + % (os.path.basename(args.audio).rsplit(".", maxsplit=1)[0], i), + ), + chunk, + sr, + ) if __name__ == "__main__": diff --git a/infer/modules/train/preprocess.py b/infer/modules/train/preprocess.py index 7fa374e..86625c9 100644 --- a/infer/modules/train/preprocess.py +++ b/infer/modules/train/preprocess.py @@ -62,15 +62,24 @@ class PreProcess: tmp_audio = (tmp_audio / tmp_max * (self.max * self.alpha)) + ( 1 - self.alpha ) * tmp_audio - save_audio("%s/%s_%s.wav" % (self.gt_wavs_dir, idx0, idx1), tmp_audio, self.sr, f32=True) + save_audio( + "%s/%s_%s.wav" % (self.gt_wavs_dir, idx0, idx1), + tmp_audio, + self.sr, + f32=True, + ) with open("%s/%s_%s.wav" % (self.wavs16k_dir, idx0, idx1), "wb") as f: - f.write(float_np_array_to_wav_buf( - load_audio( - float_np_array_to_wav_buf(tmp_audio, self.sr, f32=True), - sr=16000, - format="wav", - ) - , 16000, True).getbuffer()) + f.write( + float_np_array_to_wav_buf( + load_audio( + float_np_array_to_wav_buf(tmp_audio, self.sr, f32=True), + sr=16000, + format="wav", + ), + 16000, + True, + ).getbuffer() + ) def pipeline(self, path, idx0): try: diff --git a/infer/modules/train/train.py b/infer/modules/train/train.py index bb6a4c0..e8f7156 100644 --- a/infer/modules/train/train.py +++ b/infer/modules/train/train.py @@ -133,11 +133,21 @@ def run(rank, n_gpus, hps: utils.HParams, logger: logging.Logger): try: dist.init_process_group( - backend="gloo" if os.name == "nt" or not torch.cuda.is_available() else "nccl", init_method="env://", world_size=n_gpus, rank=rank + backend=( + "gloo" if os.name == "nt" or not torch.cuda.is_available() else "nccl" + ), + init_method="env://", + world_size=n_gpus, + rank=rank, ) except: dist.init_process_group( - backend="gloo" if os.name == "nt" or not torch.cuda.is_available() else "nccl", init_method="env://?use_libuv=False", world_size=n_gpus, rank=rank + backend=( + "gloo" if os.name == "nt" or not torch.cuda.is_available() else "nccl" + ), + init_method="env://?use_libuv=False", + world_size=n_gpus, + rank=rank, ) torch.manual_seed(hps.train.seed) if torch.cuda.is_available(): @@ -243,13 +253,17 @@ def run(rank, n_gpus, hps: utils.HParams, logger: logging.Logger): if hasattr(net_g, "module"): logger.info( net_g.module.load_state_dict( - torch.load(hps.pretrainG, map_location="cpu", weights_only=True)["model"] + torch.load( + hps.pretrainG, map_location="cpu", weights_only=True + )["model"] ) ) ##测试不加载优化器 else: logger.info( net_g.load_state_dict( - torch.load(hps.pretrainG, map_location="cpu", weights_only=True)["model"] + torch.load( + hps.pretrainG, map_location="cpu", weights_only=True + )["model"] ) ) ##测试不加载优化器 if hps.pretrainD != "": @@ -258,13 +272,17 @@ def run(rank, n_gpus, hps: utils.HParams, logger: logging.Logger): if hasattr(net_d, "module"): logger.info( net_d.module.load_state_dict( - torch.load(hps.pretrainD, map_location="cpu", weights_only=True)["model"] + torch.load( + hps.pretrainD, map_location="cpu", weights_only=True + )["model"] ) ) else: logger.info( net_d.load_state_dict( - torch.load(hps.pretrainD, map_location="cpu", weights_only=True)["model"] + torch.load( + hps.pretrainD, map_location="cpu", weights_only=True + )["model"] ) ) diff --git a/infer/modules/uvr5/mdxnet.py b/infer/modules/uvr5/mdxnet.py index cbd5019..b37b431 100644 --- a/infer/modules/uvr5/mdxnet.py +++ b/infer/modules/uvr5/mdxnet.py @@ -208,8 +208,12 @@ class Predictor: sources = self.demix(mix.T) opt = sources[0].T if format in ["wav", "flac"]: - save_audio("%s/vocal_%s.%s" % (vocal_root, basename, format), mix - opt, rate) - save_audio("%s/instrument_%s.%s" % (others_root, basename, format), opt, rate) + save_audio( + "%s/vocal_%s.%s" % (vocal_root, basename, format), mix - opt, rate + ) + save_audio( + "%s/instrument_%s.%s" % (others_root, basename, format), opt, rate + ) else: path_vocal = "%s/vocal_%s.wav" % (vocal_root, basename) path_other = "%s/instrument_%s.wav" % (others_root, basename) diff --git a/infer/modules/uvr5/vr.py b/infer/modules/uvr5/vr.py index 9156933..264bb41 100644 --- a/infer/modules/uvr5/vr.py +++ b/infer/modules/uvr5/vr.py @@ -48,9 +48,7 @@ class AudioPre: self.mp = mp self.model = model - def _path_audio_( - self, music_file, ins_root=None, vocal_root=None, format="flac" - ): + def _path_audio_(self, music_file, ins_root=None, vocal_root=None, format="flac"): if ins_root is None and vocal_root is None: return "No save root." name = os.path.basename(music_file) @@ -134,10 +132,14 @@ class AudioPre: else: head = "instrument_" if format in ["wav", "flac"]: - save_audio(os.path.join( + save_audio( + os.path.join( ins_root, head + "{}_{}.{}".format(name, self.data["agg"], format), - ), wav_instrument, self.mp.param["sr"]) + ), + wav_instrument, + self.mp.param["sr"], + ) else: path = os.path.join( ins_root, head + "{}_{}.wav".format(name, self.data["agg"]) @@ -162,10 +164,14 @@ class AudioPre: wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp) logger.info("%s vocals done" % name) if format in ["wav", "flac"]: - save_audio(os.path.join( + save_audio( + os.path.join( vocal_root, head + "{}_{}.{}".format(name, self.data["agg"], format), - ), wav_vocals, self.mp.param["sr"]) + ), + wav_vocals, + self.mp.param["sr"], + ) else: path = os.path.join( vocal_root, head + "{}_{}.wav".format(name, self.data["agg"]) diff --git a/infer/modules/vc/modules.py b/infer/modules/vc/modules.py index 3e526a0..1bfdf2d 100644 --- a/infer/modules/vc/modules.py +++ b/infer/modules/vc/modules.py @@ -252,8 +252,12 @@ class VC: try: tgt_sr, audio_opt = opt if format1 in ["wav", "flac"]: - save_audio("%s/%s.%s" - % (opt_root, os.path.basename(path), format1), audio_opt, tgt_sr) + save_audio( + "%s/%s.%s" + % (opt_root, os.path.basename(path), format1), + audio_opt, + tgt_sr, + ) else: path = "%s/%s.%s" % ( opt_root, @@ -261,7 +265,11 @@ class VC: format1, ) with open(path, "wb") as outf: - wav2(float_np_array_to_wav_buf(audio_opt, tgt_sr), outf, format1) + wav2( + float_np_array_to_wav_buf(audio_opt, tgt_sr), + outf, + format1, + ) except: info += traceback.format_exc() infos.append("%s->%s" % (os.path.basename(path), info)) diff --git a/rvc/layers/generators.py b/rvc/layers/generators.py index 97fdb46..185f939 100644 --- a/rvc/layers/generators.py +++ b/rvc/layers/generators.py @@ -166,9 +166,11 @@ class SineGenerator(torch.nn.Module): rad = f0 / self.sampling_rate * a rad2 = torch.fmod(rad[:, :-1, -1:].float() + 0.5, 1.0) - 0.5 rad_acc = rad2.cumsum(dim=1).fmod(1.0).to(f0) - rad += F.pad(rad_acc, (0, 0, 1, 0), mode='constant') + rad += F.pad(rad_acc, (0, 0, 1, 0), mode="constant") rad = rad.reshape(f0.shape[0], -1, 1) - b = torch.arange(1, self.dim + 1, dtype=f0.dtype, device=f0.device).reshape(1, 1, -1) + b = torch.arange(1, self.dim + 1, dtype=f0.dtype, device=f0.device).reshape( + 1, 1, -1 + ) rad *= b rand_ini = torch.rand(1, 1, self.dim, device=f0.device) rand_ini[..., 0] = 0 diff --git a/tools/cmd/onnx/infer.py b/tools/cmd/onnx/infer.py index d9c4311..8526084 100644 --- a/tools/cmd/onnx/infer.py +++ b/tools/cmd/onnx/infer.py @@ -20,4 +20,4 @@ wav, sr = librosa.load(wav_path, sr=sampling_rate) audio = model.infer(wav, sr, sampling_rate, sid, f0_method, f0_up_key) -save_audio(out_path, audio, sampling_rate) \ No newline at end of file +save_audio(out_path, audio, sampling_rate) diff --git a/web.py b/web.py index 6d3531c..0e3c3c7 100644 --- a/web.py +++ b/web.py @@ -144,12 +144,14 @@ outside_index_root = os.getenv("outside_index_root") names = [""] index_paths = [""] + def lookup_names(weight_root): global names for name in os.listdir(weight_root): if name.endswith(".pth"): names.append(name) + def lookup_indices(index_root): global index_paths for root, _, files in os.walk(index_root, topdown=False): @@ -157,6 +159,7 @@ def lookup_indices(index_root): if name.endswith(".index") and "trained" not in name: index_paths.append(str(pathlib.Path(root, name))) + lookup_names(weight_root) lookup_indices(index_root) lookup_indices(outside_index_root)