diff --git a/rvc/onnx/__init__.py b/rvc/onnx/__init__.py new file mode 100644 index 0000000..eeed4d8 --- /dev/null +++ b/rvc/onnx/__init__.py @@ -0,0 +1 @@ +from .infer import RVC diff --git a/rvc/onnx/infer.py b/rvc/onnx/infer.py index edcaa4e..efa0b18 100644 --- a/rvc/onnx/infer.py +++ b/rvc/onnx/infer.py @@ -38,9 +38,9 @@ class ContentVec(Model): super().__init__(vec_path, device) def __call__(self, wav: np.ndarray[typing.Any, np.dtype]): - return self.forward(wav) + return self.__forward(wav) - def forward(self, wav: np.ndarray[typing.Any, np.dtype]): + def __forward(self, wav: np.ndarray[typing.Any, np.dtype]): if wav.ndim == 2: # double channels wav = wav.mean(-1) assert wav.ndim == 1, wav.ndim @@ -67,21 +67,20 @@ class RVC(Model): def __init__( self, model_path: str | bytes | os.PathLike, - sr=40000, - hop_size=512, + hop_len=512, vec_path: str | bytes | os.PathLike = "vec-768-layer-12.onnx", device: typing.Literal["cpu", "cuda", "dml"] = "cpu", ): super().__init__(model_path, device) self.vec_model = ContentVec(vec_path, device) - self.sampling_rate = sr - self.hop_size = hop_size + self.hop_len = hop_len def inference( self, wav: np.ndarray[typing.Any, np.dtype], - sr: int, - sid: int, + wav_sr: int, + model_sr: int = 40000, + sid: int = 0, f0_method="dio", f0_up_key=0, ) -> np.ndarray[typing.Any, np.dtype[np.int16]]: @@ -91,17 +90,14 @@ class RVC(Model): f0_mel_max = 1127 * np.log(1 + f0_max / 700) f0_predictor = get_f0_predictor( f0_method, - self.hop_size, - self.sampling_rate, + self.hop_len, + model_sr, ) org_length = len(wav) - if org_length / sr > 50.0: - raise RuntimeError("Reached Max Length") + if org_length / wav_sr > 50.0: + raise RuntimeError("wav max length exceeded") - wav16k = librosa.resample(wav, orig_sr=sr, target_sr=16000) - wav16k = wav16k - - hubert = self.vec_model(wav16k) + hubert = self.vec_model(librosa.resample(wav, orig_sr=wav_sr, target_sr=16000)) hubert = np.repeat(hubert, 2, axis=2).transpose(0, 2, 1).astype(np.float32) hubert_length = hubert.shape[1] @@ -126,7 +122,9 @@ class RVC(Model): out_wav = self.__forward( hubert, hubert_length, pitch, pitchf, ds, rnd ).squeeze() - out_wav = np.pad(out_wav, (0, 2 * self.hop_size), "constant") + + out_wav = np.pad(out_wav, (0, 2 * self.hop_len), "constant") + return out_wav[0:org_length] def __forward( diff --git a/tools/onnx/export_onnx.py b/tools/onnx/export.py similarity index 100% rename from tools/onnx/export_onnx.py rename to tools/onnx/export.py diff --git a/tools/onnx/onnx_inference_demo.py b/tools/onnx/infer.py similarity index 80% rename from tools/onnx/onnx_inference_demo.py rename to tools/onnx/infer.py index a6bf3db..bf23b5c 100644 --- a/tools/onnx/onnx_inference_demo.py +++ b/tools/onnx/infer.py @@ -1,7 +1,7 @@ import soundfile import librosa -from rvc.onnx.infer import RVC +from rvc.onnx import RVC hop_size = 512 sampling_rate = 40000 # 采样率 @@ -14,7 +14,7 @@ wav_path = "123.wav" # 输入路径或ByteIO实例 out_path = "out.wav" # 输出路径或ByteIO实例 model = RVC( - model_path, vec_path=vec_path, sr=sampling_rate, hop_size=hop_size, device="cuda" + model_path, vec_path=vec_path, sr=sampling_rate, hop_len=hop_size, device="cuda" ) wav, sr = librosa.load(wav_path, sr=sampling_rate)