mirror of
https://github.com/fumiama/Retrieval-based-Voice-Conversion-WebUI.git
synced 2026-06-08 12:00:49 +08:00
optimize(rvc.onnx): more modelize
This commit is contained in:
1
rvc/onnx/__init__.py
Normal file
1
rvc/onnx/__init__.py
Normal file
@@ -0,0 +1 @@
|
|||||||
|
from .infer import RVC
|
||||||
@@ -38,9 +38,9 @@ class ContentVec(Model):
|
|||||||
super().__init__(vec_path, device)
|
super().__init__(vec_path, device)
|
||||||
|
|
||||||
def __call__(self, wav: np.ndarray[typing.Any, np.dtype]):
|
def __call__(self, wav: np.ndarray[typing.Any, np.dtype]):
|
||||||
return self.forward(wav)
|
return self.__forward(wav)
|
||||||
|
|
||||||
def forward(self, wav: np.ndarray[typing.Any, np.dtype]):
|
def __forward(self, wav: np.ndarray[typing.Any, np.dtype]):
|
||||||
if wav.ndim == 2: # double channels
|
if wav.ndim == 2: # double channels
|
||||||
wav = wav.mean(-1)
|
wav = wav.mean(-1)
|
||||||
assert wav.ndim == 1, wav.ndim
|
assert wav.ndim == 1, wav.ndim
|
||||||
@@ -67,21 +67,20 @@ class RVC(Model):
|
|||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
model_path: str | bytes | os.PathLike,
|
model_path: str | bytes | os.PathLike,
|
||||||
sr=40000,
|
hop_len=512,
|
||||||
hop_size=512,
|
|
||||||
vec_path: str | bytes | os.PathLike = "vec-768-layer-12.onnx",
|
vec_path: str | bytes | os.PathLike = "vec-768-layer-12.onnx",
|
||||||
device: typing.Literal["cpu", "cuda", "dml"] = "cpu",
|
device: typing.Literal["cpu", "cuda", "dml"] = "cpu",
|
||||||
):
|
):
|
||||||
super().__init__(model_path, device)
|
super().__init__(model_path, device)
|
||||||
self.vec_model = ContentVec(vec_path, device)
|
self.vec_model = ContentVec(vec_path, device)
|
||||||
self.sampling_rate = sr
|
self.hop_len = hop_len
|
||||||
self.hop_size = hop_size
|
|
||||||
|
|
||||||
def inference(
|
def inference(
|
||||||
self,
|
self,
|
||||||
wav: np.ndarray[typing.Any, np.dtype],
|
wav: np.ndarray[typing.Any, np.dtype],
|
||||||
sr: int,
|
wav_sr: int,
|
||||||
sid: int,
|
model_sr: int = 40000,
|
||||||
|
sid: int = 0,
|
||||||
f0_method="dio",
|
f0_method="dio",
|
||||||
f0_up_key=0,
|
f0_up_key=0,
|
||||||
) -> np.ndarray[typing.Any, np.dtype[np.int16]]:
|
) -> np.ndarray[typing.Any, np.dtype[np.int16]]:
|
||||||
@@ -91,17 +90,14 @@ class RVC(Model):
|
|||||||
f0_mel_max = 1127 * np.log(1 + f0_max / 700)
|
f0_mel_max = 1127 * np.log(1 + f0_max / 700)
|
||||||
f0_predictor = get_f0_predictor(
|
f0_predictor = get_f0_predictor(
|
||||||
f0_method,
|
f0_method,
|
||||||
self.hop_size,
|
self.hop_len,
|
||||||
self.sampling_rate,
|
model_sr,
|
||||||
)
|
)
|
||||||
org_length = len(wav)
|
org_length = len(wav)
|
||||||
if org_length / sr > 50.0:
|
if org_length / wav_sr > 50.0:
|
||||||
raise RuntimeError("Reached Max Length")
|
raise RuntimeError("wav max length exceeded")
|
||||||
|
|
||||||
wav16k = librosa.resample(wav, orig_sr=sr, target_sr=16000)
|
hubert = self.vec_model(librosa.resample(wav, orig_sr=wav_sr, target_sr=16000))
|
||||||
wav16k = wav16k
|
|
||||||
|
|
||||||
hubert = self.vec_model(wav16k)
|
|
||||||
hubert = np.repeat(hubert, 2, axis=2).transpose(0, 2, 1).astype(np.float32)
|
hubert = np.repeat(hubert, 2, axis=2).transpose(0, 2, 1).astype(np.float32)
|
||||||
hubert_length = hubert.shape[1]
|
hubert_length = hubert.shape[1]
|
||||||
|
|
||||||
@@ -126,7 +122,9 @@ class RVC(Model):
|
|||||||
out_wav = self.__forward(
|
out_wav = self.__forward(
|
||||||
hubert, hubert_length, pitch, pitchf, ds, rnd
|
hubert, hubert_length, pitch, pitchf, ds, rnd
|
||||||
).squeeze()
|
).squeeze()
|
||||||
out_wav = np.pad(out_wav, (0, 2 * self.hop_size), "constant")
|
|
||||||
|
out_wav = np.pad(out_wav, (0, 2 * self.hop_len), "constant")
|
||||||
|
|
||||||
return out_wav[0:org_length]
|
return out_wav[0:org_length]
|
||||||
|
|
||||||
def __forward(
|
def __forward(
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
import soundfile
|
import soundfile
|
||||||
import librosa
|
import librosa
|
||||||
|
|
||||||
from rvc.onnx.infer import RVC
|
from rvc.onnx import RVC
|
||||||
|
|
||||||
hop_size = 512
|
hop_size = 512
|
||||||
sampling_rate = 40000 # 采样率
|
sampling_rate = 40000 # 采样率
|
||||||
@@ -14,7 +14,7 @@ wav_path = "123.wav" # 输入路径或ByteIO实例
|
|||||||
out_path = "out.wav" # 输出路径或ByteIO实例
|
out_path = "out.wav" # 输出路径或ByteIO实例
|
||||||
|
|
||||||
model = RVC(
|
model = RVC(
|
||||||
model_path, vec_path=vec_path, sr=sampling_rate, hop_size=hop_size, device="cuda"
|
model_path, vec_path=vec_path, sr=sampling_rate, hop_len=hop_size, device="cuda"
|
||||||
)
|
)
|
||||||
|
|
||||||
wav, sr = librosa.load(wav_path, sr=sampling_rate)
|
wav, sr = librosa.load(wav_path, sr=sampling_rate)
|
||||||
Reference in New Issue
Block a user