1
0
mirror of https://github.com/fumiama/Retrieval-based-Voice-Conversion-WebUI.git synced 2026-06-05 01:10:22 +08:00

optimize(infer): move onnx into rvc

This commit is contained in:
源文雨
2024-06-11 17:21:05 +09:00
parent e81b7c52c0
commit f956b333fa
12 changed files with 108 additions and 145 deletions

View File

@@ -1 +1,2 @@
from .infer import RVC
from .exporter import export_onnx

52
rvc/onnx/exporter.py Normal file
View File

@@ -0,0 +1,52 @@
import torch
from .synthesizer import SynthesizerTrnMsNSFsid
def export_onnx(from_cpkt_pth: str, to_onnx_pth: str) -> str:
    """Convert a saved RVC checkpoint into an ONNX model file.

    The checkpoint is loaded on the CPU, a ``SynthesizerTrnMsNSFsid`` is
    built from its ``"config"`` list and ``"weight"`` state dict, and the
    network is traced with random dummy inputs through
    ``torch.onnx.export``.

    Args:
        from_cpkt_pth: Path of the checkpoint to read with ``torch.load``.
        to_onnx_pth: Destination path handed to ``torch.onnx.export``.

    Returns:
        The literal string ``"Finished"`` once the export call returns.
    """
    # NOTE(review): torch.load unpickles the file, so only trusted
    # checkpoints should be passed in.
    checkpoint = torch.load(from_cpkt_pth, map_location="cpu")
    state_dict = checkpoint["weight"]
    config = checkpoint["config"]
    # Overwrite the third-from-last config entry with the number of rows in
    # the speaker embedding table (presumably spk_embed_dim — confirm
    # against the checkpoint layout).
    config[-3] = state_dict["emb_g.weight"].shape[0]

    # Checkpoints without a "version" key are treated as "v1".
    if checkpoint.get("version", "v1") == "v1":
        vec_channels = 256
    else:
        vec_channels = 768

    frames = 200  # length of the dummy sequence used for tracing
    dummy_inputs = (
        torch.rand(1, frames, vec_channels),  # hidden units
        torch.tensor([frames]).long(),  # hidden unit length (seemingly unused)
        torch.randint(size=(1, frames), low=5, high=255),  # pitch (in Hz)
        torch.rand(1, frames),  # NSF f0
        torch.LongTensor([0]),  # speaker ID
        torch.rand(1, 192, frames),  # noise (adds a random factor)
    )
    device = "cpu"  # device used while exporting (does not affect model use)

    # Export in fp32: supporting fp16 from C++ requires manually rearranging
    # memory, so fp16 is not used for now.
    net_g = SynthesizerTrnMsNSFsid(*config, encoder_dim=vec_channels)
    net_g.load_state_dict(state_dict, strict=False)

    input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds", "rnd"]
    output_names = ["audio"]
    # The sequence axis of these inputs is left dynamic in the exported graph.
    dynamic_axes = {
        "phone": [1],
        "pitch": [1],
        "pitchf": [1],
        "rnd": [2],
    }

    # Multi-speaker mixed-track export is disabled; it would need
    # net_g.construct_spkmixmap() to be called here first.
    torch.onnx.export(
        net_g,
        tuple(tensor.to(device) for tensor in dummy_inputs),
        to_onnx_pth,
        dynamic_axes=dynamic_axes,
        do_constant_folding=False,
        opset_version=17,
        verbose=False,
        input_names=input_names,
        output_names=output_names,
    )
    return "Finished"

View File

@@ -23,7 +23,7 @@ class DioF0Predictor(F0Predictor):
f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate)
for index, pitch in enumerate(f0):
f0[index] = round(pitch, 1)
return self.__interpolate_f0(self.__resize_f0(f0, p_len))[0]
return self.interpolate_f0(self.resize_f0(f0, p_len))[0]
def compute_f0_uv(
self, wav: np.ndarray[Any, np.dtype], p_len: Optional[int] = None
@@ -40,4 +40,4 @@ class DioF0Predictor(F0Predictor):
f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate)
for index, pitch in enumerate(f0):
f0[index] = round(pitch, 1)
return self.__interpolate_f0(self.__resize_f0(f0, p_len))
return self.interpolate_f0(self.resize_f0(f0, p_len))

View File

@@ -18,7 +18,7 @@ class F0Predictor(object):
self, wav: np.ndarray[Any, np.dtype], p_len: Optional[int] = None
): ...
def __interpolate_f0(self, f0: np.ndarray[Any, np.dtype]):
def interpolate_f0(self, f0: np.ndarray[Any, np.dtype]):
"""
对F0进行插值处理
"""
@@ -56,7 +56,7 @@ class F0Predictor(object):
return ip_data[:, 0], vuv_vector[:, 0]
def __resize_f0(self, x: np.ndarray[Any, np.dtype], target_len: int):
def resize_f0(self, x: np.ndarray[Any, np.dtype], target_len: int):
source = np.array(x)
source[source < 0.001] = np.nan
target = np.interp(

View File

@@ -21,7 +21,7 @@ class HarvestF0Predictor(F0Predictor):
frame_period=1000 * self.hop_length / self.sampling_rate,
)
f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.fs)
return self.__interpolate_f0(self.__resize_f0(f0, p_len))[0]
return self.interpolate_f0(self.resize_f0(f0, p_len))[0]
def compute_f0_uv(
self, wav: np.ndarray[Any, np.dtype], p_len: Optional[int] = None
@@ -36,4 +36,4 @@ class HarvestF0Predictor(F0Predictor):
frame_period=1000 * self.hop_length / self.sampling_rate,
)
f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate)
return self.__interpolate_f0(self.__resize_f0(f0, p_len))
return self.interpolate_f0(self.resize_f0(f0, p_len))

View File

@@ -31,7 +31,7 @@ class PMF0Predictor(F0Predictor):
pad_size = (p_len - len(f0) + 1) // 2
if pad_size > 0 or p_len - len(f0) - pad_size > 0:
f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant")
f0, uv = self.__interpolate_f0(f0)
f0, uv = self.interpolate_f0(f0)
return f0
def compute_f0_uv(
@@ -57,5 +57,5 @@ class PMF0Predictor(F0Predictor):
pad_size = (p_len - len(f0) + 1) // 2
if pad_size > 0 or p_len - len(f0) - pad_size > 0:
f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant")
f0, uv = self.__interpolate_f0(f0)
f0, uv = self.interpolate_f0(f0)
return f0, uv

View File

@@ -1,10 +1,11 @@
import librosa
import numpy as np
import onnxruntime
import typing
import os
from onnx.f0predictors import (
import librosa
import numpy as np
import onnxruntime
from .f0predictors import (
PMF0Predictor,
HarvestF0Predictor,
DioF0Predictor,
@@ -15,7 +16,7 @@ from onnx.f0predictors import (
class Model:
def __init__(
self,
path: str | bytes | os.PathLike,
path: typing.Union[str, bytes, os.PathLike],
device: typing.Literal["cpu", "cuda", "dml"] = "cpu",
):
if device == "cpu":
@@ -32,7 +33,7 @@ class Model:
class ContentVec(Model):
def __init__(
self,
vec_path: str | bytes | os.PathLike,
vec_path: typing.Union[str, bytes, os.PathLike],
device: typing.Literal["cpu", "cuda", "dml"] = "cpu",
):
super().__init__(vec_path, device)
@@ -66,9 +67,9 @@ def get_f0_predictor(
class RVC(Model):
def __init__(
self,
model_path: str | bytes | os.PathLike,
model_path: typing.Union[str, bytes, os.PathLike],
hop_len=512,
vec_path: str | bytes | os.PathLike = "vec-768-layer-12.onnx",
vec_path: typing.Union[str, bytes, os.PathLike] = "vec-768-layer-12.onnx",
device: typing.Literal["cpu", "cuda", "dml"] = "cpu",
):
super().__init__(model_path, device)

80
rvc/onnx/synthesizer.py Normal file
View File

@@ -0,0 +1,80 @@
from typing import List, Optional, Union
import torch
from rvc.layers.synthesizers import SynthesizerTrnMsNSFsid as SynthesizerBase
class SynthesizerTrnMsNSFsid(SynthesizerBase):
    """ONNX-export flavour of the base ``SynthesizerTrnMsNSFsid``.

    Adds an optional speaker-mix table (``speaker_map``) and a ``forward``
    that takes the sampling noise ``rnd`` as an explicit input.
    """

    def __init__(
        self,
        spec_channels: int,
        segment_size: int,
        inter_channels: int,
        hidden_channels: int,
        filter_channels: int,
        n_heads: int,
        n_layers: int,
        kernel_size: int,
        p_dropout: int,
        resblock: str,
        resblock_kernel_sizes: List[int],
        resblock_dilation_sizes: List[List[int]],
        upsample_rates: List[int],
        upsample_initial_channel: int,
        upsample_kernel_sizes: List[int],
        spk_embed_dim: int,
        gin_channels: int,
        sr: Optional[Union[str, int]],
        encoder_dim: int,
    ):
        """Forward every argument positionally to the base synthesizer.

        A trailing ``True`` is appended to the base-class call; its meaning
        is defined by the base class, which is not visible here.
        """
        base_args = (
            spec_channels,
            segment_size,
            inter_channels,
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            p_dropout,
            resblock,
            resblock_kernel_sizes,
            resblock_dilation_sizes,
            upsample_rates,
            upsample_initial_channel,
            upsample_kernel_sizes,
            spk_embed_dim,
            gin_channels,
            sr,
            encoder_dim,
        )
        super().__init__(*base_args, True)
        # No speaker-mix table until construct_spkmixmap() is called.
        self.speaker_map = None

    def remove_weight_norm(self):
        """Call ``remove_weight_norm`` on the decoder, flow and ``enc_q``."""
        for submodule_name in ("dec", "flow", "enc_q"):
            getattr(self, submodule_name).remove_weight_norm()

    def construct_spkmixmap(self):
        """Fill ``speaker_map`` with one ``emb_g`` embedding per speaker.

        Reads ``self.n_speaker`` and ``self.gin_channels``; both are assumed
        to be provided by the base class — TODO confirm.
        """
        speaker_count = self.n_speaker
        self.speaker_map = torch.zeros((speaker_count, 1, 1, self.gin_channels))
        for speaker_id in range(speaker_count):
            embedding = self.emb_g(torch.LongTensor([[speaker_id]]))
            self.speaker_map[speaker_id] = embedding
        # Add a leading axis so the table broadcasts against mixing weights.
        self.speaker_map = self.speaker_map.unsqueeze(0)

    def forward(self, phone, phone_lengths, pitch, nsff0, g, rnd, max_len=None):
        """Synthesize from content features, pitch and explicit noise.

        ``g`` is a speaker id looked up through ``emb_g`` when no speaker
        map exists; otherwise it is treated as per-speaker mixing weights
        that are multiplied with ``speaker_map`` and summed over speakers.
        ``max_len`` truncates the last axis of the latent before decoding.
        """
        if self.speaker_map is None:
            g = self.emb_g(g.unsqueeze(0)).transpose(1, 2)
        else:
            # Shape notes follow the original author's annotations:
            # [N, S] weights against an [S, B, 1, H] speaker table.
            mix_weights = g.reshape((g.shape[0], g.shape[1], 1, 1, 1))  # [N, S, B, 1, 1]
            weighted = mix_weights * self.speaker_map  # [N, S, B, 1, H]
            blended = torch.sum(weighted, dim=1)  # [N, 1, B, 1, H]
            g = blended.transpose(0, -1).transpose(0, -2).squeeze(0)  # [B, H, N]

        prior_mean, prior_log_std, mask = self.enc_p(phone, pitch, phone_lengths)
        # Sample the prior latent using the caller-supplied noise.
        latent_prior = (prior_mean + torch.exp(prior_log_std) * rnd) * mask
        latent = self.flow(latent_prior, mask, g=g, reverse=True)
        return self.dec((latent * mask)[:, :, :max_len], nsff0, g=g)