From f956b333facd14e419c3a789df236c34b27c00c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=BA=90=E6=96=87=E9=9B=A8?= <41315874+fumiama@users.noreply.github.com> Date: Tue, 11 Jun 2024 17:21:05 +0900 Subject: [PATCH] optimize(infer): move onnx into rvc --- infer/lib/infer_pack/models_onnx.py | 119 ------------------ rvc/onnx/__init__.py | 1 + .../onnx/export.py => rvc/onnx/exporter.py | 12 +- rvc/onnx/f0predictors/dio.py | 4 +- rvc/onnx/f0predictors/f0.py | 4 +- rvc/onnx/f0predictors/harvest.py | 4 +- rvc/onnx/f0predictors/pm.py | 4 +- rvc/onnx/infer.py | 17 +-- rvc/onnx/synthesizer.py | 80 ++++++++++++ tools/onnx/export.py | 2 +- tools/onnx/infer.py | 4 +- web.py | 2 +- 12 files changed, 108 insertions(+), 145 deletions(-) delete mode 100644 infer/lib/infer_pack/models_onnx.py rename infer/modules/onnx/export.py => rvc/onnx/exporter.py (84%) create mode 100644 rvc/onnx/synthesizer.py diff --git a/infer/lib/infer_pack/models_onnx.py b/infer/lib/infer_pack/models_onnx.py deleted file mode 100644 index e5778e6..0000000 --- a/infer/lib/infer_pack/models_onnx.py +++ /dev/null @@ -1,119 +0,0 @@ -import torch -from torch import nn - -from rvc.layers.nsf import NSFGenerator -from rvc.layers.encoders import TextEncoder, PosteriorEncoder -from rvc.layers.residuals import ResidualCouplingBlock - - -class SynthesizerTrnMsNSFsidM(nn.Module): - def __init__( - self, - spec_channels: int, - segment_size, - inter_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout, - resblock, - resblock_kernel_sizes, - resblock_dilation_sizes, - upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - spk_embed_dim, - gin_channels, - sr, - encoder_dim, - **kwargs - ): - super(SynthesizerTrnMsNSFsidM, self).__init__() - if isinstance(sr, str): - sr = { - "32k": 32000, - "40k": 40000, - "48k": 48000, - }[sr] - self.spec_channels = spec_channels - self.inter_channels = inter_channels - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = float(p_dropout) - self.resblock = resblock - self.resblock_kernel_sizes = resblock_kernel_sizes - self.resblock_dilation_sizes = resblock_dilation_sizes - self.upsample_rates = upsample_rates - self.upsample_initial_channel = upsample_initial_channel - self.upsample_kernel_sizes = upsample_kernel_sizes - self.segment_size = segment_size - self.gin_channels = gin_channels - # self.hop_length = hop_length# - self.spk_embed_dim = spk_embed_dim - self.enc_p = TextEncoder( - encoder_dim, - inter_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - float(p_dropout), - ) - self.dec = NSFGenerator( - inter_channels, - resblock, - resblock_kernel_sizes, - resblock_dilation_sizes, - upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - gin_channels=gin_channels, - sr=sr, - ) - self.enc_q = PosteriorEncoder( - spec_channels, - inter_channels, - hidden_channels, - 5, - 1, - 16, - gin_channels=gin_channels, - ) - self.flow = ResidualCouplingBlock( - inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels - ) - self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) - self.speaker_map = None - - def remove_weight_norm(self): - self.dec.remove_weight_norm() - self.flow.remove_weight_norm() - self.enc_q.remove_weight_norm() - - def construct_spkmixmap(self): - self.speaker_map = torch.zeros((self.n_speaker, 1, 1, self.gin_channels)) - for i in range(self.n_speaker): - self.speaker_map[i] = self.emb_g(torch.LongTensor([[i]])) - self.speaker_map = self.speaker_map.unsqueeze(0) - - def forward(self, phone, phone_lengths, pitch, nsff0, g, rnd, max_len=None): - if self.speaker_map is not None: # [N, S] * [S, B, 1, H] - g = g.reshape((g.shape[0], g.shape[1], 1, 1, 1)) # [N, S, B, 1, 1] - g = g * self.speaker_map # [N, S, B, 1, H] - g = torch.sum(g, dim=1) # [N, 1, B, 1, H] - g = g.transpose(0, -1).transpose(0, -2).squeeze(0) # [B, H, N] - else: - g = g.unsqueeze(0) - g = self.emb_g(g).transpose(1, 2) - - m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) - z_p = (m_p + torch.exp(logs_p) * rnd) * x_mask - z = self.flow(z_p, x_mask, g=g, reverse=True) - o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g) - return o diff --git a/rvc/onnx/__init__.py b/rvc/onnx/__init__.py index eeed4d8..a6a5519 100644 --- a/rvc/onnx/__init__.py +++ b/rvc/onnx/__init__.py @@ -1 +1,2 @@ from .infer import RVC +from .exporter import export_onnx diff --git a/infer/modules/onnx/export.py b/rvc/onnx/exporter.py similarity index 84% rename from infer/modules/onnx/export.py rename to rvc/onnx/exporter.py index b5ef2f5..0b54bc4 100644 --- a/infer/modules/onnx/export.py +++ b/rvc/onnx/exporter.py @@ -1,10 +1,10 @@ import torch -from infer.lib.infer_pack.models_onnx import SynthesizerTrnMsNSFsidM +from .synthesizer import SynthesizerTrnMsNSFsid -def export_onnx(ModelPath, ExportedPath): - cpt = torch.load(ModelPath, map_location="cpu") +def export_onnx(from_cpkt_pth: str, to_onnx_pth: str) -> str: + cpt = torch.load(from_cpkt_pth, map_location="cpu") cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] vec_channels = 256 if cpt.get("version", "v1") == "v1" else 768 @@ -17,8 +17,8 @@ def export_onnx(ModelPath, ExportedPath): device = "cpu" # 导出时设备(不影响使用模型) - net_g = SynthesizerTrnMsNSFsidM( - *cpt["config"], is_half=False, encoder_dim=vec_channels + net_g = SynthesizerTrnMsNSFsid( + *cpt["config"], encoder_dim=vec_channels ) # fp32导出(C++要支持fp16必须手动将内存重新排列所以暂时不用fp16) net_g.load_state_dict(cpt["weight"], strict=False) input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds", "rnd"] @@ -36,7 +36,7 @@ def export_onnx(ModelPath, ExportedPath): test_ds.to(device), test_rnd.to(device), ), - ExportedPath, + to_onnx_pth, dynamic_axes={ "phone": [1], "pitch": [1], diff --git a/rvc/onnx/f0predictors/dio.py b/rvc/onnx/f0predictors/dio.py index 4437696..c142670 100644 --- a/rvc/onnx/f0predictors/dio.py +++ b/rvc/onnx/f0predictors/dio.py @@ -23,7 +23,7 @@ class DioF0Predictor(F0Predictor): f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate) for index, pitch in enumerate(f0): f0[index] = round(pitch, 1) - return self.__interpolate_f0(self.__resize_f0(f0, p_len))[0] + return self.interpolate_f0(self.resize_f0(f0, p_len))[0] def compute_f0_uv( self, wav: np.ndarray[Any, np.dtype], p_len: Optional[int] = None @@ -40,4 +40,4 @@ class DioF0Predictor(F0Predictor): f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate) for index, pitch in enumerate(f0): f0[index] = round(pitch, 1) - return self.__interpolate_f0(self.__resize_f0(f0, p_len)) + return self.interpolate_f0(self.resize_f0(f0, p_len)) diff --git a/rvc/onnx/f0predictors/f0.py b/rvc/onnx/f0predictors/f0.py index ae20a79..7131491 100644 --- a/rvc/onnx/f0predictors/f0.py +++ b/rvc/onnx/f0predictors/f0.py @@ -18,7 +18,7 @@ class F0Predictor(object): self, wav: np.ndarray[Any, np.dtype], p_len: Optional[int] = None ): ... - def __interpolate_f0(self, f0: np.ndarray[Any, np.dtype]): + def interpolate_f0(self, f0: np.ndarray[Any, np.dtype]): """ 对F0进行插值处理 """ @@ -56,7 +56,7 @@ class F0Predictor(object): return ip_data[:, 0], vuv_vector[:, 0] - def __resize_f0(self, x: np.ndarray[Any, np.dtype], target_len: int): + def resize_f0(self, x: np.ndarray[Any, np.dtype], target_len: int): source = np.array(x) source[source < 0.001] = np.nan target = np.interp( diff --git a/rvc/onnx/f0predictors/harvest.py b/rvc/onnx/f0predictors/harvest.py index 694eb04..54956ba 100644 --- a/rvc/onnx/f0predictors/harvest.py +++ b/rvc/onnx/f0predictors/harvest.py @@ -21,7 +21,7 @@ class HarvestF0Predictor(F0Predictor): frame_period=1000 * self.hop_length / self.sampling_rate, ) f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.fs) - return self.__interpolate_f0(self.__resize_f0(f0, p_len))[0] + return self.interpolate_f0(self.resize_f0(f0, p_len))[0] def compute_f0_uv( self, wav: np.ndarray[Any, np.dtype], p_len: Optional[int] = None @@ -36,4 +36,4 @@ class HarvestF0Predictor(F0Predictor): frame_period=1000 * self.hop_length / self.sampling_rate, ) f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate) - return self.__interpolate_f0(self.__resize_f0(f0, p_len)) + return self.interpolate_f0(self.resize_f0(f0, p_len)) diff --git a/rvc/onnx/f0predictors/pm.py b/rvc/onnx/f0predictors/pm.py index 39450f1..7513e15 100644 --- a/rvc/onnx/f0predictors/pm.py +++ b/rvc/onnx/f0predictors/pm.py @@ -31,7 +31,7 @@ class PMF0Predictor(F0Predictor): pad_size = (p_len - len(f0) + 1) // 2 if pad_size > 0 or p_len - len(f0) - pad_size > 0: f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant") - f0, uv = self.__interpolate_f0(f0) + f0, uv = self.interpolate_f0(f0) return f0 def compute_f0_uv( @@ -57,5 +57,5 @@ class PMF0Predictor(F0Predictor): pad_size = (p_len - len(f0) + 1) // 2 if pad_size > 0 or p_len - len(f0) - pad_size > 0: f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant") - f0, uv = self.__interpolate_f0(f0) + f0, uv = self.interpolate_f0(f0) return f0, uv diff --git a/rvc/onnx/infer.py b/rvc/onnx/infer.py index 31ac954..b14b590 100644 --- a/rvc/onnx/infer.py +++ b/rvc/onnx/infer.py @@ -1,10 +1,11 @@ -import librosa -import numpy as np -import onnxruntime import typing import os -from onnx.f0predictors import ( +import librosa +import numpy as np +import onnxruntime + +from .f0predictors import ( PMF0Predictor, HarvestF0Predictor, DioF0Predictor, @@ -15,7 +16,7 @@ from onnx.f0predictors import ( class Model: def __init__( self, - path: str | bytes | os.PathLike, + path: typing.Union[str, bytes, os.PathLike], device: typing.Literal["cpu", "cuda", "dml"] = "cpu", ): if device == "cpu": @@ -32,7 +33,7 @@ class Model: class ContentVec(Model): def __init__( self, - vec_path: str | bytes | os.PathLike, + vec_path: typing.Union[str, bytes, os.PathLike], device: typing.Literal["cpu", "cuda", "dml"] = "cpu", ): super().__init__(vec_path, device) @@ -66,9 +67,9 @@ def get_f0_predictor( class RVC(Model): def __init__( self, - model_path: str | bytes | os.PathLike, + model_path: typing.Union[str, bytes, os.PathLike], hop_len=512, - vec_path: str | bytes | os.PathLike = "vec-768-layer-12.onnx", + vec_path: typing.Union[str, bytes, os.PathLike] = "vec-768-layer-12.onnx", device: typing.Literal["cpu", "cuda", "dml"] = "cpu", ): super().__init__(model_path, device) diff --git a/rvc/onnx/synthesizer.py b/rvc/onnx/synthesizer.py new file mode 100644 index 0000000..e8bf516 --- /dev/null +++ b/rvc/onnx/synthesizer.py @@ -0,0 +1,80 @@ +from typing import List, Optional, Union + +import torch + +from rvc.layers.synthesizers import SynthesizerTrnMsNSFsid as SynthesizerBase + + +class SynthesizerTrnMsNSFsid(SynthesizerBase): + def __init__( + self, + spec_channels: int, + segment_size: int, + inter_channels: int, + hidden_channels: int, + filter_channels: int, + n_heads: int, + n_layers: int, + kernel_size: int, + p_dropout: int, + resblock: str, + resblock_kernel_sizes: List[int], + resblock_dilation_sizes: List[List[int]], + upsample_rates: List[int], + upsample_initial_channel: int, + upsample_kernel_sizes: List[int], + spk_embed_dim: int, + gin_channels: int, + sr: Optional[Union[str, int]], + encoder_dim: int, + ): + super().__init__( + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + spk_embed_dim, + gin_channels, + sr, + encoder_dim, + True, + ) + self.speaker_map = None + + def remove_weight_norm(self): + self.dec.remove_weight_norm() + self.flow.remove_weight_norm() + self.enc_q.remove_weight_norm() + + def construct_spkmixmap(self): + self.speaker_map = torch.zeros((self.n_speaker, 1, 1, self.gin_channels)) + for i in range(self.n_speaker): + self.speaker_map[i] = self.emb_g(torch.LongTensor([[i]])) + self.speaker_map = self.speaker_map.unsqueeze(0) + + def forward(self, phone, phone_lengths, pitch, nsff0, g, rnd, max_len=None): + if self.speaker_map is not None: # [N, S] * [S, B, 1, H] + g = g.reshape((g.shape[0], g.shape[1], 1, 1, 1)) # [N, S, B, 1, 1] + g = g * self.speaker_map # [N, S, B, 1, H] + g = torch.sum(g, dim=1) # [N, 1, B, 1, H] + g = g.transpose(0, -1).transpose(0, -2).squeeze(0) # [B, H, N] + else: + g = g.unsqueeze(0) + g = self.emb_g(g).transpose(1, 2) + + m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) + z_p = (m_p + torch.exp(logs_p) * rnd) * x_mask + z = self.flow(z_p, x_mask, g=g, reverse=True) + o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g) + return o diff --git a/tools/onnx/export.py b/tools/onnx/export.py index 7776e3c..77be814 100644 --- a/tools/onnx/export.py +++ b/tools/onnx/export.py @@ -1,3 +1,3 @@ -from infer.modules.onnx.export import export_onnx +from rvc.onnx import export_onnx export_onnx("pt/Justin Bieber.pth", "pt/TestRvc_Rvc.onnx") diff --git a/tools/onnx/infer.py b/tools/onnx/infer.py index b982d63..acad6f8 100644 --- a/tools/onnx/infer.py +++ b/tools/onnx/infer.py @@ -14,11 +14,11 @@ wav_path = "123.wav" # 输入路径或ByteIO实例 out_path = "out.wav" # 输出路径或ByteIO实例 model = RVC( - model_path, vec_path=vec_path, sr=sampling_rate, hop_len=hop_size, device="cuda" + model_path, vec_path=vec_path, hop_len=hop_size, device="cuda" ) wav, sr = librosa.load(wav_path, sr=sampling_rate) -audio = model.infer(wav, sr, sid, f0_method=f0_method, f0_up_key=f0_up_key) +audio = model.infer(wav, sr, sampling_rate, sid, f0_method, f0_up_key) soundfile.write(out_path, audio, sampling_rate) diff --git a/web.py b/web.py index 4b8f483..7365dfe 100644 --- a/web.py +++ b/web.py @@ -182,7 +182,7 @@ def clean(): def export_onnx(ModelPath, ExportedPath): - from infer.modules.onnx.export import export_onnx as eo + from rvc.onnx import export_onnx as eo eo(ModelPath, ExportedPath)