mirror of
https://github.com/fumiama/Retrieval-based-Voice-Conversion-WebUI.git
synced 2026-06-07 19:40:44 +08:00
Merge branch 'dev' into dev
This commit is contained in:
@@ -1,119 +0,0 @@
|
|||||||
import torch
|
|
||||||
from torch import nn
|
|
||||||
|
|
||||||
from rvc.layers.nsf import NSFGenerator
|
|
||||||
from rvc.layers.encoders import TextEncoder, PosteriorEncoder
|
|
||||||
from rvc.layers.residuals import ResidualCouplingBlock
|
|
||||||
|
|
||||||
|
|
||||||
class SynthesizerTrnMsNSFsidM(nn.Module):
    """Multi-speaker NSF synthesizer wrapper with optional speaker mixing.

    The module wires together a ``TextEncoder`` (``enc_p``), an
    ``NSFGenerator`` decoder (``dec``), a ``PosteriorEncoder`` (``enc_q``),
    a ``ResidualCouplingBlock`` (``flow``) and a speaker embedding table
    (``emb_g``).  ``forward`` runs the inference path only: prior encoder ->
    reversed flow -> decoder.

    Speaker conditioning works in one of two modes:

    * default (``speaker_map is None``): ``g`` holds speaker ids that are
      looked up in ``emb_g``;
    * mixing (after ``construct_spkmixmap()``): ``g`` holds per-speaker
      weights that are used to blend all speaker embeddings.
    """

    def __init__(
        self,
        spec_channels: int,
        segment_size,
        inter_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        spk_embed_dim,
        gin_channels,
        sr,
        encoder_dim,
        **kwargs
    ):
        """Build the sub-modules from the checkpoint configuration values.

        ``sr`` may be given as an int or as one of the strings ``"32k"``,
        ``"40k"``, ``"48k"``; any other string raises ``KeyError``.
        ``p_dropout`` is coerced to ``float``.  Extra keyword arguments are
        accepted and ignored so callers may pass options this class does not
        use.
        """
        super(SynthesizerTrnMsNSFsidM, self).__init__()
        if isinstance(sr, str):
            # Translate the textual sample-rate tag into Hz.
            sr = {
                "32k": 32000,
                "40k": 40000,
                "48k": 48000,
            }[sr]
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = float(p_dropout)
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.segment_size = segment_size
        self.gin_channels = gin_channels
        self.spk_embed_dim = spk_embed_dim
        self.enc_p = TextEncoder(
            encoder_dim,
            inter_channels,
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            float(p_dropout),
        )
        self.dec = NSFGenerator(
            inter_channels,
            resblock,
            resblock_kernel_sizes,
            resblock_dilation_sizes,
            upsample_rates,
            upsample_initial_channel,
            upsample_kernel_sizes,
            gin_channels=gin_channels,
            sr=sr,
        )
        self.enc_q = PosteriorEncoder(
            spec_channels,
            inter_channels,
            hidden_channels,
            5,
            1,
            16,
            gin_channels=gin_channels,
        )
        self.flow = ResidualCouplingBlock(
            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
        )
        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
        # None selects the plain speaker-id lookup in forward(); it is replaced
        # by a tensor once construct_spkmixmap() has been called.
        self.speaker_map = None

    def remove_weight_norm(self):
        """Call ``remove_weight_norm()`` on the decoder, flow and posterior encoder."""
        self.dec.remove_weight_norm()
        self.flow.remove_weight_norm()
        self.enc_q.remove_weight_norm()

    def construct_spkmixmap(self):
        """Snapshot all speaker embeddings into ``self.speaker_map``.

        The resulting tensor has shape ``[1, S, 1, 1, H]`` where ``S`` is the
        number of speakers and ``H`` is ``gin_channels``.  After this call
        ``forward`` interprets ``g`` as mixing weights instead of speaker ids.

        ``S`` is taken from ``self.n_speaker`` when a caller has set that
        attribute; this class never assigns it itself, so otherwise the size
        of the embedding table is used (previously this raised
        ``AttributeError``).
        """
        n_speaker = getattr(self, "n_speaker", self.emb_g.num_embeddings)
        # Build the index tensor on the embedding's device so the lookup also
        # works when the module has been moved off the CPU.
        speaker_ids = torch.arange(n_speaker, device=self.emb_g.weight.device)
        # emb_g(speaker_ids) is [S, H]; reshape to [1, S, 1, 1, H] so it
        # broadcasts against the [N, S, 1, 1, 1] weights built in forward().
        self.speaker_map = self.emb_g(speaker_ids).reshape(
            1, n_speaker, 1, 1, self.gin_channels
        )

    def forward(self, phone, phone_lengths, pitch, nsff0, g, rnd, max_len=None):
        """Synthesize output from content features, pitch and speaker input.

        ``phone``, ``pitch`` and ``phone_lengths`` are forwarded to ``enc_p``;
        ``nsff0`` is forwarded to the decoder; ``rnd`` is the noise tensor
        scaled by ``exp(logs_p)`` and added to the prior mean.  ``max_len``
        optionally truncates the latent along its last axis before decoding.
        Returns whatever ``self.dec`` returns.
        """
        if self.speaker_map is not None:
            # Mixing mode: g is a 2-D weight matrix [N, S].
            g = g.reshape((g.shape[0], g.shape[1], 1, 1, 1))  # [N, S, 1, 1, 1]
            g = g * self.speaker_map  # [N, S, 1, 1, H]
            g = torch.sum(g, dim=1)  # [N, 1, 1, H]
            g = g.transpose(0, -1).transpose(0, -2).squeeze(0)  # [1, H, N]
        else:
            # Id mode: look the speaker id(s) up in the embedding table.
            # assumes g is a 1-D tensor of ids, giving [1, H, len(g)] — TODO confirm
            g = g.unsqueeze(0)
            g = self.emb_g(g).transpose(1, 2)

        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
        # Sample from the prior using the caller-supplied noise.
        z_p = (m_p + torch.exp(logs_p) * rnd) * x_mask
        z = self.flow(z_p, x_mask, g=g, reverse=True)
        o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
        return o
|
|
||||||
@@ -1 +1,2 @@
|
|||||||
from .infer import RVC
|
from .infer import RVC
|
||||||
|
from .exporter import export_onnx
|
||||||
|
|||||||
@@ -1,10 +1,10 @@
|
|||||||
import torch
|
import torch
|
||||||
|
|
||||||
from infer.lib.infer_pack.models_onnx import SynthesizerTrnMsNSFsidM
|
from .synthesizer import SynthesizerTrnMsNSFsid
|
||||||
|
|
||||||
|
|
||||||
def export_onnx(ModelPath, ExportedPath):
|
def export_onnx(from_cpkt_pth: str, to_onnx_pth: str) -> str:
|
||||||
cpt = torch.load(ModelPath, map_location="cpu")
|
cpt = torch.load(from_cpkt_pth, map_location="cpu")
|
||||||
cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]
|
cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]
|
||||||
vec_channels = 256 if cpt.get("version", "v1") == "v1" else 768
|
vec_channels = 256 if cpt.get("version", "v1") == "v1" else 768
|
||||||
|
|
||||||
@@ -17,8 +17,8 @@ def export_onnx(ModelPath, ExportedPath):
|
|||||||
|
|
||||||
device = "cpu" # 导出时设备(不影响使用模型)
|
device = "cpu" # 导出时设备(不影响使用模型)
|
||||||
|
|
||||||
net_g = SynthesizerTrnMsNSFsidM(
|
net_g = SynthesizerTrnMsNSFsid(
|
||||||
*cpt["config"], is_half=False, encoder_dim=vec_channels
|
*cpt["config"], encoder_dim=vec_channels
|
||||||
) # fp32导出(C++要支持fp16必须手动将内存重新排列所以暂时不用fp16)
|
) # fp32导出(C++要支持fp16必须手动将内存重新排列所以暂时不用fp16)
|
||||||
net_g.load_state_dict(cpt["weight"], strict=False)
|
net_g.load_state_dict(cpt["weight"], strict=False)
|
||||||
input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds", "rnd"]
|
input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds", "rnd"]
|
||||||
@@ -36,7 +36,7 @@ def export_onnx(ModelPath, ExportedPath):
|
|||||||
test_ds.to(device),
|
test_ds.to(device),
|
||||||
test_rnd.to(device),
|
test_rnd.to(device),
|
||||||
),
|
),
|
||||||
ExportedPath,
|
to_onnx_pth,
|
||||||
dynamic_axes={
|
dynamic_axes={
|
||||||
"phone": [1],
|
"phone": [1],
|
||||||
"pitch": [1],
|
"pitch": [1],
|
||||||
@@ -23,7 +23,7 @@ class DioF0Predictor(F0Predictor):
|
|||||||
f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate)
|
f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate)
|
||||||
for index, pitch in enumerate(f0):
|
for index, pitch in enumerate(f0):
|
||||||
f0[index] = round(pitch, 1)
|
f0[index] = round(pitch, 1)
|
||||||
return self.__interpolate_f0(self.__resize_f0(f0, p_len))[0]
|
return self.interpolate_f0(self.resize_f0(f0, p_len))[0]
|
||||||
|
|
||||||
def compute_f0_uv(
|
def compute_f0_uv(
|
||||||
self, wav: np.ndarray[Any, np.dtype], p_len: Optional[int] = None
|
self, wav: np.ndarray[Any, np.dtype], p_len: Optional[int] = None
|
||||||
@@ -40,4 +40,4 @@ class DioF0Predictor(F0Predictor):
|
|||||||
f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate)
|
f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate)
|
||||||
for index, pitch in enumerate(f0):
|
for index, pitch in enumerate(f0):
|
||||||
f0[index] = round(pitch, 1)
|
f0[index] = round(pitch, 1)
|
||||||
return self.__interpolate_f0(self.__resize_f0(f0, p_len))
|
return self.interpolate_f0(self.resize_f0(f0, p_len))
|
||||||
|
|||||||
@@ -18,7 +18,7 @@ class F0Predictor(object):
|
|||||||
self, wav: np.ndarray[Any, np.dtype], p_len: Optional[int] = None
|
self, wav: np.ndarray[Any, np.dtype], p_len: Optional[int] = None
|
||||||
): ...
|
): ...
|
||||||
|
|
||||||
def __interpolate_f0(self, f0: np.ndarray[Any, np.dtype]):
|
def interpolate_f0(self, f0: np.ndarray[Any, np.dtype]):
|
||||||
"""
|
"""
|
||||||
对F0进行插值处理
|
对F0进行插值处理
|
||||||
"""
|
"""
|
||||||
@@ -56,7 +56,7 @@ class F0Predictor(object):
|
|||||||
|
|
||||||
return ip_data[:, 0], vuv_vector[:, 0]
|
return ip_data[:, 0], vuv_vector[:, 0]
|
||||||
|
|
||||||
def __resize_f0(self, x: np.ndarray[Any, np.dtype], target_len: int):
|
def resize_f0(self, x: np.ndarray[Any, np.dtype], target_len: int):
|
||||||
source = np.array(x)
|
source = np.array(x)
|
||||||
source[source < 0.001] = np.nan
|
source[source < 0.001] = np.nan
|
||||||
target = np.interp(
|
target = np.interp(
|
||||||
|
|||||||
@@ -21,7 +21,7 @@ class HarvestF0Predictor(F0Predictor):
|
|||||||
frame_period=1000 * self.hop_length / self.sampling_rate,
|
frame_period=1000 * self.hop_length / self.sampling_rate,
|
||||||
)
|
)
|
||||||
f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.fs)
|
f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.fs)
|
||||||
return self.__interpolate_f0(self.__resize_f0(f0, p_len))[0]
|
return self.interpolate_f0(self.resize_f0(f0, p_len))[0]
|
||||||
|
|
||||||
def compute_f0_uv(
|
def compute_f0_uv(
|
||||||
self, wav: np.ndarray[Any, np.dtype], p_len: Optional[int] = None
|
self, wav: np.ndarray[Any, np.dtype], p_len: Optional[int] = None
|
||||||
@@ -36,4 +36,4 @@ class HarvestF0Predictor(F0Predictor):
|
|||||||
frame_period=1000 * self.hop_length / self.sampling_rate,
|
frame_period=1000 * self.hop_length / self.sampling_rate,
|
||||||
)
|
)
|
||||||
f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate)
|
f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate)
|
||||||
return self.__interpolate_f0(self.__resize_f0(f0, p_len))
|
return self.interpolate_f0(self.resize_f0(f0, p_len))
|
||||||
|
|||||||
@@ -31,7 +31,7 @@ class PMF0Predictor(F0Predictor):
|
|||||||
pad_size = (p_len - len(f0) + 1) // 2
|
pad_size = (p_len - len(f0) + 1) // 2
|
||||||
if pad_size > 0 or p_len - len(f0) - pad_size > 0:
|
if pad_size > 0 or p_len - len(f0) - pad_size > 0:
|
||||||
f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant")
|
f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant")
|
||||||
f0, uv = self.__interpolate_f0(f0)
|
f0, uv = self.interpolate_f0(f0)
|
||||||
return f0
|
return f0
|
||||||
|
|
||||||
def compute_f0_uv(
|
def compute_f0_uv(
|
||||||
@@ -57,5 +57,5 @@ class PMF0Predictor(F0Predictor):
|
|||||||
pad_size = (p_len - len(f0) + 1) // 2
|
pad_size = (p_len - len(f0) + 1) // 2
|
||||||
if pad_size > 0 or p_len - len(f0) - pad_size > 0:
|
if pad_size > 0 or p_len - len(f0) - pad_size > 0:
|
||||||
f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant")
|
f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant")
|
||||||
f0, uv = self.__interpolate_f0(f0)
|
f0, uv = self.interpolate_f0(f0)
|
||||||
return f0, uv
|
return f0, uv
|
||||||
|
|||||||
@@ -1,10 +1,11 @@
|
|||||||
import librosa
|
|
||||||
import numpy as np
|
|
||||||
import onnxruntime
|
|
||||||
import typing
|
import typing
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from onnx.f0predictors import (
|
import librosa
|
||||||
|
import numpy as np
|
||||||
|
import onnxruntime
|
||||||
|
|
||||||
|
from .f0predictors import (
|
||||||
PMF0Predictor,
|
PMF0Predictor,
|
||||||
HarvestF0Predictor,
|
HarvestF0Predictor,
|
||||||
DioF0Predictor,
|
DioF0Predictor,
|
||||||
@@ -15,7 +16,7 @@ from onnx.f0predictors import (
|
|||||||
class Model:
|
class Model:
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
path: str | bytes | os.PathLike,
|
path: typing.Union[str, bytes, os.PathLike],
|
||||||
device: typing.Literal["cpu", "cuda", "dml"] = "cpu",
|
device: typing.Literal["cpu", "cuda", "dml"] = "cpu",
|
||||||
):
|
):
|
||||||
if device == "cpu":
|
if device == "cpu":
|
||||||
@@ -32,7 +33,7 @@ class Model:
|
|||||||
class ContentVec(Model):
|
class ContentVec(Model):
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
vec_path: str | bytes | os.PathLike,
|
vec_path: typing.Union[str, bytes, os.PathLike],
|
||||||
device: typing.Literal["cpu", "cuda", "dml"] = "cpu",
|
device: typing.Literal["cpu", "cuda", "dml"] = "cpu",
|
||||||
):
|
):
|
||||||
super().__init__(vec_path, device)
|
super().__init__(vec_path, device)
|
||||||
@@ -66,9 +67,9 @@ def get_f0_predictor(
|
|||||||
class RVC(Model):
|
class RVC(Model):
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
model_path: str | bytes | os.PathLike,
|
model_path: typing.Union[str, bytes, os.PathLike],
|
||||||
hop_len=512,
|
hop_len=512,
|
||||||
vec_path: str | bytes | os.PathLike = "vec-768-layer-12.onnx",
|
vec_path: typing.Union[str, bytes, os.PathLike] = "vec-768-layer-12.onnx",
|
||||||
device: typing.Literal["cpu", "cuda", "dml"] = "cpu",
|
device: typing.Literal["cpu", "cuda", "dml"] = "cpu",
|
||||||
):
|
):
|
||||||
super().__init__(model_path, device)
|
super().__init__(model_path, device)
|
||||||
|
|||||||
80
rvc/onnx/synthesizer.py
Normal file
80
rvc/onnx/synthesizer.py
Normal file
@@ -0,0 +1,80 @@
|
|||||||
|
from typing import List, Optional, Union
|
||||||
|
|
||||||
|
import torch
|
||||||
|
|
||||||
|
from rvc.layers.synthesizers import SynthesizerTrnMsNSFsid as SynthesizerBase
|
||||||
|
|
||||||
|
|
||||||
|
class SynthesizerTrnMsNSFsid(SynthesizerBase):
    """ONNX-export flavour of the base synthesizer with optional speaker mixing.

    All sub-modules (``enc_p``, ``flow``, ``dec``, ``enc_q``, ``emb_g``) are
    expected to be created by the base class constructor; this subclass only
    adds the ``speaker_map`` state and an inference-only ``forward``.

    Speaker conditioning works in one of two modes:

    * default (``speaker_map is None``): ``g`` holds speaker ids that are
      looked up in ``emb_g``;
    * mixing (after ``construct_spkmixmap()``): ``g`` holds per-speaker
      weights that are used to blend all speaker embeddings.
    """

    def __init__(
        self,
        spec_channels: int,
        segment_size: int,
        inter_channels: int,
        hidden_channels: int,
        filter_channels: int,
        n_heads: int,
        n_layers: int,
        kernel_size: int,
        p_dropout: float,
        resblock: str,
        resblock_kernel_sizes: List[int],
        resblock_dilation_sizes: List[List[int]],
        upsample_rates: List[int],
        upsample_initial_channel: int,
        upsample_kernel_sizes: List[int],
        spk_embed_dim: int,
        gin_channels: int,
        sr: Optional[Union[str, int]],
        encoder_dim: int,
    ):
        """Forward every configuration value positionally to the base class."""
        super().__init__(
            spec_channels,
            segment_size,
            inter_channels,
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            p_dropout,
            resblock,
            resblock_kernel_sizes,
            resblock_dilation_sizes,
            upsample_rates,
            upsample_initial_channel,
            upsample_kernel_sizes,
            spk_embed_dim,
            gin_channels,
            sr,
            encoder_dim,
            # NOTE(review): presumably the base class's "use f0" switch —
            # confirm against rvc.layers.synthesizers.
            True,
        )
        # None selects the plain speaker-id lookup in forward(); it is replaced
        # by a tensor once construct_spkmixmap() has been called.
        self.speaker_map = None

    def remove_weight_norm(self):
        """Call ``remove_weight_norm()`` on the decoder, flow and posterior encoder."""
        self.dec.remove_weight_norm()
        self.flow.remove_weight_norm()
        self.enc_q.remove_weight_norm()

    def construct_spkmixmap(self):
        """Snapshot all speaker embeddings into ``self.speaker_map``.

        The resulting tensor has shape ``[1, S, 1, 1, H]`` where ``S`` is the
        number of speakers and ``H`` is ``gin_channels``.  After this call
        ``forward`` interprets ``g`` as mixing weights instead of speaker ids.

        ``S`` is taken from ``self.n_speaker`` when that attribute exists
        (this subclass does not assign it; the base class may); otherwise the
        size of the embedding table is used instead of raising
        ``AttributeError``.
        """
        n_speaker = getattr(self, "n_speaker", self.emb_g.num_embeddings)
        # Build the index tensor on the embedding's device so the lookup also
        # works when the module has been moved off the CPU.
        speaker_ids = torch.arange(n_speaker, device=self.emb_g.weight.device)
        # emb_g(speaker_ids) is [S, H]; reshape to [1, S, 1, 1, H] so it
        # broadcasts against the [N, S, 1, 1, 1] weights built in forward().
        self.speaker_map = self.emb_g(speaker_ids).reshape(
            1, n_speaker, 1, 1, self.gin_channels
        )

    def forward(self, phone, phone_lengths, pitch, nsff0, g, rnd, max_len=None):
        """Synthesize output from content features, pitch and speaker input.

        ``phone``, ``pitch`` and ``phone_lengths`` are forwarded to ``enc_p``;
        ``nsff0`` is forwarded to the decoder; ``rnd`` is the noise tensor
        scaled by ``exp(logs_p)`` and added to the prior mean.  ``max_len``
        optionally truncates the latent along its last axis before decoding.
        Returns whatever ``self.dec`` returns.
        """
        if self.speaker_map is not None:
            # Mixing mode: g is a 2-D weight matrix [N, S].
            g = g.reshape((g.shape[0], g.shape[1], 1, 1, 1))  # [N, S, 1, 1, 1]
            g = g * self.speaker_map  # [N, S, 1, 1, H]
            g = torch.sum(g, dim=1)  # [N, 1, 1, H]
            g = g.transpose(0, -1).transpose(0, -2).squeeze(0)  # [1, H, N]
        else:
            # Id mode: look the speaker id(s) up in the embedding table.
            # assumes g is a 1-D tensor of ids, giving [1, H, len(g)] — TODO confirm
            g = g.unsqueeze(0)
            g = self.emb_g(g).transpose(1, 2)

        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
        # Sample from the prior using the caller-supplied noise.
        z_p = (m_p + torch.exp(logs_p) * rnd) * x_mask
        z = self.flow(z_p, x_mask, g=g, reverse=True)
        o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
        return o
|
||||||
@@ -1,3 +1,3 @@
|
|||||||
from infer.modules.onnx.export import export_onnx
|
from rvc.onnx import export_onnx
|
||||||
|
|
||||||
export_onnx("pt/Justin Bieber.pth", "pt/TestRvc_Rvc.onnx")
|
export_onnx("pt/Justin Bieber.pth", "pt/TestRvc_Rvc.onnx")
|
||||||
|
|||||||
@@ -13,12 +13,10 @@ vec_path = "vec-256-layer-9.onnx" # 需要onnx的vec模型
|
|||||||
wav_path = "123.wav" # 输入路径或ByteIO实例
|
wav_path = "123.wav" # 输入路径或ByteIO实例
|
||||||
out_path = "out.wav" # 输出路径或ByteIO实例
|
out_path = "out.wav" # 输出路径或ByteIO实例
|
||||||
|
|
||||||
model = RVC(
|
model = RVC(model_path, vec_path=vec_path, hop_len=hop_size, device="cuda")
|
||||||
model_path, vec_path=vec_path, sr=sampling_rate, hop_len=hop_size, device="cuda"
|
|
||||||
)
|
|
||||||
|
|
||||||
wav, sr = librosa.load(wav_path, sr=sampling_rate)
|
wav, sr = librosa.load(wav_path, sr=sampling_rate)
|
||||||
|
|
||||||
audio = model.infer(wav, sr, sid, f0_method=f0_method, f0_up_key=f0_up_key)
|
audio = model.infer(wav, sr, sampling_rate, sid, f0_method, f0_up_key)
|
||||||
|
|
||||||
soundfile.write(out_path, audio, sampling_rate)
|
soundfile.write(out_path, audio, sampling_rate)
|
||||||
|
|||||||
Reference in New Issue
Block a user