diff --git a/infer/lib/infer_pack/models.py b/infer/lib/infer_pack/models.py index 8b1f715..b7d7b9b 100644 --- a/infer/lib/infer_pack/models.py +++ b/infer/lib/infer_pack/models.py @@ -1,5 +1,4 @@ import math -import logging from typing import Optional, Tuple, List import torch @@ -7,8 +6,9 @@ from torch import nn from torch.nn import Conv1d, Conv2d, ConvTranspose1d from torch.nn import functional as F from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm -from infer.lib.infer_pack import modules +from rvc import residuals +from rvc.norms import WN from rvc.utils import ( get_padding, call_weight_data_normal_if_Conv, @@ -22,6 +22,26 @@ has_xpu = bool(hasattr(torch, "xpu") and torch.xpu.is_available()) class ResidualCouplingBlock(nn.Module): + class Flip(nn.Module): + """ + torch.jit.script() Compiled functions + can't take variable number of arguments or + use keyword-only arguments with defaults + """ + def forward( + self, + x: torch.Tensor, + x_mask: torch.Tensor, + g: Optional[torch.Tensor] = None, + reverse: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + x = torch.flip(x, [1]) + if not reverse: + logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) + return x, logdet + else: + return x, torch.zeros([1], device=x.device) + def __init__( self, channels, @@ -44,7 +64,7 @@ class ResidualCouplingBlock(nn.Module): self.flows = nn.ModuleList() for i in range(n_flows): self.flows.append( - modules.ResidualCouplingLayer( + residuals.ResidualCouplingLayer( channels, hidden_channels, kernel_size, @@ -54,7 +74,7 @@ class ResidualCouplingBlock(nn.Module): mean_only=True, ) ) - self.flows.append(modules.Flip()) + self.flows.append(self.Flip()) def forward( self, @@ -108,7 +128,7 @@ class PosteriorEncoder(nn.Module): self.gin_channels = gin_channels self.pre = nn.Conv1d(in_channels, hidden_channels, 1) - self.enc = modules.WN( + self.enc = WN( hidden_channels, kernel_size, dilation_rate, @@ -167,7 +187,7 @@ class Generator(torch.nn.Module): self.conv_pre = Conv1d( initial_channel, upsample_initial_channel, 7, 1, padding=3 ) - resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 + resblock = residuals.ResBlock1 if resblock == "1" else residuals.ResBlock2 self.ups = nn.ModuleList() for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): @@ -215,7 +235,7 @@ class Generator(torch.nn.Module): x = x + self.cond(g) for i in range(self.num_upsamples): - x = F.leaky_relu(x, modules.LRELU_SLOPE) + x = F.leaky_relu(x, residuals.LRELU_SLOPE) x = self.ups[i](x) xs = None for j in range(self.num_kernels): @@ -382,33 +402,21 @@ class SourceModuleHnNSF(torch.nn.Module): sine_amp=0.1, add_noise_std=0.003, voiced_threshod=0, - is_half=True, ): super(SourceModuleHnNSF, self).__init__() self.sine_amp = sine_amp self.noise_std = add_noise_std - self.is_half = is_half # to produce sine waveforms self.l_sin_gen = SineGen( sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod ) - # to merge source harmonics into a single excitation self.l_linear = torch.nn.Linear(harmonic_num + 1, 1) self.l_tanh = torch.nn.Tanh() - # self.ddtype:int = -1 def forward(self, x: torch.Tensor, upp: int = 1): - # if self.ddtype ==-1: - # self.ddtype = self.l_linear.weight.dtype - sine_wavs, uv, _ = self.l_sin_gen(x, upp) - # print(x.dtype,sine_wavs.dtype,self.l_linear.weight.dtype) - # if self.is_half: - # sine_wavs = sine_wavs.half() - # sine_merge = self.l_tanh(self.l_linear(sine_wavs.to(x))) - # print(sine_wavs.dtype,self.ddtype) - # if sine_wavs.dtype != self.l_linear.weight.dtype: + sine_wavs, _, _ = self.l_sin_gen(x, upp) sine_wavs = sine_wavs.to(dtype=self.l_linear.weight.dtype) sine_merge = self.l_tanh(self.l_linear(sine_wavs)) return sine_merge, None, None # noise, uv @@ -426,7 +434,6 @@ class GeneratorNSF(torch.nn.Module): upsample_kernel_sizes, gin_channels, sr, - is_half=False, ): super(GeneratorNSF, self).__init__() self.num_kernels = len(resblock_kernel_sizes) @@ -434,13 +441,13 @@ class GeneratorNSF(torch.nn.Module): self.f0_upsamp = torch.nn.Upsample(scale_factor=math.prod(upsample_rates)) self.m_source = SourceModuleHnNSF( - sampling_rate=sr, harmonic_num=0, is_half=is_half + sampling_rate=sr, harmonic_num=0 ) self.noise_convs = nn.ModuleList() self.conv_pre = Conv1d( initial_channel, upsample_initial_channel, 7, 1, padding=3 ) - resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 + resblock = residuals.ResBlock1 if resblock == "1" else residuals.ResBlock2 self.ups = nn.ModuleList() for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): @@ -486,7 +493,7 @@ class GeneratorNSF(torch.nn.Module): self.upp = math.prod(upsample_rates) - self.lrelu_slope = modules.LRELU_SLOPE + self.lrelu_slope = residuals.LRELU_SLOPE def forward( self, @@ -584,7 +591,6 @@ class SynthesizerTrnMs256NSFsid(nn.Module): spk_embed_dim: int, gin_channels: int, sr: str | int, - **kwargs ): super(SynthesizerTrnMs256NSFsid, self).__init__() if isinstance(sr, str): @@ -631,7 +637,6 @@ class SynthesizerTrnMs256NSFsid(nn.Module): upsample_kernel_sizes, gin_channels=gin_channels, sr=sr, - is_half=kwargs["is_half"], ) self.enc_q = PosteriorEncoder( spec_channels, @@ -764,7 +769,6 @@ class SynthesizerTrnMs768NSFsid(SynthesizerTrnMs256NSFsid): spk_embed_dim, gin_channels, sr, - **kwargs ): super(SynthesizerTrnMs768NSFsid, self).__init__( spec_channels, @@ -785,7 +789,6 @@ class SynthesizerTrnMs768NSFsid(SynthesizerTrnMs256NSFsid): spk_embed_dim, gin_channels, sr, - **kwargs ) del self.enc_p self.enc_p = TextEncoder( @@ -812,7 +815,7 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module): n_layers, kernel_size, p_dropout, - resblock, + resblock: str, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, @@ -1095,7 +1098,7 @@ class DiscriminatorS(torch.nn.Module): for l in self.convs: x = l(x) - x = F.leaky_relu(x, modules.LRELU_SLOPE) + x = F.leaky_relu(x, residuals.LRELU_SLOPE) fmap.append(x) x = self.conv_post(x) fmap.append(x) @@ -1179,7 +1182,7 @@ class DiscriminatorP(torch.nn.Module): for l in self.convs: x = l(x) - x = F.leaky_relu(x, modules.LRELU_SLOPE) + x = F.leaky_relu(x, residuals.LRELU_SLOPE) fmap.append(x) x = self.conv_post(x) fmap.append(x) diff --git a/infer/lib/infer_pack/models_onnx.py b/infer/lib/infer_pack/models_onnx.py index d0c735a..34b25de 100644 --- a/infer/lib/infer_pack/models_onnx.py +++ b/infer/lib/infer_pack/models_onnx.py @@ -79,7 +79,6 @@ class SynthesizerTrnMsNSFsidM(nn.Module): upsample_kernel_sizes, gin_channels=gin_channels, sr=sr, - is_half=kwargs["is_half"], ) self.enc_q = PosteriorEncoder( spec_channels, diff --git a/infer/lib/infer_pack/modules.py b/infer/lib/infer_pack/modules.py deleted file mode 100644 index 95ffbfe..0000000 --- a/infer/lib/infer_pack/modules.py +++ /dev/null @@ -1,548 +0,0 @@ -import math -from typing import Optional, Tuple - -import torch -from torch import nn -from torch.nn import Conv1d -from torch.nn import functional as F -from torch.nn.utils import remove_weight_norm, weight_norm - -from rvc.utils import ( - get_padding, - call_weight_data_normal_if_Conv, - activate_add_tanh_sigmoid_multiply, -) -from rvc.transforms import piecewise_rational_quadratic_transform -from rvc.norms import LayerNorm - -LRELU_SLOPE = 0.1 - - -class DDSConv(nn.Module): - """ - Dialted and Depth-Separable Convolution - """ - - def __init__(self, channels, kernel_size, n_layers, p_dropout=0.0): - super(DDSConv, self).__init__() - self.channels = channels - self.kernel_size = kernel_size - self.n_layers = n_layers - self.p_dropout = float(p_dropout) - - self.drop = nn.Dropout(float(p_dropout)) - self.convs_sep = nn.ModuleList() - self.convs_1x1 = nn.ModuleList() - self.norms_1 = nn.ModuleList() - self.norms_2 = nn.ModuleList() - for i in range(n_layers): - dilation = kernel_size**i - padding = (kernel_size * dilation - dilation) // 2 - self.convs_sep.append( - nn.Conv1d( - channels, - channels, - kernel_size, - groups=channels, - dilation=dilation, - padding=padding, - ) - ) - self.convs_1x1.append(nn.Conv1d(channels, channels, 1)) - self.norms_1.append(LayerNorm(channels)) - self.norms_2.append(LayerNorm(channels)) - - def forward(self, x, x_mask, g: Optional[torch.Tensor] = None): - if g is not None: - x = x + g - for i in range(self.n_layers): - y = self.convs_sep[i](x * x_mask) - y = self.norms_1[i](y) - y = F.gelu(y) - y = self.convs_1x1[i](y) - y = self.norms_2[i](y) - y = F.gelu(y) - y = self.drop(y) - x = x + y - return x * x_mask - - -class WN(torch.nn.Module): - def __init__( - self, - hidden_channels: int, - kernel_size, - dilation_rate, - n_layers, - gin_channels=0, - p_dropout=0, - ): - super(WN, self).__init__() - assert kernel_size % 2 == 1 - self.hidden_channels = hidden_channels - self.kernel_size = (kernel_size,) - self.dilation_rate = dilation_rate - self.n_layers = n_layers - self.gin_channels = gin_channels - self.p_dropout = float(p_dropout) - - self.in_layers = torch.nn.ModuleList() - self.res_skip_layers = torch.nn.ModuleList() - self.drop = nn.Dropout(float(p_dropout)) - - if gin_channels != 0: - cond_layer = torch.nn.Conv1d( - gin_channels, 2 * hidden_channels * n_layers, 1 - ) - self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight") - - for i in range(n_layers): - dilation = dilation_rate**i - padding = int((kernel_size * dilation - dilation) / 2) - in_layer = torch.nn.Conv1d( - hidden_channels, - 2 * hidden_channels, - kernel_size, - dilation=dilation, - padding=padding, - ) - in_layer = torch.nn.utils.weight_norm(in_layer, name="weight") - self.in_layers.append(in_layer) - - # last one is not necessary - if i < n_layers - 1: - res_skip_channels = 2 * hidden_channels - else: - res_skip_channels = hidden_channels - - res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) - res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight") - self.res_skip_layers.append(res_skip_layer) - - def forward( - self, x: torch.Tensor, x_mask: torch.Tensor, g: Optional[torch.Tensor] = None - ): - output = torch.zeros_like(x) - - if g is not None: - g = self.cond_layer(g) - - for i, (in_layer, res_skip_layer) in enumerate( - zip(self.in_layers, self.res_skip_layers) - ): - x_in: torch.Tensor = in_layer(x) - if g is not None: - cond_offset = i * 2 * self.hidden_channels - g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :] - else: - g_l = torch.zeros_like(x_in) - - acts = activate_add_tanh_sigmoid_multiply(x_in, g_l, self.hidden_channels) - acts = self.drop(acts) - - res_skip_acts = res_skip_layer(acts) - if i < self.n_layers - 1: - res_acts = res_skip_acts[:, : self.hidden_channels, :] - x = (x + res_acts) * x_mask - output = output + res_skip_acts[:, self.hidden_channels :, :] - else: - output = output + res_skip_acts - return output * x_mask - - def remove_weight_norm(self): - if self.gin_channels != 0: - torch.nn.utils.remove_weight_norm(self.cond_layer) - for l in self.in_layers: - torch.nn.utils.remove_weight_norm(l) - for l in self.res_skip_layers: - torch.nn.utils.remove_weight_norm(l) - - def __prepare_scriptable__(self): - if self.gin_channels != 0: - for hook in self.cond_layer._forward_pre_hooks.values(): - if ( - hook.__module__ == "torch.nn.utils.weight_norm" - and hook.__class__.__name__ == "WeightNorm" - ): - torch.nn.utils.remove_weight_norm(self.cond_layer) - for l in self.in_layers: - for hook in l._forward_pre_hooks.values(): - if ( - hook.__module__ == "torch.nn.utils.weight_norm" - and hook.__class__.__name__ == "WeightNorm" - ): - torch.nn.utils.remove_weight_norm(l) - for l in self.res_skip_layers: - for hook in l._forward_pre_hooks.values(): - if ( - hook.__module__ == "torch.nn.utils.weight_norm" - and hook.__class__.__name__ == "WeightNorm" - ): - torch.nn.utils.remove_weight_norm(l) - return self - - -class ResBlock1(torch.nn.Module): - def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): - super(ResBlock1, self).__init__() - self.convs1 = nn.ModuleList( - [ - weight_norm( - Conv1d( - channels, - channels, - kernel_size, - 1, - dilation=dilation[0], - padding=get_padding(kernel_size, dilation[0]), - ) - ), - weight_norm( - Conv1d( - channels, - channels, - kernel_size, - 1, - dilation=dilation[1], - padding=get_padding(kernel_size, dilation[1]), - ) - ), - weight_norm( - Conv1d( - channels, - channels, - kernel_size, - 1, - dilation=dilation[2], - padding=get_padding(kernel_size, dilation[2]), - ) - ), - ] - ) - self.convs1.apply(call_weight_data_normal_if_Conv) - - self.convs2 = nn.ModuleList( - [ - weight_norm( - Conv1d( - channels, - channels, - kernel_size, - 1, - dilation=1, - padding=get_padding(kernel_size, 1), - ) - ), - weight_norm( - Conv1d( - channels, - channels, - kernel_size, - 1, - dilation=1, - padding=get_padding(kernel_size, 1), - ) - ), - weight_norm( - Conv1d( - channels, - channels, - kernel_size, - 1, - dilation=1, - padding=get_padding(kernel_size, 1), - ) - ), - ] - ) - self.convs2.apply(call_weight_data_normal_if_Conv) - self.lrelu_slope = LRELU_SLOPE - - def forward(self, x: torch.Tensor, x_mask: Optional[torch.Tensor] = None): - for c1, c2 in zip(self.convs1, self.convs2): - xt = F.leaky_relu(x, self.lrelu_slope) - if x_mask is not None: - xt = xt * x_mask - xt = c1(xt) - xt = F.leaky_relu(xt, self.lrelu_slope) - if x_mask is not None: - xt = xt * x_mask - xt = c2(xt) - x = xt + x - if x_mask is not None: - x = x * x_mask - return x - - def remove_weight_norm(self): - for l in self.convs1: - remove_weight_norm(l) - for l in self.convs2: - remove_weight_norm(l) - - def __prepare_scriptable__(self): - for l in self.convs1: - for hook in l._forward_pre_hooks.values(): - if ( - hook.__module__ == "torch.nn.utils.weight_norm" - and hook.__class__.__name__ == "WeightNorm" - ): - torch.nn.utils.remove_weight_norm(l) - for l in self.convs2: - for hook in l._forward_pre_hooks.values(): - if ( - hook.__module__ == "torch.nn.utils.weight_norm" - and hook.__class__.__name__ == "WeightNorm" - ): - torch.nn.utils.remove_weight_norm(l) - return self - - -class ResBlock2(torch.nn.Module): - def __init__(self, channels, kernel_size=3, dilation=(1, 3)): - super(ResBlock2, self).__init__() - self.convs = nn.ModuleList( - [ - weight_norm( - Conv1d( - channels, - channels, - kernel_size, - 1, - dilation=dilation[0], - padding=get_padding(kernel_size, dilation[0]), - ) - ), - weight_norm( - Conv1d( - channels, - channels, - kernel_size, - 1, - dilation=dilation[1], - padding=get_padding(kernel_size, dilation[1]), - ) - ), - ] - ) - self.convs.apply(call_weight_data_normal_if_Conv) - self.lrelu_slope = LRELU_SLOPE - - def forward(self, x, x_mask: Optional[torch.Tensor] = None): - for c in self.convs: - xt = F.leaky_relu(x, self.lrelu_slope) - if x_mask is not None: - xt = xt * x_mask - xt = c(xt) - x = xt + x - if x_mask is not None: - x = x * x_mask - return x - - def remove_weight_norm(self): - for l in self.convs: - remove_weight_norm(l) - - def __prepare_scriptable__(self): - for l in self.convs: - for hook in l._forward_pre_hooks.values(): - if ( - hook.__module__ == "torch.nn.utils.weight_norm" - and hook.__class__.__name__ == "WeightNorm" - ): - torch.nn.utils.remove_weight_norm(l) - return self - - -class Log(nn.Module): - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - g: Optional[torch.Tensor] = None, - reverse: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: - if not reverse: - y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask - logdet = torch.sum(-y, [1, 2]) - return y, logdet - else: - x = torch.exp(x) * x_mask - return x - - -class Flip(nn.Module): - # torch.jit.script() Compiled functions \ - # can't take variable number of arguments or \ - # use keyword-only arguments with defaults - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - g: Optional[torch.Tensor] = None, - reverse: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: - x = torch.flip(x, [1]) - if not reverse: - logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) - return x, logdet - else: - return x, torch.zeros([1], device=x.device) - - -class ElementwiseAffine(nn.Module): - def __init__(self, channels): - super(ElementwiseAffine, self).__init__() - self.channels = channels - self.m = nn.Parameter(torch.zeros(channels, 1)) - self.logs = nn.Parameter(torch.zeros(channels, 1)) - - def forward(self, x, x_mask, reverse=False, **kwargs): - if not reverse: - y = self.m + torch.exp(self.logs) * x - y = y * x_mask - logdet = torch.sum(self.logs * x_mask, [1, 2]) - return y, logdet - else: - x = (x - self.m) * torch.exp(-self.logs) * x_mask - return x - - -class ResidualCouplingLayer(nn.Module): - def __init__( - self, - channels, - hidden_channels, - kernel_size, - dilation_rate, - n_layers, - p_dropout=0, - gin_channels=0, - mean_only=False, - ): - assert channels % 2 == 0, "channels should be divisible by 2" - super(ResidualCouplingLayer, self).__init__() - self.channels = channels - self.hidden_channels = hidden_channels - self.kernel_size = kernel_size - self.dilation_rate = dilation_rate - self.n_layers = n_layers - self.half_channels = channels // 2 - self.mean_only = mean_only - - self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) - self.enc = WN( - hidden_channels, - kernel_size, - dilation_rate, - n_layers, - p_dropout=float(p_dropout), - gin_channels=gin_channels, - ) - self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1) - self.post.weight.data.zero_() - self.post.bias.data.zero_() - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - g: Optional[torch.Tensor] = None, - reverse: bool = False, - ): - x0, x1 = torch.split(x, [self.half_channels] * 2, 1) - h = self.pre(x0) * x_mask - h = self.enc(h, x_mask, g=g) - stats = self.post(h) * x_mask - if not self.mean_only: - m, logs = torch.split(stats, [self.half_channels] * 2, 1) - else: - m = stats - logs = torch.zeros_like(m) - - if not reverse: - x1 = m + x1 * torch.exp(logs) * x_mask - x = torch.cat([x0, x1], 1) - logdet = torch.sum(logs, [1, 2]) - return x, logdet - else: - x1 = (x1 - m) * torch.exp(-logs) * x_mask - x = torch.cat([x0, x1], 1) - return x, torch.zeros([1]) - - def remove_weight_norm(self): - self.enc.remove_weight_norm() - - def __prepare_scriptable__(self): - for hook in self.enc._forward_pre_hooks.values(): - if ( - hook.__module__ == "torch.nn.utils.weight_norm" - and hook.__class__.__name__ == "WeightNorm" - ): - torch.nn.utils.remove_weight_norm(self.enc) - return self - - -class ConvFlow(nn.Module): - def __init__( - self, - in_channels, - filter_channels, - kernel_size, - n_layers, - num_bins=10, - tail_bound=5.0, - ): - super(ConvFlow, self).__init__() - self.in_channels = in_channels - self.filter_channels = filter_channels - self.kernel_size = kernel_size - self.n_layers = n_layers - self.num_bins = num_bins - self.tail_bound = tail_bound - self.half_channels = in_channels // 2 - - self.pre = nn.Conv1d(self.half_channels, filter_channels, 1) - self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.0) - self.proj = nn.Conv1d( - filter_channels, self.half_channels * (num_bins * 3 - 1), 1 - ) - self.proj.weight.data.zero_() - self.proj.bias.data.zero_() - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - g: Optional[torch.Tensor] = None, - reverse=False, - ): - x0, x1 = torch.split(x, [self.half_channels] * 2, 1) - h: torch.Tensor = self.pre(x0) - h = self.convs(h, x_mask, g=g) - h = self.proj(h) * x_mask - - b, c, t = x0.shape - h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?] - - unnormalized_widths = h[..., : self.num_bins] / math.sqrt(self.filter_channels) - unnormalized_heights = h[..., self.num_bins : 2 * self.num_bins] / math.sqrt( - self.filter_channels - ) - unnormalized_derivatives = h[..., 2 * self.num_bins :] - - x1, logabsdet = piecewise_rational_quadratic_transform( - x1, - unnormalized_widths, - unnormalized_heights, - unnormalized_derivatives, - inverse=reverse, - tails="linear", - tail_bound=self.tail_bound, - ) - - x = torch.cat([x0, x1], 1) * x_mask - logdet = torch.sum(logabsdet * x_mask, [1, 2]) - if not reverse: - return x, logdet - else: - return x diff --git a/infer/lib/jit/synthesizer.py b/infer/lib/jit/synthesizer.py index b117989..a27180a 100644 --- a/infer/lib/jit/synthesizer.py +++ b/infer/lib/jit/synthesizer.py @@ -15,12 +15,12 @@ def get_synthesizer_ckpt(cpt, device=torch.device("cpu")): version = cpt.get("version", "v1") if version == "v1": if if_f0 == 1: - net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=False) + net_g = SynthesizerTrnMs256NSFsid(*cpt["config"]) else: net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"]) elif version == "v2": if if_f0 == 1: - net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=False) + net_g = SynthesizerTrnMs768NSFsid(*cpt["config"]) else: net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"]) del net_g.enc_q diff --git a/rvc/attentions.py b/rvc/attentions.py index a6d34f8..b6db6b6 100644 --- a/rvc/attentions.py +++ b/rvc/attentions.py @@ -9,15 +9,15 @@ from torch.nn import functional as F class MultiHeadAttention(nn.Module): def __init__( self, - channels, - out_channels, - n_heads, - p_dropout=0.0, - window_size=None, - heads_share=True, - block_length=None, - proximal_bias=False, - proximal_init=False, + channels: int, + out_channels: int, + n_heads: int, + p_dropout: float = 0.0, + window_size: int | None = None, + heads_share: bool = True, + block_length: int | None = None, + proximal_bias: bool = False, + proximal_init: bool = False, ): super(MultiHeadAttention, self).__init__() assert channels % n_heads == 0 @@ -60,19 +60,30 @@ class MultiHeadAttention(nn.Module): self.conv_k.weight.copy_(self.conv_q.weight) self.conv_k.bias.copy_(self.conv_q.bias) + def __call__( + self, + x: torch.Tensor, + c: torch.Tensor, + attn_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + return super().__call__(x, c, attn_mask=attn_mask) + def forward( - self, x: torch.Tensor, c: torch.Tensor, attn_mask: Optional[torch.Tensor] = None - ): + self, + x: torch.Tensor, + c: torch.Tensor, + attn_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: q = self.conv_q(x) k = self.conv_k(c) v = self.conv_v(c) - x, _ = self.attention(q, k, v, mask=attn_mask) + x, _ = self._attention(q, k, v, mask=attn_mask) x = self.conv_o(x) return x - def attention( + def _attention( self, query: torch.Tensor, key: torch.Tensor, @@ -149,7 +160,7 @@ class MultiHeadAttention(nn.Module): return ret def _get_relative_embeddings(self, relative_embeddings, length: int): - max_relative_position = 2 * self.window_size + 1 + # max_relative_position = 2 * self.window_size + 1 # Pad first before slice to avoid using cond ops. pad_length: int = max(length - (self.window_size + 1), 0) slice_start_position = max((self.window_size + 1) - length, 0) @@ -217,13 +228,13 @@ class MultiHeadAttention(nn.Module): class FFN(nn.Module): def __init__( self, - in_channels, - out_channels, - filter_channels, - kernel_size, - p_dropout=0.0, - activation: str = None, - causal=False, + in_channels: int, + out_channels: int, + filter_channels: int, + kernel_size: int, + p_dropout: float = 0.0, + activation: str | None = None, + causal: bool = False, ): super(FFN, self).__init__() self.in_channels = in_channels @@ -234,32 +245,29 @@ class FFN(nn.Module): self.activation = activation self.causal = causal self.is_activation = True if activation == "gelu" else False - # if causal: - # self.padding = self._causal_padding - # else: - # self.padding = self._same_padding self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size) self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size) self.drop = nn.Dropout(p_dropout) - def padding(self, x: torch.Tensor, x_mask: torch.Tensor) -> torch.Tensor: - if self.causal: - padding = self._causal_padding(x * x_mask) - else: - padding = self._same_padding(x * x_mask) - return padding + def __call__(self, x: torch.Tensor, x_mask: torch.Tensor) -> torch.Tensor: + return super().__call__(x, x_mask) - def forward(self, x: torch.Tensor, x_mask: torch.Tensor): - x = self.conv_1(self.padding(x, x_mask)) + def forward(self, x: torch.Tensor, x_mask: torch.Tensor) -> torch.Tensor: + x = self.conv_1(self._padding(x, x_mask)) if self.is_activation: x = x * torch.sigmoid(1.702 * x) else: x = torch.relu(x) x = self.drop(x) - x = self.conv_2(self.padding(x, x_mask)) + x = self.conv_2(self._padding(x, x_mask)) return x * x_mask + + def _padding(self, x: torch.Tensor, x_mask: torch.Tensor) -> torch.Tensor: + if self.causal: + return self._causal_padding(x * x_mask) + return self._same_padding(x * x_mask) def _causal_padding(self, x): if self.kernel_size == 1: diff --git a/rvc/norms.py b/rvc/norms.py index d93ebca..3463fb2 100644 --- a/rvc/norms.py +++ b/rvc/norms.py @@ -1,7 +1,10 @@ +from typing import Optional + import torch from torch import nn from torch.nn import functional as F +from .utils import activate_add_tanh_sigmoid_multiply class LayerNorm(nn.Module): def __init__(self, channels: int, eps: float = 1e-5): @@ -16,3 +19,128 @@ class LayerNorm(nn.Module): x = x.transpose(1, -1) x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) return x.transpose(1, -1) + +class WN(torch.nn.Module): + def __init__( + self, + hidden_channels: int, + kernel_size: int, + dilation_rate: int, + n_layers: int, + gin_channels: int = 0, + p_dropout: int = 0, + ): + super(WN, self).__init__() + assert kernel_size % 2 == 1 + self.hidden_channels = hidden_channels + self.kernel_size = (kernel_size,) + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + self.p_dropout = float(p_dropout) + + self.in_layers = torch.nn.ModuleList() + self.res_skip_layers = torch.nn.ModuleList() + self.drop = nn.Dropout(float(p_dropout)) + + if gin_channels != 0: + cond_layer = torch.nn.Conv1d( + gin_channels, 2 * hidden_channels * n_layers, 1 + ) + self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight") + + for i in range(n_layers): + dilation = dilation_rate**i + padding = int((kernel_size * dilation - dilation) / 2) + in_layer = torch.nn.Conv1d( + hidden_channels, + 2 * hidden_channels, + kernel_size, + dilation=dilation, + padding=padding, + ) + in_layer = torch.nn.utils.weight_norm(in_layer, name="weight") + self.in_layers.append(in_layer) + + # last one is not necessary + if i < n_layers - 1: + res_skip_channels = 2 * hidden_channels + else: + res_skip_channels = hidden_channels + + res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) + res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight") + self.res_skip_layers.append(res_skip_layer) + + def __call__( + self, + x: torch.Tensor, + x_mask: torch.Tensor, + g: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + return super().__call__(x, x_mask, g=g) + + def forward( + self, + x: torch.Tensor, + x_mask: torch.Tensor, + g: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + output = torch.zeros_like(x) + + if g is not None: + g = self.cond_layer(g) + + for i, (in_layer, res_skip_layer) in enumerate( + zip(self.in_layers, self.res_skip_layers) + ): + x_in: torch.Tensor = in_layer(x) + if g is not None: + cond_offset = i * 2 * self.hidden_channels + g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :] + else: + g_l = torch.zeros_like(x_in) + + acts = activate_add_tanh_sigmoid_multiply(x_in, g_l, self.hidden_channels) + acts: torch.Tensor = self.drop(acts) + + res_skip_acts: torch.Tensor = res_skip_layer(acts) + if i < self.n_layers - 1: + res_acts = res_skip_acts[:, : self.hidden_channels, :] + x = (x + res_acts) * x_mask + output = output + res_skip_acts[:, self.hidden_channels :, :] + else: + output = output + res_skip_acts + return output * x_mask + + def remove_weight_norm(self): + if self.gin_channels != 0: + torch.nn.utils.remove_weight_norm(self.cond_layer) + for l in self.in_layers: + torch.nn.utils.remove_weight_norm(l) + for l in self.res_skip_layers: + torch.nn.utils.remove_weight_norm(l) + + def __prepare_scriptable__(self): + if self.gin_channels != 0: + for hook in self.cond_layer._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.cond_layer) + for l in self.in_layers: + for hook in l._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(l) + for l in self.res_skip_layers: + for hook in l._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(l) + return self diff --git a/rvc/onnx/infer.py b/rvc/onnx/infer.py index 80bbb6a..c9576a9 100644 --- a/rvc/onnx/infer.py +++ b/rvc/onnx/infer.py @@ -38,9 +38,9 @@ class ContentVec(Model): super().__init__(vec_path, device) def __call__(self, wav: np.ndarray[typing.Any, np.dtype]): - return self.__forward(wav) + return self.forward(wav) - def __forward(self, wav: np.ndarray[typing.Any, np.dtype]): + def forward(self, wav: np.ndarray[typing.Any, np.dtype]): if wav.ndim == 2: # double channels wav = wav.mean(-1) assert wav.ndim == 1, wav.ndim diff --git a/rvc/residuals.py b/rvc/residuals.py new file mode 100644 index 0000000..09d6d02 --- /dev/null +++ b/rvc/residuals.py @@ -0,0 +1,260 @@ +from typing import Optional + +import torch +from torch import nn +from torch.nn import Conv1d +from torch.nn import functional as F +from torch.nn.utils import remove_weight_norm, weight_norm + +from .norms import WN +from .utils import ( + get_padding, + call_weight_data_normal_if_Conv, +) + +LRELU_SLOPE = 0.1 + +class ResBlock1(torch.nn.Module): + def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): + super(ResBlock1, self).__init__() + self.convs1 = nn.ModuleList( + [ + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[2], + padding=get_padding(kernel_size, dilation[2]), + ) + ), + ] + ) + self.convs1.apply(call_weight_data_normal_if_Conv) + + self.convs2 = nn.ModuleList( + [ + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=1, + padding=get_padding(kernel_size, 1), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=1, + padding=get_padding(kernel_size, 1), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=1, + padding=get_padding(kernel_size, 1), + ) + ), + ] + ) + self.convs2.apply(call_weight_data_normal_if_Conv) + self.lrelu_slope = LRELU_SLOPE + + def forward(self, x: torch.Tensor, x_mask: Optional[torch.Tensor] = None): + for c1, c2 in zip(self.convs1, self.convs2): + xt = F.leaky_relu(x, self.lrelu_slope) + if x_mask is not None: + xt = xt * x_mask + xt = c1(xt) + xt = F.leaky_relu(xt, self.lrelu_slope) + if x_mask is not None: + xt = xt * x_mask + xt = c2(xt) + x = xt + x + if x_mask is not None: + x = x * x_mask + return x + + def remove_weight_norm(self): + for l in self.convs1: + remove_weight_norm(l) + for l in self.convs2: + remove_weight_norm(l) + + def __prepare_scriptable__(self): + for l in self.convs1: + for hook in l._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(l) + for l in self.convs2: + for hook in l._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(l) + return self + + +class ResBlock2(torch.nn.Module): + def __init__(self, channels, kernel_size=3, dilation=(1, 3)): + super(ResBlock2, self).__init__() + self.convs = nn.ModuleList( + [ + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]), + ) + ), + ] + ) + self.convs.apply(call_weight_data_normal_if_Conv) + self.lrelu_slope = LRELU_SLOPE + + def forward(self, x, x_mask: Optional[torch.Tensor] = None): + for c in self.convs: + xt = F.leaky_relu(x, self.lrelu_slope) + if x_mask is not None: + xt = xt * x_mask + xt = c(xt) + x = xt + x + if x_mask is not None: + x = x * x_mask + return x + + def remove_weight_norm(self): + for l in self.convs: + remove_weight_norm(l) + + def __prepare_scriptable__(self): + for l in self.convs: + for hook in l._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(l) + return self + + +class ResidualCouplingLayer(nn.Module): + def __init__( + self, + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + p_dropout=0, + gin_channels=0, + mean_only=False, + ): + assert channels % 2 == 0, "channels should be divisible by 2" + super(ResidualCouplingLayer, self).__init__() + self.channels = channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.half_channels = channels // 2 + self.mean_only = mean_only + + self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) + self.enc = WN( + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + p_dropout=float(p_dropout), + gin_channels=gin_channels, + ) + self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1) + self.post.weight.data.zero_() + self.post.bias.data.zero_() + + def forward( + self, + x: torch.Tensor, + x_mask: torch.Tensor, + g: Optional[torch.Tensor] = None, + reverse: bool = False, + ): + x0, x1 = torch.split(x, [self.half_channels] * 2, 1) + h = self.pre(x0) * x_mask + h = self.enc(h, x_mask, g=g) + stats = self.post(h) * x_mask + if not self.mean_only: + m, logs = torch.split(stats, [self.half_channels] * 2, 1) + else: + m = stats + logs = torch.zeros_like(m) + + if not reverse: + x1 = m + x1 * torch.exp(logs) * x_mask + x = torch.cat([x0, x1], 1) + logdet = torch.sum(logs, [1, 2]) + return x, logdet + else: + x1 = (x1 - m) * torch.exp(-logs) * x_mask + x = torch.cat([x0, x1], 1) + return x, torch.zeros([1]) + + def remove_weight_norm(self): + self.enc.remove_weight_norm() + + def __prepare_scriptable__(self): + for hook in self.enc._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.enc) + return self