diff --git a/infer/lib/infer_pack/models.py b/infer/lib/infer_pack/models.py index f8d5771..1b3fc18 100644 --- a/infer/lib/infer_pack/models.py +++ b/infer/lib/infer_pack/models.py @@ -2,104 +2,19 @@ import math import logging from typing import Optional, Tuple, List -from rvc import utils - -logger = logging.getLogger(__name__) - import torch from torch import nn from torch.nn import Conv1d, Conv2d, ConvTranspose1d from torch.nn import functional as F from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm -from infer.lib.infer_pack import attentions, modules -from rvc.utils import get_padding, call_weight_data_normal_if_Conv +from infer.lib.infer_pack import modules + +from rvc.utils import get_padding, call_weight_data_normal_if_Conv, sequence_mask, slice_on_last_dim, rand_slice_segments_on_last_dim +from rvc.encoders import TextEncoder has_xpu = bool(hasattr(torch, "xpu") and torch.xpu.is_available()) -class TextEncoder(nn.Module): - def __init__( - self, - in_channels, - out_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout, - f0=True, - ): - super(TextEncoder, self).__init__() - self.out_channels = out_channels - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = float(p_dropout) - self.emb_phone = nn.Linear(in_channels, hidden_channels) - self.lrelu = nn.LeakyReLU(0.1, inplace=True) - if f0 == True: - self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 - self.encoder = attentions.Encoder( - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - float(p_dropout), - ) - self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) - - def __call__( - self, - phone: torch.Tensor, - pitch: torch.Tensor, - lengths: torch.Tensor, - # skip_head: Optional[torch.Tensor] = None, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - return super().__call__( - phone, - pitch, - lengths, - # skip_head=skip_head, - ) - - def forward( - self, - phone: torch.Tensor, - pitch: torch.Tensor, - lengths: torch.Tensor, - # skip_head: Optional[torch.Tensor] = None, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - if pitch is None: - x = self.emb_phone(phone) - else: - x = self.emb_phone(phone) + self.emb_pitch(pitch) - x = x * math.sqrt(self.hidden_channels) # [b, t, h] - x = self.lrelu(x) - x = torch.transpose(x, 1, -1) # [b, h, t] - x_mask = torch.unsqueeze( - utils.sequence_mask( - lengths, - x.size(2), - ), - 1, - ).to(x.dtype) - x = self.encoder(x * x_mask, x_mask) - """ - if skip_head is not None: - assert isinstance(skip_head, torch.Tensor) - head = int(skip_head.item()) - x = x[:, :, head:] - x_mask = x_mask[:, :, head:] - """ - stats: torch.Tensor = self.proj(x) * x_mask - m, logs = torch.split(stats, self.out_channels, dim=1) - return m, logs, x_mask - - class ResidualCouplingBlock(nn.Module): def __init__( self, @@ -205,11 +120,7 @@ class PosteriorEncoder(nn.Module): self, x: torch.Tensor, x_lengths: torch.Tensor, g: Optional[torch.Tensor] = None ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: x_mask = torch.unsqueeze( - utils.sequence_mask( - x_lengths, - x.size(2), - ), - 1, + sequence_mask(x_lengths, x.size(2)), 1, ).to(x.dtype) x = self.pre(x) * x_mask x = self.enc(x, x_mask, g=g) @@ -728,12 +639,6 @@ class SynthesizerTrnMs256NSFsid(nn.Module): inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels ) self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) - logger.debug( - "gin_channels: " - + str(gin_channels) - + ", self.spk_embed_dim: " - + str(self.spk_embed_dim) - ) def remove_weight_norm(self): self.dec.remove_weight_norm() @@ -783,9 +688,9 @@ class SynthesizerTrnMs256NSFsid(nn.Module): m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) z_p = self.flow(z, y_mask, g=g) - z_slice, ids_slice = utils.rand_slice_segments_on_last_dim(z, y_lengths, self.segment_size) + z_slice, ids_slice = rand_slice_segments_on_last_dim(z, y_lengths, self.segment_size) # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length) - pitchf = utils.slice_on_last_dim(pitchf, ids_slice, self.segment_size) + pitchf = slice_on_last_dim(pitchf, ids_slice, self.segment_size) # print(-2,pitchf.shape,z_slice.shape) o = self.dec(z_slice, pitchf, g=g) return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) @@ -962,12 +867,6 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module): inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels ) self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) - logger.debug( - "gin_channels: " - + str(gin_channels) - + ", self.spk_embed_dim: " - + str(self.spk_embed_dim) - ) def remove_weight_norm(self): self.dec.remove_weight_norm() @@ -1007,7 +906,7 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module): m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) z_p = self.flow(z, y_mask, g=g) - z_slice, ids_slice = utils.rand_slice_segments_on_last_dim(z, y_lengths, self.segment_size) + z_slice, ids_slice = rand_slice_segments_on_last_dim(z, y_lengths, self.segment_size) o = self.dec(z_slice, g=g) return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) diff --git a/infer/lib/infer_pack/models_onnx.py b/infer/lib/infer_pack/models_onnx.py index 26ad8e8..d0c735a 100644 --- a/infer/lib/infer_pack/models_onnx.py +++ b/infer/lib/infer_pack/models_onnx.py @@ -1,13 +1,14 @@ import torch from torch import nn -from .attentions import ( - TextEncoder, +from .models import ( ResidualCouplingBlock, PosteriorEncoder, GeneratorNSF, ) +from rvc.encoders import TextEncoder + class SynthesizerTrnMsNSFsidM(nn.Module): def __init__( diff --git a/infer/lib/infer_pack/modules.py b/infer/lib/infer_pack/modules.py index a94c510..b8a049c 100644 --- a/infer/lib/infer_pack/modules.py +++ b/infer/lib/infer_pack/modules.py @@ -1,89 +1,19 @@ -import copy import math from typing import Optional, Tuple -import numpy as np -import scipy import torch from torch import nn -from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d +from torch.nn import Conv1d from torch.nn import functional as F from torch.nn.utils import remove_weight_norm, weight_norm -from rvc import utils -from rvc.utils import get_padding, call_weight_data_normal_if_Conv +from rvc.utils import get_padding, call_weight_data_normal_if_Conv, activate_add_tanh_sigmoid_multiply from rvc.transforms import piecewise_rational_quadratic_transform +from rvc.norms import LayerNorm LRELU_SLOPE = 0.1 -class LayerNorm(nn.Module): - def __init__(self, channels, eps=1e-5): - super(LayerNorm, self).__init__() - self.channels = channels - self.eps = eps - - self.gamma = nn.Parameter(torch.ones(channels)) - self.beta = nn.Parameter(torch.zeros(channels)) - - def forward(self, x): - x = x.transpose(1, -1) - x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) - return x.transpose(1, -1) - - -class ConvReluNorm(nn.Module): - def __init__( - self, - in_channels, - hidden_channels, - out_channels, - kernel_size, - n_layers, - p_dropout, - ): - super(ConvReluNorm, self).__init__() - self.in_channels = in_channels - self.hidden_channels = hidden_channels - self.out_channels = out_channels - self.kernel_size = kernel_size - self.n_layers = n_layers - self.p_dropout = float(p_dropout) - assert n_layers > 1, "Number of layers should be larger than 0." - - self.conv_layers = nn.ModuleList() - self.norm_layers = nn.ModuleList() - self.conv_layers.append( - nn.Conv1d( - in_channels, hidden_channels, kernel_size, padding=kernel_size // 2 - ) - ) - self.norm_layers.append(LayerNorm(hidden_channels)) - self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(float(p_dropout))) - for _ in range(n_layers - 1): - self.conv_layers.append( - nn.Conv1d( - hidden_channels, - hidden_channels, - kernel_size, - padding=kernel_size // 2, - ) - ) - self.norm_layers.append(LayerNorm(hidden_channels)) - self.proj = nn.Conv1d(hidden_channels, out_channels, 1) - self.proj.weight.data.zero_() - self.proj.bias.data.zero_() - - def forward(self, x, x_mask): - x_org = x - for i in range(self.n_layers): - x = self.conv_layers[i](x * x_mask) - x = self.norm_layers[i](x) - x = self.relu_drop(x) - x = x_org + self.proj(x) - return x * x_mask - - class DDSConv(nn.Module): """ Dialted and Depth-Separable Convolution @@ -203,7 +133,7 @@ class WN(torch.nn.Module): else: g_l = torch.zeros_like(x_in) - acts = utils.activate_add_tanh_sigmoid_multiply(x_in, g_l, self.hidden_channels) + acts = activate_add_tanh_sigmoid_multiply(x_in, g_l, self.hidden_channels) acts = self.drop(acts) res_skip_acts = res_skip_layer(acts) diff --git a/infer/lib/infer_pack/attentions.py b/rvc/attentions.py similarity index 64% rename from infer/lib/infer_pack/attentions.py rename to rvc/attentions.py index 8185477..06d9ca4 100644 --- a/infer/lib/infer_pack/attentions.py +++ b/rvc/attentions.py @@ -5,161 +5,6 @@ import torch from torch import nn from torch.nn import functional as F -from infer.lib.infer_pack.modules import LayerNorm - - -class Encoder(nn.Module): - def __init__( - self, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size=1, - p_dropout=0.0, - window_size=10, - ): - super(Encoder, self).__init__() - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = int(n_layers) - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.window_size = window_size - - self.drop = nn.Dropout(p_dropout) - self.attn_layers = nn.ModuleList() - self.norm_layers_1 = nn.ModuleList() - self.ffn_layers = nn.ModuleList() - self.norm_layers_2 = nn.ModuleList() - for i in range(self.n_layers): - self.attn_layers.append( - MultiHeadAttention( - hidden_channels, - hidden_channels, - n_heads, - p_dropout=p_dropout, - window_size=window_size, - ) - ) - self.norm_layers_1.append(LayerNorm(hidden_channels)) - self.ffn_layers.append( - FFN( - hidden_channels, - hidden_channels, - filter_channels, - kernel_size, - p_dropout=p_dropout, - ) - ) - self.norm_layers_2.append(LayerNorm(hidden_channels)) - - def __call__(self, x: torch.Tensor, x_mask: torch.Tensor) -> torch.Tensor: - return super().__call__(x, x_mask) - - def forward(self, x: torch.Tensor, x_mask: torch.Tensor) -> torch.Tensor: - attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) - x = x * x_mask - zippep = zip( - self.attn_layers, self.norm_layers_1, self.ffn_layers, self.norm_layers_2 - ) - for attn_layers, norm_layers_1, ffn_layers, norm_layers_2 in zippep: - y = attn_layers(x, x, attn_mask) - y = self.drop(y) - x = norm_layers_1(x + y) - - y = ffn_layers(x, x_mask) - y = self.drop(y) - x = norm_layers_2(x + y) - x = x * x_mask - return x - - -""" -class Decoder(nn.Module): - def __init__( - self, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size=1, - p_dropout=0.0, - proximal_bias=False, - proximal_init=True, - ): - super(Decoder, self).__init__() - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.proximal_bias = proximal_bias - self.proximal_init = proximal_init - - self.drop = nn.Dropout(p_dropout) - self.self_attn_layers = nn.ModuleList() - self.norm_layers_0 = nn.ModuleList() - self.encdec_attn_layers = nn.ModuleList() - self.norm_layers_1 = nn.ModuleList() - self.ffn_layers = nn.ModuleList() - self.norm_layers_2 = nn.ModuleList() - for i in range(self.n_layers): - self.self_attn_layers.append( - MultiHeadAttention( - hidden_channels, - hidden_channels, - n_heads, - p_dropout=p_dropout, - proximal_bias=proximal_bias, - proximal_init=proximal_init, - ) - ) - self.norm_layers_0.append(LayerNorm(hidden_channels)) - self.encdec_attn_layers.append( - MultiHeadAttention( - hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout - ) - ) - self.norm_layers_1.append(LayerNorm(hidden_channels)) - self.ffn_layers.append( - FFN( - hidden_channels, - hidden_channels, - filter_channels, - kernel_size, - p_dropout=p_dropout, - causal=True, - ) - ) - self.norm_layers_2.append(LayerNorm(hidden_channels)) - - def forward(self, x, x_mask, h, h_mask): - # x: decoder input - # h: encoder output - self_attn_mask = utils.subsequent_mask(x_mask.size(2)).to( - device=x.device, dtype=x.dtype - ) - encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1) - x = x * x_mask - for i in range(self.n_layers): - y = self.self_attn_layers[i](x, x, self_attn_mask) - y = self.drop(y) - x = self.norm_layers_0[i](x + y) - - y = self.encdec_attn_layers[i](x, h, encdec_attn_mask) - y = self.drop(y) - x = self.norm_layers_1[i](x + y) - - y = self.ffn_layers[i](x, x_mask) - y = self.drop(y) - x = self.norm_layers_2[i](x + y) - x = x * x_mask - return x -""" - class MultiHeadAttention(nn.Module): def __init__( diff --git a/rvc/encoders.py b/rvc/encoders.py new file mode 100644 index 0000000..4cb62e3 --- /dev/null +++ b/rvc/encoders.py @@ -0,0 +1,161 @@ +import math +from typing import Tuple + +import torch +from torch import nn + +from .attentions import MultiHeadAttention, FFN +from .norms import LayerNorm +from .utils import sequence_mask + + +class Encoder(nn.Module): + def __init__( + self, + hidden_channels: int, + filter_channels: int, + n_heads: int, + n_layers: int, + kernel_size: int = 1, + p_dropout: float = 0.0, + window_size: int = 10, + ): + super(Encoder, self).__init__() + + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.window_size = window_size + + self.drop = nn.Dropout(p_dropout) + self.attn_layers = nn.ModuleList() + self.norm_layers_1 = nn.ModuleList() + self.ffn_layers = nn.ModuleList() + self.norm_layers_2 = nn.ModuleList() + + for _ in range(self.n_layers): + self.attn_layers.append( + MultiHeadAttention( + hidden_channels, + hidden_channels, + n_heads, + p_dropout=p_dropout, + window_size=window_size, + ) + ) + self.norm_layers_1.append(LayerNorm(hidden_channels)) + self.ffn_layers.append( + FFN( + hidden_channels, + hidden_channels, + filter_channels, + kernel_size, + p_dropout=p_dropout, + ) + ) + self.norm_layers_2.append(LayerNorm(hidden_channels)) + + def __call__(self, x: torch.Tensor, x_mask: torch.Tensor) -> torch.Tensor: + return super().__call__(x, x_mask) + + def forward(self, x: torch.Tensor, x_mask: torch.Tensor) -> torch.Tensor: + attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) + x = x * x_mask + for attn, norm1, ffn, norm2 in zip( + self.attn_layers, + self.norm_layers_1, + self.ffn_layers, + self.norm_layers_2, + ): + y = attn(x, x, attn_mask) + y = self.drop(y) + x = norm1(x + y) + + y = ffn(x, x_mask) + y = self.drop(y) + x = norm2(x + y) + x = x * x_mask + return x + + +class TextEncoder(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + hidden_channels: int, + filter_channels: int, + n_heads: int, + n_layers: int, + kernel_size: int, + p_dropout: float, + f0: bool = True, + ): + super(TextEncoder, self).__init__() + + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = float(p_dropout) + + self.emb_phone = nn.Linear(in_channels, hidden_channels) + self.lrelu = nn.LeakyReLU(0.1, inplace=True) + if f0 == True: + self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 + self.encoder = Encoder( + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + float(p_dropout), + ) + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def __call__( + self, + phone: torch.Tensor, + pitch: torch.Tensor, + lengths: torch.Tensor, + # skip_head: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + return super().__call__( + phone, + pitch, + lengths, + # skip_head=skip_head, + ) + + def forward( + self, + phone: torch.Tensor, + pitch: torch.Tensor, + lengths: torch.Tensor, + # skip_head: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + x = self.emb_phone(phone) + if pitch is not None: + x += self.emb_pitch(pitch) + x = x * math.sqrt(self.hidden_channels) # [b, t, h] + x = self.lrelu(x) + x = torch.transpose(x, 1, -1) # [b, h, t] + x_mask = torch.unsqueeze( + sequence_mask(lengths, x.size(2)), 1, + ).to(x.dtype) + x = self.encoder(x * x_mask, x_mask) + """ + if skip_head is not None: + assert isinstance(skip_head, torch.Tensor) + head = int(skip_head.item()) + x = x[:, :, head:] + x_mask = x_mask[:, :, head:] + """ + stats: torch.Tensor = self.proj(x) * x_mask + m, logs = torch.split(stats, self.out_channels, dim=1) + return m, logs, x_mask diff --git a/rvc/norms.py b/rvc/norms.py new file mode 100644 index 0000000..d93ebca --- /dev/null +++ b/rvc/norms.py @@ -0,0 +1,18 @@ +import torch +from torch import nn +from torch.nn import functional as F + + +class LayerNorm(nn.Module): + def __init__(self, channels: int, eps: float = 1e-5): + super(LayerNorm, self).__init__() + self.channels = channels + self.eps = eps + + self.gamma = nn.Parameter(torch.ones(channels)) + self.beta = nn.Parameter(torch.zeros(channels)) + + def forward(self, x: torch.Tensor): + x = x.transpose(1, -1) + x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) + return x.transpose(1, -1) diff --git a/rvc/onnx/f0predictor/__init__.py b/rvc/onnx/f0predictors/__init__.py similarity index 100% rename from rvc/onnx/f0predictor/__init__.py rename to rvc/onnx/f0predictors/__init__.py diff --git a/rvc/onnx/f0predictor/dio.py b/rvc/onnx/f0predictors/dio.py similarity index 100% rename from rvc/onnx/f0predictor/dio.py rename to rvc/onnx/f0predictors/dio.py diff --git a/rvc/onnx/f0predictor/f0.py b/rvc/onnx/f0predictors/f0.py similarity index 100% rename from rvc/onnx/f0predictor/f0.py rename to rvc/onnx/f0predictors/f0.py diff --git a/rvc/onnx/f0predictor/harvest.py b/rvc/onnx/f0predictors/harvest.py similarity index 100% rename from rvc/onnx/f0predictor/harvest.py rename to rvc/onnx/f0predictors/harvest.py diff --git a/rvc/onnx/f0predictor/pm.py b/rvc/onnx/f0predictors/pm.py similarity index 100% rename from rvc/onnx/f0predictor/pm.py rename to rvc/onnx/f0predictors/pm.py diff --git a/rvc/onnx/infer.py b/rvc/onnx/infer.py index efa0b18..80bbb6a 100644 --- a/rvc/onnx/infer.py +++ b/rvc/onnx/infer.py @@ -4,7 +4,7 @@ import onnxruntime import typing import os -from onnx.f0predictor import ( +from onnx.f0predictors import ( PMF0Predictor, HarvestF0Predictor, DioF0Predictor,