1
0
mirror of https://github.com/fumiama/Retrieval-based-Voice-Conversion-WebUI.git synced 2026-06-05 01:10:22 +08:00

optimize(infer): move attentions into rvc

This commit is contained in:
源文雨
2024-06-07 20:28:05 +09:00
parent 978abd8aac
commit 96604e8175
12 changed files with 195 additions and 341 deletions

View File

@@ -2,104 +2,19 @@ import math
import logging
from typing import Optional, Tuple, List
from rvc import utils
logger = logging.getLogger(__name__)
import torch
from torch import nn
from torch.nn import Conv1d, Conv2d, ConvTranspose1d
from torch.nn import functional as F
from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm
from infer.lib.infer_pack import attentions, modules
from rvc.utils import get_padding, call_weight_data_normal_if_Conv
from infer.lib.infer_pack import modules
from rvc.utils import get_padding, call_weight_data_normal_if_Conv, sequence_mask, slice_on_last_dim, rand_slice_segments_on_last_dim
from rvc.encoders import TextEncoder
has_xpu = bool(hasattr(torch, "xpu") and torch.xpu.is_available())
class TextEncoder(nn.Module):
def __init__(
self,
in_channels,
out_channels,
hidden_channels,
filter_channels,
n_heads,
n_layers,
kernel_size,
p_dropout,
f0=True,
):
super(TextEncoder, self).__init__()
self.out_channels = out_channels
self.hidden_channels = hidden_channels
self.filter_channels = filter_channels
self.n_heads = n_heads
self.n_layers = n_layers
self.kernel_size = kernel_size
self.p_dropout = float(p_dropout)
self.emb_phone = nn.Linear(in_channels, hidden_channels)
self.lrelu = nn.LeakyReLU(0.1, inplace=True)
if f0 == True:
self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
self.encoder = attentions.Encoder(
hidden_channels,
filter_channels,
n_heads,
n_layers,
kernel_size,
float(p_dropout),
)
self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
def __call__(
self,
phone: torch.Tensor,
pitch: torch.Tensor,
lengths: torch.Tensor,
# skip_head: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
return super().__call__(
phone,
pitch,
lengths,
# skip_head=skip_head,
)
def forward(
self,
phone: torch.Tensor,
pitch: torch.Tensor,
lengths: torch.Tensor,
# skip_head: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
if pitch is None:
x = self.emb_phone(phone)
else:
x = self.emb_phone(phone) + self.emb_pitch(pitch)
x = x * math.sqrt(self.hidden_channels) # [b, t, h]
x = self.lrelu(x)
x = torch.transpose(x, 1, -1) # [b, h, t]
x_mask = torch.unsqueeze(
utils.sequence_mask(
lengths,
x.size(2),
),
1,
).to(x.dtype)
x = self.encoder(x * x_mask, x_mask)
"""
if skip_head is not None:
assert isinstance(skip_head, torch.Tensor)
head = int(skip_head.item())
x = x[:, :, head:]
x_mask = x_mask[:, :, head:]
"""
stats: torch.Tensor = self.proj(x) * x_mask
m, logs = torch.split(stats, self.out_channels, dim=1)
return m, logs, x_mask
class ResidualCouplingBlock(nn.Module):
def __init__(
self,
@@ -205,11 +120,7 @@ class PosteriorEncoder(nn.Module):
self, x: torch.Tensor, x_lengths: torch.Tensor, g: Optional[torch.Tensor] = None
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
x_mask = torch.unsqueeze(
utils.sequence_mask(
x_lengths,
x.size(2),
),
1,
sequence_mask(x_lengths, x.size(2)), 1,
).to(x.dtype)
x = self.pre(x) * x_mask
x = self.enc(x, x_mask, g=g)
@@ -728,12 +639,6 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
)
self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
logger.debug(
"gin_channels: "
+ str(gin_channels)
+ ", self.spk_embed_dim: "
+ str(self.spk_embed_dim)
)
def remove_weight_norm(self):
self.dec.remove_weight_norm()
@@ -783,9 +688,9 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
z_p = self.flow(z, y_mask, g=g)
z_slice, ids_slice = utils.rand_slice_segments_on_last_dim(z, y_lengths, self.segment_size)
z_slice, ids_slice = rand_slice_segments_on_last_dim(z, y_lengths, self.segment_size)
# print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length)
pitchf = utils.slice_on_last_dim(pitchf, ids_slice, self.segment_size)
pitchf = slice_on_last_dim(pitchf, ids_slice, self.segment_size)
# print(-2,pitchf.shape,z_slice.shape)
o = self.dec(z_slice, pitchf, g=g)
return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
@@ -962,12 +867,6 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
)
self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
logger.debug(
"gin_channels: "
+ str(gin_channels)
+ ", self.spk_embed_dim: "
+ str(self.spk_embed_dim)
)
def remove_weight_norm(self):
self.dec.remove_weight_norm()
@@ -1007,7 +906,7 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
z_p = self.flow(z, y_mask, g=g)
z_slice, ids_slice = utils.rand_slice_segments_on_last_dim(z, y_lengths, self.segment_size)
z_slice, ids_slice = rand_slice_segments_on_last_dim(z, y_lengths, self.segment_size)
o = self.dec(z_slice, g=g)
return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)

View File

@@ -1,13 +1,14 @@
import torch
from torch import nn
from .attentions import (
TextEncoder,
from .models import (
ResidualCouplingBlock,
PosteriorEncoder,
GeneratorNSF,
)
from rvc.encoders import TextEncoder
class SynthesizerTrnMsNSFsidM(nn.Module):
def __init__(

View File

@@ -1,89 +1,19 @@
import copy
import math
from typing import Optional, Tuple
import numpy as np
import scipy
import torch
from torch import nn
from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d
from torch.nn import Conv1d
from torch.nn import functional as F
from torch.nn.utils import remove_weight_norm, weight_norm
from rvc import utils
from rvc.utils import get_padding, call_weight_data_normal_if_Conv
from rvc.utils import get_padding, call_weight_data_normal_if_Conv, activate_add_tanh_sigmoid_multiply
from rvc.transforms import piecewise_rational_quadratic_transform
from rvc.norms import LayerNorm
LRELU_SLOPE = 0.1
class LayerNorm(nn.Module):
def __init__(self, channels, eps=1e-5):
super(LayerNorm, self).__init__()
self.channels = channels
self.eps = eps
self.gamma = nn.Parameter(torch.ones(channels))
self.beta = nn.Parameter(torch.zeros(channels))
def forward(self, x):
x = x.transpose(1, -1)
x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
return x.transpose(1, -1)
class ConvReluNorm(nn.Module):
def __init__(
self,
in_channels,
hidden_channels,
out_channels,
kernel_size,
n_layers,
p_dropout,
):
super(ConvReluNorm, self).__init__()
self.in_channels = in_channels
self.hidden_channels = hidden_channels
self.out_channels = out_channels
self.kernel_size = kernel_size
self.n_layers = n_layers
self.p_dropout = float(p_dropout)
assert n_layers > 1, "Number of layers should be larger than 0."
self.conv_layers = nn.ModuleList()
self.norm_layers = nn.ModuleList()
self.conv_layers.append(
nn.Conv1d(
in_channels, hidden_channels, kernel_size, padding=kernel_size // 2
)
)
self.norm_layers.append(LayerNorm(hidden_channels))
self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(float(p_dropout)))
for _ in range(n_layers - 1):
self.conv_layers.append(
nn.Conv1d(
hidden_channels,
hidden_channels,
kernel_size,
padding=kernel_size // 2,
)
)
self.norm_layers.append(LayerNorm(hidden_channels))
self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
self.proj.weight.data.zero_()
self.proj.bias.data.zero_()
def forward(self, x, x_mask):
x_org = x
for i in range(self.n_layers):
x = self.conv_layers[i](x * x_mask)
x = self.norm_layers[i](x)
x = self.relu_drop(x)
x = x_org + self.proj(x)
return x * x_mask
class DDSConv(nn.Module):
"""
Dialted and Depth-Separable Convolution
@@ -203,7 +133,7 @@ class WN(torch.nn.Module):
else:
g_l = torch.zeros_like(x_in)
acts = utils.activate_add_tanh_sigmoid_multiply(x_in, g_l, self.hidden_channels)
acts = activate_add_tanh_sigmoid_multiply(x_in, g_l, self.hidden_channels)
acts = self.drop(acts)
res_skip_acts = res_skip_layer(acts)

View File

@@ -5,161 +5,6 @@ import torch
from torch import nn
from torch.nn import functional as F
from infer.lib.infer_pack.modules import LayerNorm
class Encoder(nn.Module):
def __init__(
self,
hidden_channels,
filter_channels,
n_heads,
n_layers,
kernel_size=1,
p_dropout=0.0,
window_size=10,
):
super(Encoder, self).__init__()
self.hidden_channels = hidden_channels
self.filter_channels = filter_channels
self.n_heads = n_heads
self.n_layers = int(n_layers)
self.kernel_size = kernel_size
self.p_dropout = p_dropout
self.window_size = window_size
self.drop = nn.Dropout(p_dropout)
self.attn_layers = nn.ModuleList()
self.norm_layers_1 = nn.ModuleList()
self.ffn_layers = nn.ModuleList()
self.norm_layers_2 = nn.ModuleList()
for i in range(self.n_layers):
self.attn_layers.append(
MultiHeadAttention(
hidden_channels,
hidden_channels,
n_heads,
p_dropout=p_dropout,
window_size=window_size,
)
)
self.norm_layers_1.append(LayerNorm(hidden_channels))
self.ffn_layers.append(
FFN(
hidden_channels,
hidden_channels,
filter_channels,
kernel_size,
p_dropout=p_dropout,
)
)
self.norm_layers_2.append(LayerNorm(hidden_channels))
def __call__(self, x: torch.Tensor, x_mask: torch.Tensor) -> torch.Tensor:
return super().__call__(x, x_mask)
def forward(self, x: torch.Tensor, x_mask: torch.Tensor) -> torch.Tensor:
attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
x = x * x_mask
zippep = zip(
self.attn_layers, self.norm_layers_1, self.ffn_layers, self.norm_layers_2
)
for attn_layers, norm_layers_1, ffn_layers, norm_layers_2 in zippep:
y = attn_layers(x, x, attn_mask)
y = self.drop(y)
x = norm_layers_1(x + y)
y = ffn_layers(x, x_mask)
y = self.drop(y)
x = norm_layers_2(x + y)
x = x * x_mask
return x
"""
class Decoder(nn.Module):
def __init__(
self,
hidden_channels,
filter_channels,
n_heads,
n_layers,
kernel_size=1,
p_dropout=0.0,
proximal_bias=False,
proximal_init=True,
):
super(Decoder, self).__init__()
self.hidden_channels = hidden_channels
self.filter_channels = filter_channels
self.n_heads = n_heads
self.n_layers = n_layers
self.kernel_size = kernel_size
self.p_dropout = p_dropout
self.proximal_bias = proximal_bias
self.proximal_init = proximal_init
self.drop = nn.Dropout(p_dropout)
self.self_attn_layers = nn.ModuleList()
self.norm_layers_0 = nn.ModuleList()
self.encdec_attn_layers = nn.ModuleList()
self.norm_layers_1 = nn.ModuleList()
self.ffn_layers = nn.ModuleList()
self.norm_layers_2 = nn.ModuleList()
for i in range(self.n_layers):
self.self_attn_layers.append(
MultiHeadAttention(
hidden_channels,
hidden_channels,
n_heads,
p_dropout=p_dropout,
proximal_bias=proximal_bias,
proximal_init=proximal_init,
)
)
self.norm_layers_0.append(LayerNorm(hidden_channels))
self.encdec_attn_layers.append(
MultiHeadAttention(
hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout
)
)
self.norm_layers_1.append(LayerNorm(hidden_channels))
self.ffn_layers.append(
FFN(
hidden_channels,
hidden_channels,
filter_channels,
kernel_size,
p_dropout=p_dropout,
causal=True,
)
)
self.norm_layers_2.append(LayerNorm(hidden_channels))
def forward(self, x, x_mask, h, h_mask):
# x: decoder input
# h: encoder output
self_attn_mask = utils.subsequent_mask(x_mask.size(2)).to(
device=x.device, dtype=x.dtype
)
encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
x = x * x_mask
for i in range(self.n_layers):
y = self.self_attn_layers[i](x, x, self_attn_mask)
y = self.drop(y)
x = self.norm_layers_0[i](x + y)
y = self.encdec_attn_layers[i](x, h, encdec_attn_mask)
y = self.drop(y)
x = self.norm_layers_1[i](x + y)
y = self.ffn_layers[i](x, x_mask)
y = self.drop(y)
x = self.norm_layers_2[i](x + y)
x = x * x_mask
return x
"""
class MultiHeadAttention(nn.Module):
def __init__(

161
rvc/encoders.py Normal file
View File

@@ -0,0 +1,161 @@
import math
from typing import Tuple
import torch
from torch import nn
from .attentions import MultiHeadAttention, FFN
from .norms import LayerNorm
from .utils import sequence_mask
class Encoder(nn.Module):
def __init__(
self,
hidden_channels: int,
filter_channels: int,
n_heads: int,
n_layers: int,
kernel_size: int = 1,
p_dropout: float = 0.0,
window_size: int = 10,
):
super(Encoder, self).__init__()
self.hidden_channels = hidden_channels
self.filter_channels = filter_channels
self.n_heads = n_heads
self.n_layers = n_layers
self.kernel_size = kernel_size
self.p_dropout = p_dropout
self.window_size = window_size
self.drop = nn.Dropout(p_dropout)
self.attn_layers = nn.ModuleList()
self.norm_layers_1 = nn.ModuleList()
self.ffn_layers = nn.ModuleList()
self.norm_layers_2 = nn.ModuleList()
for _ in range(self.n_layers):
self.attn_layers.append(
MultiHeadAttention(
hidden_channels,
hidden_channels,
n_heads,
p_dropout=p_dropout,
window_size=window_size,
)
)
self.norm_layers_1.append(LayerNorm(hidden_channels))
self.ffn_layers.append(
FFN(
hidden_channels,
hidden_channels,
filter_channels,
kernel_size,
p_dropout=p_dropout,
)
)
self.norm_layers_2.append(LayerNorm(hidden_channels))
def __call__(self, x: torch.Tensor, x_mask: torch.Tensor) -> torch.Tensor:
return super().__call__(x, x_mask)
def forward(self, x: torch.Tensor, x_mask: torch.Tensor) -> torch.Tensor:
attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
x = x * x_mask
for attn, norm1, ffn, norm2 in zip(
self.attn_layers,
self.norm_layers_1,
self.ffn_layers,
self.norm_layers_2,
):
y = attn(x, x, attn_mask)
y = self.drop(y)
x = norm1(x + y)
y = ffn(x, x_mask)
y = self.drop(y)
x = norm2(x + y)
x = x * x_mask
return x
class TextEncoder(nn.Module):
def __init__(
self,
in_channels: int,
out_channels: int,
hidden_channels: int,
filter_channels: int,
n_heads: int,
n_layers: int,
kernel_size: int,
p_dropout: float,
f0: bool = True,
):
super(TextEncoder, self).__init__()
self.out_channels = out_channels
self.hidden_channels = hidden_channels
self.filter_channels = filter_channels
self.n_heads = n_heads
self.n_layers = n_layers
self.kernel_size = kernel_size
self.p_dropout = float(p_dropout)
self.emb_phone = nn.Linear(in_channels, hidden_channels)
self.lrelu = nn.LeakyReLU(0.1, inplace=True)
if f0 == True:
self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
self.encoder = Encoder(
hidden_channels,
filter_channels,
n_heads,
n_layers,
kernel_size,
float(p_dropout),
)
self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
def __call__(
self,
phone: torch.Tensor,
pitch: torch.Tensor,
lengths: torch.Tensor,
# skip_head: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
return super().__call__(
phone,
pitch,
lengths,
# skip_head=skip_head,
)
def forward(
self,
phone: torch.Tensor,
pitch: torch.Tensor,
lengths: torch.Tensor,
# skip_head: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
x = self.emb_phone(phone)
if pitch is not None:
x += self.emb_pitch(pitch)
x = x * math.sqrt(self.hidden_channels) # [b, t, h]
x = self.lrelu(x)
x = torch.transpose(x, 1, -1) # [b, h, t]
x_mask = torch.unsqueeze(
sequence_mask(lengths, x.size(2)), 1,
).to(x.dtype)
x = self.encoder(x * x_mask, x_mask)
"""
if skip_head is not None:
assert isinstance(skip_head, torch.Tensor)
head = int(skip_head.item())
x = x[:, :, head:]
x_mask = x_mask[:, :, head:]
"""
stats: torch.Tensor = self.proj(x) * x_mask
m, logs = torch.split(stats, self.out_channels, dim=1)
return m, logs, x_mask

18
rvc/norms.py Normal file
View File

@@ -0,0 +1,18 @@
import torch
from torch import nn
from torch.nn import functional as F
class LayerNorm(nn.Module):
def __init__(self, channels: int, eps: float = 1e-5):
super(LayerNorm, self).__init__()
self.channels = channels
self.eps = eps
self.gamma = nn.Parameter(torch.ones(channels))
self.beta = nn.Parameter(torch.zeros(channels))
def forward(self, x: torch.Tensor):
x = x.transpose(1, -1)
x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
return x.transpose(1, -1)

View File

@@ -4,7 +4,7 @@ import onnxruntime
import typing
import os
from onnx.f0predictor import (
from onnx.f0predictors import (
PMF0Predictor,
HarvestF0Predictor,
DioF0Predictor,