1
0
mirror of https://github.com/fumiama/Retrieval-based-Voice-Conversion-WebUI.git synced 2026-06-08 20:10:44 +08:00

optimize(rvc): move commons to rvc.utils

- remove redundant attentions_onnx
- shrink models_onnx
- add some type annotations to rvc.utils
This commit is contained in:
源文雨
2024-06-07 00:42:35 +09:00
parent 6f90ce3046
commit 5eed789fe7
8 changed files with 186 additions and 1477 deletions

View File

@@ -1,17 +1,18 @@
import math
import logging
from typing import Optional
from typing import Optional, Tuple, List
from rvc import utils
logger = logging.getLogger(__name__)
import numpy as np
import torch
from torch import nn
from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d
from torch.nn import Conv1d, Conv2d, ConvTranspose1d
from torch.nn import functional as F
from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm
from infer.lib.infer_pack import attentions, commons, modules
from infer.lib.infer_pack.commons import get_padding, init_weights
from infer.lib.infer_pack import attentions, modules
from rvc.utils import get_padding, call_weight_data_normal_if_Conv
has_xpu = bool(hasattr(torch, "xpu") and torch.xpu.is_available())
@@ -51,13 +52,25 @@ class TextEncoder(nn.Module):
)
self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
def __call__(
    self,
    phone: torch.Tensor,
    pitch: Optional[torch.Tensor],
    lengths: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Typed entry point that delegates to ``nn.Module.__call__``.

    This override adds no logic of its own; it gives callers and type
    checkers a precise signature for ``encoder(phone, pitch, lengths)``.

    Args:
        phone: Phone features handed to ``forward``.
        pitch: Pitch input handed to ``forward``. ``None`` is accepted:
            ``forward`` has a dedicated ``pitch is None`` branch that
            embeds ``phone`` only.
        lengths: Lengths handed to ``forward``, where they are used to
            build the sequence mask.

    Returns:
        The ``(m, logs, x_mask)`` tuple returned by ``forward``.
    """
    # NOTE: the optional `skip_head` argument is intentionally not exposed
    # here; it is currently disabled in `forward` as well.
    return super().__call__(phone, pitch, lengths)
def forward(
self,
phone: torch.Tensor,
pitch: torch.Tensor,
lengths: torch.Tensor,
skip_head: Optional[torch.Tensor] = None,
):
# skip_head: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
if pitch is None:
x = self.emb_phone(phone)
else:
@@ -65,15 +78,19 @@ class TextEncoder(nn.Module):
x = x * math.sqrt(self.hidden_channels) # [b, t, h]
x = self.lrelu(x)
x = torch.transpose(x, 1, -1) # [b, h, t]
x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
x.dtype
)
x_mask = torch.unsqueeze(
utils.sequence_mask(
lengths, x.size(2),
), 1,
).to(x.dtype)
x = self.encoder(x * x_mask, x_mask)
"""
if skip_head is not None:
assert isinstance(skip_head, torch.Tensor)
head = int(skip_head.item())
x = x[:, :, head:]
x_mask = x_mask[:, :, head:]
"""
stats = self.proj(x) * x_mask
m, logs = torch.split(stats, self.out_channels, dim=1)
return m, logs, x_mask
@@ -125,7 +142,7 @@ class ResidualCouplingBlock(nn.Module):
for flow in self.flows:
x, _ = flow(x, x_mask, g=g, reverse=reverse)
else:
for flow in self.flows[::-1]:
for flow in reversed(self.flows):
x, _ = flow.forward(x, x_mask, g=g, reverse=reverse)
return x
@@ -175,12 +192,19 @@ class PosteriorEncoder(nn.Module):
)
self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
def __call__(
    self,
    x: torch.Tensor,
    x_lengths: torch.Tensor,
    g: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    """Typed entry point that delegates to ``nn.Module.__call__``.

    This override adds no logic of its own; it gives callers and type
    checkers a precise signature for ``encoder(x, x_lengths, g=g)``.

    Args:
        x: Input tensor handed to ``forward``.
        x_lengths: Lengths handed to ``forward``, where they are used to
            build the sequence mask.
        g: Optional conditioning tensor, forwarded by keyword.

    Returns:
        The four-tensor tuple produced by ``forward``; callers unpack it
        as ``z, m_q, logs_q, y_mask``.
    """
    # The result has to be returned explicitly: without `return` this
    # wrapper evaluates to None and every caller that unpacks the four
    # outputs fails.
    return super().__call__(x, x_lengths, g=g)
def forward(
self, x: torch.Tensor, x_lengths: torch.Tensor, g: Optional[torch.Tensor] = None
):
x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
x.dtype
)
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
x_mask = torch.unsqueeze(
utils.sequence_mask(
x_lengths, x.size(2),
), 1,
).to(x.dtype)
x = self.pre(x) * x_mask
x = self.enc(x, x_mask, g=g)
stats = self.proj(x) * x_mask
@@ -244,7 +268,7 @@ class Generator(torch.nn.Module):
self.resblocks.append(resblock(ch, k, d))
self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
self.ups.apply(init_weights)
self.ups.apply(call_weight_data_normal_if_Conv)
if gin_channels != 0:
self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
@@ -253,13 +277,15 @@ class Generator(torch.nn.Module):
self,
x: torch.Tensor,
g: Optional[torch.Tensor] = None,
n_res: Optional[torch.Tensor] = None,
# n_res: Optional[torch.Tensor] = None,
):
"""
if n_res is not None:
assert isinstance(n_res, torch.Tensor)
n = int(n_res.item())
if n != x.shape[-1]:
x = F.interpolate(x, size=n, mode="linear")
"""
x = self.conv_pre(x)
if g is not None:
x = x + self.cond(g)
@@ -529,7 +555,7 @@ class GeneratorNSF(torch.nn.Module):
self.resblocks.append(resblock(ch, k, d))
self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
self.ups.apply(init_weights)
self.ups.apply(call_weight_data_normal_if_Conv)
if gin_channels != 0:
self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
@@ -543,10 +569,11 @@ class GeneratorNSF(torch.nn.Module):
x,
f0,
g: Optional[torch.Tensor] = None,
n_res: Optional[torch.Tensor] = None,
# n_res: Optional[torch.Tensor] = None,
):
har_source, noi_source, uv = self.m_source(f0, self.upp)
har_source = har_source.transpose(1, 2)
"""
if n_res is not None:
assert isinstance(n_res, torch.Tensor)
n = int(n_res.item())
@@ -554,6 +581,7 @@ class GeneratorNSF(torch.nn.Module):
har_source = F.interpolate(har_source, size=n * self.upp, mode="linear")
if n != x.shape[-1]:
x = F.interpolate(x, size=n, mode="linear")
"""
x = self.conv_pre(x)
if g is not None:
x = x + self.cond(g)
@@ -611,39 +639,35 @@ class GeneratorNSF(torch.nn.Module):
return self
sr2sr = {
"32k": 32000,
"40k": 40000,
"48k": 48000,
}
class SynthesizerTrnMs256NSFsid(nn.Module):
def __init__(
self,
spec_channels,
segment_size,
inter_channels,
hidden_channels,
filter_channels,
n_heads,
n_layers,
kernel_size,
p_dropout,
resblock,
resblock_kernel_sizes,
resblock_dilation_sizes,
upsample_rates,
upsample_initial_channel,
upsample_kernel_sizes,
spk_embed_dim,
gin_channels,
sr,
segment_size: int,
inter_channels: int,
hidden_channels: int,
filter_channels: int,
n_heads: int,
n_layers: int,
kernel_size: int,
p_dropout: int,
resblock: str,
resblock_kernel_sizes: List[int],
resblock_dilation_sizes: List[List[int]],
upsample_rates: List[int],
upsample_initial_channel: int,
upsample_kernel_sizes: List[int],
spk_embed_dim: int,
gin_channels: int,
sr: str | int,
**kwargs
):
super(SynthesizerTrnMs256NSFsid, self).__init__()
if isinstance(sr, str):
sr = sr2sr[sr]
if isinstance(sr, str): sr = {
"32k": 32000,
"40k": 40000,
"48k": 48000,
}[sr]
self.spec_channels = spec_channels
self.inter_channels = inter_channels
self.hidden_channels = hidden_channels
@@ -752,11 +776,11 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
z_p = self.flow(z, y_mask, g=g)
z_slice, ids_slice = commons.rand_slice_segments(
z_slice, ids_slice = utils.rand_slice_segments(
z, y_lengths, self.segment_size
)
# print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length)
pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
pitchf = utils.slice_on_last_dim(pitchf, ids_slice, self.segment_size)
# print(-2,pitchf.shape,z_slice.shape)
o = self.dec(z_slice, pitchf, g=g)
return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
@@ -771,7 +795,7 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
sid: torch.Tensor,
skip_head: Optional[torch.Tensor] = None,
return_length: Optional[torch.Tensor] = None,
return_length2: Optional[torch.Tensor] = None,
# return_length2: Optional[torch.Tensor] = None,
):
g = self.emb_g(sid).unsqueeze(-1)
if skip_head is not None and return_length is not None:
@@ -791,7 +815,10 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
z = self.flow(z_p, x_mask, g=g, reverse=True)
o = self.dec(z * x_mask, nsff0, g=g, n_res=return_length2)
o = self.dec(
z * x_mask, nsff0, g=g,
# n_res=return_length2,
)
return o, x_mask, (z, z_p, m_p, logs_p)
@@ -973,7 +1000,7 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
z_p = self.flow(z, y_mask, g=g)
z_slice, ids_slice = commons.rand_slice_segments(
z_slice, ids_slice = utils.rand_slice_segments(
z, y_lengths, self.segment_size
)
o = self.dec(z_slice, g=g)
@@ -987,7 +1014,7 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
sid: torch.Tensor,
skip_head: Optional[torch.Tensor] = None,
return_length: Optional[torch.Tensor] = None,
return_length2: Optional[torch.Tensor] = None,
#return_length2: Optional[torch.Tensor] = None,
):
g = self.emb_g(sid).unsqueeze(-1)
if skip_head is not None and return_length is not None:
@@ -1006,7 +1033,10 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
z = self.flow(z_p, x_mask, g=g, reverse=True)
o = self.dec(z * x_mask, g=g, n_res=return_length2)
o = self.dec(
z * x_mask, g=g,
# n_res=return_length2
)
return o, x_mask, (z, z_p, m_p, logs_p)