diff --git a/infer/lib/infer_pack/attentions.py b/infer/lib/infer_pack/attentions.py index fb43440..f18212f 100644 --- a/infer/lib/infer_pack/attentions.py +++ b/infer/lib/infer_pack/attentions.py @@ -73,6 +73,7 @@ class Encoder(nn.Module): x = x * x_mask return x + """ class Decoder(nn.Module): def __init__( @@ -158,6 +159,7 @@ class Decoder(nn.Module): return x """ + class MultiHeadAttention(nn.Module): def __init__( self, diff --git a/infer/lib/infer_pack/models.py b/infer/lib/infer_pack/models.py index 89f3142..d9bca42 100644 --- a/infer/lib/infer_pack/models.py +++ b/infer/lib/infer_pack/models.py @@ -60,7 +60,9 @@ class TextEncoder(nn.Module): # skip_head: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: return super().__call__( - phone, pitch, lengths, + phone, + pitch, + lengths, # skip_head=skip_head, ) @@ -80,8 +82,10 @@ class TextEncoder(nn.Module): x = torch.transpose(x, 1, -1) # [b, h, t] x_mask = torch.unsqueeze( utils.sequence_mask( - lengths, x.size(2), - ), 1, + lengths, + x.size(2), + ), + 1, ).to(x.dtype) x = self.encoder(x * x_mask, x_mask) """ @@ -193,17 +197,19 @@ class PosteriorEncoder(nn.Module): self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) def __call__( - self, x: torch.Tensor, x_lengths: torch.Tensor, g: Optional[torch.Tensor] = None + self, x: torch.Tensor, x_lengths: torch.Tensor, g: Optional[torch.Tensor] = None ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - super().__call__(x, x_lengths, g = g) + super().__call__(x, x_lengths, g=g) def forward( self, x: torch.Tensor, x_lengths: torch.Tensor, g: Optional[torch.Tensor] = None ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: x_mask = torch.unsqueeze( utils.sequence_mask( - x_lengths, x.size(2), - ), 1, + x_lengths, + x.size(2), + ), + 1, ).to(x.dtype) x = self.pre(x) * x_mask x = self.enc(x, x_mask, g=g) @@ -663,11 +669,12 @@ class SynthesizerTrnMs256NSFsid(nn.Module): **kwargs ): super(SynthesizerTrnMs256NSFsid, self).__init__() - if isinstance(sr, str): sr = { - "32k": 32000, - "40k": 40000, - "48k": 48000, - }[sr] + if isinstance(sr, str): + sr = { + "32k": 32000, + "40k": 40000, + "48k": 48000, + }[sr] self.spec_channels = spec_channels self.inter_channels = inter_channels self.hidden_channels = hidden_channels @@ -776,9 +783,7 @@ class SynthesizerTrnMs256NSFsid(nn.Module): m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) z_p = self.flow(z, y_mask, g=g) - z_slice, ids_slice = utils.rand_slice_segments( - z, y_lengths, self.segment_size - ) + z_slice, ids_slice = utils.rand_slice_segments(z, y_lengths, self.segment_size) # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length) pitchf = utils.slice_on_last_dim(pitchf, ids_slice, self.segment_size) # print(-2,pitchf.shape,z_slice.shape) @@ -816,7 +821,9 @@ class SynthesizerTrnMs256NSFsid(nn.Module): z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask z = self.flow(z_p, x_mask, g=g, reverse=True) o = self.dec( - z * x_mask, nsff0, g=g, + z * x_mask, + nsff0, + g=g, # n_res=return_length2, ) return o, x_mask, (z, z_p, m_p, logs_p) @@ -1000,9 +1007,7 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module): m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) z_p = self.flow(z, y_mask, g=g) - z_slice, ids_slice = utils.rand_slice_segments( - z, y_lengths, self.segment_size - ) + z_slice, ids_slice = utils.rand_slice_segments(z, y_lengths, self.segment_size) o = self.dec(z_slice, g=g) return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) @@ -1014,7 +1019,7 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module): sid: torch.Tensor, skip_head: Optional[torch.Tensor] = None, return_length: Optional[torch.Tensor] = None, - #return_length2: Optional[torch.Tensor] = None, + # return_length2: Optional[torch.Tensor] = None, ): g = self.emb_g(sid).unsqueeze(-1) if skip_head is not None and return_length is not None: @@ -1034,7 +1039,8 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module): z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask z = self.flow(z_p, x_mask, g=g, reverse=True) o = self.dec( - z * x_mask, g=g, + z * x_mask, + g=g, # n_res=return_length2 ) return o, x_mask, (z, z_p, m_p, logs_p) diff --git a/infer/lib/infer_pack/models_onnx.py b/infer/lib/infer_pack/models_onnx.py index 63c2caa..00a42a8 100644 --- a/infer/lib/infer_pack/models_onnx.py +++ b/infer/lib/infer_pack/models_onnx.py @@ -1,7 +1,15 @@ import torch from torch import nn -from .attentions import TextEncoder, ResidualCouplingBlock, PosteriorEncoder, Generator, SineGen, SourceModuleHnNSF, GeneratorNSF +from .attentions import ( + TextEncoder, + ResidualCouplingBlock, + PosteriorEncoder, + Generator, + SineGen, + SourceModuleHnNSF, + GeneratorNSF, +) class SynthesizerTrnMsNSFsidM(nn.Module): @@ -29,11 +37,12 @@ class SynthesizerTrnMsNSFsidM(nn.Module): **kwargs ): super(SynthesizerTrnMsNSFsidM, self).__init__() - if isinstance(sr, str): sr = { - "32k": 32000, - "40k": 40000, - "48k": 48000, - }[sr] + if isinstance(sr, str): + sr = { + "32k": 32000, + "40k": 40000, + "48k": 48000, + }[sr] self.spec_channels = spec_channels self.inter_channels = inter_channels self.hidden_channels = hidden_channels diff --git a/rvc/utils.py b/rvc/utils.py index 4cf5cef..94b52e5 100644 --- a/rvc/utils.py +++ b/rvc/utils.py @@ -2,11 +2,12 @@ from typing import List, Optional, Tuple import torch + def call_weight_data_normal_if_Conv(m: torch.nn.Module): classname = m.__class__.__name__ if classname.find("Conv") != -1: - mean=0.0 - std=0.01 + mean = 0.0 + std = 0.01 m.weight.data.normal_(mean, std) @@ -15,8 +16,10 @@ def get_padding(kernel_size: int, dilation=1): def slice_on_last_dim( - x: torch.Tensor, start_indices: List[int], segment_size=4, - ) -> torch.Tensor: + x: torch.Tensor, + start_indices: List[int], + segment_size=4, +) -> torch.Tensor: new_shape = x.shape new_shape[-1] = segment_size ret = torch.empty(new_shape) @@ -28,10 +31,13 @@ def slice_on_last_dim( def rand_slice_segments( - x: torch.Tensor, x_lengths: int = None, segment_size=4, - ) -> Tuple[torch.Tensor, List[int]]: + x: torch.Tensor, + x_lengths: int = None, + segment_size=4, +) -> Tuple[torch.Tensor, List[int]]: b, _, t = x.size() - if x_lengths is None: x_lengths = t + if x_lengths is None: + x_lengths = t ids_str_max = x_lengths - segment_size + 1 ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) ret = slice_on_last_dim(x, ids_str, segment_size) @@ -53,8 +59,9 @@ def convert_pad_shape(pad_shape: List[List[int]]) -> List[int]: def sequence_mask( - length: torch.Tensor, max_length: Optional[int] = None, - ) -> torch.BoolTensor: + length: torch.Tensor, + max_length: Optional[int] = None, +) -> torch.BoolTensor: if max_length is None: max_length = int(length.max()) x = torch.arange(max_length, dtype=length.dtype, device=length.device)