diff --git a/infer/modules/train/extract_f0_print.py b/infer/modules/train/extract_f0_print.py index 87b6b14..d0c3c03 100644 --- a/infer/modules/train/extract_f0_print.py +++ b/infer/modules/train/extract_f0_print.py @@ -2,6 +2,7 @@ import os import sys import traceback from pathlib import Path +import importlib.util from dotenv import load_dotenv @@ -38,6 +39,8 @@ f0method = sys.argv[3] device = sys.argv[4] is_half = sys.argv[5] == "True" +if importlib.util.find_spec("torch_directml") is not None: + import torch_directml # use side effect class FeatureInput(object): def __init__(self, is_half: bool, device="cpu", samplerate=16000, hop_size=160): diff --git a/rvc/f0/f0.py b/rvc/f0/f0.py index 0f615e6..c79c4f8 100644 --- a/rvc/f0/f0.py +++ b/rvc/f0/f0.py @@ -11,14 +11,14 @@ class F0Predictor(object): f0_min=50, f0_max=1100, sampling_rate=44100, - device: Optional[str] = None, + device: Optional[Union[str, torch.device]] = None, ): self.hop_length = hop_length self.f0_min = f0_min self.f0_max = f0_max self.sampling_rate = sampling_rate - if device is None: - device = "cuda:0" if torch.cuda.is_available() else "cpu" + if not device: + device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") self.device = device def compute_f0( diff --git a/rvc/f0/mel.py b/rvc/f0/mel.py index 2c06800..ee98b15 100644 --- a/rvc/f0/mel.py +++ b/rvc/f0/mel.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Union import torch import numpy as np @@ -17,9 +17,9 @@ class MelSpectrogram(torch.nn.Module): hop_length: int, n_fft: Optional[int] = None, mel_fmin: int = 0, - mel_fmax: int = None, + mel_fmax: Optional[int] = None, clamp: float = 1e-5, - device=torch.device("cpu"), + device: Union[str, torch.device] = torch.device("cpu"), ): super().__init__() if n_fft is None: diff --git a/rvc/f0/rmvpe.py b/rvc/f0/rmvpe.py index 33eac0a..02d4f0c 100644 --- a/rvc/f0/rmvpe.py +++ b/rvc/f0/rmvpe.py @@ -1,6 +1,6 @@ from io import BytesIO import os -from typing import Any, Optional, Union +from typing import Optional, Union import numpy as np import torch diff --git a/rvc/layers/attentions.py b/rvc/layers/attentions.py index 22b626d..3e1fd01 100644 --- a/rvc/layers/attentions.py +++ b/rvc/layers/attentions.py @@ -12,8 +12,8 @@ class MultiHeadAttention(nn.Module): channels: int, out_channels: int, n_heads: int, + window_size: int, p_dropout: float = 0.0, - window_size: Optional[int] = None, heads_share: bool = True, block_length: Optional[int] = None, proximal_bias: bool = False, diff --git a/rvc/layers/encoders.py b/rvc/layers/encoders.py index 1fc2478..eeaa3fd 100644 --- a/rvc/layers/encoders.py +++ b/rvc/layers/encoders.py @@ -42,8 +42,8 @@ class Encoder(nn.Module): hidden_channels, hidden_channels, n_heads, + window_size, p_dropout=p_dropout, - window_size=window_size, ) ) self.norm_layers_1.append(LayerNorm(hidden_channels)) @@ -121,7 +121,7 @@ class TextEncoder(nn.Module): def __call__( self, phone: torch.Tensor, - pitch: torch.Tensor, + pitch: Optional[torch.Tensor], lengths: torch.Tensor, skip_head: Optional[int] = None, ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: @@ -135,7 +135,7 @@ class TextEncoder(nn.Module): def forward( self, phone: torch.Tensor, - pitch: torch.Tensor, + pitch: Optional[torch.Tensor], lengths: torch.Tensor, skip_head: Optional[int] = None, ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: diff --git a/rvc/layers/generators.py b/rvc/layers/generators.py index 185f939..d78d6cd 100644 --- a/rvc/layers/generators.py +++ b/rvc/layers/generators.py @@ -46,6 +46,7 @@ class Generator(torch.nn.Module): self.resblocks = nn.ModuleList() resblock_module = ResBlock1 if resblock == "1" else ResBlock2 + ch = 0 for i in range(len(self.ups)): ch = upsample_initial_channel // (2 ** (i + 1)) for k, d in zip(resblock_kernel_sizes, resblock_dilation_sizes): diff --git a/rvc/layers/norms.py b/rvc/layers/norms.py index 9418035..4b07143 100644 --- a/rvc/layers/norms.py +++ b/rvc/layers/norms.py @@ -30,7 +30,7 @@ class WN(torch.nn.Module): dilation_rate: int, n_layers: int, gin_channels: int = 0, - p_dropout: int = 0, + p_dropout: float = 0, ): super(WN, self).__init__() assert kernel_size % 2 == 1 diff --git a/rvc/layers/nsf.py b/rvc/layers/nsf.py index 5e9e35a..22fd968 100644 --- a/rvc/layers/nsf.py +++ b/rvc/layers/nsf.py @@ -1,4 +1,4 @@ -from typing import Optional, List +from typing import Optional, List, Union import math import torch @@ -83,7 +83,7 @@ class NSFGenerator(torch.nn.Module): self.conv_pre = Conv1d( initial_channel, upsample_initial_channel, 7, 1, padding=3 ) - resblock = ResBlock1 if resblock == "1" else ResBlock2 + resblockcls = ResBlock1 if resblock == "1" else ResBlock2 self.ups = nn.ModuleList() for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): @@ -114,12 +114,13 @@ class NSFGenerator(torch.nn.Module): self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1)) self.resblocks = nn.ModuleList() + ch = 0 for i in range(len(self.ups)): - ch: int = upsample_initial_channel // (2 ** (i + 1)) + ch = upsample_initial_channel // (2 ** (i + 1)) for j, (k, d) in enumerate( zip(resblock_kernel_sizes, resblock_dilation_sizes) ): - self.resblocks.append(resblock(ch, k, d)) + self.resblocks.append(resblockcls(ch, k, d)) self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) self.ups.apply(call_weight_data_normal_if_Conv) diff --git a/rvc/layers/residuals.py b/rvc/layers/residuals.py index 4a135ce..45b3d11 100644 --- a/rvc/layers/residuals.py +++ b/rvc/layers/residuals.py @@ -20,7 +20,7 @@ class ResBlock1(torch.nn.Module): self, channels: int, kernel_size: int = 3, - dilation: List[int] = (1, 3, 5), + dilation: List[int] = [1, 3, 5], ): super(ResBlock1, self).__init__() @@ -117,7 +117,7 @@ class ResBlock2(torch.nn.Module): self, channels: int, kernel_size=3, - dilation: List[int] = (1, 3), + dilation: List[int] = [1, 3], ): super(ResBlock2, self).__init__() self.convs = nn.ModuleList() @@ -182,7 +182,7 @@ class ResidualCouplingLayer(nn.Module): kernel_size: int, dilation_rate: int, n_layers: int, - p_dropout: int = 0, + p_dropout: float = 0, gin_channels: int = 0, mean_only: bool = False, ): diff --git a/rvc/layers/synthesizers.py b/rvc/layers/synthesizers.py index 474781e..2bb7ea4 100644 --- a/rvc/layers/synthesizers.py +++ b/rvc/layers/synthesizers.py @@ -34,7 +34,7 @@ class SynthesizerTrnMsNSFsid(nn.Module): upsample_kernel_sizes: List[int], spk_embed_dim: int, gin_channels: int, - sr: Optional[Union[str, int]], + sr: Union[str, int], encoder_dim: int, use_f0: bool, ): @@ -143,7 +143,7 @@ class SynthesizerTrnMsNSFsid(nn.Module): torch.nn.utils.remove_weight_norm(self.enc_q) return self - @torch.jit.ignore + @torch.jit.ignore() def forward( self, phone: torch.Tensor, @@ -155,18 +155,20 @@ class SynthesizerTrnMsNSFsid(nn.Module): pitchf: Optional[torch.Tensor] = None, ): # 这里ds是id,[bs,1] # print(1,pitch.shape)#[bs,t] - g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的 + embg = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的 m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) - z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) - z_p = self.flow(z, y_mask, g=g) + z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=embg) + z_p = self.flow(z, y_mask, g=embg) z_slice, ids_slice = rand_slice_segments_on_last_dim( z, y_lengths, self.segment_size ) - if pitchf is not None: + if pitchf is not None and isinstance(self.dec, NSFGenerator): pitchf = slice_on_last_dim(pitchf, ids_slice, self.segment_size) - o = self.dec(z_slice, pitchf, g=g) + o = self.dec(z_slice, pitchf, g=embg) # type: ignore + elif isinstance(self.dec, Generator): + o = self.dec(z_slice, g=embg) else: - o = self.dec(z_slice, g=g) + raise KeyError(f"unknown dec type: {type(self.dec).__name__}") return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) @torch.jit.export @@ -201,15 +203,17 @@ class SynthesizerTrnMsNSFsid(nn.Module): z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask z = self.flow(z_p, x_mask, g=g, reverse=True) del z_p, m_p, logs_p - if pitchf is not None: - o = self.dec( + if pitchf is not None and isinstance(self.dec, NSFGenerator): + o = self.dec( z * x_mask, pitchf, g=g, n_res=return_length2, ) - else: + elif isinstance(self.dec, Generator): o = self.dec(z * x_mask, g=g, n_res=return_length2) + else: + raise KeyError(f"unknown dec type: {type(self.dec).__name__}") del x_mask, z return o # , x_mask, (z, z_p, m_p, logs_p) @@ -326,7 +330,7 @@ class SynthesizerTrnMs256NSFsid_nono(SynthesizerTrnMsNSFsid): upsample_kernel_sizes: List[int], spk_embed_dim: int, gin_channels: int, - sr=None, + sr: Union[str, int], ): super().__init__( spec_channels, @@ -346,6 +350,7 @@ class SynthesizerTrnMs256NSFsid_nono(SynthesizerTrnMsNSFsid): upsample_kernel_sizes, spk_embed_dim, gin_channels, + sr, 256, False, ) @@ -371,7 +376,7 @@ class SynthesizerTrnMs768NSFsid_nono(SynthesizerTrnMsNSFsid): upsample_kernel_sizes: List[int], spk_embed_dim: int, gin_channels: int, - sr=None, + sr: Union[str, int], ): super().__init__( spec_channels, @@ -391,6 +396,7 @@ class SynthesizerTrnMs768NSFsid_nono(SynthesizerTrnMsNSFsid): upsample_kernel_sizes, spk_embed_dim, gin_channels, + sr, 768, False, ) diff --git a/rvc/layers/utils.py b/rvc/layers/utils.py index 418578b..bf5a9ec 100644 --- a/rvc/layers/utils.py +++ b/rvc/layers/utils.py @@ -1,4 +1,4 @@ -from typing import List, Optional, Tuple, Iterator +from typing import List, Optional, Tuple, Iterator, Union import torch @@ -17,7 +17,7 @@ def get_padding(kernel_size: int, dilation=1) -> int: def slice_on_last_dim( x: torch.Tensor, - start_indices: List[int], + start_indices: Union[List[int], torch.Tensor], segment_size=4, ) -> torch.Tensor: new_shape = [*x.shape] @@ -32,9 +32,9 @@ def slice_on_last_dim( def rand_slice_segments_on_last_dim( x: torch.Tensor, - x_lengths: int = None, + x_lengths: Optional[Union[int, torch.Tensor]] = None, segment_size=4, -) -> Tuple[torch.Tensor, List[int]]: +) -> Tuple[torch.Tensor, Union[List[int], torch.Tensor]]: b, _, t = x.size() if x_lengths is None: x_lengths = t @@ -58,7 +58,7 @@ def activate_add_tanh_sigmoid_multiply( def sequence_mask( length: torch.Tensor, max_length: Optional[int] = None, -) -> torch.BoolTensor: +): if max_length is None: max_length = int(length.max()) x = torch.arange(max_length, dtype=length.dtype, device=length.device) diff --git a/rvc/onnx/synthesizer.py b/rvc/onnx/synthesizer.py index e8bf516..9dafce8 100644 --- a/rvc/onnx/synthesizer.py +++ b/rvc/onnx/synthesizer.py @@ -1,4 +1,4 @@ -from typing import List, Optional, Union +from typing import List, Union import torch @@ -25,7 +25,7 @@ class SynthesizerTrnMsNSFsid(SynthesizerBase): upsample_kernel_sizes: List[int], spk_embed_dim: int, gin_channels: int, - sr: Optional[Union[str, int]], + sr: Union[str, int], encoder_dim: int, ): super().__init__(