optimize(rvc): gather residuals

2026-07-17 02:40:35 +08:00 · 2024-06-08 00:44:46 +09:00
parent eb24434260
commit b91dcf2261
9 changed files with 210 additions and 200 deletions
--- a/infer/lib/infer_pack/models.py
+++ b/infer/lib/infer_pack/models.py
@@ -9,6 +9,7 @@ from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm
 from rvc import residuals

 from rvc.norms import WN
+from rvc.residuals import ResidualCouplingBlock
 from rvc.utils import (
    get_padding,
    call_weight_data_normal_if_Conv,
@@ -21,92 +22,6 @@ from rvc.encoders import TextEncoder
 has_xpu = bool(hasattr(torch, "xpu") and torch.xpu.is_available())


-class ResidualCouplingBlock(nn.Module):
-    class Flip(nn.Module):
-        """
-        torch.jit.script() Compiled functions
-        can't take variable number of arguments or
-        use keyword-only arguments with defaults
-        """
-        def forward(
-            self,
-            x: torch.Tensor,
-            x_mask: torch.Tensor,
-            g: Optional[torch.Tensor] = None,
-            reverse: bool = False,
-        ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
-            x = torch.flip(x, [1])
-            if not reverse:
-                logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
-                return x, logdet
-            else:
-                return x, torch.zeros([1], device=x.device)
-
-    def __init__(
-        self,
-        channels,
-        hidden_channels,
-        kernel_size,
-        dilation_rate,
-        n_layers,
-        n_flows=4,
-        gin_channels=0,
-    ):
-        super(ResidualCouplingBlock, self).__init__()
-        self.channels = channels
-        self.hidden_channels = hidden_channels
-        self.kernel_size = kernel_size
-        self.dilation_rate = dilation_rate
-        self.n_layers = n_layers
-        self.n_flows = n_flows
-        self.gin_channels = gin_channels
-
-        self.flows = nn.ModuleList()
-        for i in range(n_flows):
-            self.flows.append(
-                residuals.ResidualCouplingLayer(
-                    channels,
-                    hidden_channels,
-                    kernel_size,
-                    dilation_rate,
-                    n_layers,
-                    gin_channels=gin_channels,
-                    mean_only=True,
-                )
-            )
-            self.flows.append(self.Flip())
-
-    def forward(
-        self,
-        x: torch.Tensor,
-        x_mask: torch.Tensor,
-        g: Optional[torch.Tensor] = None,
-        reverse: bool = False,
-    ):
-        if not reverse:
-            for flow in self.flows:
-                x, _ = flow(x, x_mask, g=g, reverse=reverse)
-        else:
-            for flow in reversed(self.flows):
-                x, _ = flow.forward(x, x_mask, g=g, reverse=reverse)
-        return x
-
-    def remove_weight_norm(self):
-        for i in range(self.n_flows):
-            self.flows[i * 2].remove_weight_norm()
-
-    def __prepare_scriptable__(self):
-        for i in range(self.n_flows):
-            for hook in self.flows[i * 2]._forward_pre_hooks.values():
-                if (
-                    hook.__module__ == "torch.nn.utils.weight_norm"
-                    and hook.__class__.__name__ == "WeightNorm"
-                ):
-                    torch.nn.utils.remove_weight_norm(self.flows[i * 2])
-
-        return self
-
-
 class PosteriorEncoder(nn.Module):
    def __init__(
        self,
@@ -425,15 +340,15 @@ class SourceModuleHnNSF(torch.nn.Module):
 class GeneratorNSF(torch.nn.Module):
    def __init__(
        self,
-        initial_channel,
-        resblock,
-        resblock_kernel_sizes,
-        resblock_dilation_sizes,
-        upsample_rates,
-        upsample_initial_channel,
-        upsample_kernel_sizes,
-        gin_channels,
-        sr,
+        initial_channel: int,
+        resblock: str,
+        resblock_kernel_sizes: List[int],
+        resblock_dilation_sizes: List[List[int]],
+        upsample_rates: List[int],
+        upsample_initial_channel: int,
+        upsample_kernel_sizes: List[int],
+        gin_channels: int,
+        sr: int,
    ):
        super(GeneratorNSF, self).__init__()
        self.num_kernels = len(resblock_kernel_sizes)
@@ -479,7 +394,7 @@ class GeneratorNSF(torch.nn.Module):

        self.resblocks = nn.ModuleList()
        for i in range(len(self.ups)):
-            ch = upsample_initial_channel // (2 ** (i + 1))
+            ch: int = upsample_initial_channel // (2 ** (i + 1))
            for j, (k, d) in enumerate(
                zip(resblock_kernel_sizes, resblock_dilation_sizes)
            ):
@@ -817,7 +732,7 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
        p_dropout,
        resblock: str,
        resblock_kernel_sizes,
-        resblock_dilation_sizes,
+        resblock_dilation_sizes: List[List[int]],
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
--- a/infer/lib/infer_pack/models_onnx.py
+++ b/infer/lib/infer_pack/models_onnx.py
@@ -2,12 +2,12 @@ import torch
 from torch import nn

 from .models import (
-    ResidualCouplingBlock,
    PosteriorEncoder,
    GeneratorNSF,
 )

 from rvc.encoders import TextEncoder
+from rvc.residuals import ResidualCouplingBlock


 class SynthesizerTrnMsNSFsidM(nn.Module):
--- a/rvc/attentions.py
+++ b/rvc/attentions.py
@@ -13,9 +13,9 @@ class MultiHeadAttention(nn.Module):
        out_channels: int,
        n_heads: int,
        p_dropout: float = 0.0,
-        window_size: int | None = None,
+        window_size: Optional[int] = None,
        heads_share: bool = True,
-        block_length: int | None = None,
+        block_length: Optional[int] = None,
        proximal_bias: bool = False,
        proximal_init: bool = False,
    ):
@@ -233,7 +233,7 @@ class FFN(nn.Module):
        filter_channels: int,
        kernel_size: int,
        p_dropout: float = 0.0,
-        activation: str | None = None,
+        activation: Optional[str] = None,
        causal: bool = False,
    ):
        super(FFN, self).__init__()
--- a/rvc/onnx/f0predictors/dio.py
+++ b/rvc/onnx/f0predictors/dio.py
@@ -1,6 +1,7 @@
+from typing import Any, Optional
+
 import numpy as np
 import pyworld
-import typing

 from .f0 import F0Predictor

@@ -10,7 +11,7 @@ class DioF0Predictor(F0Predictor):
        super().__init__(hop_length, f0_min, f0_max, sampling_rate)

    def compute_f0(
-        self, wav: np.ndarray[typing.Any, np.dtype], p_len: int | None = None
+        self, wav: np.ndarray[Any, np.dtype], p_len: Optional[int] = None
    ):
        if p_len is None:
            p_len = wav.shape[0] // self.hop_length
@@ -27,7 +28,7 @@ class DioF0Predictor(F0Predictor):
        return self.__interpolate_f0(self.__resize_f0(f0, p_len))[0]

    def compute_f0_uv(
-        self, wav: np.ndarray[typing.Any, np.dtype], p_len: int | None = None
+        self, wav: np.ndarray[Any, np.dtype], p_len: Optional[int] = None
    ):
        if p_len is None:
            p_len = wav.shape[0] // self.hop_length
--- a/rvc/onnx/f0predictors/f0.py
+++ b/rvc/onnx/f0predictors/f0.py
@@ -1,5 +1,6 @@
+from typing import Any, Optional
+
 import numpy as np
-import typing


 class F0Predictor(object):
@@ -10,14 +11,14 @@ class F0Predictor(object):
        self.sampling_rate = sampling_rate

    def compute_f0(
-        self, wav: np.ndarray[typing.Any, np.dtype], p_len: int | None = None
+        self, wav: np.ndarray[Any, np.dtype], p_len: Optional[int] = None
    ): ...

    def compute_f0_uv(
-        self, wav: np.ndarray[typing.Any, np.dtype], p_len: int | None = None
+        self, wav: np.ndarray[Any, np.dtype], p_len: Optional[int] = None
    ): ...

-    def __interpolate_f0(self, f0: np.ndarray[typing.Any, np.dtype]):
+    def __interpolate_f0(self, f0: np.ndarray[Any, np.dtype]):
        """
        对F0进行插值处理
        """
@@ -55,7 +56,7 @@ class F0Predictor(object):

        return ip_data[:, 0], vuv_vector[:, 0]

-    def __resize_f0(self, x: np.ndarray[typing.Any, np.dtype], target_len: int):
+    def __resize_f0(self, x: np.ndarray[Any, np.dtype], target_len: int):
        source = np.array(x)
        source[source < 0.001] = np.nan
        target = np.interp(
--- a/rvc/onnx/f0predictors/harvest.py
+++ b/rvc/onnx/f0predictors/harvest.py
@@ -1,6 +1,7 @@
+from typing import Any, Optional
+
 import numpy as np
 import pyworld
-import typing

 from .f0 import F0Predictor

@@ -10,7 +11,7 @@ class HarvestF0Predictor(F0Predictor):
        super().__init__(hop_length, f0_min, f0_max, sampling_rate)

    def compute_f0(
-        self, wav: np.ndarray[typing.Any, np.dtype], p_len: int | None = None
+        self, wav: np.ndarray[Any, np.dtype], p_len: Optional[int] = None
    ):
        if p_len is None:
            p_len = wav.shape[0] // self.hop_length
@@ -25,7 +26,7 @@ class HarvestF0Predictor(F0Predictor):
        return self.__interpolate_f0(self.__resize_f0(f0, p_len))[0]

    def compute_f0_uv(
-        self, wav: np.ndarray[typing.Any, np.dtype], p_len: int | None = None
+        self, wav: np.ndarray[Any, np.dtype], p_len: Optional[int] = None
    ):
        if p_len is None:
            p_len = wav.shape[0] // self.hop_length
--- a/rvc/onnx/f0predictors/pm.py
+++ b/rvc/onnx/f0predictors/pm.py
@@ -1,6 +1,7 @@
+from typing import Any, Optional
+
 import numpy as np
 import parselmouth
-import typing

 from .f0 import F0Predictor

@@ -10,7 +11,7 @@ class PMF0Predictor(F0Predictor):
        super().__init__(hop_length, f0_min, f0_max, sampling_rate)

    def compute_f0(
-        self, wav: np.ndarray[typing.Any, np.dtype], p_len: int | None = None
+        self, wav: np.ndarray[Any, np.dtype], p_len: Optional[int] = None
    ):
        x = wav
        if p_len is None:
@@ -36,7 +37,7 @@ class PMF0Predictor(F0Predictor):
        return f0

    def compute_f0_uv(
-        self, wav: np.ndarray[typing.Any, np.dtype], p_len: int | None = None
+        self, wav: np.ndarray[Any, np.dtype], p_len: Optional[int] = None
    ):
        x = wav
        if p_len is None:
--- a/rvc/residuals.py
+++ b/rvc/residuals.py
@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, List, Tuple

 import torch
 from torch import nn
@@ -15,46 +15,33 @@ from .utils import (
 LRELU_SLOPE = 0.1

 class ResBlock1(torch.nn.Module):
-    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
+    def __init__(
+        self,
+        channels: int,
+        kernel_size: int = 3,
+        dilation: List[int] = (1, 3, 5),
+    ):
        super(ResBlock1, self).__init__()
-        self.convs1 = nn.ModuleList(
-            [
+
+        self.convs1 = nn.ModuleList()
+        for d in dilation:
+            self.convs1.append(
                weight_norm(
                    Conv1d(
                        channels,
                        channels,
                        kernel_size,
                        1,
-                        dilation=dilation[0],
-                        padding=get_padding(kernel_size, dilation[0]),
+                        dilation=d,
+                        padding=get_padding(kernel_size, d),
                    )
                ),
-                weight_norm(
-                    Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=dilation[1],
-                        padding=get_padding(kernel_size, dilation[1]),
-                    )
-                ),
-                weight_norm(
-                    Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=dilation[2],
-                        padding=get_padding(kernel_size, dilation[2]),
-                    )
-                ),
-            ]
-        )
+            )
        self.convs1.apply(call_weight_data_normal_if_Conv)

-        self.convs2 = nn.ModuleList(
-            [
+        self.convs2 = nn.ModuleList()
+        for _ in dilation:
+            self.convs2.append(
                weight_norm(
                    Conv1d(
                        channels,
@@ -65,32 +52,22 @@ class ResBlock1(torch.nn.Module):
                        padding=get_padding(kernel_size, 1),
                    )
                ),
-                weight_norm(
-                    Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=1,
-                        padding=get_padding(kernel_size, 1),
-                    )
-                ),
-                weight_norm(
-                    Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=1,
-                        padding=get_padding(kernel_size, 1),
-                    )
-                ),
-            ]
-        )
+            )
        self.convs2.apply(call_weight_data_normal_if_Conv)
        self.lrelu_slope = LRELU_SLOPE

-    def forward(self, x: torch.Tensor, x_mask: Optional[torch.Tensor] = None):
+    def __call__(
+        self,
+        x: torch.Tensor,
+        x_mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        return super().__call__(x, x_mask=x_mask)
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        x_mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
        for c1, c2 in zip(self.convs1, self.convs2):
            xt = F.leaky_relu(x, self.lrelu_slope)
            if x_mask is not None:
@@ -130,36 +107,46 @@ class ResBlock1(torch.nn.Module):


 class ResBlock2(torch.nn.Module):
-    def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
+    """
+    Actually this module is not used currently
+    because all configs specified "resblock": "1"
+    """
+    def __init__(
+        self,
+        channels: int,
+        kernel_size: int = 3,
+        dilation: List[int] = (1, 3),
+    ):
        super(ResBlock2, self).__init__()
-        self.convs = nn.ModuleList(
-            [
+        self.convs = nn.ModuleList()
+        for d in dilation:
+            self.convs.append(
                weight_norm(
                    Conv1d(
                        channels,
                        channels,
                        kernel_size,
                        1,
-                        dilation=dilation[0],
-                        padding=get_padding(kernel_size, dilation[0]),
+                        dilation=d,
+                        padding=get_padding(kernel_size, d),
                    )
                ),
-                weight_norm(
-                    Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=dilation[1],
-                        padding=get_padding(kernel_size, dilation[1]),
-                    )
-                ),
-            ]
-        )
+            )
        self.convs.apply(call_weight_data_normal_if_Conv)
        self.lrelu_slope = LRELU_SLOPE

-    def forward(self, x, x_mask: Optional[torch.Tensor] = None):
+    def __call__(
+        self,
+        x: torch.Tensor,
+        x_mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        return super().__call__(x, x_mask=x_mask)
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        x_mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
        for c in self.convs:
            xt = F.leaky_relu(x, self.lrelu_slope)
            if x_mask is not None:
@@ -188,14 +175,14 @@ class ResBlock2(torch.nn.Module):
 class ResidualCouplingLayer(nn.Module):
    def __init__(
        self,
-        channels,
-        hidden_channels,
-        kernel_size,
-        dilation_rate,
-        n_layers,
-        p_dropout=0,
-        gin_channels=0,
-        mean_only=False,
+        channels: int,
+        hidden_channels: int,
+        kernel_size: int,
+        dilation_rate: int,
+        n_layers: int,
+        p_dropout: float = 0,
+        gin_channels: int = 0,
+        mean_only: bool = False,
    ):
        assert channels % 2 == 0, "channels should be divisible by 2"
        super(ResidualCouplingLayer, self).__init__()
@@ -220,13 +207,22 @@ class ResidualCouplingLayer(nn.Module):
        self.post.weight.data.zero_()
        self.post.bias.data.zero_()

+    def __call__(
+        self,
+        x: torch.Tensor,
+        x_mask: torch.Tensor,
+        g: Optional[torch.Tensor] = None,
+        reverse: bool = False,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        return super().__call__(x, x_mask, g=g, reverse=reverse)
+
    def forward(
        self,
        x: torch.Tensor,
        x_mask: torch.Tensor,
        g: Optional[torch.Tensor] = None,
        reverse: bool = False,
-    ):
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
        x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
        h = self.pre(x0) * x_mask
        h = self.enc(h, x_mask, g=g)
@@ -242,10 +238,10 @@ class ResidualCouplingLayer(nn.Module):
            x = torch.cat([x0, x1], 1)
            logdet = torch.sum(logs, [1, 2])
            return x, logdet
-        else:
-            x1 = (x1 - m) * torch.exp(-logs) * x_mask
-            x = torch.cat([x0, x1], 1)
-            return x, torch.zeros([1])
+
+        x1 = (x1 - m) * torch.exp(-logs) * x_mask
+        x = torch.cat([x0, x1], 1)
+        return x, torch.zeros([1])

    def remove_weight_norm(self):
        self.enc.remove_weight_norm()
@@ -258,3 +254,96 @@ class ResidualCouplingLayer(nn.Module):
            ):
                torch.nn.utils.remove_weight_norm(self.enc)
        return self
+
+class ResidualCouplingBlock(nn.Module):
+    class Flip(nn.Module):
+        """
+        torch.jit.script() Compiled functions
+        can't take variable number of arguments or
+        use keyword-only arguments with defaults
+        """
+        def forward(
+            self,
+            x: torch.Tensor,
+            x_mask: torch.Tensor,
+            g: Optional[torch.Tensor] = None,
+            reverse: bool = False,
+        ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+            x = torch.flip(x, [1])
+            if not reverse:
+                logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
+                return x, logdet
+            else:
+                return x, torch.zeros([1], device=x.device)
+
+    def __init__(
+        self,
+        channels: int,
+        hidden_channels: int,
+        kernel_size: int,
+        dilation_rate: int,
+        n_layers: int,
+        n_flows: int = 4,
+        gin_channels: int = 0,
+    ):
+        super(ResidualCouplingBlock, self).__init__()
+        self.channels = channels
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.n_flows = n_flows
+        self.gin_channels = gin_channels
+
+        self.flows = nn.ModuleList()
+        for _ in range(n_flows):
+            self.flows.append(
+                ResidualCouplingLayer(
+                    channels,
+                    hidden_channels,
+                    kernel_size,
+                    dilation_rate,
+                    n_layers,
+                    gin_channels=gin_channels,
+                    mean_only=True,
+                )
+            )
+            self.flows.append(self.Flip())
+
+    def __call__(
+        self,
+        x: torch.Tensor,
+        x_mask: torch.Tensor,
+        g: Optional[torch.Tensor] = None,
+        reverse: bool = False,
+    ) -> torch.Tensor:
+        return super().__call__(x, x_mask, g=g, reverse=reverse)
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        x_mask: torch.Tensor,
+        g: Optional[torch.Tensor] = None,
+        reverse: bool = False,
+    ) -> torch.Tensor:
+        if not reverse:
+            for flow in self.flows:
+                x, _ = flow(x, x_mask, g=g, reverse=reverse)
+        else:
+            for flow in reversed(self.flows):
+                x, _ = flow.forward(x, x_mask, g=g, reverse=reverse)
+        return x
+
+    def remove_weight_norm(self):
+        for i in range(self.n_flows):
+            self.flows[i * 2].remove_weight_norm()
+
+    def __prepare_scriptable__(self):
+        for i in range(self.n_flows):
+            for hook in self.flows[i * 2]._forward_pre_hooks.values():
+                if (
+                    hook.__module__ == "torch.nn.utils.weight_norm"
+                    and hook.__class__.__name__ == "WeightNorm"
+                ):
+                    torch.nn.utils.remove_weight_norm(self.flows[i * 2])
+        return self
--- a/rvc/transforms.py
+++ b/rvc/transforms.py
@@ -1,3 +1,5 @@
+from typing import Optional
+
 import numpy as np
 import torch
 from torch.nn import functional as F
@@ -13,7 +15,7 @@ def piecewise_rational_quadratic_transform(
    unnormalized_heights: torch.Tensor,
    unnormalized_derivatives: torch.Tensor,
    inverse: bool = False,
-    tails: str | None = None,
+    tails: Optional[str] = None,
    tail_bound: float = 1.0,
    min_bin_width=DEFAULT_MIN_BIN_WIDTH,
    min_bin_height=DEFAULT_MIN_BIN_HEIGHT,