mirror of
https://github.com/fumiama/Retrieval-based-Voice-Conversion-WebUI.git
synced 2026-06-09 04:29:50 +08:00
fix(dml): train extract_f0_print error
ModuleNotFoundError: No module named 'torch.privateuseone' due to new process
This commit is contained in:
@@ -2,6 +2,7 @@ import os
|
|||||||
import sys
|
import sys
|
||||||
import traceback
|
import traceback
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
import importlib.util
|
||||||
|
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
@@ -38,6 +39,8 @@ f0method = sys.argv[3]
|
|||||||
device = sys.argv[4]
|
device = sys.argv[4]
|
||||||
is_half = sys.argv[5] == "True"
|
is_half = sys.argv[5] == "True"
|
||||||
|
|
||||||
|
if importlib.util.find_spec("torch_directml") is not None:
|
||||||
|
import torch_directml # use side effect
|
||||||
|
|
||||||
class FeatureInput(object):
|
class FeatureInput(object):
|
||||||
def __init__(self, is_half: bool, device="cpu", samplerate=16000, hop_size=160):
|
def __init__(self, is_half: bool, device="cpu", samplerate=16000, hop_size=160):
|
||||||
|
|||||||
@@ -11,14 +11,14 @@ class F0Predictor(object):
|
|||||||
f0_min=50,
|
f0_min=50,
|
||||||
f0_max=1100,
|
f0_max=1100,
|
||||||
sampling_rate=44100,
|
sampling_rate=44100,
|
||||||
device: Optional[str] = None,
|
device: Optional[Union[str, torch.device]] = None,
|
||||||
):
|
):
|
||||||
self.hop_length = hop_length
|
self.hop_length = hop_length
|
||||||
self.f0_min = f0_min
|
self.f0_min = f0_min
|
||||||
self.f0_max = f0_max
|
self.f0_max = f0_max
|
||||||
self.sampling_rate = sampling_rate
|
self.sampling_rate = sampling_rate
|
||||||
if device is None:
|
if not device:
|
||||||
device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
|
||||||
self.device = device
|
self.device = device
|
||||||
|
|
||||||
def compute_f0(
|
def compute_f0(
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
from typing import Optional
|
from typing import Optional, Union
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import numpy as np
|
import numpy as np
|
||||||
@@ -17,9 +17,9 @@ class MelSpectrogram(torch.nn.Module):
|
|||||||
hop_length: int,
|
hop_length: int,
|
||||||
n_fft: Optional[int] = None,
|
n_fft: Optional[int] = None,
|
||||||
mel_fmin: int = 0,
|
mel_fmin: int = 0,
|
||||||
mel_fmax: int = None,
|
mel_fmax: Optional[int] = None,
|
||||||
clamp: float = 1e-5,
|
clamp: float = 1e-5,
|
||||||
device=torch.device("cpu"),
|
device: Union[str, torch.device] = torch.device("cpu"),
|
||||||
):
|
):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
if n_fft is None:
|
if n_fft is None:
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
import os
|
import os
|
||||||
from typing import Any, Optional, Union
|
from typing import Optional, Union
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import torch
|
import torch
|
||||||
|
|||||||
@@ -12,8 +12,8 @@ class MultiHeadAttention(nn.Module):
|
|||||||
channels: int,
|
channels: int,
|
||||||
out_channels: int,
|
out_channels: int,
|
||||||
n_heads: int,
|
n_heads: int,
|
||||||
|
window_size: int,
|
||||||
p_dropout: float = 0.0,
|
p_dropout: float = 0.0,
|
||||||
window_size: Optional[int] = None,
|
|
||||||
heads_share: bool = True,
|
heads_share: bool = True,
|
||||||
block_length: Optional[int] = None,
|
block_length: Optional[int] = None,
|
||||||
proximal_bias: bool = False,
|
proximal_bias: bool = False,
|
||||||
|
|||||||
@@ -42,8 +42,8 @@ class Encoder(nn.Module):
|
|||||||
hidden_channels,
|
hidden_channels,
|
||||||
hidden_channels,
|
hidden_channels,
|
||||||
n_heads,
|
n_heads,
|
||||||
|
window_size,
|
||||||
p_dropout=p_dropout,
|
p_dropout=p_dropout,
|
||||||
window_size=window_size,
|
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
self.norm_layers_1.append(LayerNorm(hidden_channels))
|
self.norm_layers_1.append(LayerNorm(hidden_channels))
|
||||||
@@ -121,7 +121,7 @@ class TextEncoder(nn.Module):
|
|||||||
def __call__(
|
def __call__(
|
||||||
self,
|
self,
|
||||||
phone: torch.Tensor,
|
phone: torch.Tensor,
|
||||||
pitch: torch.Tensor,
|
pitch: Optional[torch.Tensor],
|
||||||
lengths: torch.Tensor,
|
lengths: torch.Tensor,
|
||||||
skip_head: Optional[int] = None,
|
skip_head: Optional[int] = None,
|
||||||
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
|
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
|
||||||
@@ -135,7 +135,7 @@ class TextEncoder(nn.Module):
|
|||||||
def forward(
|
def forward(
|
||||||
self,
|
self,
|
||||||
phone: torch.Tensor,
|
phone: torch.Tensor,
|
||||||
pitch: torch.Tensor,
|
pitch: Optional[torch.Tensor],
|
||||||
lengths: torch.Tensor,
|
lengths: torch.Tensor,
|
||||||
skip_head: Optional[int] = None,
|
skip_head: Optional[int] = None,
|
||||||
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
|
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
|
||||||
|
|||||||
@@ -46,6 +46,7 @@ class Generator(torch.nn.Module):
|
|||||||
|
|
||||||
self.resblocks = nn.ModuleList()
|
self.resblocks = nn.ModuleList()
|
||||||
resblock_module = ResBlock1 if resblock == "1" else ResBlock2
|
resblock_module = ResBlock1 if resblock == "1" else ResBlock2
|
||||||
|
ch = 0
|
||||||
for i in range(len(self.ups)):
|
for i in range(len(self.ups)):
|
||||||
ch = upsample_initial_channel // (2 ** (i + 1))
|
ch = upsample_initial_channel // (2 ** (i + 1))
|
||||||
for k, d in zip(resblock_kernel_sizes, resblock_dilation_sizes):
|
for k, d in zip(resblock_kernel_sizes, resblock_dilation_sizes):
|
||||||
|
|||||||
@@ -30,7 +30,7 @@ class WN(torch.nn.Module):
|
|||||||
dilation_rate: int,
|
dilation_rate: int,
|
||||||
n_layers: int,
|
n_layers: int,
|
||||||
gin_channels: int = 0,
|
gin_channels: int = 0,
|
||||||
p_dropout: int = 0,
|
p_dropout: float = 0,
|
||||||
):
|
):
|
||||||
super(WN, self).__init__()
|
super(WN, self).__init__()
|
||||||
assert kernel_size % 2 == 1
|
assert kernel_size % 2 == 1
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
from typing import Optional, List
|
from typing import Optional, List, Union
|
||||||
import math
|
import math
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
@@ -83,7 +83,7 @@ class NSFGenerator(torch.nn.Module):
|
|||||||
self.conv_pre = Conv1d(
|
self.conv_pre = Conv1d(
|
||||||
initial_channel, upsample_initial_channel, 7, 1, padding=3
|
initial_channel, upsample_initial_channel, 7, 1, padding=3
|
||||||
)
|
)
|
||||||
resblock = ResBlock1 if resblock == "1" else ResBlock2
|
resblockcls = ResBlock1 if resblock == "1" else ResBlock2
|
||||||
|
|
||||||
self.ups = nn.ModuleList()
|
self.ups = nn.ModuleList()
|
||||||
for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
|
for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
|
||||||
@@ -114,12 +114,13 @@ class NSFGenerator(torch.nn.Module):
|
|||||||
self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
|
self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
|
||||||
|
|
||||||
self.resblocks = nn.ModuleList()
|
self.resblocks = nn.ModuleList()
|
||||||
|
ch = 0
|
||||||
for i in range(len(self.ups)):
|
for i in range(len(self.ups)):
|
||||||
ch: int = upsample_initial_channel // (2 ** (i + 1))
|
ch = upsample_initial_channel // (2 ** (i + 1))
|
||||||
for j, (k, d) in enumerate(
|
for j, (k, d) in enumerate(
|
||||||
zip(resblock_kernel_sizes, resblock_dilation_sizes)
|
zip(resblock_kernel_sizes, resblock_dilation_sizes)
|
||||||
):
|
):
|
||||||
self.resblocks.append(resblock(ch, k, d))
|
self.resblocks.append(resblockcls(ch, k, d))
|
||||||
|
|
||||||
self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
|
self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
|
||||||
self.ups.apply(call_weight_data_normal_if_Conv)
|
self.ups.apply(call_weight_data_normal_if_Conv)
|
||||||
|
|||||||
@@ -20,7 +20,7 @@ class ResBlock1(torch.nn.Module):
|
|||||||
self,
|
self,
|
||||||
channels: int,
|
channels: int,
|
||||||
kernel_size: int = 3,
|
kernel_size: int = 3,
|
||||||
dilation: List[int] = (1, 3, 5),
|
dilation: List[int] = [1, 3, 5],
|
||||||
):
|
):
|
||||||
super(ResBlock1, self).__init__()
|
super(ResBlock1, self).__init__()
|
||||||
|
|
||||||
@@ -117,7 +117,7 @@ class ResBlock2(torch.nn.Module):
|
|||||||
self,
|
self,
|
||||||
channels: int,
|
channels: int,
|
||||||
kernel_size=3,
|
kernel_size=3,
|
||||||
dilation: List[int] = (1, 3),
|
dilation: List[int] = [1, 3],
|
||||||
):
|
):
|
||||||
super(ResBlock2, self).__init__()
|
super(ResBlock2, self).__init__()
|
||||||
self.convs = nn.ModuleList()
|
self.convs = nn.ModuleList()
|
||||||
@@ -182,7 +182,7 @@ class ResidualCouplingLayer(nn.Module):
|
|||||||
kernel_size: int,
|
kernel_size: int,
|
||||||
dilation_rate: int,
|
dilation_rate: int,
|
||||||
n_layers: int,
|
n_layers: int,
|
||||||
p_dropout: int = 0,
|
p_dropout: float = 0,
|
||||||
gin_channels: int = 0,
|
gin_channels: int = 0,
|
||||||
mean_only: bool = False,
|
mean_only: bool = False,
|
||||||
):
|
):
|
||||||
|
|||||||
@@ -34,7 +34,7 @@ class SynthesizerTrnMsNSFsid(nn.Module):
|
|||||||
upsample_kernel_sizes: List[int],
|
upsample_kernel_sizes: List[int],
|
||||||
spk_embed_dim: int,
|
spk_embed_dim: int,
|
||||||
gin_channels: int,
|
gin_channels: int,
|
||||||
sr: Optional[Union[str, int]],
|
sr: Union[str, int],
|
||||||
encoder_dim: int,
|
encoder_dim: int,
|
||||||
use_f0: bool,
|
use_f0: bool,
|
||||||
):
|
):
|
||||||
@@ -143,7 +143,7 @@ class SynthesizerTrnMsNSFsid(nn.Module):
|
|||||||
torch.nn.utils.remove_weight_norm(self.enc_q)
|
torch.nn.utils.remove_weight_norm(self.enc_q)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
@torch.jit.ignore
|
@torch.jit.ignore()
|
||||||
def forward(
|
def forward(
|
||||||
self,
|
self,
|
||||||
phone: torch.Tensor,
|
phone: torch.Tensor,
|
||||||
@@ -155,18 +155,20 @@ class SynthesizerTrnMsNSFsid(nn.Module):
|
|||||||
pitchf: Optional[torch.Tensor] = None,
|
pitchf: Optional[torch.Tensor] = None,
|
||||||
): # 这里ds是id,[bs,1]
|
): # 这里ds是id,[bs,1]
|
||||||
# print(1,pitch.shape)#[bs,t]
|
# print(1,pitch.shape)#[bs,t]
|
||||||
g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的
|
embg = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的
|
||||||
m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
|
m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
|
||||||
z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
|
z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=embg)
|
||||||
z_p = self.flow(z, y_mask, g=g)
|
z_p = self.flow(z, y_mask, g=embg)
|
||||||
z_slice, ids_slice = rand_slice_segments_on_last_dim(
|
z_slice, ids_slice = rand_slice_segments_on_last_dim(
|
||||||
z, y_lengths, self.segment_size
|
z, y_lengths, self.segment_size
|
||||||
)
|
)
|
||||||
if pitchf is not None:
|
if pitchf is not None and isinstance(self.dec, NSFGenerator):
|
||||||
pitchf = slice_on_last_dim(pitchf, ids_slice, self.segment_size)
|
pitchf = slice_on_last_dim(pitchf, ids_slice, self.segment_size)
|
||||||
o = self.dec(z_slice, pitchf, g=g)
|
o = self.dec(z_slice, pitchf, g=embg) # type: ignore
|
||||||
|
elif isinstance(self.dec, Generator):
|
||||||
|
o = self.dec(z_slice, g=embg)
|
||||||
else:
|
else:
|
||||||
o = self.dec(z_slice, g=g)
|
raise KeyError(f"unknown dec type: {type(self.dec).__name__}")
|
||||||
return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
|
return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
|
||||||
|
|
||||||
@torch.jit.export
|
@torch.jit.export
|
||||||
@@ -201,15 +203,17 @@ class SynthesizerTrnMsNSFsid(nn.Module):
|
|||||||
z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
|
z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
|
||||||
z = self.flow(z_p, x_mask, g=g, reverse=True)
|
z = self.flow(z_p, x_mask, g=g, reverse=True)
|
||||||
del z_p, m_p, logs_p
|
del z_p, m_p, logs_p
|
||||||
if pitchf is not None:
|
if pitchf is not None and isinstance(self.dec, NSFGenerator):
|
||||||
o = self.dec(
|
o = self.dec(
|
||||||
z * x_mask,
|
z * x_mask,
|
||||||
pitchf,
|
pitchf,
|
||||||
g=g,
|
g=g,
|
||||||
n_res=return_length2,
|
n_res=return_length2,
|
||||||
)
|
)
|
||||||
else:
|
elif isinstance(self.dec, Generator):
|
||||||
o = self.dec(z * x_mask, g=g, n_res=return_length2)
|
o = self.dec(z * x_mask, g=g, n_res=return_length2)
|
||||||
|
else:
|
||||||
|
raise KeyError(f"unknown dec type: {type(self.dec).__name__}")
|
||||||
del x_mask, z
|
del x_mask, z
|
||||||
return o # , x_mask, (z, z_p, m_p, logs_p)
|
return o # , x_mask, (z, z_p, m_p, logs_p)
|
||||||
|
|
||||||
@@ -326,7 +330,7 @@ class SynthesizerTrnMs256NSFsid_nono(SynthesizerTrnMsNSFsid):
|
|||||||
upsample_kernel_sizes: List[int],
|
upsample_kernel_sizes: List[int],
|
||||||
spk_embed_dim: int,
|
spk_embed_dim: int,
|
||||||
gin_channels: int,
|
gin_channels: int,
|
||||||
sr=None,
|
sr: Union[str, int],
|
||||||
):
|
):
|
||||||
super().__init__(
|
super().__init__(
|
||||||
spec_channels,
|
spec_channels,
|
||||||
@@ -346,6 +350,7 @@ class SynthesizerTrnMs256NSFsid_nono(SynthesizerTrnMsNSFsid):
|
|||||||
upsample_kernel_sizes,
|
upsample_kernel_sizes,
|
||||||
spk_embed_dim,
|
spk_embed_dim,
|
||||||
gin_channels,
|
gin_channels,
|
||||||
|
sr,
|
||||||
256,
|
256,
|
||||||
False,
|
False,
|
||||||
)
|
)
|
||||||
@@ -371,7 +376,7 @@ class SynthesizerTrnMs768NSFsid_nono(SynthesizerTrnMsNSFsid):
|
|||||||
upsample_kernel_sizes: List[int],
|
upsample_kernel_sizes: List[int],
|
||||||
spk_embed_dim: int,
|
spk_embed_dim: int,
|
||||||
gin_channels: int,
|
gin_channels: int,
|
||||||
sr=None,
|
sr: Union[str, int],
|
||||||
):
|
):
|
||||||
super().__init__(
|
super().__init__(
|
||||||
spec_channels,
|
spec_channels,
|
||||||
@@ -391,6 +396,7 @@ class SynthesizerTrnMs768NSFsid_nono(SynthesizerTrnMsNSFsid):
|
|||||||
upsample_kernel_sizes,
|
upsample_kernel_sizes,
|
||||||
spk_embed_dim,
|
spk_embed_dim,
|
||||||
gin_channels,
|
gin_channels,
|
||||||
|
sr,
|
||||||
768,
|
768,
|
||||||
False,
|
False,
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
from typing import List, Optional, Tuple, Iterator
|
from typing import List, Optional, Tuple, Iterator, Union
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
@@ -17,7 +17,7 @@ def get_padding(kernel_size: int, dilation=1) -> int:
|
|||||||
|
|
||||||
def slice_on_last_dim(
|
def slice_on_last_dim(
|
||||||
x: torch.Tensor,
|
x: torch.Tensor,
|
||||||
start_indices: List[int],
|
start_indices: Union[List[int], torch.Tensor],
|
||||||
segment_size=4,
|
segment_size=4,
|
||||||
) -> torch.Tensor:
|
) -> torch.Tensor:
|
||||||
new_shape = [*x.shape]
|
new_shape = [*x.shape]
|
||||||
@@ -32,9 +32,9 @@ def slice_on_last_dim(
|
|||||||
|
|
||||||
def rand_slice_segments_on_last_dim(
|
def rand_slice_segments_on_last_dim(
|
||||||
x: torch.Tensor,
|
x: torch.Tensor,
|
||||||
x_lengths: int = None,
|
x_lengths: Optional[Union[int, torch.Tensor]] = None,
|
||||||
segment_size=4,
|
segment_size=4,
|
||||||
) -> Tuple[torch.Tensor, List[int]]:
|
) -> Tuple[torch.Tensor, Union[List[int], torch.Tensor]]:
|
||||||
b, _, t = x.size()
|
b, _, t = x.size()
|
||||||
if x_lengths is None:
|
if x_lengths is None:
|
||||||
x_lengths = t
|
x_lengths = t
|
||||||
@@ -58,7 +58,7 @@ def activate_add_tanh_sigmoid_multiply(
|
|||||||
def sequence_mask(
|
def sequence_mask(
|
||||||
length: torch.Tensor,
|
length: torch.Tensor,
|
||||||
max_length: Optional[int] = None,
|
max_length: Optional[int] = None,
|
||||||
) -> torch.BoolTensor:
|
):
|
||||||
if max_length is None:
|
if max_length is None:
|
||||||
max_length = int(length.max())
|
max_length = int(length.max())
|
||||||
x = torch.arange(max_length, dtype=length.dtype, device=length.device)
|
x = torch.arange(max_length, dtype=length.dtype, device=length.device)
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
from typing import List, Optional, Union
|
from typing import List, Union
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
@@ -25,7 +25,7 @@ class SynthesizerTrnMsNSFsid(SynthesizerBase):
|
|||||||
upsample_kernel_sizes: List[int],
|
upsample_kernel_sizes: List[int],
|
||||||
spk_embed_dim: int,
|
spk_embed_dim: int,
|
||||||
gin_channels: int,
|
gin_channels: int,
|
||||||
sr: Optional[Union[str, int]],
|
sr: Union[str, int],
|
||||||
encoder_dim: int,
|
encoder_dim: int,
|
||||||
):
|
):
|
||||||
super().__init__(
|
super().__init__(
|
||||||
|
|||||||
Reference in New Issue
Block a user