1
0
mirror of https://github.com/fumiama/Retrieval-based-Voice-Conversion-WebUI.git synced 2026-06-05 09:10:25 +08:00

New feature of real-time voice changing: formant shift adjustment (#1999)

* add formant shift for realtime-gui

* chore(i18n): sync locale on dev

* chore(format): run black on dev

* fix

---------

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
This commit is contained in:
yxlllc
2024-04-22 21:13:10 +08:00
committed by GitHub
parent 189eef5936
commit 0ab3a3296e
17 changed files with 92 additions and 10 deletions

View File

@@ -10,7 +10,6 @@ from torch import nn
from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d
from torch.nn import functional as F
from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm
from infer.lib.infer_pack import attentions, commons, modules
from infer.lib.infer_pack.commons import get_padding, init_weights
@@ -250,7 +249,17 @@ class Generator(torch.nn.Module):
if gin_channels != 0:
self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
def forward(self, x: torch.Tensor, g: Optional[torch.Tensor] = None):
def forward(
self,
x: torch.Tensor,
g: Optional[torch.Tensor] = None,
n_res: Optional[torch.Tensor] = None,
):
if n_res is not None:
assert isinstance(n_res, torch.Tensor)
n = int(n_res.item())
if n != x.shape[-1]:
x = F.interpolate(x, size=n, mode="linear")
x = self.conv_pre(x)
if g is not None:
x = x + self.cond(g)
@@ -529,9 +538,22 @@ class GeneratorNSF(torch.nn.Module):
self.lrelu_slope = modules.LRELU_SLOPE
def forward(self, x, f0, g: Optional[torch.Tensor] = None):
def forward(
self,
x,
f0,
g: Optional[torch.Tensor] = None,
n_res: Optional[torch.Tensor] = None,
):
har_source, noi_source, uv = self.m_source(f0, self.upp)
har_source = har_source.transpose(1, 2)
if n_res is not None:
assert isinstance(n_res, torch.Tensor)
n = int(n_res.item())
if n * self.upp != har_source.shape[-1]:
har_source = F.interpolate(har_source, size=n * self.upp, mode="linear")
if n != x.shape[-1]:
x = F.interpolate(x, size=n, mode="linear")
x = self.conv_pre(x)
if g is not None:
x = x + self.cond(g)
@@ -558,6 +580,7 @@ class GeneratorNSF(torch.nn.Module):
x = F.leaky_relu(x)
x = self.conv_post(x)
x = torch.tanh(x)
return x
def remove_weight_norm(self):
@@ -748,6 +771,7 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
sid: torch.Tensor,
skip_head: Optional[torch.Tensor] = None,
return_length: Optional[torch.Tensor] = None,
return_length2: Optional[torch.Tensor] = None,
):
g = self.emb_g(sid).unsqueeze(-1)
if skip_head is not None and return_length is not None:
@@ -767,7 +791,7 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
z = self.flow(z_p, x_mask, g=g, reverse=True)
o = self.dec(z * x_mask, nsff0, g=g)
o = self.dec(z * x_mask, nsff0, g=g, n_res=return_length2)
return o, x_mask, (z, z_p, m_p, logs_p)
@@ -963,6 +987,7 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
sid: torch.Tensor,
skip_head: Optional[torch.Tensor] = None,
return_length: Optional[torch.Tensor] = None,
return_length2: Optional[torch.Tensor] = None,
):
g = self.emb_g(sid).unsqueeze(-1)
if skip_head is not None and return_length is not None:
@@ -981,7 +1006,7 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
z = self.flow(z_p, x_mask, g=g, reverse=True)
o = self.dec(z * x_mask, g=g)
o = self.dec(z * x_mask, g=g, n_res=return_length2)
return o, x_mask, (z, z_p, m_p, logs_p)

View File

@@ -15,6 +15,7 @@ import torch
import torch.nn as nn
import torch.nn.functional as F
import torchcrepe
from torchaudio.transforms import Resample
now_dir = os.getcwd()
sys.path.append(now_dir)
@@ -40,6 +41,7 @@ class RVC:
def __init__(
self,
key,
formant,
pth_path,
index_path,
index_rate,
@@ -68,6 +70,7 @@ class RVC:
# device="cpu"  # force CPU (for testing)
self.device = config.device
self.f0_up_key = key
self.formant_shift = formant
self.f0_min = 50
self.f0_max = 1100
self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
@@ -90,6 +93,8 @@ class RVC:
1024, device=self.device, dtype=torch.float32
)
self.resample_kernel = {}
if last_rvc is None:
models, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
["assets/hubert/hubert_base.pt"],
@@ -187,6 +192,9 @@ class RVC:
def change_key(self, new_key):
self.f0_up_key = new_key
def change_formant(self, new_formant):
self.formant_shift = new_formant
def change_index_rate(self, new_index_rate):
if new_index_rate != 0 and self.index_rate == 0:
self.index = faiss.read_index(self.index_path)
@@ -390,12 +398,14 @@ class RVC:
printt("Index search FAILED")
t3 = ttime()
p_len = input_wav.shape[0] // 160
factor = pow(2, self.formant_shift / 12)
return_length2 = int(np.ceil(return_length * factor))
if self.if_f0 == 1:
f0_extractor_frame = block_frame_16k + 800
if f0method == "rmvpe":
f0_extractor_frame = 5120 * ((f0_extractor_frame - 1) // 5120 + 1) - 160
pitch, pitchf = self.get_f0(
input_wav[-f0_extractor_frame:], self.f0_up_key, self.n_cpu, f0method
input_wav[-f0_extractor_frame:], self.f0_up_key - self.formant_shift, self.n_cpu, f0method
)
shift = block_frame_16k // 160
self.cache_pitch[:-shift] = self.cache_pitch[shift:].clone()
@@ -403,13 +413,14 @@ class RVC:
self.cache_pitch[4 - pitch.shape[0] :] = pitch[3:-1]
self.cache_pitchf[4 - pitch.shape[0] :] = pitchf[3:-1]
cache_pitch = self.cache_pitch[None, -p_len:]
cache_pitchf = self.cache_pitchf[None, -p_len:]
cache_pitchf = self.cache_pitchf[None, -p_len:] * return_length2 / return_length
t4 = ttime()
feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
feats = feats[:, :p_len, :]
p_len = torch.LongTensor([p_len]).to(self.device)
sid = torch.LongTensor([0]).to(self.device)
skip_head = torch.LongTensor([skip_head])
return_length2 = torch.LongTensor([return_length2])
return_length = torch.LongTensor([return_length])
with torch.no_grad():
if self.if_f0 == 1:
@@ -421,11 +432,24 @@ class RVC:
sid,
skip_head,
return_length,
return_length2,
)
else:
infered_audio, _, _ = self.net_g.infer(
feats, p_len, sid, skip_head, return_length
feats, p_len, sid, skip_head, return_length, return_length2
)
infered_audio = infered_audio.squeeze(1).float()
upp_res = int(np.floor(factor * self.tgt_sr // 100))
if upp_res != self.tgt_sr // 100:
if upp_res not in self.resample_kernel:
self.resample_kernel[upp_res] = Resample(
orig_freq=upp_res,
new_freq=self.tgt_sr // 100,
dtype=torch.float32,
).to(self.device)
infered_audio = self.resample_kernel[upp_res](
infered_audio[:, : return_length * upp_res]
)
t5 = ttime()
printt(
"Spent time: fea = %.3fs, index = %.3fs, f0 = %.3fs, model = %.3fs",
@@ -434,4 +458,4 @@ class RVC:
t4 - t3,
t5 - t4,
)
return infered_audio.squeeze().float()
return infered_audio.squeeze()