1
0
mirror of https://github.com/fumiama/Retrieval-based-Voice-Conversion-WebUI.git synced 2026-06-05 01:10:22 +08:00

fix(rt): replace with new f0

This commit is contained in:
源文雨
2024-11-29 00:35:10 +09:00
parent 51c85fcc49
commit ef9db1fd44
4 changed files with 23 additions and 117 deletions

View File

@@ -1,6 +1,7 @@
from io import BytesIO from io import BytesIO
import os import os
from typing import Union, Literal, Optional from typing import Union, Literal, Optional
from pathlib import Path
import fairseq import fairseq
import faiss import faiss
@@ -10,7 +11,7 @@ import torch.nn as nn
import torch.nn.functional as F import torch.nn.functional as F
from torchaudio.transforms import Resample from torchaudio.transforms import Resample
from rvc.f0 import PM, Harvest, RMVPE, CRePE, Dio, FCPE from rvc.f0 import Generator
from rvc.synthesizer import load_synthesizer from rvc.synthesizer import load_synthesizer
@@ -65,14 +66,7 @@ class RVC:
self.resample_kernel = {} self.resample_kernel = {}
self.f0_methods = { self.f0_gen = Generator(Path(os.environ["rmvpe_root"]), is_half, 0, device, self.window, self.sr)
"crepe": self._get_f0_crepe,
"rmvpe": self._get_f0_rmvpe,
"fcpe": self._get_f0_fcpe,
"pm": self._get_f0_pm,
"harvest": self._get_f0_harvest,
"dio": self._get_f0_dio,
}
models, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task( models, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
["assets/hubert/hubert_base.pt"], ["assets/hubert/hubert_base.pt"],
@@ -141,7 +135,6 @@ class RVC:
skip_head: int, skip_head: int,
return_length: int, return_length: int,
f0method: Union[tuple, str], f0method: Union[tuple, str],
inp_f0: Optional[np.ndarray] = None,
protect: float = 1.0, protect: float = 1.0,
) -> np.ndarray: ) -> np.ndarray:
with torch.no_grad(): with torch.no_grad():
@@ -205,16 +198,11 @@ class RVC:
f0_extractor_frame = ( f0_extractor_frame = (
5120 * ((f0_extractor_frame - 1) // 5120 + 1) - self.window 5120 * ((f0_extractor_frame - 1) // 5120 + 1) - self.window
) )
if inp_f0 is not None: pitch, pitchf = self._get_f0(
pitch, pitchf = self._get_f0_post( input_wav[-f0_extractor_frame:],
inp_f0, self.f0_up_key - self.formant_shift self.f0_up_key - self.formant_shift,
) method=f0method,
else: )
pitch, pitchf = self._get_f0(
input_wav[-f0_extractor_frame:],
self.f0_up_key - self.formant_shift,
method=f0method,
)
shift = block_frame_16k // self.window shift = block_frame_16k // self.window
self.cache_pitch[:-shift] = self.cache_pitch[shift:].clone() self.cache_pitch[:-shift] = self.cache_pitch[shift:].clone()
self.cache_pitchf[:-shift] = self.cache_pitchf[shift:].clone() self.cache_pitchf[:-shift] = self.cache_pitchf[shift:].clone()
@@ -275,89 +263,9 @@ class RVC:
filter_radius: Optional[Union[int, float]] = None, filter_radius: Optional[Union[int, float]] = None,
method: Literal["crepe", "rmvpe", "fcpe", "pm", "harvest", "dio"] = "fcpe", method: Literal["crepe", "rmvpe", "fcpe", "pm", "harvest", "dio"] = "fcpe",
): ):
if method not in self.f0_methods.keys(): c, f = self.f0_gen.calculate(x, None, f0_up_key, method, filter_radius)
raise RuntimeError("Not supported f0 method: " + method) if not torch.is_tensor(c):
return self.f0_methods[method](x, f0_up_key, filter_radius) c = torch.from_numpy(c)
if not torch.is_tensor(f):
def _get_f0_post(self, f0, f0_up_key): f = torch.from_numpy(f)
f0 *= pow(2, f0_up_key / 12) return c.long().to(self.device), f.float().to(self.device)
if not torch.is_tensor(f0):
f0 = torch.from_numpy(f0)
f0 = f0.float().to(self.device).squeeze()
f0_mel = 1127 * torch.log(1 + f0 / 700)
f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * 254 / (
self.f0_mel_max - self.f0_mel_min
) + 1
f0_mel[f0_mel <= 1] = 1
f0_mel[f0_mel > 255] = 255
f0_coarse = torch.round(f0_mel).long()
return f0_coarse, f0
def _get_f0_pm(self, x, f0_up_key, filter_radius):
    """Extract f0 with the PM extractor and hand it to ``_get_f0_post``.

    The extractor is created on the first call and cached on ``self.pm``.

    Args:
        x: Input audio; must support ``.cpu().numpy()`` (presumably a
            torch tensor -- confirm against the caller).
        f0_up_key: Forwarded unchanged to ``_get_f0_post``.
        filter_radius: Accepted so every ``_get_f0_*`` method shares one
            signature; not used by this method.

    Returns:
        Whatever ``self._get_f0_post`` returns for the extracted f0.
    """
    try:
        extractor = self.pm
    except AttributeError:
        # First use: build the extractor once and keep it on the instance.
        self.pm = PM(hop_length=160, sampling_rate=16000)
        extractor = self.pm
    samples = x.cpu().numpy()
    raw_f0 = extractor.compute_f0(samples)
    return self._get_f0_post(raw_f0, f0_up_key)
def _get_f0_harvest(self, x, f0_up_key, filter_radius=3):
    """Extract f0 with the Harvest extractor and hand it to ``_get_f0_post``.

    The extractor is created on the first call from ``self.window``,
    ``self.f0_min``, ``self.f0_max`` and ``self.sr``, then cached on
    ``self.harvest``.

    Args:
        x: Input audio; must support ``.cpu().numpy()`` (presumably a
            torch tensor -- confirm against the caller).
        f0_up_key: Forwarded unchanged to ``_get_f0_post``.
        filter_radius: Passed to ``compute_f0``; ``None`` is replaced by 3.

    Returns:
        Whatever ``self._get_f0_post`` returns for the extracted f0.
    """
    try:
        extractor = self.harvest
    except AttributeError:
        # First use: build the extractor once and keep it on the instance.
        self.harvest = Harvest(self.window, self.f0_min, self.f0_max, self.sr)
        extractor = self.harvest
    # The dispatcher may pass None explicitly, bypassing the default of 3.
    radius = 3 if filter_radius is None else filter_radius
    raw_f0 = extractor.compute_f0(x.cpu().numpy(), filter_radius=radius)
    return self._get_f0_post(raw_f0, f0_up_key)
def _get_f0_dio(self, x, f0_up_key, filter_radius):
    """Extract f0 with the Dio extractor and hand it to ``_get_f0_post``.

    The extractor is created on the first call from ``self.window``,
    ``self.f0_min``, ``self.f0_max`` and ``self.sr``, then cached on
    ``self.dio``.

    Args:
        x: Input audio; must support ``.cpu().numpy()`` (presumably a
            torch tensor -- confirm against the caller).
        f0_up_key: Forwarded unchanged to ``_get_f0_post``.
        filter_radius: Accepted so every ``_get_f0_*`` method shares one
            signature; not used by this method.

    Returns:
        Whatever ``self._get_f0_post`` returns for the extracted f0.
    """
    try:
        extractor = self.dio
    except AttributeError:
        # First use: build the extractor once and keep it on the instance.
        self.dio = Dio(self.window, self.f0_min, self.f0_max, self.sr)
        extractor = self.dio
    samples = x.cpu().numpy()
    raw_f0 = extractor.compute_f0(samples)
    return self._get_f0_post(raw_f0, f0_up_key)
def _get_f0_crepe(self, x, f0_up_key, filter_radius):
    """Extract f0 with the CRePE extractor and hand it to ``_get_f0_post``.

    The extractor is created on the first call from ``self.window``,
    ``self.f0_min``, ``self.f0_max``, ``self.sr`` and ``self.device``,
    then cached on ``self.crepe``.

    Args:
        x: Input audio, passed to ``compute_f0`` as-is (no CPU/NumPy
            conversion is done here, unlike the pm/harvest/dio variants).
        f0_up_key: Forwarded unchanged to ``_get_f0_post``.
        filter_radius: Accepted so every ``_get_f0_*`` method shares one
            signature; not used by this method.

    Returns:
        Whatever ``self._get_f0_post`` returns for the extracted f0.
    """
    # `not hasattr` instead of `hasattr(...) == False`: same truth value,
    # idiomatic, and consistent with the pm/harvest/dio methods.
    if not hasattr(self, "crepe"):
        self.crepe = CRePE(
            self.window,
            self.f0_min,
            self.f0_max,
            self.sr,
            self.device,
        )
    f0 = self.crepe.compute_f0(x)
    return self._get_f0_post(f0, f0_up_key)
def _get_f0_rmvpe(self, x, f0_up_key, filter_radius=0.03):
    """Extract f0 with the RMVPE extractor and hand it to ``_get_f0_post``.

    The extractor is created on the first call and cached on
    ``self.rmvpe``. Its weights path is ``<rmvpe_root>/rmvpe.pt``, where
    ``rmvpe_root`` is read from the environment.

    Args:
        x: Input audio, passed to ``compute_f0`` as-is.
        f0_up_key: Forwarded unchanged to ``_get_f0_post``.
        filter_radius: Passed to ``compute_f0``; ``None`` is replaced by
            0.03.

    Returns:
        Whatever ``self._get_f0_post`` returns for the extracted f0.

    Raises:
        KeyError: On first use, if the ``rmvpe_root`` environment variable
            is not set.
    """
    # `not hasattr` instead of `hasattr(...) == False`: same truth value,
    # idiomatic, and consistent with the pm/harvest/dio methods.
    if not hasattr(self, "rmvpe"):
        self.rmvpe = RMVPE(
            f"{os.environ['rmvpe_root']}/rmvpe.pt",
            is_half=self.is_half,
            device=self.device,
            use_jit=self.use_jit,
        )
    # The dispatcher may pass None explicitly, bypassing the default.
    if filter_radius is None:
        filter_radius = 0.03
    f0 = self.rmvpe.compute_f0(x, filter_radius=filter_radius)
    return self._get_f0_post(f0, f0_up_key)
def _get_f0_fcpe(self, x, f0_up_key, filter_radius):
    """Extract f0 with the FCPE extractor and hand it to ``_get_f0_post``.

    The extractor is created on the first call and cached on ``self.fcpe``.

    Args:
        x: Input audio, passed to ``compute_f0`` as-is.
        f0_up_key: Forwarded unchanged to ``_get_f0_post``.
        filter_radius: Accepted so every ``_get_f0_*`` method shares one
            signature; not used by this method.

    Returns:
        Whatever ``self._get_f0_post`` returns for the extracted f0.
    """
    # `not hasattr` instead of `hasattr(...) == False`: same truth value,
    # idiomatic, and consistent with the pm/harvest/dio methods.
    if not hasattr(self, "fcpe"):
        # NOTE(review): 160 and 16000 are hard-coded here while sibling
        # extractors use self.window / self.sr -- presumably hop length
        # and sample rate; confirm they are meant to differ.
        self.fcpe = FCPE(
            160,
            self.f0_min,
            self.f0_max,
            16000,
            self.device,
        )
    f0 = self.fcpe.compute_f0(x)
    return self._get_f0_post(f0, f0_up_key)

View File

@@ -4,12 +4,12 @@ from typing import Optional, Union, Literal, Tuple
from numba import jit from numba import jit
import numpy as np import numpy as np
import torch
@jit(nopython=True) @jit(nopython=True)
def post_process( def post_process(
sr: int, tf0: int, # 每秒f0点数
window: int,
f0: np.ndarray, f0: np.ndarray,
f0_up_key: int, f0_up_key: int,
manual_x_pad: int, manual_x_pad: int,
@@ -19,7 +19,6 @@ def post_process(
) -> Tuple[np.ndarray, np.ndarray]: ) -> Tuple[np.ndarray, np.ndarray]:
f0 = np.multiply(f0, pow(2, f0_up_key / 12)) f0 = np.multiply(f0, pow(2, f0_up_key / 12))
# with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()])) # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
tf0 = sr // window # 每秒f0点数
if manual_f0 is not None: if manual_f0 is not None:
delta_t = np.round( delta_t = np.round(
(manual_f0[:, 0].max() - manual_f0[:, 0].min()) * tf0 + 1 (manual_f0[:, 0].max() - manual_f0[:, 0].min()) * tf0 + 1
@@ -62,12 +61,14 @@ class Generator(object):
def calculate( def calculate(
self, self,
x: np.ndarray, x: np.ndarray,
p_len: int, p_len: Optional[int],
f0_up_key: int, f0_up_key: int,
f0_method: Literal["pm", "dio", "harvest", "crepe", "rmvpe", "fcpe"], f0_method: Literal["pm", "dio", "harvest", "crepe", "rmvpe", "fcpe"],
filter_radius: Optional[Union[int, float]], filter_radius: Optional[Union[int, float]],
manual_f0: Optional[Union[np.ndarray, list]] = None, manual_f0: Optional[Union[np.ndarray, list]] = None,
) -> Tuple[np.ndarray, np.ndarray]: ) -> Tuple[np.ndarray, np.ndarray]:
if torch.is_tensor(x):
x = x.cpu().numpy()
f0_min = 50 f0_min = 50
f0_max = 1100 f0_max = 1100
if f0_method == "pm": if f0_method == "pm":
@@ -130,8 +131,7 @@ class Generator(object):
raise ValueError(f"f0 method {f0_method} has not yet been supported") raise ValueError(f"f0 method {f0_method} has not yet been supported")
return post_process( return post_process(
self.sr, self.sr // self.window,
self.window,
f0, f0,
f0_up_key, f0_up_key,
self.x_pad, self.x_pad,

View File

@@ -31,7 +31,7 @@ def load_synthesizer(
pth_path: torch.serialization.FILE_LIKE, device=torch.device("cpu") pth_path: torch.serialization.FILE_LIKE, device=torch.device("cpu")
): ):
return get_synthesizer( return get_synthesizer(
torch.load(pth_path, map_location=torch.device("cpu")), torch.load(pth_path, map_location=torch.device("cpu"), weights_only=True),
device, device,
) )

6
web.py
View File

@@ -964,9 +964,7 @@ with gr.Blocks(title="RVC WebUI") as app:
"Select the pitch extraction algorithm ('pm': faster extraction but lower-quality speech; 'harvest': better bass but extremely slow; 'crepe': better quality but GPU intensive), 'rmvpe': best quality, and little GPU requirement" "Select the pitch extraction algorithm ('pm': faster extraction but lower-quality speech; 'harvest': better bass but extremely slow; 'crepe': better quality but GPU intensive), 'rmvpe': best quality, and little GPU requirement"
), ),
choices=( choices=(
["pm", "harvest", "crepe", "rmvpe"] ["pm", "dio", "harvest", "crepe", "rmvpe", "fcpe"]
if config.dml == False
else ["pm", "harvest", "rmvpe"]
), ),
value="rmvpe", value="rmvpe",
interactive=True, interactive=True,
@@ -1209,7 +1207,7 @@ with gr.Blocks(title="RVC WebUI") as app:
label=i18n( label=i18n(
"Select the pitch extraction algorithm: when extracting singing, you can use 'pm' to speed up. For high-quality speech with fast performance, but worse CPU usage, you can use 'dio'. 'harvest' results in better quality but is slower. 'rmvpe' has the best results and consumes less CPU/GPU" "Select the pitch extraction algorithm: when extracting singing, you can use 'pm' to speed up. For high-quality speech with fast performance, but worse CPU usage, you can use 'dio'. 'harvest' results in better quality but is slower. 'rmvpe' has the best results and consumes less CPU/GPU"
), ),
choices=["pm", "harvest", "dio", "rmvpe"], choices=["pm", "dio", "harvest", "crepe", "rmvpe", "fcpe"],
value="rmvpe", value="rmvpe",
interactive=True, interactive=True,
) )