mirror of
https://github.com/fumiama/Retrieval-based-Voice-Conversion-WebUI.git
synced 2026-06-05 01:10:22 +08:00
fix(rt): replace with new f0
This commit is contained in:
@@ -1,6 +1,7 @@
|
|||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
import os
|
import os
|
||||||
from typing import Union, Literal, Optional
|
from typing import Union, Literal, Optional
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
import fairseq
|
import fairseq
|
||||||
import faiss
|
import faiss
|
||||||
@@ -10,7 +11,7 @@ import torch.nn as nn
|
|||||||
import torch.nn.functional as F
|
import torch.nn.functional as F
|
||||||
from torchaudio.transforms import Resample
|
from torchaudio.transforms import Resample
|
||||||
|
|
||||||
from rvc.f0 import PM, Harvest, RMVPE, CRePE, Dio, FCPE
|
from rvc.f0 import Generator
|
||||||
from rvc.synthesizer import load_synthesizer
|
from rvc.synthesizer import load_synthesizer
|
||||||
|
|
||||||
|
|
||||||
@@ -65,14 +66,7 @@ class RVC:
|
|||||||
|
|
||||||
self.resample_kernel = {}
|
self.resample_kernel = {}
|
||||||
|
|
||||||
self.f0_methods = {
|
self.f0_gen = Generator(Path(os.environ["rmvpe_root"]), is_half, 0, device, self.window, self.sr)
|
||||||
"crepe": self._get_f0_crepe,
|
|
||||||
"rmvpe": self._get_f0_rmvpe,
|
|
||||||
"fcpe": self._get_f0_fcpe,
|
|
||||||
"pm": self._get_f0_pm,
|
|
||||||
"harvest": self._get_f0_harvest,
|
|
||||||
"dio": self._get_f0_dio,
|
|
||||||
}
|
|
||||||
|
|
||||||
models, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
|
models, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
|
||||||
["assets/hubert/hubert_base.pt"],
|
["assets/hubert/hubert_base.pt"],
|
||||||
@@ -141,7 +135,6 @@ class RVC:
|
|||||||
skip_head: int,
|
skip_head: int,
|
||||||
return_length: int,
|
return_length: int,
|
||||||
f0method: Union[tuple, str],
|
f0method: Union[tuple, str],
|
||||||
inp_f0: Optional[np.ndarray] = None,
|
|
||||||
protect: float = 1.0,
|
protect: float = 1.0,
|
||||||
) -> np.ndarray:
|
) -> np.ndarray:
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
@@ -205,16 +198,11 @@ class RVC:
|
|||||||
f0_extractor_frame = (
|
f0_extractor_frame = (
|
||||||
5120 * ((f0_extractor_frame - 1) // 5120 + 1) - self.window
|
5120 * ((f0_extractor_frame - 1) // 5120 + 1) - self.window
|
||||||
)
|
)
|
||||||
if inp_f0 is not None:
|
pitch, pitchf = self._get_f0(
|
||||||
pitch, pitchf = self._get_f0_post(
|
input_wav[-f0_extractor_frame:],
|
||||||
inp_f0, self.f0_up_key - self.formant_shift
|
self.f0_up_key - self.formant_shift,
|
||||||
)
|
method=f0method,
|
||||||
else:
|
)
|
||||||
pitch, pitchf = self._get_f0(
|
|
||||||
input_wav[-f0_extractor_frame:],
|
|
||||||
self.f0_up_key - self.formant_shift,
|
|
||||||
method=f0method,
|
|
||||||
)
|
|
||||||
shift = block_frame_16k // self.window
|
shift = block_frame_16k // self.window
|
||||||
self.cache_pitch[:-shift] = self.cache_pitch[shift:].clone()
|
self.cache_pitch[:-shift] = self.cache_pitch[shift:].clone()
|
||||||
self.cache_pitchf[:-shift] = self.cache_pitchf[shift:].clone()
|
self.cache_pitchf[:-shift] = self.cache_pitchf[shift:].clone()
|
||||||
@@ -275,89 +263,9 @@ class RVC:
|
|||||||
filter_radius: Optional[Union[int, float]] = None,
|
filter_radius: Optional[Union[int, float]] = None,
|
||||||
method: Literal["crepe", "rmvpe", "fcpe", "pm", "harvest", "dio"] = "fcpe",
|
method: Literal["crepe", "rmvpe", "fcpe", "pm", "harvest", "dio"] = "fcpe",
|
||||||
):
|
):
|
||||||
if method not in self.f0_methods.keys():
|
c, f = self.f0_gen.calculate(x, None, f0_up_key, method, filter_radius)
|
||||||
raise RuntimeError("Not supported f0 method: " + method)
|
if not torch.is_tensor(c):
|
||||||
return self.f0_methods[method](x, f0_up_key, filter_radius)
|
c = torch.from_numpy(c)
|
||||||
|
if not torch.is_tensor(f):
|
||||||
def _get_f0_post(self, f0, f0_up_key):
|
f = torch.from_numpy(f)
|
||||||
f0 *= pow(2, f0_up_key / 12)
|
return c.long().to(self.device), f.float().to(self.device)
|
||||||
if not torch.is_tensor(f0):
|
|
||||||
f0 = torch.from_numpy(f0)
|
|
||||||
f0 = f0.float().to(self.device).squeeze()
|
|
||||||
f0_mel = 1127 * torch.log(1 + f0 / 700)
|
|
||||||
f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * 254 / (
|
|
||||||
self.f0_mel_max - self.f0_mel_min
|
|
||||||
) + 1
|
|
||||||
f0_mel[f0_mel <= 1] = 1
|
|
||||||
f0_mel[f0_mel > 255] = 255
|
|
||||||
f0_coarse = torch.round(f0_mel).long()
|
|
||||||
return f0_coarse, f0
|
|
||||||
|
|
||||||
def _get_f0_pm(self, x, f0_up_key, filter_radius):
|
|
||||||
if not hasattr(self, "pm"):
|
|
||||||
self.pm = PM(hop_length=160, sampling_rate=16000)
|
|
||||||
f0 = self.pm.compute_f0(x.cpu().numpy())
|
|
||||||
return self._get_f0_post(f0, f0_up_key)
|
|
||||||
|
|
||||||
def _get_f0_harvest(self, x, f0_up_key, filter_radius=3):
|
|
||||||
if not hasattr(self, "harvest"):
|
|
||||||
self.harvest = Harvest(
|
|
||||||
self.window,
|
|
||||||
self.f0_min,
|
|
||||||
self.f0_max,
|
|
||||||
self.sr,
|
|
||||||
)
|
|
||||||
if filter_radius is None:
|
|
||||||
filter_radius = 3
|
|
||||||
f0 = self.harvest.compute_f0(x.cpu().numpy(), filter_radius=filter_radius)
|
|
||||||
return self._get_f0_post(f0, f0_up_key)
|
|
||||||
|
|
||||||
def _get_f0_dio(self, x, f0_up_key, filter_radius):
|
|
||||||
if not hasattr(self, "dio"):
|
|
||||||
self.dio = Dio(
|
|
||||||
self.window,
|
|
||||||
self.f0_min,
|
|
||||||
self.f0_max,
|
|
||||||
self.sr,
|
|
||||||
)
|
|
||||||
f0 = self.dio.compute_f0(x.cpu().numpy())
|
|
||||||
return self._get_f0_post(f0, f0_up_key)
|
|
||||||
|
|
||||||
def _get_f0_crepe(self, x, f0_up_key, filter_radius):
|
|
||||||
if hasattr(self, "crepe") == False:
|
|
||||||
self.crepe = CRePE(
|
|
||||||
self.window,
|
|
||||||
self.f0_min,
|
|
||||||
self.f0_max,
|
|
||||||
self.sr,
|
|
||||||
self.device,
|
|
||||||
)
|
|
||||||
f0 = self.crepe.compute_f0(x)
|
|
||||||
return self._get_f0_post(f0, f0_up_key)
|
|
||||||
|
|
||||||
def _get_f0_rmvpe(self, x, f0_up_key, filter_radius=0.03):
|
|
||||||
if hasattr(self, "rmvpe") == False:
|
|
||||||
self.rmvpe = RMVPE(
|
|
||||||
"%s/rmvpe.pt" % os.environ["rmvpe_root"],
|
|
||||||
is_half=self.is_half,
|
|
||||||
device=self.device,
|
|
||||||
use_jit=self.use_jit,
|
|
||||||
)
|
|
||||||
if filter_radius is None:
|
|
||||||
filter_radius = 0.03
|
|
||||||
return self._get_f0_post(
|
|
||||||
self.rmvpe.compute_f0(x, filter_radius=filter_radius),
|
|
||||||
f0_up_key,
|
|
||||||
)
|
|
||||||
|
|
||||||
def _get_f0_fcpe(self, x, f0_up_key, filter_radius):
|
|
||||||
if hasattr(self, "fcpe") == False:
|
|
||||||
self.fcpe = FCPE(
|
|
||||||
160,
|
|
||||||
self.f0_min,
|
|
||||||
self.f0_max,
|
|
||||||
16000,
|
|
||||||
self.device,
|
|
||||||
)
|
|
||||||
f0 = self.fcpe.compute_f0(x)
|
|
||||||
return self._get_f0_post(f0, f0_up_key)
|
|
||||||
|
|||||||
@@ -4,12 +4,12 @@ from typing import Optional, Union, Literal, Tuple
|
|||||||
|
|
||||||
from numba import jit
|
from numba import jit
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
import torch
|
||||||
|
|
||||||
|
|
||||||
@jit(nopython=True)
|
@jit(nopython=True)
|
||||||
def post_process(
|
def post_process(
|
||||||
sr: int,
|
tf0: int, # 每秒f0点数
|
||||||
window: int,
|
|
||||||
f0: np.ndarray,
|
f0: np.ndarray,
|
||||||
f0_up_key: int,
|
f0_up_key: int,
|
||||||
manual_x_pad: int,
|
manual_x_pad: int,
|
||||||
@@ -19,7 +19,6 @@ def post_process(
|
|||||||
) -> Tuple[np.ndarray, np.ndarray]:
|
) -> Tuple[np.ndarray, np.ndarray]:
|
||||||
f0 = np.multiply(f0, pow(2, f0_up_key / 12))
|
f0 = np.multiply(f0, pow(2, f0_up_key / 12))
|
||||||
# with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
|
# with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
|
||||||
tf0 = sr // window # 每秒f0点数
|
|
||||||
if manual_f0 is not None:
|
if manual_f0 is not None:
|
||||||
delta_t = np.round(
|
delta_t = np.round(
|
||||||
(manual_f0[:, 0].max() - manual_f0[:, 0].min()) * tf0 + 1
|
(manual_f0[:, 0].max() - manual_f0[:, 0].min()) * tf0 + 1
|
||||||
@@ -62,12 +61,14 @@ class Generator(object):
|
|||||||
def calculate(
|
def calculate(
|
||||||
self,
|
self,
|
||||||
x: np.ndarray,
|
x: np.ndarray,
|
||||||
p_len: int,
|
p_len: Optional[int],
|
||||||
f0_up_key: int,
|
f0_up_key: int,
|
||||||
f0_method: Literal["pm", "dio", "harvest", "crepe", "rmvpe", "fcpe"],
|
f0_method: Literal["pm", "dio", "harvest", "crepe", "rmvpe", "fcpe"],
|
||||||
filter_radius: Optional[Union[int, float]],
|
filter_radius: Optional[Union[int, float]],
|
||||||
manual_f0: Optional[Union[np.ndarray, list]] = None,
|
manual_f0: Optional[Union[np.ndarray, list]] = None,
|
||||||
) -> Tuple[np.ndarray, np.ndarray]:
|
) -> Tuple[np.ndarray, np.ndarray]:
|
||||||
|
if torch.is_tensor(x):
|
||||||
|
x = x.cpu().numpy()
|
||||||
f0_min = 50
|
f0_min = 50
|
||||||
f0_max = 1100
|
f0_max = 1100
|
||||||
if f0_method == "pm":
|
if f0_method == "pm":
|
||||||
@@ -130,8 +131,7 @@ class Generator(object):
|
|||||||
raise ValueError(f"f0 method {f0_method} has not yet been supported")
|
raise ValueError(f"f0 method {f0_method} has not yet been supported")
|
||||||
|
|
||||||
return post_process(
|
return post_process(
|
||||||
self.sr,
|
self.sr // self.window,
|
||||||
self.window,
|
|
||||||
f0,
|
f0,
|
||||||
f0_up_key,
|
f0_up_key,
|
||||||
self.x_pad,
|
self.x_pad,
|
||||||
|
|||||||
@@ -31,7 +31,7 @@ def load_synthesizer(
|
|||||||
pth_path: torch.serialization.FILE_LIKE, device=torch.device("cpu")
|
pth_path: torch.serialization.FILE_LIKE, device=torch.device("cpu")
|
||||||
):
|
):
|
||||||
return get_synthesizer(
|
return get_synthesizer(
|
||||||
torch.load(pth_path, map_location=torch.device("cpu")),
|
torch.load(pth_path, map_location=torch.device("cpu"), weights_only=True),
|
||||||
device,
|
device,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
6
web.py
6
web.py
@@ -964,9 +964,7 @@ with gr.Blocks(title="RVC WebUI") as app:
|
|||||||
"Select the pitch extraction algorithm ('pm': faster extraction but lower-quality speech; 'harvest': better bass but extremely slow; 'crepe': better quality but GPU intensive), 'rmvpe': best quality, and little GPU requirement"
|
"Select the pitch extraction algorithm ('pm': faster extraction but lower-quality speech; 'harvest': better bass but extremely slow; 'crepe': better quality but GPU intensive), 'rmvpe': best quality, and little GPU requirement"
|
||||||
),
|
),
|
||||||
choices=(
|
choices=(
|
||||||
["pm", "harvest", "crepe", "rmvpe"]
|
["pm", "dio", "harvest", "crepe", "rmvpe", "fcpe"]
|
||||||
if config.dml == False
|
|
||||||
else ["pm", "harvest", "rmvpe"]
|
|
||||||
),
|
),
|
||||||
value="rmvpe",
|
value="rmvpe",
|
||||||
interactive=True,
|
interactive=True,
|
||||||
@@ -1209,7 +1207,7 @@ with gr.Blocks(title="RVC WebUI") as app:
|
|||||||
label=i18n(
|
label=i18n(
|
||||||
"Select the pitch extraction algorithm: when extracting singing, you can use 'pm' to speed up. For high-quality speech with fast performance, but worse CPU usage, you can use 'dio'. 'harvest' results in better quality but is slower. 'rmvpe' has the best results and consumes less CPU/GPU"
|
"Select the pitch extraction algorithm: when extracting singing, you can use 'pm' to speed up. For high-quality speech with fast performance, but worse CPU usage, you can use 'dio'. 'harvest' results in better quality but is slower. 'rmvpe' has the best results and consumes less CPU/GPU"
|
||||||
),
|
),
|
||||||
choices=["pm", "harvest", "dio", "rmvpe"],
|
choices=["pm", "dio", "harvest", "crepe", "rmvpe", "fcpe"],
|
||||||
value="rmvpe",
|
value="rmvpe",
|
||||||
interactive=True,
|
interactive=True,
|
||||||
)
|
)
|
||||||
|
|||||||
Reference in New Issue
Block a user