1
0
mirror of https://github.com/fumiama/Retrieval-based-Voice-Conversion-WebUI.git synced 2026-06-05 01:10:22 +08:00

optimize(f0): move fcpe into rvc.f0

This commit is contained in:
源文雨
2024-06-14 21:33:46 +09:00
parent 24dbc5edd2
commit 3b7d7c6d1a
6 changed files with 70 additions and 32 deletions

View File

@@ -13,7 +13,6 @@ import scipy.signal as signal
import torch import torch
import torch.nn as nn import torch.nn as nn
import torch.nn.functional as F import torch.nn.functional as F
import torchcrepe
from torchaudio.transforms import Resample from torchaudio.transforms import Resample
from rvc.synthesizer import load_synthesizer from rvc.synthesizer import load_synthesizer
@@ -323,20 +322,17 @@ class RVC:
def get_f0_fcpe(self, x, f0_up_key): def get_f0_fcpe(self, x, f0_up_key):
if hasattr(self, "model_fcpe") == False: if hasattr(self, "model_fcpe") == False:
from torchfcpe import spawn_bundled_infer_model from rvc.f0 import FCPE
printt("Loading fcpe model") printt("Loading fcpe model")
if "privateuseone" in str(self.device): self.model_fcpe = FCPE(
self.device_fcpe = "cpu" 160,
else: self.f0_min,
self.device_fcpe = self.device self.f0_max,
self.model_fcpe = spawn_bundled_infer_model(self.device_fcpe) 16000,
f0 = self.model_fcpe.infer( self.device,
x.to(self.device_fcpe).unsqueeze(0).float(), )
sr=16000, f0 = self.model_fcpe.compute_f0(x)
decoder_mode="local_argmax",
threshold=0.006,
)
f0 *= pow(2, f0_up_key / 12) f0 *= pow(2, f0_up_key / 12)
return self.get_f0_post(f0) return self.get_f0_post(f0)

View File

@@ -14,7 +14,7 @@ import torch
import torch.nn.functional as F import torch.nn.functional as F
from scipy import signal from scipy import signal
from rvc.f0 import PM, Harvest, RMVPE, CRePE, Dio from rvc.f0 import PM, Harvest, RMVPE, CRePE, Dio, FCPE
now_dir = os.getcwd() now_dir = os.getcwd()
sys.path.append(now_dir) sys.path.append(now_dir)
@@ -118,21 +118,15 @@ class Pipeline(object):
elif f0_method == "fcpe": elif f0_method == "fcpe":
if not hasattr(self, "model_fcpe"): if not hasattr(self, "model_fcpe"):
from torchfcpe import spawn_bundled_infer_model
logger.info("Loading fcpe model") logger.info("Loading fcpe model")
self.model_fcpe = spawn_bundled_infer_model(self.device) self.model_fcpe = FCPE(
f0 = ( self.window,
self.model_fcpe.infer( f0_min,
torch.from_numpy(x).to(self.device).unsqueeze(0).float(), f0_max,
sr=16000, self.sr,
decoder_mode="local_argmax", self.device,
threshold=0.006,
) )
.squeeze() f0 = self.model_fcpe.compute_f0(x, p_len=p_len)
.cpu()
.numpy()
)
f0 *= pow(2, f0_up_key / 12) f0 *= pow(2, f0_up_key / 12)
# with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()])) # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))

View File

@@ -2,8 +2,9 @@ from .f0 import F0Predictor
from .crepe import CRePE from .crepe import CRePE
from .dio import Dio from .dio import Dio
from .fcpe import FCPE
from .harvest import Harvest from .harvest import Harvest
from .pm import PM from .pm import PM
from .rmvpe import RMVPE from .rmvpe import RMVPE
__all__ = ["F0Predictor", "CRePE", "Dio", "Harvest", "PM", "RMVPE"] __all__ = ["F0Predictor", "CRePE", "Dio", "FCPE", "Harvest", "PM", "RMVPE"]

View File

@@ -16,6 +16,8 @@ class CRePE(F0Predictor):
sampling_rate=44100, sampling_rate=44100,
device="cpu", device="cpu",
): ):
if "privateuseone" in str(device):
device = "cpu"
super().__init__( super().__init__(
hop_length, hop_length,
f0_min, f0_min,
@@ -32,11 +34,13 @@ class CRePE(F0Predictor):
): ):
if p_len is None: if p_len is None:
p_len = wav.shape[0] // self.hop_length p_len = wav.shape[0] // self.hop_length
if not torch.is_tensor(wav):
wav = torch.from_numpy(wav)
# Pick a batch size that doesn't cause memory errors on your gpu # Pick a batch size that doesn't cause memory errors on your gpu
batch_size = 512 batch_size = 512
# Compute pitch using device 'device' # Compute pitch using device 'device'
f0, pd = torchcrepe.predict( f0, pd = torchcrepe.predict(
torch.tensor(np.copy(wav))[None].float().to(self.device), wav.float().to(self.device).unsqueeze(dim=0),
self.sampling_rate, self.sampling_rate,
self.hop_length, self.hop_length,
self.f0_min, self.f0_min,

45
rvc/f0/fcpe.py Normal file
View File

@@ -0,0 +1,45 @@
from typing import Any, Optional, Union
import numpy as np
import torch
from torchfcpe import spawn_bundled_infer_model
from .f0 import F0Predictor
class FCPE(F0Predictor):
    """F0 predictor that wraps the bundled ``torchfcpe`` inference model."""

    def __init__(
        self,
        hop_length=512,
        f0_min=50,
        f0_max=1100,
        sampling_rate=44100,
        device="cpu",
    ):
        """Store the predictor settings and load the bundled FCPE model.

        Args:
            hop_length: Samples per output frame; used to derive the default
                frame count in :meth:`compute_f0`.
            f0_min: Lower pitch bound forwarded to the base predictor.
            f0_max: Upper pitch bound forwarded to the base predictor.
            sampling_rate: Sample rate of the audio passed to
                :meth:`compute_f0`; forwarded to the model as ``sr``.
            device: Torch device (or device string) to run the model on.
        """
        # Same guard CRePE applies: "privateuseone" devices (e.g. DirectML)
        # fall back to CPU. The caller-side code this class replaces did the
        # same before spawning the FCPE model.
        if "privateuseone" in str(device):
            device = "cpu"
        super().__init__(
            hop_length,
            f0_min,
            f0_max,
            sampling_rate,
            device,
        )
        self.model = spawn_bundled_infer_model(self.device)

    def compute_f0(
        self,
        wav: Union[np.ndarray[Any, np.dtype], torch.Tensor],
        p_len: Optional[int] = None,
        filter_radius: Optional[Union[int, float]] = 0.006,
    ):
        """Estimate the F0 contour of ``wav``.

        Args:
            wav: Audio samples as a NumPy array or a torch tensor. The first
                axis is taken as the sample axis when deriving ``p_len``.
            p_len: Number of output frames. Defaults to
                ``wav.shape[0] // hop_length``.
            filter_radius: Passed to the model as its ``threshold`` argument.
                ``None`` falls back to the default of 0.006.

        Returns:
            The first element of ``_interpolate_f0`` applied to the model
            output after it has been resized to ``p_len`` frames.
        """
        if p_len is None:
            p_len = wav.shape[0] // self.hop_length
        if filter_radius is None:
            # The parameter is annotated Optional; never forward None as the
            # model threshold.
            filter_radius = 0.006
        if not torch.is_tensor(wav):
            wav = torch.from_numpy(wav)
        f0 = (
            self.model.infer(
                wav.float().to(self.device).unsqueeze(0),
                sr=self.sampling_rate,
                decoder_mode="local_argmax",
                threshold=filter_radius,
            )
            .squeeze()
            .cpu()
            .numpy()
        )
        return self._interpolate_f0(self._resize_f0(f0, p_len))[0]

4
web.py
View File

@@ -861,9 +861,7 @@ with gr.Blocks(title="RVC WebUI") as app:
"Select the pitch extraction algorithm ('pm': faster extraction but lower-quality speech; 'harvest': better bass but extremely slow; 'crepe': better quality but GPU intensive), 'rmvpe': best quality, and little GPU requirement" "Select the pitch extraction algorithm ('pm': faster extraction but lower-quality speech; 'harvest': better bass but extremely slow; 'crepe': better quality but GPU intensive), 'rmvpe': best quality, and little GPU requirement"
), ),
choices=( choices=(
["pm", "dio", "harvest", "rmvpe"] ["pm", "dio", "harvest", "crepe", "rmvpe", "fcpe"]
if config.dml
else ["pm", "dio", "harvest", "crepe", "rmvpe"]
), ),
value="rmvpe", value="rmvpe",
interactive=True, interactive=True,