From 3b7d7c6d1a0fe1c104d8c6822ea20543ac064e18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=BA=90=E6=96=87=E9=9B=A8?= <41315874+fumiama@users.noreply.github.com> Date: Fri, 14 Jun 2024 21:33:46 +0900 Subject: [PATCH] optimize(f0): move fcpe into rvc.f0 --- infer/lib/rtrvc.py | 22 ++++++++---------- infer/modules/vc/pipeline.py | 22 +++++++----------- rvc/f0/__init__.py | 3 ++- rvc/f0/crepe.py | 6 ++++- rvc/f0/fcpe.py | 45 ++++++++++++++++++++++++++++++++++++ web.py | 4 +--- 6 files changed, 70 insertions(+), 32 deletions(-) create mode 100644 rvc/f0/fcpe.py diff --git a/infer/lib/rtrvc.py b/infer/lib/rtrvc.py index 10e2851..77f8db0 100644 --- a/infer/lib/rtrvc.py +++ b/infer/lib/rtrvc.py @@ -13,7 +13,6 @@ import scipy.signal as signal import torch import torch.nn as nn import torch.nn.functional as F -import torchcrepe from torchaudio.transforms import Resample from rvc.synthesizer import load_synthesizer @@ -323,20 +322,17 @@ class RVC: def get_f0_fcpe(self, x, f0_up_key): if hasattr(self, "model_fcpe") == False: - from torchfcpe import spawn_bundled_infer_model + from rvc.f0 import FCPE printt("Loading fcpe model") - if "privateuseone" in str(self.device): - self.device_fcpe = "cpu" - else: - self.device_fcpe = self.device - self.model_fcpe = spawn_bundled_infer_model(self.device_fcpe) - f0 = self.model_fcpe.infer( - x.to(self.device_fcpe).unsqueeze(0).float(), - sr=16000, - decoder_mode="local_argmax", - threshold=0.006, - ) + self.model_fcpe = FCPE( + 160, + self.f0_min, + self.f0_max, + 16000, + self.device, + ) + f0 = self.model_fcpe.compute_f0(x) f0 *= pow(2, f0_up_key / 12) return self.get_f0_post(f0) diff --git a/infer/modules/vc/pipeline.py b/infer/modules/vc/pipeline.py index 12ff383..4f4a401 100644 --- a/infer/modules/vc/pipeline.py +++ b/infer/modules/vc/pipeline.py @@ -14,7 +14,7 @@ import torch import torch.nn.functional as F from scipy import signal -from rvc.f0 import PM, Harvest, RMVPE, CRePE, Dio +from rvc.f0 import PM, Harvest, RMVPE, CRePE, Dio, 
FCPE now_dir = os.getcwd() sys.path.append(now_dir) @@ -118,21 +118,15 @@ class Pipeline(object): elif f0_method == "fcpe": if not hasattr(self, "model_fcpe"): - from torchfcpe import spawn_bundled_infer_model - logger.info("Loading fcpe model") - self.model_fcpe = spawn_bundled_infer_model(self.device) - f0 = ( - self.model_fcpe.infer( - torch.from_numpy(x).to(self.device).unsqueeze(0).float(), - sr=16000, - decoder_mode="local_argmax", - threshold=0.006, + self.model_fcpe = FCPE( + self.window, + f0_min, + f0_max, + self.sr, + self.device, ) - .squeeze() - .cpu() - .numpy() - ) + f0 = self.model_fcpe.compute_f0(x, p_len=p_len) f0 *= pow(2, f0_up_key / 12) # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()])) diff --git a/rvc/f0/__init__.py b/rvc/f0/__init__.py index ec38608..dc91d96 100644 --- a/rvc/f0/__init__.py +++ b/rvc/f0/__init__.py @@ -2,8 +2,9 @@ from .f0 import F0Predictor from .crepe import CRePE from .dio import Dio +from .fcpe import FCPE from .harvest import Harvest from .pm import PM from .rmvpe import RMVPE -__all__ = ["F0Predictor", "CRePE", "Dio", "Harvest", "PM", "RMVPE"] +__all__ = ["F0Predictor", "CRePE", "Dio", "FCPE", "Harvest", "PM", "RMVPE"] diff --git a/rvc/f0/crepe.py b/rvc/f0/crepe.py index 559c9c2..c159e9d 100644 --- a/rvc/f0/crepe.py +++ b/rvc/f0/crepe.py @@ -16,6 +16,8 @@ class CRePE(F0Predictor): sampling_rate=44100, device="cpu", ): + if "privateuseone" in str(device): + device = "cpu" super().__init__( hop_length, f0_min, @@ -32,11 +34,13 @@ class CRePE(F0Predictor): ): if p_len is None: p_len = wav.shape[0] // self.hop_length + if not torch.is_tensor(wav): + wav = torch.from_numpy(wav) # Pick a batch size that doesn't cause memory errors on your gpu batch_size = 512 # Compute pitch using device 'device' f0, pd = torchcrepe.predict( - torch.tensor(np.copy(wav))[None].float().to(self.device), + wav.float().to(self.device).unsqueeze(dim=0), self.sampling_rate, self.hop_length, self.f0_min, diff --git 
a/rvc/f0/fcpe.py b/rvc/f0/fcpe.py
new file mode 100644
index 0000000..d992483
--- /dev/null
+++ b/rvc/f0/fcpe.py
@@ -0,0 +1,57 @@
+from typing import Any, Optional, Union
+
+import numpy as np
+import torch
+from torchfcpe import spawn_bundled_infer_model
+
+from .f0 import F0Predictor
+
+
+class FCPE(F0Predictor):
+    """F0 predictor that runs the torchfcpe bundled inference model."""
+
+    def __init__(
+        self,
+        hop_length=512,
+        f0_min=50,
+        f0_max=1100,
+        sampling_rate=44100,
+        device="cpu",
+    ):
+        # Run the model on the CPU for "privateuseone" devices, matching the
+        # guard in CRePE, so callers can pass their device unconditionally.
+        if "privateuseone" in str(device):
+            device = "cpu"
+        super().__init__(
+            hop_length,
+            f0_min,
+            f0_max,
+            sampling_rate,
+            device,
+        )
+
+        self.model = spawn_bundled_infer_model(self.device)
+
+    def compute_f0(
+        self,
+        wav: np.ndarray[Any, np.dtype],
+        p_len: Optional[int] = None,
+        filter_radius: Optional[Union[int, float]] = 0.006,
+    ):
+        """Estimate F0 for ``wav`` with the FCPE model.
+
+        ``wav`` may be a NumPy array or a torch tensor; ``p_len`` defaults to
+        ``wav.shape[0] // self.hop_length``, and ``filter_radius`` is passed
+        to the model as ``threshold``. Output goes through ``_resize_f0``.
+        """
+        if p_len is None:
+            p_len = wav.shape[0] // self.hop_length
+        if not torch.is_tensor(wav):
+            wav = torch.from_numpy(wav)
+        f0 = self.model.infer(
+            wav.float().to(self.device).unsqueeze(0),
+            sr=self.sampling_rate,
+            decoder_mode="local_argmax",
+            threshold=filter_radius,
+        ).squeeze().cpu().numpy()
+        return self._interpolate_f0(self._resize_f0(f0, p_len))[0]
diff --git a/web.py b/web.py index 689b520..b747215 100644 --- a/web.py +++ b/web.py @@ -861,9 +861,7 @@ with gr.Blocks(title="RVC WebUI") as app: "Select the pitch extraction algorithm ('pm': faster extraction but lower-quality speech; 'harvest': better bass but extremely slow; 'crepe': better quality but GPU intensive), 'rmvpe': best quality, and little GPU requirement" ), choices=( - ["pm", "dio", "harvest", "rmvpe"] - if config.dml - else ["pm", "dio", "harvest", "crepe", "rmvpe"] + ["pm", "dio", "harvest", "crepe", "rmvpe", "fcpe"] ), value="rmvpe", interactive=True,