1
0
mirror of https://github.com/fumiama/Retrieval-based-Voice-Conversion-WebUI.git synced 2026-06-05 01:10:22 +08:00

optimize(f0): move fcpe into rvc.f0

This commit is contained in:
源文雨
2024-06-14 21:33:46 +09:00
parent 24dbc5edd2
commit 3b7d7c6d1a
6 changed files with 70 additions and 32 deletions

View File

@@ -2,8 +2,9 @@ from .f0 import F0Predictor
from .crepe import CRePE
from .dio import Dio
from .fcpe import FCPE
from .harvest import Harvest
from .pm import PM
from .rmvpe import RMVPE
__all__ = ["F0Predictor", "CRePE", "Dio", "Harvest", "PM", "RMVPE"]
__all__ = ["F0Predictor", "CRePE", "Dio", "FCPE", "Harvest", "PM", "RMVPE"]

View File

@@ -16,6 +16,8 @@ class CRePE(F0Predictor):
sampling_rate=44100,
device="cpu",
):
if "privateuseone" in str(device):
device = "cpu"
super().__init__(
hop_length,
f0_min,
@@ -32,11 +34,13 @@ class CRePE(F0Predictor):
):
if p_len is None:
p_len = wav.shape[0] // self.hop_length
if not torch.is_tensor(wav):
wav = torch.from_numpy(wav)
# Pick a batch size that doesn't cause memory errors on your gpu
batch_size = 512
# Compute pitch using device 'device'
f0, pd = torchcrepe.predict(
torch.tensor(np.copy(wav))[None].float().to(self.device),
wav.float().to(self.device).unsqueeze(dim=0),
self.sampling_rate,
self.hop_length,
self.f0_min,

45
rvc/f0/fcpe.py Normal file
View File

@@ -0,0 +1,45 @@
from typing import Any, Optional, Union
import numpy as np
import torch
from torchfcpe import spawn_bundled_infer_model
from .f0 import F0Predictor
class FCPE(F0Predictor):
    """F0 predictor that delegates pitch estimation to torchfcpe's bundled model.

    The constructor arguments are forwarded unchanged to ``F0Predictor``;
    the inference model is created once per instance and reused by
    ``compute_f0``.
    """

    def __init__(
        self,
        hop_length=512,
        f0_min=50,
        f0_max=1100,
        sampling_rate=44100,
        device="cpu",
    ):
        super().__init__(
            hop_length,
            f0_min,
            f0_max,
            sampling_rate,
            device,
        )
        # NOTE(review): relies on the base class having stored ``device`` as
        # ``self.device`` -- confirm against F0Predictor.
        self.model = spawn_bundled_infer_model(self.device)

    def compute_f0(
        self,
        wav: Union[np.ndarray[Any, np.dtype], torch.Tensor],
        p_len: Optional[int] = None,
        filter_radius: Optional[Union[int, float]] = 0.006,
    ):
        """Estimate the F0 contour of ``wav``.

        Args:
            wav: Input waveform as a NumPy array or a torch tensor. A batch
                dimension is prepended before inference, so a single 1-D
                signal is assumed -- TODO confirm with callers.
            p_len: Number of frames to resize the result to. Defaults to
                ``wav.shape[0] // self.hop_length``.
            filter_radius: Forwarded to the model as its ``threshold``
                argument. ``None`` falls back to the default of 0.006.

        Returns:
            The first element of what ``_interpolate_f0`` returns for the
            resized contour (presumably the interpolated F0 array; verify
            against F0Predictor).
        """
        if p_len is None:
            p_len = wav.shape[0] // self.hop_length
        if not torch.is_tensor(wav):
            wav = torch.from_numpy(wav)

        # The parameter is declared Optional; never hand None to the model,
        # use the documented default instead.
        threshold = 0.006 if filter_radius is None else filter_radius

        prediction = self.model.infer(
            wav.float().to(self.device).unsqueeze(0),
            sr=self.sampling_rate,
            decoder_mode="local_argmax",
            threshold=threshold,
        )
        # Flatten rather than squeeze(): with a batch of one this yields the
        # same 1-D contour, but a single-frame result stays 1-D instead of
        # collapsing to a 0-d array that the resize step cannot treat as a
        # sequence.
        f0 = prediction.reshape(-1).cpu().numpy()
        return self._interpolate_f0(self._resize_f0(f0, p_len))[0]