1
0
mirror of https://github.com/fumiama/Retrieval-based-Voice-Conversion-WebUI.git synced 2026-06-09 12:30:38 +08:00

optimize(f0): move some f0s into rvc.f0

This commit is contained in:
源文雨
2024-06-13 00:10:22 +09:00
parent d44a942882
commit 77b371d615
15 changed files with 91 additions and 185 deletions

View File

@@ -2,7 +2,7 @@ import torch
def get_rmvpe(model_path="assets/rmvpe/rmvpe.pt", device=torch.device("cpu")): def get_rmvpe(model_path="assets/rmvpe/rmvpe.pt", device=torch.device("cpu")):
from infer.lib.rmvpe import E2E from rvc.f0.e2e import E2E
model = E2E(4, 1, (2, 2)) model = E2E(4, 1, (2, 2))
ckpt = torch.load(model_path, map_location=device) ckpt = torch.load(model_path, map_location=device)

View File

@@ -6,17 +6,6 @@ import torch
from infer.lib import jit from infer.lib import jit
try:
# Fix "Torch not compiled with CUDA enabled"
import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import
if torch.xpu.is_available():
from infer.modules.ipex import ipex_init
ipex_init()
except Exception: # pylint: disable=broad-exception-caught
pass
import torch.nn as nn
import torch.nn.functional as F import torch.nn.functional as F
import logging import logging
@@ -127,13 +116,13 @@ class RMVPE:
return hidden[:, :n_frames] return hidden[:, :n_frames]
def decode(self, hidden, thred=0.03): def decode(self, hidden, thred=0.03):
cents_pred = self.to_local_average_cents(hidden, thred=thred) cents_pred = self.to_local_average_cents(hidden, threshold=thred)
f0 = 10 * (2 ** (cents_pred / 1200)) f0 = 10 * (2 ** (cents_pred / 1200))
f0[f0 == 10] = 0 f0[f0 == 10] = 0
# f0 = np.array([10 * (2 ** (cent_pred / 1200)) if cent_pred else 0 for cent_pred in cents_pred]) # f0 = np.array([10 * (2 ** (cent_pred / 1200)) if cent_pred else 0 for cent_pred in cents_pred])
return f0 return f0
def infer_from_audio(self, audio, thred=0.03): def infer_from_audio(self, audio, threshold=0.03):
# torch.cuda.synchronize() # torch.cuda.synchronize()
# t0 = ttime() # t0 = ttime()
if not torch.is_tensor(audio): if not torch.is_tensor(audio):
@@ -155,17 +144,15 @@ class RMVPE:
if self.is_half == True: if self.is_half == True:
hidden = hidden.astype("float32") hidden = hidden.astype("float32")
f0 = self.decode(hidden, thred=thred) f0 = self.decode(hidden, thred=threshold)
# torch.cuda.synchronize() # torch.cuda.synchronize()
# t3 = ttime() # t3 = ttime()
# print("hmvpe:%s\t%s\t%s\t%s"%(t1-t0,t2-t1,t3-t2,t3-t0)) # print("hmvpe:%s\t%s\t%s\t%s"%(t1-t0,t2-t1,t3-t2,t3-t0))
return f0 return f0
def to_local_average_cents(self, salience, thred=0.05): def to_local_average_cents(self, salience, threshold=0.05):
# t0 = ttime()
center = np.argmax(salience, axis=1) # 帧长#index center = np.argmax(salience, axis=1) # 帧长#index
salience = np.pad(salience, ((0, 0), (4, 4))) # 帧长,368 salience = np.pad(salience, ((0, 0), (4, 4))) # 帧长,368
# t1 = ttime()
center += 4 center += 4
todo_salience = [] todo_salience = []
todo_cents_mapping = [] todo_cents_mapping = []
@@ -174,15 +161,11 @@ class RMVPE:
for idx in range(salience.shape[0]): for idx in range(salience.shape[0]):
todo_salience.append(salience[:, starts[idx] : ends[idx]][idx]) todo_salience.append(salience[:, starts[idx] : ends[idx]][idx])
todo_cents_mapping.append(self.cents_mapping[starts[idx] : ends[idx]]) todo_cents_mapping.append(self.cents_mapping[starts[idx] : ends[idx]])
# t2 = ttime()
todo_salience = np.array(todo_salience) # 帧长9 todo_salience = np.array(todo_salience) # 帧长9
todo_cents_mapping = np.array(todo_cents_mapping) # 帧长9 todo_cents_mapping = np.array(todo_cents_mapping) # 帧长9
product_sum = np.sum(todo_salience * todo_cents_mapping, 1) product_sum = np.sum(todo_salience * todo_cents_mapping, 1)
weight_sum = np.sum(todo_salience, 1) # 帧长 weight_sum = np.sum(todo_salience, 1) # 帧长
devided = product_sum / weight_sum # 帧长 devided = product_sum / weight_sum # 帧长
# t3 = ttime()
maxx = np.max(salience, axis=1) # 帧长 maxx = np.max(salience, axis=1) # 帧长
devided[maxx <= thred] = 0 devided[maxx <= threshold] = 0
# t4 = ttime()
# print("decode:%s\t%s\t%s\t%s" % (t1 - t0, t2 - t1, t3 - t2, t4 - t3))
return devided return devided

View File

@@ -89,7 +89,7 @@ class FeatureInput(object):
self.model_rmvpe = RMVPE( self.model_rmvpe = RMVPE(
"assets/rmvpe/rmvpe.pt", is_half=False, device="cpu" "assets/rmvpe/rmvpe.pt", is_half=False, device="cpu"
) )
f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) f0 = self.model_rmvpe.infer_from_audio(x, threshold=0.03)
return f0 return f0
def coarse_f0(self, f0): def coarse_f0(self, f0):

View File

@@ -52,7 +52,7 @@ class FeatureInput(object):
self.model_rmvpe = RMVPE( self.model_rmvpe = RMVPE(
"assets/rmvpe/rmvpe.pt", is_half=is_half, device="cuda" "assets/rmvpe/rmvpe.pt", is_half=is_half, device="cuda"
) )
f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) f0 = self.model_rmvpe.infer_from_audio(x, threshold=0.03)
return f0 return f0
def coarse_f0(self, f0): def coarse_f0(self, f0):

View File

@@ -50,7 +50,7 @@ class FeatureInput(object):
self.model_rmvpe = RMVPE( self.model_rmvpe = RMVPE(
"assets/rmvpe/rmvpe.pt", is_half=False, device=device "assets/rmvpe/rmvpe.pt", is_half=False, device=device
) )
f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) f0 = self.model_rmvpe.infer_from_audio(x, threshold=0.03)
return f0 return f0
def coarse_f0(self, f0): def coarse_f0(self, f0):

View File

@@ -47,7 +47,7 @@ from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter from torch.utils.tensorboard import SummaryWriter
from rvc import utils from rvc.layers import utils
from infer.lib.train.data_utils import ( from infer.lib.train.data_utils import (
DistributedBucketSampler, DistributedBucketSampler,
TextAudioCollate, TextAudioCollate,

View File

@@ -5,40 +5,24 @@ import logging
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
from functools import lru_cache
from time import time from time import time
import faiss import faiss
import librosa import librosa
import numpy as np import numpy as np
import parselmouth
import pyworld import pyworld
import torch import torch
import torch.nn.functional as F import torch.nn.functional as F
import torchcrepe import torchcrepe
from scipy import signal from scipy import signal
from rvc.f0 import PM, Harvest
now_dir = os.getcwd() now_dir = os.getcwd()
sys.path.append(now_dir) sys.path.append(now_dir)
bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000) bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)
input_audio_path2wav = {}
@lru_cache
def cache_harvest_f0(f0_cache_key, fs, f0max, f0min, frame_period):
audio = input_audio_path2wav[f0_cache_key]
f0, t = pyworld.harvest(
audio,
fs=fs,
f0_ceil=f0max,
f0_floor=f0min,
frame_period=frame_period,
)
f0 = pyworld.stonemask(audio, f0, t, fs)
return f0
def change_rms(data1, sr1, data2, sr2, rate): # 1是输入音频2是输出音频,rate是2的占比 def change_rms(data1, sr1, data2, sr2, rate): # 1是输入音频2是输出音频,rate是2的占比
# print(data1.max(),data2.max()) # print(data1.max(),data2.max())
@@ -90,37 +74,18 @@ class Pipeline(object):
filter_radius, filter_radius,
inp_f0=None, inp_f0=None,
): ):
global input_audio_path2wav
time_step = self.window / self.sr * 1000
f0_min = 50 f0_min = 50
f0_max = 1100 f0_max = 1100
f0_mel_min = 1127 * np.log(1 + f0_min / 700) f0_mel_min = 1127 * np.log(1 + f0_min / 700)
f0_mel_max = 1127 * np.log(1 + f0_max / 700) f0_mel_max = 1127 * np.log(1 + f0_max / 700)
if f0_method == "pm": if f0_method == "pm":
f0 = ( if not hasattr(self, "pm"):
parselmouth.Sound(x, self.sr) self.pm = PM(self.window, f0_min, f0_max, self.sr)
.to_pitch_ac( f0 = self.pm.compute_f0(x, p_len=p_len)
time_step=time_step / 1000,
voicing_threshold=0.6,
pitch_floor=f0_min,
pitch_ceiling=f0_max,
)
.selected_array["frequency"]
)
pad_size = (p_len - len(f0) + 1) // 2
if pad_size > 0 or p_len - len(f0) - pad_size > 0:
f0 = np.pad(
f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
)
elif f0_method == "harvest": elif f0_method == "harvest":
from hashlib import md5 if not hasattr(self, "harvest"):
self.harvest = Harvest(self.window, f0_min, f0_max, self.sr)
f0_cache_key = md5(x.tobytes()).digest() f0 = self.harvest.compute_f0(x, p_len=p_len, filter_radius=filter_radius)
input_audio_path2wav[f0_cache_key] = x.astype(np.double)
f0 = cache_harvest_f0(f0_cache_key, self.sr, f0_max, f0_min, 10)
del input_audio_path2wav[f0_cache_key]
if filter_radius > 2:
f0 = signal.medfilt(f0, 3)
elif f0_method == "crepe": elif f0_method == "crepe":
model = "full" model = "full"
# Pick a batch size that doesn't cause memory errors on your gpu # Pick a batch size that doesn't cause memory errors on your gpu
@@ -155,7 +120,7 @@ class Pipeline(object):
device=self.device, device=self.device,
# use_jit=self.config.use_jit, # use_jit=self.config.use_jit,
) )
f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) f0 = self.model_rmvpe.infer_from_audio(x, threshold=0.03)
if "privateuseone" in str(self.device): # clean ortruntime memory if "privateuseone" in str(self.device): # clean ortruntime memory
del self.model_rmvpe.model del self.model_rmvpe.model

View File

@@ -0,0 +1,4 @@
from .dio import Dio
from .harvest import Harvest
from .pm import PM
from .f0 import F0Predictor

View File

@@ -6,27 +6,15 @@ import pyworld
from .f0 import F0Predictor from .f0 import F0Predictor
class DioF0Predictor(F0Predictor): class Dio(F0Predictor):
def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100): def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100):
super().__init__(hop_length, f0_min, f0_max, sampling_rate) super().__init__(hop_length, f0_min, f0_max, sampling_rate)
def compute_f0(self, wav: np.ndarray[Any, np.dtype], p_len: Optional[int] = None): def compute_f0(
if p_len is None: self,
p_len = wav.shape[0] // self.hop_length wav: np.ndarray[Any, np.dtype],
f0, t = pyworld.dio( p_len: Optional[int] = None,
wav.astype(np.double), filter_radius: Optional[int] = None,
fs=self.sampling_rate,
f0_floor=self.f0_min,
f0_ceil=self.f0_max,
frame_period=1000 * self.hop_length / self.sampling_rate,
)
f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate)
for index, pitch in enumerate(f0):
f0[index] = round(pitch, 1)
return self.interpolate_f0(self.resize_f0(f0, p_len))[0]
def compute_f0_uv(
self, wav: np.ndarray[Any, np.dtype], p_len: Optional[int] = None
): ):
if p_len is None: if p_len is None:
p_len = wav.shape[0] // self.hop_length p_len = wav.shape[0] // self.hop_length
@@ -40,4 +28,4 @@ class DioF0Predictor(F0Predictor):
f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate) f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate)
for index, pitch in enumerate(f0): for index, pitch in enumerate(f0):
f0[index] = round(pitch, 1) f0[index] = round(pitch, 1)
return self.interpolate_f0(self.resize_f0(f0, p_len)) return self.interpolate_f0(self.resize_f0(f0, p_len))[0]

View File

@@ -11,11 +11,10 @@ class F0Predictor(object):
self.sampling_rate = sampling_rate self.sampling_rate = sampling_rate
def compute_f0( def compute_f0(
self, wav: np.ndarray[Any, np.dtype], p_len: Optional[int] = None self,
): ... wav: np.ndarray[Any, np.dtype],
p_len: Optional[int] = None,
def compute_f0_uv( filter_radius: Optional[int] = None,
self, wav: np.ndarray[Any, np.dtype], p_len: Optional[int] = None
): ... ): ...
def interpolate_f0(self, f0: np.ndarray[Any, np.dtype]): def interpolate_f0(self, f0: np.ndarray[Any, np.dtype]):

View File

@@ -2,38 +2,31 @@ from typing import Any, Optional
import numpy as np import numpy as np
import pyworld import pyworld
from scipy import signal
from .f0 import F0Predictor from .f0 import F0Predictor
class HarvestF0Predictor(F0Predictor): class Harvest(F0Predictor):
def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100): def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100):
super().__init__(hop_length, f0_min, f0_max, sampling_rate) super().__init__(hop_length, f0_min, f0_max, sampling_rate)
def compute_f0(self, wav: np.ndarray[Any, np.dtype], p_len: Optional[int] = None): def compute_f0(
if p_len is None: self,
p_len = wav.shape[0] // self.hop_length wav: np.ndarray[Any, np.dtype],
f0, t = pyworld.harvest( p_len: Optional[int] = None,
wav.astype(np.double), filter_radius: Optional[int] = None,
fs=self.sampling_rate,
f0_ceil=self.f0_max,
f0_floor=self.f0_min,
frame_period=1000 * self.hop_length / self.sampling_rate,
)
f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.fs)
return self.interpolate_f0(self.resize_f0(f0, p_len))[0]
def compute_f0_uv(
self, wav: np.ndarray[Any, np.dtype], p_len: Optional[int] = None
): ):
if p_len is None: if p_len is None:
p_len = wav.shape[0] // self.hop_length p_len = wav.shape[0] // self.hop_length
f0, t = pyworld.harvest( f0, t = pyworld.harvest(
wav.astype(np.double), wav.astype(np.double),
fs=self.sampling_rate, fs=self.sampling_rate,
f0_floor=self.f0_min,
f0_ceil=self.f0_max, f0_ceil=self.f0_max,
f0_floor=self.f0_min,
frame_period=1000 * self.hop_length / self.sampling_rate, frame_period=1000 * self.hop_length / self.sampling_rate,
) )
f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate) f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate)
return self.interpolate_f0(self.resize_f0(f0, p_len)) if filter_radius is not None and filter_radius > 2:
f0 = signal.medfilt(f0, 3)
return self.interpolate_f0(self.resize_f0(f0, p_len))[0]

39
rvc/f0/pm.py Normal file
View File

@@ -0,0 +1,39 @@
from typing import Any, Optional
import numpy as np
import parselmouth
from .f0 import F0Predictor
class PM(F0Predictor):
    """F0 predictor that extracts pitch with parselmouth's ``Sound.to_pitch_ac``."""

    def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100):
        super().__init__(hop_length, f0_min, f0_max, sampling_rate)

    def compute_f0(
        self,
        wav: np.ndarray[Any, np.dtype],
        p_len: Optional[int] = None,
        filter_radius: Optional[int] = None,
    ):
        """Estimate the F0 contour of ``wav``.

        Args:
            wav: Audio samples; only ``wav.shape[0]`` and the raw values are used.
            p_len: Target number of frames. Defaults to
                ``wav.shape[0] // self.hop_length``. When given, it must be
                within 3 frames of that default.
            filter_radius: Accepted for signature parity with the other
                predictors; this method does not read it.

        Returns:
            The first element of ``self.interpolate_f0(...)`` applied to the
            (possibly zero-padded) frequency track.

        Raises:
            AssertionError: If ``p_len`` is given and differs from the default
                frame count by 4 or more ("pad length error").
        """
        expected_frames = wav.shape[0] // self.hop_length
        if p_len is None:
            p_len = expected_frames
        else:
            assert abs(p_len - expected_frames) < 4, "pad length error"

        # Keep the original arithmetic order (scale by 1000, then divide by 1000)
        # so the value handed to parselmouth is bit-for-bit the same.
        time_step = self.hop_length / self.sampling_rate * 1000
        sound = parselmouth.Sound(wav, self.sampling_rate)
        pitch = sound.to_pitch_ac(
            time_step=time_step / 1000,
            voicing_threshold=0.6,
            pitch_floor=self.f0_min,
            pitch_ceiling=self.f0_max,
        )
        f0 = pitch.selected_array["frequency"]

        # Split the missing frames between both ends; the extra frame (if the
        # deficit is odd) goes to the front.
        deficit = p_len - len(f0)
        pad_front = (deficit + 1) // 2
        pad_back = deficit - pad_front
        if pad_front > 0 or pad_back > 0:
            f0 = np.pad(f0, [[pad_front, pad_back]], mode="constant")

        interpolated = self.interpolate_f0(f0)
        return interpolated[0]

View File

@@ -1,4 +0,0 @@
from .dio import DioF0Predictor
from .harvest import HarvestF0Predictor
from .pm import PMF0Predictor
from .f0 import F0Predictor

View File

@@ -1,61 +0,0 @@
from typing import Any, Optional
import numpy as np
import parselmouth
from .f0 import F0Predictor
class PMF0Predictor(F0Predictor):
    """F0 predictor that extracts pitch with parselmouth's ``Sound.to_pitch_ac``."""

    def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100):
        super().__init__(hop_length, f0_min, f0_max, sampling_rate)

    def _padded_pitch(self, wav, p_len):
        """Return the raw frequency track of ``wav``, zero-padded towards ``p_len`` frames."""
        expected_frames = wav.shape[0] // self.hop_length
        if p_len is None:
            p_len = expected_frames
        else:
            assert abs(p_len - expected_frames) < 4, "pad length error"

        # Same arithmetic order as before (x1000 then /1000) to keep the exact
        # floating-point value passed to parselmouth.
        time_step = self.hop_length / self.sampling_rate * 1000
        sound = parselmouth.Sound(wav, self.sampling_rate)
        pitch = sound.to_pitch_ac(
            time_step=time_step / 1000,
            voicing_threshold=0.6,
            pitch_floor=self.f0_min,
            pitch_ceiling=self.f0_max,
        )
        track = pitch.selected_array["frequency"]

        # Distribute missing frames over both ends, front gets the odd one.
        deficit = p_len - len(track)
        pad_front = (deficit + 1) // 2
        pad_back = deficit - pad_front
        if pad_front > 0 or pad_back > 0:
            track = np.pad(track, [[pad_front, pad_back]], mode="constant")
        return track

    def compute_f0(self, wav: np.ndarray[Any, np.dtype], p_len: Optional[int] = None):
        """Return only the interpolated F0 contour for ``wav``.

        ``p_len`` defaults to ``wav.shape[0] // self.hop_length``; when given it
        must be within 3 frames of that value or an ``AssertionError`` is raised.
        """
        contour, _uv = self.interpolate_f0(self._padded_pitch(wav, p_len))
        return contour

    def compute_f0_uv(
        self, wav: np.ndarray[Any, np.dtype], p_len: Optional[int] = None
    ):
        """Return both values produced by ``self.interpolate_f0`` for ``wav``.

        Takes the same ``p_len`` handling as ``compute_f0``; the result is the
        ``(f0, uv)`` pair unpacked from ``interpolate_f0``.
        """
        contour, uv = self.interpolate_f0(self._padded_pitch(wav, p_len))
        return contour, uv

View File

@@ -5,10 +5,10 @@ import librosa
import numpy as np import numpy as np
import onnxruntime import onnxruntime
from .f0 import ( from rvc.f0 import (
PMF0Predictor, PM,
HarvestF0Predictor, Harvest,
DioF0Predictor, Dio,
F0Predictor, F0Predictor,
) )
@@ -52,9 +52,9 @@ class ContentVec(Model):
predictors: typing.Dict[str, F0Predictor] = { predictors: typing.Dict[str, F0Predictor] = {
"pm": PMF0Predictor, "pm": PM,
"harvest": HarvestF0Predictor, "harvest": Harvest,
"dio": DioF0Predictor, "dio": Dio,
} }