From 77b371d61552b8163f70d60a0a11adeed1545074 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=BA=90=E6=96=87=E9=9B=A8?= <41315874+fumiama@users.noreply.github.com> Date: Thu, 13 Jun 2024 00:10:22 +0900 Subject: [PATCH] optimize(f0): move some f0s into rvc.f0 --- infer/lib/jit/rmvpe.py | 2 +- infer/lib/rmvpe.py | 27 ++------ .../modules/train/extract/extract_f0_print.py | 2 +- .../modules/train/extract/extract_f0_rmvpe.py | 2 +- .../train/extract/extract_f0_rmvpe_dml.py | 2 +- infer/modules/train/train.py | 2 +- infer/modules/vc/pipeline.py | 53 +++------------- rvc/f0/__init__.py | 4 ++ rvc/{onnx => }/f0/dio.py | 26 +++----- rvc/{onnx => }/f0/f0.py | 9 ++- rvc/{onnx => }/f0/harvest.py | 29 ++++----- rvc/f0/pm.py | 39 ++++++++++++ rvc/onnx/f0/__init__.py | 4 -- rvc/onnx/f0/pm.py | 61 ------------------- rvc/onnx/infer.py | 14 ++--- 15 files changed, 91 insertions(+), 185 deletions(-) rename rvc/{onnx => }/f0/dio.py (50%) rename rvc/{onnx => }/f0/f0.py (88%) rename rvc/{onnx => }/f0/harvest.py (50%) create mode 100644 rvc/f0/pm.py delete mode 100644 rvc/onnx/f0/__init__.py delete mode 100644 rvc/onnx/f0/pm.py diff --git a/infer/lib/jit/rmvpe.py b/infer/lib/jit/rmvpe.py index e71c39f..6240802 100644 --- a/infer/lib/jit/rmvpe.py +++ b/infer/lib/jit/rmvpe.py @@ -2,7 +2,7 @@ import torch def get_rmvpe(model_path="assets/rmvpe/rmvpe.pt", device=torch.device("cpu")): - from infer.lib.rmvpe import E2E + from rvc.f0.e2e import E2E model = E2E(4, 1, (2, 2)) ckpt = torch.load(model_path, map_location=device) diff --git a/infer/lib/rmvpe.py b/infer/lib/rmvpe.py index d384d87..a954037 100644 --- a/infer/lib/rmvpe.py +++ b/infer/lib/rmvpe.py @@ -6,17 +6,6 @@ import torch from infer.lib import jit -try: - # Fix "Torch not compiled with CUDA enabled" - import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import - - if torch.xpu.is_available(): - from infer.modules.ipex import ipex_init - - ipex_init() -except Exception: # pylint: disable=broad-exception-caught - pass -import torch.nn as nn import torch.nn.functional as F import logging @@ -127,13 +116,13 @@ class RMVPE: return hidden[:, :n_frames] def decode(self, hidden, thred=0.03): - cents_pred = self.to_local_average_cents(hidden, thred=thred) + cents_pred = self.to_local_average_cents(hidden, threshold=thred) f0 = 10 * (2 ** (cents_pred / 1200)) f0[f0 == 10] = 0 # f0 = np.array([10 * (2 ** (cent_pred / 1200)) if cent_pred else 0 for cent_pred in cents_pred]) return f0 - def infer_from_audio(self, audio, thred=0.03): + def infer_from_audio(self, audio, threshold=0.03): # torch.cuda.synchronize() # t0 = ttime() if not torch.is_tensor(audio): @@ -155,17 +144,15 @@ class RMVPE: if self.is_half == True: hidden = hidden.astype("float32") - f0 = self.decode(hidden, thred=thred) + f0 = self.decode(hidden, thred=threshold) # torch.cuda.synchronize() # t3 = ttime() # print("hmvpe:%s\t%s\t%s\t%s"%(t1-t0,t2-t1,t3-t2,t3-t0)) return f0 - def to_local_average_cents(self, salience, thred=0.05): - # t0 = ttime() + def to_local_average_cents(self, salience, threshold=0.05): center = np.argmax(salience, axis=1) # 帧长#index salience = np.pad(salience, ((0, 0), (4, 4))) # 帧长,368 - # t1 = ttime() center += 4 todo_salience = [] todo_cents_mapping = [] @@ -174,15 +161,11 @@ class RMVPE: for idx in range(salience.shape[0]): todo_salience.append(salience[:, starts[idx] : ends[idx]][idx]) todo_cents_mapping.append(self.cents_mapping[starts[idx] : ends[idx]]) - # t2 = ttime() todo_salience = np.array(todo_salience) # 帧长,9 todo_cents_mapping = np.array(todo_cents_mapping) # 帧长,9 product_sum = np.sum(todo_salience * todo_cents_mapping, 1) weight_sum = np.sum(todo_salience, 1) # 帧长 devided = product_sum / weight_sum # 帧长 - # t3 = ttime() maxx = np.max(salience, axis=1) # 帧长 - devided[maxx <= thred] = 0 - # t4 = ttime() - # print("decode:%s\t%s\t%s\t%s" % (t1 - t0, t2 - t1, t3 - t2, t4 - t3)) + devided[maxx <= threshold] = 0 return devided diff --git a/infer/modules/train/extract/extract_f0_print.py b/infer/modules/train/extract/extract_f0_print.py index 9d231e4..1de2579 100644 --- a/infer/modules/train/extract/extract_f0_print.py +++ b/infer/modules/train/extract/extract_f0_print.py @@ -89,7 +89,7 @@ class FeatureInput(object): self.model_rmvpe = RMVPE( "assets/rmvpe/rmvpe.pt", is_half=False, device="cpu" ) - f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) + f0 = self.model_rmvpe.infer_from_audio(x, threshold=0.03) return f0 def coarse_f0(self, f0): diff --git a/infer/modules/train/extract/extract_f0_rmvpe.py b/infer/modules/train/extract/extract_f0_rmvpe.py index 90b2073..64ae034 100644 --- a/infer/modules/train/extract/extract_f0_rmvpe.py +++ b/infer/modules/train/extract/extract_f0_rmvpe.py @@ -52,7 +52,7 @@ class FeatureInput(object): self.model_rmvpe = RMVPE( "assets/rmvpe/rmvpe.pt", is_half=is_half, device="cuda" ) - f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) + f0 = self.model_rmvpe.infer_from_audio(x, threshold=0.03) return f0 def coarse_f0(self, f0): diff --git a/infer/modules/train/extract/extract_f0_rmvpe_dml.py b/infer/modules/train/extract/extract_f0_rmvpe_dml.py index 243e825..2e48ee2 100644 --- a/infer/modules/train/extract/extract_f0_rmvpe_dml.py +++ b/infer/modules/train/extract/extract_f0_rmvpe_dml.py @@ -50,7 +50,7 @@ class FeatureInput(object): self.model_rmvpe = RMVPE( "assets/rmvpe/rmvpe.pt", is_half=False, device=device ) - f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) + f0 = self.model_rmvpe.infer_from_audio(x, threshold=0.03) return f0 def coarse_f0(self, f0): diff --git a/infer/modules/train/train.py b/infer/modules/train/train.py index d16e40e..2d70f83 100644 --- a/infer/modules/train/train.py +++ b/infer/modules/train/train.py @@ -47,7 +47,7 @@ from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.data import DataLoader from torch.utils.tensorboard import SummaryWriter -from rvc import utils +from rvc.layers import utils from infer.lib.train.data_utils import ( DistributedBucketSampler, TextAudioCollate, diff --git a/infer/modules/vc/pipeline.py b/infer/modules/vc/pipeline.py index fb42cd8..00c65b8 100644 --- a/infer/modules/vc/pipeline.py +++ b/infer/modules/vc/pipeline.py @@ -5,40 +5,24 @@ import logging logger = logging.getLogger(__name__) -from functools import lru_cache from time import time import faiss import librosa import numpy as np -import parselmouth import pyworld import torch import torch.nn.functional as F import torchcrepe from scipy import signal +from rvc.f0 import PM, Harvest + now_dir = os.getcwd() sys.path.append(now_dir) bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000) -input_audio_path2wav = {} - - -@lru_cache -def cache_harvest_f0(f0_cache_key, fs, f0max, f0min, frame_period): - audio = input_audio_path2wav[f0_cache_key] - f0, t = pyworld.harvest( - audio, - fs=fs, - f0_ceil=f0max, - f0_floor=f0min, - frame_period=frame_period, - ) - f0 = pyworld.stonemask(audio, f0, t, fs) - return f0 - def change_rms(data1, sr1, data2, sr2, rate): # 1是输入音频,2是输出音频,rate是2的占比 # print(data1.max(),data2.max()) @@ -90,37 +74,18 @@ class Pipeline(object): filter_radius, inp_f0=None, ): - global input_audio_path2wav - time_step = self.window / self.sr * 1000 f0_min = 50 f0_max = 1100 f0_mel_min = 1127 * np.log(1 + f0_min / 700) f0_mel_max = 1127 * np.log(1 + f0_max / 700) if f0_method == "pm": - f0 = ( - parselmouth.Sound(x, self.sr) - .to_pitch_ac( - time_step=time_step / 1000, - voicing_threshold=0.6, - pitch_floor=f0_min, - pitch_ceiling=f0_max, - ) - .selected_array["frequency"] - ) - pad_size = (p_len - len(f0) + 1) // 2 - if pad_size > 0 or p_len - len(f0) - pad_size > 0: - f0 = np.pad( - f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant" - ) + if not hasattr(self, "pm"): + self.pm = PM(self.window, f0_min, f0_max, self.sr) + f0 = self.pm.compute_f0(x, p_len=p_len) elif f0_method == "harvest": - from hashlib import md5 - - f0_cache_key = md5(x.tobytes()).digest() - input_audio_path2wav[f0_cache_key] = x.astype(np.double) - f0 = cache_harvest_f0(f0_cache_key, self.sr, f0_max, f0_min, 10) - del input_audio_path2wav[f0_cache_key] - if filter_radius > 2: - f0 = signal.medfilt(f0, 3) + if not hasattr(self, "harvest"): + self.harvest = Harvest(self.window, f0_min, f0_max, self.sr) + f0 = self.harvest.compute_f0(x, p_len=p_len, filter_radius=filter_radius) elif f0_method == "crepe": model = "full" # Pick a batch size that doesn't cause memory errors on your gpu @@ -155,7 +120,7 @@ class Pipeline(object): device=self.device, # use_jit=self.config.use_jit, ) - f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) + f0 = self.model_rmvpe.infer_from_audio(x, threshold=0.03) if "privateuseone" in str(self.device): # clean ortruntime memory del self.model_rmvpe.model diff --git a/rvc/f0/__init__.py b/rvc/f0/__init__.py index e69de29..72b34dc 100644 --- a/rvc/f0/__init__.py +++ b/rvc/f0/__init__.py @@ -0,0 +1,4 @@ +from .dio import Dio +from .harvest import Harvest +from .pm import PM +from .f0 import F0Predictor diff --git a/rvc/onnx/f0/dio.py b/rvc/f0/dio.py similarity index 50% rename from rvc/onnx/f0/dio.py rename to rvc/f0/dio.py index c142670..ef488ac 100644 --- a/rvc/onnx/f0/dio.py +++ b/rvc/f0/dio.py @@ -6,27 +6,15 @@ import pyworld from .f0 import F0Predictor -class DioF0Predictor(F0Predictor): +class Dio(F0Predictor): def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100): super().__init__(hop_length, f0_min, f0_max, sampling_rate) - def compute_f0(self, wav: np.ndarray[Any, np.dtype], p_len: Optional[int] = None): - if p_len is None: - p_len = wav.shape[0] // self.hop_length - f0, t = pyworld.dio( - wav.astype(np.double), - fs=self.sampling_rate, - f0_floor=self.f0_min, - f0_ceil=self.f0_max, - frame_period=1000 * self.hop_length / self.sampling_rate, - ) - f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate) - for index, pitch in enumerate(f0): - f0[index] = round(pitch, 1) - return self.interpolate_f0(self.resize_f0(f0, p_len))[0] - - def compute_f0_uv( - self, wav: np.ndarray[Any, np.dtype], p_len: Optional[int] = None + def compute_f0( + self, + wav: np.ndarray[Any, np.dtype], + p_len: Optional[int] = None, + filter_radius: Optional[int] = None, ): if p_len is None: p_len = wav.shape[0] // self.hop_length @@ -40,4 +28,4 @@ class DioF0Predictor(F0Predictor): f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate) for index, pitch in enumerate(f0): f0[index] = round(pitch, 1) - return self.interpolate_f0(self.resize_f0(f0, p_len)) + return self.interpolate_f0(self.resize_f0(f0, p_len))[0] diff --git a/rvc/onnx/f0/f0.py b/rvc/f0/f0.py similarity index 88% rename from rvc/onnx/f0/f0.py rename to rvc/f0/f0.py index 7131491..d29e296 100644 --- a/rvc/onnx/f0/f0.py +++ b/rvc/f0/f0.py @@ -11,11 +11,10 @@ class F0Predictor(object): self.sampling_rate = sampling_rate def compute_f0( - self, wav: np.ndarray[Any, np.dtype], p_len: Optional[int] = None - ): ... - - def compute_f0_uv( - self, wav: np.ndarray[Any, np.dtype], p_len: Optional[int] = None + self, + wav: np.ndarray[Any, np.dtype], + p_len: Optional[int] = None, + filter_radius: Optional[int] = None, ): ... def interpolate_f0(self, f0: np.ndarray[Any, np.dtype]): diff --git a/rvc/onnx/f0/harvest.py b/rvc/f0/harvest.py similarity index 50% rename from rvc/onnx/f0/harvest.py rename to rvc/f0/harvest.py index 54956ba..d57850a 100644 --- a/rvc/onnx/f0/harvest.py +++ b/rvc/f0/harvest.py @@ -2,38 +2,31 @@ from typing import Any, Optional import numpy as np import pyworld +from scipy import signal from .f0 import F0Predictor -class HarvestF0Predictor(F0Predictor): +class Harvest(F0Predictor): def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100): super().__init__(hop_length, f0_min, f0_max, sampling_rate) - def compute_f0(self, wav: np.ndarray[Any, np.dtype], p_len: Optional[int] = None): - if p_len is None: - p_len = wav.shape[0] // self.hop_length - f0, t = pyworld.harvest( - wav.astype(np.double), - fs=self.sampling_rate, - f0_ceil=self.f0_max, - f0_floor=self.f0_min, - frame_period=1000 * self.hop_length / self.sampling_rate, - ) - f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.fs) - return self.interpolate_f0(self.resize_f0(f0, p_len))[0] - - def compute_f0_uv( - self, wav: np.ndarray[Any, np.dtype], p_len: Optional[int] = None + def compute_f0( + self, + wav: np.ndarray[Any, np.dtype], + p_len: Optional[int] = None, + filter_radius: Optional[int] = None, ): if p_len is None: p_len = wav.shape[0] // self.hop_length f0, t = pyworld.harvest( wav.astype(np.double), fs=self.sampling_rate, - f0_floor=self.f0_min, f0_ceil=self.f0_max, + f0_floor=self.f0_min, frame_period=1000 * self.hop_length / self.sampling_rate, ) f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate) - return self.interpolate_f0(self.resize_f0(f0, p_len)) + if filter_radius is not None and filter_radius > 2: + f0 = signal.medfilt(f0, 3) + return self.interpolate_f0(self.resize_f0(f0, p_len))[0] diff --git a/rvc/f0/pm.py b/rvc/f0/pm.py new file mode 100644 index 0000000..11feaf6 --- /dev/null +++ b/rvc/f0/pm.py @@ -0,0 +1,39 @@ +from typing import Any, Optional + +import numpy as np +import parselmouth + +from .f0 import F0Predictor + + +class PM(F0Predictor): + def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100): + super().__init__(hop_length, f0_min, f0_max, sampling_rate) + + def compute_f0( + self, + wav: np.ndarray[Any, np.dtype], + p_len: Optional[int] = None, + filter_radius: Optional[int] = None, + ): + x = wav + if p_len is None: + p_len = x.shape[0] // self.hop_length + else: + assert abs(p_len - x.shape[0] // self.hop_length) < 4, "pad length error" + time_step = self.hop_length / self.sampling_rate * 1000 + f0 = ( + parselmouth.Sound(x, self.sampling_rate) + .to_pitch_ac( + time_step=time_step / 1000, + voicing_threshold=0.6, + pitch_floor=self.f0_min, + pitch_ceiling=self.f0_max, + ) + .selected_array["frequency"] + ) + + pad_size = (p_len - len(f0) + 1) // 2 + if pad_size > 0 or p_len - len(f0) - pad_size > 0: + f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant") + return self.interpolate_f0(f0)[0] diff --git a/rvc/onnx/f0/__init__.py b/rvc/onnx/f0/__init__.py deleted file mode 100644 index 71d4522..0000000 --- a/rvc/onnx/f0/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from .dio import DioF0Predictor -from .harvest import HarvestF0Predictor -from .pm import PMF0Predictor -from .f0 import F0Predictor diff --git a/rvc/onnx/f0/pm.py b/rvc/onnx/f0/pm.py deleted file mode 100644 index 7513e15..0000000 --- a/rvc/onnx/f0/pm.py +++ /dev/null @@ -1,61 +0,0 @@ -from typing import Any, Optional - -import numpy as np -import parselmouth - -from .f0 import F0Predictor - - -class PMF0Predictor(F0Predictor): - def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100): - super().__init__(hop_length, f0_min, f0_max, sampling_rate) - - def compute_f0(self, wav: np.ndarray[Any, np.dtype], p_len: Optional[int] = None): - x = wav - if p_len is None: - p_len = x.shape[0] // self.hop_length - else: - assert abs(p_len - x.shape[0] // self.hop_length) < 4, "pad length error" - time_step = self.hop_length / self.sampling_rate * 1000 - f0 = ( - parselmouth.Sound(x, self.sampling_rate) - .to_pitch_ac( - time_step=time_step / 1000, - voicing_threshold=0.6, - pitch_floor=self.f0_min, - pitch_ceiling=self.f0_max, - ) - .selected_array["frequency"] - ) - - pad_size = (p_len - len(f0) + 1) // 2 - if pad_size > 0 or p_len - len(f0) - pad_size > 0: - f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant") - f0, uv = self.interpolate_f0(f0) - return f0 - - def compute_f0_uv( - self, wav: np.ndarray[Any, np.dtype], p_len: Optional[int] = None - ): - x = wav - if p_len is None: - p_len = x.shape[0] // self.hop_length - else: - assert abs(p_len - x.shape[0] // self.hop_length) < 4, "pad length error" - time_step = self.hop_length / self.sampling_rate * 1000 - f0 = ( - parselmouth.Sound(x, self.sampling_rate) - .to_pitch_ac( - time_step=time_step / 1000, - voicing_threshold=0.6, - pitch_floor=self.f0_min, - pitch_ceiling=self.f0_max, - ) - .selected_array["frequency"] - ) - - pad_size = (p_len - len(f0) + 1) // 2 - if pad_size > 0 or p_len - len(f0) - pad_size > 0: - f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant") - f0, uv = self.interpolate_f0(f0) - return f0, uv diff --git a/rvc/onnx/infer.py b/rvc/onnx/infer.py index a8e5a4f..b6895cd 100644 --- a/rvc/onnx/infer.py +++ b/rvc/onnx/infer.py @@ -5,10 +5,10 @@ import librosa import numpy as np import onnxruntime -from .f0 import ( - PMF0Predictor, - HarvestF0Predictor, - DioF0Predictor, +from rvc.f0 import ( + PM, + Harvest, + Dio, F0Predictor, ) @@ -52,9 +52,9 @@ class ContentVec(Model): predictors: typing.Dict[str, F0Predictor] = { - "pm": PMF0Predictor, - "harvest": HarvestF0Predictor, - "dio": DioF0Predictor, + "pm": PM, + "harvest": Harvest, + "dio": Dio, }