From 0ab3a3296e0122062265a9f19cdd4da433c1acce Mon Sep 17 00:00:00 2001 From: yxlllc <33565655+yxlllc@users.noreply.github.com> Date: Mon, 22 Apr 2024 21:13:10 +0800 Subject: [PATCH] New feature of real-time voice changing: formant shift adjustment (#1999) * add formant shift for realtime-gui * chore(i18n): sync locale on dev * chore(format): run black on dev * fix --------- Co-authored-by: github-actions[bot] --- configs/config.json | 2 +- gui_v1.py | 20 +++++++++++++++++++ i18n/locale/en_US.json | 1 + i18n/locale/es_ES.json | 1 + i18n/locale/fr_FR.json | 1 + i18n/locale/it_IT.json | 1 + i18n/locale/ja_JP.json | 1 + i18n/locale/ko_KR.json | 1 + i18n/locale/pt_BR.json | 1 + i18n/locale/ru_RU.json | 1 + i18n/locale/tr_TR.json | 1 + i18n/locale/zh_CN.json | 1 + i18n/locale/zh_HK.json | 1 + i18n/locale/zh_SG.json | 1 + i18n/locale/zh_TW.json | 1 + infer/lib/infer_pack/models.py | 35 +++++++++++++++++++++++++++++----- infer/lib/rtrvc.py | 32 +++++++++++++++++++++++++++---- 17 files changed, 92 insertions(+), 10 deletions(-) diff --git a/configs/config.json b/configs/config.json index 3c99324..e79bd50 100644 --- a/configs/config.json +++ b/configs/config.json @@ -1 +1 @@ -{"pth_path": "assets/weights/kikiV1.pth", "index_path": "logs/kikiV1.index", "sg_hostapi": "MME", "sg_wasapi_exclusive": false, "sg_input_device": "VoiceMeeter Output (VB-Audio Vo", "sg_output_device": "VoiceMeeter Input (VB-Audio Voi", "sr_type": "sr_device", "threhold": -60.0, "pitch": 12.0, "rms_mix_rate": 0.5, "index_rate": 0.0, "block_time": 0.15, "crossfade_length": 0.08, "extra_time": 2.0, "n_cpu": 4.0, "use_jit": false, "use_pv": false, "f0method": "fcpe"} \ No newline at end of file +{"pth_path": "assets/weights/kikiV1.pth", "index_path": "logs/kikiV1.index", "sg_hostapi": "MME", "sg_wasapi_exclusive": false, "sg_input_device": "VoiceMeeter Output (VB-Audio Vo", "sg_output_device": "VoiceMeeter Input (VB-Audio Voi", "sr_type": "sr_device", "threhold": -60.0, "pitch": 12.0, "formant": 0.0, "rms_mix_rate": 0.5, "index_rate": 0.0, "block_time": 0.15, "crossfade_length": 0.08, "extra_time": 2.0, "n_cpu": 4.0, "use_jit": false, "use_pv": false, "f0method": "fcpe"} \ No newline at end of file diff --git a/gui_v1.py b/gui_v1.py index 5042c9a..2b30b00 100644 --- a/gui_v1.py +++ b/gui_v1.py @@ -114,6 +114,7 @@ if __name__ == "__main__": self.pth_path: str = "" self.index_path: str = "" self.pitch: int = 0 + self.formant: float = 0.0 self.sr_type: str = "sr_model" self.block_time: float = 0.25 # s self.threhold: int = -60 @@ -212,6 +213,7 @@ if __name__ == "__main__": "sr_type": "sr_model", "threhold": -60, "pitch": 0, + "formant": 0.0, "index_rate": 0, "rms_mix_rate": 0, "block_time": 0.25, @@ -354,6 +356,17 @@ if __name__ == "__main__": enable_events=True, ), ], + [ + sg.Text(i18n("共振偏移")), + sg.Slider( + range=(-5, 5), + key="formant", + resolution=0.01, + orientation="h", + default_value=data.get("formant", 0.0), + enable_events=True, + ), + ], [ sg.Text(i18n("Index Rate")), sg.Slider( @@ -579,6 +592,7 @@ if __name__ == "__main__": ], "threhold": values["threhold"], "pitch": values["pitch"], + "formant": values["formant"], "rms_mix_rate": values["rms_mix_rate"], "index_rate": values["index_rate"], # "device_latency": values["device_latency"], @@ -621,6 +635,10 @@ if __name__ == "__main__": self.gui_config.pitch = values["pitch"] if hasattr(self, "rvc"): self.rvc.change_key(values["pitch"]) + elif event == "formant": + self.gui_config.formant = values["formant"] + if hasattr(self, "rvc"): + self.rvc.change_formant(values["formant"]) elif event == "index_rate": self.gui_config.index_rate = values["index_rate"] if hasattr(self, "rvc"): @@ -679,6 +697,7 @@ if __name__ == "__main__": ] self.gui_config.threhold = values["threhold"] self.gui_config.pitch = values["pitch"] + self.gui_config.formant = values["formant"] self.gui_config.block_time = values["block_time"] self.gui_config.crossfade_time = values["crossfade_length"] self.gui_config.extra_time = values["extra_time"] @@ -703,6 +722,7 @@ if __name__ == "__main__": torch.cuda.empty_cache() self.rvc = rtrvc.RVC( self.gui_config.pitch, + self.gui_config.formant, self.gui_config.pth_path, self.gui_config.index_path, self.gui_config.index_rate, diff --git a/i18n/locale/en_US.json b/i18n/locale/en_US.json index f208d8d..be09af7 100644 --- a/i18n/locale/en_US.json +++ b/i18n/locale/en_US.json @@ -37,6 +37,7 @@ "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "Modify model information (only supported for small model files extracted from the 'weights' folder)", "停止音频转换": "Stop audio conversion", "全流程结束!": "All processes have been completed!", + "共振偏移": "共振偏移", "刷新音色列表和索引路径": "Refresh voice list and index path", "加载模型": "Load model", "加载预训练底模D路径": "Load pre-trained base model D path:", diff --git a/i18n/locale/es_ES.json b/i18n/locale/es_ES.json index 73b2e86..99961ec 100644 --- a/i18n/locale/es_ES.json +++ b/i18n/locale/es_ES.json @@ -37,6 +37,7 @@ "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "Modificar la información del modelo (solo admite archivos de modelos pequeños extraídos en la carpeta weights)", "停止音频转换": "Detener la conversión de audio", "全流程结束!": "¡Todo el proceso ha terminado!", + "共振偏移": "共振偏移", "刷新音色列表和索引路径": "Actualizar la lista de modelos e índice de rutas", "加载模型": "Cargar modelo", "加载预训练底模D路径": "Cargue la ruta del modelo D base pre-entrenada.", diff --git a/i18n/locale/fr_FR.json b/i18n/locale/fr_FR.json index cbf5b35..7fb7710 100644 --- a/i18n/locale/fr_FR.json +++ b/i18n/locale/fr_FR.json @@ -37,6 +37,7 @@ "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "Modifier les informations du modèle (uniquement pris en charge pour les petits fichiers de modèle extraits du dossier 'weights')", "停止音频转换": "Arrêter la conversion audio", "全流程结束!": "Toutes les étapes ont été terminées !", + "共振偏移": "共振偏移", "刷新音色列表和索引路径": "Actualiser la liste des voix et le vers l'index.", "加载模型": "Charger le modèle.", "加载预训练底模D路径": "Charger le chemin du modèle de base pré-entraîné D :", diff --git a/i18n/locale/it_IT.json b/i18n/locale/it_IT.json index c6aa02d..deb3ff4 100644 --- a/i18n/locale/it_IT.json +++ b/i18n/locale/it_IT.json @@ -37,6 +37,7 @@ "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "Modifica le informazioni sul modello (supportato solo per i file di modello di piccole dimensioni estratti dalla cartella 'weights')", "停止音频转换": "Arresta la conversione audio", "全流程结束!": "Tutti i processi sono stati completati!", + "共振偏移": "共振偏移", "刷新音色列表和索引路径": "Aggiorna l'elenco delle voci e il percorso dell'indice", "加载模型": "Carica modello", "加载预训练底模D路径": "Carica il percorso D del modello base pre-addestrato:", diff --git a/i18n/locale/ja_JP.json b/i18n/locale/ja_JP.json index b6ce535..a5d03aa 100644 --- a/i18n/locale/ja_JP.json +++ b/i18n/locale/ja_JP.json @@ -37,6 +37,7 @@ "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "モデル情報の修正(weightsフォルダから抽出された小さなモデルファイルのみ対応)", "停止音频转换": "音声変換を停止", "全流程结束!": "全工程が完了!", + "共振偏移": "共振偏移", "刷新音色列表和索引路径": "音源リストとインデックスパスの更新", "加载模型": "モデルをロード", "加载预训练底模D路径": "事前学習済みのDモデルのパス", diff --git a/i18n/locale/ko_KR.json b/i18n/locale/ko_KR.json index dcaab63..dfc9140 100644 --- a/i18n/locale/ko_KR.json +++ b/i18n/locale/ko_KR.json @@ -37,6 +37,7 @@ "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "모델 정보 수정(오직 weights 폴더 아래에서 추출된 작은 모델 파일만 지원)", "停止音频转换": "오디오 변환 중지", "全流程结束!": "전체 과정 완료!", + "共振偏移": "共振偏移", "刷新音色列表和索引路径": "음색 목록 및 인덱스 경로 새로고침", "加载模型": "모델 로드", "加载预训练底模D路径": "미리 훈련된 베이스 모델 D 경로 로드", diff --git a/i18n/locale/pt_BR.json b/i18n/locale/pt_BR.json index 3d87b08..74b5a3b 100644 --- a/i18n/locale/pt_BR.json +++ b/i18n/locale/pt_BR.json @@ -37,6 +37,7 @@ "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "Modificar informações do modelo (suportado apenas para arquivos de modelo pequenos extraídos da pasta 'weights')", "停止音频转换": "Conversão de áudio", "全流程结束!": "Todos os processos foram concluídos!", + "共振偏移": "共振偏移", "刷新音色列表和索引路径": "Atualizar lista de voz e caminho do Index", "加载模型": "Modelo", "加载预训练底模D路径": "Carregue o caminho D do modelo base pré-treinado:", diff --git a/i18n/locale/ru_RU.json b/i18n/locale/ru_RU.json index 42f2bd6..f75ebd1 100644 --- a/i18n/locale/ru_RU.json +++ b/i18n/locale/ru_RU.json @@ -37,6 +37,7 @@ "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "Изменить информацию о модели (работает только с маленькими моделями, взятыми из папки 'weights')", "停止音频转换": "Закончить конвертацию аудио", "全流程结束!": "Все процессы завершены!", + "共振偏移": "共振偏移", "刷新音色列表和索引路径": "Обновить список голосов и индексов", "加载模型": "Загрузить модель", "加载预训练底模D路径": "Путь к предварительно обученной базовой модели D:", diff --git a/i18n/locale/tr_TR.json b/i18n/locale/tr_TR.json index 90c47d6..e81eca4 100644 --- a/i18n/locale/tr_TR.json +++ b/i18n/locale/tr_TR.json @@ -37,6 +37,7 @@ "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "Model bilgilerini düzenle (sadece 'weights' klasöründen çıkarılan küçük model dosyaları desteklenir)", "停止音频转换": "Ses dönüştürmeyi durdur", "全流程结束!": "Tüm işlemler tamamlandı!", + "共振偏移": "共振偏移", "刷新音色列表和索引路径": "Ses listesini ve indeks yolunu yenile", "加载模型": "Model yükle", "加载预训练底模D路径": "Önceden eğitilmiş temel D modelini yükleme yolu:", diff --git a/i18n/locale/zh_CN.json b/i18n/locale/zh_CN.json index 8657c73..8d84760 100644 --- a/i18n/locale/zh_CN.json +++ b/i18n/locale/zh_CN.json @@ -37,6 +37,7 @@ "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "修改模型信息(仅支持weights文件夹下提取的小模型文件)", "停止音频转换": "停止音频转换", "全流程结束!": "全流程结束!", + "共振偏移": "共振偏移", "刷新音色列表和索引路径": "刷新音色列表和索引路径", "加载模型": "加载模型", "加载预训练底模D路径": "加载预训练底模D路径", diff --git a/i18n/locale/zh_HK.json b/i18n/locale/zh_HK.json index 6043201..4b3774a 100644 --- a/i18n/locale/zh_HK.json +++ b/i18n/locale/zh_HK.json @@ -37,6 +37,7 @@ "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "修改模型資訊(僅支援weights資料夾下提取的小模型檔案)", "停止音频转换": "停止音訊轉換", "全流程结束!": "全流程结束!", + "共振偏移": "共振偏移", "刷新音色列表和索引路径": "刷新音色列表和索引路徑", "加载模型": "載入模型", "加载预训练底模D路径": "加載預訓練底模D路徑", diff --git a/i18n/locale/zh_SG.json b/i18n/locale/zh_SG.json index 6043201..4b3774a 100644 --- a/i18n/locale/zh_SG.json +++ b/i18n/locale/zh_SG.json @@ -37,6 +37,7 @@ "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "修改模型資訊(僅支援weights資料夾下提取的小模型檔案)", "停止音频转换": "停止音訊轉換", "全流程结束!": "全流程结束!", + "共振偏移": "共振偏移", "刷新音色列表和索引路径": "刷新音色列表和索引路徑", "加载模型": "載入模型", "加载预训练底模D路径": "加載預訓練底模D路徑", diff --git a/i18n/locale/zh_TW.json b/i18n/locale/zh_TW.json index 6043201..4b3774a 100644 --- a/i18n/locale/zh_TW.json +++ b/i18n/locale/zh_TW.json @@ -37,6 +37,7 @@ "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "修改模型資訊(僅支援weights資料夾下提取的小模型檔案)", "停止音频转换": "停止音訊轉換", "全流程结束!": "全流程结束!", + "共振偏移": "共振偏移", "刷新音色列表和索引路径": "刷新音色列表和索引路徑", "加载模型": "載入模型", "加载预训练底模D路径": "加載預訓練底模D路徑", diff --git a/infer/lib/infer_pack/models.py b/infer/lib/infer_pack/models.py index 262814d..a1a27e2 100644 --- a/infer/lib/infer_pack/models.py +++ b/infer/lib/infer_pack/models.py @@ -10,7 +10,6 @@ from torch import nn from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d from torch.nn import functional as F from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm - from infer.lib.infer_pack import attentions, commons, modules from infer.lib.infer_pack.commons import get_padding, init_weights @@ -250,7 +249,17 @@ class Generator(torch.nn.Module): if gin_channels != 0: self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) - def forward(self, x: torch.Tensor, g: Optional[torch.Tensor] = None): + def forward( + self, + x: torch.Tensor, + g: Optional[torch.Tensor] = None, + n_res: Optional[torch.Tensor] = None, + ): + if n_res is not None: + assert isinstance(n_res, torch.Tensor) + n = int(n_res.item()) + if n != x.shape[-1]: + x = F.interpolate(x, size=n, mode="linear") x = self.conv_pre(x) if g is not None: x = x + self.cond(g) @@ -529,9 +538,22 @@ class GeneratorNSF(torch.nn.Module): self.lrelu_slope = modules.LRELU_SLOPE - def forward(self, x, f0, g: Optional[torch.Tensor] = None): + def forward( + self, + x, + f0, + g: Optional[torch.Tensor] = None, + n_res: Optional[torch.Tensor] = None, + ): har_source, noi_source, uv = self.m_source(f0, self.upp) har_source = har_source.transpose(1, 2) + if n_res is not None: + assert isinstance(n_res, torch.Tensor) + n = int(n_res.item()) + if n * self.upp != har_source.shape[-1]: + har_source = F.interpolate(har_source, size=n * self.upp, mode="linear") + if n != x.shape[-1]: + x = F.interpolate(x, size=n, mode="linear") x = self.conv_pre(x) if g is not None: x = x + self.cond(g) @@ -558,6 +580,7 @@ class GeneratorNSF(torch.nn.Module): x = F.leaky_relu(x) x = self.conv_post(x) x = torch.tanh(x) + return x def remove_weight_norm(self): @@ -748,6 +771,7 @@ class SynthesizerTrnMs256NSFsid(nn.Module): sid: torch.Tensor, skip_head: Optional[torch.Tensor] = None, return_length: Optional[torch.Tensor] = None, + return_length2: Optional[torch.Tensor] = None, ): g = self.emb_g(sid).unsqueeze(-1) if skip_head is not None and return_length is not None: @@ -767,7 +791,7 @@ class SynthesizerTrnMs256NSFsid(nn.Module): m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask z = self.flow(z_p, x_mask, g=g, reverse=True) - o = self.dec(z * x_mask, nsff0, g=g) + o = self.dec(z * x_mask, nsff0, g=g, n_res=return_length2) return o, x_mask, (z, z_p, m_p, logs_p) @@ -963,6 +987,7 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module): sid: torch.Tensor, skip_head: Optional[torch.Tensor] = None, return_length: Optional[torch.Tensor] = None, + return_length2: Optional[torch.Tensor] = None, ): g = self.emb_g(sid).unsqueeze(-1) if skip_head is not None and return_length is not None: @@ -981,7 +1006,7 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module): m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask z = self.flow(z_p, x_mask, g=g, reverse=True) - o = self.dec(z * x_mask, g=g) + o = self.dec(z * x_mask, g=g, n_res=return_length2) return o, x_mask, (z, z_p, m_p, logs_p) diff --git a/infer/lib/rtrvc.py b/infer/lib/rtrvc.py index aa5b86c..8da568c 100644 --- a/infer/lib/rtrvc.py +++ b/infer/lib/rtrvc.py @@ -15,6 +15,7 @@ import torch import torch.nn as nn import torch.nn.functional as F import torchcrepe +from torchaudio.transforms import Resample now_dir = os.getcwd() sys.path.append(now_dir) @@ -40,6 +41,7 @@ class RVC: def __init__( self, key, + formant, pth_path, index_path, index_rate, @@ -68,6 +70,7 @@ class RVC: # device="cpu"########强制cpu测试 self.device = config.device self.f0_up_key = key + self.formant_shift = formant self.f0_min = 50 self.f0_max = 1100 self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700) @@ -90,6 +93,8 @@ class RVC: 1024, device=self.device, dtype=torch.float32 ) + self.resample_kernel = {} + if last_rvc is None: models, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task( ["assets/hubert/hubert_base.pt"], @@ -187,6 +192,9 @@ class RVC: def change_key(self, new_key): self.f0_up_key = new_key + def change_formant(self, new_formant): + self.formant_shift = new_formant + def change_index_rate(self, new_index_rate): if new_index_rate != 0 and self.index_rate == 0: self.index = faiss.read_index(self.index_path) @@ -390,12 +398,14 @@ class RVC: printt("Index search FAILED") t3 = ttime() p_len = input_wav.shape[0] // 160 + factor = pow(2, self.formant_shift / 12) + return_length2 = int(np.ceil(return_length * factor)) if self.if_f0 == 1: f0_extractor_frame = block_frame_16k + 800 if f0method == "rmvpe": f0_extractor_frame = 5120 * ((f0_extractor_frame - 1) // 5120 + 1) - 160 pitch, pitchf = self.get_f0( - input_wav[-f0_extractor_frame:], self.f0_up_key, self.n_cpu, f0method + input_wav[-f0_extractor_frame:], self.f0_up_key - self.formant_shift, self.n_cpu, f0method ) shift = block_frame_16k // 160 self.cache_pitch[:-shift] = self.cache_pitch[shift:].clone() @@ -403,13 +413,14 @@ class RVC: self.cache_pitch[4 - pitch.shape[0] :] = pitch[3:-1] self.cache_pitchf[4 - pitch.shape[0] :] = pitchf[3:-1] cache_pitch = self.cache_pitch[None, -p_len:] - cache_pitchf = self.cache_pitchf[None, -p_len:] + cache_pitchf = self.cache_pitchf[None, -p_len:] * return_length2 / return_length t4 = ttime() feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) feats = feats[:, :p_len, :] p_len = torch.LongTensor([p_len]).to(self.device) sid = torch.LongTensor([0]).to(self.device) skip_head = torch.LongTensor([skip_head]) + return_length2 = torch.LongTensor([return_length2]) return_length = torch.LongTensor([return_length]) with torch.no_grad(): if self.if_f0 == 1: @@ -421,11 +432,24 @@ class RVC: sid, skip_head, return_length, + return_length2, ) else: infered_audio, _, _ = self.net_g.infer( - feats, p_len, sid, skip_head, return_length + feats, p_len, sid, skip_head, return_length, return_length2 ) + infered_audio = infered_audio.squeeze(1).float() + upp_res = int(np.floor(factor * self.tgt_sr // 100)) + if upp_res != self.tgt_sr // 100: + if upp_res not in self.resample_kernel: + self.resample_kernel[upp_res] = Resample( + orig_freq=upp_res, + new_freq=self.tgt_sr // 100, + dtype=torch.float32, + ).to(self.device) + infered_audio = self.resample_kernel[upp_res]( + infered_audio[:, : return_length * upp_res] + ) t5 = ttime() printt( "Spent time: fea = %.3fs, index = %.3fs, f0 = %.3fs, model = %.3fs", @@ -434,4 +458,4 @@ class RVC: t4 - t3, t5 - t4, ) - return infered_audio.squeeze().float() + return infered_audio.squeeze()