feat(all): optimize hierarchy of files

2026-06-06 17:50:25 +08:00 · 2024-04-20 21:29:25 +09:00
parent 1ac5e09f68
commit 4762e5bc21
30 changed files with 729 additions and 856 deletions
--- a/tools/cmd/calc_rvc_model_similarity.py
+++ b/tools/cmd/calc_rvc_model_similarity.py
@@ -0,0 +1,96 @@
+# This code references https://huggingface.co/JosephusCheung/ASimilarityCalculatior/blob/main/qwerty.py
+# Fill in the path of the model to be queried and the root directory of the reference models, and this script will return the similarity between the model to be queried and all reference models.
+import os
+import logging
+
+logger = logging.getLogger(__name__)
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+def cal_cross_attn(to_q, to_k, to_v, rand_input):
+    hidden_dim, embed_dim = to_q.shape
+    attn_to_q = nn.Linear(hidden_dim, embed_dim, bias=False)
+    attn_to_k = nn.Linear(hidden_dim, embed_dim, bias=False)
+    attn_to_v = nn.Linear(hidden_dim, embed_dim, bias=False)
+    attn_to_q.load_state_dict({"weight": to_q})
+    attn_to_k.load_state_dict({"weight": to_k})
+    attn_to_v.load_state_dict({"weight": to_v})
+
+    return torch.einsum(
+        "ik, jk -> ik",
+        F.softmax(
+            torch.einsum("ij, kj -> ik", attn_to_q(rand_input), attn_to_k(rand_input)),
+            dim=-1,
+        ),
+        attn_to_v(rand_input),
+    )
+
+
+def model_hash(filename):
+    try:
+        with open(filename, "rb") as file:
+            import hashlib
+
+            m = hashlib.sha256()
+
+            file.seek(0x100000)
+            m.update(file.read(0x10000))
+            return m.hexdigest()[0:8]
+    except FileNotFoundError:
+        return "NOFILE"
+
+
+def eval(model, n, input):
+    qk = f"enc_p.encoder.attn_layers.{n}.conv_q.weight"
+    uk = f"enc_p.encoder.attn_layers.{n}.conv_k.weight"
+    vk = f"enc_p.encoder.attn_layers.{n}.conv_v.weight"
+    atoq, atok, atov = model[qk][:, :, 0], model[uk][:, :, 0], model[vk][:, :, 0]
+
+    attn = cal_cross_attn(atoq, atok, atov, input)
+    return attn
+
+
+def main(path, root):
+    torch.manual_seed(114514)
+    model_a = torch.load(path, map_location="cpu")["weight"]
+
+    logger.info("Query:\t\t%s\t%s" % (path, model_hash(path)))
+
+    map_attn_a = {}
+    map_rand_input = {}
+    for n in range(6):
+        hidden_dim, embed_dim, _ = model_a[
+            f"enc_p.encoder.attn_layers.{n}.conv_v.weight"
+        ].shape
+        rand_input = torch.randn([embed_dim, hidden_dim])
+
+        map_attn_a[n] = eval(model_a, n, rand_input)
+        map_rand_input[n] = rand_input
+
+    del model_a
+
+    for name in sorted(list(os.listdir(root))):
+        path = "%s/%s" % (root, name)
+        model_b = torch.load(path, map_location="cpu")["weight"]
+
+        sims = []
+        for n in range(6):
+            attn_a = map_attn_a[n]
+            attn_b = eval(model_b, n, map_rand_input[n])
+
+            sim = torch.mean(torch.cosine_similarity(attn_a, attn_b))
+            sims.append(sim)
+
+        logger.info(
+            "Reference:\t%s\t%s\t%s"
+            % (path, model_hash(path), f"{torch.mean(torch.stack(sims)) * 1e2:.2f}%")
+        )
+
+
+if __name__ == "__main__":
+    query_path = r"assets\weights\mi v3.pth"
+    reference_root = r"assets\weights"
+    main(query_path, reference_root)
--- a/tools/cmd/infer-pm-index256.py
+++ b/tools/cmd/infer-pm-index256.py
@@ -0,0 +1,203 @@
+"""
+
+Run index retrieval on the source features.
+"""
+
+import os
+import logging
+
+logger = logging.getLogger(__name__)
+
+import parselmouth
+import torch
+
+os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+# import torchcrepe
+from time import time as ttime
+
+# import pyworld
+import librosa
+import numpy as np
+import soundfile as sf
+import torch.nn.functional as F
+from fairseq import checkpoint_utils
+
+# from models import SynthesizerTrn256#hifigan_nonsf
+# from lib.infer_pack.models import SynthesizerTrn256NSF as SynthesizerTrn256#hifigan_nsf
+from infer.lib.infer_pack.models import (
+    SynthesizerTrnMs256NSFsid as SynthesizerTrn256,
+)  # hifigan_nsf
+from scipy.io import wavfile
+
+# from lib.infer_pack.models import SynthesizerTrnMs256NSFsid_sim as SynthesizerTrn256#hifigan_nsf
+# from models import SynthesizerTrn256NSFsim as SynthesizerTrn256#hifigan_nsf
+# from models import SynthesizerTrn256NSFsimFlow as SynthesizerTrn256#hifigan_nsf
+
+
+# ---------------------------------------------------------------------------
+# Model setup (runs at import time).
+# 1. Load the feature-extractor checkpoint at `model_path` through fairseq,
+#    move it to `device`, convert it to half precision and switch to eval mode.
+# 2. Build the SynthesizerTrn256 generator, load its weights strictly, and put
+#    it in eval mode / half precision on the same device.
+# NOTE(review): all paths below are hard-coded to the author's machine.
+# ---------------------------------------------------------------------------
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model_path = r"E:\codes\py39\vits_vc_gpu_train\assets\hubert\hubert_base.pt"  #
+logger.info("Load model(s) from {}".format(model_path))
+models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
+    [model_path],
+    suffix="",
+)
+# Only the first model of the loaded ensemble is used.
+model = models[0]
+model = model.to(device)
+model = model.half()
+model.eval()
+
+# Alternative generator configurations kept for reference:
+# net_g = SynthesizerTrn256(1025,32,192,192,768,2,6,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2,2],512,[16,16,4,4],183,256,is_half=True)#hifigan#512#256
+# net_g = SynthesizerTrn256(1025,32,192,192,768,2,6,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2,2],512,[16,16,4,4],109,256,is_half=True)#hifigan#512#256
+# Active configuration; it differs from the first alternative above only in
+# the ninth positional argument (0 instead of 0.1), tagged "no_dropout" below.
+net_g = SynthesizerTrn256(
+    1025,
+    32,
+    192,
+    192,
+    768,
+    2,
+    6,
+    3,
+    0,
+    "1",
+    [3, 7, 11],
+    [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
+    [10, 10, 2, 2],
+    512,
+    [16, 16, 4, 4],
+    183,
+    256,
+    is_half=True,
+)  # hifigan#512#256#no_dropout
+# net_g = SynthesizerTrn256(1025,32,192,192,768,2,3,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2,2],512,[16,16,4,4],0)#ts3
+# net_g = SynthesizerTrn256(1025,32,192,192,768,2,6,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2],512,[16,16,4],0)#hifigan-ps-sr
+#
+# net_g = SynthesizerTrn(1025, 32, 192, 192, 768, 2, 6, 3, 0.1, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [5,5], 512, [15,15], 0)#ms
+# net_g = SynthesizerTrn(1025, 32, 192, 192, 768, 2, 6, 3, 0.1, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10,10], 512, [16,16], 0)#idwt2
+
+# Alternative weight files kept for reference:
+# weights=torch.load("infer/ft-mi_1k-noD.pt")
+# weights=torch.load("infer/ft-mi-freeze-vocoder-flow-enc_q_1k.pt")
+# weights=torch.load("infer/ft-mi-freeze-vocoder_true_1k.pt")
+# weights=torch.load("infer/ft-mi-sim1k.pt")
+weights = torch.load("infer/ft-mi-no_opt-no_dropout.pt")
+# strict=True makes load_state_dict raise on any missing or unexpected key;
+# its return value is only logged at debug level.
+logger.debug(net_g.load_state_dict(weights, strict=True))
+
+net_g.eval().to(device)
+net_g.half()
+
+
+def get_f0(x, p_len, f0_up_key=0):
+    time_step = 160 / 16000 * 1000
+    f0_min = 50
+    f0_max = 1100
+    f0_mel_min = 1127 * np.log(1 + f0_min / 700)
+    f0_mel_max = 1127 * np.log(1 + f0_max / 700)
+
+    f0 = (
+        parselmouth.Sound(x, 16000)
+        .to_pitch_ac(
+            time_step=time_step / 1000,
+            voicing_threshold=0.6,
+            pitch_floor=f0_min,
+            pitch_ceiling=f0_max,
+        )
+        .selected_array["frequency"]
+    )
+
+    pad_size = (p_len - len(f0) + 1) // 2
+    if pad_size > 0 or p_len - len(f0) - pad_size > 0:
+        f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant")
+    f0 *= pow(2, f0_up_key / 12)
+    f0bak = f0.copy()
+
+    f0_mel = 1127 * np.log(1 + f0 / 700)
+    f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
+        f0_mel_max - f0_mel_min
+    ) + 1
+    f0_mel[f0_mel <= 1] = 1
+    f0_mel[f0_mel > 255] = 255
+    # f0_mel[f0_mel > 188] = 188
+    f0_coarse = np.rint(f0_mel).astype(np.int32)
+    return f0_coarse, f0bak
+
+
+import faiss
+
+# ---------------------------------------------------------------------------
+# Inference loop (runs at import time). For every listed wav file:
+#   t0 -> t1: extract features, replace each frame by its nearest neighbour
+#             from the faiss index, and upsample the sequence by 2
+#   t1 -> t2: extract the pitch contour
+#   t2 -> t3: run the generator and fetch the waveform
+# The three stage durations are accumulated in ta0 / ta1 / ta2.
+# ---------------------------------------------------------------------------
+index = faiss.read_index("infer/added_IVF512_Flat_mi_baseline_src_feat.index")
+big_npy = np.load("infer/big_src_feature_mi.npy")
+ta0 = ta1 = ta2 = 0
+for idx, name in enumerate(
+    [
+        "冬之花clip1.wav",
+    ]
+):  ##
+    wav_path = "todo-songs/%s" % name  #
+    f0_up_key = -2  #
+    audio, sampling_rate = sf.read(wav_path)
+    # Downmix multi-channel audio and resample to the 16 kHz used below.
+    if len(audio.shape) > 1:
+        audio = librosa.to_mono(audio.transpose(1, 0))
+    if sampling_rate != 16000:
+        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
+
+    feats = torch.from_numpy(audio).float()
+    if feats.dim() == 2:  # double channels
+        feats = feats.mean(-1)
+    assert feats.dim() == 1, feats.dim()
+    feats = feats.view(1, -1)
+    padding_mask = torch.BoolTensor(feats.shape).fill_(False)
+    inputs = {
+        "source": feats.half().to(device),
+        "padding_mask": padding_mask.to(device),
+        "output_layer": 9,  # layer 9
+    }
+    # Synchronise before reading the clock so pending GPU work is not
+    # attributed to the wrong stage.
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+    t0 = ttime()
+    with torch.no_grad():
+        logits = model.extract_features(**inputs)
+        feats = model.final_proj(logits[0])
+
+    #### Index optimisation: swap every frame for its top-1 match in big_npy.
+    npy = feats[0].cpu().numpy().astype("float32")
+    D, I = index.search(npy, 1)
+    feats = (
+        torch.from_numpy(big_npy[I.squeeze()].astype("float16")).unsqueeze(0).to(device)
+    )
+
+    # Interpolate along the middle axis (moved last for F.interpolate) by 2x.
+    feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+    t1 = ttime()
+    # p_len = min(feats.shape[1],10000,pitch.shape[0])  # too large runs out of GPU memory
+    p_len = min(feats.shape[1], 10000)  #
+    pitch, pitchf = get_f0(audio, p_len, f0_up_key)
+    p_len = min(feats.shape[1], 10000, pitch.shape[0])  # too large runs out of GPU memory
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+    t2 = ttime()
+    # Trim features and pitch to the common length before batching them.
+    feats = feats[:, :p_len, :]
+    pitch = pitch[:p_len]
+    pitchf = pitchf[:p_len]
+    p_len = torch.LongTensor([p_len]).to(device)
+    pitch = torch.LongTensor(pitch).unsqueeze(0).to(device)
+    sid = torch.LongTensor([0]).to(device)
+    pitchf = torch.FloatTensor(pitchf).unsqueeze(0).to(device)
+    with torch.no_grad():
+        audio = (
+            net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0]
+            .data.cpu()
+            .float()
+            .numpy()
+        )  # nsf
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+    t3 = ttime()
+    ta0 += t1 - t0
+    ta1 += t2 - t1
+    ta2 += t3 - t2
+    # Output names used for other weight files:
+    # wavfile.write("ft-mi_1k-index256-noD-%s.wav"%name, 40000, audio)##
+    # wavfile.write("ft-mi-freeze-vocoder-flow-enc_q_1k-%s.wav"%name, 40000, audio)##
+    # wavfile.write("ft-mi-sim1k-%s.wav"%name, 40000, audio)##
+    # The result is written with a 40000 Hz sample rate.
+    wavfile.write("ft-mi-no_opt-no_dropout-%s.wav" % name, 40000, audio)  ##
+
+
+# Accumulated seconds per stage: features+index, pitch, synthesis.
+logger.debug("%.2fs %.2fs %.2fs", ta0, ta1, ta2)  #
--- a/tools/cmd/infer_batch_rvc.py
+++ b/tools/cmd/infer_batch_rvc.py
@@ -0,0 +1,72 @@
+import argparse
+import os
+import sys
+
+# Echo the raw argv so the exact invocation shows up in the console output.
+print("Command-line arguments:", sys.argv)
+
+# Put the current working directory on sys.path before the project imports
+# below (`configs`, `infer`) - presumably the script is meant to be started
+# from the repository root; confirm.
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+import sys
+
+import tqdm as tq
+from dotenv import load_dotenv
+from scipy.io import wavfile
+
+from configs.config import Config
+from infer.modules.vc.modules import VC
+
+
+def arg_parse() -> tuple:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--f0up_key", type=int, default=0)
+    parser.add_argument("--input_path", type=str, help="input path")
+    parser.add_argument("--index_path", type=str, help="index path")
+    parser.add_argument("--f0method", type=str, default="harvest", help="harvest or pm")
+    parser.add_argument("--opt_path", type=str, help="opt path")
+    parser.add_argument("--model_name", type=str, help="store in assets/weight_root")
+    parser.add_argument("--index_rate", type=float, default=0.66, help="index rate")
+    parser.add_argument("--device", type=str, help="device")
+    parser.add_argument("--is_half", type=bool, help="use half -> True")
+    parser.add_argument("--filter_radius", type=int, default=3, help="filter radius")
+    parser.add_argument("--resample_sr", type=int, default=0, help="resample sr")
+    parser.add_argument("--rms_mix_rate", type=float, default=1, help="rms mix rate")
+    parser.add_argument("--protect", type=float, default=0.33, help="protect")
+
+    args = parser.parse_args()
+    sys.argv = sys.argv[:1]
+
+    return args
+
+
+def main():
+    load_dotenv()
+    args = arg_parse()
+    config = Config()
+    config.device = args.device if args.device else config.device
+    config.is_half = args.is_half if args.is_half else config.is_half
+    vc = VC(config)
+    vc.get_vc(args.model_name)
+    audios = os.listdir(args.input_path)
+    for file in tq.tqdm(audios):
+        if file.endswith(".wav"):
+            file_path = os.path.join(args.input_path, file)
+            _, wav_opt = vc.vc_single(
+                0,
+                file_path,
+                args.f0up_key,
+                None,
+                args.f0method,
+                args.index_path,
+                None,
+                args.index_rate,
+                args.filter_radius,
+                args.resample_sr,
+                args.rms_mix_rate,
+                args.protect,
+            )
+            out_path = os.path.join(args.opt_path, file)
+            wavfile.write(out_path, wav_opt[0], wav_opt[1])
+
+
+if __name__ == "__main__":
+    main()
--- a/tools/cmd/infer_cli.py
+++ b/tools/cmd/infer_cli.py
@@ -0,0 +1,67 @@
+import argparse
+import os
+import sys
+
+# Put the current working directory on sys.path before the project imports
+# below (`configs`, `infer`) - presumably the script is meant to be started
+# from the repository root; confirm.
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+from dotenv import load_dotenv
+from scipy.io import wavfile
+
+from configs.config import Config
+from infer.modules.vc.modules import VC
+
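+####
+# USAGE
+#
+# Run from a terminal (or CMD), for example:
+#   python tools/cmd/infer_cli.py --model_name <model> --input_path <in.wav> --opt_path <out.wav>
+# See arg_parse() below for the remaining options and their defaults.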
+
+
+def arg_parse() -> tuple:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--f0up_key", type=int, default=0)
+    parser.add_argument("--input_path", type=str, help="input path")
+    parser.add_argument("--index_path", type=str, help="index path")
+    parser.add_argument("--f0method", type=str, default="harvest", help="harvest or pm")
+    parser.add_argument("--opt_path", type=str, help="opt path")
+    parser.add_argument("--model_name", type=str, help="store in assets/weight_root")
+    parser.add_argument("--index_rate", type=float, default=0.66, help="index rate")
+    parser.add_argument("--device", type=str, help="device")
+    parser.add_argument("--is_half", type=bool, help="use half -> True")
+    parser.add_argument("--filter_radius", type=int, default=3, help="filter radius")
+    parser.add_argument("--resample_sr", type=int, default=0, help="resample sr")
+    parser.add_argument("--rms_mix_rate", type=float, default=1, help="rms mix rate")
+    parser.add_argument("--protect", type=float, default=0.33, help="protect")
+
+    args = parser.parse_args()
+    sys.argv = sys.argv[:1]
+
+    return args
+
+
+def main():
+    load_dotenv()
+    args = arg_parse()
+    config = Config()
+    config.device = args.device if args.device else config.device
+    config.is_half = args.is_half if args.is_half else config.is_half
+    vc = VC(config)
+    vc.get_vc(args.model_name)
+    _, wav_opt = vc.vc_single(
+        0,
+        args.input_path,
+        args.f0up_key,
+        None,
+        args.f0method,
+        args.index_path,
+        None,
+        args.index_rate,
+        args.filter_radius,
+        args.resample_sr,
+        args.rms_mix_rate,
+        args.protect,
+    )
+    wavfile.write(args.opt_path, wav_opt[0], wav_opt[1])
+
+
+if __name__ == "__main__":
+    main()
--- a/tools/cmd/train-index-v2.py
+++ b/tools/cmd/train-index-v2.py
@@ -0,0 +1,80 @@
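+"""
+Format: cid is used directly as the built-in index position; aid no longer fits, so it is looked up through a dictionary - there are only about 50k of them anyway.
+"""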
+
+import os
+import traceback
+import logging
+
+logger = logging.getLogger(__name__)
+
+from multiprocessing import cpu_count
+
+import faiss
+import numpy as np
+from sklearn.cluster import MiniBatchKMeans
+
+# ###########如果是原始特征要先写save
+n_cpu = 0
+if n_cpu == 0:
+    n_cpu = cpu_count()
+inp_root = r"./logs/anz/3_feature768"
+npys = []
+listdir_res = list(os.listdir(inp_root))
+for name in sorted(listdir_res):
+    phone = np.load("%s/%s" % (inp_root, name))
+    npys.append(phone)
+big_npy = np.concatenate(npys, 0)
+big_npy_idx = np.arange(big_npy.shape[0])
+np.random.shuffle(big_npy_idx)
+big_npy = big_npy[big_npy_idx]
+logger.debug(big_npy.shape)  # (6196072, 192)#fp32#4.43G
+if big_npy.shape[0] > 2e5:
+    # if(1):
+    info = "Trying doing kmeans %s shape to 10k centers." % big_npy.shape[0]
+    logger.info(info)
+    try:
+        big_npy = (
+            MiniBatchKMeans(
+                n_clusters=10000,
+                verbose=True,
+                batch_size=256 * n_cpu,
+                compute_labels=False,
+                init="random",
+            )
+            .fit(big_npy)
+            .cluster_centers_
+        )
+    except:
+        info = traceback.format_exc()
+        logger.warning(info)
+
+np.save("tools/infer/big_src_feature_mi.npy", big_npy)
+
+##################train+add
+# big_npy=np.load("/bili-coeus/jupyter/jupyterhub-liujing04/vits_ch/inference_f0/big_src_feature_mi.npy")
+n_ivf = min(int(16 * np.sqrt(big_npy.shape[0])), big_npy.shape[0] // 39)
+index = faiss.index_factory(768, "IVF%s,Flat" % n_ivf)  # mi
+logger.info("Training...")
+index_ivf = faiss.extract_index_ivf(index)  #
+index_ivf.nprobe = 1
+index.train(big_npy)
+faiss.write_index(
+    index, "tools/infer/trained_IVF%s_Flat_baseline_src_feat_v2.index" % (n_ivf)
+)
+logger.info("Adding...")
+batch_size_add = 8192
+for i in range(0, big_npy.shape[0], batch_size_add):
+    index.add(big_npy[i : i + batch_size_add])
+faiss.write_index(
+    index, "tools/infer/added_IVF%s_Flat_mi_baseline_src_feat.index" % (n_ivf)
+)
+"""
+Sizes (all FP32)
+big_src_feature 2.95G
+    (3098036, 256)
+big_emb         4.43G
+    (6196072, 192)
+big_emb is twice as long because the features have to be repeated before pitch is added
+
+"""
--- a/tools/cmd/train-index.py
+++ b/tools/cmd/train-index.py
@@ -0,0 +1,43 @@
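+"""
+Format: cid is used directly as the built-in index position; aid no longer fits, so it is looked up through a dictionary - there are only about 50k of them anyway.
+"""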
+
+import os
+import logging
+
+logger = logging.getLogger(__name__)
+
+import faiss
+import numpy as np
+
+# ###########如果是原始特征要先写save
+inp_root = r"E:\codes\py39\dataset\mi\2-co256"
+npys = []
+for name in sorted(list(os.listdir(inp_root))):
+    phone = np.load("%s/%s" % (inp_root, name))
+    npys.append(phone)
+big_npy = np.concatenate(npys, 0)
+logger.debug(big_npy.shape)  # (6196072, 192)#fp32#4.43G
+np.save("infer/big_src_feature_mi.npy", big_npy)
+
+##################train+add
+# big_npy=np.load("/bili-coeus/jupyter/jupyterhub-liujing04/vits_ch/inference_f0/big_src_feature_mi.npy")
+logger.debug(big_npy.shape)
+index = faiss.index_factory(256, "IVF512,Flat")  # mi
+logger.info("Training...")
+index_ivf = faiss.extract_index_ivf(index)  #
+index_ivf.nprobe = 9
+index.train(big_npy)
+faiss.write_index(index, "infer/trained_IVF512_Flat_mi_baseline_src_feat.index")
+logger.info("Adding...")
+index.add(big_npy)
+faiss.write_index(index, "infer/added_IVF512_Flat_mi_baseline_src_feat.index")
+"""
+Sizes (all FP32)
+big_src_feature 2.95G
+    (3098036, 256)
+big_emb         4.43G
+    (6196072, 192)
+big_emb is twice as long because the features have to be repeated before pitch is added
+
+"""
--- a/tools/cmd/trans_weights.py
+++ b/tools/cmd/trans_weights.py
@@ -0,0 +1,18 @@
+import pdb
+
+import torch
+
+# a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-suc\G_1000.pth")["model"]#sim_nsf#
+# a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-freeze-vocoder-flow-enc_q\G_1000.pth")["model"]#sim_nsf#
+# a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-freeze-vocoder\G_1000.pth")["model"]#sim_nsf#
+# a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-test\G_1000.pth")["model"]#sim_nsf#
+a = torch.load(
+    r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-no_opt-no_dropout\G_1000.pth"
+)[
+    "model"
+]  # sim_nsf#
+for key in a.keys():
+    a[key] = a[key].half()
+# torch.save(a,"ft-mi-freeze-vocoder_true_1k.pt")#
+# torch.save(a,"ft-mi-sim1k.pt")#
+torch.save(a, "ft-mi-no_opt-no_dropout.pt")  #