1
0
mirror of https://github.com/fumiama/Retrieval-based-Voice-Conversion-WebUI.git synced 2026-06-06 17:50:25 +08:00

feat(all): optimize hierarchy of files

This commit is contained in:
源文雨
2024-04-20 21:29:25 +09:00
parent 1ac5e09f68
commit 4762e5bc21
30 changed files with 729 additions and 856 deletions

View File

@@ -0,0 +1,96 @@
# This code references https://huggingface.co/JosephusCheung/ASimilarityCalculatior/blob/main/qwerty.py
# Fill in the path of the model to be queried and the root directory of the reference models, and this script will return the similarity between the model to be queried and all reference models.
import os
import logging
logger = logging.getLogger(__name__)
import torch
import torch.nn as nn
import torch.nn.functional as F
def cal_cross_attn(to_q, to_k, to_v, rand_input):
    """Push ``rand_input`` through a bias-free attention built from raw weights.

    Args:
        to_q: 2-D weight matrix for the query projection.
        to_k: 2-D weight matrix for the key projection.
        to_v: 2-D weight matrix for the value projection.
        rand_input: 2-D probe tensor fed to all three projections.

    Returns:
        The row-wise softmax of ``q @ k.T`` where every entry ``[i, k]`` is
        scaled by the sum over rows of column ``k`` of the value projection
        (that is what the ``"ik, jk -> ik"`` contraction computes).
    """
    hidden_dim, embed_dim = to_q.shape

    projected = []
    # Layers are created in q, k, v order, exactly one per weight matrix.
    # NOTE(review): nn.Linear initialises its weight randomly on construction,
    # so the number/order of constructions presumably affects later random
    # draws made by callers that seeded the RNG - keep it stable.
    for weight in (to_q, to_k, to_v):
        layer = nn.Linear(hidden_dim, embed_dim, bias=False)
        layer.load_state_dict({"weight": weight})
        projected.append(layer(rand_input))
    query, key, value = projected

    scores = torch.einsum("ij, kj -> ik", query, key)
    attn = F.softmax(scores, dim=-1)
    return torch.einsum("ik, jk -> ik", attn, value)
def model_hash(filename):
    """Return a short fingerprint of a model file.

    The fingerprint is the first 8 hex digits of the SHA-256 of the 0x10000
    bytes that start at offset 0x100000. A file shorter than that offset
    hashes as empty input.

    Args:
        filename: Path of the file to fingerprint.

    Returns:
        The 8-character hex string, or ``"NOFILE"`` when the file does not exist.
    """
    import hashlib

    try:
        with open(filename, "rb") as handle:
            handle.seek(0x100000)
            chunk = handle.read(0x10000)
    except FileNotFoundError:
        return "NOFILE"
    return hashlib.sha256(chunk).hexdigest()[:8]
def eval(model, n, input):
    """Compute the probe attention output for encoder attention layer ``n``.

    Args:
        model: Mapping from parameter name to tensor (a checkpoint's weights).
        n: Index of the ``enc_p.encoder.attn_layers`` entry to inspect.
        input: Probe tensor forwarded to ``cal_cross_attn``.

    Returns:
        Whatever ``cal_cross_attn`` returns for that layer's q/k/v weights.

    Note:
        The function and its last parameter shadow Python builtins; the names
        are kept because callers depend on them.
    """
    prefix = f"enc_p.encoder.attn_layers.{n}"
    # The conv weights are 3-D; only index 0 of the last axis is used.
    to_q, to_k, to_v = (
        model[f"{prefix}.conv_{proj}.weight"][:, :, 0] for proj in ("q", "k", "v")
    )
    return cal_cross_attn(to_q, to_k, to_v, input)
def main(path, root):
    """Log the similarity between a query model and every model under ``root``.

    For each of the six encoder attention layers a random probe is drawn
    (with a fixed seed, so runs are repeatable) and pushed through the query
    model's attention weights. Every reference checkpoint is probed with the
    same inputs, and the mean cosine similarity over all layers is logged as
    a percentage.

    Args:
        path: Path of the query checkpoint; must contain a ``"weight"`` entry.
        root: Directory holding the reference checkpoints. Only regular files
            ending in ``.pth`` are evaluated; anything else (sub-directories,
            ``.gitignore`` and similar) is skipped instead of crashing
            ``torch.load``.
    """
    n_layers = 6  # encoder attention layers that are compared
    torch.manual_seed(114514)
    # NOTE(review): torch.load unpickles the file; only point this script at
    # checkpoints from a trusted source.
    model_a = torch.load(path, map_location="cpu")["weight"]
    logger.info("Query:\t\t%s\t%s", path, model_hash(path))

    map_attn_a = {}
    map_rand_input = {}
    # Only forward values are needed, so autograd bookkeeping is disabled.
    with torch.no_grad():
        for n in range(n_layers):
            hidden_dim, embed_dim, _ = model_a[
                f"enc_p.encoder.attn_layers.{n}.conv_v.weight"
            ].shape
            rand_input = torch.randn([embed_dim, hidden_dim])
            map_attn_a[n] = eval(model_a, n, rand_input)
            map_rand_input[n] = rand_input
    del model_a  # release the query weights before loading references

    for name in sorted(os.listdir(root)):
        ref_path = os.path.join(root, name)
        if not (name.endswith(".pth") and os.path.isfile(ref_path)):
            logger.debug("Skipping non-checkpoint entry: %s", ref_path)
            continue
        model_b = torch.load(ref_path, map_location="cpu")["weight"]
        with torch.no_grad():
            sims = [
                torch.mean(
                    torch.cosine_similarity(
                        map_attn_a[n], eval(model_b, n, map_rand_input[n])
                    )
                )
                for n in range(n_layers)
            ]
        logger.info(
            "Reference:\t%s\t%s\t%s",
            ref_path,
            model_hash(ref_path),
            f"{torch.mean(torch.stack(sims)) * 1e2:.2f}%",
        )


if __name__ == "__main__":
    # Hard-coded example locations; edit these before running the script.
    query_path = r"assets\weights\mi v3.pth"
    reference_root = r"assets\weights"
    main(query_path, reference_root)

View File

@@ -0,0 +1,203 @@
"""
Run retrieval (index lookup) on the source features.
"""
import os
import logging
logger = logging.getLogger(__name__)
import parselmouth
import torch
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# import torchcrepe
from time import time as ttime
# import pyworld
import librosa
import numpy as np
import soundfile as sf
import torch.nn.functional as F
from fairseq import checkpoint_utils
# from models import SynthesizerTrn256#hifigan_nonsf
# from lib.infer_pack.models import SynthesizerTrn256NSF as SynthesizerTrn256#hifigan_nsf
from infer.lib.infer_pack.models import (
SynthesizerTrnMs256NSFsid as SynthesizerTrn256,
) # hifigan_nsf
from scipy.io import wavfile
# from lib.infer_pack.models import SynthesizerTrnMs256NSFsid_sim as SynthesizerTrn256#hifigan_nsf
# from models import SynthesizerTrn256NSFsim as SynthesizerTrn256#hifigan_nsf
# from models import SynthesizerTrn256NSFsimFlow as SynthesizerTrn256#hifigan_nsf
# ---- Module-level setup: runs at import time and loads both networks. ----
# Use CUDA when it is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): hard-coded absolute Windows path - adjust for your machine.
model_path = r"E:\codes\py39\vits_vc_gpu_train\assets\hubert\hubert_base.pt" #
logger.info("Load model(s) from {}".format(model_path))
# Only the first model of the returned ensemble is used; saved_cfg and task
# are not referenced again in this section.
models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
[model_path],
suffix="",
)
model = models[0]
model = model.to(device)
# The feature extractor runs in half precision and in eval mode.
model = model.half()
model.eval()
# Earlier synthesizer configurations, kept commented out for reference.
# net_g = SynthesizerTrn256(1025,32,192,192,768,2,6,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2,2],512,[16,16,4,4],183,256,is_half=True)#hifigan#512#256
# net_g = SynthesizerTrn256(1025,32,192,192,768,2,6,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2,2],512,[16,16,4,4],109,256,is_half=True)#hifigan#512#256
# Active configuration. The arguments are positional; the 9th one is 0 here
# where the variants above pass 0.1 (presumably the dropout rate, matching
# the "no_dropout" tag below - TODO confirm against SynthesizerTrnMs256NSFsid).
net_g = SynthesizerTrn256(
1025,
32,
192,
192,
768,
2,
6,
3,
0,
"1",
[3, 7, 11],
[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
[10, 10, 2, 2],
512,
[16, 16, 4, 4],
183,
256,
is_half=True,
) # hifigan#512#256#no_dropout
# net_g = SynthesizerTrn256(1025,32,192,192,768,2,3,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2,2],512,[16,16,4,4],0)#ts3
# net_g = SynthesizerTrn256(1025,32,192,192,768,2,6,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2],512,[16,16,4],0)#hifigan-ps-sr
#
# net_g = SynthesizerTrn(1025, 32, 192, 192, 768, 2, 6, 3, 0.1, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [5,5], 512, [15,15], 0)#ms
# net_g = SynthesizerTrn(1025, 32, 192, 192, 768, 2, 6, 3, 0.1, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10,10], 512, [16,16], 0)#idwt2
# Alternative weight files tried earlier, kept commented out for reference.
# weights=torch.load("infer/ft-mi_1k-noD.pt")
# weights=torch.load("infer/ft-mi-freeze-vocoder-flow-enc_q_1k.pt")
# weights=torch.load("infer/ft-mi-freeze-vocoder_true_1k.pt")
# weights=torch.load("infer/ft-mi-sim1k.pt")
# The path is relative to the current working directory.
weights = torch.load("infer/ft-mi-no_opt-no_dropout.pt")
# strict=True: every key must match; the load result is logged at debug level.
logger.debug(net_g.load_state_dict(weights, strict=True))
# Put the synthesizer in eval mode on the target device, in half precision.
net_g.eval().to(device)
net_g.half()
def get_f0(x, p_len, f0_up_key=0):
    """Estimate pitch with Praat and quantise it to coarse mel bins.

    Args:
        x: Audio samples; handed to ``parselmouth.Sound`` with a 16000 Hz rate.
        p_len: Target number of frames. A shorter pitch track is zero-padded
            on both sides to this length; a longer one is left as is.
        f0_up_key: Transposition in semitones applied to the pitch track.

    Returns:
        A tuple ``(f0_coarse, f0bak)``: the int32 array of mel-scale bins
        limited to 1..255, and the transposed pitch track in Hz.
    """
    time_step = 160 / 16000 * 1000  # analysis hop in milliseconds
    f0_min, f0_max = 50, 1100
    mel_floor = 1127 * np.log(1 + f0_min / 700)
    mel_ceiling = 1127 * np.log(1 + f0_max / 700)

    pitch = parselmouth.Sound(x, 16000).to_pitch_ac(
        time_step=time_step / 1000,
        voicing_threshold=0.6,
        pitch_floor=f0_min,
        pitch_ceiling=f0_max,
    )
    f0 = pitch.selected_array["frequency"]

    # Centre the track inside p_len frames; the leading side gets the extra
    # frame when the shortfall is odd.
    shortfall = p_len - len(f0)
    lead = (shortfall + 1) // 2
    trail = shortfall - lead
    if lead > 0 or trail > 0:
        f0 = np.pad(f0, [[lead, trail]], mode="constant")

    f0 *= pow(2, f0_up_key / 12)
    f0bak = f0.copy()

    f0_mel = 1127 * np.log(1 + f0 / 700)
    voiced = f0_mel > 0
    # Map voiced frames linearly so that [mel_floor, mel_ceiling] -> [1, 255].
    f0_mel[voiced] = (f0_mel[voiced] - mel_floor) * 254 / (
        mel_ceiling - mel_floor
    ) + 1
    f0_mel = np.clip(f0_mel, 1, 255)
    # f0_mel[f0_mel > 188] = 188
    f0_coarse = np.rint(f0_mel).astype(np.int32)
    return f0_coarse, f0bak
import faiss


def _cuda_sync():
    """Wait for pending CUDA work so wall-clock timings are meaningful."""
    if torch.cuda.is_available():
        torch.cuda.synchronize()


# Retrieval index and the feature bank its ids point into.
index = faiss.read_index("infer/added_IVF512_Flat_mi_baseline_src_feat.index")
big_npy = np.load("infer/big_src_feature_mi.npy")
# Accumulated seconds: feature extraction + retrieval, pitch, synthesis.
ta0 = ta1 = ta2 = 0
for idx, name in enumerate(["冬之花clip1.wav"]):
    wav_path = "todo-songs/%s" % name
    f0_up_key = -2

    # Load the clip as 16 kHz mono.
    audio, sampling_rate = sf.read(wav_path)
    if len(audio.shape) > 1:
        audio = librosa.to_mono(audio.transpose(1, 0))
    if sampling_rate != 16000:
        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)

    feats = torch.from_numpy(audio).float()
    if feats.dim() == 2:  # double channels
        feats = feats.mean(-1)
    assert feats.dim() == 1, feats.dim()
    feats = feats.view(1, -1)
    padding_mask = torch.BoolTensor(feats.shape).fill_(False)
    inputs = {
        "source": feats.half().to(device),
        "padding_mask": padding_mask.to(device),
        "output_layer": 9,  # layer 9
    }

    _cuda_sync()
    t0 = ttime()
    with torch.no_grad():
        logits = model.extract_features(**inputs)
        feats = model.final_proj(logits[0])

    # Index refinement: swap every frame for its nearest neighbour in the bank.
    npy = feats[0].cpu().numpy().astype("float32")
    D, I = index.search(npy, 1)
    nearest = big_npy[I.squeeze()].astype("float16")
    feats = torch.from_numpy(nearest).unsqueeze(0).to(device)
    # Double the frame count along the time axis.
    feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)

    _cuda_sync()
    t1 = ttime()
    # p_len = min(feats.shape[1],10000,pitch.shape[0])  # too large -> GPU OOM
    p_len = min(feats.shape[1], 10000)
    pitch, pitchf = get_f0(audio, p_len, f0_up_key)
    # Cap the length again: too many frames exhaust GPU memory.
    p_len = min(feats.shape[1], 10000, pitch.shape[0])

    _cuda_sync()
    t2 = ttime()
    feats = feats[:, :p_len, :]
    pitch = pitch[:p_len]
    pitchf = pitchf[:p_len]
    p_len = torch.LongTensor([p_len]).to(device)
    pitch = torch.LongTensor(pitch).unsqueeze(0).to(device)
    sid = torch.LongTensor([0]).to(device)
    pitchf = torch.FloatTensor(pitchf).unsqueeze(0).to(device)
    with torch.no_grad():
        synthesized = net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0]
        audio = synthesized.data.cpu().float().numpy()  # nsf

    _cuda_sync()
    t3 = ttime()
    ta0 += t1 - t0
    ta1 += t2 - t1
    ta2 += t3 - t2
    # wavfile.write("ft-mi_1k-index256-noD-%s.wav"%name, 40000, audio)##
    # wavfile.write("ft-mi-freeze-vocoder-flow-enc_q_1k-%s.wav"%name, 40000, audio)##
    # wavfile.write("ft-mi-sim1k-%s.wav"%name, 40000, audio)##
    wavfile.write("ft-mi-no_opt-no_dropout-%s.wav" % name, 40000, audio)

logger.debug("%.2fs %.2fs %.2fs", ta0, ta1, ta2)

View File

@@ -0,0 +1,72 @@
import argparse
import os
import sys
print("Command-line arguments:", sys.argv)
now_dir = os.getcwd()
sys.path.append(now_dir)
import sys
import tqdm as tq
from dotenv import load_dotenv
from scipy.io import wavfile
from configs.config import Config
from infer.modules.vc.modules import VC
def arg_parse() -> argparse.Namespace:
    """Parse the batch-inference command-line options.

    ``--is_half`` accepts textual booleans (``true``/``false``, ``1``/``0``,
    ``yes``/``no``, ...). Plain ``type=bool`` would turn every non-empty
    string, including ``"False"``, into ``True``.

    Side effect: ``sys.argv`` is truncated to the program name after parsing.

    Returns:
        The parsed namespace. Options without a default are ``None`` when
        they are not supplied.
    """

    def _str2bool(value: str) -> bool:
        """Convert a textual flag to bool; the empty string counts as False."""
        lowered = value.strip().lower()
        if lowered in {"1", "true", "t", "yes", "y", "on"}:
            return True
        if lowered in {"", "0", "false", "f", "no", "n", "off"}:
            return False
        raise argparse.ArgumentTypeError("expected a boolean, got %r" % value)

    parser = argparse.ArgumentParser()
    parser.add_argument("--f0up_key", type=int, default=0)
    parser.add_argument("--input_path", type=str, help="input path")
    parser.add_argument("--index_path", type=str, help="index path")
    parser.add_argument("--f0method", type=str, default="harvest", help="harvest or pm")
    parser.add_argument("--opt_path", type=str, help="opt path")
    parser.add_argument("--model_name", type=str, help="store in assets/weight_root")
    parser.add_argument("--index_rate", type=float, default=0.66, help="index rate")
    parser.add_argument("--device", type=str, help="device")
    parser.add_argument("--is_half", type=_str2bool, help="use half -> True")
    parser.add_argument("--filter_radius", type=int, default=3, help="filter radius")
    parser.add_argument("--resample_sr", type=int, default=0, help="resample sr")
    parser.add_argument("--rms_mix_rate", type=float, default=1, help="rms mix rate")
    parser.add_argument("--protect", type=float, default=0.33, help="protect")

    args = parser.parse_args()
    # Drop the consumed options so later argv readers see only the program name.
    sys.argv = sys.argv[:1]
    return args
def main():
    """Convert every ``.wav`` file in ``--input_path`` and write it to ``--opt_path``.

    Command-line values for ``--device`` and ``--is_half`` override the
    defaults from ``Config``. ``is_half`` is compared against ``None`` so
    that an explicit ``False`` is honoured instead of being dropped by a
    truthiness test.
    """
    load_dotenv()
    args = arg_parse()
    config = Config()
    if args.device:
        config.device = args.device
    if args.is_half is not None:
        config.is_half = args.is_half
    vc = VC(config)
    vc.get_vc(args.model_name)

    # Make sure the destination exists before the first write.
    os.makedirs(args.opt_path, exist_ok=True)
    # Filter first so the progress bar counts only files that get converted.
    wav_files = sorted(
        name for name in os.listdir(args.input_path) if name.endswith(".wav")
    )
    for file in tq.tqdm(wav_files):
        file_path = os.path.join(args.input_path, file)
        info, wav_opt = vc.vc_single(
            0,
            file_path,
            args.f0up_key,
            None,
            args.f0method,
            args.index_path,
            None,
            args.index_rate,
            args.filter_radius,
            args.resample_sr,
            args.rms_mix_rate,
            args.protect,
        )
        # NOTE(review): presumably vc_single reports a failed conversion by
        # returning None in place of the rate/samples - confirm against
        # VC.vc_single. Skip such files rather than aborting the batch.
        if wav_opt is None or wav_opt[0] is None or wav_opt[1] is None:
            print("Skipping %s: %s" % (file_path, info), file=sys.stderr)
            continue
        out_path = os.path.join(args.opt_path, file)
        wavfile.write(out_path, wav_opt[0], wav_opt[1])


if __name__ == "__main__":
    main()

67
tools/cmd/infer_cli.py Normal file
View File

@@ -0,0 +1,67 @@
import argparse
import os
import sys
now_dir = os.getcwd()
sys.path.append(now_dir)
from dotenv import load_dotenv
from scipy.io import wavfile
from configs.config import Config
from infer.modules.vc.modules import VC
####
# USAGE
#
# In your Terminal or CMD or whatever
def arg_parse() -> argparse.Namespace:
    """Parse the single-file inference command-line options.

    ``--is_half`` accepts textual booleans (``true``/``false``, ``1``/``0``,
    ``yes``/``no``, ...). Plain ``type=bool`` would turn every non-empty
    string, including ``"False"``, into ``True``.

    Side effect: ``sys.argv`` is truncated to the program name after parsing.

    Returns:
        The parsed namespace. Options without a default are ``None`` when
        they are not supplied.
    """

    def _str2bool(value: str) -> bool:
        """Convert a textual flag to bool; the empty string counts as False."""
        lowered = value.strip().lower()
        if lowered in {"1", "true", "t", "yes", "y", "on"}:
            return True
        if lowered in {"", "0", "false", "f", "no", "n", "off"}:
            return False
        raise argparse.ArgumentTypeError("expected a boolean, got %r" % value)

    parser = argparse.ArgumentParser()
    parser.add_argument("--f0up_key", type=int, default=0)
    parser.add_argument("--input_path", type=str, help="input path")
    parser.add_argument("--index_path", type=str, help="index path")
    parser.add_argument("--f0method", type=str, default="harvest", help="harvest or pm")
    parser.add_argument("--opt_path", type=str, help="opt path")
    parser.add_argument("--model_name", type=str, help="store in assets/weight_root")
    parser.add_argument("--index_rate", type=float, default=0.66, help="index rate")
    parser.add_argument("--device", type=str, help="device")
    parser.add_argument("--is_half", type=_str2bool, help="use half -> True")
    parser.add_argument("--filter_radius", type=int, default=3, help="filter radius")
    parser.add_argument("--resample_sr", type=int, default=0, help="resample sr")
    parser.add_argument("--rms_mix_rate", type=float, default=1, help="rms mix rate")
    parser.add_argument("--protect", type=float, default=0.33, help="protect")

    args = parser.parse_args()
    # Drop the consumed options so later argv readers see only the program name.
    sys.argv = sys.argv[:1]
    return args
def main():
    """Convert the single audio file ``--input_path`` and write it to ``--opt_path``.

    Command-line values for ``--device`` and ``--is_half`` override the
    defaults from ``Config``. ``is_half`` is compared against ``None`` so
    that an explicit ``False`` is honoured instead of being dropped by a
    truthiness test.
    """
    load_dotenv()
    args = arg_parse()
    config = Config()
    if args.device:
        config.device = args.device
    if args.is_half is not None:
        config.is_half = args.is_half
    vc = VC(config)
    vc.get_vc(args.model_name)
    # The first element of the result is not used here.
    _, wav_opt = vc.vc_single(
        0,
        args.input_path,
        args.f0up_key,
        None,
        args.f0method,
        args.index_path,
        None,
        args.index_rate,
        args.filter_radius,
        args.resample_sr,
        args.rms_mix_rate,
        args.protect,
    )
    # wav_opt is passed to wavfile.write as (rate, data).
    wavfile.write(args.opt_path, wav_opt[0], wav_opt[1])


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,80 @@
"""
格式直接cid为自带的index位aid放不下了通过字典来查反正就5w个
"""
import os
import traceback
import logging
logger = logging.getLogger(__name__)
from multiprocessing import cpu_count
import faiss
import numpy as np
from sklearn.cluster import MiniBatchKMeans
# ###########如果是原始特征要先写save
# ---- Gather the features, optionally compress them, then build the index. ----
n_cpu = 0  # 0 means "use every available core"
if n_cpu == 0:
    n_cpu = cpu_count()
inp_root = r"./logs/anz/3_feature768"

# Load every feature file in name order and stack them along axis 0.
npys = []
listdir_res = list(os.listdir(inp_root))
for name in sorted(listdir_res):
    phone = np.load("%s/%s" % (inp_root, name))
    npys.append(phone)
big_npy = np.concatenate(npys, 0)

# Shuffle the rows.
big_npy_idx = np.arange(big_npy.shape[0])
np.random.shuffle(big_npy_idx)
big_npy = big_npy[big_npy_idx]
logger.debug(big_npy.shape)  # (6196072, 192)#fp32#4.43G

if big_npy.shape[0] > 2e5:
    # if(1):
    # Too many vectors: replace them with 10k k-means centres.
    info = "Trying doing kmeans %s shape to 10k centers." % big_npy.shape[0]
    logger.info(info)
    try:
        big_npy = (
            MiniBatchKMeans(
                n_clusters=10000,
                verbose=True,
                batch_size=256 * n_cpu,
                compute_labels=False,
                init="random",
            )
            .fit(big_npy)
            .cluster_centers_
        )
    except Exception:
        # Best effort: on failure keep the uncompressed features. Catching
        # Exception (not a bare except) lets Ctrl-C and SystemExit still
        # stop this long-running step.
        info = traceback.format_exc()
        logger.warning(info)

np.save("tools/infer/big_src_feature_mi.npy", big_npy)

# ---- train + add ----
# big_npy=np.load("/bili-coeus/jupyter/jupyterhub-liujing04/vits_ch/inference_f0/big_src_feature_mi.npy")
n_ivf = min(int(16 * np.sqrt(big_npy.shape[0])), big_npy.shape[0] // 39)
index = faiss.index_factory(768, "IVF%s,Flat" % n_ivf)  # mi
logger.info("Training...")
index_ivf = faiss.extract_index_ivf(index)
index_ivf.nprobe = 1
index.train(big_npy)
faiss.write_index(
    index, "tools/infer/trained_IVF%s_Flat_baseline_src_feat_v2.index" % (n_ivf)
)

logger.info("Adding...")
# Add the vectors in fixed-size batches rather than in one call.
batch_size_add = 8192
for i in range(0, big_npy.shape[0], batch_size_add):
    index.add(big_npy[i : i + batch_size_add])
faiss.write_index(
    index, "tools/infer/added_IVF%s_Flat_mi_baseline_src_feat.index" % (n_ivf)
)
"""
大小都是FP32
big_src_feature 2.95G
(3098036, 256)
big_emb 4.43G
(6196072, 192)
big_emb双倍是因为求特征要repeat后再加pitch
"""

43
tools/cmd/train-index.py Normal file
View File

@@ -0,0 +1,43 @@
"""
格式直接cid为自带的index位aid放不下了通过字典来查反正就5w个
"""
import os
import logging
logger = logging.getLogger(__name__)
import faiss
import numpy as np
# ###########如果是原始特征要先写save
# ---- Gather the features, save them, then train and fill the index. ----
# NOTE(review): hard-coded absolute Windows path - adjust for your machine.
inp_root = r"E:\codes\py39\dataset\mi\2-co256"

# Load every feature file in name order and stack them along axis 0.
npys = [np.load("%s/%s" % (inp_root, name)) for name in sorted(os.listdir(inp_root))]
big_npy = np.concatenate(npys, 0)
logger.debug(big_npy.shape)  # (6196072, 192)#fp32#4.43G
np.save("infer/big_src_feature_mi.npy", big_npy)

# ---- train + add ----
# big_npy=np.load("/bili-coeus/jupyter/jupyterhub-liujing04/vits_ch/inference_f0/big_src_feature_mi.npy")
logger.debug(big_npy.shape)
index = faiss.index_factory(256, "IVF512,Flat")  # mi
logger.info("Training...")
index_ivf = faiss.extract_index_ivf(index)
index_ivf.nprobe = 9
index.train(big_npy)
# Snapshot of the trained but still empty index.
faiss.write_index(index, "infer/trained_IVF512_Flat_mi_baseline_src_feat.index")

logger.info("Adding...")
index.add(big_npy)
# Snapshot of the index with all vectors added.
faiss.write_index(index, "infer/added_IVF512_Flat_mi_baseline_src_feat.index")
"""
大小都是FP32
big_src_feature 2.95G
(3098036, 256)
big_emb 4.43G
(6196072, 192)
big_emb双倍是因为求特征要repeat后再加pitch
"""

View File

@@ -0,0 +1,18 @@
import pdb
import torch
# a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-suc\G_1000.pth")["model"]#sim_nsf#
# a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-freeze-vocoder-flow-enc_q\G_1000.pth")["model"]#sim_nsf#
# a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-freeze-vocoder\G_1000.pth")["model"]#sim_nsf#
# a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-test\G_1000.pth")["model"]#sim_nsf#
# Load the generator checkpoint and keep only its "model" entry.
# NOTE(review): hard-coded absolute Windows path - adjust for your machine.
ckpt_path = r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-no_opt-no_dropout\G_1000.pth"
a = torch.load(ckpt_path)["model"]  # sim_nsf#

# Convert every tensor to half precision in place; iterating a snapshot of
# the keys keeps the mapping's type and key order untouched.
for key in list(a):
    a[key] = a[key].half()

# torch.save(a,"ft-mi-freeze-vocoder_true_1k.pt")#
# torch.save(a,"ft-mi-sim1k.pt")#
torch.save(a, "ft-mi-no_opt-no_dropout.pt")