1
0
mirror of https://github.com/fumiama/Retrieval-based-Voice-Conversion-WebUI.git synced 2026-06-05 01:10:22 +08:00

optimize: some training optimizations (#95)

* optimzie(train&uvr5): rm sf & simp. AudioPre

* fix(audio): too many mallocs

* feat(audio): load_audio support stereo

* fix(audio): float32 wav saving

* fix(train): missing ckpt var
This commit is contained in:
源文雨
2024-11-28 03:20:14 +09:00
committed by GitHub
parent f4644ec1ec
commit a8783c6639
19 changed files with 163 additions and 433 deletions

View File

@@ -1,11 +1,16 @@
from io import BufferedWriter, BytesIO
from pathlib import Path
from typing import Dict, Tuple
from typing import Dict, Tuple, Optional, Union, List
import os
import math
import wave
import numpy as np
from numba import jit
import av
from av.audio.resampler import AudioResampler
from av.audio.frame import AudioFrame
import scipy.io.wavfile as wavfile
video_format_dict: Dict[str, str] = {
"m4a": "mp4",
@@ -17,6 +22,29 @@ audio_format_dict: Dict[str, str] = {
}
@jit(nopython=True)
def float_to_int16(audio: np.ndarray) -> np.ndarray:
am = int(math.ceil(float(np.abs(audio).max())) * 32768)
am = 32767 * 32768 // am
return np.multiply(audio, am).astype(np.int16)
def float_np_array_to_wav_buf(wav: np.ndarray, sr: int, f32=False) -> BytesIO:
buf = BytesIO()
if f32:
wavfile.write(buf, sr, wav.astype(np.float32))
else:
with wave.open(buf, "wb") as wf:
wf.setnchannels(2 if len(wav.shape) > 1 else 1)
wf.setsampwidth(2) # Sample width in bytes
wf.setframerate(sr) # Sample rate in Hz
wf.writeframes(float_to_int16(wav.T if len(wav.shape) > 1 else wav))
buf.seek(0, 0)
return buf
def save_audio(path: str, audio: np.ndarray, sr: int, f32=False):
with open(path, "wb") as f:
f.write(float_np_array_to_wav_buf(audio, sr, f32).getbuffer())
def wav2(i: BytesIO, o: BufferedWriter, format: str):
inp = av.open(i, "r")
format = video_format_dict.get(format, format)
@@ -36,43 +64,72 @@ def wav2(i: BytesIO, o: BufferedWriter, format: str):
inp.close()
def load_audio(file: str, sr: int) -> np.ndarray:
if not Path(file).exists():
def load_audio(
file: Union[str, BytesIO, Path],
sr: Optional[int]=None,
format: Optional[str]=None,
mono=True
) -> Union[np.ndarray, Tuple[np.ndarray, int]]:
if (isinstance(file, str) and not Path(file).exists()) or (isinstance(file, Path) and not file.exists()):
raise FileNotFoundError(f"File not found: {file}")
rate = 0
try:
container = av.open(file)
resampler = AudioResampler(format="fltp", layout="mono", rate=sr)
container = av.open(file, format=format)
audio_stream = next(s for s in container.streams if s.type == "audio")
channels = 1 if audio_stream.layout == "mono" else 2
container.seek(0)
resampler = AudioResampler(format="fltp", layout=audio_stream.layout, rate=sr) if sr is not None else None
# Estimated maximum total number of samples to pre-allocate the array
# AV stores length in microseconds by default
estimated_total_samples = int(container.duration * sr // 1_000_000)
decoded_audio = np.zeros(estimated_total_samples + 1, dtype=np.float32)
# Estimated maximum total number of samples to pre-allocate the array
# AV stores length in microseconds by default
estimated_total_samples = int(container.duration * sr // 1_000_000) if sr is not None else 48000
decoded_audio = np.zeros(estimated_total_samples + 1 if channels == 1 else (channels, estimated_total_samples + 1), dtype=np.float32)
offset = 0
for frame in container.decode(audio=0):
frame.pts = None # Clear presentation timestamp to avoid resampling issues
resampled_frames = resampler.resample(frame)
offset = 0
def process_packet(packet: List[AudioFrame]):
frames_data = []
rate = 0
for frame in packet:
frame.pts = None # 清除时间戳,避免重新采样问题
resampled_frames = resampler.resample(frame) if resampler is not None else [frame]
for resampled_frame in resampled_frames:
frame_data = resampled_frame.to_ndarray()[0]
end_index = offset + len(frame_data)
frame_data = resampled_frame.to_ndarray()
rate = resampled_frame.rate
frames_data.append(frame_data)
return (rate, frames_data)
# Check if decoded_audio has enough space, and resize if necessary
if end_index > decoded_audio.shape[0]:
decoded_audio = np.resize(decoded_audio, end_index + 1)
def frame_iter(container):
for p in container.demux(container.streams.audio[0]):
yield p.decode()
decoded_audio[offset:end_index] = frame_data
offset += len(frame_data)
for r, frames_data in map(process_packet, frame_iter(container)):
if not rate: rate = r
for frame_data in frames_data:
end_index = offset + len(frame_data[0])
# Truncate the array to the actual size
decoded_audio = decoded_audio[:offset]
except Exception as e:
raise RuntimeError(f"Failed to load audio: {e}")
# 检查 decoded_audio 是否有足够的空间,并在必要时调整大小
if end_index > decoded_audio.shape[1]:
decoded_audio = np.resize(decoded_audio, (decoded_audio.shape[0], end_index*4))
return decoded_audio
np.copyto(decoded_audio[..., offset:end_index], frame_data)
offset += len(frame_data[0])
# Truncate the array to the actual size
decoded_audio = decoded_audio[..., :offset]
if mono and decoded_audio.shape[0] > 1:
decoded_audio = decoded_audio.mean(0)
if sr is not None:
return decoded_audio
return decoded_audio, rate
def downsample_audio(input_path: str, output_path: str, format: str) -> None:
def downsample_audio(input_path: str, output_path: str, format: str, br=128_000) -> None:
"""
default to 128kb/s (equivalent to -q:a 2)
"""
if not os.path.exists(input_path):
return
@@ -83,7 +140,7 @@ def downsample_audio(input_path: str, output_path: str, format: str) -> None:
input_stream = input_container.streams.audio[0]
output_stream = output_container.add_stream(format)
output_stream.bit_rate = 128_000 # 128kb/s (equivalent to -q:a 2)
output_stream.bit_rate = br
# Copy packets from the input file to the output file
for packet in input_container.demux(input_stream):
@@ -141,7 +198,7 @@ def resample_audio(
print(f"Failed to remove the original file: {e}")
def get_audio_properties(input_path: str) -> Tuple:
def get_audio_properties(input_path: str) -> Tuple[int, int]:
container = av.open(input_path)
audio_stream = next(s for s in container.streams if s.type == "audio")
channels = 1 if audio_stream.layout == "mono" else 2

View File

@@ -183,8 +183,7 @@ def main():
import os.path
from argparse import ArgumentParser
import librosa
import soundfile
from .audio import load_audio, save_audio
parser = ArgumentParser()
parser.add_argument("audio", type=str, help="The audio to be sliced")
@@ -230,7 +229,7 @@ def main():
out = args.out
if out is None:
out = os.path.dirname(os.path.abspath(args.audio))
audio, sr = librosa.load(args.audio, sr=None, mono=False)
audio, sr = load_audio(args.audio, mono=False)
slicer = Slicer(
sr=sr,
threshold=args.db_thresh,
@@ -245,15 +244,11 @@ def main():
for i, chunk in enumerate(chunks):
if len(chunk.shape) > 1:
chunk = chunk.T
soundfile.write(
os.path.join(
out,
f"%s_%d.wav"
% (os.path.basename(args.audio).rsplit(".", maxsplit=1)[0], i),
),
chunk,
sr,
)
save_audio(os.path.join(
out,
f"%s_%d.wav"
% (os.path.basename(args.audio).rsplit(".", maxsplit=1)[0], i),
), chunk, sr)
if __name__ == "__main__":

View File

@@ -16,62 +16,12 @@ MATPLOTLIB_FLAG = False
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
logger = logging
"""
def load_checkpoint_d(checkpoint_path, combd, sbd, optimizer=None, load_opt=1):
assert os.path.isfile(checkpoint_path)
checkpoint_dict = torch.load(checkpoint_path, map_location="cpu")
##################
def go(model, bkey):
saved_state_dict = checkpoint_dict[bkey]
if hasattr(model, "module"):
state_dict = model.module.state_dict()
else:
state_dict = model.state_dict()
new_state_dict = {}
for k, v in state_dict.items(): # 模型需要的shape
try:
new_state_dict[k] = saved_state_dict[k]
if saved_state_dict[k].shape != state_dict[k].shape:
logger.warning(
"shape-%s-mismatch. need: %s, get: %s",
k,
state_dict[k].shape,
saved_state_dict[k].shape,
) #
raise KeyError
except:
# logger.info(traceback.format_exc())
logger.info("%s is not in the checkpoint", k) # pretrain缺失的
new_state_dict[k] = v # 模型自带的随机值
if hasattr(model, "module"):
model.module.load_state_dict(new_state_dict, strict=False)
else:
model.load_state_dict(new_state_dict, strict=False)
return model
go(combd, "combd")
model = go(sbd, "sbd")
#############
logger.info("Loaded model weights")
iteration = checkpoint_dict["iteration"]
learning_rate = checkpoint_dict["learning_rate"]
if (
optimizer is not None and load_opt == 1
): ###加载不了如果是空的的话重新初始化可能还会影响lr时间表的更新因此在train文件最外围catch
# try:
optimizer.load_state_dict(checkpoint_dict["optimizer"])
# except:
# traceback.print_exc()
logger.info("Loaded checkpoint '{}' (epoch {})".format(checkpoint_path, iteration))
return model, optimizer, learning_rate, iteration
"""
def load_checkpoint(checkpoint_path, model, optimizer=None, load_opt=1):
assert os.path.isfile(checkpoint_path)
saved_state_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=True)["model"]
checkpoint_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=True)
saved_state_dict = checkpoint_dict["model"]
if hasattr(model, "module"):
state_dict = model.module.state_dict()
else:
@@ -132,34 +82,6 @@ def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path)
)
"""
def save_checkpoint_d(combd, sbd, optimizer, learning_rate, iteration, checkpoint_path):
logger.info(
"Saving model and optimizer state at epoch {} to {}".format(
iteration, checkpoint_path
)
)
if hasattr(combd, "module"):
state_dict_combd = combd.module.state_dict()
else:
state_dict_combd = combd.state_dict()
if hasattr(sbd, "module"):
state_dict_sbd = sbd.module.state_dict()
else:
state_dict_sbd = sbd.state_dict()
torch.save(
{
"combd": state_dict_combd,
"sbd": state_dict_sbd,
"iteration": iteration,
"optimizer": optimizer.state_dict(),
"learning_rate": learning_rate,
},
checkpoint_path,
)
"""
def summarize(
writer,
global_step,
@@ -366,53 +288,6 @@ def get_hparams(init=True):
return hparams
"""
def get_hparams_from_dir(model_dir):
config_save_path = os.path.join(model_dir, "config.json")
with open(config_save_path, "r") as f:
data = f.read()
config = json.loads(data)
hparams = HParams(**config)
hparams.model_dir = model_dir
return hparams
def get_hparams_from_file(config_path):
with open(config_path, "r") as f:
data = f.read()
config = json.loads(data)
hparams = HParams(**config)
return hparams
def check_git_hash(model_dir):
source_dir = os.path.dirname(os.path.realpath(__file__))
if not os.path.exists(os.path.join(source_dir, ".git")):
logger.warning(
"{} is not a git repository, therefore hash value comparison will be ignored.".format(
source_dir
)
)
return
cur_hash = subprocess.getoutput("git rev-parse HEAD")
path = os.path.join(model_dir, "githash")
if os.path.exists(path):
saved_hash = open(path).read()
if saved_hash != cur_hash:
logger.warning(
"git hash values are different. {}(saved) != {}(current)".format(
saved_hash[:8], cur_hash[:8]
)
)
else:
open(path, "w").write(cur_hash)
"""
def get_logger(model_dir, filename="train.log"):
global logger
logger = logging.getLogger(os.path.basename(model_dir))

View File

@@ -2,6 +2,11 @@ import os
import sys
import traceback
now_dir = os.getcwd()
sys.path.append(now_dir)
from infer.lib.audio import load_audio
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"
@@ -20,7 +25,6 @@ else:
is_half = sys.argv[7].lower() == "true"
import fairseq
import numpy as np
import soundfile as sf
import torch
import torch.nn.functional as F
@@ -64,11 +68,9 @@ os.makedirs(outPath, exist_ok=True)
# wave must be 16k, hop_size=320
def readwave(wav_path, normalize=False):
wav, sr = sf.read(wav_path)
wav, sr = load_audio(wav_path)
assert sr == 16000
feats = torch.from_numpy(wav).float()
if feats.dim() == 2: # double channels
feats = feats.mean(-1)
assert feats.dim() == 1, feats.dim()
if normalize:
with torch.no_grad():

View File

@@ -16,11 +16,9 @@ per = float(sys.argv[6])
import os
import traceback
import librosa
import numpy as np
from scipy.io import wavfile
from infer.lib.audio import load_audio
from infer.lib.audio import load_audio, float_np_array_to_wav_buf, save_audio
from infer.lib.slicer2 import Slicer
f = open("%s/preprocess.log" % exp_dir, "a+")
@@ -64,19 +62,15 @@ class PreProcess:
tmp_audio = (tmp_audio / tmp_max * (self.max * self.alpha)) + (
1 - self.alpha
) * tmp_audio
wavfile.write(
"%s/%s_%s.wav" % (self.gt_wavs_dir, idx0, idx1),
self.sr,
tmp_audio.astype(np.float32),
)
tmp_audio = librosa.resample(
tmp_audio, orig_sr=self.sr, target_sr=16000
) # , res_type="soxr_vhq"
wavfile.write(
"%s/%s_%s.wav" % (self.wavs16k_dir, idx0, idx1),
16000,
tmp_audio.astype(np.float32),
)
save_audio("%s/%s_%s.wav" % (self.gt_wavs_dir, idx0, idx1), tmp_audio, self.sr, f32=True)
with open("%s/%s_%s.wav" % (self.wavs16k_dir, idx0, idx1), "wb") as f:
f.write(float_np_array_to_wav_buf(
load_audio(
float_np_array_to_wav_buf(tmp_audio, self.sr, f32=True),
sr=16000,
format="wav",
)
, 16000, True).getbuffer())
def pipeline(self, path, idx0):
try:

View File

@@ -5,12 +5,10 @@ logger = logging.getLogger(__name__)
import librosa
import numpy as np
import soundfile as sf
import torch
from tqdm import tqdm
import av
from infer.lib.audio import downsample_audio
from infer.lib.audio import downsample_audio, save_audio
cpu = torch.device("cpu")
@@ -210,15 +208,13 @@ class Predictor:
sources = self.demix(mix.T)
opt = sources[0].T
if format in ["wav", "flac"]:
sf.write(
"%s/%s_main_vocal.%s" % (vocal_root, basename, format), mix - opt, rate
)
sf.write("%s/%s_others.%s" % (others_root, basename, format), opt, rate)
save_audio("%s/vocal_%s.%s" % (vocal_root, basename, format), mix - opt, rate)
save_audio("%s/instrument_%s.%s" % (others_root, basename, format), opt, rate)
else:
path_vocal = "%s/%s_main_vocal.wav" % (vocal_root, basename)
path_other = "%s/%s_others.wav" % (others_root, basename)
sf.write(path_vocal, mix - opt, rate)
sf.write(path_other, opt, rate)
path_vocal = "%s/vocal_%s.wav" % (vocal_root, basename)
path_other = "%s/instrument_%s.wav" % (others_root, basename)
save_audio(path_vocal, opt, rate)
save_audio(path_other, opt, rate)
opt_path_vocal = path_vocal[:-4] + ".%s" % format
opt_path_other = path_other[:-4] + ".%s" % format
downsample_audio(path_vocal, opt_path_vocal, format)

View File

@@ -9,7 +9,7 @@ import torch
from configs import Config
from infer.modules.uvr5.mdxnet import MDXNetDereverb
from infer.modules.uvr5.vr import AudioPre, AudioPreDeEcho
from infer.modules.uvr5.vr import AudioPre
config = Config()
@@ -27,8 +27,7 @@ def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format
if model_name == "onnx_dereverb_By_FoxJoy":
pre_fun = MDXNetDereverb(15, config.device)
else:
func = AudioPre if "DeEcho" not in model_name else AudioPreDeEcho
pre_fun = func(
pre_fun = AudioPre(
agg=int(agg),
model_path=os.path.join(
os.getenv("weight_uvr5_root"), model_name + ".pth"
@@ -72,18 +71,10 @@ def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format
infos.append("%s->Success" % (os.path.basename(inp_path)))
yield "\n".join(infos)
except:
try:
if done == 0:
pre_fun._path_audio_(
inp_path, save_root_ins, save_root_vocal, format0
)
infos.append("%s->Success" % (os.path.basename(inp_path)))
yield "\n".join(infos)
except:
infos.append(
"%s->%s" % (os.path.basename(inp_path), traceback.format_exc())
)
yield "\n".join(infos)
infos.append(
"%s->%s" % (os.path.basename(inp_path), traceback.format_exc())
)
yield "\n".join(infos)
except:
infos.append(traceback.format_exc())
yield "\n".join(infos)

View File

@@ -5,8 +5,7 @@ logger = logging.getLogger(__name__)
import librosa
import numpy as np
import soundfile as sf
from infer.lib.audio import downsample_audio
from infer.lib.audio import downsample_audio, save_audio
import torch
from infer.lib.uvr5_pack.lib_v5 import nets_123821KB as Nets
@@ -20,6 +19,8 @@ class AudioPre:
def __init__(self, agg, model_path, device, is_half, tta=False):
self.model_path = model_path
self.device = device
self.is_de_echo = "DeEcho" in model_path
self.is_reverse = self.is_de_echo or "HP3" in model_path
self.data = {
# Processing Options
"postprocess": False,
@@ -29,8 +30,13 @@ class AudioPre:
"agg": agg,
"high_end_process": "mirroring",
}
mp = ModelParameters("infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json")
model = Nets.CascadedASPPNet(mp.param["bins"] * 2)
if self.is_de_echo:
mp = ModelParameters("infer/lib/uvr5_pack/lib_v5/modelparams/4band_v3.json")
nout = 64 if "DeReverb" in model_path else 48
model = CascadedNet(mp.param["bins"] * 2, nout)
else:
mp = ModelParameters("infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json")
model = Nets.CascadedASPPNet(mp.param["bins"] * 2)
cpk = torch.load(model_path, map_location="cpu")
model.load_state_dict(cpk)
model.eval()
@@ -123,30 +129,28 @@ class AudioPre:
else:
wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp)
logger.info("%s instruments done" % name)
head = "instrument_"
if self.is_reverse:
head = "vocal_"
else:
head = "instrument_"
if format in ["wav", "flac"]:
sf.write(
os.path.join(
save_audio(os.path.join(
ins_root,
head + "{}_{}.{}".format(name, self.data["agg"], format),
),
(np.array(wav_instrument) * 32768).astype("int16"),
self.mp.param["sr"],
) #
), wav_instrument, self.mp.param["sr"])
else:
path = os.path.join(
ins_root, head + "{}_{}.wav".format(name, self.data["agg"])
)
sf.write(
path,
(np.array(wav_instrument) * 32768).astype("int16"),
self.mp.param["sr"],
)
save_audio(path, wav_instrument, self.mp.param["sr"])
if os.path.exists(path):
opt_format_path = path[:-4] + ".%s" % format
downsample_audio(path, opt_format_path, format)
if vocal_root is not None:
head = "vocal_"
if self.is_reverse:
head = "instrument_"
else:
head = "vocal_"
if self.data["high_end_process"].startswith("mirroring"):
input_high_end_ = spec_utils.mirroring(
self.data["high_end_process"], v_spec_m, input_high_end, self.mp
@@ -158,185 +162,15 @@ class AudioPre:
wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp)
logger.info("%s vocals done" % name)
if format in ["wav", "flac"]:
sf.write(
os.path.join(
save_audio(os.path.join(
vocal_root,
head + "{}_{}.{}".format(name, self.data["agg"], format),
),
(np.array(wav_vocals) * 32768).astype("int16"),
self.mp.param["sr"],
)
), wav_vocals, self.mp.param["sr"])
else:
path = os.path.join(
vocal_root, head + "{}_{}.wav".format(name, self.data["agg"])
)
sf.write(
path,
(np.array(wav_vocals) * 32768).astype("int16"),
self.mp.param["sr"],
)
opt_format_path = path[:-4] + ".%s" % format
downsample_audio(path, opt_format_path, format)
class AudioPreDeEcho:
def __init__(self, agg, model_path, device, is_half, tta=False):
self.model_path = model_path
self.device = device
self.data = {
# Processing Options
"postprocess": False,
"tta": tta,
# Constants
"window_size": 512,
"agg": agg,
"high_end_process": "mirroring",
}
mp = ModelParameters("infer/lib/uvr5_pack/lib_v5/modelparams/4band_v3.json")
nout = 64 if "DeReverb" in model_path else 48
model = CascadedNet(mp.param["bins"] * 2, nout)
cpk = torch.load(model_path, map_location="cpu")
model.load_state_dict(cpk)
model.eval()
if is_half:
model = model.half().to(device)
else:
model = model.to(device)
self.mp = mp
self.model = model
def _path_audio_(
self, music_file, vocal_root=None, ins_root=None, format="flac"
): # 3个VR模型vocal和ins是反的
if ins_root is None and vocal_root is None:
return "No save root."
name = os.path.basename(music_file)
if ins_root is not None:
os.makedirs(ins_root, exist_ok=True)
if vocal_root is not None:
os.makedirs(vocal_root, exist_ok=True)
X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {}
bands_n = len(self.mp.param["band"])
# print(bands_n)
for d in range(bands_n, 0, -1):
bp = self.mp.param["band"][d]
if d == bands_n: # high-end band
(
X_wave[d],
_,
) = librosa.load( # 理论上librosa读取可能对某些音频有bug应该上ffmpeg读取但是太麻烦了弃坑
music_file,
sr=bp["sr"],
mono=False,
dtype=np.float32,
res_type=bp["res_type"],
)
if X_wave[d].ndim == 1:
X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]])
else: # lower bands
X_wave[d] = librosa.resample(
X_wave[d + 1],
orig_sr=self.mp.param["band"][d + 1]["sr"],
target_sr=bp["sr"],
res_type=bp["res_type"],
)
# Stft of wave source
X_spec_s[d] = spec_utils.wave_to_spectrogram_mt(
X_wave[d],
bp["hl"],
bp["n_fft"],
self.mp.param["mid_side"],
self.mp.param["mid_side_b2"],
self.mp.param["reverse"],
)
# pdb.set_trace()
if d == bands_n and self.data["high_end_process"] != "none":
input_high_end_h = (bp["n_fft"] // 2 - bp["crop_stop"]) + (
self.mp.param["pre_filter_stop"] - self.mp.param["pre_filter_start"]
)
input_high_end = X_spec_s[d][
:, bp["n_fft"] // 2 - input_high_end_h : bp["n_fft"] // 2, :
]
X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp)
aggresive_set = float(self.data["agg"] / 100)
aggressiveness = {
"value": aggresive_set,
"split_bin": self.mp.param["band"][1]["crop_stop"],
}
with torch.no_grad():
pred, X_mag, X_phase = inference(
X_spec_m, self.device, self.model, aggressiveness, self.data
)
# Postprocess
if self.data["postprocess"]:
pred_inv = np.clip(X_mag - pred, 0, np.inf)
pred = spec_utils.mask_silence(pred, pred_inv)
y_spec_m = pred * X_phase
v_spec_m = X_spec_m - y_spec_m
if ins_root is not None:
if self.data["high_end_process"].startswith("mirroring"):
input_high_end_ = spec_utils.mirroring(
self.data["high_end_process"], y_spec_m, input_high_end, self.mp
)
wav_instrument = spec_utils.cmb_spectrogram_to_wave(
y_spec_m, self.mp, input_high_end_h, input_high_end_
)
else:
wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp)
logger.info("%s instruments done" % name)
if format in ["wav", "flac"]:
sf.write(
os.path.join(
ins_root,
"vocal_{}_{}.{}".format(name, self.data["agg"], format),
),
(np.array(wav_instrument) * 32768).astype("int16"),
self.mp.param["sr"],
) #
else:
path = os.path.join(
ins_root, "vocal_{}_{}.wav".format(name, self.data["agg"])
)
sf.write(
path,
(np.array(wav_instrument) * 32768).astype("int16"),
self.mp.param["sr"],
)
if os.path.exists(path):
opt_format_path = path[:-4] + ".%s" % format
downsample_audio(path, opt_format_path, format)
if vocal_root is not None:
if self.data["high_end_process"].startswith("mirroring"):
input_high_end_ = spec_utils.mirroring(
self.data["high_end_process"], v_spec_m, input_high_end, self.mp
)
wav_vocals = spec_utils.cmb_spectrogram_to_wave(
v_spec_m, self.mp, input_high_end_h, input_high_end_
)
else:
wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp)
logger.info("%s vocals done" % name)
if format in ["wav", "flac"]:
sf.write(
os.path.join(
vocal_root,
"instrument_{}_{}.{}".format(name, self.data["agg"], format),
),
(np.array(wav_vocals) * 32768).astype("int16"),
self.mp.param["sr"],
)
else:
path = os.path.join(
vocal_root, "instrument_{}_{}.wav".format(name, self.data["agg"])
)
sf.write(
path,
(np.array(wav_vocals) * 32768).astype("int16"),
self.mp.param["sr"],
)
save_audio(path, wav_vocals, self.mp.param["sr"])
if os.path.exists(path):
opt_format_path = path[:-4] + ".%s" % format
downsample_audio(path, opt_format_path, format)

View File

@@ -5,11 +5,10 @@ import os
logger = logging.getLogger(__name__)
import numpy as np
import soundfile as sf
import torch
from io import BytesIO
from infer.lib.audio import load_audio, wav2
from infer.lib.audio import load_audio, wav2, save_audio, float_np_array_to_wav_buf
from rvc.synthesizer import get_synthesizer, load_synthesizer
from .info import show_model_info
from .pipeline import Pipeline
@@ -253,23 +252,16 @@ class VC:
try:
tgt_sr, audio_opt = opt
if format1 in ["wav", "flac"]:
sf.write(
"%s/%s.%s"
% (opt_root, os.path.basename(path), format1),
audio_opt,
tgt_sr,
)
save_audio("%s/%s.%s"
% (opt_root, os.path.basename(path), format1), audio_opt, tgt_sr)
else:
path = "%s/%s.%s" % (
opt_root,
os.path.basename(path),
format1,
)
with BytesIO() as wavf:
sf.write(wavf, audio_opt, tgt_sr, format="wav")
wavf.seek(0, 0)
with open(path, "wb") as outf:
wav2(wavf, outf, format1)
with open(path, "wb") as outf:
wav2(float_np_array_to_wav_buf(audio_opt, tgt_sr), outf, format1)
except:
info += traceback.format_exc()
infos.append("%s->%s" % (os.path.basename(path), info))

View File

@@ -10,7 +10,6 @@ faiss-cpu==1.7.3
gradio
Cython
pydub>=0.25.1
soundfile>=0.12.1
tensorboardX
Jinja2>=3.1.2
json5

View File

@@ -9,7 +9,6 @@ faiss-cpu==1.7.3
gradio
Cython
pydub>=0.25.1
soundfile>=0.12.1
tensorboardX
Jinja2>=3.1.2
json5

View File

@@ -18,7 +18,6 @@ PyYAML
resampy
scikit_learn
scipy
SoundFile
tensorboard
tqdm
wave

View File

@@ -18,7 +18,6 @@ PyYAML
resampy
scikit_learn
scipy
SoundFile
tensorboard
tqdm
wave

View File

@@ -14,7 +14,6 @@ faiss-cpu==1.7.3
gradio
Cython
pydub>=0.25.1
soundfile>=0.12.1
tensorboardX
Jinja2>=3.1.2
json5

View File

@@ -9,7 +9,6 @@ faiss-cpu
gradio
Cython
pydub>=0.25.1
soundfile>=0.12.1
tensorboardX
Jinja2>=3.1.2
json5

View File

@@ -9,7 +9,6 @@ faiss-cpu
gradio
Cython
pydub>=0.25.1
soundfile>=0.12.1
tensorboardX
Jinja2>=3.1.2
json5

View File

@@ -18,7 +18,6 @@ from time import time as ttime
# import pyworld
import librosa
import numpy as np
import soundfile as sf
import torch.nn.functional as F
from fairseq import checkpoint_utils
@@ -33,6 +32,7 @@ from scipy.io import wavfile
# from models import SynthesizerTrn256NSFsim as SynthesizerTrn256#hifigan_nsf
# from models import SynthesizerTrn256NSFsimFlow as SynthesizerTrn256#hifigan_nsf
from infer.lib.audio import load_audio
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_path = r"E:\codes\py39\vits_vc_gpu_train\assets\hubert\hubert_base.pt" #
@@ -132,7 +132,7 @@ for idx, name in enumerate(
): ##
wav_path = "todo-songs/%s" % name #
f0_up_key = -2 #
audio, sampling_rate = sf.read(wav_path)
audio, sampling_rate = load_audio(wav_path)
if len(audio.shape) > 1:
audio = librosa.to_mono(audio.transpose(1, 0))
if sampling_rate != 16000:

View File

@@ -1,8 +1,9 @@
import soundfile
import librosa
from rvc.onnx import RVC
from infer.lib.audio import save_audio
hop_size = 512
sampling_rate = 40000 # 采样率
f0_up_key = 0 # 升降调
@@ -19,4 +20,4 @@ wav, sr = librosa.load(wav_path, sr=sampling_rate)
audio = model.infer(wav, sr, sampling_rate, sid, f0_method, f0_up_key)
soundfile.write(out_path, audio, sampling_rate)
save_audio(out_path, audio, sampling_rate)

8
web.py
View File

@@ -141,8 +141,8 @@ weight_uvr5_root = os.getenv("weight_uvr5_root")
index_root = os.getenv("index_root")
outside_index_root = os.getenv("outside_index_root")
names = []
index_paths = []
names = [""]
index_paths = [""]
def lookup_names(weight_root):
global names
@@ -168,9 +168,9 @@ for name in os.listdir(weight_uvr5_root):
def change_choices():
global index_paths, names
names = []
names = [""]
lookup_names(weight_root)
index_paths = []
index_paths = [""]
lookup_indices(index_root)
lookup_indices(outside_index_root)
return {"choices": sorted(names), "__type__": "update"}, {