1
0
mirror of https://github.com/fumiama/Retrieval-based-Voice-Conversion-WebUI.git synced 2026-06-05 01:10:22 +08:00

optimize: some training optimizations (#95)

* optimize(train&uvr5): rm sf & simp. AudioPre

* fix(audio): too many mallocs

* feat(audio): load_audio support stereo

* fix(audio): float32 wav saving

* fix(train): missing ckpt var
This commit is contained in:
源文雨
2024-11-28 03:20:14 +09:00
committed by GitHub
parent f4644ec1ec
commit a8783c6639
19 changed files with 163 additions and 433 deletions

View File

@@ -1,11 +1,16 @@
from io import BufferedWriter, BytesIO from io import BufferedWriter, BytesIO
from pathlib import Path from pathlib import Path
from typing import Dict, Tuple from typing import Dict, Tuple, Optional, Union, List
import os import os
import math
import wave
import numpy as np import numpy as np
from numba import jit
import av import av
from av.audio.resampler import AudioResampler from av.audio.resampler import AudioResampler
from av.audio.frame import AudioFrame
import scipy.io.wavfile as wavfile
video_format_dict: Dict[str, str] = { video_format_dict: Dict[str, str] = {
"m4a": "mp4", "m4a": "mp4",
@@ -17,6 +22,29 @@ audio_format_dict: Dict[str, str] = {
} }
@jit(nopython=True)
def float_to_int16(audio: np.ndarray) -> np.ndarray:
    """Scale floating-point samples and convert them to int16.

    The peak absolute sample is rounded up to the next whole number and the
    whole signal is multiplied by ``32767 // peak`` (computed with integer
    arithmetic), so a signal whose peak is at most 1.0 is multiplied by 32767
    and louder signals are scaled down by an integer factor.

    Args:
        audio: Array of floating-point samples.

    Returns:
        A new array with the same shape as ``audio`` and dtype ``int16``.
    """
    # Peak magnitude, rounded up to a whole number, in units of 32768.
    am = int(math.ceil(float(np.abs(audio).max())) * 32768)
    if am == 0:
        # All-zero (silent) input: the rounded peak is 0 and the integer
        # division below would raise ZeroDivisionError. Treat the peak as
        # 1.0 so silence is encoded as zeros.
        am = 32768
    # Integer gain that maps the rounded peak onto the int16 maximum.
    am = 32767 * 32768 // am
    return np.multiply(audio, am).astype(np.int16)
def float_np_array_to_wav_buf(wav: np.ndarray, sr: int, f32=False) -> BytesIO:
    """Encode a float sample array as WAV data held in memory.

    Args:
        wav: Sample array. An array with more than one dimension is written
            with a two-channel header and is transposed before conversion
            on the 16-bit path.
        sr: Sample rate in Hz written to the WAV header.
        f32: When true, write 32-bit float samples via ``scipy.io.wavfile``;
            otherwise write 16-bit PCM via the ``wave`` module.

    Returns:
        A ``BytesIO`` positioned at offset 0 containing the WAV data.
    """
    out = BytesIO()
    if not f32:
        with wave.open(out, "wb") as writer:
            writer.setnchannels(2 if len(wav.shape) > 1 else 1)
            writer.setsampwidth(2)  # two bytes per sample (16-bit PCM)
            writer.setframerate(sr)  # sample rate in Hz
            samples = wav.T if len(wav.shape) > 1 else wav
            writer.writeframes(float_to_int16(samples))
    else:
        wavfile.write(out, sr, wav.astype(np.float32))
    # Rewind so callers can read the encoded data from the start.
    out.seek(0, 0)
    return out
def save_audio(path: str, audio: np.ndarray, sr: int, f32=False):
    """Write ``audio`` to ``path`` as a WAV file.

    Args:
        path: Destination file path; opened in binary write mode.
        audio: Sample array handed to ``float_np_array_to_wav_buf``.
        sr: Sample rate in Hz.
        f32: Forwarded to ``float_np_array_to_wav_buf`` to select the
            32-bit float encoding instead of 16-bit PCM.
    """
    with open(path, "wb") as sink:
        # Encode in memory, then write the buffer out in one call.
        encoded = float_np_array_to_wav_buf(audio, sr, f32)
        sink.write(encoded.getbuffer())
def wav2(i: BytesIO, o: BufferedWriter, format: str): def wav2(i: BytesIO, o: BufferedWriter, format: str):
inp = av.open(i, "r") inp = av.open(i, "r")
format = video_format_dict.get(format, format) format = video_format_dict.get(format, format)
@@ -36,43 +64,72 @@ def wav2(i: BytesIO, o: BufferedWriter, format: str):
inp.close() inp.close()
def load_audio(file: str, sr: int) -> np.ndarray: def load_audio(
if not Path(file).exists(): file: Union[str, BytesIO, Path],
sr: Optional[int]=None,
format: Optional[str]=None,
mono=True
) -> Union[np.ndarray, Tuple[np.ndarray, int]]:
if (isinstance(file, str) and not Path(file).exists()) or (isinstance(file, Path) and not file.exists()):
raise FileNotFoundError(f"File not found: {file}") raise FileNotFoundError(f"File not found: {file}")
rate = 0
try: container = av.open(file, format=format)
container = av.open(file) audio_stream = next(s for s in container.streams if s.type == "audio")
resampler = AudioResampler(format="fltp", layout="mono", rate=sr) channels = 1 if audio_stream.layout == "mono" else 2
container.seek(0)
resampler = AudioResampler(format="fltp", layout=audio_stream.layout, rate=sr) if sr is not None else None
# Estimated maximum total number of samples to pre-allocate the array # Estimated maximum total number of samples to pre-allocate the array
# AV stores length in microseconds by default # AV stores length in microseconds by default
estimated_total_samples = int(container.duration * sr // 1_000_000) estimated_total_samples = int(container.duration * sr // 1_000_000) if sr is not None else 48000
decoded_audio = np.zeros(estimated_total_samples + 1, dtype=np.float32) decoded_audio = np.zeros(estimated_total_samples + 1 if channels == 1 else (channels, estimated_total_samples + 1), dtype=np.float32)
offset = 0 offset = 0
for frame in container.decode(audio=0):
frame.pts = None # Clear presentation timestamp to avoid resampling issues def process_packet(packet: List[AudioFrame]):
resampled_frames = resampler.resample(frame) frames_data = []
rate = 0
for frame in packet:
frame.pts = None # 清除时间戳,避免重新采样问题
resampled_frames = resampler.resample(frame) if resampler is not None else [frame]
for resampled_frame in resampled_frames: for resampled_frame in resampled_frames:
frame_data = resampled_frame.to_ndarray()[0] frame_data = resampled_frame.to_ndarray()
end_index = offset + len(frame_data) rate = resampled_frame.rate
frames_data.append(frame_data)
return (rate, frames_data)
# Check if decoded_audio has enough space, and resize if necessary def frame_iter(container):
if end_index > decoded_audio.shape[0]: for p in container.demux(container.streams.audio[0]):
decoded_audio = np.resize(decoded_audio, end_index + 1) yield p.decode()
decoded_audio[offset:end_index] = frame_data for r, frames_data in map(process_packet, frame_iter(container)):
offset += len(frame_data) if not rate: rate = r
for frame_data in frames_data:
end_index = offset + len(frame_data[0])
# Truncate the array to the actual size # 检查 decoded_audio 是否有足够的空间,并在必要时调整大小
decoded_audio = decoded_audio[:offset] if end_index > decoded_audio.shape[1]:
except Exception as e: decoded_audio = np.resize(decoded_audio, (decoded_audio.shape[0], end_index*4))
raise RuntimeError(f"Failed to load audio: {e}")
return decoded_audio np.copyto(decoded_audio[..., offset:end_index], frame_data)
offset += len(frame_data[0])
# Truncate the array to the actual size
decoded_audio = decoded_audio[..., :offset]
if mono and decoded_audio.shape[0] > 1:
decoded_audio = decoded_audio.mean(0)
if sr is not None:
return decoded_audio
return decoded_audio, rate
def downsample_audio(input_path: str, output_path: str, format: str) -> None: def downsample_audio(input_path: str, output_path: str, format: str, br=128_000) -> None:
"""
default to 128kb/s (equivalent to -q:a 2)
"""
if not os.path.exists(input_path): if not os.path.exists(input_path):
return return
@@ -83,7 +140,7 @@ def downsample_audio(input_path: str, output_path: str, format: str) -> None:
input_stream = input_container.streams.audio[0] input_stream = input_container.streams.audio[0]
output_stream = output_container.add_stream(format) output_stream = output_container.add_stream(format)
output_stream.bit_rate = 128_000 # 128kb/s (equivalent to -q:a 2) output_stream.bit_rate = br
# Copy packets from the input file to the output file # Copy packets from the input file to the output file
for packet in input_container.demux(input_stream): for packet in input_container.demux(input_stream):
@@ -141,7 +198,7 @@ def resample_audio(
print(f"Failed to remove the original file: {e}") print(f"Failed to remove the original file: {e}")
def get_audio_properties(input_path: str) -> Tuple: def get_audio_properties(input_path: str) -> Tuple[int, int]:
container = av.open(input_path) container = av.open(input_path)
audio_stream = next(s for s in container.streams if s.type == "audio") audio_stream = next(s for s in container.streams if s.type == "audio")
channels = 1 if audio_stream.layout == "mono" else 2 channels = 1 if audio_stream.layout == "mono" else 2

View File

@@ -183,8 +183,7 @@ def main():
import os.path import os.path
from argparse import ArgumentParser from argparse import ArgumentParser
import librosa from .audio import load_audio, save_audio
import soundfile
parser = ArgumentParser() parser = ArgumentParser()
parser.add_argument("audio", type=str, help="The audio to be sliced") parser.add_argument("audio", type=str, help="The audio to be sliced")
@@ -230,7 +229,7 @@ def main():
out = args.out out = args.out
if out is None: if out is None:
out = os.path.dirname(os.path.abspath(args.audio)) out = os.path.dirname(os.path.abspath(args.audio))
audio, sr = librosa.load(args.audio, sr=None, mono=False) audio, sr = load_audio(args.audio, mono=False)
slicer = Slicer( slicer = Slicer(
sr=sr, sr=sr,
threshold=args.db_thresh, threshold=args.db_thresh,
@@ -245,15 +244,11 @@ def main():
for i, chunk in enumerate(chunks): for i, chunk in enumerate(chunks):
if len(chunk.shape) > 1: if len(chunk.shape) > 1:
chunk = chunk.T chunk = chunk.T
soundfile.write( save_audio(os.path.join(
os.path.join( out,
out, f"%s_%d.wav"
f"%s_%d.wav" % (os.path.basename(args.audio).rsplit(".", maxsplit=1)[0], i),
% (os.path.basename(args.audio).rsplit(".", maxsplit=1)[0], i), ), chunk, sr)
),
chunk,
sr,
)
if __name__ == "__main__": if __name__ == "__main__":

View File

@@ -16,62 +16,12 @@ MATPLOTLIB_FLAG = False
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
logger = logging logger = logging
"""
def load_checkpoint_d(checkpoint_path, combd, sbd, optimizer=None, load_opt=1):
assert os.path.isfile(checkpoint_path)
checkpoint_dict = torch.load(checkpoint_path, map_location="cpu")
##################
def go(model, bkey):
saved_state_dict = checkpoint_dict[bkey]
if hasattr(model, "module"):
state_dict = model.module.state_dict()
else:
state_dict = model.state_dict()
new_state_dict = {}
for k, v in state_dict.items(): # 模型需要的shape
try:
new_state_dict[k] = saved_state_dict[k]
if saved_state_dict[k].shape != state_dict[k].shape:
logger.warning(
"shape-%s-mismatch. need: %s, get: %s",
k,
state_dict[k].shape,
saved_state_dict[k].shape,
) #
raise KeyError
except:
# logger.info(traceback.format_exc())
logger.info("%s is not in the checkpoint", k) # pretrain缺失的
new_state_dict[k] = v # 模型自带的随机值
if hasattr(model, "module"):
model.module.load_state_dict(new_state_dict, strict=False)
else:
model.load_state_dict(new_state_dict, strict=False)
return model
go(combd, "combd")
model = go(sbd, "sbd")
#############
logger.info("Loaded model weights")
iteration = checkpoint_dict["iteration"]
learning_rate = checkpoint_dict["learning_rate"]
if (
optimizer is not None and load_opt == 1
): ###加载不了如果是空的的话重新初始化可能还会影响lr时间表的更新因此在train文件最外围catch
# try:
optimizer.load_state_dict(checkpoint_dict["optimizer"])
# except:
# traceback.print_exc()
logger.info("Loaded checkpoint '{}' (epoch {})".format(checkpoint_path, iteration))
return model, optimizer, learning_rate, iteration
"""
def load_checkpoint(checkpoint_path, model, optimizer=None, load_opt=1): def load_checkpoint(checkpoint_path, model, optimizer=None, load_opt=1):
assert os.path.isfile(checkpoint_path) assert os.path.isfile(checkpoint_path)
saved_state_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=True)["model"] checkpoint_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=True)
saved_state_dict = checkpoint_dict["model"]
if hasattr(model, "module"): if hasattr(model, "module"):
state_dict = model.module.state_dict() state_dict = model.module.state_dict()
else: else:
@@ -132,34 +82,6 @@ def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path)
) )
"""
def save_checkpoint_d(combd, sbd, optimizer, learning_rate, iteration, checkpoint_path):
logger.info(
"Saving model and optimizer state at epoch {} to {}".format(
iteration, checkpoint_path
)
)
if hasattr(combd, "module"):
state_dict_combd = combd.module.state_dict()
else:
state_dict_combd = combd.state_dict()
if hasattr(sbd, "module"):
state_dict_sbd = sbd.module.state_dict()
else:
state_dict_sbd = sbd.state_dict()
torch.save(
{
"combd": state_dict_combd,
"sbd": state_dict_sbd,
"iteration": iteration,
"optimizer": optimizer.state_dict(),
"learning_rate": learning_rate,
},
checkpoint_path,
)
"""
def summarize( def summarize(
writer, writer,
global_step, global_step,
@@ -366,53 +288,6 @@ def get_hparams(init=True):
return hparams return hparams
"""
def get_hparams_from_dir(model_dir):
config_save_path = os.path.join(model_dir, "config.json")
with open(config_save_path, "r") as f:
data = f.read()
config = json.loads(data)
hparams = HParams(**config)
hparams.model_dir = model_dir
return hparams
def get_hparams_from_file(config_path):
with open(config_path, "r") as f:
data = f.read()
config = json.loads(data)
hparams = HParams(**config)
return hparams
def check_git_hash(model_dir):
source_dir = os.path.dirname(os.path.realpath(__file__))
if not os.path.exists(os.path.join(source_dir, ".git")):
logger.warning(
"{} is not a git repository, therefore hash value comparison will be ignored.".format(
source_dir
)
)
return
cur_hash = subprocess.getoutput("git rev-parse HEAD")
path = os.path.join(model_dir, "githash")
if os.path.exists(path):
saved_hash = open(path).read()
if saved_hash != cur_hash:
logger.warning(
"git hash values are different. {}(saved) != {}(current)".format(
saved_hash[:8], cur_hash[:8]
)
)
else:
open(path, "w").write(cur_hash)
"""
def get_logger(model_dir, filename="train.log"): def get_logger(model_dir, filename="train.log"):
global logger global logger
logger = logging.getLogger(os.path.basename(model_dir)) logger = logging.getLogger(os.path.basename(model_dir))

View File

@@ -2,6 +2,11 @@ import os
import sys import sys
import traceback import traceback
now_dir = os.getcwd()
sys.path.append(now_dir)
from infer.lib.audio import load_audio
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0" os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"
@@ -20,7 +25,6 @@ else:
is_half = sys.argv[7].lower() == "true" is_half = sys.argv[7].lower() == "true"
import fairseq import fairseq
import numpy as np import numpy as np
import soundfile as sf
import torch import torch
import torch.nn.functional as F import torch.nn.functional as F
@@ -64,11 +68,9 @@ os.makedirs(outPath, exist_ok=True)
# wave must be 16k, hop_size=320 # wave must be 16k, hop_size=320
def readwave(wav_path, normalize=False): def readwave(wav_path, normalize=False):
wav, sr = sf.read(wav_path) wav, sr = load_audio(wav_path)
assert sr == 16000 assert sr == 16000
feats = torch.from_numpy(wav).float() feats = torch.from_numpy(wav).float()
if feats.dim() == 2: # double channels
feats = feats.mean(-1)
assert feats.dim() == 1, feats.dim() assert feats.dim() == 1, feats.dim()
if normalize: if normalize:
with torch.no_grad(): with torch.no_grad():

View File

@@ -16,11 +16,9 @@ per = float(sys.argv[6])
import os import os
import traceback import traceback
import librosa
import numpy as np import numpy as np
from scipy.io import wavfile
from infer.lib.audio import load_audio from infer.lib.audio import load_audio, float_np_array_to_wav_buf, save_audio
from infer.lib.slicer2 import Slicer from infer.lib.slicer2 import Slicer
f = open("%s/preprocess.log" % exp_dir, "a+") f = open("%s/preprocess.log" % exp_dir, "a+")
@@ -64,19 +62,15 @@ class PreProcess:
tmp_audio = (tmp_audio / tmp_max * (self.max * self.alpha)) + ( tmp_audio = (tmp_audio / tmp_max * (self.max * self.alpha)) + (
1 - self.alpha 1 - self.alpha
) * tmp_audio ) * tmp_audio
wavfile.write( save_audio("%s/%s_%s.wav" % (self.gt_wavs_dir, idx0, idx1), tmp_audio, self.sr, f32=True)
"%s/%s_%s.wav" % (self.gt_wavs_dir, idx0, idx1), with open("%s/%s_%s.wav" % (self.wavs16k_dir, idx0, idx1), "wb") as f:
self.sr, f.write(float_np_array_to_wav_buf(
tmp_audio.astype(np.float32), load_audio(
) float_np_array_to_wav_buf(tmp_audio, self.sr, f32=True),
tmp_audio = librosa.resample( sr=16000,
tmp_audio, orig_sr=self.sr, target_sr=16000 format="wav",
) # , res_type="soxr_vhq" )
wavfile.write( , 16000, True).getbuffer())
"%s/%s_%s.wav" % (self.wavs16k_dir, idx0, idx1),
16000,
tmp_audio.astype(np.float32),
)
def pipeline(self, path, idx0): def pipeline(self, path, idx0):
try: try:

View File

@@ -5,12 +5,10 @@ logger = logging.getLogger(__name__)
import librosa import librosa
import numpy as np import numpy as np
import soundfile as sf
import torch import torch
from tqdm import tqdm from tqdm import tqdm
import av
from infer.lib.audio import downsample_audio from infer.lib.audio import downsample_audio, save_audio
cpu = torch.device("cpu") cpu = torch.device("cpu")
@@ -210,15 +208,13 @@ class Predictor:
sources = self.demix(mix.T) sources = self.demix(mix.T)
opt = sources[0].T opt = sources[0].T
if format in ["wav", "flac"]: if format in ["wav", "flac"]:
sf.write( save_audio("%s/vocal_%s.%s" % (vocal_root, basename, format), mix - opt, rate)
"%s/%s_main_vocal.%s" % (vocal_root, basename, format), mix - opt, rate save_audio("%s/instrument_%s.%s" % (others_root, basename, format), opt, rate)
)
sf.write("%s/%s_others.%s" % (others_root, basename, format), opt, rate)
else: else:
path_vocal = "%s/%s_main_vocal.wav" % (vocal_root, basename) path_vocal = "%s/vocal_%s.wav" % (vocal_root, basename)
path_other = "%s/%s_others.wav" % (others_root, basename) path_other = "%s/instrument_%s.wav" % (others_root, basename)
sf.write(path_vocal, mix - opt, rate) save_audio(path_vocal, opt, rate)
sf.write(path_other, opt, rate) save_audio(path_other, opt, rate)
opt_path_vocal = path_vocal[:-4] + ".%s" % format opt_path_vocal = path_vocal[:-4] + ".%s" % format
opt_path_other = path_other[:-4] + ".%s" % format opt_path_other = path_other[:-4] + ".%s" % format
downsample_audio(path_vocal, opt_path_vocal, format) downsample_audio(path_vocal, opt_path_vocal, format)

View File

@@ -9,7 +9,7 @@ import torch
from configs import Config from configs import Config
from infer.modules.uvr5.mdxnet import MDXNetDereverb from infer.modules.uvr5.mdxnet import MDXNetDereverb
from infer.modules.uvr5.vr import AudioPre, AudioPreDeEcho from infer.modules.uvr5.vr import AudioPre
config = Config() config = Config()
@@ -27,8 +27,7 @@ def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format
if model_name == "onnx_dereverb_By_FoxJoy": if model_name == "onnx_dereverb_By_FoxJoy":
pre_fun = MDXNetDereverb(15, config.device) pre_fun = MDXNetDereverb(15, config.device)
else: else:
func = AudioPre if "DeEcho" not in model_name else AudioPreDeEcho pre_fun = AudioPre(
pre_fun = func(
agg=int(agg), agg=int(agg),
model_path=os.path.join( model_path=os.path.join(
os.getenv("weight_uvr5_root"), model_name + ".pth" os.getenv("weight_uvr5_root"), model_name + ".pth"
@@ -72,18 +71,10 @@ def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format
infos.append("%s->Success" % (os.path.basename(inp_path))) infos.append("%s->Success" % (os.path.basename(inp_path)))
yield "\n".join(infos) yield "\n".join(infos)
except: except:
try: infos.append(
if done == 0: "%s->%s" % (os.path.basename(inp_path), traceback.format_exc())
pre_fun._path_audio_( )
inp_path, save_root_ins, save_root_vocal, format0 yield "\n".join(infos)
)
infos.append("%s->Success" % (os.path.basename(inp_path)))
yield "\n".join(infos)
except:
infos.append(
"%s->%s" % (os.path.basename(inp_path), traceback.format_exc())
)
yield "\n".join(infos)
except: except:
infos.append(traceback.format_exc()) infos.append(traceback.format_exc())
yield "\n".join(infos) yield "\n".join(infos)

View File

@@ -5,8 +5,7 @@ logger = logging.getLogger(__name__)
import librosa import librosa
import numpy as np import numpy as np
import soundfile as sf from infer.lib.audio import downsample_audio, save_audio
from infer.lib.audio import downsample_audio
import torch import torch
from infer.lib.uvr5_pack.lib_v5 import nets_123821KB as Nets from infer.lib.uvr5_pack.lib_v5 import nets_123821KB as Nets
@@ -20,6 +19,8 @@ class AudioPre:
def __init__(self, agg, model_path, device, is_half, tta=False): def __init__(self, agg, model_path, device, is_half, tta=False):
self.model_path = model_path self.model_path = model_path
self.device = device self.device = device
self.is_de_echo = "DeEcho" in model_path
self.is_reverse = self.is_de_echo or "HP3" in model_path
self.data = { self.data = {
# Processing Options # Processing Options
"postprocess": False, "postprocess": False,
@@ -29,8 +30,13 @@ class AudioPre:
"agg": agg, "agg": agg,
"high_end_process": "mirroring", "high_end_process": "mirroring",
} }
mp = ModelParameters("infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json") if self.is_de_echo:
model = Nets.CascadedASPPNet(mp.param["bins"] * 2) mp = ModelParameters("infer/lib/uvr5_pack/lib_v5/modelparams/4band_v3.json")
nout = 64 if "DeReverb" in model_path else 48
model = CascadedNet(mp.param["bins"] * 2, nout)
else:
mp = ModelParameters("infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json")
model = Nets.CascadedASPPNet(mp.param["bins"] * 2)
cpk = torch.load(model_path, map_location="cpu") cpk = torch.load(model_path, map_location="cpu")
model.load_state_dict(cpk) model.load_state_dict(cpk)
model.eval() model.eval()
@@ -123,30 +129,28 @@ class AudioPre:
else: else:
wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp) wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp)
logger.info("%s instruments done" % name) logger.info("%s instruments done" % name)
head = "instrument_" if self.is_reverse:
head = "vocal_"
else:
head = "instrument_"
if format in ["wav", "flac"]: if format in ["wav", "flac"]:
sf.write( save_audio(os.path.join(
os.path.join(
ins_root, ins_root,
head + "{}_{}.{}".format(name, self.data["agg"], format), head + "{}_{}.{}".format(name, self.data["agg"], format),
), ), wav_instrument, self.mp.param["sr"])
(np.array(wav_instrument) * 32768).astype("int16"),
self.mp.param["sr"],
) #
else: else:
path = os.path.join( path = os.path.join(
ins_root, head + "{}_{}.wav".format(name, self.data["agg"]) ins_root, head + "{}_{}.wav".format(name, self.data["agg"])
) )
sf.write( save_audio(path, wav_instrument, self.mp.param["sr"])
path,
(np.array(wav_instrument) * 32768).astype("int16"),
self.mp.param["sr"],
)
if os.path.exists(path): if os.path.exists(path):
opt_format_path = path[:-4] + ".%s" % format opt_format_path = path[:-4] + ".%s" % format
downsample_audio(path, opt_format_path, format) downsample_audio(path, opt_format_path, format)
if vocal_root is not None: if vocal_root is not None:
head = "vocal_" if self.is_reverse:
head = "instrument_"
else:
head = "vocal_"
if self.data["high_end_process"].startswith("mirroring"): if self.data["high_end_process"].startswith("mirroring"):
input_high_end_ = spec_utils.mirroring( input_high_end_ = spec_utils.mirroring(
self.data["high_end_process"], v_spec_m, input_high_end, self.mp self.data["high_end_process"], v_spec_m, input_high_end, self.mp
@@ -158,185 +162,15 @@ class AudioPre:
wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp) wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp)
logger.info("%s vocals done" % name) logger.info("%s vocals done" % name)
if format in ["wav", "flac"]: if format in ["wav", "flac"]:
sf.write( save_audio(os.path.join(
os.path.join(
vocal_root, vocal_root,
head + "{}_{}.{}".format(name, self.data["agg"], format), head + "{}_{}.{}".format(name, self.data["agg"], format),
), ), wav_vocals, self.mp.param["sr"])
(np.array(wav_vocals) * 32768).astype("int16"),
self.mp.param["sr"],
)
else: else:
path = os.path.join( path = os.path.join(
vocal_root, head + "{}_{}.wav".format(name, self.data["agg"]) vocal_root, head + "{}_{}.wav".format(name, self.data["agg"])
) )
sf.write( save_audio(path, wav_vocals, self.mp.param["sr"])
path,
(np.array(wav_vocals) * 32768).astype("int16"),
self.mp.param["sr"],
)
opt_format_path = path[:-4] + ".%s" % format
downsample_audio(path, opt_format_path, format)
class AudioPreDeEcho:
def __init__(self, agg, model_path, device, is_half, tta=False):
self.model_path = model_path
self.device = device
self.data = {
# Processing Options
"postprocess": False,
"tta": tta,
# Constants
"window_size": 512,
"agg": agg,
"high_end_process": "mirroring",
}
mp = ModelParameters("infer/lib/uvr5_pack/lib_v5/modelparams/4band_v3.json")
nout = 64 if "DeReverb" in model_path else 48
model = CascadedNet(mp.param["bins"] * 2, nout)
cpk = torch.load(model_path, map_location="cpu")
model.load_state_dict(cpk)
model.eval()
if is_half:
model = model.half().to(device)
else:
model = model.to(device)
self.mp = mp
self.model = model
def _path_audio_(
self, music_file, vocal_root=None, ins_root=None, format="flac"
): # 3个VR模型vocal和ins是反的
if ins_root is None and vocal_root is None:
return "No save root."
name = os.path.basename(music_file)
if ins_root is not None:
os.makedirs(ins_root, exist_ok=True)
if vocal_root is not None:
os.makedirs(vocal_root, exist_ok=True)
X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {}
bands_n = len(self.mp.param["band"])
# print(bands_n)
for d in range(bands_n, 0, -1):
bp = self.mp.param["band"][d]
if d == bands_n: # high-end band
(
X_wave[d],
_,
) = librosa.load( # 理论上librosa读取可能对某些音频有bug应该上ffmpeg读取但是太麻烦了弃坑
music_file,
sr=bp["sr"],
mono=False,
dtype=np.float32,
res_type=bp["res_type"],
)
if X_wave[d].ndim == 1:
X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]])
else: # lower bands
X_wave[d] = librosa.resample(
X_wave[d + 1],
orig_sr=self.mp.param["band"][d + 1]["sr"],
target_sr=bp["sr"],
res_type=bp["res_type"],
)
# Stft of wave source
X_spec_s[d] = spec_utils.wave_to_spectrogram_mt(
X_wave[d],
bp["hl"],
bp["n_fft"],
self.mp.param["mid_side"],
self.mp.param["mid_side_b2"],
self.mp.param["reverse"],
)
# pdb.set_trace()
if d == bands_n and self.data["high_end_process"] != "none":
input_high_end_h = (bp["n_fft"] // 2 - bp["crop_stop"]) + (
self.mp.param["pre_filter_stop"] - self.mp.param["pre_filter_start"]
)
input_high_end = X_spec_s[d][
:, bp["n_fft"] // 2 - input_high_end_h : bp["n_fft"] // 2, :
]
X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp)
aggresive_set = float(self.data["agg"] / 100)
aggressiveness = {
"value": aggresive_set,
"split_bin": self.mp.param["band"][1]["crop_stop"],
}
with torch.no_grad():
pred, X_mag, X_phase = inference(
X_spec_m, self.device, self.model, aggressiveness, self.data
)
# Postprocess
if self.data["postprocess"]:
pred_inv = np.clip(X_mag - pred, 0, np.inf)
pred = spec_utils.mask_silence(pred, pred_inv)
y_spec_m = pred * X_phase
v_spec_m = X_spec_m - y_spec_m
if ins_root is not None:
if self.data["high_end_process"].startswith("mirroring"):
input_high_end_ = spec_utils.mirroring(
self.data["high_end_process"], y_spec_m, input_high_end, self.mp
)
wav_instrument = spec_utils.cmb_spectrogram_to_wave(
y_spec_m, self.mp, input_high_end_h, input_high_end_
)
else:
wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp)
logger.info("%s instruments done" % name)
if format in ["wav", "flac"]:
sf.write(
os.path.join(
ins_root,
"vocal_{}_{}.{}".format(name, self.data["agg"], format),
),
(np.array(wav_instrument) * 32768).astype("int16"),
self.mp.param["sr"],
) #
else:
path = os.path.join(
ins_root, "vocal_{}_{}.wav".format(name, self.data["agg"])
)
sf.write(
path,
(np.array(wav_instrument) * 32768).astype("int16"),
self.mp.param["sr"],
)
if os.path.exists(path):
opt_format_path = path[:-4] + ".%s" % format
downsample_audio(path, opt_format_path, format)
if vocal_root is not None:
if self.data["high_end_process"].startswith("mirroring"):
input_high_end_ = spec_utils.mirroring(
self.data["high_end_process"], v_spec_m, input_high_end, self.mp
)
wav_vocals = spec_utils.cmb_spectrogram_to_wave(
v_spec_m, self.mp, input_high_end_h, input_high_end_
)
else:
wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp)
logger.info("%s vocals done" % name)
if format in ["wav", "flac"]:
sf.write(
os.path.join(
vocal_root,
"instrument_{}_{}.{}".format(name, self.data["agg"], format),
),
(np.array(wav_vocals) * 32768).astype("int16"),
self.mp.param["sr"],
)
else:
path = os.path.join(
vocal_root, "instrument_{}_{}.wav".format(name, self.data["agg"])
)
sf.write(
path,
(np.array(wav_vocals) * 32768).astype("int16"),
self.mp.param["sr"],
)
if os.path.exists(path): if os.path.exists(path):
opt_format_path = path[:-4] + ".%s" % format opt_format_path = path[:-4] + ".%s" % format
downsample_audio(path, opt_format_path, format) downsample_audio(path, opt_format_path, format)

View File

@@ -5,11 +5,10 @@ import os
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
import numpy as np import numpy as np
import soundfile as sf
import torch import torch
from io import BytesIO from io import BytesIO
from infer.lib.audio import load_audio, wav2 from infer.lib.audio import load_audio, wav2, save_audio, float_np_array_to_wav_buf
from rvc.synthesizer import get_synthesizer, load_synthesizer from rvc.synthesizer import get_synthesizer, load_synthesizer
from .info import show_model_info from .info import show_model_info
from .pipeline import Pipeline from .pipeline import Pipeline
@@ -253,23 +252,16 @@ class VC:
try: try:
tgt_sr, audio_opt = opt tgt_sr, audio_opt = opt
if format1 in ["wav", "flac"]: if format1 in ["wav", "flac"]:
sf.write( save_audio("%s/%s.%s"
"%s/%s.%s" % (opt_root, os.path.basename(path), format1), audio_opt, tgt_sr)
% (opt_root, os.path.basename(path), format1),
audio_opt,
tgt_sr,
)
else: else:
path = "%s/%s.%s" % ( path = "%s/%s.%s" % (
opt_root, opt_root,
os.path.basename(path), os.path.basename(path),
format1, format1,
) )
with BytesIO() as wavf: with open(path, "wb") as outf:
sf.write(wavf, audio_opt, tgt_sr, format="wav") wav2(float_np_array_to_wav_buf(audio_opt, tgt_sr), outf, format1)
wavf.seek(0, 0)
with open(path, "wb") as outf:
wav2(wavf, outf, format1)
except: except:
info += traceback.format_exc() info += traceback.format_exc()
infos.append("%s->%s" % (os.path.basename(path), info)) infos.append("%s->%s" % (os.path.basename(path), info))

View File

@@ -10,7 +10,6 @@ faiss-cpu==1.7.3
gradio gradio
Cython Cython
pydub>=0.25.1 pydub>=0.25.1
soundfile>=0.12.1
tensorboardX tensorboardX
Jinja2>=3.1.2 Jinja2>=3.1.2
json5 json5

View File

@@ -9,7 +9,6 @@ faiss-cpu==1.7.3
gradio gradio
Cython Cython
pydub>=0.25.1 pydub>=0.25.1
soundfile>=0.12.1
tensorboardX tensorboardX
Jinja2>=3.1.2 Jinja2>=3.1.2
json5 json5

View File

@@ -18,7 +18,6 @@ PyYAML
resampy resampy
scikit_learn scikit_learn
scipy scipy
SoundFile
tensorboard tensorboard
tqdm tqdm
wave wave

View File

@@ -18,7 +18,6 @@ PyYAML
resampy resampy
scikit_learn scikit_learn
scipy scipy
SoundFile
tensorboard tensorboard
tqdm tqdm
wave wave

View File

@@ -14,7 +14,6 @@ faiss-cpu==1.7.3
gradio gradio
Cython Cython
pydub>=0.25.1 pydub>=0.25.1
soundfile>=0.12.1
tensorboardX tensorboardX
Jinja2>=3.1.2 Jinja2>=3.1.2
json5 json5

View File

@@ -9,7 +9,6 @@ faiss-cpu
gradio gradio
Cython Cython
pydub>=0.25.1 pydub>=0.25.1
soundfile>=0.12.1
tensorboardX tensorboardX
Jinja2>=3.1.2 Jinja2>=3.1.2
json5 json5

View File

@@ -9,7 +9,6 @@ faiss-cpu
gradio gradio
Cython Cython
pydub>=0.25.1 pydub>=0.25.1
soundfile>=0.12.1
tensorboardX tensorboardX
Jinja2>=3.1.2 Jinja2>=3.1.2
json5 json5

View File

@@ -18,7 +18,6 @@ from time import time as ttime
# import pyworld # import pyworld
import librosa import librosa
import numpy as np import numpy as np
import soundfile as sf
import torch.nn.functional as F import torch.nn.functional as F
from fairseq import checkpoint_utils from fairseq import checkpoint_utils
@@ -33,6 +32,7 @@ from scipy.io import wavfile
# from models import SynthesizerTrn256NSFsim as SynthesizerTrn256#hifigan_nsf # from models import SynthesizerTrn256NSFsim as SynthesizerTrn256#hifigan_nsf
# from models import SynthesizerTrn256NSFsimFlow as SynthesizerTrn256#hifigan_nsf # from models import SynthesizerTrn256NSFsimFlow as SynthesizerTrn256#hifigan_nsf
from infer.lib.audio import load_audio
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_path = r"E:\codes\py39\vits_vc_gpu_train\assets\hubert\hubert_base.pt" # model_path = r"E:\codes\py39\vits_vc_gpu_train\assets\hubert\hubert_base.pt" #
@@ -132,7 +132,7 @@ for idx, name in enumerate(
): ## ): ##
wav_path = "todo-songs/%s" % name # wav_path = "todo-songs/%s" % name #
f0_up_key = -2 # f0_up_key = -2 #
audio, sampling_rate = sf.read(wav_path) audio, sampling_rate = load_audio(wav_path)
if len(audio.shape) > 1: if len(audio.shape) > 1:
audio = librosa.to_mono(audio.transpose(1, 0)) audio = librosa.to_mono(audio.transpose(1, 0))
if sampling_rate != 16000: if sampling_rate != 16000:

View File

@@ -1,8 +1,9 @@
import soundfile
import librosa import librosa
from rvc.onnx import RVC from rvc.onnx import RVC
from infer.lib.audio import save_audio
hop_size = 512 hop_size = 512
sampling_rate = 40000 # 采样率 sampling_rate = 40000 # 采样率
f0_up_key = 0 # 升降调 f0_up_key = 0 # 升降调
@@ -19,4 +20,4 @@ wav, sr = librosa.load(wav_path, sr=sampling_rate)
audio = model.infer(wav, sr, sampling_rate, sid, f0_method, f0_up_key) audio = model.infer(wav, sr, sampling_rate, sid, f0_method, f0_up_key)
soundfile.write(out_path, audio, sampling_rate) save_audio(out_path, audio, sampling_rate)

8
web.py
View File

@@ -141,8 +141,8 @@ weight_uvr5_root = os.getenv("weight_uvr5_root")
index_root = os.getenv("index_root") index_root = os.getenv("index_root")
outside_index_root = os.getenv("outside_index_root") outside_index_root = os.getenv("outside_index_root")
names = [] names = [""]
index_paths = [] index_paths = [""]
def lookup_names(weight_root): def lookup_names(weight_root):
global names global names
@@ -168,9 +168,9 @@ for name in os.listdir(weight_uvr5_root):
def change_choices(): def change_choices():
global index_paths, names global index_paths, names
names = [] names = [""]
lookup_names(weight_root) lookup_names(weight_root)
index_paths = [] index_paths = [""]
lookup_indices(index_root) lookup_indices(index_root)
lookup_indices(outside_index_root) lookup_indices(outside_index_root)
return {"choices": sorted(names), "__type__": "update"}, { return {"choices": sorted(names), "__type__": "update"}, {