1
0
mirror of https://github.com/fumiama/Retrieval-based-Voice-Conversion-WebUI.git synced 2026-06-14 23:44:56 +08:00

feat(audio): use PyAV instead of ffmpeg

Replaced all uses of the ffmpeg CLI and `ffmpeg-python` with PyAV (`av`).
This commit is contained in:
Alex Murkoff
2024-06-11 12:08:37 +07:00
parent 9d699b1d99
commit 76ba0e20ff
6 changed files with 126 additions and 67 deletions

View File

@@ -18,7 +18,6 @@ jobs:
- name: Install dependencies - name: Install dependencies
run: | run: |
sudo apt update sudo apt update
sudo apt -y install ffmpeg
wget https://github.com/fumiama/RVC-Models-Downloader/releases/download/v0.2.3/rvcmd_linux_amd64.deb wget https://github.com/fumiama/RVC-Models-Downloader/releases/download/v0.2.3/rvcmd_linux_amd64.deb
sudo apt -y install ./rvcmd_linux_amd64.deb sudo apt -y install ./rvcmd_linux_amd64.deb
python -m pip install --upgrade pip python -m pip install --upgrade pip

View File

@@ -8,7 +8,7 @@ WORKDIR /app
# Install dependencies to add PPAs # Install dependencies to add PPAs
RUN apt-get update && \ RUN apt-get update && \
apt-get install -y -qq ffmpeg aria2 && apt clean && \ apt-get install -y -qq aria2 && apt clean && \
apt-get install -y software-properties-common && \ apt-get install -y software-properties-common && \
apt-get clean && \ apt-get clean && \
rm -rf /var/lib/apt/lists/* rm -rf /var/lib/apt/lists/*

View File

@@ -1,9 +1,9 @@
from io import BufferedWriter, BytesIO from io import BufferedWriter, BytesIO
from pathlib import Path from pathlib import Path
from typing import Dict from typing import Dict
import ffmpeg
import numpy as np import numpy as np
import av import av
from av.audio.resampler import AudioResampler
video_format_dict: Dict[str, str] = { video_format_dict: Dict[str, str] = {
"m4a": "mp4", "m4a": "mp4",
@@ -38,19 +38,20 @@ def load_audio(file: str, sr: int) -> np.ndarray:
raise FileNotFoundError(f"File not found: {file}") raise FileNotFoundError(f"File not found: {file}")
try: try:
# https://github.com/openai/whisper/blob/main/whisper/audio.py#L26 container = av.open(file)
# This launches a subprocess to decode audio while down-mixing and resampling as necessary. resampler = AudioResampler(format='fltp', layout='mono', rate=sr)
# Requires the ffmpeg CLI and `ffmpeg-python` package to be installed. decoded_audio = []
file = str(clean_path(file)) # 防止小白拷路径头尾带了空格和"和回车
out, _ = ( for frame in container.decode(audio=0):
ffmpeg.input(file, threads=0) frame.pts = None # Clear presentation timestamp to avoid resampling issues
.output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr) resampled = resampler.resample(frame)
.run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True) decoded_audio.append(resampled.to_ndarray())
)
audio = np.concatenate(decoded_audio)
except Exception as e: except Exception as e:
raise RuntimeError(f"Failed to load audio: {e}") raise RuntimeError(f"Failed to load audio: {e}")
return np.frombuffer(out, np.float32).flatten() return audio.flatten()
def clean_path(path: str) -> Path: def clean_path(path: str) -> Path:

View File

@@ -8,6 +8,7 @@ import numpy as np
import soundfile as sf import soundfile as sf
import torch import torch
from tqdm import tqdm from tqdm import tqdm
import av
cpu = torch.device("cpu") cpu = torch.device("cpu")
@@ -218,20 +219,38 @@ class Predictor:
sf.write(path_other, opt, rate) sf.write(path_other, opt, rate)
opt_path_vocal = path_vocal[:-4] + ".%s" % format opt_path_vocal = path_vocal[:-4] + ".%s" % format
opt_path_other = path_other[:-4] + ".%s" % format opt_path_other = path_other[:-4] + ".%s" % format
if os.path.exists(path_vocal): process_audio(path_vocal, opt_path_vocal, format)
os.system(f'ffmpeg -i "{path_vocal}" -vn "{opt_path_vocal}" -q:a 2 -y') process_audio(path_other, opt_path_other, format)
if os.path.exists(opt_path_vocal):
try: def process_audio(input_path: str, output_path: str, format: str) -> None:
os.remove(path_vocal) if not os.path.exists(input_path): return
except:
pass input_container = av.open(input_path)
if os.path.exists(path_other): output_container = av.open(output_path, 'w')
os.system(f'ffmpeg -i "{path_other}" -vn "{opt_path_other}" -q:a 2 -y')
if os.path.exists(opt_path_other): # Create a stream in the output container
try: input_stream = input_container.streams.audio[0]
os.remove(path_other) output_stream = output_container.add_stream(format)
except:
pass output_stream.bit_rate = 128_000 # 128kb/s (equivalent to -q:a 2)
# Copy packets from the input file to the output file
for packet in input_container.demux(input_stream):
for frame in packet.decode():
for out_packet in output_stream.encode(frame):
output_container.mux(out_packet)
for packet in output_stream.encode():
output_container.mux(packet)
# Close the containers
input_container.close()
output_container.close()
try: # Remove the original file
os.remove(input_path)
except Exception as e:
print(f"Failed to remove the original file: {e}")
class MDXNetDereverb: class MDXNetDereverb:

View File

@@ -4,7 +4,8 @@ import logging
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
import ffmpeg import av
from av.audio.resampler import AudioResampler
import torch import torch
from configs import Config from configs import Config
@@ -46,27 +47,23 @@ def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format
need_reformat = 1 need_reformat = 1
done = 0 done = 0
try: try:
info = ffmpeg.probe(inp_path, cmd="ffprobe") container = av.open(inp_path)
if ( audio_stream = next(s for s in container.streams if s.type == 'audio')
info["streams"][0]["channels"] == 2
and info["streams"][0]["sample_rate"] == "44100" # Check the audio stream's properties
): if audio_stream.channels == 2 and audio_stream.rate == 44100:
pre_fun._path_audio_(inp_path, save_root_ins, save_root_vocal, format0, is_hp3=is_hp3)
need_reformat = 0 need_reformat = 0
pre_fun._path_audio_(
inp_path, save_root_ins, save_root_vocal, format0, is_hp3=is_hp3
)
done = 1 done = 1
except: except Exception as e:
need_reformat = 1 need_reformat = 1
traceback.print_exc() print(f"Exception {e} occured. Will reformat")
if need_reformat == 1: if need_reformat == 1:
tmp_path = "%s/%s.reformatted.wav" % ( tmp_path = "%s/%s.reformatted.wav" % (
os.path.join(os.environ["TEMP"]), os.path.join(os.environ["TEMP"]),
os.path.basename(inp_path), os.path.basename(inp_path),
) )
os.system( process_audio(inp_path, tmp_path)
f'ffmpeg -i "{inp_path}" -vn -acodec pcm_s16le -ac 2 -ar 44100 "{tmp_path}" -y'
)
inp_path = tmp_path inp_path = tmp_path
try: try:
if done == 0: if done == 0:
@@ -108,3 +105,37 @@ def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format
torch.mps.empty_cache() torch.mps.empty_cache()
logger.info("Executed torch.mps.empty_cache()") logger.info("Executed torch.mps.empty_cache()")
yield "\n".join(infos) yield "\n".join(infos)
def process_audio(input_path: str, output_path: str) -> None:
    """Re-encode the first audio stream of a file as 16-bit stereo PCM at 44100 Hz.

    Returns immediately when ``input_path`` does not exist. The input file
    is left untouched: this helper stands in for a plain
    ``ffmpeg -i in -acodec pcm_s16le -ac 2 -ar 44100 out`` conversion, which
    never removed its source, and callers hand it the user's original audio.

    Args:
        input_path: Path of the audio file to read.
        output_path: Path of the file to write. PyAV chooses the container
            from this name.
    """
    if not os.path.exists(input_path):
        return

    def _as_frames(resampled):
        # Depending on the PyAV release, resample() returns a single frame,
        # a list of frames, or None. Normalise to a list without None so the
        # encoder is never flushed by accident.
        if resampled is None:
            return []
        if isinstance(resampled, (list, tuple)):
            return [item for item in resampled if item is not None]
        return [resampled]

    # Context managers close both containers even if decoding/encoding fails.
    with av.open(input_path) as input_container:
        with av.open(output_path, "w") as output_container:
            input_stream = input_container.streams.audio[0]
            output_stream = output_container.add_stream(
                "pcm_s16le", rate=44100, layout="stereo"
            )
            # "pcm_s16le" is the codec name; the resampler needs the sample
            # format, which for that codec is packed signed 16-bit ("s16").
            resampler = AudioResampler("s16", "stereo", 44100)

            resampled_any = False
            for packet in input_container.demux(input_stream):
                for frame in packet.decode():
                    # Clear presentation timestamp to avoid resampling issues
                    frame.pts = None
                    for out_frame in _as_frames(resampler.resample(frame)):
                        for out_packet in output_stream.encode(out_frame):
                            output_container.mux(out_packet)
                    resampled_any = True

            # Drain samples still buffered inside the resampler. Skipped when
            # nothing was fed, because some PyAV releases refuse to flush a
            # resampler that has never been used.
            if resampled_any:
                for out_frame in _as_frames(resampler.resample(None)):
                    for out_packet in output_stream.encode(out_frame):
                        output_container.mux(out_packet)

            # Flush the encoder.
            for out_packet in output_stream.encode():
                output_container.mux(out_packet)

View File

@@ -6,6 +6,7 @@ logger = logging.getLogger(__name__)
import librosa import librosa
import numpy as np import numpy as np
import soundfile as sf import soundfile as sf
import av
import torch import torch
from infer.lib.uvr5_pack.lib_v5 import nets_123821KB as Nets from infer.lib.uvr5_pack.lib_v5 import nets_123821KB as Nets
@@ -146,12 +147,7 @@ class AudioPre:
) )
if os.path.exists(path): if os.path.exists(path):
opt_format_path = path[:-4] + ".%s" % format opt_format_path = path[:-4] + ".%s" % format
os.system(f'ffmpeg -i "{path}" -vn "{opt_format_path}" -q:a 2 -y') process_audio(path, opt_format_path, format)
if os.path.exists(opt_format_path):
try:
os.remove(path)
except:
pass
if vocal_root is not None: if vocal_root is not None:
if is_hp3 == True: if is_hp3 == True:
head = "instrument_" head = "instrument_"
@@ -185,15 +181,38 @@ class AudioPre:
(np.array(wav_vocals) * 32768).astype("int16"), (np.array(wav_vocals) * 32768).astype("int16"),
self.mp.param["sr"], self.mp.param["sr"],
) )
if os.path.exists(path): opt_format_path = path[:-4] + ".%s" % format
opt_format_path = path[:-4] + ".%s" % format process_audio(path, opt_format_path, format)
os.system(f'ffmpeg -i "{path}" -vn "{opt_format_path}" -q:a 2 -y')
if os.path.exists(opt_format_path):
try:
os.remove(path)
except:
pass
def process_audio(input_path: str, output_path: str, format: str) -> None:
    """Transcode ``input_path`` into ``output_path`` and delete the input afterwards.

    Returns immediately when ``input_path`` does not exist, or when both
    paths refer to the same file (opening the output for writing would
    truncate the input, and the cleanup step would then delete it).

    Args:
        input_path: Path of the intermediate audio file to convert.
        output_path: Path of the converted file. PyAV chooses the container
            from this name.
        format: Target format as a file extension (e.g. ``"mp3"``,
            ``"flac"``, ``"m4a"``). Extensions that are not encoder names
            are mapped to a suitable encoder; anything else is passed to
            PyAV unchanged.
    """
    if not os.path.exists(input_path):
        return
    if os.path.abspath(input_path) == os.path.abspath(output_path):
        return

    # Container-style extensions are not encoder names, so translate them.
    codec_by_extension = {
        "m4a": "aac",
        "mp4": "aac",
        "ogg": "libvorbis",
        "wav": "pcm_s16le",
    }
    codec_name = codec_by_extension.get(format, format)

    # Context managers close both containers even if transcoding fails; in
    # that case the exception propagates and the input file is kept.
    with av.open(input_path) as input_container:
        with av.open(output_path, "w") as output_container:
            input_stream = input_container.streams.audio[0]
            output_stream = output_container.add_stream(codec_name)

            # Fixed 128 kb/s target. NOTE(review): this is not the same as
            # ffmpeg's quality-based `-q:a 2`; confirm the intended quality.
            output_stream.bit_rate = 128_000

            for packet in input_container.demux(input_stream):
                for frame in packet.decode():
                    for out_packet in output_stream.encode(frame):
                        output_container.mux(out_packet)

            # Flush the encoder.
            for out_packet in output_stream.encode():
                output_container.mux(out_packet)

    try:  # Remove the original file
        os.remove(input_path)
    except OSError as e:
        print(f"Failed to remove the original file: {e}")
class AudioPreDeEcho: class AudioPreDeEcho:
def __init__(self, agg, model_path, device, is_half, tta=False): def __init__(self, agg, model_path, device, is_half, tta=False):
@@ -323,12 +342,7 @@ class AudioPreDeEcho:
) )
if os.path.exists(path): if os.path.exists(path):
opt_format_path = path[:-4] + ".%s" % format opt_format_path = path[:-4] + ".%s" % format
os.system(f'ffmpeg -i "{path}" -vn "{opt_format_path}" -q:a 2 -y') process_audio(path, opt_format_path, format)
if os.path.exists(opt_format_path):
try:
os.remove(path)
except:
pass
if vocal_root is not None: if vocal_root is not None:
if self.data["high_end_process"].startswith("mirroring"): if self.data["high_end_process"].startswith("mirroring"):
input_high_end_ = spec_utils.mirroring( input_high_end_ = spec_utils.mirroring(
@@ -360,9 +374,4 @@ class AudioPreDeEcho:
) )
if os.path.exists(path): if os.path.exists(path):
opt_format_path = path[:-4] + ".%s" % format opt_format_path = path[:-4] + ".%s" % format
os.system(f'ffmpeg -i "{path}" -vn "{opt_format_path}" -q:a 2 -y') process_audio(path, opt_format_path, format)
if os.path.exists(opt_format_path):
try:
os.remove(path)
except:
pass