From 76ba0e20ff2c85c166918df92b66fa6139fc2baa Mon Sep 17 00:00:00 2001 From: Alex Murkoff <79400603+alexlnkp@users.noreply.github.com> Date: Tue, 11 Jun 2024 12:08:37 +0700 Subject: [PATCH] feat(audio): use PyAV instead of ffmpeg replaced usage of ffmpeg in favor of PyAV (`av`) --- .github/workflows/unitest.yml | 1 - Dockerfile | 2 +- infer/lib/audio.py | 23 ++++++------- infer/modules/uvr5/mdxnet.py | 47 +++++++++++++++++++-------- infer/modules/uvr5/modules.py | 59 +++++++++++++++++++++++++-------- infer/modules/uvr5/vr.py | 61 ++++++++++++++++++++--------------- 6 files changed, 126 insertions(+), 67 deletions(-) diff --git a/.github/workflows/unitest.yml b/.github/workflows/unitest.yml index aa56fbe..303ca71 100644 --- a/.github/workflows/unitest.yml +++ b/.github/workflows/unitest.yml @@ -18,7 +18,6 @@ jobs: - name: Install dependencies run: | sudo apt update - sudo apt -y install ffmpeg wget https://github.com/fumiama/RVC-Models-Downloader/releases/download/v0.2.3/rvcmd_linux_amd64.deb sudo apt -y install ./rvcmd_linux_amd64.deb python -m pip install --upgrade pip diff --git a/Dockerfile b/Dockerfile index 1b30c7c..b37ecf2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -8,7 +8,7 @@ WORKDIR /app # Install dependenceis to add PPAs RUN apt-get update && \ - apt-get install -y -qq ffmpeg aria2 && apt clean && \ + apt-get install -y -qq aria2 && apt clean && \ apt-get install -y software-properties-common && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* diff --git a/infer/lib/audio.py b/infer/lib/audio.py index e406ab8..ff5d0b2 100644 --- a/infer/lib/audio.py +++ b/infer/lib/audio.py @@ -1,9 +1,9 @@ from io import BufferedWriter, BytesIO from pathlib import Path from typing import Dict -import ffmpeg import numpy as np import av +from av.audio.resampler import AudioResampler video_format_dict: Dict[str, str] = { "m4a": "mp4", @@ -38,19 +38,20 @@ def load_audio(file: str, sr: int) -> np.ndarray: raise FileNotFoundError(f"File not found: {file}") try: - # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26 - # This launches a subprocess to decode audio while down-mixing and resampling as necessary. - # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed. - file = str(clean_path(file)) # 防止小白拷路径头尾带了空格和"和回车 - out, _ = ( - ffmpeg.input(file, threads=0) - .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr) - .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True) - ) + container = av.open(file) + resampler = AudioResampler(format='fltp', layout='mono', rate=sr) + decoded_audio = [] + + for frame in container.decode(audio=0): + frame.pts = None # Clear presentation timestamp to avoid resampling issues + resampled = resampler.resample(frame) + decoded_audio.append(resampled.to_ndarray()) + + audio = np.concatenate(decoded_audio) except Exception as e: raise RuntimeError(f"Failed to load audio: {e}") - return np.frombuffer(out, np.float32).flatten() + return audio.flatten() def clean_path(path: str) -> Path: diff --git a/infer/modules/uvr5/mdxnet.py b/infer/modules/uvr5/mdxnet.py index 25edd98..0a431be 100644 --- a/infer/modules/uvr5/mdxnet.py +++ b/infer/modules/uvr5/mdxnet.py @@ -8,6 +8,7 @@ import numpy as np import soundfile as sf import torch from tqdm import tqdm +import av cpu = torch.device("cpu") @@ -218,20 +219,38 @@ class Predictor: sf.write(path_other, opt, rate) opt_path_vocal = path_vocal[:-4] + ".%s" % format opt_path_other = path_other[:-4] + ".%s" % format - if os.path.exists(path_vocal): - os.system(f'ffmpeg -i "{path_vocal}" -vn "{opt_path_vocal}" -q:a 2 -y') - if os.path.exists(opt_path_vocal): - try: - os.remove(path_vocal) - except: - pass - if os.path.exists(path_other): - os.system(f'ffmpeg -i "{path_other}" -vn "{opt_path_other}" -q:a 2 -y') - if os.path.exists(opt_path_other): - try: - os.remove(path_other) - except: - pass + process_audio(path_vocal, opt_path_vocal, format) + process_audio(path_other, opt_path_other, format) + +def process_audio(input_path: str, output_path: str, format: str) -> None: + if not os.path.exists(input_path): return + + input_container = av.open(input_path) + output_container = av.open(output_path, 'w') + + # Create a stream in the output container + input_stream = input_container.streams.audio[0] + output_stream = output_container.add_stream(format) + + output_stream.bit_rate = 128_000 # 128kb/s (equivalent to -q:a 2) + + # Copy packets from the input file to the output file + for packet in input_container.demux(input_stream): + for frame in packet.decode(): + for out_packet in output_stream.encode(frame): + output_container.mux(out_packet) + + for packet in output_stream.encode(): + output_container.mux(packet) + + # Close the containers + input_container.close() + output_container.close() + + try: # Remove the original file + os.remove(input_path) + except Exception as e: + print(f"Failed to remove the original file: {e}") class MDXNetDereverb: diff --git a/infer/modules/uvr5/modules.py b/infer/modules/uvr5/modules.py index ab9642f..1463402 100644 --- a/infer/modules/uvr5/modules.py +++ b/infer/modules/uvr5/modules.py @@ -4,7 +4,8 @@ import logging logger = logging.getLogger(__name__) -import ffmpeg +import av +from av.audio.resampler import AudioResampler import torch from configs import Config @@ -46,27 +47,23 @@ def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format need_reformat = 1 done = 0 try: - info = ffmpeg.probe(inp_path, cmd="ffprobe") - if ( - info["streams"][0]["channels"] == 2 - and info["streams"][0]["sample_rate"] == "44100" - ): + container = av.open(inp_path) + audio_stream = next(s for s in container.streams if s.type == 'audio') + + # Check the audio stream's properties + if audio_stream.channels == 2 and audio_stream.rate == 44100: + pre_fun._path_audio_(inp_path, save_root_ins, save_root_vocal, format0, is_hp3=is_hp3) need_reformat = 0 - pre_fun._path_audio_( - inp_path, save_root_ins, save_root_vocal, format0, is_hp3=is_hp3 - ) done = 1 - except: + except Exception as e: need_reformat = 1 - traceback.print_exc() + print(f"Exception {e} occured. Will reformat") if need_reformat == 1: tmp_path = "%s/%s.reformatted.wav" % ( os.path.join(os.environ["TEMP"]), os.path.basename(inp_path), ) - os.system( - f'ffmpeg -i "{inp_path}" -vn -acodec pcm_s16le -ac 2 -ar 44100 "{tmp_path}" -y' - ) + process_audio(inp_path, tmp_path) inp_path = tmp_path try: if done == 0: @@ -108,3 +105,37 @@ def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format torch.mps.empty_cache() logger.info("Executed torch.mps.empty_cache()") yield "\n".join(infos) + +def process_audio(input_path: str, output_path: str) -> None: + if not os.path.exists(input_path): return + + input_container = av.open(input_path) + output_container = av.open(output_path, 'w') + + # Create a stream in the output container + input_stream = input_container.streams.audio[0] + output_stream = output_container.add_stream('pcm_s16le', rate=44100, layout='stereo') + + resampler = AudioResampler('pcm_s16le', 'stereo', 44100) + + output_stream.bit_rate = 128_000 # 128kb/s (equivalent to -q:a 2) + + # Copy packets from the input file to the output file + for packet in input_container.demux(input_stream): + for frame in packet.decode(): + frame.pts = None # Clear presentation timestamp to avoid resampling issues + resampled = resampler.resample(frame) + for out_packet in output_stream.encode(resampled): + output_container.mux(out_packet) + + for packet in output_stream.encode(): + output_container.mux(packet) + + # Close the containers + input_container.close() + output_container.close() + + try: # Remove the original file + os.remove(input_path) + except Exception as e: + print(f"Failed to remove the original file: {e}") \ No newline at end of file diff --git a/infer/modules/uvr5/vr.py b/infer/modules/uvr5/vr.py index d542f16..8160e4a 100644 --- a/infer/modules/uvr5/vr.py +++ b/infer/modules/uvr5/vr.py @@ -6,6 +6,7 @@ logger = logging.getLogger(__name__) import librosa import numpy as np import soundfile as sf +import av import torch from infer.lib.uvr5_pack.lib_v5 import nets_123821KB as Nets @@ -146,12 +147,7 @@ class AudioPre: ) if os.path.exists(path): opt_format_path = path[:-4] + ".%s" % format - os.system(f'ffmpeg -i "{path}" -vn "{opt_format_path}" -q:a 2 -y') - if os.path.exists(opt_format_path): - try: - os.remove(path) - except: - pass + process_audio(path, opt_format_path, format) if vocal_root is not None: if is_hp3 == True: head = "instrument_" @@ -185,15 +181,38 @@ class AudioPre: (np.array(wav_vocals) * 32768).astype("int16"), self.mp.param["sr"], ) - if os.path.exists(path): - opt_format_path = path[:-4] + ".%s" % format - os.system(f'ffmpeg -i "{path}" -vn "{opt_format_path}" -q:a 2 -y') - if os.path.exists(opt_format_path): - try: - os.remove(path) - except: - pass + opt_format_path = path[:-4] + ".%s" % format + process_audio(path, opt_format_path, format) +def process_audio(input_path: str, output_path: str, format: str) -> None: + if not os.path.exists(input_path): return + + input_container = av.open(input_path) + output_container = av.open(output_path, 'w') + + # Create a stream in the output container + input_stream = input_container.streams.audio[0] + output_stream = output_container.add_stream(format) + + output_stream.bit_rate = 128_000 # 128kb/s (equivalent to -q:a 2) + + # Copy packets from the input file to the output file + for packet in input_container.demux(input_stream): + for frame in packet.decode(): + for out_packet in output_stream.encode(frame): + output_container.mux(out_packet) + + for packet in output_stream.encode(): + output_container.mux(packet) + + # Close the containers + input_container.close() + output_container.close() + + try: # Remove the original file + os.remove(input_path) + except Exception as e: + print(f"Failed to remove the original file: {e}") class AudioPreDeEcho: def __init__(self, agg, model_path, device, is_half, tta=False): @@ -323,12 +342,7 @@ class AudioPreDeEcho: ) if os.path.exists(path): opt_format_path = path[:-4] + ".%s" % format - os.system(f'ffmpeg -i "{path}" -vn "{opt_format_path}" -q:a 2 -y') - if os.path.exists(opt_format_path): - try: - os.remove(path) - except: - pass + process_audio(path, opt_format_path, format) if vocal_root is not None: if self.data["high_end_process"].startswith("mirroring"): input_high_end_ = spec_utils.mirroring( @@ -360,9 +374,4 @@ class AudioPreDeEcho: ) if os.path.exists(path): opt_format_path = path[:-4] + ".%s" % format - os.system(f'ffmpeg -i "{path}" -vn "{opt_format_path}" -q:a 2 -y') - if os.path.exists(opt_format_path): - try: - os.remove(path) - except: - pass + process_audio(path, opt_format_path, format)