diff --git a/gui.py b/gui.py index 283e0d2..c447de1 100644 --- a/gui.py +++ b/gui.py @@ -76,8 +76,11 @@ if __name__ == "__main__": import json import multiprocessing import re + import threading import time from multiprocessing import Queue, cpu_count + from infer.lib.audio import AudioIoProcess + from multiprocessing.shared_memory import SharedMemory import librosa from infer.modules.gui import TorchGate @@ -144,6 +147,15 @@ if __name__ == "__main__": self.input_devices_indices = None self.output_devices_indices = None self.stream = None + self.in_mem = None + self.out_mem = None + self.in_buf = None + self.out_buf = None + self.in_ptr = None + self.out_ptr = None + self.play_ptr = None + self.in_evt = None + self.stop_evt = None self.update_devices() self.launcher() @@ -564,7 +576,7 @@ if __name__ == "__main__": event, values = self.window.read() if event == sg.WINDOW_CLOSED: self.stop_stream() - exit() + # exit() if event == "reload_devices" or event == "sg_hostapi": self.gui_config.sg_hostapi = values["sg_hostapi"] self.update_devices(hostapi_name=values["sg_hostapi"]) @@ -639,7 +651,7 @@ if __name__ == "__main__": json.dump(settings, j) if self.stream is not None: self.delay_time = ( - self.stream.latency[-1] + self.stream.get_latency() + values["block_time"] + values["crossfade_length"] + 0.01 @@ -667,7 +679,7 @@ if __name__ == "__main__": self.rvc.set_index_rate(values["index_rate"]) elif event == "rms_mix_rate": self.gui_config.rms_mix_rate = values["rms_mix_rate"] - elif event in ["pm", "dio", "harvest", "crepe", "rmvpe", "fcpe"]: + elif event in ["pm", "harvest", "crepe", "rmvpe", "fcpe"]: self.gui_config.f0method = event elif event == "I_noise_reduce": self.gui_config.I_noise_reduce = values["I_noise_reduce"] @@ -867,36 +879,80 @@ if __name__ == "__main__": "WASAPI" in self.gui_config.sg_hostapi and self.gui_config.sg_wasapi_exclusive ): - extra_settings = sd.WasapiSettings(exclusive=True) + wasapi_exclusive = True else: - extra_settings = None - self.stream = sd.Stream( - callback=self.audio_callback, - blocksize=self.block_frame, - samplerate=self.gui_config.samplerate, - channels=self.gui_config.channels, - dtype="float32", - extra_settings=extra_settings, + wasapi_exclusive = False + self.stream = AudioIoProcess( + input_device=sd.default.device[0], + output_device=sd.default.device[1], + input_audio_block_size = self.block_frame, + sample_rate = self.gui_config.samplerate, + channel_num=self.gui_config.channels, + is_input_wasapi_exclusive=wasapi_exclusive, + is_output_wasapi_exclusive=wasapi_exclusive, + is_device_combined = True + # TODO: Add control UI to allow devices with different type API & different WASAPI settings ) + self.in_mem = SharedMemory(name=self.stream.get_in_mem_name()) + self.out_mem = SharedMemory(name=self.stream.get_out_mem_name()) + self.in_buf = np.ndarray( + self.stream.get_np_shape(), + dtype=self.stream.get_np_dtype(), + buffer=self.in_mem.buf, + order='C' + ) + self.out_buf = np.ndarray( + self.stream.get_np_shape(), + dtype=self.stream.get_np_dtype(), + buffer=self.out_mem.buf, + order='C' + ) + self.in_ptr, \ + self.out_ptr, \ + self.play_ptr, \ + self.in_evt, \ + self.stop_evt = self.stream.get_ptrs_and_events() + self.stream.start() + def audio_loop(): + while flag_vc: + self.audio_infer(self.block_frame << 1) + + threading.Thread( + target=audio_loop, + daemon=True + ).start() + def stop_stream(self): global flag_vc if flag_vc: flag_vc = False if self.stream is not None: - self.stream.abort() - self.stream.close() + print("Exiting") + self.stop_evt.set() + self.in_mem.close() + self.out_mem.close() + self.stream.join() self.stream = None - def audio_callback( - self, indata: np.ndarray, outdata: np.ndarray, frames, times, status + def audio_infer( + self, buf_size:int # 2 * self.block_frame ): """ 音频处理 """ global flag_vc + + self.in_evt.wait() + rptr = self.in_ptr.value + self.in_evt.clear() + start_time = time.perf_counter() + + rend = rptr + self.block_frame + indata = np.copy(self.in_buf[rptr:rend]) + indata = librosa.to_mono(indata.T) if self.gui_config.threhold > -60: indata = np.append(self.rms_buffer, indata) @@ -1039,13 +1095,47 @@ if __name__ == "__main__": self.sola_buffer[:] = infer_wav[ self.block_frame : self.block_frame + self.sola_buffer_frame ] - outdata[:] = ( + outdata = ( infer_wav[: self.block_frame] .repeat(self.gui_config.channels, 1) .t() .cpu() .numpy() ) + + # 装填输出缓冲 + start = self.out_ptr.value + play_pos = self.play_ptr.value + + # 计算播放进度差(写指针距离播放指针的帧数) + delta = (start - play_pos + buf_size) % buf_size + + if delta < self.block_frame: + # 装填赶不上播放,导致播放进度追上来了, + # 此时已产生无法挽回的破音, + # 只好直接卡着播放指针写入,保证接下来的尽快放出来 + print("[W] Output underrun") + write_pos = play_pos + else: + # 否则按块对齐 + write_pos = (start + self.block_frame) % buf_size + + # 写入共享缓冲区 + end = (write_pos + self.block_frame) % buf_size + if end > write_pos: + self.out_buf[write_pos:end] = outdata + else: + first = buf_size - write_pos + self.out_buf[write_pos:] = outdata[:first] + self.out_buf[:end] = outdata[first:] + + # 更新写指针 + self.out_ptr.value = write_pos + + if self.in_evt.is_set(): + print("[W] Input overrun") + self.in_evt.clear() + total_time = time.perf_counter() - start_time if flag_vc: self.window["infer_time"].update(int(total_time * 1000)) diff --git a/infer/lib/audio.py b/infer/lib/audio.py index 417876c..37e76ee 100644 --- a/infer/lib/audio.py +++ b/infer/lib/audio.py @@ -4,6 +4,9 @@ from typing import Dict, Tuple, Optional, Union, List import os import math import wave +import signal +from multiprocessing import Process, Value, Event +from multiprocessing.shared_memory import SharedMemory import numpy as np from numba import jit @@ -195,3 +198,167 @@ def get_audio_properties(input_path: str) -> Tuple[int, int]: rate = audio_stream.base_rate container.close() return channels, rate + +class AudioIoProcess(Process): + def __init__(self, + input_device, + output_device, + input_audio_block_size: int, + sample_rate: int, + channel_num: int = 2, + is_device_combined: bool = True, + is_input_wasapi_exclusive: bool = False, + is_output_wasapi_exclusive: bool = False + ): + super().__init__() + self.in_dev = input_device + self.out_dev = output_device + self.block_size: int = input_audio_block_size + self.buf_size: int = self.block_size << 1 # 双缓冲 + self.sample_rate: int = sample_rate + self.channels: int = channel_num + self.is_device_combined: bool = is_device_combined + self.is_input_wasapi_exclusive: bool = is_input_wasapi_exclusive + self.is_output_wasapi_exclusive: bool = is_output_wasapi_exclusive + + self.__rec_ptr = 0 + self.in_ptr = Value('i', 0) # 当收满一个block时由本进程设置 + self.out_ptr = Value('i', 0) # 由主进程设置,指示下一次预期写入位置 + self.play_ptr = Value('i', 0) # 由本进程设置,指示当前音频已经播放到哪里 + self.in_evt = Event() # 当收满一个block时由本进程设置 + self.stop_evt = Event() # 当主进程停止音频活动时由主进程设置 + + self.latency = Value('d', 114514.1919810) + + self.buf_shape: tuple = (self.buf_size, self.channels) + self.buf_dtype: np.dtype = np.float32 + self.buf_nbytes: int = int( + np.prod(self.buf_shape) * np.dtype(self.buf_dtype).itemsize) + + self.in_mem = SharedMemory(create=True, size=self.buf_nbytes) + self.out_mem = SharedMemory(create=True, size=self.buf_nbytes) + self.in_mem_name: str = self.in_mem.name + self.out_mem_name: str = self.out_mem.name + + self.in_buf = None + self.out_buf = None + + def get_in_mem_name(self) -> str: + return self.in_mem_name + + def get_out_mem_name(self) -> str: + return self.out_mem_name + + def get_np_shape(self) -> tuple: + return self.buf_shape + + def get_np_dtype(self) -> np.dtype: + return self.buf_dtype + + def get_ptrs_and_events(self): + return self.in_ptr, \ + self.out_ptr,\ + self.play_ptr,\ + self.in_evt, \ + self.stop_evt\ + + def get_latency(self) -> float: + return self.latency.value + + def run(self): + import sounddevice as sd + + signal.signal(signal.SIGINT, signal.SIG_IGN) + + in_mem = SharedMemory(name=self.in_mem_name) + self.in_buf = np.ndarray( + self.buf_shape, dtype=self.buf_dtype, buffer=in_mem.buf, order='C') + self.in_buf.fill(0.0) + + out_mem = SharedMemory(name=self.out_mem_name) + self.out_buf = np.ndarray( + self.buf_shape, dtype=self.buf_dtype, buffer=out_mem.buf, order='C') + self.out_buf.fill(0.0) + + exclusive_settings = sd.WasapiSettings(exclusive=True) + + sd.default.device = (self.in_dev, self.out_dev) + + def output_callback(outdata, frames, time_info, status): + play_ptr = self.play_ptr.value + end_ptr = play_ptr + frames + + if end_ptr <= self.buf_size: + outdata[:] = self.out_buf[play_ptr:end_ptr] + else: + first = self.buf_size - play_ptr + second = end_ptr - self.buf_size + outdata[:first] = self.out_buf[play_ptr:] + outdata[first:] = self.out_buf[:second] + + self.play_ptr.value = end_ptr % self.buf_size + + def input_callback(indata, frames, time_info, status): + # 收录输入数据 + end_ptr = self.__rec_ptr + frames + if end_ptr <= self.buf_size: # 整块拷贝 + self.in_buf[self.__rec_ptr:end_ptr] = indata + else: # 处理回绕 + first = self.buf_size - self.__rec_ptr + second = end_ptr - self.buf_size + self.in_buf[self.__rec_ptr:] = indata[:first] + self.in_buf[:second] = indata[first:] + write_pos = self.__rec_ptr + self.__rec_ptr = end_ptr % self.buf_size + + # 设置信号 + if write_pos < self.block_size and self.__rec_ptr >= self.block_size: + self.in_ptr.value = 0 + self.in_evt.set() # 通知主线程来取甲缓冲 + elif write_pos < self.buf_size and self.__rec_ptr < write_pos: + self.in_ptr.value = self.block_size + self.in_evt.set() # 通知主线程来取乙缓冲 + + def combined_callback(indata, outdata, frames, time_info, status): + output_callback(outdata, frames, time_info, status) # 优先出声 + input_callback(indata, frames, time_info, status) + + if self.is_device_combined: + with sd.Stream( + samplerate=self.sample_rate, + channels=self.channels, + dtype=self.buf_dtype, + latency='low', + extra_settings=exclusive_settings if + self.is_input_wasapi_exclusive and + self.is_output_wasapi_exclusive else None, + callback=combined_callback + ) as s: + self.latency.value = s.latency[-1] + self.stop_evt.wait() + self.out_buf.fill(0.0) + else: + with sd.InputStream( + samplerate=self.sample_rate, + channels=self.channels, + dtype=self.buf_dtype, + latency='low', + extra_settings=exclusive_settings if self.is_input_wasapi_exclusive else None, + callback=input_callback + ) as si, sd.OutputStream( + samplerate=self.sample_rate, + channels=self.channels, + dtype=self.buf_dtype, + latency='low', + extra_settings=exclusive_settings if self.is_output_wasapi_exclusive else None, + callback=output_callback + ) as so: + self.latency.value = si.latency[-1] + so.latency[-1] + self.stop_evt.wait() + self.out_buf.fill(0.0) + + # 清理共享内存 + in_mem.close() + out_mem.close() + in_mem.unlink() + out_mem.unlink() diff --git a/requirements/gui-dml.txt b/requirements/gui-dml.txt index db0dad0..f823449 100644 --- a/requirements/gui-dml.txt +++ b/requirements/gui-dml.txt @@ -4,11 +4,6 @@ #torch 1.11.0 with cuda 11.3 #pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 torchaudio==0.11.0 --extra-index-url https://download.pytorch.org/whl/cu113 einops -fairseq -flask -flask_cors -gin -gin_config librosa local_attention matplotlib @@ -20,7 +15,6 @@ scikit_learn scipy tensorboard tqdm -wave FreeSimpleGUI sounddevice gradio diff --git a/requirements/gui.txt b/requirements/gui.txt index b6af49d..277a887 100644 --- a/requirements/gui.txt +++ b/requirements/gui.txt @@ -4,11 +4,6 @@ #torch 1.11.0 with cuda 11.3 #pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 torchaudio==0.11.0 --extra-index-url https://download.pytorch.org/whl/cu113 einops -fairseq -flask -flask_cors -gin -gin_config librosa local_attention matplotlib @@ -20,7 +15,6 @@ scikit_learn scipy tensorboard tqdm -wave FreeSimpleGUI sounddevice gradio