1
0
mirror of https://github.com/fumiama/Retrieval-based-Voice-Conversion-WebUI.git synced 2026-06-05 01:10:22 +08:00

optimize(rmvpe): move deepunet&e2e into rvc

This commit is contained in:
源文雨
2024-06-12 20:51:46 +09:00
parent 1e22d468ea
commit e486649a91
4 changed files with 289 additions and 287 deletions

View File

@@ -1,9 +1,10 @@
from io import BufferedWriter, BytesIO
from pathlib import Path
from typing import Dict, Tuple
import os
import numpy as np
import av
import os
from av.audio.resampler import AudioResampler
video_format_dict: Dict[str, str] = {
@@ -44,10 +45,8 @@ def load_audio(file: str, sr: int) -> np.ndarray:
resampler = AudioResampler(format="fltp", layout="mono", rate=sr)
# Estimated maximum total number of samples to pre-allocate the array
audio_duration_sec: float = (
container.duration / 1_000_000
) # AV stores length in microseconds by default
estimated_total_samples = int(audio_duration_sec * sr + 0.5)
# AV stores length in microseconds by default
estimated_total_samples = int(container.duration * sr // 1_000_000)
decoded_audio = np.zeros(estimated_total_samples + 1, dtype=np.float32)
offset = 0
@@ -55,7 +54,7 @@ def load_audio(file: str, sr: int) -> np.ndarray:
frame.pts = None # Clear presentation timestamp to avoid resampling issues
resampled_frames = resampler.resample(frame)
for resampled_frame in resampled_frames:
frame_data = np.array(resampled_frame.to_ndarray()).flatten()
frame_data = resampled_frame.to_ndarray()[0]
end_index = offset + len(frame_data)
# Check if decoded_audio has enough space, and resize if necessary