1
0
mirror of https://github.com/fumiama/Retrieval-based-Voice-Conversion-WebUI.git synced 2026-06-05 01:10:22 +08:00

feat(all): optimize hierarchy of files

This commit is contained in:
源文雨
2024-04-20 21:29:25 +09:00
parent 1ac5e09f68
commit 4762e5bc21
30 changed files with 729 additions and 856 deletions

353
tools/ipynb/v1.ipynb Normal file
View File

@@ -0,0 +1,353 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "WHBMn6dOWm-S"
},
"source": [
"# [Retrieval-based-Voice-Conversion-WebUI](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI) Training notebook"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "ZFFCx5J80SGa"
},
"source": [
"[![RVC v1](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/main/tools/colab/v1.ipynb)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "GmFP6bN9dvOq"
},
"outputs": [],
"source": [
"# @title 查看显卡\n",
"!nvidia-smi"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "jwu07JgqoFON"
},
"outputs": [],
"source": [
"# @title 挂载谷歌云盘\n",
"\n",
"from google.colab import drive\n",
"\n",
"drive.mount(\"/content/drive\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "wjddIFr1oS3W"
},
"outputs": [],
"source": [
"# @title 安装依赖\n",
"!apt -y install build-essential python3-dev ffmpeg\n",
"!pip3 install --upgrade setuptools wheel\n",
"!pip3 install --upgrade pip"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "ge_97mfpgqTm"
},
"outputs": [],
"source": [
"# @title 克隆仓库\n",
"\n",
"!git clone --depth=1 -b v1 https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI\n",
"%cd /content/Retrieval-based-Voice-Conversion-WebUI"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "BLDEZADkvlw1"
},
"outputs": [],
"source": [
"# @title 安装依赖\n",
"!pip install -r requirements.txt"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "pqE0PrnuRqI2"
},
"outputs": [],
"source": [
"# @title 下载安装 RVC-Models-Downloader\n",
"!wget https://github.com/RVC-Project/RVC-Models-Downloader/releases/download/v0.2.1/rvcmd_linux_amd64.deb\n",
"!apt install ./rvcmd_linux_amd64.deb"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "UG3XpUwEomUz"
},
"outputs": [],
"source": [
"# @title 下载所需资源\n",
"!rvcmd -notrs -w 1 -notui assets/v1\n",
"!rvcmd -notrs -w 1 -notui assets/rmvpe"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "Mwk7Q0Loqzjx"
},
"outputs": [],
"source": [
"# @title 从谷歌云盘加载打包好的数据集到/content/dataset\n",
"\n",
"# @markdown 数据集位置\n",
"DATASET = \"/content/drive/MyDrive/mydataset.zip\" # @param {type:\"string\"}\n",
"\n",
"!mkdir -p /content/dataset\n",
"!unzip -d /content/dataset -B {DATASET}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "PDlFxWHWEynD"
},
"outputs": [],
"source": [
"# @title 重命名数据集中的重名文件\n",
"!ls -a /content/dataset/\n",
"!rename 's/(\\w+)\\.(\\w+)~(\\d*)/$1_$3.$2/' /content/dataset/*.*~*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "7vh6vphDwO0b"
},
"outputs": [],
"source": [
"# @title 启动web\n",
"%cd /content/Retrieval-based-Voice-Conversion-WebUI\n",
"# %load_ext tensorboard\n",
"# %tensorboard --logdir /content/Retrieval-based-Voice-Conversion-WebUI/logs\n",
"!python3 infer-web.py --colab --pycmd python3"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "FgJuNeAwx5Y_"
},
"outputs": [],
"source": [
"# @title 手动将训练后的模型文件备份到谷歌云盘\n",
"# @markdown 需要自己查看logs文件夹下模型的文件名手动修改下方命令末尾的文件名\n",
"\n",
"# @markdown 模型名\n",
"MODELNAME = \"mymodel\" # @param {type:\"string\"}\n",
"# @markdown 模型epoch\n",
"MODELEPOCH = 3200 # @param {type:\"integer\"}\n",
"\n",
"!cp /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/G_{MODELEPOCH}.pth /content/drive/MyDrive/{MODELNAME}_D_{MODELEPOCH}.pth\n",
"!cp /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/D_{MODELEPOCH}.pth /content/drive/MyDrive/{MODELNAME}_G_{MODELEPOCH}.pth\n",
"!cp /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/added_*.index /content/drive/MyDrive/\n",
"!cp /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/total_*.npy /content/drive/MyDrive/\n",
"\n",
"!cp /content/Retrieval-based-Voice-Conversion-WebUI/weights/{MODELNAME}.pth /content/drive/MyDrive/{MODELNAME}{MODELEPOCH}.pth"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "OVQoLQJXS7WX"
},
"outputs": [],
"source": [
"# @title 从谷歌云盘恢复pth\n",
"# @markdown 需要自己查看logs文件夹下模型的文件名手动修改下方命令末尾的文件名\n",
"\n",
"# @markdown 模型名\n",
"MODELNAME = \"mymodel\" # @param {type:\"string\"}\n",
"# @markdown 模型epoch\n",
"MODELEPOCH = 3200 # @param {type:\"integer\"}\n",
"\n",
"!mkdir -p /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}\n",
"\n",
"!cp /content/drive/MyDrive/{MODELNAME}_D_{MODELEPOCH}.pth /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/G_{MODELEPOCH}.pth\n",
"!cp /content/drive/MyDrive/{MODELNAME}_G_{MODELEPOCH}.pth /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/D_{MODELEPOCH}.pth\n",
"!cp /content/drive/MyDrive/*.index /content/\n",
"!cp /content/drive/MyDrive/*.npy /content/\n",
"!cp /content/drive/MyDrive/{MODELNAME}{MODELEPOCH}.pth /content/Retrieval-based-Voice-Conversion-WebUI/weights/{MODELNAME}.pth"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "ZKAyuKb9J6dz"
},
"outputs": [],
"source": [
"# @title 手动预处理(不推荐)\n",
"# @markdown 模型名\n",
"MODELNAME = \"mymodel\" # @param {type:\"string\"}\n",
"# @markdown 采样率\n",
"BITRATE = 48000 # @param {type:\"integer\"}\n",
"# @markdown 使用的进程数\n",
"THREADCOUNT = 8 # @param {type:\"integer\"}\n",
"\n",
"!python3 trainset_preprocess_pipeline_print.py /content/dataset {BITRATE} {THREADCOUNT} logs/{MODELNAME} True"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "CrxJqzAUKmPJ"
},
"outputs": [],
"source": [
"# @title 手动提取特征(不推荐)\n",
"# @markdown 模型名\n",
"MODELNAME = \"mymodel\" # @param {type:\"string\"}\n",
"# @markdown 使用的进程数\n",
"THREADCOUNT = 8 # @param {type:\"integer\"}\n",
"# @markdown 音高提取算法\n",
"ALGO = \"harvest\" # @param {type:\"string\"}\n",
"\n",
"!python3 extract_f0_print.py logs/{MODELNAME} {THREADCOUNT} {ALGO}\n",
"\n",
"!python3 extract_feature_print.py cpu 1 0 0 logs/{MODELNAME} True"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "IMLPLKOaKj58"
},
"outputs": [],
"source": [
"# @title 手动训练(不推荐)\n",
"# @markdown 模型名\n",
"MODELNAME = \"mymodel\" # @param {type:\"string\"}\n",
"# @markdown 使用的GPU\n",
"USEGPU = \"0\" # @param {type:\"string\"}\n",
"# @markdown 批大小\n",
"BATCHSIZE = 32 # @param {type:\"integer\"}\n",
"# @markdown 停止的epoch\n",
"MODELEPOCH = 3200 # @param {type:\"integer\"}\n",
"# @markdown 保存epoch间隔\n",
"EPOCHSAVE = 100 # @param {type:\"integer\"}\n",
"# @markdown 采样率\n",
"MODELSAMPLE = \"48k\" # @param {type:\"string\"}\n",
"# @markdown 是否缓存训练集\n",
"CACHEDATA = 1 # @param {type:\"integer\"}\n",
"# @markdown 是否仅保存最新的ckpt文件\n",
"ONLYLATEST = 0 # @param {type:\"integer\"}\n",
"\n",
"!python3 train_nsf_sim_cache_sid_load_pretrain.py -e lulu -sr {MODELSAMPLE} -f0 1 -bs {BATCHSIZE} -g {USEGPU} -te {MODELEPOCH} -se {EPOCHSAVE} -pg pretrained/f0G{MODELSAMPLE}.pth -pd pretrained/f0D{MODELSAMPLE}.pth -l {ONLYLATEST} -c {CACHEDATA}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "haYA81hySuDl"
},
"outputs": [],
"source": [
"# @title 删除其它pth只留选中的慎点仔细看代码\n",
"# @markdown 模型名\n",
"MODELNAME = \"mymodel\" # @param {type:\"string\"}\n",
"# @markdown 选中模型epoch\n",
"MODELEPOCH = 3200 # @param {type:\"integer\"}\n",
"\n",
"!echo \"备份选中的模型。。。\"\n",
"!cp /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/G_{MODELEPOCH}.pth /content/{MODELNAME}_D_{MODELEPOCH}.pth\n",
"!cp /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/D_{MODELEPOCH}.pth /content/{MODELNAME}_G_{MODELEPOCH}.pth\n",
"\n",
"!echo \"正在删除。。。\"\n",
"!ls /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}\n",
"!rm /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/*.pth\n",
"\n",
"!echo \"恢复选中的模型。。。\"\n",
"!mv /content/{MODELNAME}_D_{MODELEPOCH}.pth /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/G_{MODELEPOCH}.pth\n",
"!mv /content/{MODELNAME}_G_{MODELEPOCH}.pth /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/D_{MODELEPOCH}.pth\n",
"\n",
"!echo \"删除完成\"\n",
"!ls /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "QhSiPTVPoIRh"
},
"outputs": [],
"source": [
"# @title 清除项目下所有文件,只留选中的模型(慎点,仔细看代码)\n",
"# @markdown 模型名\n",
"MODELNAME = \"mymodel\" # @param {type:\"string\"}\n",
"# @markdown 选中模型epoch\n",
"MODELEPOCH = 3200 # @param {type:\"integer\"}\n",
"\n",
"!echo \"备份选中的模型。。。\"\n",
"!cp /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/G_{MODELEPOCH}.pth /content/{MODELNAME}_D_{MODELEPOCH}.pth\n",
"!cp /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/D_{MODELEPOCH}.pth /content/{MODELNAME}_G_{MODELEPOCH}.pth\n",
"\n",
"!echo \"正在删除。。。\"\n",
"!ls /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}\n",
"!rm -rf /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/*\n",
"\n",
"!echo \"恢复选中的模型。。。\"\n",
"!mv /content/{MODELNAME}_D_{MODELEPOCH}.pth /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/G_{MODELEPOCH}.pth\n",
"!mv /content/{MODELNAME}_G_{MODELEPOCH}.pth /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/D_{MODELEPOCH}.pth\n",
"\n",
"!echo \"删除完成\"\n",
"!ls /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}"
]
}
],
"metadata": {
"colab": {
"private_outputs": true,
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

352
tools/ipynb/v2.ipynb Normal file
View File

@@ -0,0 +1,352 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "QTSdqTqGcbyr"
},
"source": [
"# [Retrieval-based-Voice-Conversion-WebUI](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI) Training notebook"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "ZFFCx5J80SGa"
},
"source": [
"[![RVC v2](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/main/tools/colab/v2.ipynb)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "GmFP6bN9dvOq"
},
"outputs": [],
"source": [
"# @title #查看显卡\n",
"!nvidia-smi"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "jwu07JgqoFON"
},
"outputs": [],
"source": [
"# @title 挂载谷歌云盘\n",
"\n",
"from google.colab import drive\n",
"\n",
"drive.mount(\"/content/drive\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "wjddIFr1oS3W"
},
"outputs": [],
"source": [
"# @title #安装依赖\n",
"!apt -y install build-essential python3-dev ffmpeg\n",
"!pip3 install --upgrade setuptools wheel\n",
"!pip3 install --upgrade pip"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "ge_97mfpgqTm"
},
"outputs": [],
"source": [
"# @title #克隆仓库\n",
"\n",
"!git clone --depth=1 -b v2.2 https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI\n",
"%cd /content/Retrieval-based-Voice-Conversion-WebUI"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "BLDEZADkvlw1"
},
"outputs": [],
"source": [
"# @title 安装依赖\n",
"!pip install -r requirements.txt"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "pqE0PrnuRqI2"
},
"outputs": [],
"source": [
"# @title 下载安装 RVC-Models-Downloader\n",
"!wget https://github.com/RVC-Project/RVC-Models-Downloader/releases/download/v0.2.1/rvcmd_linux_amd64.deb\n",
"!apt install ./rvcmd_linux_amd64.deb"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "UG3XpUwEomUz"
},
"outputs": [],
"source": [
"# @title 下载所需资源\n",
"!rvcmd -notrs -w 1 -notui assets/all"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "Mwk7Q0Loqzjx"
},
"outputs": [],
"source": [
"# @title #从谷歌云盘加载打包好的数据集到/content/dataset\n",
"\n",
"# @markdown 数据集位置\n",
"DATASET = \"/content/drive/MyDrive/mydataset.zip\" # @param {type:\"string\"}\n",
"\n",
"!mkdir -p /content/dataset\n",
"!unzip -d /content/dataset -B {DATASET}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "PDlFxWHWEynD"
},
"outputs": [],
"source": [
"# @title #重命名数据集中的重名文件\n",
"!ls -a /content/dataset/\n",
"!rename 's/(\\w+)\\.(\\w+)~(\\d*)/$1_$3.$2/' /content/dataset/*.*~*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "7vh6vphDwO0b"
},
"outputs": [],
"source": [
"# @title #启动webui\n",
"%cd /content/Retrieval-based-Voice-Conversion-WebUI\n",
"# %load_ext tensorboard\n",
"# %tensorboard --logdir /content/Retrieval-based-Voice-Conversion-WebUI/logs\n",
"!python3 infer-web.py --colab --pycmd python3"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "FgJuNeAwx5Y_"
},
"outputs": [],
"source": [
"# @title #手动将训练后的模型文件备份到谷歌云盘\n",
"# @markdown #需要自己查看logs文件夹下模型的文件名手动修改下方命令末尾的文件名\n",
"\n",
"# @markdown #模型名\n",
"MODELNAME = \"mymodel\" # @param {type:\"string\"}\n",
"# @markdown #模型epoch\n",
"MODELEPOCH = 3200 # @param {type:\"integer\"}\n",
"\n",
"!cp /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/G_{MODELEPOCH}.pth /content/drive/MyDrive/{MODELNAME}_D_{MODELEPOCH}.pth\n",
"!cp /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/D_{MODELEPOCH}.pth /content/drive/MyDrive/{MODELNAME}_G_{MODELEPOCH}.pth\n",
"!cp /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/added_*.index /content/drive/MyDrive/\n",
"!cp /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/total_*.npy /content/drive/MyDrive/\n",
"\n",
"!cp /content/Retrieval-based-Voice-Conversion-WebUI/weights/{MODELNAME}.pth /content/drive/MyDrive/{MODELNAME}{MODELEPOCH}.pth"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "OVQoLQJXS7WX"
},
"outputs": [],
"source": [
"# @title 从谷歌云盘恢复pth\n",
"# @markdown 需要自己查看logs文件夹下模型的文件名手动修改下方命令末尾的文件名\n",
"\n",
"# @markdown 模型名\n",
"MODELNAME = \"mymodel\" # @param {type:\"string\"}\n",
"# @markdown 模型epoch\n",
"MODELEPOCH = 3200 # @param {type:\"integer\"}\n",
"\n",
"!mkdir -p /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}\n",
"\n",
"!cp /content/drive/MyDrive/{MODELNAME}_D_{MODELEPOCH}.pth /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/G_{MODELEPOCH}.pth\n",
"!cp /content/drive/MyDrive/{MODELNAME}_G_{MODELEPOCH}.pth /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/D_{MODELEPOCH}.pth\n",
"!cp /content/drive/MyDrive/*.index /content/\n",
"!cp /content/drive/MyDrive/*.npy /content/\n",
"!cp /content/drive/MyDrive/{MODELNAME}{MODELEPOCH}.pth /content/Retrieval-based-Voice-Conversion-WebUI/weights/{MODELNAME}.pth"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "ZKAyuKb9J6dz"
},
"outputs": [],
"source": [
"# @title 手动预处理(不推荐)\n",
"# @markdown 模型名\n",
"MODELNAME = \"mymodel\" # @param {type:\"string\"}\n",
"# @markdown 采样率\n",
"BITRATE = 48000 # @param {type:\"integer\"}\n",
"# @markdown 使用的进程数\n",
"THREADCOUNT = 8 # @param {type:\"integer\"}\n",
"\n",
"!python3 trainset_preprocess_pipeline_print.py /content/dataset {BITRATE} {THREADCOUNT} logs/{MODELNAME} True"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "CrxJqzAUKmPJ"
},
"outputs": [],
"source": [
"# @title 手动提取特征(不推荐)\n",
"# @markdown 模型名\n",
"MODELNAME = \"mymodel\" # @param {type:\"string\"}\n",
"# @markdown 使用的进程数\n",
"THREADCOUNT = 8 # @param {type:\"integer\"}\n",
"# @markdown 音高提取算法\n",
"ALGO = \"harvest\" # @param {type:\"string\"}\n",
"\n",
"!python3 extract_f0_print.py logs/{MODELNAME} {THREADCOUNT} {ALGO}\n",
"\n",
"!python3 extract_feature_print.py cpu 1 0 0 logs/{MODELNAME} True"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "IMLPLKOaKj58"
},
"outputs": [],
"source": [
"# @title 手动训练(不推荐)\n",
"# @markdown 模型名\n",
"MODELNAME = \"mymodel\" # @param {type:\"string\"}\n",
"# @markdown 使用的GPU\n",
"USEGPU = \"0\" # @param {type:\"string\"}\n",
"# @markdown 批大小\n",
"BATCHSIZE = 32 # @param {type:\"integer\"}\n",
"# @markdown 停止的epoch\n",
"MODELEPOCH = 3200 # @param {type:\"integer\"}\n",
"# @markdown 保存epoch间隔\n",
"EPOCHSAVE = 100 # @param {type:\"integer\"}\n",
"# @markdown 采样率\n",
"MODELSAMPLE = \"48k\" # @param {type:\"string\"}\n",
"# @markdown 是否缓存训练集\n",
"CACHEDATA = 1 # @param {type:\"integer\"}\n",
"# @markdown 是否仅保存最新的ckpt文件\n",
"ONLYLATEST = 0 # @param {type:\"integer\"}\n",
"\n",
"!python3 train_nsf_sim_cache_sid_load_pretrain.py -e lulu -sr {MODELSAMPLE} -f0 1 -bs {BATCHSIZE} -g {USEGPU} -te {MODELEPOCH} -se {EPOCHSAVE} -pg pretrained/f0G{MODELSAMPLE}.pth -pd pretrained/f0D{MODELSAMPLE}.pth -l {ONLYLATEST} -c {CACHEDATA}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "haYA81hySuDl"
},
"outputs": [],
"source": [
"# @title 删除其它pth只留选中的慎点仔细看代码\n",
"# @markdown 模型名\n",
"MODELNAME = \"mymodel\" # @param {type:\"string\"}\n",
"# @markdown 选中模型epoch\n",
"MODELEPOCH = 3200 # @param {type:\"integer\"}\n",
"\n",
"!echo \"备份选中的模型。。。\"\n",
"!cp /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/G_{MODELEPOCH}.pth /content/{MODELNAME}_D_{MODELEPOCH}.pth\n",
"!cp /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/D_{MODELEPOCH}.pth /content/{MODELNAME}_G_{MODELEPOCH}.pth\n",
"\n",
"!echo \"正在删除。。。\"\n",
"!ls /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}\n",
"!rm /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/*.pth\n",
"\n",
"!echo \"恢复选中的模型。。。\"\n",
"!mv /content/{MODELNAME}_D_{MODELEPOCH}.pth /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/G_{MODELEPOCH}.pth\n",
"!mv /content/{MODELNAME}_G_{MODELEPOCH}.pth /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/D_{MODELEPOCH}.pth\n",
"\n",
"!echo \"删除完成\"\n",
"!ls /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "QhSiPTVPoIRh"
},
"outputs": [],
"source": [
"# @title 清除项目下所有文件,只留选中的模型(慎点,仔细看代码)\n",
"# @markdown 模型名\n",
"MODELNAME = \"mymodel\" # @param {type:\"string\"}\n",
"# @markdown 选中模型epoch\n",
"MODELEPOCH = 3200 # @param {type:\"integer\"}\n",
"\n",
"!echo \"备份选中的模型。。。\"\n",
"!cp /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/G_{MODELEPOCH}.pth /content/{MODELNAME}_D_{MODELEPOCH}.pth\n",
"!cp /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/D_{MODELEPOCH}.pth /content/{MODELNAME}_G_{MODELEPOCH}.pth\n",
"\n",
"!echo \"正在删除。。。\"\n",
"!ls /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}\n",
"!rm -rf /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/*\n",
"\n",
"!echo \"恢复选中的模型。。。\"\n",
"!mv /content/{MODELNAME}_D_{MODELEPOCH}.pth /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/G_{MODELEPOCH}.pth\n",
"!mv /content/{MODELNAME}_G_{MODELEPOCH}.pth /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/D_{MODELEPOCH}.pth\n",
"\n",
"!echo \"删除完成\"\n",
"!ls /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}"
]
}
],
"metadata": {
"colab": {
"private_outputs": true,
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

View File

@@ -1,6 +1,6 @@
import soundfile
from ..infer.lib.infer_pack.onnx_inference import OnnxRVC
from infer.lib.infer_pack.onnx_inference import OnnxRVC
hop_size = 512
sampling_rate = 40000 # 采样率

View File

@@ -1,445 +0,0 @@
from io import BytesIO
import os
import pickle
import sys
import traceback
from infer.lib import jit
from infer.lib.jit.get_synthesizer import get_synthesizer
from time import time as ttime
import fairseq
import faiss
import numpy as np
import parselmouth
import pyworld
import scipy.signal as signal
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchcrepe
from infer.lib.infer_pack.models import (
SynthesizerTrnMs256NSFsid,
SynthesizerTrnMs256NSFsid_nono,
SynthesizerTrnMs768NSFsid,
SynthesizerTrnMs768NSFsid_nono,
)
now_dir = os.getcwd()
sys.path.append(now_dir)
from multiprocessing import Manager as M
from configs.config import Config
# config = Config()
mm = M()
def printt(strr, *args):
if len(args) == 0:
print(strr)
else:
print(strr % args)
# config.device=torch.device("cpu")########强制cpu测试
# config.is_half=False########强制cpu测试
class RVC:
def __init__(
self,
key,
pth_path,
index_path,
index_rate,
n_cpu,
inp_q,
opt_q,
config: Config,
last_rvc=None,
) -> None:
"""
初始化
"""
try:
if config.dml == True:
def forward_dml(ctx, x, scale):
ctx.scale = scale
res = x.clone().detach()
return res
fairseq.modules.grad_multiply.GradMultiply.forward = forward_dml
# global config
self.config = config
self.inp_q = inp_q
self.opt_q = opt_q
# device="cpu"########强制cpu测试
self.device = config.device
self.f0_up_key = key
self.f0_min = 50
self.f0_max = 1100
self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)
self.n_cpu = n_cpu
self.use_jit = self.config.use_jit
self.is_half = config.is_half
if index_rate != 0:
self.index = faiss.read_index(index_path)
self.big_npy = self.index.reconstruct_n(0, self.index.ntotal)
printt("Index search enabled")
self.pth_path: str = pth_path
self.index_path = index_path
self.index_rate = index_rate
self.cache_pitch: torch.Tensor = torch.zeros(
1024, device=self.device, dtype=torch.long
)
self.cache_pitchf = torch.zeros(
1024, device=self.device, dtype=torch.float32
)
if last_rvc is None:
models, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
["assets/hubert/hubert_base.pt"],
suffix="",
)
hubert_model = models[0]
hubert_model = hubert_model.to(self.device)
if self.is_half:
hubert_model = hubert_model.half()
else:
hubert_model = hubert_model.float()
hubert_model.eval()
self.model = hubert_model
else:
self.model = last_rvc.model
self.net_g: nn.Module = None
def set_default_model():
self.net_g, cpt = get_synthesizer(self.pth_path, self.device)
self.tgt_sr = cpt["config"][-1]
cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]
self.if_f0 = cpt.get("f0", 1)
self.version = cpt.get("version", "v1")
if self.is_half:
self.net_g = self.net_g.half()
else:
self.net_g = self.net_g.float()
def set_jit_model():
jit_pth_path = self.pth_path.rstrip(".pth")
jit_pth_path += ".half.jit" if self.is_half else ".jit"
reload = False
if str(self.device) == "cuda":
self.device = torch.device("cuda:0")
if os.path.exists(jit_pth_path):
cpt = jit.load(jit_pth_path)
model_device = cpt["device"]
if model_device != str(self.device):
reload = True
else:
reload = True
if reload:
cpt = jit.synthesizer_jit_export(
self.pth_path,
"script",
None,
device=self.device,
is_half=self.is_half,
)
self.tgt_sr = cpt["config"][-1]
self.if_f0 = cpt.get("f0", 1)
self.version = cpt.get("version", "v1")
self.net_g = torch.jit.load(
BytesIO(cpt["model"]), map_location=self.device
)
self.net_g.infer = self.net_g.forward
self.net_g.eval().to(self.device)
def set_synthesizer():
if self.use_jit and not config.dml:
if self.is_half and "cpu" in str(self.device):
printt(
"Use default Synthesizer model. \
Jit is not supported on the CPU for half floating point"
)
set_default_model()
else:
set_jit_model()
else:
set_default_model()
if last_rvc is None or last_rvc.pth_path != self.pth_path:
set_synthesizer()
else:
self.tgt_sr = last_rvc.tgt_sr
self.if_f0 = last_rvc.if_f0
self.version = last_rvc.version
self.is_half = last_rvc.is_half
if last_rvc.use_jit != self.use_jit:
set_synthesizer()
else:
self.net_g = last_rvc.net_g
if last_rvc is not None and hasattr(last_rvc, "model_rmvpe"):
self.model_rmvpe = last_rvc.model_rmvpe
if last_rvc is not None and hasattr(last_rvc, "model_fcpe"):
self.device_fcpe = last_rvc.device_fcpe
self.model_fcpe = last_rvc.model_fcpe
except:
printt(traceback.format_exc())
def change_key(self, new_key):
self.f0_up_key = new_key
def change_index_rate(self, new_index_rate):
if new_index_rate != 0 and self.index_rate == 0:
self.index = faiss.read_index(self.index_path)
self.big_npy = self.index.reconstruct_n(0, self.index.ntotal)
printt("Index search enabled")
self.index_rate = new_index_rate
def get_f0_post(self, f0):
if not torch.is_tensor(f0):
f0 = torch.from_numpy(f0)
f0 = f0.float().to(self.device).squeeze()
f0_mel = 1127 * torch.log(1 + f0 / 700)
f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * 254 / (
self.f0_mel_max - self.f0_mel_min
) + 1
f0_mel[f0_mel <= 1] = 1
f0_mel[f0_mel > 255] = 255
f0_coarse = torch.round(f0_mel).long()
return f0_coarse, f0
def get_f0(self, x, f0_up_key, n_cpu, method="harvest"):
n_cpu = int(n_cpu)
if method == "crepe":
return self.get_f0_crepe(x, f0_up_key)
if method == "rmvpe":
return self.get_f0_rmvpe(x, f0_up_key)
if method == "fcpe":
return self.get_f0_fcpe(x, f0_up_key)
x = x.cpu().numpy()
if method == "pm":
p_len = x.shape[0] // 160 + 1
f0_min = 65
l_pad = int(np.ceil(1.5 / f0_min * 16000))
r_pad = l_pad + 1
s = parselmouth.Sound(np.pad(x, (l_pad, r_pad)), 16000).to_pitch_ac(
time_step=0.01,
voicing_threshold=0.6,
pitch_floor=f0_min,
pitch_ceiling=1100,
)
assert np.abs(s.t1 - 1.5 / f0_min) < 0.001
f0 = s.selected_array["frequency"]
if len(f0) < p_len:
f0 = np.pad(f0, (0, p_len - len(f0)))
f0 = f0[:p_len]
f0 *= pow(2, f0_up_key / 12)
return self.get_f0_post(f0)
if n_cpu == 1:
f0, t = pyworld.harvest(
x.astype(np.double),
fs=16000,
f0_ceil=1100,
f0_floor=50,
frame_period=10,
)
f0 = signal.medfilt(f0, 3)
f0 *= pow(2, f0_up_key / 12)
return self.get_f0_post(f0)
f0bak = np.zeros(x.shape[0] // 160 + 1, dtype=np.float64)
length = len(x)
part_length = 160 * ((length // 160 - 1) // n_cpu + 1)
n_cpu = (length // 160 - 1) // (part_length // 160) + 1
ts = ttime()
res_f0 = mm.dict()
for idx in range(n_cpu):
tail = part_length * (idx + 1) + 320
if idx == 0:
self.inp_q.put((idx, x[:tail], res_f0, n_cpu, ts))
else:
self.inp_q.put(
(idx, x[part_length * idx - 320 : tail], res_f0, n_cpu, ts)
)
while 1:
res_ts = self.opt_q.get()
if res_ts == ts:
break
f0s = [i[1] for i in sorted(res_f0.items(), key=lambda x: x[0])]
for idx, f0 in enumerate(f0s):
if idx == 0:
f0 = f0[:-3]
elif idx != n_cpu - 1:
f0 = f0[2:-3]
else:
f0 = f0[2:]
f0bak[part_length * idx // 160 : part_length * idx // 160 + f0.shape[0]] = (
f0
)
f0bak = signal.medfilt(f0bak, 3)
f0bak *= pow(2, f0_up_key / 12)
return self.get_f0_post(f0bak)
def get_f0_crepe(self, x, f0_up_key):
if "privateuseone" in str(
self.device
): ###不支持dmlcpu又太慢用不成拿fcpe顶替
return self.get_f0(x, f0_up_key, 1, "fcpe")
# printt("using crepe,device:%s"%self.device)
f0, pd = torchcrepe.predict(
x.unsqueeze(0).float(),
16000,
160,
self.f0_min,
self.f0_max,
"full",
batch_size=512,
# device=self.device if self.device.type!="privateuseone" else "cpu",###crepe不用半精度全部是全精度所以不愁###cpu延迟高到没法用
device=self.device,
return_periodicity=True,
)
pd = torchcrepe.filter.median(pd, 3)
f0 = torchcrepe.filter.mean(f0, 3)
f0[pd < 0.1] = 0
f0 *= pow(2, f0_up_key / 12)
return self.get_f0_post(f0)
def get_f0_rmvpe(self, x, f0_up_key):
if hasattr(self, "model_rmvpe") == False:
from infer.lib.rmvpe import RMVPE
printt("Loading rmvpe model")
self.model_rmvpe = RMVPE(
"assets/rmvpe/rmvpe.pt",
is_half=self.is_half,
device=self.device,
use_jit=self.config.use_jit,
)
f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
f0 *= pow(2, f0_up_key / 12)
return self.get_f0_post(f0)
def get_f0_fcpe(self, x, f0_up_key):
if hasattr(self, "model_fcpe") == False:
from torchfcpe import spawn_bundled_infer_model
printt("Loading fcpe model")
if "privateuseone" in str(self.device):
self.device_fcpe = "cpu"
else:
self.device_fcpe = self.device
self.model_fcpe = spawn_bundled_infer_model(self.device_fcpe)
f0 = self.model_fcpe.infer(
x.to(self.device_fcpe).unsqueeze(0).float(),
sr=16000,
decoder_mode="local_argmax",
threshold=0.006,
)
f0 *= pow(2, f0_up_key / 12)
return self.get_f0_post(f0)
def infer(
self,
input_wav: torch.Tensor,
block_frame_16k,
skip_head,
return_length,
f0method,
) -> np.ndarray:
t1 = ttime()
with torch.no_grad():
if self.config.is_half:
feats = input_wav.half().view(1, -1)
else:
feats = input_wav.float().view(1, -1)
padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
inputs = {
"source": feats,
"padding_mask": padding_mask,
"output_layer": 9 if self.version == "v1" else 12,
}
logits = self.model.extract_features(**inputs)
feats = (
self.model.final_proj(logits[0]) if self.version == "v1" else logits[0]
)
feats = torch.cat((feats, feats[:, -1:, :]), 1)
t2 = ttime()
try:
if hasattr(self, "index") and self.index_rate != 0:
npy = feats[0][skip_head // 2 :].cpu().numpy().astype("float32")
score, ix = self.index.search(npy, k=8)
if (ix >= 0).all():
weight = np.square(1 / score)
weight /= weight.sum(axis=1, keepdims=True)
npy = np.sum(
self.big_npy[ix] * np.expand_dims(weight, axis=2), axis=1
)
if self.config.is_half:
npy = npy.astype("float16")
feats[0][skip_head // 2 :] = (
torch.from_numpy(npy).unsqueeze(0).to(self.device)
* self.index_rate
+ (1 - self.index_rate) * feats[0][skip_head // 2 :]
)
else:
printt(
"Invalid index. You MUST use added_xxxx.index but not trained_xxxx.index!"
)
else:
printt("Index search FAILED or disabled")
except:
traceback.print_exc()
printt("Index search FAILED")
t3 = ttime()
p_len = input_wav.shape[0] // 160
if self.if_f0 == 1:
f0_extractor_frame = block_frame_16k + 800
if f0method == "rmvpe":
f0_extractor_frame = 5120 * ((f0_extractor_frame - 1) // 5120 + 1) - 160
pitch, pitchf = self.get_f0(
input_wav[-f0_extractor_frame:], self.f0_up_key, self.n_cpu, f0method
)
shift = block_frame_16k // 160
self.cache_pitch[:-shift] = self.cache_pitch[shift:].clone()
self.cache_pitchf[:-shift] = self.cache_pitchf[shift:].clone()
self.cache_pitch[4 - pitch.shape[0] :] = pitch[3:-1]
self.cache_pitchf[4 - pitch.shape[0] :] = pitchf[3:-1]
cache_pitch = self.cache_pitch[None, -p_len:]
cache_pitchf = self.cache_pitchf[None, -p_len:]
t4 = ttime()
feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
feats = feats[:, :p_len, :]
p_len = torch.LongTensor([p_len]).to(self.device)
sid = torch.LongTensor([0]).to(self.device)
skip_head = torch.LongTensor([skip_head])
return_length = torch.LongTensor([return_length])
with torch.no_grad():
if self.if_f0 == 1:
infered_audio, _, _ = self.net_g.infer(
feats,
p_len,
cache_pitch,
cache_pitchf,
sid,
skip_head,
return_length,
)
else:
infered_audio, _, _ = self.net_g.infer(
feats, p_len, sid, skip_head, return_length
)
t5 = ttime()
printt(
"Spent time: fea = %.3fs, index = %.3fs, f0 = %.3fs, model = %.3fs",
t2 - t1,
t3 - t2,
t4 - t3,
t5 - t4,
)
return infered_audio.squeeze().float()

View File

@@ -1,13 +0,0 @@
"""
TorchGating is a PyTorch-based implementation of Spectral Gating
================================================
Author: Asaf Zorea
Contents
--------
torchgate imports all the functions from PyTorch, and in addition provides:
TorchGating --- A PyTorch module that applies a spectral gate to an input signal
"""
from .torchgate import TorchGate

View File

@@ -1,280 +0,0 @@
import torch
from infer.lib.rmvpe import STFT
from torch.nn.functional import conv1d, conv2d
from typing import Union, Optional
from .utils import linspace, temperature_sigmoid, amp_to_db
class TorchGate(torch.nn.Module):
"""
A PyTorch module that applies a spectral gate to an input signal.
Arguments:
sr {int} -- Sample rate of the input signal.
nonstationary {bool} -- Whether to use non-stationary or stationary masking (default: {False}).
n_std_thresh_stationary {float} -- Number of standard deviations above mean to threshold noise for
stationary masking (default: {1.5}).
n_thresh_nonstationary {float} -- Number of multiplies above smoothed magnitude spectrogram. for
non-stationary masking (default: {1.3}).
temp_coeff_nonstationary {float} -- Temperature coefficient for non-stationary masking (default: {0.1}).
n_movemean_nonstationary {int} -- Number of samples for moving average smoothing in non-stationary masking
(default: {20}).
prop_decrease {float} -- Proportion to decrease signal by where the mask is zero (default: {1.0}).
n_fft {int} -- Size of FFT for STFT (default: {1024}).
win_length {[int]} -- Window length for STFT. If None, defaults to `n_fft` (default: {None}).
hop_length {[int]} -- Hop length for STFT. If None, defaults to `win_length` // 4 (default: {None}).
freq_mask_smooth_hz {float} -- Frequency smoothing width for mask (in Hz). If None, no smoothing is applied
(default: {500}).
time_mask_smooth_ms {float} -- Time smoothing width for mask (in ms). If None, no smoothing is applied
(default: {50}).
"""
@torch.no_grad()
def __init__(
self,
sr: int,
nonstationary: bool = False,
n_std_thresh_stationary: float = 1.5,
n_thresh_nonstationary: float = 1.3,
temp_coeff_nonstationary: float = 0.1,
n_movemean_nonstationary: int = 20,
prop_decrease: float = 1.0,
n_fft: int = 1024,
win_length: bool = None,
hop_length: int = None,
freq_mask_smooth_hz: float = 500,
time_mask_smooth_ms: float = 50,
):
super().__init__()
# General Params
self.sr = sr
self.nonstationary = nonstationary
assert 0.0 <= prop_decrease <= 1.0
self.prop_decrease = prop_decrease
# STFT Params
self.n_fft = n_fft
self.win_length = self.n_fft if win_length is None else win_length
self.hop_length = self.win_length // 4 if hop_length is None else hop_length
# Stationary Params
self.n_std_thresh_stationary = n_std_thresh_stationary
# Non-Stationary Params
self.temp_coeff_nonstationary = temp_coeff_nonstationary
self.n_movemean_nonstationary = n_movemean_nonstationary
self.n_thresh_nonstationary = n_thresh_nonstationary
# Smooth Mask Params
self.freq_mask_smooth_hz = freq_mask_smooth_hz
self.time_mask_smooth_ms = time_mask_smooth_ms
self.register_buffer("smoothing_filter", self._generate_mask_smoothing_filter())
@torch.no_grad()
def _generate_mask_smoothing_filter(self) -> Union[torch.Tensor, None]:
"""
A PyTorch module that applies a spectral gate to an input signal using the STFT.
Returns:
smoothing_filter (torch.Tensor): a 2D tensor representing the smoothing filter,
with shape (n_grad_freq, n_grad_time), where n_grad_freq is the number of frequency
bins to smooth and n_grad_time is the number of time frames to smooth.
If both self.freq_mask_smooth_hz and self.time_mask_smooth_ms are None, returns None.
"""
if self.freq_mask_smooth_hz is None and self.time_mask_smooth_ms is None:
return None
n_grad_freq = (
1
if self.freq_mask_smooth_hz is None
else int(self.freq_mask_smooth_hz / (self.sr / (self.n_fft / 2)))
)
if n_grad_freq < 1:
raise ValueError(
f"freq_mask_smooth_hz needs to be at least {int((self.sr / (self._n_fft / 2)))} Hz"
)
n_grad_time = (
1
if self.time_mask_smooth_ms is None
else int(self.time_mask_smooth_ms / ((self.hop_length / self.sr) * 1000))
)
if n_grad_time < 1:
raise ValueError(
f"time_mask_smooth_ms needs to be at least {int((self.hop_length / self.sr) * 1000)} ms"
)
if n_grad_time == 1 and n_grad_freq == 1:
return None
v_f = torch.cat(
[
linspace(0, 1, n_grad_freq + 1, endpoint=False),
linspace(1, 0, n_grad_freq + 2),
]
)[1:-1]
v_t = torch.cat(
[
linspace(0, 1, n_grad_time + 1, endpoint=False),
linspace(1, 0, n_grad_time + 2),
]
)[1:-1]
smoothing_filter = torch.outer(v_f, v_t).unsqueeze(0).unsqueeze(0)
return smoothing_filter / smoothing_filter.sum()
@torch.no_grad()
def _stationary_mask(
self, X_db: torch.Tensor, xn: Optional[torch.Tensor] = None
) -> torch.Tensor:
"""
Computes a stationary binary mask to filter out noise in a log-magnitude spectrogram.
Arguments:
X_db (torch.Tensor): 2D tensor of shape (frames, freq_bins) containing the log-magnitude spectrogram.
xn (torch.Tensor): 1D tensor containing the audio signal corresponding to X_db.
Returns:
sig_mask (torch.Tensor): Binary mask of the same shape as X_db, where values greater than the threshold
are set to 1, and the rest are set to 0.
"""
if xn is not None:
if "privateuseone" in str(xn.device):
if not hasattr(self, "stft"):
self.stft = STFT(
filter_length=self.n_fft,
hop_length=self.hop_length,
win_length=self.win_length,
window="hann",
).to(xn.device)
XN = self.stft.transform(xn)
else:
XN = torch.stft(
xn,
n_fft=self.n_fft,
hop_length=self.hop_length,
win_length=self.win_length,
return_complex=True,
pad_mode="constant",
center=True,
window=torch.hann_window(self.win_length).to(xn.device),
)
XN_db = amp_to_db(XN).to(dtype=X_db.dtype)
else:
XN_db = X_db
# calculate mean and standard deviation along the frequency axis
std_freq_noise, mean_freq_noise = torch.std_mean(XN_db, dim=-1)
# compute noise threshold
noise_thresh = mean_freq_noise + std_freq_noise * self.n_std_thresh_stationary
# create binary mask by thresholding the spectrogram
sig_mask = X_db > noise_thresh.unsqueeze(2)
return sig_mask
@torch.no_grad()
def _nonstationary_mask(self, X_abs: torch.Tensor) -> torch.Tensor:
"""
Computes a non-stationary binary mask to filter out noise in a log-magnitude spectrogram.
Arguments:
X_abs (torch.Tensor): 2D tensor of shape (frames, freq_bins) containing the magnitude spectrogram.
Returns:
sig_mask (torch.Tensor): Binary mask of the same shape as X_abs, where values greater than the threshold
are set to 1, and the rest are set to 0.
"""
X_smoothed = (
conv1d(
X_abs.reshape(-1, 1, X_abs.shape[-1]),
torch.ones(
self.n_movemean_nonstationary,
dtype=X_abs.dtype,
device=X_abs.device,
).view(1, 1, -1),
padding="same",
).view(X_abs.shape)
/ self.n_movemean_nonstationary
)
# Compute slowness ratio and apply temperature sigmoid
slowness_ratio = (X_abs - X_smoothed) / (X_smoothed + 1e-6)
sig_mask = temperature_sigmoid(
slowness_ratio, self.n_thresh_nonstationary, self.temp_coeff_nonstationary
)
return sig_mask
def forward(
self, x: torch.Tensor, xn: Optional[torch.Tensor] = None
) -> torch.Tensor:
"""
Apply the proposed algorithm to the input signal.
Arguments:
x (torch.Tensor): The input audio signal, with shape (batch_size, signal_length).
xn (Optional[torch.Tensor]): The noise signal used for stationary noise reduction. If `None`, the input
signal is used as the noise signal. Default: `None`.
Returns:
torch.Tensor: The denoised audio signal, with the same shape as the input signal.
"""
# Compute short-time Fourier transform (STFT)
if "privateuseone" in str(x.device):
if not hasattr(self, "stft"):
self.stft = STFT(
filter_length=self.n_fft,
hop_length=self.hop_length,
win_length=self.win_length,
window="hann",
).to(x.device)
X, phase = self.stft.transform(x, return_phase=True)
else:
X = torch.stft(
x,
n_fft=self.n_fft,
hop_length=self.hop_length,
win_length=self.win_length,
return_complex=True,
pad_mode="constant",
center=True,
window=torch.hann_window(self.win_length).to(x.device),
)
# Compute signal mask based on stationary or nonstationary assumptions
if self.nonstationary:
sig_mask = self._nonstationary_mask(X.abs())
else:
sig_mask = self._stationary_mask(amp_to_db(X), xn)
# Propagate decrease in signal power
sig_mask = self.prop_decrease * (sig_mask.float() - 1.0) + 1.0
# Smooth signal mask with 2D convolution
if self.smoothing_filter is not None:
sig_mask = conv2d(
sig_mask.unsqueeze(1),
self.smoothing_filter.to(sig_mask.dtype),
padding="same",
)
# Apply signal mask to STFT magnitude and phase components
Y = X * sig_mask.squeeze(1)
# Inverse STFT to obtain time-domain signal
if "privateuseone" in str(Y.device):
y = self.stft.inverse(Y, phase)
else:
y = torch.istft(
Y,
n_fft=self.n_fft,
hop_length=self.hop_length,
win_length=self.win_length,
center=True,
window=torch.hann_window(self.win_length).to(Y.device),
)
return y.to(dtype=x.dtype)

View File

@@ -1,70 +0,0 @@
import torch
from torch.types import Number
@torch.no_grad()
def amp_to_db(
x: torch.Tensor, eps=torch.finfo(torch.float64).eps, top_db=40
) -> torch.Tensor:
"""
Convert the input tensor from amplitude to decibel scale.
Arguments:
x {[torch.Tensor]} -- [Input tensor.]
Keyword Arguments:
eps {[float]} -- [Small value to avoid numerical instability.]
(default: {torch.finfo(torch.float64).eps})
top_db {[float]} -- [threshold the output at ``top_db`` below the peak]
` (default: {40})
Returns:
[torch.Tensor] -- [Output tensor in decibel scale.]
"""
x_db = 20 * torch.log10(x.abs() + eps)
return torch.max(x_db, (x_db.max(-1).values - top_db).unsqueeze(-1))
@torch.no_grad()
def temperature_sigmoid(x: torch.Tensor, x0: float, temp_coeff: float) -> torch.Tensor:
"""
Apply a sigmoid function with temperature scaling.
Arguments:
x {[torch.Tensor]} -- [Input tensor.]
x0 {[float]} -- [Parameter that controls the threshold of the sigmoid.]
temp_coeff {[float]} -- [Parameter that controls the slope of the sigmoid.]
Returns:
[torch.Tensor] -- [Output tensor after applying the sigmoid with temperature scaling.]
"""
return torch.sigmoid((x - x0) / temp_coeff)
@torch.no_grad()
def linspace(
start: Number, stop: Number, num: int = 50, endpoint: bool = True, **kwargs
) -> torch.Tensor:
"""
Generate a linearly spaced 1-D tensor.
Arguments:
start {[Number]} -- [The starting value of the sequence.]
stop {[Number]} -- [The end value of the sequence, unless `endpoint` is set to False.
In that case, the sequence consists of all but the last of ``num + 1``
evenly spaced samples, so that `stop` is excluded. Note that the step
size changes when `endpoint` is False.]
Keyword Arguments:
num {[int]} -- [Number of samples to generate. Default is 50. Must be non-negative.]
endpoint {[bool]} -- [If True, `stop` is the last sample. Otherwise, it is not included.
Default is True.]
**kwargs -- [Additional arguments to be passed to the underlying PyTorch `linspace` function.]
Returns:
[torch.Tensor] -- [1-D tensor of `num` equally spaced samples from `start` to `stop`.]
"""
if endpoint:
return torch.linspace(start, stop, num, **kwargs)
else:
return torch.linspace(start, stop, num + 1, **kwargs)[:-1]