From 6f90ce30461e9155fc0bbef098207aec890af323 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=BA=90=E6=96=87=E9=9B=A8?= <41315874+fumiama@users.noreply.github.com> Date: Thu, 6 Jun 2024 21:34:45 +0900 Subject: [PATCH] optimize(uvr5): remove redundant files --- infer/lib/uvr5_pack/lib_v5/layers.py | 95 +++++----- .../lib/uvr5_pack/lib_v5/layers_123812KB .py | 118 ------------- infer/lib/uvr5_pack/lib_v5/layers_537227KB.py | 126 -------------- infer/lib/uvr5_pack/lib_v5/layers_new.py | 125 ------------- infer/lib/uvr5_pack/lib_v5/nets.py | 164 ++++++++++-------- infer/lib/uvr5_pack/lib_v5/nets_123812KB.py | 122 ------------- infer/lib/uvr5_pack/lib_v5/nets_537227KB.py | 123 ------------- infer/lib/uvr5_pack/lib_v5/nets_61968KB.py | 122 ------------- infer/lib/uvr5_pack/lib_v5/nets_new.py | 133 -------------- infer/lib/uvr5_pack/lib_v5/spec_utils.py | 152 ---------------- infer/lib/uvr5_pack/utils.py | 31 ---- infer/modules/uvr5/vr.py | 2 +- 12 files changed, 139 insertions(+), 1174 deletions(-) delete mode 100644 infer/lib/uvr5_pack/lib_v5/layers_123812KB .py delete mode 100644 infer/lib/uvr5_pack/lib_v5/layers_537227KB.py delete mode 100644 infer/lib/uvr5_pack/lib_v5/layers_new.py delete mode 100644 infer/lib/uvr5_pack/lib_v5/nets_123812KB.py delete mode 100644 infer/lib/uvr5_pack/lib_v5/nets_537227KB.py delete mode 100644 infer/lib/uvr5_pack/lib_v5/nets_61968KB.py delete mode 100644 infer/lib/uvr5_pack/lib_v5/nets_new.py diff --git a/infer/lib/uvr5_pack/lib_v5/layers.py b/infer/lib/uvr5_pack/lib_v5/layers.py index 4fc1b5c..44153b6 100644 --- a/infer/lib/uvr5_pack/lib_v5/layers.py +++ b/infer/lib/uvr5_pack/lib_v5/layers.py @@ -26,40 +26,17 @@ class Conv2DBNActiv(nn.Module): return self.conv(x) -class SeperableConv2DBNActiv(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): - super(SeperableConv2DBNActiv, self).__init__() - self.conv = nn.Sequential( - nn.Conv2d( - nin, - nin, - kernel_size=ksize, - stride=stride, - padding=pad, - dilation=dilation, - groups=nin, - bias=False, - ), - nn.Conv2d(nin, nout, kernel_size=1, bias=False), - nn.BatchNorm2d(nout), - activ(), - ) - - def __call__(self, x): - return self.conv(x) - - class Encoder(nn.Module): def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): super(Encoder, self).__init__() - self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) - self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) + self.conv1 = Conv2DBNActiv(nin, nout, ksize, stride, pad, activ=activ) + self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ) def __call__(self, x): - skip = self.conv1(x) - h = self.conv2(skip) + h = self.conv1(x) + h = self.conv2(h) - return h, skip + return h class Decoder(nn.Module): @@ -67,15 +44,19 @@ class Decoder(nn.Module): self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False ): super(Decoder, self).__init__() - self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) + self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) + # self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ) self.dropout = nn.Dropout2d(0.1) if dropout else None def __call__(self, x, skip=None): x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) + if skip is not None: skip = spec_utils.crop_center(skip, x) x = torch.cat([x, skip], dim=1) - h = self.conv(x) + + h = self.conv1(x) + # h = self.conv2(h) if self.dropout is not None: h = self.dropout(h) @@ -84,25 +65,24 @@ class Decoder(nn.Module): class ASPPModule(nn.Module): - def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU): + def __init__(self, nin, nout, dilations=(4, 8, 12), activ=nn.ReLU, dropout=False): super(ASPPModule, self).__init__() self.conv1 = nn.Sequential( nn.AdaptiveAvgPool2d((1, None)), - Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), + Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ), ) - self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) - self.conv3 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[0], dilations[0], activ=activ + self.conv2 = Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ) + self.conv3 = Conv2DBNActiv( + nin, nout, 3, 1, dilations[0], dilations[0], activ=activ ) - self.conv4 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[1], dilations[1], activ=activ + self.conv4 = Conv2DBNActiv( + nin, nout, 3, 1, dilations[1], dilations[1], activ=activ ) - self.conv5 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.bottleneck = nn.Sequential( - Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) + self.conv5 = Conv2DBNActiv( + nin, nout, 3, 1, dilations[2], dilations[2], activ=activ ) + self.bottleneck = Conv2DBNActiv(nout * 5, nout, 1, 1, 0, activ=activ) + self.dropout = nn.Dropout2d(0.1) if dropout else None def forward(self, x): _, _, h, w = x.size() @@ -114,5 +94,32 @@ class ASPPModule(nn.Module): feat4 = self.conv4(x) feat5 = self.conv5(x) out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) - bottle = self.bottleneck(out) - return bottle + out = self.bottleneck(out) + + if self.dropout is not None: + out = self.dropout(out) + + return out + + +class LSTMModule(nn.Module): + def __init__(self, nin_conv, nin_lstm, nout_lstm): + super(LSTMModule, self).__init__() + self.conv = Conv2DBNActiv(nin_conv, 1, 1, 1, 0) + self.lstm = nn.LSTM( + input_size=nin_lstm, hidden_size=nout_lstm // 2, bidirectional=True + ) + self.dense = nn.Sequential( + nn.Linear(nout_lstm, nin_lstm), nn.BatchNorm1d(nin_lstm), nn.ReLU() + ) + + def forward(self, x): + N, _, nbins, nframes = x.size() + h = self.conv(x)[:, 0] # N, nbins, nframes + h = h.permute(2, 0, 1) # nframes, N, nbins + h, _ = self.lstm(h) + h = self.dense(h.reshape(-1, h.size()[-1])) # nframes * N, nbins + h = h.reshape(nframes, N, 1, nbins) + h = h.permute(1, 2, 3, 0) + + return h diff --git a/infer/lib/uvr5_pack/lib_v5/layers_123812KB .py b/infer/lib/uvr5_pack/lib_v5/layers_123812KB .py deleted file mode 100644 index 4fc1b5c..0000000 --- a/infer/lib/uvr5_pack/lib_v5/layers_123812KB .py +++ /dev/null @@ -1,118 +0,0 @@ -import torch -import torch.nn.functional as F -from torch import nn - -from . import spec_utils - - -class Conv2DBNActiv(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): - super(Conv2DBNActiv, self).__init__() - self.conv = nn.Sequential( - nn.Conv2d( - nin, - nout, - kernel_size=ksize, - stride=stride, - padding=pad, - dilation=dilation, - bias=False, - ), - nn.BatchNorm2d(nout), - activ(), - ) - - def __call__(self, x): - return self.conv(x) - - -class SeperableConv2DBNActiv(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): - super(SeperableConv2DBNActiv, self).__init__() - self.conv = nn.Sequential( - nn.Conv2d( - nin, - nin, - kernel_size=ksize, - stride=stride, - padding=pad, - dilation=dilation, - groups=nin, - bias=False, - ), - nn.Conv2d(nin, nout, kernel_size=1, bias=False), - nn.BatchNorm2d(nout), - activ(), - ) - - def __call__(self, x): - return self.conv(x) - - -class Encoder(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): - super(Encoder, self).__init__() - self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) - self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) - - def __call__(self, x): - skip = self.conv1(x) - h = self.conv2(skip) - - return h, skip - - -class Decoder(nn.Module): - def __init__( - self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False - ): - super(Decoder, self).__init__() - self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) - self.dropout = nn.Dropout2d(0.1) if dropout else None - - def __call__(self, x, skip=None): - x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) - if skip is not None: - skip = spec_utils.crop_center(skip, x) - x = torch.cat([x, skip], dim=1) - h = self.conv(x) - - if self.dropout is not None: - h = self.dropout(h) - - return h - - -class ASPPModule(nn.Module): - def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU): - super(ASPPModule, self).__init__() - self.conv1 = nn.Sequential( - nn.AdaptiveAvgPool2d((1, None)), - Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), - ) - self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) - self.conv3 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[0], dilations[0], activ=activ - ) - self.conv4 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[1], dilations[1], activ=activ - ) - self.conv5 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.bottleneck = nn.Sequential( - Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) - ) - - def forward(self, x): - _, _, h, w = x.size() - feat1 = F.interpolate( - self.conv1(x), size=(h, w), mode="bilinear", align_corners=True - ) - feat2 = self.conv2(x) - feat3 = self.conv3(x) - feat4 = self.conv4(x) - feat5 = self.conv5(x) - out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) - bottle = self.bottleneck(out) - return bottle diff --git a/infer/lib/uvr5_pack/lib_v5/layers_537227KB.py b/infer/lib/uvr5_pack/lib_v5/layers_537227KB.py deleted file mode 100644 index 9b127bc..0000000 --- a/infer/lib/uvr5_pack/lib_v5/layers_537227KB.py +++ /dev/null @@ -1,126 +0,0 @@ -import torch -import torch.nn.functional as F -from torch import nn - -from . import spec_utils - - -class Conv2DBNActiv(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): - super(Conv2DBNActiv, self).__init__() - self.conv = nn.Sequential( - nn.Conv2d( - nin, - nout, - kernel_size=ksize, - stride=stride, - padding=pad, - dilation=dilation, - bias=False, - ), - nn.BatchNorm2d(nout), - activ(), - ) - - def __call__(self, x): - return self.conv(x) - - -class SeperableConv2DBNActiv(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): - super(SeperableConv2DBNActiv, self).__init__() - self.conv = nn.Sequential( - nn.Conv2d( - nin, - nin, - kernel_size=ksize, - stride=stride, - padding=pad, - dilation=dilation, - groups=nin, - bias=False, - ), - nn.Conv2d(nin, nout, kernel_size=1, bias=False), - nn.BatchNorm2d(nout), - activ(), - ) - - def __call__(self, x): - return self.conv(x) - - -class Encoder(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): - super(Encoder, self).__init__() - self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) - self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) - - def __call__(self, x): - skip = self.conv1(x) - h = self.conv2(skip) - - return h, skip - - -class Decoder(nn.Module): - def __init__( - self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False - ): - super(Decoder, self).__init__() - self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) - self.dropout = nn.Dropout2d(0.1) if dropout else None - - def __call__(self, x, skip=None): - x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) - if skip is not None: - skip = spec_utils.crop_center(skip, x) - x = torch.cat([x, skip], dim=1) - h = self.conv(x) - - if self.dropout is not None: - h = self.dropout(h) - - return h - - -class ASPPModule(nn.Module): - def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU): - super(ASPPModule, self).__init__() - self.conv1 = nn.Sequential( - nn.AdaptiveAvgPool2d((1, None)), - Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), - ) - self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) - self.conv3 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[0], dilations[0], activ=activ - ) - self.conv4 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[1], dilations[1], activ=activ - ) - self.conv5 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.conv6 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.conv7 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.bottleneck = nn.Sequential( - Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) - ) - - def forward(self, x): - _, _, h, w = x.size() - feat1 = F.interpolate( - self.conv1(x), size=(h, w), mode="bilinear", align_corners=True - ) - feat2 = self.conv2(x) - feat3 = self.conv3(x) - feat4 = self.conv4(x) - feat5 = self.conv5(x) - feat6 = self.conv6(x) - feat7 = self.conv7(x) - out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1) - bottle = self.bottleneck(out) - return bottle diff --git a/infer/lib/uvr5_pack/lib_v5/layers_new.py b/infer/lib/uvr5_pack/lib_v5/layers_new.py deleted file mode 100644 index 44153b6..0000000 --- a/infer/lib/uvr5_pack/lib_v5/layers_new.py +++ /dev/null @@ -1,125 +0,0 @@ -import torch -import torch.nn.functional as F -from torch import nn - -from . import spec_utils - - -class Conv2DBNActiv(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): - super(Conv2DBNActiv, self).__init__() - self.conv = nn.Sequential( - nn.Conv2d( - nin, - nout, - kernel_size=ksize, - stride=stride, - padding=pad, - dilation=dilation, - bias=False, - ), - nn.BatchNorm2d(nout), - activ(), - ) - - def __call__(self, x): - return self.conv(x) - - -class Encoder(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): - super(Encoder, self).__init__() - self.conv1 = Conv2DBNActiv(nin, nout, ksize, stride, pad, activ=activ) - self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ) - - def __call__(self, x): - h = self.conv1(x) - h = self.conv2(h) - - return h - - -class Decoder(nn.Module): - def __init__( - self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False - ): - super(Decoder, self).__init__() - self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) - # self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ) - self.dropout = nn.Dropout2d(0.1) if dropout else None - - def __call__(self, x, skip=None): - x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) - - if skip is not None: - skip = spec_utils.crop_center(skip, x) - x = torch.cat([x, skip], dim=1) - - h = self.conv1(x) - # h = self.conv2(h) - - if self.dropout is not None: - h = self.dropout(h) - - return h - - -class ASPPModule(nn.Module): - def __init__(self, nin, nout, dilations=(4, 8, 12), activ=nn.ReLU, dropout=False): - super(ASPPModule, self).__init__() - self.conv1 = nn.Sequential( - nn.AdaptiveAvgPool2d((1, None)), - Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ), - ) - self.conv2 = Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ) - self.conv3 = Conv2DBNActiv( - nin, nout, 3, 1, dilations[0], dilations[0], activ=activ - ) - self.conv4 = Conv2DBNActiv( - nin, nout, 3, 1, dilations[1], dilations[1], activ=activ - ) - self.conv5 = Conv2DBNActiv( - nin, nout, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.bottleneck = Conv2DBNActiv(nout * 5, nout, 1, 1, 0, activ=activ) - self.dropout = nn.Dropout2d(0.1) if dropout else None - - def forward(self, x): - _, _, h, w = x.size() - feat1 = F.interpolate( - self.conv1(x), size=(h, w), mode="bilinear", align_corners=True - ) - feat2 = self.conv2(x) - feat3 = self.conv3(x) - feat4 = self.conv4(x) - feat5 = self.conv5(x) - out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) - out = self.bottleneck(out) - - if self.dropout is not None: - out = self.dropout(out) - - return out - - -class LSTMModule(nn.Module): - def __init__(self, nin_conv, nin_lstm, nout_lstm): - super(LSTMModule, self).__init__() - self.conv = Conv2DBNActiv(nin_conv, 1, 1, 1, 0) - self.lstm = nn.LSTM( - input_size=nin_lstm, hidden_size=nout_lstm // 2, bidirectional=True - ) - self.dense = nn.Sequential( - nn.Linear(nout_lstm, nin_lstm), nn.BatchNorm1d(nin_lstm), nn.ReLU() - ) - - def forward(self, x): - N, _, nbins, nframes = x.size() - h = self.conv(x)[:, 0] # N, nbins, nframes - h = h.permute(2, 0, 1) # nframes, N, nbins - h, _ = self.lstm(h) - h = self.dense(h.reshape(-1, h.size()[-1])) # nframes * N, nbins - h = h.reshape(nframes, N, 1, nbins) - h = h.permute(1, 2, 3, 0) - - return h diff --git a/infer/lib/uvr5_pack/lib_v5/nets.py b/infer/lib/uvr5_pack/lib_v5/nets.py index 5da3948..6224533 100644 --- a/infer/lib/uvr5_pack/lib_v5/nets.py +++ b/infer/lib/uvr5_pack/lib_v5/nets.py @@ -1,85 +1,100 @@ -import layers import torch import torch.nn.functional as F from torch import nn -from . import spec_utils +from . import layers -class BaseASPPNet(nn.Module): - def __init__(self, nin, ch, dilations=(4, 8, 16)): - super(BaseASPPNet, self).__init__() - self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) - self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) - self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) - self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) +class BaseNet(nn.Module): + def __init__( + self, nin, nout, nin_lstm, nout_lstm, dilations=((4, 2), (8, 4), (12, 6)) + ): + super(BaseNet, self).__init__() + self.enc1 = layers.Conv2DBNActiv(nin, nout, 3, 1, 1) + self.enc2 = layers.Encoder(nout, nout * 2, 3, 2, 1) + self.enc3 = layers.Encoder(nout * 2, nout * 4, 3, 2, 1) + self.enc4 = layers.Encoder(nout * 4, nout * 6, 3, 2, 1) + self.enc5 = layers.Encoder(nout * 6, nout * 8, 3, 2, 1) - self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) + self.aspp = layers.ASPPModule(nout * 8, nout * 8, dilations, dropout=True) - self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) - self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) - self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) - self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) + self.dec4 = layers.Decoder(nout * (6 + 8), nout * 6, 3, 1, 1) + self.dec3 = layers.Decoder(nout * (4 + 6), nout * 4, 3, 1, 1) + self.dec2 = layers.Decoder(nout * (2 + 4), nout * 2, 3, 1, 1) + self.lstm_dec2 = layers.LSTMModule(nout * 2, nin_lstm, nout_lstm) + self.dec1 = layers.Decoder(nout * (1 + 2) + 1, nout * 1, 3, 1, 1) def __call__(self, x): - h, e1 = self.enc1(x) - h, e2 = self.enc2(h) - h, e3 = self.enc3(h) - h, e4 = self.enc4(h) + e1 = self.enc1(x) + e2 = self.enc2(e1) + e3 = self.enc3(e2) + e4 = self.enc4(e3) + e5 = self.enc5(e4) - h = self.aspp(h) + h = self.aspp(e5) h = self.dec4(h, e4) h = self.dec3(h, e3) h = self.dec2(h, e2) + h = torch.cat([h, self.lstm_dec2(h)], dim=1) h = self.dec1(h, e1) return h -class CascadedASPPNet(nn.Module): - def __init__(self, n_fft): - super(CascadedASPPNet, self).__init__() - self.stg1_low_band_net = BaseASPPNet(2, 16) - self.stg1_high_band_net = BaseASPPNet(2, 16) - - self.stg2_bridge = layers.Conv2DBNActiv(18, 8, 1, 1, 0) - self.stg2_full_band_net = BaseASPPNet(8, 16) - - self.stg3_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) - self.stg3_full_band_net = BaseASPPNet(16, 32) - - self.out = nn.Conv2d(32, 2, 1, bias=False) - self.aux1_out = nn.Conv2d(16, 2, 1, bias=False) - self.aux2_out = nn.Conv2d(16, 2, 1, bias=False) +class CascadedNet(nn.Module): + def __init__(self, n_fft, nout=32, nout_lstm=128): + super(CascadedNet, self).__init__() self.max_bin = n_fft // 2 self.output_bin = n_fft // 2 + 1 + self.nin_lstm = self.max_bin // 2 + self.offset = 64 - self.offset = 128 + self.stg1_low_band_net = nn.Sequential( + BaseNet(2, nout // 2, self.nin_lstm // 2, nout_lstm), + layers.Conv2DBNActiv(nout // 2, nout // 4, 1, 1, 0), + ) - def forward(self, x, aggressiveness=None): - mix = x.detach() - x = x.clone() + self.stg1_high_band_net = BaseNet( + 2, nout // 4, self.nin_lstm // 2, nout_lstm // 2 + ) + self.stg2_low_band_net = nn.Sequential( + BaseNet(nout // 4 + 2, nout, self.nin_lstm // 2, nout_lstm), + layers.Conv2DBNActiv(nout, nout // 2, 1, 1, 0), + ) + self.stg2_high_band_net = BaseNet( + nout // 4 + 2, nout // 2, self.nin_lstm // 2, nout_lstm // 2 + ) + + self.stg3_full_band_net = BaseNet( + 3 * nout // 4 + 2, nout, self.nin_lstm, nout_lstm + ) + + self.out = nn.Conv2d(nout, 2, 1, bias=False) + self.aux_out = nn.Conv2d(3 * nout // 4, 2, 1, bias=False) + + def forward(self, x): x = x[:, :, : self.max_bin] bandw = x.size()[2] // 2 - aux1 = torch.cat( - [ - self.stg1_low_band_net(x[:, :, :bandw]), - self.stg1_high_band_net(x[:, :, bandw:]), - ], - dim=2, - ) + l1_in = x[:, :, :bandw] + h1_in = x[:, :, bandw:] + l1 = self.stg1_low_band_net(l1_in) + h1 = self.stg1_high_band_net(h1_in) + aux1 = torch.cat([l1, h1], dim=2) - h = torch.cat([x, aux1], dim=1) - aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) + l2_in = torch.cat([l1_in, l1], dim=1) + h2_in = torch.cat([h1_in, h1], dim=1) + l2 = self.stg2_low_band_net(l2_in) + h2 = self.stg2_high_band_net(h2_in) + aux2 = torch.cat([l2, h2], dim=2) - h = torch.cat([x, aux1, aux2], dim=1) - h = self.stg3_full_band_net(self.stg3_bridge(h)) + f3_in = torch.cat([x, aux1, aux2], dim=1) + f3 = self.stg3_full_band_net(f3_in) - mask = torch.sigmoid(self.out(h)) + mask = torch.sigmoid(self.out(f3)) mask = F.pad( input=mask, pad=(0, 0, 0, self.output_bin - mask.size()[2]), @@ -87,37 +102,32 @@ class CascadedASPPNet(nn.Module): ) if self.training: - aux1 = torch.sigmoid(self.aux1_out(aux1)) - aux1 = F.pad( - input=aux1, - pad=(0, 0, 0, self.output_bin - aux1.size()[2]), + aux = torch.cat([aux1, aux2], dim=1) + aux = torch.sigmoid(self.aux_out(aux)) + aux = F.pad( + input=aux, + pad=(0, 0, 0, self.output_bin - aux.size()[2]), mode="replicate", ) - aux2 = torch.sigmoid(self.aux2_out(aux2)) - aux2 = F.pad( - input=aux2, - pad=(0, 0, 0, self.output_bin - aux2.size()[2]), - mode="replicate", - ) - return mask * mix, aux1 * mix, aux2 * mix + return mask, aux else: - if aggressiveness: - mask[:, :, : aggressiveness["split_bin"]] = torch.pow( - mask[:, :, : aggressiveness["split_bin"]], - 1 + aggressiveness["value"] / 3, - ) - mask[:, :, aggressiveness["split_bin"] :] = torch.pow( - mask[:, :, aggressiveness["split_bin"] :], - 1 + aggressiveness["value"], - ) + return mask - return mask * mix - - def predict(self, x_mag, aggressiveness=None): - h = self.forward(x_mag, aggressiveness) + def predict_mask(self, x): + mask = self.forward(x) if self.offset > 0: - h = h[:, :, :, self.offset : -self.offset] - assert h.size()[3] > 0 + mask = mask[:, :, :, self.offset : -self.offset] + assert mask.size()[3] > 0 - return h + return mask + + def predict(self, x, aggressiveness=None): + mask = self.forward(x) + pred_mag = x * mask + + if self.offset > 0: + pred_mag = pred_mag[:, :, :, self.offset : -self.offset] + assert pred_mag.size()[3] > 0 + + return pred_mag diff --git a/infer/lib/uvr5_pack/lib_v5/nets_123812KB.py b/infer/lib/uvr5_pack/lib_v5/nets_123812KB.py deleted file mode 100644 index 167d4cb..0000000 --- a/infer/lib/uvr5_pack/lib_v5/nets_123812KB.py +++ /dev/null @@ -1,122 +0,0 @@ -import torch -import torch.nn.functional as F -from torch import nn - -from . import layers_123821KB as layers - - -class BaseASPPNet(nn.Module): - def __init__(self, nin, ch, dilations=(4, 8, 16)): - super(BaseASPPNet, self).__init__() - self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) - self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) - self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) - self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) - - self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) - - self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) - self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) - self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) - self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) - - def __call__(self, x): - h, e1 = self.enc1(x) - h, e2 = self.enc2(h) - h, e3 = self.enc3(h) - h, e4 = self.enc4(h) - - h = self.aspp(h) - - h = self.dec4(h, e4) - h = self.dec3(h, e3) - h = self.dec2(h, e2) - h = self.dec1(h, e1) - - return h - - -class CascadedASPPNet(nn.Module): - def __init__(self, n_fft): - super(CascadedASPPNet, self).__init__() - self.stg1_low_band_net = BaseASPPNet(2, 32) - self.stg1_high_band_net = BaseASPPNet(2, 32) - - self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) - self.stg2_full_band_net = BaseASPPNet(16, 32) - - self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) - self.stg3_full_band_net = BaseASPPNet(32, 64) - - self.out = nn.Conv2d(64, 2, 1, bias=False) - self.aux1_out = nn.Conv2d(32, 2, 1, bias=False) - self.aux2_out = nn.Conv2d(32, 2, 1, bias=False) - - self.max_bin = n_fft // 2 - self.output_bin = n_fft // 2 + 1 - - self.offset = 128 - - def forward(self, x, aggressiveness=None): - mix = x.detach() - x = x.clone() - - x = x[:, :, : self.max_bin] - - bandw = x.size()[2] // 2 - aux1 = torch.cat( - [ - self.stg1_low_band_net(x[:, :, :bandw]), - self.stg1_high_band_net(x[:, :, bandw:]), - ], - dim=2, - ) - - h = torch.cat([x, aux1], dim=1) - aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) - - h = torch.cat([x, aux1, aux2], dim=1) - h = self.stg3_full_band_net(self.stg3_bridge(h)) - - mask = torch.sigmoid(self.out(h)) - mask = F.pad( - input=mask, - pad=(0, 0, 0, self.output_bin - mask.size()[2]), - mode="replicate", - ) - - if self.training: - aux1 = torch.sigmoid(self.aux1_out(aux1)) - aux1 = F.pad( - input=aux1, - pad=(0, 0, 0, self.output_bin - aux1.size()[2]), - mode="replicate", - ) - aux2 = torch.sigmoid(self.aux2_out(aux2)) - aux2 = F.pad( - input=aux2, - pad=(0, 0, 0, self.output_bin - aux2.size()[2]), - mode="replicate", - ) - return mask * mix, aux1 * mix, aux2 * mix - else: - if aggressiveness: - mask[:, :, : aggressiveness["split_bin"]] = torch.pow( - mask[:, :, : aggressiveness["split_bin"]], - 1 + aggressiveness["value"] / 3, - ) - mask[:, :, aggressiveness["split_bin"] :] = torch.pow( - mask[:, :, aggressiveness["split_bin"] :], - 1 + aggressiveness["value"], - ) - - return mask * mix - - def predict(self, x_mag, aggressiveness=None): - h = self.forward(x_mag, aggressiveness) - - if self.offset > 0: - h = h[:, :, :, self.offset : -self.offset] - assert h.size()[3] > 0 - - return h diff --git a/infer/lib/uvr5_pack/lib_v5/nets_537227KB.py b/infer/lib/uvr5_pack/lib_v5/nets_537227KB.py deleted file mode 100644 index 823b44f..0000000 --- a/infer/lib/uvr5_pack/lib_v5/nets_537227KB.py +++ /dev/null @@ -1,123 +0,0 @@ -import numpy as np -import torch -import torch.nn.functional as F -from torch import nn - -from . import layers_537238KB as layers - - -class BaseASPPNet(nn.Module): - def __init__(self, nin, ch, dilations=(4, 8, 16)): - super(BaseASPPNet, self).__init__() - self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) - self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) - self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) - self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) - - self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) - - self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) - self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) - self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) - self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) - - def __call__(self, x): - h, e1 = self.enc1(x) - h, e2 = self.enc2(h) - h, e3 = self.enc3(h) - h, e4 = self.enc4(h) - - h = self.aspp(h) - - h = self.dec4(h, e4) - h = self.dec3(h, e3) - h = self.dec2(h, e2) - h = self.dec1(h, e1) - - return h - - -class CascadedASPPNet(nn.Module): - def __init__(self, n_fft): - super(CascadedASPPNet, self).__init__() - self.stg1_low_band_net = BaseASPPNet(2, 64) - self.stg1_high_band_net = BaseASPPNet(2, 64) - - self.stg2_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) - self.stg2_full_band_net = BaseASPPNet(32, 64) - - self.stg3_bridge = layers.Conv2DBNActiv(130, 64, 1, 1, 0) - self.stg3_full_band_net = BaseASPPNet(64, 128) - - self.out = nn.Conv2d(128, 2, 1, bias=False) - self.aux1_out = nn.Conv2d(64, 2, 1, bias=False) - self.aux2_out = nn.Conv2d(64, 2, 1, bias=False) - - self.max_bin = n_fft // 2 - self.output_bin = n_fft // 2 + 1 - - self.offset = 128 - - def forward(self, x, aggressiveness=None): - mix = x.detach() - x = x.clone() - - x = x[:, :, : self.max_bin] - - bandw = x.size()[2] // 2 - aux1 = torch.cat( - [ - self.stg1_low_band_net(x[:, :, :bandw]), - self.stg1_high_band_net(x[:, :, bandw:]), - ], - dim=2, - ) - - h = torch.cat([x, aux1], dim=1) - aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) - - h = torch.cat([x, aux1, aux2], dim=1) - h = self.stg3_full_band_net(self.stg3_bridge(h)) - - mask = torch.sigmoid(self.out(h)) - mask = F.pad( - input=mask, - pad=(0, 0, 0, self.output_bin - mask.size()[2]), - mode="replicate", - ) - - if self.training: - aux1 = torch.sigmoid(self.aux1_out(aux1)) - aux1 = F.pad( - input=aux1, - pad=(0, 0, 0, self.output_bin - aux1.size()[2]), - mode="replicate", - ) - aux2 = torch.sigmoid(self.aux2_out(aux2)) - aux2 = F.pad( - input=aux2, - pad=(0, 0, 0, self.output_bin - aux2.size()[2]), - mode="replicate", - ) - return mask * mix, aux1 * mix, aux2 * mix - else: - if aggressiveness: - mask[:, :, : aggressiveness["split_bin"]] = torch.pow( - mask[:, :, : aggressiveness["split_bin"]], - 1 + aggressiveness["value"] / 3, - ) - mask[:, :, aggressiveness["split_bin"] :] = torch.pow( - mask[:, :, aggressiveness["split_bin"] :], - 1 + aggressiveness["value"], - ) - - return mask * mix - - def predict(self, x_mag, aggressiveness=None): - h = self.forward(x_mag, aggressiveness) - - if self.offset > 0: - h = h[:, :, :, self.offset : -self.offset] - assert h.size()[3] > 0 - - return h diff --git a/infer/lib/uvr5_pack/lib_v5/nets_61968KB.py b/infer/lib/uvr5_pack/lib_v5/nets_61968KB.py deleted file mode 100644 index 167d4cb..0000000 --- a/infer/lib/uvr5_pack/lib_v5/nets_61968KB.py +++ /dev/null @@ -1,122 +0,0 @@ -import torch -import torch.nn.functional as F -from torch import nn - -from . import layers_123821KB as layers - - -class BaseASPPNet(nn.Module): - def __init__(self, nin, ch, dilations=(4, 8, 16)): - super(BaseASPPNet, self).__init__() - self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) - self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) - self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) - self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) - - self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) - - self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) - self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) - self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) - self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) - - def __call__(self, x): - h, e1 = self.enc1(x) - h, e2 = self.enc2(h) - h, e3 = self.enc3(h) - h, e4 = self.enc4(h) - - h = self.aspp(h) - - h = self.dec4(h, e4) - h = self.dec3(h, e3) - h = self.dec2(h, e2) - h = self.dec1(h, e1) - - return h - - -class CascadedASPPNet(nn.Module): - def __init__(self, n_fft): - super(CascadedASPPNet, self).__init__() - self.stg1_low_band_net = BaseASPPNet(2, 32) - self.stg1_high_band_net = BaseASPPNet(2, 32) - - self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) - self.stg2_full_band_net = BaseASPPNet(16, 32) - - self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) - self.stg3_full_band_net = BaseASPPNet(32, 64) - - self.out = nn.Conv2d(64, 2, 1, bias=False) - self.aux1_out = nn.Conv2d(32, 2, 1, bias=False) - self.aux2_out = nn.Conv2d(32, 2, 1, bias=False) - - self.max_bin = n_fft // 2 - self.output_bin = n_fft // 2 + 1 - - self.offset = 128 - - def forward(self, x, aggressiveness=None): - mix = x.detach() - x = x.clone() - - x = x[:, :, : self.max_bin] - - bandw = x.size()[2] // 2 - aux1 = torch.cat( - [ - self.stg1_low_band_net(x[:, :, :bandw]), - self.stg1_high_band_net(x[:, :, bandw:]), - ], - dim=2, - ) - - h = torch.cat([x, aux1], dim=1) - aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) - - h = torch.cat([x, aux1, aux2], dim=1) - h = self.stg3_full_band_net(self.stg3_bridge(h)) - - mask = torch.sigmoid(self.out(h)) - mask = F.pad( - input=mask, - pad=(0, 0, 0, self.output_bin - mask.size()[2]), - mode="replicate", - ) - - if self.training: - aux1 = torch.sigmoid(self.aux1_out(aux1)) - aux1 = F.pad( - input=aux1, - pad=(0, 0, 0, self.output_bin - aux1.size()[2]), - mode="replicate", - ) - aux2 = torch.sigmoid(self.aux2_out(aux2)) - aux2 = F.pad( - input=aux2, - pad=(0, 0, 0, self.output_bin - aux2.size()[2]), - mode="replicate", - ) - return mask * mix, aux1 * mix, aux2 * mix - else: - if aggressiveness: - mask[:, :, : aggressiveness["split_bin"]] = torch.pow( - mask[:, :, : aggressiveness["split_bin"]], - 1 + aggressiveness["value"] / 3, - ) - mask[:, :, aggressiveness["split_bin"] :] = torch.pow( - mask[:, :, aggressiveness["split_bin"] :], - 1 + aggressiveness["value"], - ) - - return mask * mix - - def predict(self, x_mag, aggressiveness=None): - h = self.forward(x_mag, aggressiveness) - - if self.offset > 0: - h = h[:, :, :, self.offset : -self.offset] - assert h.size()[3] > 0 - - return h diff --git a/infer/lib/uvr5_pack/lib_v5/nets_new.py b/infer/lib/uvr5_pack/lib_v5/nets_new.py deleted file mode 100644 index 1c0f4fa..0000000 --- a/infer/lib/uvr5_pack/lib_v5/nets_new.py +++ /dev/null @@ -1,133 +0,0 @@ -import torch -import torch.nn.functional as F -from torch import nn - -from . import layers_new - - -class BaseNet(nn.Module): - def __init__( - self, nin, nout, nin_lstm, nout_lstm, dilations=((4, 2), (8, 4), (12, 6)) - ): - super(BaseNet, self).__init__() - self.enc1 = layers_new.Conv2DBNActiv(nin, nout, 3, 1, 1) - self.enc2 = layers_new.Encoder(nout, nout * 2, 3, 2, 1) - self.enc3 = layers_new.Encoder(nout * 2, nout * 4, 3, 2, 1) - self.enc4 = layers_new.Encoder(nout * 4, nout * 6, 3, 2, 1) - self.enc5 = layers_new.Encoder(nout * 6, nout * 8, 3, 2, 1) - - self.aspp = layers_new.ASPPModule(nout * 8, nout * 8, dilations, dropout=True) - - self.dec4 = layers_new.Decoder(nout * (6 + 8), nout * 6, 3, 1, 1) - self.dec3 = layers_new.Decoder(nout * (4 + 6), nout * 4, 3, 1, 1) - self.dec2 = layers_new.Decoder(nout * (2 + 4), nout * 2, 3, 1, 1) - self.lstm_dec2 = layers_new.LSTMModule(nout * 2, nin_lstm, nout_lstm) - self.dec1 = layers_new.Decoder(nout * (1 + 2) + 1, nout * 1, 3, 1, 1) - - def __call__(self, x): - e1 = self.enc1(x) - e2 = self.enc2(e1) - e3 = self.enc3(e2) - e4 = self.enc4(e3) - e5 = self.enc5(e4) - - h = self.aspp(e5) - - h = self.dec4(h, e4) - h = self.dec3(h, e3) - h = self.dec2(h, e2) - h = torch.cat([h, self.lstm_dec2(h)], dim=1) - h = self.dec1(h, e1) - - return h - - -class CascadedNet(nn.Module): - def __init__(self, n_fft, nout=32, nout_lstm=128): - super(CascadedNet, self).__init__() - - self.max_bin = n_fft // 2 - self.output_bin = n_fft // 2 + 1 - self.nin_lstm = self.max_bin // 2 - self.offset = 64 - - self.stg1_low_band_net = nn.Sequential( - BaseNet(2, nout // 2, self.nin_lstm // 2, nout_lstm), - layers_new.Conv2DBNActiv(nout // 2, nout // 4, 1, 1, 0), - ) - - self.stg1_high_band_net = BaseNet( - 2, nout // 4, self.nin_lstm // 2, nout_lstm // 2 - ) - - self.stg2_low_band_net = nn.Sequential( - BaseNet(nout // 4 + 2, nout, self.nin_lstm // 2, nout_lstm), - layers_new.Conv2DBNActiv(nout, nout // 2, 1, 1, 0), - ) - self.stg2_high_band_net = BaseNet( - nout // 4 + 2, nout // 2, self.nin_lstm // 2, nout_lstm // 2 - ) - - self.stg3_full_band_net = BaseNet( - 3 * nout // 4 + 2, nout, self.nin_lstm, nout_lstm - ) - - self.out = nn.Conv2d(nout, 2, 1, bias=False) - self.aux_out = nn.Conv2d(3 * nout // 4, 2, 1, bias=False) - - def forward(self, x): - x = x[:, :, : self.max_bin] - - bandw = x.size()[2] // 2 - l1_in = x[:, :, :bandw] - h1_in = x[:, :, bandw:] - l1 = self.stg1_low_band_net(l1_in) - h1 = self.stg1_high_band_net(h1_in) - aux1 = torch.cat([l1, h1], dim=2) - - l2_in = torch.cat([l1_in, l1], dim=1) - h2_in = torch.cat([h1_in, h1], dim=1) - l2 = self.stg2_low_band_net(l2_in) - h2 = self.stg2_high_band_net(h2_in) - aux2 = torch.cat([l2, h2], dim=2) - - f3_in = torch.cat([x, aux1, aux2], dim=1) - f3 = self.stg3_full_band_net(f3_in) - - mask = torch.sigmoid(self.out(f3)) - mask = F.pad( - input=mask, - pad=(0, 0, 0, self.output_bin - mask.size()[2]), - mode="replicate", - ) - - if self.training: - aux = torch.cat([aux1, aux2], dim=1) - aux = torch.sigmoid(self.aux_out(aux)) - aux = F.pad( - input=aux, - pad=(0, 0, 0, self.output_bin - aux.size()[2]), - mode="replicate", - ) - return mask, aux - else: - return mask - - def predict_mask(self, x): - mask = self.forward(x) - - if self.offset > 0: - mask = mask[:, :, :, self.offset : -self.offset] - assert mask.size()[3] > 0 - - return mask - - def predict(self, x, aggressiveness=None): - mask = self.forward(x) - pred_mag = x * mask - - if self.offset > 0: - pred_mag = pred_mag[:, :, :, self.offset : -self.offset] - assert pred_mag.size()[3] > 0 - - return pred_mag diff --git a/infer/lib/uvr5_pack/lib_v5/spec_utils.py b/infer/lib/uvr5_pack/lib_v5/spec_utils.py index a9634fd..4637635 100644 --- a/infer/lib/uvr5_pack/lib_v5/spec_utils.py +++ b/infer/lib/uvr5_pack/lib_v5/spec_utils.py @@ -5,8 +5,6 @@ import os import librosa import numpy as np -import soundfile as sf -from tqdm import tqdm def crop_center(h1, h2): @@ -520,153 +518,3 @@ def istft(spec, hl): wave_left = librosa.istft(spec_left, hop_length=hl) wave_right = librosa.istft(spec_right, hop_length=hl) wave = np.asfortranarray([wave_left, wave_right]) - - -if __name__ == "__main__": - import argparse - import sys - import time - - import cv2 - from model_param_init import ModelParameters - - p = argparse.ArgumentParser() - p.add_argument( - "--algorithm", - "-a", - type=str, - choices=["invert", "invert_p", "min_mag", "max_mag", "deep", "align"], - default="min_mag", - ) - p.add_argument( - "--model_params", - "-m", - type=str, - default=os.path.join("modelparams", "1band_sr44100_hl512.json"), - ) - p.add_argument("--output_name", "-o", type=str, default="output") - p.add_argument("--vocals_only", "-v", action="store_true") - p.add_argument("input", nargs="+") - args = p.parse_args() - - start_time = time.time() - - if args.algorithm.startswith("invert") and len(args.input) != 2: - raise ValueError("There should be two input files.") - - if not args.algorithm.startswith("invert") and len(args.input) < 2: - raise ValueError("There must be at least two input files.") - - wave, specs = {}, {} - mp = ModelParameters(args.model_params) - - for i in range(len(args.input)): - spec = {} - - for d in range(len(mp.param["band"]), 0, -1): - bp = mp.param["band"][d] - - if d == len(mp.param["band"]): # high-end band - wave[d], _ = librosa.load( - args.input[i], - bp["sr"], - False, - dtype=np.float32, - res_type=bp["res_type"], - ) - - if len(wave[d].shape) == 1: # mono to stereo - wave[d] = np.array([wave[d], wave[d]]) - else: # lower bands - wave[d] = librosa.resample( - wave[d + 1], - mp.param["band"][d + 1]["sr"], - bp["sr"], - res_type=bp["res_type"], - ) - - spec[d] = wave_to_spectrogram( - wave[d], - bp["hl"], - bp["n_fft"], - mp.param["mid_side"], - mp.param["mid_side_b2"], - mp.param["reverse"], - ) - - specs[i] = combine_spectrograms(spec, mp) - - del wave - - if args.algorithm == "deep": - d_spec = np.where(np.abs(specs[0]) <= np.abs(spec[1]), specs[0], spec[1]) - v_spec = d_spec - specs[1] - sf.write( - os.path.join("{}.wav".format(args.output_name)), - cmb_spectrogram_to_wave(v_spec, mp), - mp.param["sr"], - ) - - if args.algorithm.startswith("invert"): - ln = min([specs[0].shape[2], specs[1].shape[2]]) - specs[0] = specs[0][:, :, :ln] - specs[1] = specs[1][:, :, :ln] - - if "invert_p" == args.algorithm: - X_mag = np.abs(specs[0]) - y_mag = np.abs(specs[1]) - max_mag = np.where(X_mag >= y_mag, X_mag, y_mag) - v_spec = specs[1] - max_mag * np.exp(1.0j * np.angle(specs[0])) - else: - specs[1] = reduce_vocal_aggressively(specs[0], specs[1], 0.2) - v_spec = specs[0] - specs[1] - - if not args.vocals_only: - X_mag = np.abs(specs[0]) - y_mag = np.abs(specs[1]) - v_mag = np.abs(v_spec) - - X_image = spectrogram_to_image(X_mag) - y_image = spectrogram_to_image(y_mag) - v_image = spectrogram_to_image(v_mag) - - cv2.imwrite("{}_X.png".format(args.output_name), X_image) - cv2.imwrite("{}_y.png".format(args.output_name), y_image) - cv2.imwrite("{}_v.png".format(args.output_name), v_image) - - sf.write( - "{}_X.wav".format(args.output_name), - cmb_spectrogram_to_wave(specs[0], mp), - mp.param["sr"], - ) - sf.write( - "{}_y.wav".format(args.output_name), - cmb_spectrogram_to_wave(specs[1], mp), - mp.param["sr"], - ) - - sf.write( - "{}_v.wav".format(args.output_name), - cmb_spectrogram_to_wave(v_spec, mp), - mp.param["sr"], - ) - else: - if not args.algorithm == "deep": - sf.write( - os.path.join("ensembled", "{}.wav".format(args.output_name)), - cmb_spectrogram_to_wave(ensembling(args.algorithm, specs), mp), - mp.param["sr"], - ) - - if args.algorithm == "align": - trackalignment = [ - { - "file1": '"{}"'.format(args.input[0]), - "file2": '"{}"'.format(args.input[1]), - } - ] - - for i, e in tqdm(enumerate(trackalignment), desc="Performing Alignment..."): - os.system(f"python lib/align_tracks.py {e['file1']} {e['file2']}") - - # print('Total time: {0:.{1}f}s'.format(time.time() - start_time, 1)) diff --git a/infer/lib/uvr5_pack/utils.py b/infer/lib/uvr5_pack/utils.py index f4805cd..71056c0 100644 --- a/infer/lib/uvr5_pack/utils.py +++ b/infer/lib/uvr5_pack/utils.py @@ -1,17 +1,8 @@ -import json - import numpy as np import torch from tqdm import tqdm -def load_data(file_name: str = "./infer/lib/uvr5_pack/name_params.json") -> dict: - with open(file_name, "r") as f: - data = json.load(f) - - return data - - def make_padding(width, cropsize, offset): left = offset roi_size = cropsize - left * 2 @@ -97,25 +88,3 @@ def inference(X_spec, device, model, aggressiveness, data): return (pred + pred_tta) * 0.5 * coef, X_mag, np.exp(1.0j * X_phase) else: return pred * coef, X_mag, np.exp(1.0j * X_phase) - - -def _get_name_params(model_path, model_hash): - data = load_data() - flag = False - ModelName = model_path - for type in list(data): - for model in list(data[type][0]): - for i in range(len(data[type][0][model])): - if str(data[type][0][model][i]["hash_name"]) == model_hash: - flag = True - elif str(data[type][0][model][i]["hash_name"]) in ModelName: - flag = True - - if flag: - model_params_auto = data[type][0][model][i]["model_params"] - param_name_auto = data[type][0][model][i]["param_name"] - if type == "equivalent": - return param_name_auto, model_params_auto - else: - flag = False - return param_name_auto, model_params_auto diff --git a/infer/modules/uvr5/vr.py b/infer/modules/uvr5/vr.py index fe922b2..9bc0f20 100644 --- a/infer/modules/uvr5/vr.py +++ b/infer/modules/uvr5/vr.py @@ -11,7 +11,7 @@ import torch from infer.lib.uvr5_pack.lib_v5 import nets_61968KB as Nets from infer.lib.uvr5_pack.lib_v5 import spec_utils from infer.lib.uvr5_pack.lib_v5.model_param_init import ModelParameters -from infer.lib.uvr5_pack.lib_v5.nets_new import CascadedNet +from infer.lib.uvr5_pack.lib_v5.nets import CascadedNet from infer.lib.uvr5_pack.utils import inference