From 67e94925d18ef91f5ad99fff18bd9e777ae4162a Mon Sep 17 00:00:00 2001
From: 34j <55338215+34j@users.noreply.github.com>
Date: Fri, 31 Mar 2023 19:43:37 +0900
Subject: [PATCH 01/47] feat: add `half` option

---
 src/so_vits_svc_fork/inference/infer_tool.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/so_vits_svc_fork/inference/infer_tool.py b/src/so_vits_svc_fork/inference/infer_tool.py
index 5604192f..9a2e1c09 100644
--- a/src/so_vits_svc_fork/inference/infer_tool.py
+++ b/src/so_vits_svc_fork/inference/infer_tool.py
@@ -94,6 +94,7 @@ def __init__(
         config_path: str,
         device: torch.device | str | None = None,
         cluster_model_path: Path | str | None = None,
+        half: bool = True,
     ):
         self.net_g_path = net_g_path
         if device is None:
@@ -106,6 +107,7 @@ def __init__(
         self.hop_size = self.hps_ms.data.hop_length
         self.spk2id = self.hps_ms.spk
         self.hubert_model = utils.get_hubert_model().to(self.dev)
+        self.half = half
         self.load_model()
         if cluster_model_path is not None and Path(cluster_model_path).exists():
             self.cluster_model = cluster.get_cluster_model(cluster_model_path)
@@ -117,7 +119,7 @@ def load_model(self):
             **self.hps_ms.model,
         )
         _ = utils.load_checkpoint(self.net_g_path, self.net_g_ms, None)
-        if "half" in self.net_g_path and torch.cuda.is_available():
+        if self.half:
             _ = self.net_g_ms.half().eval().to(self.dev)
         else:
             _ = self.net_g_ms.eval().to(self.dev)
@@ -205,8 +207,10 @@ def infer(
         c, f0, uv = self.get_unit_f0(
             audio, transpose, cluster_infer_ratio, speaker, f0_method
         )
-        if "half" in self.net_g_path and torch.cuda.is_available():
+        if self.half:
             c = c.half()
+            f0 = f0.half()
+            uv = uv.half()
 
         # inference
         with torch.no_grad():

From 9a6de1f8094ca294e9bb1c9851e92de65a10591c Mon Sep 17 00:00:00 2001
From: 34j <55338215+34j@users.noreply.github.com>
Date: Fri, 31 Mar 2023 19:44:21 +0900
Subject: [PATCH 02/47] fix(hifigan): fix dtype of harmonic multiplier

---
 src/so_vits_svc_fork/vdecoder/hifigan/models.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/so_vits_svc_fork/vdecoder/hifigan/models.py b/src/so_vits_svc_fork/vdecoder/hifigan/models.py
index 1c83dea9..7b84f9c3 100644
--- a/src/so_vits_svc_fork/vdecoder/hifigan/models.py
+++ b/src/so_vits_svc_fork/vdecoder/hifigan/models.py
@@ -286,8 +286,11 @@ def forward(self, f0):
         with torch.no_grad():
             # f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
             # fundamental component
+            # fn = torch.multiply(
+            #    f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device)
+            # )
             fn = torch.multiply(
-                f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device)
+                f0, torch.arange(1, self.harmonic_num + 2).to(f0.device).to(f0.dtype)
             )
 
             # generate sine waveforms

From 59dedf1981e0db8961342c7642308c8f0eec25be Mon Sep 17 00:00:00 2001
From: 34j <55338215+34j@users.noreply.github.com>
Date: Fri, 31 Mar 2023 20:04:38 +0900
Subject: [PATCH 03/47] fix: remove unused files

---
 src/so_vits_svc_fork/vdecoder/hifigan/env.py  |  15 --
 .../vdecoder/hifigan/models.py                |  56 ------
 .../vdecoder/hifigan/nvSTFT.py                | 162 ------------------
 3 files changed, 233 deletions(-)
 delete mode 100644 src/so_vits_svc_fork/vdecoder/hifigan/env.py
 delete mode 100644 src/so_vits_svc_fork/vdecoder/hifigan/nvSTFT.py

diff --git a/src/so_vits_svc_fork/vdecoder/hifigan/env.py b/src/so_vits_svc_fork/vdecoder/hifigan/env.py
deleted file mode 100644
index a1231a2d..00000000
--- a/src/so_vits_svc_fork/vdecoder/hifigan/env.py
+++ /dev/null
@@ -1,15 +0,0 @@
-import os
-import shutil
-
-
-class AttrDict(dict):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.__dict__ = self
-
-
-def build_env(config, config_name, path):
-    t_path = os.path.join(path, config_name)
-    if config != t_path:
-        os.makedirs(path, exist_ok=True)
-        shutil.copyfile(config, os.path.join(path, config_name))
diff --git a/src/so_vits_svc_fork/vdecoder/hifigan/models.py b/src/so_vits_svc_fork/vdecoder/hifigan/models.py
index 7b84f9c3..779f4ec6 100644
--- a/src/so_vits_svc_fork/vdecoder/hifigan/models.py
+++ b/src/so_vits_svc_fork/vdecoder/hifigan/models.py
@@ -1,5 +1,3 @@
-import json
-import os
 from logging import getLogger
 
 import numpy as np
@@ -9,7 +7,6 @@
 from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d
 from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm
 
-from .env import AttrDict
 from .utils import get_padding, init_weights
 
 LOG = getLogger(__name__)
@@ -17,25 +14,6 @@
 LRELU_SLOPE = 0.1
 
 
-def load_model(model_path, device="cuda"):
-    config_file = os.path.join(os.path.split(model_path)[0], "config.json")
-    with open(config_file) as f:
-        data = f.read()
-
-    global h
-    json_config = json.loads(data)
-    h = AttrDict(json_config)
-
-    generator = Generator(h).to(device)
-
-    cp_dict = torch.load(model_path)
-    generator.load_state_dict(cp_dict["generator"])
-    generator.eval()
-    generator.remove_weight_norm()
-    del cp_dict
-    return generator, h
-
-
 class ResBlock1(torch.nn.Module):
     def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
         super().__init__()
@@ -621,37 +599,3 @@ def forward(self, y, y_hat):
             fmap_gs.append(fmap_g)
 
         return y_d_rs, y_d_gs, fmap_rs, fmap_gs
-
-
-def feature_loss(fmap_r, fmap_g):
-    loss = 0
-    for dr, dg in zip(fmap_r, fmap_g):
-        for rl, gl in zip(dr, dg):
-            loss += torch.mean(torch.abs(rl - gl))
-
-    return loss * 2
-
-
-def discriminator_loss(disc_real_outputs, disc_generated_outputs):
-    loss = 0
-    r_losses = []
-    g_losses = []
-    for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
-        r_loss = torch.mean((1 - dr) ** 2)
-        g_loss = torch.mean(dg**2)
-        loss += r_loss + g_loss
-        r_losses.append(r_loss.item())
-        g_losses.append(g_loss.item())
-
-    return loss, r_losses, g_losses
-
-
-def generator_loss(disc_outputs):
-    loss = 0
-    gen_losses = []
-    for dg in disc_outputs:
-        l = torch.mean((1 - dg) ** 2)
-        gen_losses.append(l)
-        loss += l
-
-    return loss, gen_losses
diff --git a/src/so_vits_svc_fork/vdecoder/hifigan/nvSTFT.py b/src/so_vits_svc_fork/vdecoder/hifigan/nvSTFT.py
deleted file mode 100644
index a33805d6..00000000
--- a/src/so_vits_svc_fork/vdecoder/hifigan/nvSTFT.py
+++ /dev/null
@@ -1,162 +0,0 @@
-import os
-
-os.environ["LRU_CACHE_CAPACITY"] = "3"
-from logging import getLogger
-
-import librosa
-import numpy as np
-import soundfile as sf
-import torch
-import torch.utils.data
-from librosa.filters import mel as librosa_mel_fn
-
-LOG = getLogger(__name__)
-
-
-def load_wav_to_torch(full_path, target_sr=None, return_empty_on_exception=False):
-    sampling_rate = None
-    try:
-        data, sampling_rate = sf.read(full_path, always_2d=True)  # than soundfile.
-    except Exception as ex:
-        LOG.info(f"'{full_path}' failed to load.\nException:")
-        LOG.info(ex)
-        if return_empty_on_exception:
-            return [], sampling_rate or target_sr or 32000
-        else:
-            raise Exception(ex)
-
-    if len(data.shape) > 1:
-        data = data[:, 0]
-        assert (
-            len(data) > 2
-        )  # check duration of audio file is > 2 samples (because otherwise the slice operation was on the wrong dimension)
-
-    if np.issubdtype(data.dtype, np.integer):  # if audio data is type int
-        max_mag = -np.iinfo(
-            data.dtype
-        ).min  # maximum magnitude = min possible value of intXX
-    else:  # if audio data is type fp32
-        max_mag = max(np.amax(data), -np.amin(data))
-        max_mag = (
-            (2**31) + 1
-            if max_mag > (2**15)
-            else ((2**15) + 1 if max_mag > 1.01 else 1.0)
-        )  # data should be either 16-bit INT, 32-bit INT or [-1 to 1] float32
-
-    data = torch.FloatTensor(data.astype(np.float32)) / max_mag
-
-    if (
-        torch.isinf(data) | torch.isnan(data)
-    ).any() and return_empty_on_exception:  # resample will crash with inf/NaN inputs. return_empty_on_exception will return empty arr instead of except
-        return [], sampling_rate or target_sr or 32000
-    if target_sr is not None and sampling_rate != target_sr:
-        data = torch.from_numpy(
-            librosa.core.resample(
-                data.numpy(), orig_sr=sampling_rate, target_sr=target_sr
-            )
-        )
-        sampling_rate = target_sr
-
-    return data, sampling_rate
-
-
-def dynamic_range_compression(x, C=1, clip_val=1e-5):
-    return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)
-
-
-def dynamic_range_decompression(x, C=1):
-    return np.exp(x) / C
-
-
-def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
-    return torch.log(torch.clamp(x, min=clip_val) * C)
-
-
-def dynamic_range_decompression_torch(x, C=1):
-    return torch.exp(x) / C
-
-
-class STFT:
-    def __init__(
-        self,
-        sr=22050,
-        n_mels=80,
-        n_fft=1024,
-        win_size=1024,
-        hop_length=256,
-        fmin=20,
-        fmax=11025,
-        clip_val=1e-5,
-    ):
-        self.target_sr = sr
-
-        self.n_mels = n_mels
-        self.n_fft = n_fft
-        self.win_size = win_size
-        self.hop_length = hop_length
-        self.fmin = fmin
-        self.fmax = fmax
-        self.clip_val = clip_val
-        self.mel_basis = {}
-        self.hann_window = {}
-
-    def get_mel(self, y, center=False):
-        sampling_rate = self.target_sr
-        n_mels = self.n_mels
-        n_fft = self.n_fft
-        win_size = self.win_size
-        hop_length = self.hop_length
-        fmin = self.fmin
-        fmax = self.fmax
-        clip_val = self.clip_val
-
-        if torch.min(y) < -1.0:
-            LOG.info("min value is ", torch.min(y))
-        if torch.max(y) > 1.0:
-            LOG.info("max value is ", torch.max(y))
-
-        if fmax not in self.mel_basis:
-            mel = librosa_mel_fn(
-                sr=sampling_rate, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax
-            )
-            self.mel_basis[str(fmax) + "_" + str(y.device)] = (
-                torch.from_numpy(mel).float().to(y.device)
-            )
-            self.hann_window[str(y.device)] = torch.hann_window(self.win_size).to(
-                y.device
-            )
-
-        y = torch.nn.functional.pad(
-            y.unsqueeze(1),
-            (int((n_fft - hop_length) / 2), int((n_fft - hop_length) / 2)),
-            mode="reflect",
-        )
-        y = y.squeeze(1)
-
-        spec = torch.stft(
-            y,
-            n_fft,
-            hop_length=hop_length,
-            win_length=win_size,
-            window=self.hann_window[str(y.device)],
-            center=center,
-            pad_mode="reflect",
-            normalized=False,
-            onesided=True,
-        )
-        # LOG.info(111,spec)
-        spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))
-        # LOG.info(222,spec)
-        spec = torch.matmul(self.mel_basis[str(fmax) + "_" + str(y.device)], spec)
-        # LOG.info(333,spec)
-        spec = dynamic_range_compression_torch(spec, clip_val=clip_val)
-        # LOG.info(444,spec)
-        return spec
-
-    def __call__(self, audiopath):
-        audio, sr = load_wav_to_torch(audiopath, target_sr=self.target_sr)
-        spect = self.get_mel(audio.unsqueeze(0)).squeeze(0)
-        return spect
-
-
-stft = STFT()

From 87fd22b12b0c415a48cc71b539ded61a83779f96 Mon Sep 17 00:00:00 2001
From: 34j <55338215+34j@users.noreply.github.com>
Date: Fri, 31 Mar 2023 20:42:06 +0900
Subject: [PATCH 04/47] feat: mb-istft-vits prototype

---
 .idea/workspace.xml                           |  52 +--
 LICENSE                                       | 201 ++++++++++
 src/so_vits_svc_fork/models.py                |  51 ++-
 src/so_vits_svc_fork/train.py                 |  27 +-
 .../vdecoder/mb_istft/__init__.py             |   0
 .../vdecoder/mb_istft/generators.py           | 374 ++++++++++++++++++
 .../vdecoder/mb_istft/loss.py                 |  11 +
 .../vdecoder/mb_istft/pqmf.py                 | 130 ++++++
 .../vdecoder/mb_istft/stft.py                 | 248 ++++++++++++
 .../vdecoder/mb_istft/stft_loss.py            | 140 +++++++
 10 files changed, 1177 insertions(+), 57 deletions(-)
 create mode 100644 src/so_vits_svc_fork/vdecoder/mb_istft/__init__.py
 create mode 100644 src/so_vits_svc_fork/vdecoder/mb_istft/generators.py
 create mode 100644 src/so_vits_svc_fork/vdecoder/mb_istft/loss.py
 create mode 100644 src/so_vits_svc_fork/vdecoder/mb_istft/pqmf.py
 create mode 100644 src/so_vits_svc_fork/vdecoder/mb_istft/stft.py
 create mode 100644 src/so_vits_svc_fork/vdecoder/mb_istft/stft_loss.py

diff --git a/.idea/workspace.xml b/.idea/workspace.xml
index 7b269da5..4c57db80 100644
--- a/.idea/workspace.xml
+++ b/.idea/workspace.xml
@@ -2,56 +2,10 @@
 <project version="4">
   <component name="ChangeListManager">
     <list default="true" id="7aacf003-229b-4eb0-80a4-ff105dc3c3d4" name="変更" comment="">
-      <change beforePath="$PROJECT_DIR$/.github/ISSUE_TEMPLATE/1-bug_report.md" beforeDir="false" afterPath="$PROJECT_DIR$/.github/ISSUE_TEMPLATE/1-bug_report.md" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/.github/ISSUE_TEMPLATE/2-feature-request.md" beforeDir="false" afterPath="$PROJECT_DIR$/.github/ISSUE_TEMPLATE/2-feature-request.md" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/.github/workflows/ci.yml" beforeDir="false" afterPath="$PROJECT_DIR$/.github/workflows/ci.yml" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/.github/workflows/hacktoberfest.yml" beforeDir="false" afterPath="$PROJECT_DIR$/.github/workflows/hacktoberfest.yml" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/.github/workflows/issue-manager.yml" beforeDir="false" afterPath="$PROJECT_DIR$/.github/workflows/issue-manager.yml" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/.github/workflows/labels.yml" beforeDir="false" afterPath="$PROJECT_DIR$/.github/workflows/labels.yml" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/.gitpod.yml" beforeDir="false" afterPath="$PROJECT_DIR$/.gitpod.yml" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/.idea/inspectionProfiles/Project_Default.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/inspectionProfiles/Project_Default.xml" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/.idea/inspectionProfiles/profiles_settings.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/inspectionProfiles/profiles_settings.xml" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/.idea/misc.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/misc.xml" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/.idea/modules.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/modules.xml" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/.idea/so-vits-svc-fork.iml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/so-vits-svc-fork.iml" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/.idea/vcs.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/vcs.xml" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/.idea/watcherTasks.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/watcherTasks.xml" afterDir="false" />
       <change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/.pre-commit-config.yaml" beforeDir="false" afterPath="$PROJECT_DIR$/.pre-commit-config.yaml" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/.readthedocs.yml" beforeDir="false" afterPath="$PROJECT_DIR$/.readthedocs.yml" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/README_zh_CN.md" beforeDir="false" afterPath="$PROJECT_DIR$/README_zh_CN.md" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/commitlint.config.js" beforeDir="false" afterPath="$PROJECT_DIR$/commitlint.config.js" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/pyproject.toml" beforeDir="false" afterPath="$PROJECT_DIR$/pyproject.toml" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/renovate.json" beforeDir="false" afterPath="$PROJECT_DIR$/renovate.json" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/app.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/app.py" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/cluster/__init__.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/cluster/__init__.py" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/cluster/train_cluster.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/cluster/train_cluster.py" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/configs_template/config_template.json" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/configs_template/config_template.json" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/data_utils.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/data_utils.py" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/hubert/hubert_model.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/hubert/hubert_model.py" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/hubert/hubert_model_onnx.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/hubert/hubert_model_onnx.py" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/inference/infer_tool.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/inference/infer_tool.py" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/inference/infer_tool_grad.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/inference/infer_tool_grad.py" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/inference/slicer.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/inference/slicer.py" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/inference_main.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/inference_main.py" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/models.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/models.py" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/attentions.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/attentions.py" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/commons.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/commons.py" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/losses.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/losses.py" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/mel_processing.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/mel_processing.py" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/modules.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/modules.py" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/onnx_export.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/onnx_export.py" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/onnxexport/model_onnx.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/onnxexport/model_onnx.py" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/preprocess_flist_config.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/preprocess_flist_config.py" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/preprocess_hubert_f0.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/preprocess_hubert_f0.py" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/resample.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/resample.py" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/spec_gen.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/spec_gen.py" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/train.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/train.py" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/utils.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/utils.py" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/vdecoder/hifigan/env.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/vdecoder/hifigan/env.py" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/vdecoder/hifigan/env.py" beforeDir="false" />
       <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/vdecoder/hifigan/models.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/vdecoder/hifigan/models.py" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/vdecoder/hifigan/nvSTFT.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/vdecoder/hifigan/nvSTFT.py" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/vdecoder/hifigan/utils.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/vdecoder/hifigan/utils.py" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/vdecoder/hifigan/nvSTFT.py" beforeDir="false" />
     </list>
     <option name="SHOW_DIALOG" value="false" />
     <option name="HIGHLIGHT_CONFLICTS" value="true" />
@@ -140,6 +94,8 @@
       <updated>1678892092249</updated>
       <workItem from="1678892093553" duration="810000" />
       <workItem from="1678932243084" duration="593000" />
+      <workItem from="1680174456649" duration="1005000" />
+      <workItem from="1680251014707" duration="2225000" />
     </task>
     <servers />
   </component>
diff --git a/LICENSE b/LICENSE
index 764c22e5..0bcb3ed1 100644
--- a/LICENSE
+++ b/LICENSE
@@ -20,3 +20,204 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/src/so_vits_svc_fork/models.py b/src/so_vits_svc_fork/models.py
index 2dc3db7c..fd2da327 100644
--- a/src/so_vits_svc_fork/models.py
+++ b/src/so_vits_svc_fork/models.py
@@ -1,3 +1,6 @@
+from logging import getLogger
+from typing import Literal
+
 import torch
 from torch import nn
 from torch.nn import Conv1d, Conv2d
@@ -13,6 +16,8 @@
 from .utils import f0_to_coarse
 from .vdecoder.hifigan.models import Generator
 
+LOG = getLogger(__name__)
+
 
 class ResidualCouplingBlock(nn.Module):
     def __init__(
@@ -392,7 +397,11 @@ def __init__(
         ssl_dim,
         n_speakers,
         sampling_rate=44100,
-        **kwargs
+        type_: Literal["hifi-gan", "istft", "ms-istft", "mb-istft"] = "hifi-gan",
+        gen_istft_n_fft: int = 16,
+        gen_istft_hop_size: int = 4,
+        subbands: bool = False,
+        **kwargs,
     ):
         super().__init__()
         self.spec_channels = spec_channels
@@ -436,7 +445,29 @@ def __init__(
             "upsample_kernel_sizes": upsample_kernel_sizes,
             "gin_channels": gin_channels,
         }
-        self.dec = Generator(h=hps)
+
+        LOG.info(f"Decoder type: {type_}")
+        if type_ == "hifi-gan":
+            self.dec = Generator(h=hps)
+            self.mb = False
+        else:
+            from .vdecoder.mb_istft.generators import (
+                Multiband_iSTFT_Generator,
+                Multistream_iSTFT_Generator,
+                iSTFT_Generator,
+            )
+
+            # gen_istft_n_fft, gen_istft_hop_size, subbands
+            if type_ == "istft":
+                self.dec = iSTFT_Generator(**hps)
+            elif type_ == "ms-istft":
+                self.dec = Multistream_iSTFT_Generator(**hps)
+            elif type_ == "mb-istft":
+                self.dec = Multiband_iSTFT_Generator(**hps)
+            else:
+                raise ValueError(f"Unknown type: {type_}")
+            self.mb = True
+
         self.enc_q = Encoder(
             spec_channels,
             inter_channels,
@@ -485,10 +516,15 @@ def forward(self, c, f0, uv, spec, g=None, c_lengths=None, spec_lengths=None):
         )
 
         # nsf decoder
-        o = self.dec(z_slice, g=g, f0=pitch_slice)
-
+        # MB-iSTFT-VITS decoders return (audio, subband audio) and take no f0
+        if self.mb:
+            o, o_mb = self.dec(z_slice, g=g)
+        else:
+            o = self.dec(z_slice, g=g, f0=pitch_slice)
+            o_mb = None
         return (
             o,
+            o_mb,
             ids_slice,
             spec_mask,
             (z, z_p, m_p, logs_p, m_q, logs_q),
@@ -515,5 +551,10 @@ def infer(self, c, f0, uv, g=None, noice_scale=0.35, predict_f0=False):
             x, x_mask, f0=f0_to_coarse(f0), noice_scale=noice_scale
         )
         z = self.flow(z_p, c_mask, g=g, reverse=True)
-        o = self.dec(z * c_mask, g=g, f0=f0)
+
+        # MB-iSTFT-VITS
+        if self.mb:
+            o, o_mb = self.dec(z * c_mask, g=g)
+        else:
+            o = self.dec(z * c_mask, g=g, f0=f0)
         return o
diff --git a/src/so_vits_svc_fork/train.py b/src/so_vits_svc_fork/train.py
index ec09859c..515d0ebc 100644
--- a/src/so_vits_svc_fork/train.py
+++ b/src/so_vits_svc_fork/train.py
@@ -267,6 +267,17 @@ def train_and_evaluate(
                 loss_gen, losses_gen = generator_loss(y_d_hat_g)
                 loss_lf0 = F.mse_loss(pred_lf0, lf0)
                 loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl + loss_lf0
+
+                # MB-iSTFT-VITS
+                loss_subband = torch.tensor(0.0)
+                if hps.model.__dict__.get("type_") == "mb-istft-vits":
+                    from .vdecoder.mb_istft.loss import subband_stft_loss
+                    from .vdecoder.mb_istft.pqmf import PQMF
+
+                    y_mb = PQMF(y.device).analysis(y)
+                    loss_subband = subband_stft_loss(hps, y_mb, y_hat)
+                loss_gen_all += loss_subband
+
         optim_g.zero_grad()
         scaler.scale(loss_gen_all).backward()
         scaler.unscale_(optim_g)
@@ -277,15 +288,21 @@ def train_and_evaluate(
         if rank == 0:
             if global_step % hps.train.log_interval == 0:
                 lr = optim_g.param_groups[0]["lr"]
-                losses = [loss_disc, loss_gen, loss_fm, loss_mel, loss_kl]
+                losses = {
+                    "discriminator": loss_disc.item(),
+                    "generator": loss_gen.item(),
+                    "feature_matching": loss_fm.item(),
+                    "melspectrogram": loss_mel.item(),
+                    "kl_divergence": loss_kl.item(),
+                }
+                if hps.model.__dict__.get("type_") == "mb-istft-vits":
+                    losses["subband_stft"] = loss_subband.item()
                 LOG.info(
                     "Train Epoch: {} [{:.0f}%]".format(
                         epoch, 100.0 * batch_idx / len(train_loader)
                     )
                 )
-                LOG.info(
-                    f"Losses: {[x.item() for x in losses]}, step: {global_step}, lr: {lr}"
-                )
+                LOG.info(f"Losses: {losses}, step: {global_step}, lr: {lr}")
 
                 scalar_dict = {
                     "loss/g/total": loss_gen_all,
@@ -300,6 +317,8 @@ def train_and_evaluate(
                         "loss/g/mel": loss_mel,
                         "loss/g/kl": loss_kl,
                         "loss/g/lf0": loss_lf0,
+                        # MB-iSTFT-VITS
+                        "loss/g/subband": loss_subband,
                     }
                 )
 
diff --git a/src/so_vits_svc_fork/vdecoder/mb_istft/__init__.py b/src/so_vits_svc_fork/vdecoder/mb_istft/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/so_vits_svc_fork/vdecoder/mb_istft/generators.py b/src/so_vits_svc_fork/vdecoder/mb_istft/generators.py
new file mode 100644
index 00000000..c5e469a4
--- /dev/null
+++ b/src/so_vits_svc_fork/vdecoder/mb_istft/generators.py
@@ -0,0 +1,374 @@
+import math
+
+import torch
+from torch import nn
+from torch.nn import Conv1d, ConvTranspose1d
+from torch.nn import functional as F
+from torch.nn.utils import remove_weight_norm, weight_norm
+
+from ...modules import modules
+from ...modules.commons import get_padding, init_weights
+from .pqmf import PQMF
+from .stft import TorchSTFT
+
+
+class iSTFT_Generator(torch.nn.Module):  # conv upsampler + a single inverse-STFT head
+    def __init__(
+        self,
+        initial_channel,
+        resblock,
+        resblock_kernel_sizes,
+        resblock_dilation_sizes,
+        upsample_rates,
+        upsample_initial_channel,
+        upsample_kernel_sizes,
+        gen_istft_n_fft,
+        gen_istft_hop_size,
+        gin_channels=0,  # accepted but never read in this class
+    ):
+        super().__init__()
+        # n_fft is also used as the iSTFT window length (see TorchSTFT call below)
+        self.gen_istft_n_fft = gen_istft_n_fft
+        self.gen_istft_hop_size = gen_istft_hop_size
+
+        self.num_kernels = len(resblock_kernel_sizes)
+        self.num_upsamples = len(upsample_rates)
+        self.conv_pre = weight_norm(
+            Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3)
+        )
+        resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
+
+        self.ups = nn.ModuleList()
+        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
+            self.ups.append(
+                weight_norm(
+                    ConvTranspose1d(
+                        upsample_initial_channel // (2**i),
+                        upsample_initial_channel // (2 ** (i + 1)),  # halves per stage
+                        k,
+                        u,
+                        padding=(k - u) // 2,
+                    )
+                )
+            )
+
+        self.resblocks = nn.ModuleList()
+        for i in range(len(self.ups)):
+            ch = upsample_initial_channel // (2 ** (i + 1))
+            for j, (k, d) in enumerate(
+                zip(resblock_kernel_sizes, resblock_dilation_sizes)
+            ):
+                self.resblocks.append(resblock(ch, k, d))  # num_kernels per stage
+
+        self.post_n_fft = self.gen_istft_n_fft
+        self.conv_post = weight_norm(Conv1d(ch, self.post_n_fft + 2, 7, 1, padding=3))
+        self.ups.apply(init_weights)
+        self.conv_post.apply(init_weights)
+        self.reflection_pad = torch.nn.ReflectionPad1d((1, 0))  # one step, left side only
+        self.stft = TorchSTFT(
+            filter_length=self.gen_istft_n_fft,
+            hop_length=self.gen_istft_hop_size,
+            win_length=self.gen_istft_n_fft,
+        )
+
+    def forward(self, x, g=None):  # g is ignored; returns (iSTFT output, None)
+        x = self.conv_pre(x)
+        for i in range(self.num_upsamples):
+            x = F.leaky_relu(x, modules.LRELU_SLOPE)
+            x = self.ups[i](x)
+            xs = None
+            for j in range(self.num_kernels):  # sum the resblocks belonging to stage i
+                if xs is None:
+                    xs = self.resblocks[i * self.num_kernels + j](x)
+                else:
+                    xs += self.resblocks[i * self.num_kernels + j](x)
+            x = xs / self.num_kernels  # ...and average them
+        x = F.leaky_relu(x)  # default slope here, not modules.LRELU_SLOPE
+        x = self.reflection_pad(x)
+        x = self.conv_post(x)
+        spec = torch.exp(x[:, : self.post_n_fft // 2 + 1, :])  # first channels -> magnitude > 0
+        phase = math.pi * torch.sin(x[:, self.post_n_fft // 2 + 1 :, :])  # rest -> [-pi, pi]
+        out = self.stft.inverse(spec, phase).to(x.device)
+        return out, None
+
+    def remove_weight_norm(self):  # strips ups, resblocks, conv_pre and conv_post
+        print("Removing weight norm...")
+        for l in self.ups:
+            remove_weight_norm(l)
+        for l in self.resblocks:
+            l.remove_weight_norm()
+        remove_weight_norm(self.conv_pre)
+        remove_weight_norm(self.conv_post)
+
+
+class Multiband_iSTFT_Generator(torch.nn.Module):
+    """Conv upsampler predicting per-subband STFTs, merged by PQMF synthesis."""
+    def __init__(
+        self,
+        initial_channel,
+        resblock,
+        resblock_kernel_sizes,
+        resblock_dilation_sizes,
+        upsample_rates,
+        upsample_initial_channel,
+        upsample_kernel_sizes,
+        gen_istft_n_fft,
+        gen_istft_hop_size,
+        subbands,
+        gin_channels=0,  # accepted but never read in this class
+    ):
+        super().__init__()
+        self.subbands = subbands
+        self.num_kernels = len(resblock_kernel_sizes)
+        self.num_upsamples = len(upsample_rates)
+        self.conv_pre = weight_norm(
+            Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3)
+        )
+        resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
+
+        self.ups = nn.ModuleList()
+        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
+            self.ups.append(
+                weight_norm(
+                    ConvTranspose1d(
+                        upsample_initial_channel // (2**i),
+                        upsample_initial_channel // (2 ** (i + 1)),
+                        k,
+                        u,
+                        padding=(k - u) // 2,
+                    )
+                )
+            )
+
+        self.resblocks = nn.ModuleList()
+        for i in range(len(self.ups)):
+            ch = upsample_initial_channel // (2 ** (i + 1))
+            for j, (k, d) in enumerate(
+                zip(resblock_kernel_sizes, resblock_dilation_sizes)
+            ):
+                self.resblocks.append(resblock(ch, k, d))
+
+        self.post_n_fft = gen_istft_n_fft
+        self.ups.apply(init_weights)
+        self.reflection_pad = torch.nn.ReflectionPad1d((1, 0))
+        self.reshape_pixelshuffle = []
+
+        self.subband_conv_post = weight_norm(
+            Conv1d(ch, self.subbands * (self.post_n_fft + 2), 7, 1, padding=3)
+        )
+
+        self.subband_conv_post.apply(init_weights)
+
+        self.gen_istft_n_fft = gen_istft_n_fft
+        self.gen_istft_hop_size = gen_istft_hop_size
+
+    def forward(self, x, g=None):
+        """Return (full-band audio, per-subband audio) for latent x; g is unused."""
+        stft = TorchSTFT(
+            filter_length=self.gen_istft_n_fft,
+            hop_length=self.gen_istft_hop_size,
+            win_length=self.gen_istft_n_fft,
+        ).to(x.device)
+        pqmf = PQMF(x.device, subbands=self.subbands)  # bank must match our band count
+
+        x = self.conv_pre(x)  # [B, ch, length]
+        for i in range(self.num_upsamples):
+            x = F.leaky_relu(x, modules.LRELU_SLOPE)
+            x = self.ups[i](x)
+
+            xs = None
+            for j in range(self.num_kernels):
+                if xs is None:
+                    xs = self.resblocks[i * self.num_kernels + j](x)
+                else:
+                    xs += self.resblocks[i * self.num_kernels + j](x)
+            x = xs / self.num_kernels
+
+        x = F.leaky_relu(x)
+        x = self.reflection_pad(x)
+        x = self.subband_conv_post(x)
+        x = torch.reshape(
+            x, (x.shape[0], self.subbands, x.shape[1] // self.subbands, x.shape[-1])
+        )
+
+        spec = torch.exp(x[:, :, : self.post_n_fft // 2 + 1, :])
+        phase = math.pi * torch.sin(x[:, :, self.post_n_fft // 2 + 1 :, :])
+
+        y_mb_hat = stft.inverse(
+            torch.reshape(
+                spec,
+                (
+                    spec.shape[0] * self.subbands,
+                    self.gen_istft_n_fft // 2 + 1,
+                    spec.shape[-1],
+                ),
+            ),
+            torch.reshape(
+                phase,
+                (
+                    phase.shape[0] * self.subbands,
+                    self.gen_istft_n_fft // 2 + 1,
+                    phase.shape[-1],
+                ),
+            ),
+        )
+        y_mb_hat = torch.reshape(
+            y_mb_hat, (x.shape[0], self.subbands, 1, y_mb_hat.shape[-1])
+        )
+        y_mb_hat = y_mb_hat.squeeze(-2)
+
+        y_g_hat = pqmf.synthesis(y_mb_hat)
+
+        return y_g_hat, y_mb_hat
+
+    def remove_weight_norm(self):
+        print("Removing weight norm...")
+        for l in (*self.ups, self.conv_pre, self.subband_conv_post):
+            remove_weight_norm(l)
+        for l in self.resblocks:
+            l.remove_weight_norm()
+
+
+class Multistream_iSTFT_Generator(torch.nn.Module):
+    """Conv upsampler predicting per-subband STFTs, mixed by a learned conv."""
+    def __init__(
+        self,
+        initial_channel,
+        resblock,
+        resblock_kernel_sizes,
+        resblock_dilation_sizes,
+        upsample_rates,
+        upsample_initial_channel,
+        upsample_kernel_sizes,
+        gen_istft_n_fft,
+        gen_istft_hop_size,
+        subbands,
+        gin_channels=0,  # accepted but never read in this class
+    ):
+        super().__init__()
+        self.subbands = subbands
+        self.num_kernels = len(resblock_kernel_sizes)
+        self.num_upsamples = len(upsample_rates)
+        self.conv_pre = weight_norm(
+            Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3)
+        )
+        resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
+
+        self.ups = nn.ModuleList()
+        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
+            self.ups.append(
+                weight_norm(
+                    ConvTranspose1d(
+                        upsample_initial_channel // (2**i),
+                        upsample_initial_channel // (2 ** (i + 1)),
+                        k,
+                        u,
+                        padding=(k - u) // 2,
+                    )
+                )
+            )
+
+        self.resblocks = nn.ModuleList()
+        for i in range(len(self.ups)):
+            ch = upsample_initial_channel // (2 ** (i + 1))
+            for j, (k, d) in enumerate(
+                zip(resblock_kernel_sizes, resblock_dilation_sizes)
+            ):
+                self.resblocks.append(resblock(ch, k, d))
+
+        self.post_n_fft = gen_istft_n_fft
+        self.ups.apply(init_weights)
+        self.reflection_pad = torch.nn.ReflectionPad1d((1, 0))
+        self.reshape_pixelshuffle = []
+
+        self.subband_conv_post = weight_norm(
+            Conv1d(ch, self.subbands * (self.post_n_fft + 2), 7, 1, padding=3)
+        )
+        self.subband_conv_post.apply(init_weights)
+
+        self.gen_istft_n_fft = gen_istft_n_fft
+        self.gen_istft_hop_size = gen_istft_hop_size
+
+        updown_filter = torch.zeros(
+            (self.subbands, self.subbands, self.subbands)
+        ).float()
+        for k in range(self.subbands):
+            updown_filter[k, k, 0] = 1.0
+        self.register_buffer("updown_filter", updown_filter)
+        # the mixing conv sees one channel per subband (was hard-coded to 4)
+        pad = get_padding(63, 1)
+        self.multistream_conv_post = weight_norm(
+            Conv1d(self.subbands, 1, kernel_size=63, bias=False, padding=pad)
+        )
+        self.multistream_conv_post.apply(init_weights)
+
+    def forward(self, x, g=None):
+        """Return (mixed audio, upsampled subband audio) for latent x; g is unused."""
+        stft = TorchSTFT(
+            filter_length=self.gen_istft_n_fft,
+            hop_length=self.gen_istft_hop_size,
+            win_length=self.gen_istft_n_fft,
+        ).to(x.device)
+
+        x = self.conv_pre(x)  # [B, ch, length]
+        for i in range(self.num_upsamples):
+            x = F.leaky_relu(x, modules.LRELU_SLOPE)
+            x = self.ups[i](x)
+            xs = None
+            for j in range(self.num_kernels):
+                if xs is None:
+                    xs = self.resblocks[i * self.num_kernels + j](x)
+                else:
+                    xs += self.resblocks[i * self.num_kernels + j](x)
+            x = xs / self.num_kernels
+
+        x = F.leaky_relu(x)
+        x = self.reflection_pad(x)
+        x = self.subband_conv_post(x)
+        x = torch.reshape(
+            x, (x.shape[0], self.subbands, x.shape[1] // self.subbands, x.shape[-1])
+        )
+
+        spec = torch.exp(x[:, :, : self.post_n_fft // 2 + 1, :])
+        phase = math.pi * torch.sin(x[:, :, self.post_n_fft // 2 + 1 :, :])
+
+        y_mb_hat = stft.inverse(
+            torch.reshape(
+                spec,
+                (
+                    spec.shape[0] * self.subbands,
+                    self.gen_istft_n_fft // 2 + 1,
+                    spec.shape[-1],
+                ),
+            ),
+            torch.reshape(
+                phase,
+                (
+                    phase.shape[0] * self.subbands,
+                    self.gen_istft_n_fft // 2 + 1,
+                    phase.shape[-1],
+                ),
+            ),
+        )
+        y_mb_hat = torch.reshape(
+            y_mb_hat, (x.shape[0], self.subbands, 1, y_mb_hat.shape[-1])
+        )
+        y_mb_hat = y_mb_hat.squeeze(-2)
+
+        y_mb_hat = F.conv_transpose1d(
+            y_mb_hat,
+            self.updown_filter.to(x.device) * self.subbands,  # .cuda() broke CPU runs
+            stride=self.subbands,
+        )
+
+        y_g_hat = self.multistream_conv_post(y_mb_hat)
+
+        return y_g_hat, y_mb_hat
+
+    def remove_weight_norm(self):
+        print("Removing weight norm...")
+        posts = (self.conv_pre, self.subband_conv_post, self.multistream_conv_post)
+        for l in (*self.ups, *posts):
+            remove_weight_norm(l)
+        for l in self.resblocks:
+            l.remove_weight_norm()
diff --git a/src/so_vits_svc_fork/vdecoder/mb_istft/loss.py b/src/so_vits_svc_fork/vdecoder/mb_istft/loss.py
new file mode 100644
index 00000000..9895befd
--- /dev/null
+++ b/src/so_vits_svc_fork/vdecoder/mb_istft/loss.py
@@ -0,0 +1,11 @@
+from .stft_loss import MultiResolutionSTFTLoss
+
+
+def subband_stft_loss(h, y_mb, y_hat_mb):
+    """Multi-resolution STFT loss between target and predicted subband signals."""
+    cfg = h.train
+    criterion = MultiResolutionSTFTLoss(cfg.fft_sizes, cfg.hop_sizes, cfg.win_lengths)
+    target = y_mb.view(-1, y_mb.size(2))  # flatten leading dims into one batch axis
+    predicted = y_hat_mb.view(-1, y_hat_mb.size(2))[:, : target.size(-1)]
+    sc_term, mag_term = criterion(predicted, target)
+    return sc_term + mag_term
diff --git a/src/so_vits_svc_fork/vdecoder/mb_istft/pqmf.py b/src/so_vits_svc_fork/vdecoder/mb_istft/pqmf.py
new file mode 100644
index 00000000..987dde8e
--- /dev/null
+++ b/src/so_vits_svc_fork/vdecoder/mb_istft/pqmf.py
@@ -0,0 +1,130 @@
+# Copyright 2020 Tomoki Hayashi
+#  MIT License (https://opensource.org/licenses/MIT)
+
+"""Pseudo QMF modules."""
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from scipy.signal.windows import kaiser
+
+
+def design_prototype_filter(taps=62, cutoff_ratio=0.15, beta=9.0):
+    """Build the Kaiser-windowed low-pass prototype used by the PQMF bank.
+    Follows `A Kaiser window approach for the design of prototype
+    filters of cosine modulated filterbanks`_.
+    Args:
+        taps (int): Filter order; must be even.
+        cutoff_ratio (float): Cut-off frequency ratio, strictly between 0 and 1.
+        beta (float): Shape parameter handed to the kaiser window.
+    Returns:
+        ndarray: Impulse response of the prototype filter, shape (taps + 1,).
+    .. _`A Kaiser window approach for the design of prototype filters of cosine modulated filterbanks`:
+        https://ieeexplore.ieee.org/abstract/document/681427
+    """
+    # validate arguments first
+    assert taps % 2 == 0, "The number of taps mush be even number."
+    assert 0.0 < cutoff_ratio < 1.0, "Cutoff ratio must be > 0.0 and < 1.0."
+
+    # sample positions centred on the middle tap
+    offsets = np.arange(taps + 1) - 0.5 * taps
+    with np.errstate(invalid="ignore"):
+        # truncated ideal low-pass response; the centre tap evaluates to 0/0
+        ideal = np.sin(np.pi * cutoff_ratio * offsets) / (np.pi * offsets)
+    # patch the centre tap with its limit value
+    ideal[taps // 2] = np.cos(0) * cutoff_ratio
+
+    # taper with a kaiser window of matching length
+    window = kaiser(taps + 1, beta)
+    tapered = ideal * window
+
+    return tapered
+
+
+class PQMF(torch.nn.Module):
+    """PQMF module.
+    This module is based on `Near-perfect-reconstruction pseudo-QMF banks`_.
+    .. _`Near-perfect-reconstruction pseudo-QMF banks`:
+        https://ieeexplore.ieee.org/document/258122
+    """
+
+    def __init__(self, device, subbands=4, taps=62, cutoff_ratio=0.15, beta=9.0):
+        """Initialize PQMF module.
+        Args:
+            device (torch.device | str): Device the filter tensors are created on.
+            subbands (int): The number of subbands.
+            taps (int): The number of filter taps.
+            cutoff_ratio (float): Cut-off frequency ratio.
+            beta (float): Beta coefficient for kaiser window.
+        """
+        super().__init__()
+
+        # define filter coefficient
+        h_proto = design_prototype_filter(taps, cutoff_ratio, beta)
+        h_analysis = np.zeros((subbands, len(h_proto)))
+        h_synthesis = np.zeros((subbands, len(h_proto)))
+        for k in range(subbands):
+            h_analysis[k] = (
+                2
+                * h_proto
+                * np.cos(
+                    (2 * k + 1)
+                    * (np.pi / (2 * subbands))
+                    * (np.arange(taps + 1) - ((taps - 1) / 2))
+                    + (-1) ** k * np.pi / 4
+                )
+            )
+            h_synthesis[k] = (
+                2
+                * h_proto
+                * np.cos(
+                    (2 * k + 1)
+                    * (np.pi / (2 * subbands))
+                    * (np.arange(taps + 1) - ((taps - 1) / 2))
+                    - (-1) ** k * np.pi / 4
+                )
+            )
+
+        # convert to tensor on the requested device; .to() is used instead of
+        # .cuda() so that CPU devices work as well
+        analysis_filter = torch.from_numpy(h_analysis).float().unsqueeze(1).to(device)
+        synthesis_filter = torch.from_numpy(h_synthesis).float().unsqueeze(0).to(device)
+
+        # register coefficients as buffer
+        self.register_buffer("analysis_filter", analysis_filter)
+        self.register_buffer("synthesis_filter", synthesis_filter)
+
+        # filter for downsampling & upsampling
+        updown_filter = torch.zeros((subbands, subbands, subbands)).float().to(device)
+        for k in range(subbands):
+            updown_filter[k, k, 0] = 1.0
+        self.register_buffer("updown_filter", updown_filter)
+        self.subbands = subbands
+
+        # keep padding info
+        self.pad_fn = torch.nn.ConstantPad1d(taps // 2, 0.0)
+
+    def analysis(self, x):
+        """Analysis with PQMF.
+        Args:
+            x (Tensor): Input tensor (B, 1, T).
+        Returns:
+            Tensor: Output tensor (B, subbands, T // subbands).
+        """
+        x = F.conv1d(self.pad_fn(x), self.analysis_filter)
+        return F.conv1d(x, self.updown_filter, stride=self.subbands)
+
+    def synthesis(self, x):
+        """Synthesis with PQMF.
+        Args:
+            x (Tensor): Input tensor (B, subbands, T // subbands).
+        Returns:
+            Tensor: Output tensor (B, 1, T).
+        """
+        # NOTE(kan-bayashi): Power will be decreased so here multiply by # subbands.
+        #   Not sure this is the correct way, it is better to check again.
+        # TODO(kan-bayashi): Understand the reconstruction procedure
+        x = F.conv_transpose1d(
+            x, self.updown_filter * self.subbands, stride=self.subbands
+        )
+        return F.conv1d(self.pad_fn(x), self.synthesis_filter)
diff --git a/src/so_vits_svc_fork/vdecoder/mb_istft/stft.py b/src/so_vits_svc_fork/vdecoder/mb_istft/stft.py
new file mode 100644
index 00000000..8a111dca
--- /dev/null
+++ b/src/so_vits_svc_fork/vdecoder/mb_istft/stft.py
@@ -0,0 +1,248 @@
+"""
+BSD 3-Clause License
+Copyright (c) 2017, Prem Seetharaman
+All rights reserved.
+* Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright notice,
+  this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright notice, this
+  list of conditions and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+* Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from this
+  software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+"""
+
+import librosa.util as librosa_util
+import numpy as np
+import torch
+import torch.nn.functional as F
+from librosa.util import pad_center, tiny
+from scipy.signal import get_window
+from torch.autograd import Variable
+
+
+def window_sumsquare(
+    window,
+    n_frames,
+    hop_length=200,
+    win_length=800,
+    n_fft=800,
+    dtype=np.float32,
+    norm=None,
+):
+    """
+    # from librosa 0.6
+    Compute the sum-square envelope of a window function at a given hop length.
+    This is used to estimate modulation effects induced by windowing
+    observations in short-time fourier transforms.
+    Parameters
+    ----------
+    window : string, tuple, number, callable, or list-like
+        Window specification, as in `get_window`
+    n_frames : int > 0
+        The number of analysis frames
+    hop_length : int > 0
+        The number of samples to advance between frames
+    win_length : [optional]
+        The length of the window function.  By default, this matches `n_fft`.
+    n_fft : int > 0
+        The length of each analysis frame.
+    dtype : np.dtype
+        The data type of the output
+    norm : forwarded to `librosa.util.normalize` as its `norm` argument
+    Returns
+    -------
+    wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
+        The sum-squared envelope of the window function
+    """
+    if win_length is None:
+        win_length = n_fft
+
+    n = n_fft + hop_length * (n_frames - 1)
+    x = np.zeros(n, dtype=dtype)
+    # Compute the squared window at the desired length
+    win_sq = get_window(window, win_length, fftbins=True)
+    win_sq = librosa_util.normalize(win_sq, norm=norm) ** 2
+    win_sq = librosa_util.pad_center(win_sq, size=n_fft)  # kw-only in librosa>=0.10
+
+    # Fill the envelope
+    for i in range(n_frames):
+        sample = i * hop_length
+        x[sample : min(n, sample + n_fft)] += win_sq[: max(0, min(n_fft, n - sample))]
+    return x
+
+
+class STFT(torch.nn.Module):
+    """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft"""
+
+    def __init__(
+        self, filter_length=800, hop_length=200, win_length=800, window="hann"
+    ):
+        super().__init__()
+        self.filter_length = filter_length
+        self.hop_length = hop_length
+        self.win_length = win_length
+        self.window = window
+        self.forward_transform = None
+        scale = self.filter_length / self.hop_length
+        fourier_basis = np.fft.fft(np.eye(self.filter_length))
+
+        cutoff = int(self.filter_length / 2 + 1)
+        fourier_basis = np.vstack(
+            [np.real(fourier_basis[:cutoff, :]), np.imag(fourier_basis[:cutoff, :])]
+        )
+
+        forward_basis = torch.FloatTensor(fourier_basis[:, None, :])
+        inverse_basis = torch.FloatTensor(
+            np.linalg.pinv(scale * fourier_basis).T[:, None, :]
+        )
+
+        if window is not None:
+            assert filter_length >= win_length
+            # get window and zero center pad it to filter_length
+            # (`size` has to be passed by keyword on librosa >= 0.10)
+            fft_window = get_window(window, win_length, fftbins=True)
+            fft_window = pad_center(fft_window, size=filter_length)
+            fft_window = torch.from_numpy(fft_window).float()
+
+            # window the bases
+            forward_basis *= fft_window
+            inverse_basis *= fft_window
+
+        self.register_buffer("forward_basis", forward_basis.float())
+        self.register_buffer("inverse_basis", inverse_basis.float())
+
+    def transform(self, input_data):
+        """Return (magnitude, phase) of input_data, a (batch, samples) tensor."""
+        num_batches = input_data.size(0)
+        num_samples = input_data.size(1)
+        self.num_samples = num_samples
+
+        # similar to librosa, reflect-pad the input
+        input_data = input_data.view(num_batches, 1, num_samples)
+        input_data = F.pad(
+            input_data.unsqueeze(1),
+            (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0),
+            mode="reflect",
+        )
+        input_data = input_data.squeeze(1)
+
+        forward_transform = F.conv1d(
+            input_data,
+            Variable(self.forward_basis, requires_grad=False),
+            stride=self.hop_length,
+            padding=0,
+        )
+
+        cutoff = int((self.filter_length / 2) + 1)
+        real_part = forward_transform[:, :cutoff, :]
+        imag_part = forward_transform[:, cutoff:, :]
+
+        magnitude = torch.sqrt(real_part**2 + imag_part**2)
+        phase = torch.autograd.Variable(torch.atan2(imag_part.data, real_part.data))
+
+        return magnitude, phase
+
+    def inverse(self, magnitude, phase):
+        """Rebuild a (batch, 1, samples) signal from magnitude and phase."""
+        recombine_magnitude_phase = torch.cat(
+            [magnitude * torch.cos(phase), magnitude * torch.sin(phase)], dim=1
+        )
+
+        inverse_transform = F.conv_transpose1d(
+            recombine_magnitude_phase,
+            Variable(self.inverse_basis, requires_grad=False),
+            stride=self.hop_length,
+            padding=0,
+        )
+
+        if self.window is not None:
+            window_sum = window_sumsquare(
+                self.window,
+                magnitude.size(-1),
+                hop_length=self.hop_length,
+                win_length=self.win_length,
+                n_fft=self.filter_length,
+                dtype=np.float32,
+            )
+            # remove modulation effects
+            approx_nonzero_indices = torch.from_numpy(
+                np.where(window_sum > tiny(window_sum))[0]
+            )
+            window_sum = torch.autograd.Variable(
+                torch.from_numpy(window_sum), requires_grad=False
+            )
+            # `device` is an attribute (calling it raised TypeError on CUDA);
+            # .to() is a no-op when the tensor is already on that device
+            window_sum = window_sum.to(inverse_transform.device)
+            inverse_transform[:, :, approx_nonzero_indices] /= window_sum[
+                approx_nonzero_indices
+            ]
+
+            # scale by hop ratio
+            inverse_transform *= float(self.filter_length) / self.hop_length
+
+        inverse_transform = inverse_transform[:, :, int(self.filter_length / 2) :]
+        inverse_transform = inverse_transform[:, :, : -int(self.filter_length / 2) :]
+
+        return inverse_transform
+
+    def forward(self, input_data):
+        self.magnitude, self.phase = self.transform(input_data)
+        reconstruction = self.inverse(self.magnitude, self.phase)
+        return reconstruction
+
+
+class TorchSTFT(torch.nn.Module):
+    """STFT / inverse STFT wrapper around torch.stft and torch.istft."""
+    def __init__(
+        self, filter_length=800, hop_length=200, win_length=800, window="hann"
+    ):
+        super().__init__()
+        self.filter_length = filter_length
+        self.hop_length = hop_length
+        self.win_length = win_length
+        self.window = torch.from_numpy(  # plain attribute: moved to the data per call
+            get_window(window, win_length, fftbins=True).astype(np.float32)
+        )
+
+    def transform(self, input_data):
+        """Return (magnitude, phase) of the STFT of input_data."""
+        forward_transform = torch.stft(
+            input_data,
+            self.filter_length,
+            self.hop_length,
+            self.win_length,
+            window=self.window.to(input_data.device),
+            return_complex=True,
+        )
+        return torch.abs(forward_transform), torch.angle(forward_transform)
+
+    def inverse(self, magnitude, phase):
+        """Return the inverse STFT with a channel axis inserted at dim -2."""
+        inverse_transform = torch.istft(
+            magnitude * torch.exp(phase * 1j),
+            self.filter_length,
+            self.hop_length,
+            self.win_length,
+            window=self.window.to(magnitude.device),
+        )
+        # unsqueeze to stay consistent with conv_transpose1d implementation
+        return inverse_transform.unsqueeze(-2)
+
+    def forward(self, input_data):
+        self.magnitude, self.phase = self.transform(input_data)
+        reconstruction = self.inverse(self.magnitude, self.phase)
+        return reconstruction
diff --git a/src/so_vits_svc_fork/vdecoder/mb_istft/stft_loss.py b/src/so_vits_svc_fork/vdecoder/mb_istft/stft_loss.py
new file mode 100644
index 00000000..ac30172d
--- /dev/null
+++ b/src/so_vits_svc_fork/vdecoder/mb_istft/stft_loss.py
@@ -0,0 +1,140 @@
+# Copyright 2019 Tomoki Hayashi
+#  MIT License (https://opensource.org/licenses/MIT)
+
+"""STFT-based Loss modules."""
+
+import torch
+import torch.nn.functional as F
+
+
+def stft(x, fft_size, hop_size, win_length, window):
+    """Perform STFT and convert to magnitude spectrogram.
+    Args:
+        x (Tensor): Input signal tensor (B, T).
+        fft_size (int): FFT size.
+        hop_size (int): Hop size.
+        win_length (int): Window length.
+        window (Tensor): Window tensor of length win_length.
+    Returns:
+        Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1).
+    """
+    x_stft = torch.stft(x, fft_size, hop_size, win_length, window.to(x.device))
+    real = x_stft[..., 0]
+    imag = x_stft[..., 1]
+
+    # NOTE(kan-bayashi): clamp is needed to avoid nan or inf
+    return torch.sqrt(torch.clamp(real**2 + imag**2, min=1e-7)).transpose(2, 1)
+
+
+class SpectralConvergengeLoss(torch.nn.Module):
+    """Spectral convergence loss module."""
+
+    def __init__(self):
+        """Initialize spectral convergence loss module."""
+        super().__init__()
+
+    def forward(self, x_mag, y_mag):
+        """Calculate forward propagation.
+        Args:
+            x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
+            y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
+        Returns:
+            Tensor: Spectral convergence loss value.
+        """
+        return torch.norm(y_mag - x_mag) / torch.norm(
+            y_mag
+        )  # MB-iSTFT-VITS changed here due to codespell
+
+
+class LogSTFTMagnitudeLoss(torch.nn.Module):
+    """Log STFT magnitude loss module."""
+
+    def __init__(self):
+        """Initialize log STFT magnitude loss module."""
+        super().__init__()
+
+    def forward(self, x_mag, y_mag):
+        """Calculate forward propagation.
+        Args:
+            x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
+            y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
+        Returns:
+            Tensor: Log STFT magnitude loss value.
+        """
+        return F.l1_loss(torch.log(y_mag), torch.log(x_mag))
+
+
+class STFTLoss(torch.nn.Module):
+    """STFT loss module."""
+
+    def __init__(
+        self, fft_size=1024, shift_size=120, win_length=600, window="hann_window"
+    ):
+        """Initialize STFT loss module."""
+        super().__init__()
+        self.fft_size = fft_size
+        self.shift_size = shift_size
+        self.win_length = win_length
+        self.window = getattr(torch, window)(win_length)
+        self.spectral_convergenge_loss = SpectralConvergengeLoss()
+        self.log_stft_magnitude_loss = LogSTFTMagnitudeLoss()
+
+    def forward(self, x, y):
+        """Calculate forward propagation.
+        Args:
+            x (Tensor): Predicted signal (B, T).
+            y (Tensor): Groundtruth signal (B, T).
+        Returns:
+            Tensor: Spectral convergence loss value.
+            Tensor: Log STFT magnitude loss value.
+        """
+        x_mag = stft(x, self.fft_size, self.shift_size, self.win_length, self.window)
+        y_mag = stft(y, self.fft_size, self.shift_size, self.win_length, self.window)
+        sc_loss = self.spectral_convergenge_loss(x_mag, y_mag)
+        mag_loss = self.log_stft_magnitude_loss(x_mag, y_mag)
+
+        return sc_loss, mag_loss
+
+
+class MultiResolutionSTFTLoss(torch.nn.Module):
+    """Multi resolution STFT loss module."""
+
+    def __init__(
+        self,
+        fft_sizes=[1024, 2048, 512],
+        hop_sizes=[120, 240, 50],
+        win_lengths=[600, 1200, 240],
+        window="hann_window",
+    ):
+        """Initialize Multi resolution STFT loss module.
+        Args:
+            fft_sizes (list): List of FFT sizes.
+            hop_sizes (list): List of hop sizes.
+            win_lengths (list): List of window lengths.
+            window (str): Window function type.
+        """
+        super().__init__()
+        assert len(fft_sizes) == len(hop_sizes) == len(win_lengths)
+        self.stft_losses = torch.nn.ModuleList()
+        for fs, ss, wl in zip(fft_sizes, hop_sizes, win_lengths):
+            self.stft_losses += [STFTLoss(fs, ss, wl, window)]
+
+    def forward(self, x, y):
+        """Calculate forward propagation.
+        Args:
+            x (Tensor): Predicted signal (B, T).
+            y (Tensor): Groundtruth signal (B, T).
+        Returns:
+            Tensor: Multi resolution spectral convergence loss value.
+            Tensor: Multi resolution log STFT magnitude loss value.
+        """
+        sc_loss = 0.0
+        mag_loss = 0.0
+        for f in self.stft_losses:
+            sc_l, mag_l = f(x, y)
+            sc_loss += sc_l
+            mag_loss += mag_l
+        sc_loss /= len(self.stft_losses)
+        mag_loss /= len(self.stft_losses)
+
+        return sc_loss, mag_loss

From 5b3fb65ff6645daeda69123e569e5ec3c91f433a Mon Sep 17 00:00:00 2001
From: 34j <55338215+34j@users.noreply.github.com>
Date: Fri, 31 Mar 2023 22:08:48 +0900
Subject: [PATCH 05/47] fix(__main__): fix automatic model search algo

---
 src/so_vits_svc_fork/__main__.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/so_vits_svc_fork/__main__.py b/src/so_vits_svc_fork/__main__.py
index b2660bdb..0e639710 100644
--- a/src/so_vits_svc_fork/__main__.py
+++ b/src/so_vits_svc_fork/__main__.py
@@ -222,7 +222,9 @@ def infer(
     output_path = Path(output_path)
     model_path = Path(model_path)
     if model_path.is_dir():
-        model_path = list(sorted(model_path.glob("*.pth")))[-1]
+        model_path = list(
+            sorted(model_path.glob("G_*.pth"), key=lambda x: x.stat().st_mtime)
+        )[-1]
         LOG.info(f"Since model_path is a directory, use {model_path}")
     config_path = Path(config_path)
     if cluster_model_path is not None:
@@ -381,7 +383,9 @@ def vc(
     if cluster_model_path is not None:
         cluster_model_path = Path(cluster_model_path)
     if model_path.is_dir():
-        model_path = list(sorted(model_path.glob("*.pth")))[-1]
+        model_path = list(
+            sorted(model_path.glob("G_*.pth"), key=lambda x: x.stat().st_mtime)
+        )[-1]
         LOG.info(f"Since model_path is a directory, use {model_path}")
 
     realtime(

From 1ce2580148ebcf89e03e4da174ec5c06ab7445c1 Mon Sep 17 00:00:00 2001
From: 34j <55338215+34j@users.noreply.github.com>
Date: Fri, 31 Mar 2023 22:23:34 +0900
Subject: [PATCH 06/47] fix(train): add loss calculation

---
 src/so_vits_svc_fork/train.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/src/so_vits_svc_fork/train.py b/src/so_vits_svc_fork/train.py
index 515d0ebc..e0959410 100644
--- a/src/so_vits_svc_fork/train.py
+++ b/src/so_vits_svc_fork/train.py
@@ -217,6 +217,7 @@ def train_and_evaluate(
         with autocast(enabled=hps.train.fp16_run):
             (
                 y_hat,
+                y_hat_mb,
                 ids_slice,
                 z_mask,
                 (z, z_p, m_p, logs_p, m_q, logs_q),
@@ -270,12 +271,12 @@ def train_and_evaluate(
 
                 # MB-iSTFT-VITS
                 loss_subband = torch.tensor(0.0)
-                if hps.model.__dict__.get("type_") == "mb-istft-vits":
+                if hps.model.type_ == "mb-istft":
                     from .vdecoder.mb_istft.loss import subband_stft_loss
                     from .vdecoder.mb_istft.pqmf import PQMF
 
-                    y_mb = PQMF(y.device).analysis(y)
-                    loss_subband = subband_stft_loss(hps, y_mb, y_hat)
+                    y_mb = PQMF(y.device, hps.model.subbands).analysis(y)
+                    loss_subband = subband_stft_loss(hps, y_mb, y_hat_mb)
                 loss_gen_all += loss_subband
 
         optim_g.zero_grad()
@@ -295,7 +296,7 @@ def train_and_evaluate(
                     "melspectrogram": loss_mel.item(),
                     "kl_divergence": loss_kl.item(),
                 }
-                if hps.model.__dict__.get("type_") == "mb-istft-vits":
+                if hps.model.type_ == "mb-istft":
                     losses["subband_stft"] = loss_subband.item()
                 LOG.info(
                     "Train Epoch: {} [{:.0f}%]".format(
@@ -317,10 +318,10 @@ def train_and_evaluate(
                         "loss/g/mel": loss_mel,
                         "loss/g/kl": loss_kl,
                         "loss/g/lf0": loss_lf0,
-                        # MB-iSTFT-VITS
-                        "loss/g/subband": loss_subband,
                     }
                 )
+                if hps.model.type_ == "mb-istft":
+                    scalar_dict["loss/g/subband"] = loss_subband
 
                 # scalar_dict.update({"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)})
                 # scalar_dict.update({"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)})

From cc463db98cb334ced7e70319c45965bf14d3e9a3 Mon Sep 17 00:00:00 2001
From: 34j <55338215+34j@users.noreply.github.com>
Date: Fri, 31 Mar 2023 22:24:05 +0900
Subject: [PATCH 07/47] fix(infer_tool): half=False by default

---
 src/so_vits_svc_fork/inference/infer_tool.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/so_vits_svc_fork/inference/infer_tool.py b/src/so_vits_svc_fork/inference/infer_tool.py
index 9a2e1c09..462ef3d6 100644
--- a/src/so_vits_svc_fork/inference/infer_tool.py
+++ b/src/so_vits_svc_fork/inference/infer_tool.py
@@ -94,7 +94,7 @@ def __init__(
         config_path: str,
         device: torch.device | str | None = None,
         cluster_model_path: Path | str | None = None,
-        half: bool = True,
+        half: bool = False,
     ):
         self.net_g_path = net_g_path
         if device is None:

From aac151ae6a247e9a8d130cd078c9a8075a30c693 Mon Sep 17 00:00:00 2001
From: 34j <55338215+34j@users.noreply.github.com>
Date: Fri, 31 Mar 2023 22:24:48 +0900
Subject: [PATCH 08/47] fix(stft_loss): return_complex = False

---
 src/so_vits_svc_fork/vdecoder/mb_istft/stft_loss.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/so_vits_svc_fork/vdecoder/mb_istft/stft_loss.py b/src/so_vits_svc_fork/vdecoder/mb_istft/stft_loss.py
index ac30172d..c685cb02 100644
--- a/src/so_vits_svc_fork/vdecoder/mb_istft/stft_loss.py
+++ b/src/so_vits_svc_fork/vdecoder/mb_istft/stft_loss.py
@@ -18,7 +18,9 @@ def stft(x, fft_size, hop_size, win_length, window):
     Returns:
         Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1).
     """
-    x_stft = torch.stft(x, fft_size, hop_size, win_length, window.to(x.device))
+    x_stft = torch.stft(
+        x, fft_size, hop_size, win_length, window.to(x.device), return_complex=False
+    )
     real = x_stft[..., 0]
     imag = x_stft[..., 1]
 

From b84788563d48f3d3da6d08b8897fd109e5d03b2b Mon Sep 17 00:00:00 2001
From: 34j <55338215+34j@users.noreply.github.com>
Date: Fri, 31 Mar 2023 22:27:02 +0900
Subject: [PATCH 09/47] fix: update hps

---
 .../configs_template/config_template.json     | 23 ++++--
 src/so_vits_svc_fork/models.py                | 80 +++++++++++--------
 .../vdecoder/mb_istft/generators.py           |  2 +-
 3 files changed, 65 insertions(+), 40 deletions(-)

diff --git a/src/so_vits_svc_fork/configs_template/config_template.json b/src/so_vits_svc_fork/configs_template/config_template.json
index 45852762..1116c0a8 100644
--- a/src/so_vits_svc_fork/configs_template/config_template.json
+++ b/src/so_vits_svc_fork/configs_template/config_template.json
@@ -7,7 +7,7 @@
     "learning_rate": 0.0001,
     "betas": [0.8, 0.99],
     "eps": 1e-9,
-    "batch_size": 6,
+    "batch_size": 2,
     "fp16_run": false,
     "lr_decay": 0.999875,
     "segment_size": 10240,
@@ -18,7 +18,11 @@
     "use_sr": true,
     "max_speclen": 512,
     "port": "8001",
-    "keep_ckpts": 3
+    "keep_ckpts": 3,
+    "fft_sizes": [384, 683, 171],
+    "hop_sizes": [30, 60, 10],
+    "win_lengths": [150, 300, 60],
+    "window": "hann_window"
   },
   "data": {
     "training_files": "filelists/44k/train.txt",
@@ -47,14 +51,21 @@
       [1, 3, 5],
       [1, 3, 5]
     ],
-    "upsample_rates": [8, 8, 2, 2, 2],
+    "upsample_rates": [4, 4],
     "upsample_initial_channel": 512,
-    "upsample_kernel_sizes": [16, 16, 4, 4, 4],
+    "upsample_kernel_sizes": [16, 16],
     "n_layers_q": 3,
     "use_spectral_norm": false,
     "gin_channels": 256,
     "ssl_dim": 256,
-    "n_speakers": 200
+    "n_speakers": 200,
+    "type_": "mb-istft",
+    "gen_istft_n_fft": 16,
+    "gen_istft_hop_size": 4,
+    "subbands": 8
   },
-  "spk": {}
+  "spk": {
+    "34j": 0,
+    "kiritan": 1
+  }
 }
diff --git a/src/so_vits_svc_fork/models.py b/src/so_vits_svc_fork/models.py
index fd2da327..1d369efd 100644
--- a/src/so_vits_svc_fork/models.py
+++ b/src/so_vits_svc_fork/models.py
@@ -1,5 +1,5 @@
 from logging import getLogger
-from typing import Literal
+from typing import Any, Literal, Sequence
 
 import torch
 from torch import nn
@@ -378,30 +378,30 @@ class SynthesizerTrn(nn.Module):
 
     def __init__(
         self,
-        spec_channels,
-        segment_size,
-        inter_channels,
-        hidden_channels,
-        filter_channels,
-        n_heads,
-        n_layers,
-        kernel_size,
-        p_dropout,
-        resblock,
-        resblock_kernel_sizes,
-        resblock_dilation_sizes,
-        upsample_rates,
-        upsample_initial_channel,
-        upsample_kernel_sizes,
-        gin_channels,
-        ssl_dim,
-        n_speakers,
-        sampling_rate=44100,
+        spec_channels: int,
+        segment_size: int,
+        inter_channels: int,
+        hidden_channels: int,
+        filter_channels: int,
+        n_heads: int,
+        n_layers: int,
+        kernel_size: int,
+        p_dropout: int,
+        resblock: str,
+        resblock_kernel_sizes: Sequence[int],
+        resblock_dilation_sizes: Sequence[Sequence[int]],
+        upsample_rates: Sequence[int],
+        upsample_initial_channel: int,
+        upsample_kernel_sizes: Sequence[int],
+        gin_channels: int,
+        ssl_dim: int,
+        n_speakers: int,
+        sampling_rate: int = 44100,
         type_: Literal["hifi-gan", "istft", "ms-istft", "mb-istft"] = "hifi-gan",
         gen_istft_n_fft: int = 16,
         gen_istft_hop_size: int = 4,
-        subbands: bool = False,
-        **kwargs,
+        subbands: int = 8,
+        **kwargs: Any,
     ):
         super().__init__()
         self.spec_channels = spec_channels
@@ -434,23 +434,36 @@ def __init__(
             kernel_size=kernel_size,
             p_dropout=p_dropout,
         )
-        hps = {
-            "sampling_rate": sampling_rate,
-            "inter_channels": inter_channels,
-            "resblock": resblock,
-            "resblock_kernel_sizes": resblock_kernel_sizes,
-            "resblock_dilation_sizes": resblock_dilation_sizes,
-            "upsample_rates": upsample_rates,
-            "upsample_initial_channel": upsample_initial_channel,
-            "upsample_kernel_sizes": upsample_kernel_sizes,
-            "gin_channels": gin_channels,
-        }
 
         LOG.info(f"Decoder type: {type_}")
         if type_ == "hifi-gan":
+            hps = {
+                "sampling_rate": sampling_rate,
+                "inter_channels": inter_channels,
+                "resblock": resblock,
+                "resblock_kernel_sizes": resblock_kernel_sizes,
+                "resblock_dilation_sizes": resblock_dilation_sizes,
+                "upsample_rates": upsample_rates,
+                "upsample_initial_channel": upsample_initial_channel,
+                "upsample_kernel_sizes": upsample_kernel_sizes,
+                "gin_channels": gin_channels,
+            }
             self.dec = Generator(h=hps)
             self.mb = False
         else:
+            hps = {
+                "initial_channel": inter_channels,
+                "resblock": resblock,
+                "resblock_kernel_sizes": resblock_kernel_sizes,
+                "resblock_dilation_sizes": resblock_dilation_sizes,
+                "upsample_rates": upsample_rates,
+                "upsample_initial_channel": upsample_initial_channel,
+                "upsample_kernel_sizes": upsample_kernel_sizes,
+                "gin_channels": gin_channels,
+                "gen_istft_n_fft": gen_istft_n_fft,
+                "gen_istft_hop_size": gen_istft_hop_size,
+                "subbands": subbands,
+            }
             from .vdecoder.mb_istft.generators import (
                 Multiband_iSTFT_Generator,
                 Multistream_iSTFT_Generator,
@@ -459,6 +472,7 @@ def __init__(
 
             # gen_istft_n_fft, gen_istft_hop_size, subbands
             if type_ == "istft":
+                del hps["subbands"]
                 self.dec = iSTFT_Generator(**hps)
             elif type_ == "ms-istft":
                 self.dec = Multistream_iSTFT_Generator(**hps)
diff --git a/src/so_vits_svc_fork/vdecoder/mb_istft/generators.py b/src/so_vits_svc_fork/vdecoder/mb_istft/generators.py
index c5e469a4..95ff98c1 100644
--- a/src/so_vits_svc_fork/vdecoder/mb_istft/generators.py
+++ b/src/so_vits_svc_fork/vdecoder/mb_istft/generators.py
@@ -168,7 +168,7 @@ def forward(self, x, g=None):
             hop_length=self.gen_istft_hop_size,
             win_length=self.gen_istft_n_fft,
         ).to(x.device)
-        pqmf = PQMF(x.device)
+        pqmf = PQMF(x.device, subbands=self.subbands).to(x.device, dtype=x.dtype)
 
         x = self.conv_pre(x)  # [B, ch, length]
 

From 8f7476634dcd88261078549d0aa472b4506b46ec Mon Sep 17 00:00:00 2001
From: 34j <55338215+34j@users.noreply.github.com>
Date: Fri, 31 Mar 2023 23:36:53 +0900
Subject: [PATCH 10/47] fix(train): remove ensure_pretrained_model

---
 src/so_vits_svc_fork/train.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/so_vits_svc_fork/train.py b/src/so_vits_svc_fork/train.py
index e0959410..fb797b59 100644
--- a/src/so_vits_svc_fork/train.py
+++ b/src/so_vits_svc_fork/train.py
@@ -39,7 +39,7 @@ def train(config_path: Path | str, model_path: Path | str):
     model_path = Path(model_path)
     if not torch.cuda.is_available():
         raise RuntimeError("CUDA is not available.")
-    utils.ensure_pretrained_model(model_path)
+    # utils.ensure_pretrained_model(model_path)
     hps = utils.get_hparams(config_path, model_path)
 
     n_gpus = torch.cuda.device_count()

From c61dd17192cf452e7ab7a35f00f741d6c2b93ccf Mon Sep 17 00:00:00 2001
From: 34j <55338215+34j@users.noreply.github.com>
Date: Fri, 31 Mar 2023 23:37:48 +0900
Subject: [PATCH 11/47] fix(mel_processing): fix wrong logging

---
 src/so_vits_svc_fork/modules/mel_processing.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/so_vits_svc_fork/modules/mel_processing.py b/src/so_vits_svc_fork/modules/mel_processing.py
index ff3ad9cd..0de6e54b 100644
--- a/src/so_vits_svc_fork/modules/mel_processing.py
+++ b/src/so_vits_svc_fork/modules/mel_processing.py
@@ -99,9 +99,9 @@ def mel_spectrogram_torch(
     y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False
 ):
     if torch.min(y) < -1.0:
-        LOG.info("min value is ", torch.min(y))
+        LOG.info(f"min value is {torch.min(y)}")
     if torch.max(y) > 1.0:
-        LOG.info("max value is ", torch.max(y))
+        LOG.info(f"max value is {torch.max(y)}")
 
     global mel_basis, hann_window
     dtype_device = str(y.dtype) + "_" + str(y.device)

From 21feb01a0400eeeb48fd1f91fcb37c4545df7240 Mon Sep 17 00:00:00 2001
From: 34j <55338215+34j@users.noreply.github.com>
Date: Sat, 1 Apr 2023 12:55:34 +0900
Subject: [PATCH 12/47] refactor: refactor a lot

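BREAKING CHANGE: modules were moved or removed (`models.py` and `spec_gen.py` deleted, `preprocess_*` moved under `preprocessing/`, `vdecoder/` and `onnxexport/` moved under `modules/`), so existing import paths no longer work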
---
 .idea/workspace.xml                           |  73 ++-
 src/so_vits_svc_fork/__main__.py              |  18 +-
 src/so_vits_svc_fork/data_utils.py            |   4 +-
 src/so_vits_svc_fork/f0.py                    | 237 ++++++++
 src/so_vits_svc_fork/gui.py                   |   6 +-
 src/so_vits_svc_fork/hparams.py               |  33 +
 src/so_vits_svc_fork/inference/infer_tool.py  |  13 +-
 .../{ => inference}/inference_main.py         |   2 +-
 src/so_vits_svc_fork/models.py                | 574 ------------------
 .../decoders}/__init__.py                     |   0
 src/so_vits_svc_fork/modules/decoders/f0.py   |  45 ++
 .../modules/decoders/hifigan/__init__.py      |   3 +
 .../decoders/hifigan/_models.py}              |   2 +-
 .../decoders/hifigan/_utils.py}               |   0
 .../modules/decoders/mb_istft/__init__.py     |  15 +
 .../decoders/mb_istft/_generators.py}         |   4 +-
 .../decoders/mb_istft/_loss.py}               |   2 +-
 .../decoders/mb_istft/_pqmf.py}               |   2 +-
 .../decoders/mb_istft/_stft.py}               |   0
 .../decoders/mb_istft/_stft_loss.py}          |   0
 .../modules/descriminators.py                 | 144 +++++
 src/so_vits_svc_fork/modules/encoders.py      | 136 +++++
 src/so_vits_svc_fork/modules/flows.py         |  48 ++
 src/so_vits_svc_fork/modules/generator.py     | 220 +++++++
 .../{vdecoder => modules/onnx}/__init__.py    |   0
 .../onnx}/model_onnx.py                       |  14 +-
 .../{ => modules/onnx}/onnx_export.py         |   4 +-
 .../hifigan => preprocessing}/__init__.py     |   0
 .../preprocess_flist_config.py                |   0
 .../preprocess_hubert_f0.py                   |   8 +-
 .../preprocess_resample.py                    |   0
 .../preprocess_speaker_diarization.py         |   0
 .../{ => preprocessing}/preprocess_split.py   |   0
 .../{ => preprocessing}/preprocess_utils.py   |   0
 src/so_vits_svc_fork/spec_gen.py              |  23 -
 src/so_vits_svc_fork/train.py                 |  12 +-
 src/so_vits_svc_fork/utils.py                 | 313 +---------
 .../{vdecoder/mb_istft => utils}/__init__.py  |   0
 src/so_vits_svc_fork/utils/f0py               |   0
 tests/test_main.py                            |  26 +-
 40 files changed, 1036 insertions(+), 945 deletions(-)
 create mode 100644 src/so_vits_svc_fork/f0.py
 create mode 100644 src/so_vits_svc_fork/hparams.py
 rename src/so_vits_svc_fork/{ => inference}/inference_main.py (98%)
 delete mode 100644 src/so_vits_svc_fork/models.py
 rename src/so_vits_svc_fork/{onnxexport => modules/decoders}/__init__.py (100%)
 create mode 100644 src/so_vits_svc_fork/modules/decoders/f0.py
 create mode 100644 src/so_vits_svc_fork/modules/decoders/hifigan/__init__.py
 rename src/so_vits_svc_fork/{vdecoder/hifigan/models.py => modules/decoders/hifigan/_models.py} (99%)
 rename src/so_vits_svc_fork/{vdecoder/hifigan/utils.py => modules/decoders/hifigan/_utils.py} (100%)
 create mode 100644 src/so_vits_svc_fork/modules/decoders/mb_istft/__init__.py
 rename src/so_vits_svc_fork/{vdecoder/mb_istft/generators.py => modules/decoders/mb_istft/_generators.py} (99%)
 rename src/so_vits_svc_fork/{vdecoder/mb_istft/loss.py => modules/decoders/mb_istft/_loss.py} (88%)
 rename src/so_vits_svc_fork/{vdecoder/mb_istft/pqmf.py => modules/decoders/mb_istft/_pqmf.py} (98%)
 rename src/so_vits_svc_fork/{vdecoder/mb_istft/stft.py => modules/decoders/mb_istft/_stft.py} (100%)
 rename src/so_vits_svc_fork/{vdecoder/mb_istft/stft_loss.py => modules/decoders/mb_istft/_stft_loss.py} (100%)
 create mode 100644 src/so_vits_svc_fork/modules/descriminators.py
 create mode 100644 src/so_vits_svc_fork/modules/encoders.py
 create mode 100644 src/so_vits_svc_fork/modules/flows.py
 create mode 100644 src/so_vits_svc_fork/modules/generator.py
 rename src/so_vits_svc_fork/{vdecoder => modules/onnx}/__init__.py (100%)
 rename src/so_vits_svc_fork/{onnxexport => modules/onnx}/model_onnx.py (97%)
 rename src/so_vits_svc_fork/{ => modules/onnx}/onnx_export.py (94%)
 rename src/so_vits_svc_fork/{vdecoder/hifigan => preprocessing}/__init__.py (100%)
 rename src/so_vits_svc_fork/{ => preprocessing}/preprocess_flist_config.py (100%)
 rename src/so_vits_svc_fork/{ => preprocessing}/preprocess_hubert_f0.py (95%)
 rename src/so_vits_svc_fork/{ => preprocessing}/preprocess_resample.py (100%)
 rename src/so_vits_svc_fork/{ => preprocessing}/preprocess_speaker_diarization.py (100%)
 rename src/so_vits_svc_fork/{ => preprocessing}/preprocess_split.py (100%)
 rename src/so_vits_svc_fork/{ => preprocessing}/preprocess_utils.py (100%)
 delete mode 100644 src/so_vits_svc_fork/spec_gen.py
 rename src/so_vits_svc_fork/{vdecoder/mb_istft => utils}/__init__.py (100%)
 create mode 100644 src/so_vits_svc_fork/utils/f0py

diff --git a/.idea/workspace.xml b/.idea/workspace.xml
index 4c57db80..60a45bf0 100644
--- a/.idea/workspace.xml
+++ b/.idea/workspace.xml
@@ -2,16 +2,68 @@
 <project version="4">
   <component name="ChangeListManager">
     <list default="true" id="7aacf003-229b-4eb0-80a4-ff105dc3c3d4" name="変更" comment="">
+      <change afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/decoders/f0.py" afterDir="false" />
+      <change afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/decoders/mb_istft/__init__.py" afterDir="false" />
+      <change afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/descriminators.py" afterDir="false" />
+      <change afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/encoders.py" afterDir="false" />
+      <change afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/flows.py" afterDir="false" />
+      <change afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/generator.py" afterDir="false" />
+      <change afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/onnx/__init__.py" afterDir="false" />
+      <change afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/preprocessing/__init__.py" afterDir="false" />
       <change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/vdecoder/hifigan/env.py" beforeDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/vdecoder/hifigan/models.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/vdecoder/hifigan/models.py" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/vdecoder/hifigan/nvSTFT.py" beforeDir="false" />
+      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/__main__.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/__main__.py" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/data_utils.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/data_utils.py" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/gui.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/gui.py" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/inference/infer_tool.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/inference/infer_tool.py" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/models.py" beforeDir="false" />
+      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/onnx_export.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/onnx/onnx_export.py" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/onnxexport/__init__.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/f0.py" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/onnxexport/model_onnx.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/onnx/model_onnx.py" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/preprocess_flist_config.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/preprocessing/preprocess_flist_config.py" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/preprocess_hubert_f0.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/preprocessing/preprocess_hubert_f0.py" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/preprocess_resample.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/preprocessing/preprocess_resample.py" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/preprocess_speaker_diarization.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/preprocessing/preprocess_speaker_diarization.py" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/preprocess_split.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/preprocessing/preprocess_split.py" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/preprocess_utils.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/preprocessing/preprocess_utils.py" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/spec_gen.py" beforeDir="false" />
+      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/train.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/train.py" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/utils./f0py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/utils./f0py" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/utils.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/utils.py" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/vdecoder/__init__.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/hparams.py" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/vdecoder/hifigan/__init__.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/decoders/__init__.py" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/vdecoder/hifigan/models.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/decoders/hifigan/_models.py" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/vdecoder/hifigan/utils.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/decoders/hifigan/_utils.py" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/vdecoder/mb_istft/__init__.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/decoders/hifigan/__init__.py" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/vdecoder/mb_istft/generators.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/decoders/mb_istft/_generators.py" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/vdecoder/mb_istft/loss.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/decoders/mb_istft/_loss.py" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/vdecoder/mb_istft/pqmf.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/decoders/mb_istft/_pqmf.py" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/vdecoder/mb_istft/stft.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/decoders/mb_istft/_stft.py" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/vdecoder/mb_istft/stft_loss.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/decoders/mb_istft/_stft_loss.py" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/tests/dataset_raw/34j/nested/1.wav" beforeDir="false" />
+      <change beforePath="$PROJECT_DIR$/tests/dataset_raw/34j/nested/10.wav" beforeDir="false" />
+      <change beforePath="$PROJECT_DIR$/tests/dataset_raw/34j/nested/2.wav" beforeDir="false" />
+      <change beforePath="$PROJECT_DIR$/tests/dataset_raw/34j/nested/3.wav" beforeDir="false" />
+      <change beforePath="$PROJECT_DIR$/tests/dataset_raw/34j/nested/4.wav" beforeDir="false" />
+      <change beforePath="$PROJECT_DIR$/tests/dataset_raw/34j/nested/5.wav" beforeDir="false" />
+      <change beforePath="$PROJECT_DIR$/tests/dataset_raw/34j/nested/6.wav" beforeDir="false" />
+      <change beforePath="$PROJECT_DIR$/tests/dataset_raw/34j/nested/7.wav" beforeDir="false" />
+      <change beforePath="$PROJECT_DIR$/tests/dataset_raw/34j/nested/8.wav" beforeDir="false" />
+      <change beforePath="$PROJECT_DIR$/tests/dataset_raw/34j/nested/9.wav" beforeDir="false" />
+      <change beforePath="$PROJECT_DIR$/tests/dataset_raw/34j/nested/あ.wav" beforeDir="false" />
+      <change beforePath="$PROJECT_DIR$/tests/test_main.py" beforeDir="false" afterPath="$PROJECT_DIR$/tests/test_main.py" afterDir="false" />
     </list>
     <option name="SHOW_DIALOG" value="false" />
     <option name="HIGHLIGHT_CONFLICTS" value="true" />
     <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
     <option name="LAST_RESOLUTION" value="IGNORE" />
   </component>
+  <component name="FileTemplateManagerImpl">
+    <option name="RECENT_TEMPLATES">
+      <list>
+        <option value="Python Script" />
+      </list>
+    </option>
+  </component>
   <component name="Git.Settings">
     <option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
   </component>
@@ -19,6 +71,9 @@
     <option name="stateVersion" value="1" />
   </component>
   <component name="ProjectId" id="2N3U7T2ZqSld9sk8NQi5nSwUOO9" />
+  <component name="ProjectLevelVcsManager">
+    <ConfirmationsSetting value="2" id="Add" />
+  </component>
   <component name="ProjectViewState">
     <option name="hideEmptyMiddlePackages" value="true" />
     <option name="showLibraryContents" value="true" />
@@ -33,6 +88,15 @@
     &quot;nodejs_package_manager_path&quot;: &quot;npm&quot;
   }
 }</component>
+  <component name="RecentsManager">
+    <key name="MoveFile.RECENT_KEYS">
+      <recent name="C:\Users\m\MyMain\dev\so-vits-svc-fork\src\so_vits_svc_fork\modules" />
+      <recent name="C:\Users\m\MyMain\dev\so-vits-svc-fork\src\so_vits_svc_fork\onnxexport" />
+      <recent name="C:\Users\m\MyMain\dev\so-vits-svc-fork\src\so_vits_svc_fork\preprocessing" />
+      <recent name="C:\Users\m\MyMain\dev\so-vits-svc-fork\src\so_vits_svc_fork\modules\decoders" />
+      <recent name="C:\Users\m\MyMain\dev\so-vits-svc-fork\src\so_vits_svc_fork\vdecoder\mb_istft\_pqmf.py" />
+    </key>
+  </component>
   <component name="RunManager" selected="Python.so-vits-svc-fork">
     <configuration name="so-vits-svc-fork" type="PythonConfigurationType" factoryName="Python">
       <module name="so-vits-svc-fork" />
@@ -95,7 +159,8 @@
       <workItem from="1678892093553" duration="810000" />
       <workItem from="1678932243084" duration="593000" />
       <workItem from="1680174456649" duration="1005000" />
-      <workItem from="1680251014707" duration="2225000" />
+      <workItem from="1680251014707" duration="2800000" />
+      <workItem from="1680319074742" duration="1776000" />
     </task>
     <servers />
   </component>
diff --git a/src/so_vits_svc_fork/__main__.py b/src/so_vits_svc_fork/__main__.py
index 0e639710..e9addd92 100644
--- a/src/so_vits_svc_fork/__main__.py
+++ b/src/so_vits_svc_fork/__main__.py
@@ -208,7 +208,7 @@ def infer(
     device: Literal["cpu", "cuda"] = "cuda" if torch.cuda.is_available() else "cpu",
 ):
     """Inference"""
-    from .inference_main import infer
+    from so_vits_svc_fork.inference.inference_main import infer
 
     if not auto_predict_f0:
         LOG.warning(
@@ -367,7 +367,7 @@ def vc(
     passthrough_original: bool = False,
 ) -> None:
     """Realtime inference from microphone"""
-    from .inference_main import realtime
+    from so_vits_svc_fork.inference.inference_main import realtime
 
     if auto_predict_f0:
         LOG.warning(
@@ -455,7 +455,7 @@ def pre_resample(
     hop_seconds: float,
 ) -> None:
     """Preprocessing part 1: resample"""
-    from .preprocess_resample import preprocess_resample
+    from so_vits_svc_fork.preprocessing.preprocess_resample import preprocess_resample
 
     input_dir = Path(input_dir)
     output_dir = Path(output_dir)
@@ -498,7 +498,7 @@ def pre_config(
     config_path: Path,
 ):
     """Preprocessing part 2: config"""
-    from .preprocess_flist_config import preprocess_config
+    from so_vits_svc_fork.preprocessing.preprocess_flist_config import preprocess_config
 
     input_dir = Path(input_dir)
     filelist_path = Path(filelist_path)
@@ -556,7 +556,7 @@ def pre_hubert(
 ) -> None:
     """Preprocessing part 3: hubert
     If the HuBERT model is not found, it will be downloaded automatically."""
-    from .preprocess_hubert_f0 import preprocess_hubert_f0
+    from so_vits_svc_fork.preprocessing.preprocess_hubert_f0 import preprocess_hubert_f0
 
     input_dir = Path(input_dir)
     config_path = Path(config_path)
@@ -622,7 +622,9 @@ def pre_sd(
 
     if max_speakers == 1:
         LOG.warning("Consider using pre-split if max_speakers == 1")
-    from .preprocess_speaker_diarization import preprocess_speaker_diarization
+    from so_vits_svc_fork.preprocessing.preprocess_speaker_diarization import (
+        preprocess_speaker_diarization,
+    )
 
     preprocess_speaker_diarization(
         input_dir=input_dir,
@@ -673,7 +675,7 @@ def pre_split(
     sr: int,
 ):
     """Split audio files into multiple files"""
-    from .preprocess_split import preprocess_split
+    from so_vits_svc_fork.preprocessing.preprocess_split import preprocess_split
 
     preprocess_split(
         input_dir=input_dir,
@@ -742,7 +744,7 @@ def onnx(input_path: Path, output_path: Path, config_path: Path, device: str) ->
         output_path = output_path / (input_path.stem + ".onnx")
     config_path = Path(config_path)
     device_ = torch.device(device)
-    from .onnx_export import onnx_export
+    from so_vits_svc_fork.modules.onnx.onnx_export import onnx_export
 
     onnx_export(
         input_path=input_path,
diff --git a/src/so_vits_svc_fork/data_utils.py b/src/so_vits_svc_fork/data_utils.py
index 6a8caf1a..e4f15d74 100644
--- a/src/so_vits_svc_fork/data_utils.py
+++ b/src/so_vits_svc_fork/data_utils.py
@@ -6,6 +6,8 @@
 import torch.utils.data
 import torchaudio
 
+import so_vits_svc_fork.f0
+
 from . import utils
 from .modules.mel_processing import spectrogram_torch
 from .utils import load_filepaths_and_text
@@ -65,7 +67,7 @@ def get_audio(self, filename):
         spk = torch.LongTensor([self.spk_map[spk]])
 
         f0 = np.load(filename + ".f0.npy")
-        f0, uv = utils.interpolate_f0(f0)
+        f0, uv = so_vits_svc_fork.f0.interpolate_f0(f0)
         f0 = torch.FloatTensor(f0)
         uv = torch.FloatTensor(uv)
 
diff --git a/src/so_vits_svc_fork/f0.py b/src/so_vits_svc_fork/f0.py
new file mode 100644
index 00000000..255ad89e
--- /dev/null
+++ b/src/so_vits_svc_fork/f0.py
@@ -0,0 +1,237 @@
+from __future__ import annotations
+
+from logging import getLogger
+from typing import Any, Literal
+
+import numpy as np
+import torch
+import torchcrepe
+from cm_time import timer
+from numpy import dtype, float32, ndarray
+from torch import FloatTensor, Tensor
+
+LOG = getLogger(__name__)
+
+
+def normalize_f0(
+    f0: FloatTensor, x_mask: FloatTensor, uv: FloatTensor, random_scale=True
+) -> FloatTensor:
+    """Centre each f0 contour on its ``uv``-weighted mean and optionally rescale it.
+
+    Args:
+        f0: Pitch contour. It is indexed as ``f0[:, 0, :]``, so it must be 3-D.
+        x_mask: Mask multiplied into the result (must broadcast against ``f0``).
+        uv: Per-frame weights used for the mean, summed over ``dim=1``
+            (presumably a voiced/unvoiced flag -- TODO confirm against callers).
+        random_scale: When true, every batch item is multiplied by a factor
+            drawn uniformly from [0.8, 1.2]; otherwise the factor is 1.
+
+    Returns:
+        ``(f0 - mean) * factor * x_mask``.
+
+    Raises:
+        ValueError: If the normalised contour contains NaN.
+    """
+    # Mean of f0 over the frames selected by uv. Rows without any selected
+    # frame get a dummy denominator so the division cannot evaluate 0 / 0.
+    uv_sum = torch.sum(uv, dim=1, keepdim=True)
+    uv_sum[uv_sum == 0] = 9999
+    means = torch.sum(f0[:, 0, :] * uv, dim=1, keepdim=True) / uv_sum
+
+    if random_scale:
+        factor = torch.Tensor(f0.shape[0], 1).uniform_(0.8, 1.2).to(f0.device)
+    else:
+        factor = torch.ones(f0.shape[0], 1).to(f0.device)
+    # normalize f0 based on means and factor
+    f0_norm = (f0 - means.unsqueeze(-1)) * factor.unsqueeze(-1)
+    if torch.isnan(f0_norm).any():
+        # Fail loudly: terminating the interpreter with status 0 here would
+        # make a broken run look like a successful one.
+        raise ValueError("normalize_f0 produced NaN values; check f0 and uv")
+    return f0_norm * x_mask
+
+
+def interpolate_f0(
+    f0: ndarray[Any, dtype[float32]]
+) -> tuple[ndarray[Any, dtype[float32]], ndarray[Any, dtype[float32]]]:
+    """Fill the non-positive frames of an f0 contour and return a voicing mask.
+
+    Args:
+        f0: Pitch contour; frames with a value ``<= 0`` are treated as gaps.
+            The array is not modified.
+
+    Returns:
+        A tuple ``(filled_f0, vuv)`` of 1-D arrays with ``f0.size`` elements.
+        ``vuv`` is float32 and holds 1.0 where the input was ``> 0`` and 0.0
+        elsewhere. ``filled_f0`` has the gaps replaced: a gap between two
+        positive frames is filled with a linear ramp, a leading gap with the
+        first positive value that follows it, and a trailing gap with the last
+        positive value seen.
+    """
+    # Work on a private copy: np.reshape usually returns a view, so filling the
+    # gaps in place would otherwise overwrite the caller's array.
+    data = np.reshape(f0, (f0.size, 1)).copy()
+
+    vuv_vector = np.zeros((data.size, 1), dtype=np.float32)
+    vuv_vector[data > 0.0] = 1.0
+    vuv_vector[data <= 0.0] = 0.0
+
+    # ip_data deliberately aliases data: once a gap has been filled, the loop
+    # below sees those frames as positive and does not process them again.
+    ip_data = data
+
+    frame_number = data.size
+    last_value = 0.0
+    for i in range(frame_number):
+        if data[i] <= 0.0:
+            # Find the next positive frame (j stays at the last index if there
+            # is none).
+            j = i + 1
+            for j in range(i + 1, frame_number):
+                if data[j] > 0.0:
+                    break
+            if j < frame_number - 1:
+                if last_value > 0.0:
+                    step = (data[j] - data[i - 1]) / float(j - i)
+                    for k in range(i, j):
+                        ip_data[k] = data[i - 1] + step * (k - i + 1)
+                else:
+                    for k in range(i, j):
+                        ip_data[k] = data[j]
+            else:
+                # NOTE(review): this branch is also taken when the next
+                # positive frame is the very last one, which then gets
+                # overwritten with last_value -- confirm this is intended.
+                for k in range(i, frame_number):
+                    ip_data[k] = last_value
+        else:
+            ip_data[i] = data[i]
+            last_value = data[i]
+
+    return ip_data[:, 0], vuv_vector[:, 0]
+
+
+def compute_f0_parselmouth(
+    wav_numpy: ndarray[Any, dtype[float32]],
+    p_len: None | int = None,
+    sampling_rate: int = 44100,
+    hop_length: int = 512,
+):
+    """Estimate f0 with parselmouth's ``to_pitch_ac`` pitch tracker.
+
+    Args:
+        wav_numpy: Audio samples handed to ``parselmouth.Sound``.
+        p_len: Wanted number of frames. Defaults to
+            ``len(wav_numpy) // hop_length``; an explicit value must be within
+            3 frames of that default.
+        sampling_rate: Sampling rate of ``wav_numpy`` in Hz.
+        hop_length: Hop between analysis frames, in samples.
+
+    Returns:
+        The tracked frequencies, zero-padded on both sides up to ``p_len``
+        frames when the tracker returned fewer.
+    """
+    import parselmouth
+
+    expected_frames = wav_numpy.shape[0] // hop_length
+    if p_len is None:
+        p_len = expected_frames
+    else:
+        assert abs(p_len - expected_frames) < 4, "pad length error"
+
+    pitch_floor, pitch_ceiling = 50, 1100
+    # Hop in milliseconds; converted back to seconds for parselmouth below.
+    hop_ms = hop_length / sampling_rate * 1000
+    sound = parselmouth.Sound(wav_numpy, sampling_rate)
+    pitch = sound.to_pitch_ac(
+        time_step=hop_ms / 1000,
+        voicing_threshold=0.6,
+        pitch_floor=pitch_floor,
+        pitch_ceiling=pitch_ceiling,
+    )
+    f0 = pitch.selected_array["frequency"]
+
+    # Split the missing frames between both ends, the extra one going left.
+    missing = p_len - len(f0)
+    pad_left = (missing + 1) // 2
+    pad_right = missing - pad_left
+    if pad_left > 0 or pad_right > 0:
+        f0 = np.pad(f0, [[pad_left, pad_right]], mode="constant")
+    return f0
+
+
+def _resize_f0(
+    x: ndarray[Any, dtype[float32]], target_len: int
+) -> ndarray[Any, dtype[float32]]:
+    """Linearly resample an f0 contour to ``target_len`` frames.
+
+    Values below 0.001 are turned into NaN before interpolating, and every NaN
+    left in the result is mapped back to 0 by ``np.nan_to_num``. The input
+    array is copied, not modified.
+    """
+    contour = np.array(x)
+    contour[contour < 0.001] = np.nan
+    n_source = len(contour)
+    # Fractional source positions at which the target frames are sampled.
+    positions = np.arange(0, n_source * target_len, n_source) / target_len
+    resampled = np.interp(positions, np.arange(0, n_source), contour)
+    return np.nan_to_num(resampled)
+
+
+def compute_f0_pyworld(
+    wav_numpy: ndarray[Any, dtype[float32]],
+    p_len: None | int = None,
+    sampling_rate: int = 44100,
+    hop_length: int = 512,
+    type_: Literal["dio", "harvest"] = "dio",
+):
+    """Estimate f0 with pyworld (``dio`` or ``harvest``) refined by ``stonemask``.
+
+    Args:
+        wav_numpy: Audio samples; converted to double precision for pyworld.
+        p_len: Wanted number of frames. Defaults to
+            ``len(wav_numpy) // hop_length``.
+        sampling_rate: Sampling rate of ``wav_numpy`` in Hz.
+        hop_length: Hop between analysis frames, in samples.
+        type_: Which pyworld estimator to run.
+
+    Returns:
+        The f0 contour rounded to one decimal and resampled to ``p_len``
+        frames by ``_resize_f0``.
+
+    Raises:
+        ValueError: If ``type_`` is neither ``"dio"`` nor ``"harvest"``.
+    """
+    # Reject unknown estimators up front; otherwise f0 would never be bound
+    # and the failure would surface later as an UnboundLocalError.
+    if type_ not in ("dio", "harvest"):
+        raise ValueError(f"type_ must be 'dio' or 'harvest', got {type_!r}")
+
+    import pyworld
+
+    if p_len is None:
+        p_len = wav_numpy.shape[0] // hop_length
+    estimator = pyworld.dio if type_ == "dio" else pyworld.harvest
+    # Convert once and reuse for both the estimator and the refinement.
+    wav_double = wav_numpy.astype(np.double)
+    f0, t = estimator(
+        wav_double,
+        fs=sampling_rate,
+        f0_ceil=f0_max,
+        f0_floor=f0_min,
+        frame_period=1000 * hop_length / sampling_rate,
+    )
+    f0 = pyworld.stonemask(wav_double, f0, t, sampling_rate)
+    f0 = np.round(f0, 1)
+    return _resize_f0(f0, p_len)
+
+
+def compute_f0_crepe(
+    wav_numpy: ndarray[Any, dtype[float32]],
+    p_len: None | int = None,
+    sampling_rate: int = 44100,
+    hop_length: int = 512,
+    device: str = "cuda" if torch.cuda.is_available() else "cpu",
+    model: Literal["full", "tiny"] = "full",
+):
+    """Estimate f0 with ``torchcrepe.predict``.
+
+    Args:
+        wav_numpy: Audio samples; copied to ``device`` before prediction.
+        p_len: Wanted number of frames. ``None`` (or 0) falls back to
+            ``len(wav_numpy) // hop_length``.
+        sampling_rate: Sampling rate of ``wav_numpy`` in Hz.
+        hop_length: Hop between analysis frames, in samples.
+        device: Torch device used for the prediction. The default is chosen
+            once, when the module is imported.
+        model: torchcrepe model capacity.
+
+    Returns:
+        The predicted f0 contour resampled to ``p_len`` frames by
+        ``_resize_f0``.
+    """
+    # (T) -> (1, T): add a leading axis of size one before prediction.
+    # No channel averaging is done here: after unsqueeze(dim=0) the leading
+    # axis always has size 1, so such a step could never trigger.
+    audio = torch.from_numpy(wav_numpy).to(device, copy=True)
+    audio = torch.unsqueeze(audio, dim=0)
+
+    pitch: Tensor = torchcrepe.predict(
+        audio,
+        sampling_rate,
+        hop_length,
+        f0_min,
+        f0_max,
+        model,
+        batch_size=hop_length * 2,
+        device=device,
+        pad=True,
+    )
+
+    f0 = pitch.squeeze(0).cpu().numpy()
+    p_len = p_len or wav_numpy.shape[0] // hop_length
+    f0 = _resize_f0(f0, p_len)
+    return f0
+
+
+def compute_f0(
+    wav_numpy: ndarray[Any, dtype[float32]],
+    p_len: None | int = None,
+    sampling_rate: int = 44100,
+    hop_length: int = 512,
+    method: Literal["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"] = "dio",
+    **kwargs,
+):
+    """Estimate f0 with the selected method and log how long it took.
+
+    The audio is converted to float32 (a copy, so the caller's array is left
+    untouched) and scaled by the 99.9th percentile of its absolute amplitude
+    before being passed to the estimator.
+
+    Args:
+        wav_numpy: Audio samples.
+        p_len: Wanted number of frames, forwarded to the estimator.
+        sampling_rate: Sampling rate of ``wav_numpy`` in Hz.
+        hop_length: Hop between analysis frames, in samples.
+        method: Which estimator to run.
+        **kwargs: Extra arguments forwarded to ``compute_f0_crepe`` (used by
+            the ``crepe`` and ``crepe-tiny`` methods only).
+
+    Returns:
+        The f0 contour produced by the selected estimator.
+
+    Raises:
+        ValueError: If ``method`` is not one of the supported names.
+    """
+    with timer() as t:
+        wav_numpy = wav_numpy.astype(np.float32)
+        # Skip the scaling for empty or silent input: the percentile is then
+        # undefined or 0, and dividing by it would fill the signal with
+        # NaN/inf before it reaches the estimator.
+        if wav_numpy.size > 0:
+            scale = np.quantile(np.abs(wav_numpy), 0.999)
+            if scale > 0:
+                wav_numpy /= scale
+        if method in ["dio", "harvest"]:
+            f0 = compute_f0_pyworld(wav_numpy, p_len, sampling_rate, hop_length, method)
+        elif method == "crepe":
+            f0 = compute_f0_crepe(wav_numpy, p_len, sampling_rate, hop_length, **kwargs)
+        elif method == "crepe-tiny":
+            f0 = compute_f0_crepe(
+                wav_numpy, p_len, sampling_rate, hop_length, model="tiny", **kwargs
+            )
+        elif method == "parselmouth":
+            f0 = compute_f0_parselmouth(wav_numpy, p_len, sampling_rate, hop_length)
+        else:
+            raise ValueError(
+                "type must be dio, crepe, crepe-tiny, harvest or parselmouth"
+            )
+    # Real-time factor = processing time / audio duration; undefined for
+    # zero-length audio, so report NaN instead of dividing by zero.
+    duration = len(wav_numpy) / sampling_rate
+    rtf = t.elapsed / duration if duration > 0 else float("nan")
+    LOG.info(f"F0 inference time:       {t.elapsed:.3f}s, RTF: {rtf:.3f}")
+    return f0
+
+
+def f0_to_coarse(f0: torch.Tensor | float):
+    """Quantise f0 values (Hz) into integer bins ``1 .. f0_bin - 1`` on a mel scale.
+
+    ``f0`` is mapped to mel with ``1127 * log(1 + f0 / 700)``. Positive mel
+    values are rescaled linearly so that ``f0_min`` lands on bin 1 and
+    ``f0_max`` on bin ``f0_bin - 1``, and everything is clamped to that range.
+    Values that map to a non-positive mel (e.g. ``f0 == 0``) end up in bin 1.
+
+    Args:
+        f0: A torch tensor, or a NumPy array for the non-torch path. The
+            masked assignments below need an indexable array.
+
+    Returns:
+        An integer tensor (torch input) or integer array (NumPy input) with
+        the same shape as ``f0``.
+    """
+    is_torch = isinstance(f0, torch.Tensor)
+    f0_mel = 1127 * (1 + f0 / 700).log() if is_torch else 1127 * np.log(1 + f0 / 700)
+    f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (f0_bin - 2) / (
+        f0_mel_max - f0_mel_min
+    ) + 1
+
+    f0_mel[f0_mel <= 1] = 1
+    f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1
+    # Use the builtin int: the np.int alias for it was removed in NumPy 1.24.
+    f0_coarse = (f0_mel + 0.5).long() if is_torch else np.rint(f0_mel).astype(int)
+    assert f0_coarse.max() <= f0_bin - 1 and f0_coarse.min() >= 1, (
+        f0_coarse.max(),
+        f0_coarse.min(),
+    )
+    return f0_coarse
+
+
+# Number of coarse pitch bins and the f0 range (Hz) they cover.
+f0_bin = 256
+f0_max = 1100.0
+f0_min = 50.0
+# The same range expressed on the mel scale used by f0_to_coarse.
+f0_mel_min = 1127 * np.log(1 + f0_min / 700)
+f0_mel_max = 1127 * np.log(1 + f0_max / 700)
diff --git a/src/so_vits_svc_fork/gui.py b/src/so_vits_svc_fork/gui.py
index c70d180b..ddebc02e 100644
--- a/src/so_vits_svc_fork/gui.py
+++ b/src/so_vits_svc_fork/gui.py
@@ -548,7 +548,7 @@ def apply_preset(name: str) -> None:
             elif event == "config_path":
                 update_speaker()
             elif event == "infer":
-                from .inference_main import infer
+                from so_vits_svc_fork.inference.inference_main import infer
 
                 input_path = Path(values["input_path"])
                 output_path = (
@@ -600,7 +600,7 @@ def apply_preset(name: str) -> None:
                 _, _, input_device_indices, output_device_indices = get_devices(
                     update=False
                 )
-                from .inference_main import realtime
+                from so_vits_svc_fork.inference.inference_main import realtime
 
                 if future:
                     LOG.info("Canceling previous task")
@@ -650,7 +650,7 @@ def apply_preset(name: str) -> None:
                     future.cancel()
                     future = None
             elif event == "onnx_export":
-                from .onnx_export import onnx_export
+                from so_vits_svc_fork.modules.onnx.onnx_export import onnx_export
 
                 try:
                     onnx_export(
diff --git a/src/so_vits_svc_fork/hparams.py b/src/so_vits_svc_fork/hparams.py
new file mode 100644
index 00000000..27e56c82
--- /dev/null
+++ b/src/so_vits_svc_fork/hparams.py
@@ -0,0 +1,33 @@
+from __future__ import annotations
+
+
+class HParams:
+    """Hyper-parameter container with both attribute and mapping-style access.
+
+    Every keyword argument becomes an instance attribute. Values that are
+    dicts (including dict subclasses) are converted recursively into nested
+    ``HParams`` objects, so ``hps.model.hidden`` and ``hps["model"]["hidden"]``
+    both work.
+    """
+
+    def __init__(self, **kwargs):
+        for k, v in kwargs.items():
+            # isinstance also catches dict subclasses such as OrderedDict,
+            # which an exact type comparison would leave unwrapped.
+            if isinstance(v, dict):
+                v = HParams(**v)
+            self[k] = v
+
+    def keys(self):
+        """Return a view of the parameter names."""
+        return self.__dict__.keys()
+
+    def items(self):
+        """Return a view of the ``(name, value)`` pairs."""
+        return self.__dict__.items()
+
+    def values(self):
+        """Return a view of the parameter values."""
+        return self.__dict__.values()
+
+    def __len__(self):
+        return len(self.__dict__)
+
+    def __getitem__(self, key):
+        return getattr(self, key)
+
+    def __setitem__(self, key, value):
+        setattr(self, key, value)
+
+    def __contains__(self, key):
+        return key in self.__dict__
+
+    def __repr__(self):
+        return self.__dict__.__repr__()
diff --git a/src/so_vits_svc_fork/inference/infer_tool.py b/src/so_vits_svc_fork/inference/infer_tool.py
index 462ef3d6..b65ece60 100644
--- a/src/so_vits_svc_fork/inference/infer_tool.py
+++ b/src/so_vits_svc_fork/inference/infer_tool.py
@@ -12,9 +12,10 @@
 from cm_time import timer
 from numpy import dtype, float32, ndarray
 
+import so_vits_svc_fork.f0
 from so_vits_svc_fork import cluster, utils
-from so_vits_svc_fork.models import SynthesizerTrn
 
+from ..modules.generator import SynthesizerTrn
 from ..utils import HUBERT_SAMPLING_RATE
 
 LOG = getLogger(__name__)
@@ -134,13 +135,13 @@ def get_unit_f0(
             "crepe", "crepe-tiny", "parselmouth", "dio", "harvest"
         ] = "dio",
     ):
-        f0 = utils.compute_f0(
+        f0 = so_vits_svc_fork.f0.compute_f0(
             audio,
             sampling_rate=self.target_sample,
             hop_length=self.hop_size,
             method=f0_method,
         )
-        f0, uv = utils.interpolate_f0(f0)
+        f0, uv = so_vits_svc_fork.f0.interpolate_f0(f0)
         f0 = torch.FloatTensor(f0)
         uv = torch.FloatTensor(uv)
         f0 = f0 * 2 ** (tran / 12)
@@ -257,7 +258,7 @@ def infer_silence(
         chunk_length_min = chunk_length_min = (
             int(
                 min(
-                    sr / utils.f0_min * 20 + 1,
+                    sr / so_vits_svc_fork.f0.f0_min * 20 + 1,
                     chunk_seconds * sr,
                 )
             )
@@ -543,7 +544,9 @@ def infer(audio: ndarray[Any, dtype[float32]]) -> ndarray[Any, dtype[float32]]:
         self.input_audio_store = np.concatenate([self.input_audio_store, input_audio])
         LOG.info(f"input_audio_store: {self.input_audio_store.shape}")
         sr = self.svc_model.target_sample
-        chunk_length_min = int(min(sr / utils.f0_min * 20 + 1, chunk_seconds * sr)) // 2
+        chunk_length_min = (
+            int(min(sr / so_vits_svc_fork.f0.f0_min * 20 + 1, chunk_seconds * sr)) // 2
+        )
         LOG.info(f"Chunk length min: {chunk_length_min}")
         chunk_list = list(
             split_silence(
diff --git a/src/so_vits_svc_fork/inference_main.py b/src/so_vits_svc_fork/inference/inference_main.py
similarity index 98%
rename from src/so_vits_svc_fork/inference_main.py
rename to src/so_vits_svc_fork/inference/inference_main.py
index e01ed63d..90b98b22 100644
--- a/src/so_vits_svc_fork/inference_main.py
+++ b/src/so_vits_svc_fork/inference/inference_main.py
@@ -10,7 +10,7 @@
 import torch
 from cm_time import timer
 
-from .inference.infer_tool import RealtimeVC, RealtimeVC2, Svc
+from so_vits_svc_fork.inference.infer_tool import RealtimeVC, RealtimeVC2, Svc
 
 LOG = getLogger(__name__)
 
diff --git a/src/so_vits_svc_fork/models.py b/src/so_vits_svc_fork/models.py
deleted file mode 100644
index 1d369efd..00000000
--- a/src/so_vits_svc_fork/models.py
+++ /dev/null
@@ -1,574 +0,0 @@
-from logging import getLogger
-from typing import Any, Literal, Sequence
-
-import torch
-from torch import nn
-from torch.nn import Conv1d, Conv2d
-from torch.nn import functional as F
-from torch.nn.utils import spectral_norm, weight_norm
-
-import so_vits_svc_fork.modules.attentions as attentions
-import so_vits_svc_fork.modules.commons as commons
-import so_vits_svc_fork.modules.modules as modules
-
-from . import utils
-from .modules.commons import get_padding
-from .utils import f0_to_coarse
-from .vdecoder.hifigan.models import Generator
-
-LOG = getLogger(__name__)
-
-
-class ResidualCouplingBlock(nn.Module):
-    def __init__(
-        self,
-        channels,
-        hidden_channels,
-        kernel_size,
-        dilation_rate,
-        n_layers,
-        n_flows=4,
-        gin_channels=0,
-    ):
-        super().__init__()
-        self.channels = channels
-        self.hidden_channels = hidden_channels
-        self.kernel_size = kernel_size
-        self.dilation_rate = dilation_rate
-        self.n_layers = n_layers
-        self.n_flows = n_flows
-        self.gin_channels = gin_channels
-
-        self.flows = nn.ModuleList()
-        for i in range(n_flows):
-            self.flows.append(
-                modules.ResidualCouplingLayer(
-                    channels,
-                    hidden_channels,
-                    kernel_size,
-                    dilation_rate,
-                    n_layers,
-                    gin_channels=gin_channels,
-                    mean_only=True,
-                )
-            )
-            self.flows.append(modules.Flip())
-
-    def forward(self, x, x_mask, g=None, reverse=False):
-        if not reverse:
-            for flow in self.flows:
-                x, _ = flow(x, x_mask, g=g, reverse=reverse)
-        else:
-            for flow in reversed(self.flows):
-                x = flow(x, x_mask, g=g, reverse=reverse)
-        return x
-
-
-class Encoder(nn.Module):
-    def __init__(
-        self,
-        in_channels,
-        out_channels,
-        hidden_channels,
-        kernel_size,
-        dilation_rate,
-        n_layers,
-        gin_channels=0,
-    ):
-        super().__init__()
-        self.in_channels = in_channels
-        self.out_channels = out_channels
-        self.hidden_channels = hidden_channels
-        self.kernel_size = kernel_size
-        self.dilation_rate = dilation_rate
-        self.n_layers = n_layers
-        self.gin_channels = gin_channels
-
-        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
-        self.enc = modules.WN(
-            hidden_channels,
-            kernel_size,
-            dilation_rate,
-            n_layers,
-            gin_channels=gin_channels,
-        )
-        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
-
-    def forward(self, x, x_lengths, g=None):
-        # print(x.shape,x_lengths.shape)
-        x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
-            x.dtype
-        )
-        x = self.pre(x) * x_mask
-        x = self.enc(x, x_mask, g=g)
-        stats = self.proj(x) * x_mask
-        m, logs = torch.split(stats, self.out_channels, dim=1)
-        z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
-        return z, m, logs, x_mask
-
-
-class TextEncoder(nn.Module):
-    def __init__(
-        self,
-        out_channels,
-        hidden_channels,
-        kernel_size,
-        n_layers,
-        gin_channels=0,
-        filter_channels=None,
-        n_heads=None,
-        p_dropout=None,
-    ):
-        super().__init__()
-        self.out_channels = out_channels
-        self.hidden_channels = hidden_channels
-        self.kernel_size = kernel_size
-        self.n_layers = n_layers
-        self.gin_channels = gin_channels
-        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
-        self.f0_emb = nn.Embedding(256, hidden_channels)
-
-        self.enc_ = attentions.Encoder(
-            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
-        )
-
-    def forward(self, x, x_mask, f0=None, noice_scale=1):
-        x = x + self.f0_emb(f0).transpose(1, 2)
-        x = self.enc_(x * x_mask, x_mask)
-        stats = self.proj(x) * x_mask
-        m, logs = torch.split(stats, self.out_channels, dim=1)
-        z = (m + torch.randn_like(m) * torch.exp(logs) * noice_scale) * x_mask
-
-        return z, m, logs, x_mask
-
-
-class DiscriminatorP(torch.nn.Module):
-    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
-        super().__init__()
-        self.period = period
-        self.use_spectral_norm = use_spectral_norm
-        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
-        self.convs = nn.ModuleList(
-            [
-                norm_f(
-                    Conv2d(
-                        1,
-                        32,
-                        (kernel_size, 1),
-                        (stride, 1),
-                        padding=(get_padding(kernel_size, 1), 0),
-                    )
-                ),
-                norm_f(
-                    Conv2d(
-                        32,
-                        128,
-                        (kernel_size, 1),
-                        (stride, 1),
-                        padding=(get_padding(kernel_size, 1), 0),
-                    )
-                ),
-                norm_f(
-                    Conv2d(
-                        128,
-                        512,
-                        (kernel_size, 1),
-                        (stride, 1),
-                        padding=(get_padding(kernel_size, 1), 0),
-                    )
-                ),
-                norm_f(
-                    Conv2d(
-                        512,
-                        1024,
-                        (kernel_size, 1),
-                        (stride, 1),
-                        padding=(get_padding(kernel_size, 1), 0),
-                    )
-                ),
-                norm_f(
-                    Conv2d(
-                        1024,
-                        1024,
-                        (kernel_size, 1),
-                        1,
-                        padding=(get_padding(kernel_size, 1), 0),
-                    )
-                ),
-            ]
-        )
-        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
-
-    def forward(self, x):
-        fmap = []
-
-        # 1d to 2d
-        b, c, t = x.shape
-        if t % self.period != 0:  # pad first
-            n_pad = self.period - (t % self.period)
-            x = F.pad(x, (0, n_pad), "reflect")
-            t = t + n_pad
-        x = x.view(b, c, t // self.period, self.period)
-
-        for l in self.convs:
-            x = l(x)
-            x = F.leaky_relu(x, modules.LRELU_SLOPE)
-            fmap.append(x)
-        x = self.conv_post(x)
-        fmap.append(x)
-        x = torch.flatten(x, 1, -1)
-
-        return x, fmap
-
-
-class DiscriminatorS(torch.nn.Module):
-    def __init__(self, use_spectral_norm=False):
-        super().__init__()
-        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
-        self.convs = nn.ModuleList(
-            [
-                norm_f(Conv1d(1, 16, 15, 1, padding=7)),
-                norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
-                norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
-                norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
-                norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
-                norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
-            ]
-        )
-        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
-
-    def forward(self, x):
-        fmap = []
-
-        for l in self.convs:
-            x = l(x)
-            x = F.leaky_relu(x, modules.LRELU_SLOPE)
-            fmap.append(x)
-        x = self.conv_post(x)
-        fmap.append(x)
-        x = torch.flatten(x, 1, -1)
-
-        return x, fmap
-
-
-class MultiPeriodDiscriminator(torch.nn.Module):
-    def __init__(self, use_spectral_norm=False):
-        super().__init__()
-        periods = [2, 3, 5, 7, 11]
-
-        discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
-        discs = discs + [
-            DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
-        ]
-        self.discriminators = nn.ModuleList(discs)
-
-    def forward(self, y, y_hat):
-        y_d_rs = []
-        y_d_gs = []
-        fmap_rs = []
-        fmap_gs = []
-        for i, d in enumerate(self.discriminators):
-            y_d_r, fmap_r = d(y)
-            y_d_g, fmap_g = d(y_hat)
-            y_d_rs.append(y_d_r)
-            y_d_gs.append(y_d_g)
-            fmap_rs.append(fmap_r)
-            fmap_gs.append(fmap_g)
-
-        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
-
-
-class SpeakerEncoder(torch.nn.Module):
-    def __init__(
-        self,
-        mel_n_channels=80,
-        model_num_layers=3,
-        model_hidden_size=256,
-        model_embedding_size=256,
-    ):
-        super().__init__()
-        self.lstm = nn.LSTM(
-            mel_n_channels, model_hidden_size, model_num_layers, batch_first=True
-        )
-        self.linear = nn.Linear(model_hidden_size, model_embedding_size)
-        self.relu = nn.ReLU()
-
-    def forward(self, mels):
-        self.lstm.flatten_parameters()
-        _, (hidden, _) = self.lstm(mels)
-        embeds_raw = self.relu(self.linear(hidden[-1]))
-        return embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True)
-
-    def compute_partial_slices(self, total_frames, partial_frames, partial_hop):
-        mel_slices = []
-        for i in range(0, total_frames - partial_frames, partial_hop):
-            mel_range = torch.arange(i, i + partial_frames)
-            mel_slices.append(mel_range)
-
-        return mel_slices
-
-    def embed_utterance(self, mel, partial_frames=128, partial_hop=64):
-        mel_len = mel.size(1)
-        last_mel = mel[:, -partial_frames:]
-
-        if mel_len > partial_frames:
-            mel_slices = self.compute_partial_slices(
-                mel_len, partial_frames, partial_hop
-            )
-            mels = list(mel[:, s] for s in mel_slices)
-            mels.append(last_mel)
-            mels = torch.stack(tuple(mels), 0).squeeze(1)
-
-            with torch.no_grad():
-                partial_embeds = self(mels)
-            embed = torch.mean(partial_embeds, axis=0).unsqueeze(0)
-            # embed = embed / torch.linalg.norm(embed, 2)
-        else:
-            with torch.no_grad():
-                embed = self(last_mel)
-
-        return embed
-
-
-class F0Decoder(nn.Module):
-    def __init__(
-        self,
-        out_channels,
-        hidden_channels,
-        filter_channels,
-        n_heads,
-        n_layers,
-        kernel_size,
-        p_dropout,
-        spk_channels=0,
-    ):
-        super().__init__()
-        self.out_channels = out_channels
-        self.hidden_channels = hidden_channels
-        self.filter_channels = filter_channels
-        self.n_heads = n_heads
-        self.n_layers = n_layers
-        self.kernel_size = kernel_size
-        self.p_dropout = p_dropout
-        self.spk_channels = spk_channels
-
-        self.prenet = nn.Conv1d(hidden_channels, hidden_channels, 3, padding=1)
-        self.decoder = attentions.FFT(
-            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
-        )
-        self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
-        self.f0_prenet = nn.Conv1d(1, hidden_channels, 3, padding=1)
-        self.cond = nn.Conv1d(spk_channels, hidden_channels, 1)
-
-    def forward(self, x, norm_f0, x_mask, spk_emb=None):
-        x = torch.detach(x)
-        if spk_emb is not None:
-            x = x + self.cond(spk_emb)
-        x += self.f0_prenet(norm_f0)
-        x = self.prenet(x) * x_mask
-        x = self.decoder(x * x_mask, x_mask)
-        x = self.proj(x) * x_mask
-        return x
-
-
-class SynthesizerTrn(nn.Module):
-    """
-    Synthesizer for Training
-    """
-
-    def __init__(
-        self,
-        spec_channels: int,
-        segment_size: int,
-        inter_channels: int,
-        hidden_channels: int,
-        filter_channels: int,
-        n_heads: int,
-        n_layers: int,
-        kernel_size: int,
-        p_dropout: int,
-        resblock: str,
-        resblock_kernel_sizes: Sequence[int],
-        resblock_dilation_sizes: Sequence[Sequence[int]],
-        upsample_rates: Sequence[int],
-        upsample_initial_channel: int,
-        upsample_kernel_sizes: Sequence[int],
-        gin_channels: int,
-        ssl_dim: int,
-        n_speakers: int,
-        sampling_rate: int = 44100,
-        type_: Literal["hifi-gan", "istft", "ms-istft", "mb-istft"] = "hifi-gan",
-        gen_istft_n_fft: int = 16,
-        gen_istft_hop_size: int = 4,
-        subbands: int = 8,
-        **kwargs: Any,
-    ):
-        super().__init__()
-        self.spec_channels = spec_channels
-        self.inter_channels = inter_channels
-        self.hidden_channels = hidden_channels
-        self.filter_channels = filter_channels
-        self.n_heads = n_heads
-        self.n_layers = n_layers
-        self.kernel_size = kernel_size
-        self.p_dropout = p_dropout
-        self.resblock = resblock
-        self.resblock_kernel_sizes = resblock_kernel_sizes
-        self.resblock_dilation_sizes = resblock_dilation_sizes
-        self.upsample_rates = upsample_rates
-        self.upsample_initial_channel = upsample_initial_channel
-        self.upsample_kernel_sizes = upsample_kernel_sizes
-        self.segment_size = segment_size
-        self.gin_channels = gin_channels
-        self.ssl_dim = ssl_dim
-        self.emb_g = nn.Embedding(n_speakers, gin_channels)
-
-        self.pre = nn.Conv1d(ssl_dim, hidden_channels, kernel_size=5, padding=2)
-
-        self.enc_p = TextEncoder(
-            inter_channels,
-            hidden_channels,
-            filter_channels=filter_channels,
-            n_heads=n_heads,
-            n_layers=n_layers,
-            kernel_size=kernel_size,
-            p_dropout=p_dropout,
-        )
-
-        LOG.info(f"Decoder type: {type_}")
-        if type_ == "hifi-gan":
-            hps = {
-                "sampling_rate": sampling_rate,
-                "inter_channels": inter_channels,
-                "resblock": resblock,
-                "resblock_kernel_sizes": resblock_kernel_sizes,
-                "resblock_dilation_sizes": resblock_dilation_sizes,
-                "upsample_rates": upsample_rates,
-                "upsample_initial_channel": upsample_initial_channel,
-                "upsample_kernel_sizes": upsample_kernel_sizes,
-                "gin_channels": gin_channels,
-            }
-            self.dec = Generator(h=hps)
-            self.mb = False
-        else:
-            hps = {
-                "initial_channel": inter_channels,
-                "resblock": resblock,
-                "resblock_kernel_sizes": resblock_kernel_sizes,
-                "resblock_dilation_sizes": resblock_dilation_sizes,
-                "upsample_rates": upsample_rates,
-                "upsample_initial_channel": upsample_initial_channel,
-                "upsample_kernel_sizes": upsample_kernel_sizes,
-                "gin_channels": gin_channels,
-                "gen_istft_n_fft": gen_istft_n_fft,
-                "gen_istft_hop_size": gen_istft_hop_size,
-                "subbands": subbands,
-            }
-            from .vdecoder.mb_istft.generators import (
-                Multiband_iSTFT_Generator,
-                Multistream_iSTFT_Generator,
-                iSTFT_Generator,
-            )
-
-            # gen_istft_n_fft, gen_istft_hop_size, subbands
-            if type_ == "istft":
-                del hps["subbands"]
-                self.dec = iSTFT_Generator(**hps)
-            elif type_ == "ms-istft":
-                self.dec = Multistream_iSTFT_Generator(**hps)
-            elif type_ == "mb-istft":
-                self.dec = Multiband_iSTFT_Generator(**hps)
-            else:
-                raise ValueError(f"Unknown type: {type_}")
-            self.mb = True
-
-        self.enc_q = Encoder(
-            spec_channels,
-            inter_channels,
-            hidden_channels,
-            5,
-            1,
-            16,
-            gin_channels=gin_channels,
-        )
-        self.flow = ResidualCouplingBlock(
-            inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels
-        )
-        self.f0_decoder = F0Decoder(
-            1,
-            hidden_channels,
-            filter_channels,
-            n_heads,
-            n_layers,
-            kernel_size,
-            p_dropout,
-            spk_channels=gin_channels,
-        )
-        self.emb_uv = nn.Embedding(2, hidden_channels)
-
-    def forward(self, c, f0, uv, spec, g=None, c_lengths=None, spec_lengths=None):
-        g = self.emb_g(g).transpose(1, 2)
-        # ssl prenet
-        x_mask = torch.unsqueeze(commons.sequence_mask(c_lengths, c.size(2)), 1).to(
-            c.dtype
-        )
-        x = self.pre(c) * x_mask + self.emb_uv(uv.long()).transpose(1, 2)
-
-        # f0 predict
-        lf0 = 2595.0 * torch.log10(1.0 + f0.unsqueeze(1) / 700.0) / 500
-        norm_lf0 = utils.normalize_f0(lf0, x_mask, uv)
-        pred_lf0 = self.f0_decoder(x, norm_lf0, x_mask, spk_emb=g)
-
-        # encoder
-        z_ptemp, m_p, logs_p, _ = self.enc_p(x, x_mask, f0=f0_to_coarse(f0))
-        z, m_q, logs_q, spec_mask = self.enc_q(spec, spec_lengths, g=g)
-
-        # flow
-        z_p = self.flow(z, spec_mask, g=g)
-        z_slice, pitch_slice, ids_slice = commons.rand_slice_segments_with_pitch(
-            z, f0, spec_lengths, self.segment_size
-        )
-
-        # nsf decoder
-        # MB-iSTFT-VITS
-        if self.dec:
-            o, o_mb = self.dec(z_slice, g=g)
-        else:
-            o = self.dec(z_slice, g=g, f0=pitch_slice)
-            o_mb = None
-        return (
-            o,
-            o_mb,
-            ids_slice,
-            spec_mask,
-            (z, z_p, m_p, logs_p, m_q, logs_q),
-            pred_lf0,
-            norm_lf0,
-            lf0,
-        )
-
-    def infer(self, c, f0, uv, g=None, noice_scale=0.35, predict_f0=False):
-        c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device)
-        g = self.emb_g(g).transpose(1, 2)
-        x_mask = torch.unsqueeze(commons.sequence_mask(c_lengths, c.size(2)), 1).to(
-            c.dtype
-        )
-        x = self.pre(c) * x_mask + self.emb_uv(uv.long()).transpose(1, 2)
-
-        if predict_f0:
-            lf0 = 2595.0 * torch.log10(1.0 + f0.unsqueeze(1) / 700.0) / 500
-            norm_lf0 = utils.normalize_f0(lf0, x_mask, uv, random_scale=False)
-            pred_lf0 = self.f0_decoder(x, norm_lf0, x_mask, spk_emb=g)
-            f0 = (700 * (torch.pow(10, pred_lf0 * 500 / 2595) - 1)).squeeze(1)
-
-        z_p, m_p, logs_p, c_mask = self.enc_p(
-            x, x_mask, f0=f0_to_coarse(f0), noice_scale=noice_scale
-        )
-        z = self.flow(z_p, c_mask, g=g, reverse=True)
-
-        # MB-iSTFT-VITS
-        if self.mb:
-            o, o_mb = self.dec(z * c_mask, g=g)
-        else:
-            o = self.dec(z * c_mask, g=g, f0=f0)
-        return o
diff --git a/src/so_vits_svc_fork/onnxexport/__init__.py b/src/so_vits_svc_fork/modules/decoders/__init__.py
similarity index 100%
rename from src/so_vits_svc_fork/onnxexport/__init__.py
rename to src/so_vits_svc_fork/modules/decoders/__init__.py
diff --git a/src/so_vits_svc_fork/modules/decoders/f0.py b/src/so_vits_svc_fork/modules/decoders/f0.py
new file mode 100644
index 00000000..38d8c77d
--- /dev/null
+++ b/src/so_vits_svc_fork/modules/decoders/f0.py
@@ -0,0 +1,77 @@
+import torch
+from torch import nn
+
+from so_vits_svc_fork.modules import attentions as attentions
+
+
+class F0Decoder(nn.Module):
+    """Decode hidden features conditioned on a normalized F0 track.
+
+    The incoming features are detached first, so no gradient flows from this
+    decoder back into the module that produced them.
+    """
+
+    def __init__(
+        self,
+        out_channels,
+        hidden_channels,
+        filter_channels,
+        n_heads,
+        n_layers,
+        kernel_size,
+        p_dropout,
+        spk_channels=0,
+    ):
+        """Create the decoder layers.
+
+        Args:
+            out_channels: Channels produced by the final 1x1 projection.
+            hidden_channels: Channel width used by every internal layer.
+            filter_channels: Passed through to ``attentions.FFT``.
+            n_heads: Passed through to ``attentions.FFT``.
+            n_layers: Passed through to ``attentions.FFT``.
+            kernel_size: Passed through to ``attentions.FFT``.
+            p_dropout: Passed through to ``attentions.FFT``.
+            spk_channels: Input channels of the speaker-conditioning conv.
+        """
+        super().__init__()
+        self.out_channels = out_channels
+        self.hidden_channels = hidden_channels
+        self.filter_channels = filter_channels
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.spk_channels = spk_channels
+
+        self.prenet = nn.Conv1d(hidden_channels, hidden_channels, 3, padding=1)
+        self.decoder = attentions.FFT(
+            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
+        )
+        self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
+        self.f0_prenet = nn.Conv1d(1, hidden_channels, 3, padding=1)
+        self.cond = nn.Conv1d(spk_channels, hidden_channels, 1)
+
+    def forward(self, x, norm_f0, x_mask, spk_emb=None):
+        """Run the decoder.
+
+        Args:
+            x: Hidden features; detached and never modified in place.
+            norm_f0: Normalized F0 track, fed to the 1-input-channel prenet.
+            x_mask: Mask multiplied into the activations after each stage.
+            spk_emb: Optional speaker embedding, added through ``cond``.
+
+        Returns:
+            The masked output of the final projection.
+        """
+        x = torch.detach(x)
+        if spk_emb is not None:
+            x = x + self.cond(spk_emb)
+        # Out-of-place add on purpose: ``detach`` shares storage with the caller's
+        # tensor, so ``x += ...`` would overwrite the caller's data whenever
+        # ``spk_emb`` is None.
+        x = x + self.f0_prenet(norm_f0)
+        x = self.prenet(x) * x_mask
+        x = self.decoder(x * x_mask, x_mask)
+        x = self.proj(x) * x_mask
+        return x
diff --git a/src/so_vits_svc_fork/modules/decoders/hifigan/__init__.py b/src/so_vits_svc_fork/modules/decoders/hifigan/__init__.py
new file mode 100644
index 00000000..486c66fe
--- /dev/null
+++ b/src/so_vits_svc_fork/modules/decoders/hifigan/__init__.py
@@ -0,0 +1,3 @@
+from ._models import Generator
+
+__all__ = ["Generator"]
diff --git a/src/so_vits_svc_fork/vdecoder/hifigan/models.py b/src/so_vits_svc_fork/modules/decoders/hifigan/_models.py
similarity index 99%
rename from src/so_vits_svc_fork/vdecoder/hifigan/models.py
rename to src/so_vits_svc_fork/modules/decoders/hifigan/_models.py
index 779f4ec6..07ae35b5 100644
--- a/src/so_vits_svc_fork/vdecoder/hifigan/models.py
+++ b/src/so_vits_svc_fork/modules/decoders/hifigan/_models.py
@@ -7,7 +7,7 @@
 from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d
 from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm
 
-from .utils import get_padding, init_weights
+from ._utils import get_padding, init_weights
 
 LOG = getLogger(__name__)
 
diff --git a/src/so_vits_svc_fork/vdecoder/hifigan/utils.py b/src/so_vits_svc_fork/modules/decoders/hifigan/_utils.py
similarity index 100%
rename from src/so_vits_svc_fork/vdecoder/hifigan/utils.py
rename to src/so_vits_svc_fork/modules/decoders/hifigan/_utils.py
diff --git a/src/so_vits_svc_fork/modules/decoders/mb_istft/__init__.py b/src/so_vits_svc_fork/modules/decoders/mb_istft/__init__.py
new file mode 100644
index 00000000..1ba61d27
--- /dev/null
+++ b/src/so_vits_svc_fork/modules/decoders/mb_istft/__init__.py
@@ -0,0 +1,15 @@
+from ._generators import (
+    Multiband_iSTFT_Generator,
+    Multistream_iSTFT_Generator,
+    iSTFT_Generator,
+)
+from ._loss import subband_stft_loss
+from ._pqmf import PQMF
+
+__all__ = [
+    "subband_stft_loss",
+    "PQMF",
+    "iSTFT_Generator",
+    "Multiband_iSTFT_Generator",
+    "Multistream_iSTFT_Generator",
+]
diff --git a/src/so_vits_svc_fork/vdecoder/mb_istft/generators.py b/src/so_vits_svc_fork/modules/decoders/mb_istft/_generators.py
similarity index 99%
rename from src/so_vits_svc_fork/vdecoder/mb_istft/generators.py
rename to src/so_vits_svc_fork/modules/decoders/mb_istft/_generators.py
index 95ff98c1..f9b53880 100644
--- a/src/so_vits_svc_fork/vdecoder/mb_istft/generators.py
+++ b/src/so_vits_svc_fork/modules/decoders/mb_istft/_generators.py
@@ -8,8 +8,8 @@
 
 from ...modules import modules
 from ...modules.commons import get_padding, init_weights
-from .pqmf import PQMF
-from .stft import TorchSTFT
+from ._pqmf import PQMF
+from ._stft import TorchSTFT
 
 
 class iSTFT_Generator(torch.nn.Module):
diff --git a/src/so_vits_svc_fork/vdecoder/mb_istft/loss.py b/src/so_vits_svc_fork/modules/decoders/mb_istft/_loss.py
similarity index 88%
rename from src/so_vits_svc_fork/vdecoder/mb_istft/loss.py
rename to src/so_vits_svc_fork/modules/decoders/mb_istft/_loss.py
index 9895befd..7e8b7634 100644
--- a/src/so_vits_svc_fork/vdecoder/mb_istft/loss.py
+++ b/src/so_vits_svc_fork/modules/decoders/mb_istft/_loss.py
@@ -1,4 +1,4 @@
-from .stft_loss import MultiResolutionSTFTLoss
+from ._stft_loss import MultiResolutionSTFTLoss
 
 
 def subband_stft_loss(h, y_mb, y_hat_mb):
diff --git a/src/so_vits_svc_fork/vdecoder/mb_istft/pqmf.py b/src/so_vits_svc_fork/modules/decoders/mb_istft/_pqmf.py
similarity index 98%
rename from src/so_vits_svc_fork/vdecoder/mb_istft/pqmf.py
rename to src/so_vits_svc_fork/modules/decoders/mb_istft/_pqmf.py
index 987dde8e..981c1739 100644
--- a/src/so_vits_svc_fork/vdecoder/mb_istft/pqmf.py
+++ b/src/so_vits_svc_fork/modules/decoders/mb_istft/_pqmf.py
@@ -48,7 +48,7 @@ class PQMF(torch.nn.Module):
         https://ieeexplore.ieee.org/document/258122
     """
 
-    def __init__(self, device, subbands=4, taps=62, cutoff_ratio=0.15, beta=9.0):
+    def __init__(self, device, subbands=8, taps=62, cutoff_ratio=0.15, beta=9.0):
         """Initialize PQMF module.
         Args:
             subbands (int): The number of subbands.
diff --git a/src/so_vits_svc_fork/vdecoder/mb_istft/stft.py b/src/so_vits_svc_fork/modules/decoders/mb_istft/_stft.py
similarity index 100%
rename from src/so_vits_svc_fork/vdecoder/mb_istft/stft.py
rename to src/so_vits_svc_fork/modules/decoders/mb_istft/_stft.py
diff --git a/src/so_vits_svc_fork/vdecoder/mb_istft/stft_loss.py b/src/so_vits_svc_fork/modules/decoders/mb_istft/_stft_loss.py
similarity index 100%
rename from src/so_vits_svc_fork/vdecoder/mb_istft/stft_loss.py
rename to src/so_vits_svc_fork/modules/decoders/mb_istft/_stft_loss.py
diff --git a/src/so_vits_svc_fork/modules/descriminators.py b/src/so_vits_svc_fork/modules/descriminators.py
new file mode 100644
index 00000000..dbffd86b
--- /dev/null
+++ b/src/so_vits_svc_fork/modules/descriminators.py
@@ -0,0 +1,128 @@
+import torch
+from torch import nn
+from torch.nn import Conv1d, Conv2d
+from torch.nn import functional as F
+from torch.nn.utils import spectral_norm, weight_norm
+
+from so_vits_svc_fork.modules import modules as modules
+from so_vits_svc_fork.modules.commons import get_padding
+
+
+class DiscriminatorP(torch.nn.Module):
+    """Discriminator that folds its input into rows of ``period`` samples."""
+
+    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
+        super().__init__()
+        self.period = period
+        self.use_spectral_norm = use_spectral_norm
+        # ``== False`` is kept as-is: only a value equal to False picks weight_norm.
+        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
+        # (in_channels, out_channels, stride) of each conv, in forward order.
+        layer_specs = (
+            (1, 32, (stride, 1)),
+            (32, 128, (stride, 1)),
+            (128, 512, (stride, 1)),
+            (512, 1024, (stride, 1)),
+            (1024, 1024, 1),
+        )
+        self.convs = nn.ModuleList(
+            [
+                norm_f(
+                    Conv2d(
+                        c_in,
+                        c_out,
+                        (kernel_size, 1),
+                        conv_stride,
+                        padding=(get_padding(kernel_size, 1), 0),
+                    )
+                )
+                for c_in, c_out, conv_stride in layer_specs
+            ]
+        )
+        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
+
+    def forward(self, x):
+        """Return ``(flattened score, feature maps)`` for a ``(b, c, t)`` input."""
+        feature_maps = []
+
+        # 1d to 2d: right-pad so that t is a multiple of the period, then fold
+        batch, channels, length = x.shape
+        remainder = length % self.period
+        if remainder != 0:
+            n_pad = self.period - remainder
+            x = F.pad(x, (0, n_pad), "reflect")
+            length = length + n_pad
+        x = x.view(batch, channels, length // self.period, self.period)
+
+        for conv in self.convs:
+            x = F.leaky_relu(conv(x), modules.LRELU_SLOPE)
+            feature_maps.append(x)
+        x = self.conv_post(x)
+        feature_maps.append(x)
+
+        return torch.flatten(x, 1, -1), feature_maps
+
+
+class DiscriminatorS(torch.nn.Module):
+    """Discriminator built from a stack of 1-D convolutions."""
+
+    def __init__(self, use_spectral_norm=False):
+        super().__init__()
+        # ``== False`` is kept as-is: only a value equal to False picks weight_norm.
+        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
+        conv_layers = [
+            norm_f(Conv1d(1, 16, 15, 1, padding=7)),
+            norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
+            norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
+            norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
+            norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
+            norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
+        ]
+        self.convs = nn.ModuleList(conv_layers)
+        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
+
+    def forward(self, x):
+        """Return ``(flattened score, feature maps)`` for the input ``x``."""
+        feature_maps = []
+
+        for conv in self.convs:
+            x = F.leaky_relu(conv(x), modules.LRELU_SLOPE)
+            feature_maps.append(x)
+        x = self.conv_post(x)
+        feature_maps.append(x)
+
+        return torch.flatten(x, 1, -1), feature_maps
+
+
+class MultiPeriodDiscriminator(torch.nn.Module):
+    """One ``DiscriminatorS`` followed by a ``DiscriminatorP`` per fixed period."""
+
+    def __init__(self, use_spectral_norm=False):
+        super().__init__()
+        periods = [2, 3, 5, 7, 11]
+
+        sub_discriminators = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
+        sub_discriminators.extend(
+            DiscriminatorP(period, use_spectral_norm=use_spectral_norm)
+            for period in periods
+        )
+        self.discriminators = nn.ModuleList(sub_discriminators)
+
+    def forward(self, y, y_hat):
+        """Run every sub-discriminator on ``y`` and on ``y_hat``.
+
+        Returns:
+            ``(y_d_rs, y_d_gs, fmap_rs, fmap_gs)``: per-discriminator scores for
+            ``y`` and ``y_hat``, then the matching feature-map lists.
+        """
+        y_d_rs, y_d_gs = [], []
+        fmap_rs, fmap_gs = [], []
+        for disc in self.discriminators:
+            score_real, fmap_real = disc(y)
+            score_fake, fmap_fake = disc(y_hat)
+            y_d_rs.append(score_real)
+            y_d_gs.append(score_fake)
+            fmap_rs.append(fmap_real)
+            fmap_gs.append(fmap_fake)
+
+        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
diff --git a/src/so_vits_svc_fork/modules/encoders.py b/src/so_vits_svc_fork/modules/encoders.py
new file mode 100644
index 00000000..4894aa5c
--- /dev/null
+++ b/src/so_vits_svc_fork/modules/encoders.py
@@ -0,0 +1,154 @@
+import torch
+from torch import nn
+
+from so_vits_svc_fork.modules import attentions as attentions
+from so_vits_svc_fork.modules import commons as commons
+from so_vits_svc_fork.modules import modules as modules
+
+
+class SpeakerEncoder(torch.nn.Module):
+    """LSTM, linear and ReLU stack that produces L2-normalized embeddings."""
+
+    def __init__(
+        self,
+        mel_n_channels=80,
+        model_num_layers=3,
+        model_hidden_size=256,
+        model_embedding_size=256,
+    ):
+        super().__init__()
+        self.lstm = nn.LSTM(
+            mel_n_channels, model_hidden_size, model_num_layers, batch_first=True
+        )
+        self.linear = nn.Linear(model_hidden_size, model_embedding_size)
+        self.relu = nn.ReLU()
+
+    def forward(self, mels):
+        """Embed a batch of mels from the final hidden state of the last layer."""
+        self.lstm.flatten_parameters()
+        _, (hidden, _) = self.lstm(mels)
+        raw = self.relu(self.linear(hidden[-1]))
+        norm = torch.norm(raw, dim=1, keepdim=True)
+        return raw / norm
+
+    def compute_partial_slices(self, total_frames, partial_frames, partial_hop):
+        """Return index tensors of ``partial_frames`` frames, one per hop."""
+        return [
+            torch.arange(start, start + partial_frames)
+            for start in range(0, total_frames - partial_frames, partial_hop)
+        ]
+
+    def embed_utterance(self, mel, partial_frames=128, partial_hop=64):
+        """Embed one utterance without tracking gradients.
+
+        When dim 1 of ``mel`` is longer than ``partial_frames``, the input is cut
+        into hop-spaced windows plus the trailing window and the window
+        embeddings are averaged; otherwise the input is embedded directly.
+        """
+        mel_len = mel.size(1)
+        last_mel = mel[:, -partial_frames:]
+
+        if mel_len <= partial_frames:
+            with torch.no_grad():
+                return self(last_mel)
+
+        mel_slices = self.compute_partial_slices(mel_len, partial_frames, partial_hop)
+        mels = [mel[:, s] for s in mel_slices]
+        mels.append(last_mel)
+        mels = torch.stack(tuple(mels), 0).squeeze(1)
+
+        with torch.no_grad():
+            partial_embeds = self(mels)
+        # NOTE: the averaged embedding is returned without re-normalization.
+        return torch.mean(partial_embeds, axis=0).unsqueeze(0)
+
+
+class Encoder(nn.Module):
+    """Conv, WN stack and projection that sample ``z`` from ``(m, logs)``."""
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        hidden_channels,
+        kernel_size,
+        dilation_rate,
+        n_layers,
+        gin_channels=0,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.gin_channels = gin_channels
+
+        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
+        self.enc = modules.WN(
+            hidden_channels,
+            kernel_size,
+            dilation_rate,
+            n_layers,
+            gin_channels=gin_channels,
+        )
+        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
+
+    def forward(self, x, x_lengths, g=None):
+        """Encode ``x`` and return ``(z, m, logs, x_mask)``.
+
+        ``z`` is ``(m + noise * exp(logs)) * x_mask`` with fresh Gaussian noise.
+        """
+        seq_mask = commons.sequence_mask(x_lengths, x.size(2))
+        x_mask = torch.unsqueeze(seq_mask, 1).to(x.dtype)
+        hidden = self.pre(x) * x_mask
+        hidden = self.enc(hidden, x_mask, g=g)
+        stats = self.proj(hidden) * x_mask
+        m, logs = torch.split(stats, self.out_channels, dim=1)
+        noise = torch.randn_like(m)
+        z = (m + noise * torch.exp(logs)) * x_mask
+        return z, m, logs, x_mask
+
+
+class TextEncoder(nn.Module):
+    """Attention encoder whose input is offset by an embedding of ``f0``."""
+
+    def __init__(
+        self,
+        out_channels,
+        hidden_channels,
+        kernel_size,
+        n_layers,
+        gin_channels=0,
+        filter_channels=None,
+        n_heads=None,
+        p_dropout=None,
+    ):
+        super().__init__()
+        self.out_channels = out_channels
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size
+        self.n_layers = n_layers
+        self.gin_channels = gin_channels
+        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
+        self.f0_emb = nn.Embedding(256, hidden_channels)
+
+        self.enc_ = attentions.Encoder(
+            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
+        )
+
+    def forward(self, x, x_mask, f0=None, noice_scale=1):
+        """Encode ``x`` and return ``(z, m, logs, x_mask)``.
+
+        ``f0`` holds indices into the 256-entry ``f0_emb`` table; ``noice_scale``
+        scales the noise term of ``z``.
+        """
+        hidden = x + self.f0_emb(f0).transpose(1, 2)
+        hidden = self.enc_(hidden * x_mask, x_mask)
+        stats = self.proj(hidden) * x_mask
+        m, logs = torch.split(stats, self.out_channels, dim=1)
+        noise = torch.randn_like(m)
+        z = (m + noise * torch.exp(logs) * noice_scale) * x_mask
+
+        return z, m, logs, x_mask
diff --git a/src/so_vits_svc_fork/modules/flows.py b/src/so_vits_svc_fork/modules/flows.py
new file mode 100644
index 00000000..9abcba21
--- /dev/null
+++ b/src/so_vits_svc_fork/modules/flows.py
@@ -0,0 +1,52 @@
+from torch import nn
+
+from so_vits_svc_fork.modules import modules as modules
+
+
+class ResidualCouplingBlock(nn.Module):
+    """Chain of ``n_flows`` coupling layers, each followed by a ``Flip``."""
+
+    def __init__(
+        self,
+        channels,
+        hidden_channels,
+        kernel_size,
+        dilation_rate,
+        n_layers,
+        n_flows=4,
+        gin_channels=0,
+    ):
+        super().__init__()
+        self.channels = channels
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.n_flows = n_flows
+        self.gin_channels = gin_channels
+
+        self.flows = nn.ModuleList()
+        for _ in range(n_flows):
+            coupling = modules.ResidualCouplingLayer(
+                channels,
+                hidden_channels,
+                kernel_size,
+                dilation_rate,
+                n_layers,
+                gin_channels=gin_channels,
+                mean_only=True,
+            )
+            self.flows.append(coupling)
+            self.flows.append(modules.Flip())
+
+    def forward(self, x, x_mask, g=None, reverse=False):
+        """Apply the flows in order, or in reversed order when ``reverse`` is set."""
+        if reverse:
+            for flow in reversed(self.flows):
+                x = flow(x, x_mask, g=g, reverse=reverse)
+            return x
+
+        for flow in self.flows:
+            # forward-direction layers return a pair; only the first item is kept
+            x, _ = flow(x, x_mask, g=g, reverse=reverse)
+        return x
diff --git a/src/so_vits_svc_fork/modules/generator.py b/src/so_vits_svc_fork/modules/generator.py
new file mode 100644
index 00000000..c90d7471
--- /dev/null
+++ b/src/so_vits_svc_fork/modules/generator.py
@@ -0,0 +1,257 @@
+from logging import getLogger
+from typing import Any, Literal, Sequence
+
+import torch
+from torch import nn
+
+import so_vits_svc_fork.f0
+from so_vits_svc_fork.f0 import f0_to_coarse
+from so_vits_svc_fork.modules import commons as commons
+from so_vits_svc_fork.modules.decoders.f0 import F0Decoder
+from so_vits_svc_fork.modules.decoders.hifigan import Generator
+from so_vits_svc_fork.modules.encoders import Encoder, TextEncoder
+from so_vits_svc_fork.modules.flows import ResidualCouplingBlock
+
+LOG = getLogger(__name__)
+
+
+class SynthesizerTrn(nn.Module):
+    """
+    Synthesizer for Training
+
+    ``self.mb`` records which decoder family was built: False for the HiFi-GAN
+    ``Generator`` (called with ``f0``), True for the iSTFT generators (called
+    without ``f0``, returning a pair).
+    """
+
+    def __init__(
+        self,
+        spec_channels: int,
+        segment_size: int,
+        inter_channels: int,
+        hidden_channels: int,
+        filter_channels: int,
+        n_heads: int,
+        n_layers: int,
+        kernel_size: int,
+        p_dropout: float,
+        resblock: str,
+        resblock_kernel_sizes: Sequence[int],
+        resblock_dilation_sizes: Sequence[Sequence[int]],
+        upsample_rates: Sequence[int],
+        upsample_initial_channel: int,
+        upsample_kernel_sizes: Sequence[int],
+        gin_channels: int,
+        ssl_dim: int,
+        n_speakers: int,
+        sampling_rate: int = 44100,
+        type_: Literal["hifi-gan", "istft", "ms-istft", "mb-istft"] = "hifi-gan",
+        gen_istft_n_fft: int = 16,
+        gen_istft_hop_size: int = 4,
+        subbands: int = 8,
+        **kwargs: Any,
+    ):
+        """Build all sub-modules.
+
+        Most arguments are stored on the instance and/or forwarded to the
+        sub-module constructors.
+
+        Args:
+            type_: Decoder to build. ``"hifi-gan"`` uses ``Generator``; the
+                other values select one of the iSTFT generators.
+            **kwargs: Accepted and ignored.
+
+        Raises:
+            ValueError: If ``type_`` is not one of the four known names.
+        """
+        super().__init__()
+        self.spec_channels = spec_channels
+        self.inter_channels = inter_channels
+        self.hidden_channels = hidden_channels
+        self.filter_channels = filter_channels
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.resblock = resblock
+        self.resblock_kernel_sizes = resblock_kernel_sizes
+        self.resblock_dilation_sizes = resblock_dilation_sizes
+        self.upsample_rates = upsample_rates
+        self.upsample_initial_channel = upsample_initial_channel
+        self.upsample_kernel_sizes = upsample_kernel_sizes
+        self.segment_size = segment_size
+        self.gin_channels = gin_channels
+        self.ssl_dim = ssl_dim
+        self.emb_g = nn.Embedding(n_speakers, gin_channels)
+
+        self.pre = nn.Conv1d(ssl_dim, hidden_channels, kernel_size=5, padding=2)
+
+        self.enc_p = TextEncoder(
+            inter_channels,
+            hidden_channels,
+            filter_channels=filter_channels,
+            n_heads=n_heads,
+            n_layers=n_layers,
+            kernel_size=kernel_size,
+            p_dropout=p_dropout,
+        )
+
+        LOG.info(f"Decoder type: {type_}")
+        if type_ == "hifi-gan":
+            hps = {
+                "sampling_rate": sampling_rate,
+                "inter_channels": inter_channels,
+                "resblock": resblock,
+                "resblock_kernel_sizes": resblock_kernel_sizes,
+                "resblock_dilation_sizes": resblock_dilation_sizes,
+                "upsample_rates": upsample_rates,
+                "upsample_initial_channel": upsample_initial_channel,
+                "upsample_kernel_sizes": upsample_kernel_sizes,
+                "gin_channels": gin_channels,
+            }
+            self.dec = Generator(h=hps)
+            self.mb = False
+        else:
+            hps = {
+                "initial_channel": inter_channels,
+                "resblock": resblock,
+                "resblock_kernel_sizes": resblock_kernel_sizes,
+                "resblock_dilation_sizes": resblock_dilation_sizes,
+                "upsample_rates": upsample_rates,
+                "upsample_initial_channel": upsample_initial_channel,
+                "upsample_kernel_sizes": upsample_kernel_sizes,
+                "gin_channels": gin_channels,
+                "gen_istft_n_fft": gen_istft_n_fft,
+                "gen_istft_hop_size": gen_istft_hop_size,
+                "subbands": subbands,
+            }
+            # Imported lazily, and from the package these generators live in
+            # now: ``.vdecoder.mb_istft.generators`` does not exist under
+            # ``so_vits_svc_fork.modules``.
+            from so_vits_svc_fork.modules.decoders.mb_istft import (
+                Multiband_iSTFT_Generator,
+                Multistream_iSTFT_Generator,
+                iSTFT_Generator,
+            )
+
+            # gen_istft_n_fft, gen_istft_hop_size, subbands
+            if type_ == "istft":
+                del hps["subbands"]
+                self.dec = iSTFT_Generator(**hps)
+            elif type_ == "ms-istft":
+                self.dec = Multistream_iSTFT_Generator(**hps)
+            elif type_ == "mb-istft":
+                self.dec = Multiband_iSTFT_Generator(**hps)
+            else:
+                raise ValueError(f"Unknown type: {type_}")
+            self.mb = True
+
+        self.enc_q = Encoder(
+            spec_channels,
+            inter_channels,
+            hidden_channels,
+            5,
+            1,
+            16,
+            gin_channels=gin_channels,
+        )
+        self.flow = ResidualCouplingBlock(
+            inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels
+        )
+        self.f0_decoder = F0Decoder(
+            1,
+            hidden_channels,
+            filter_channels,
+            n_heads,
+            n_layers,
+            kernel_size,
+            p_dropout,
+            spk_channels=gin_channels,
+        )
+        self.emb_uv = nn.Embedding(2, hidden_channels)
+
+    def forward(self, c, f0, uv, spec, g=None, c_lengths=None, spec_lengths=None):
+        """Training pass.
+
+        ``g`` is looked up in the speaker embedding unconditionally, so it must
+        be provided even though it defaults to None.
+
+        Returns:
+            ``(o, o_mb, ids_slice, spec_mask, (z, z_p, m_p, logs_p, m_q, logs_q),
+            pred_lf0, norm_lf0, lf0)``; ``o_mb`` is None for the HiFi-GAN decoder.
+        """
+        g = self.emb_g(g).transpose(1, 2)
+        # ssl prenet
+        x_mask = torch.unsqueeze(commons.sequence_mask(c_lengths, c.size(2)), 1).to(
+            c.dtype
+        )
+        x = self.pre(c) * x_mask + self.emb_uv(uv.long()).transpose(1, 2)
+
+        # f0 predict
+        lf0 = 2595.0 * torch.log10(1.0 + f0.unsqueeze(1) / 700.0) / 500
+        norm_lf0 = so_vits_svc_fork.f0.normalize_f0(lf0, x_mask, uv)
+        pred_lf0 = self.f0_decoder(x, norm_lf0, x_mask, spk_emb=g)
+
+        # encoder
+        z_ptemp, m_p, logs_p, _ = self.enc_p(x, x_mask, f0=f0_to_coarse(f0))
+        z, m_q, logs_q, spec_mask = self.enc_q(spec, spec_lengths, g=g)
+
+        # flow
+        z_p = self.flow(z, spec_mask, g=g)
+        z_slice, pitch_slice, ids_slice = commons.rand_slice_segments_with_pitch(
+            z, f0, spec_lengths, self.segment_size
+        )
+
+        # Branch on the ``mb`` flag, as ``infer`` does. ``self.dec`` is an
+        # ``nn.Module`` instance, which is truthy, so testing it would never
+        # reach the HiFi-GAN branch.
+        # MB-iSTFT-VITS
+        if self.mb:
+            o, o_mb = self.dec(z_slice, g=g)
+        # nsf decoder
+        else:
+            o = self.dec(z_slice, g=g, f0=pitch_slice)
+            o_mb = None
+        return (
+            o,
+            o_mb,
+            ids_slice,
+            spec_mask,
+            (z, z_p, m_p, logs_p, m_q, logs_q),
+            pred_lf0,
+            norm_lf0,
+            lf0,
+        )
+
+    def infer(self, c, f0, uv, g=None, noice_scale=0.35, predict_f0=False):
+        """Inference pass; returns the decoder output ``o``.
+
+        When ``predict_f0`` is true, ``f0`` is replaced by the F0 decoder's
+        prediction before encoding. ``g`` must be provided (see ``forward``).
+        """
+        c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device)
+        g = self.emb_g(g).transpose(1, 2)
+        x_mask = torch.unsqueeze(commons.sequence_mask(c_lengths, c.size(2)), 1).to(
+            c.dtype
+        )
+        x = self.pre(c) * x_mask + self.emb_uv(uv.long()).transpose(1, 2)
+
+        if predict_f0:
+            lf0 = 2595.0 * torch.log10(1.0 + f0.unsqueeze(1) / 700.0) / 500
+            norm_lf0 = so_vits_svc_fork.f0.normalize_f0(
+                lf0, x_mask, uv, random_scale=False
+            )
+            pred_lf0 = self.f0_decoder(x, norm_lf0, x_mask, spk_emb=g)
+            f0 = (700 * (torch.pow(10, pred_lf0 * 500 / 2595) - 1)).squeeze(1)
+
+        z_p, m_p, logs_p, c_mask = self.enc_p(
+            x, x_mask, f0=f0_to_coarse(f0), noice_scale=noice_scale
+        )
+        z = self.flow(z_p, c_mask, g=g, reverse=True)
+
+        # MB-iSTFT-VITS
+        if self.mb:
+            o, o_mb = self.dec(z * c_mask, g=g)
+        else:
+            o = self.dec(z * c_mask, g=g, f0=f0)
+        return o
diff --git a/src/so_vits_svc_fork/vdecoder/__init__.py b/src/so_vits_svc_fork/modules/onnx/__init__.py
similarity index 100%
rename from src/so_vits_svc_fork/vdecoder/__init__.py
rename to src/so_vits_svc_fork/modules/onnx/__init__.py
diff --git a/src/so_vits_svc_fork/onnxexport/model_onnx.py b/src/so_vits_svc_fork/modules/onnx/model_onnx.py
similarity index 97%
rename from src/so_vits_svc_fork/onnxexport/model_onnx.py
rename to src/so_vits_svc_fork/modules/onnx/model_onnx.py
index d450b9c6..c232df89 100644
--- a/src/so_vits_svc_fork/onnxexport/model_onnx.py
+++ b/src/so_vits_svc_fork/modules/onnx/model_onnx.py
@@ -4,11 +4,11 @@
 from torch.nn import functional as F
 from torch.nn.utils import spectral_norm, weight_norm
 
-from .. import utils
-from ..modules import attentions, commons, modules
-from ..modules.commons import get_padding
-from ..utils import f0_to_coarse
-from ..vdecoder.hifigan.models import Generator
+import so_vits_svc_fork.f0
+from so_vits_svc_fork.f0 import f0_to_coarse
+from so_vits_svc_fork.modules import attentions, commons, modules
+from so_vits_svc_fork.modules.commons import get_padding
+from so_vits_svc_fork.modules.decoders.hifigan import Generator
 
 
 class ResidualCouplingBlock(nn.Module):
@@ -394,7 +394,9 @@ def forward(self, c, f0, mel2ph, uv, noise=None, g=None):
 
         if self.predict_f0:
             lf0 = 2595.0 * torch.log10(1.0 + f0.unsqueeze(1) / 700.0) / 500
-            norm_lf0 = utils.normalize_f0(lf0, x_mask, uv, random_scale=False)
+            norm_lf0 = so_vits_svc_fork.f0.normalize_f0(
+                lf0, x_mask, uv, random_scale=False
+            )
             pred_lf0 = self.f0_decoder(x, norm_lf0, x_mask, spk_emb=g)
             f0 = (700 * (torch.pow(10, pred_lf0 * 500 / 2595) - 1)).squeeze(1)
 
diff --git a/src/so_vits_svc_fork/onnx_export.py b/src/so_vits_svc_fork/modules/onnx/onnx_export.py
similarity index 94%
rename from src/so_vits_svc_fork/onnx_export.py
rename to src/so_vits_svc_fork/modules/onnx/onnx_export.py
index 401c266d..c6c9bb78 100644
--- a/src/so_vits_svc_fork/onnx_export.py
+++ b/src/so_vits_svc_fork/modules/onnx/onnx_export.py
@@ -4,8 +4,8 @@
 
 import torch
 
-from . import utils
-from .onnxexport.model_onnx import SynthesizerTrn
+from so_vits_svc_fork import utils
+from so_vits_svc_fork.modules.onnx.model_onnx import SynthesizerTrn
 
 
 def onnx_export(
diff --git a/src/so_vits_svc_fork/vdecoder/hifigan/__init__.py b/src/so_vits_svc_fork/preprocessing/__init__.py
similarity index 100%
rename from src/so_vits_svc_fork/vdecoder/hifigan/__init__.py
rename to src/so_vits_svc_fork/preprocessing/__init__.py
diff --git a/src/so_vits_svc_fork/preprocess_flist_config.py b/src/so_vits_svc_fork/preprocessing/preprocess_flist_config.py
similarity index 100%
rename from src/so_vits_svc_fork/preprocess_flist_config.py
rename to src/so_vits_svc_fork/preprocessing/preprocess_flist_config.py
diff --git a/src/so_vits_svc_fork/preprocess_hubert_f0.py b/src/so_vits_svc_fork/preprocessing/preprocess_hubert_f0.py
similarity index 95%
rename from src/so_vits_svc_fork/preprocess_hubert_f0.py
rename to src/so_vits_svc_fork/preprocessing/preprocess_hubert_f0.py
index d4f383d0..026f9236 100644
--- a/src/so_vits_svc_fork/preprocess_hubert_f0.py
+++ b/src/so_vits_svc_fork/preprocessing/preprocess_hubert_f0.py
@@ -11,9 +11,11 @@
 from joblib import Parallel, cpu_count, delayed
 from tqdm import tqdm
 
-from . import utils
+import so_vits_svc_fork.f0
+from so_vits_svc_fork import utils
+from so_vits_svc_fork.utils import HUBERT_SAMPLING_RATE
+
 from .preprocess_utils import check_hubert_min_duration
-from .utils import HUBERT_SAMPLING_RATE
 
 LOG = getLogger(__name__)
 
@@ -48,7 +50,7 @@ def _process_one(
     # Compute f0
     f0_path = filepath.parent / (filepath.name + ".f0.npy")
     if (not f0_path.exists()) or force_rebuild:
-        f0 = utils.compute_f0(
+        f0 = so_vits_svc_fork.f0.compute_f0(
             audio, sampling_rate=sampling_rate, hop_length=hop_length, method=f0_method
         )
         np.save(f0_path, f0)
diff --git a/src/so_vits_svc_fork/preprocess_resample.py b/src/so_vits_svc_fork/preprocessing/preprocess_resample.py
similarity index 100%
rename from src/so_vits_svc_fork/preprocess_resample.py
rename to src/so_vits_svc_fork/preprocessing/preprocess_resample.py
diff --git a/src/so_vits_svc_fork/preprocess_speaker_diarization.py b/src/so_vits_svc_fork/preprocessing/preprocess_speaker_diarization.py
similarity index 100%
rename from src/so_vits_svc_fork/preprocess_speaker_diarization.py
rename to src/so_vits_svc_fork/preprocessing/preprocess_speaker_diarization.py
diff --git a/src/so_vits_svc_fork/preprocess_split.py b/src/so_vits_svc_fork/preprocessing/preprocess_split.py
similarity index 100%
rename from src/so_vits_svc_fork/preprocess_split.py
rename to src/so_vits_svc_fork/preprocessing/preprocess_split.py
diff --git a/src/so_vits_svc_fork/preprocess_utils.py b/src/so_vits_svc_fork/preprocessing/preprocess_utils.py
similarity index 100%
rename from src/so_vits_svc_fork/preprocess_utils.py
rename to src/so_vits_svc_fork/preprocessing/preprocess_utils.py
diff --git a/src/so_vits_svc_fork/spec_gen.py b/src/so_vits_svc_fork/spec_gen.py
deleted file mode 100644
index b0ba544b..00000000
--- a/src/so_vits_svc_fork/spec_gen.py
+++ /dev/null
@@ -1,23 +0,0 @@
-import json
-
-from tqdm import tqdm
-
-from .data_utils import TextAudioSpeakerLoader
-from .utils import HParams
-
-config_path = "configs/config.json"
-with open(config_path) as f:
-    data = f.read()
-config = json.loads(data)
-hps = HParams(**config)
-
-train_dataset = TextAudioSpeakerLoader("filelists/train.txt", hps)
-test_dataset = TextAudioSpeakerLoader("filelists/test.txt", hps)
-eval_dataset = TextAudioSpeakerLoader("filelists/val.txt", hps)
-
-for _ in tqdm(train_dataset):
-    pass
-for _ in tqdm(eval_dataset):
-    pass
-for _ in tqdm(test_dataset):
-    pass
diff --git a/src/so_vits_svc_fork/train.py b/src/so_vits_svc_fork/train.py
index fb797b59..93ea7c93 100644
--- a/src/so_vits_svc_fork/train.py
+++ b/src/so_vits_svc_fork/train.py
@@ -16,11 +16,14 @@
 from torch.utils.tensorboard import SummaryWriter
 from tqdm import trange
 
+import so_vits_svc_fork.f0
 import so_vits_svc_fork.modules.commons as commons
+import so_vits_svc_fork.utils
 
 from . import utils
 from .data_utils import TextAudioCollate, TextAudioSpeakerLoader
-from .models import MultiPeriodDiscriminator, SynthesizerTrn
+from .modules.descriminators import MultiPeriodDiscriminator
+from .modules.generator import SynthesizerTrn
 from .modules.losses import discriminator_loss, feature_loss, generator_loss, kl_loss
 from .modules.mel_processing import mel_spectrogram_torch, spec_to_mel_torch
 
@@ -272,8 +275,7 @@ def train_and_evaluate(
                 # MB-iSTFT-VITS
                 loss_subband = torch.tensor(0.0)
                 if hps.model.type_ == "mb-istft":
-                    from .vdecoder.mb_istft.loss import subband_stft_loss
-                    from .vdecoder.mb_istft.pqmf import PQMF
+                    from .modules.decoders.mb_istft import PQMF, subband_stft_loss
 
                     y_mb = PQMF(y.device, hps.model.subbands).analysis(y)
                     loss_subband = subband_stft_loss(hps, y_mb, y_hat_mb)
@@ -336,11 +338,11 @@ def train_and_evaluate(
                     "all/mel": utils.plot_spectrogram_to_numpy(
                         mel[0].data.cpu().numpy()
                     ),
-                    "all/lf0": utils.plot_data_to_numpy(
+                    "all/lf0": so_vits_svc_fork.utils.plot_data_to_numpy(
                         lf0[0, 0, :].cpu().numpy(),
                         pred_lf0[0, 0, :].detach().cpu().numpy(),
                     ),
-                    "all/norm_lf0": utils.plot_data_to_numpy(
+                    "all/norm_lf0": so_vits_svc_fork.utils.plot_data_to_numpy(
                         lf0[0, 0, :].cpu().numpy(),
                         norm_lf0[0, 0, :].detach().cpu().numpy(),
                     ),
diff --git a/src/so_vits_svc_fork/utils.py b/src/so_vits_svc_fork/utils.py
index 30e30fb6..5e9b860f 100644
--- a/src/so_vits_svc_fork/utils.py
+++ b/src/so_vits_svc_fork/utils.py
@@ -5,286 +5,22 @@
 from itertools import groupby
 from logging import getLogger
 from pathlib import Path
-from typing import Any, Literal
+from typing import Any
 
 import numpy as np
 import requests
 import torch
-import torchcrepe
 from cm_time import timer
-from numpy import dtype, float32, ndarray
+from numpy import ndarray
 from scipy.io.wavfile import read
-from torch import FloatTensor, Tensor
 from tqdm import tqdm
 
+from so_vits_svc_fork.hparams import HParams
+
 LOG = getLogger(__name__)
-MATPLOTLIB_FLAG = False
-f0_bin = 256
-f0_max = 1100.0
-f0_min = 50.0
-f0_mel_min = 1127 * np.log(1 + f0_min / 700)
-f0_mel_max = 1127 * np.log(1 + f0_max / 700)
 HUBERT_SAMPLING_RATE = 16000
 
 
-# def normalize_f0(f0, random_scale=True):
-#     f0_norm = f0.clone()  # create a copy of the input Tensor
-#     batch_size, _, frame_length = f0_norm.shape
-#     for i in range(batch_size):
-#         means = torch.mean(f0_norm[i, 0, :])
-#         if random_scale:
-#             factor = random.uniform(0.8, 1.2)
-#         else:
-#             factor = 1
-#         f0_norm[i, 0, :] = (f0_norm[i, 0, :] - means) * factor
-#     return f0_norm
-# def normalize_f0(f0, random_scale=True):
-#     means = torch.mean(f0[:, 0, :], dim=1, keepdim=True)
-#     if random_scale:
-#         factor = torch.Tensor(f0.shape[0],1).uniform_(0.8, 1.2).to(f0.device)
-#     else:
-#         factor = torch.ones(f0.shape[0], 1, 1).to(f0.device)
-#     f0_norm = (f0 - means.unsqueeze(-1)) * factor.unsqueeze(-1)
-#     return f0_norm
-def normalize_f0(
-    f0: FloatTensor, x_mask: FloatTensor, uv: FloatTensor, random_scale=True
-) -> FloatTensor:
-    # calculate means based on x_mask
-    uv_sum = torch.sum(uv, dim=1, keepdim=True)
-    uv_sum[uv_sum == 0] = 9999
-    means = torch.sum(f0[:, 0, :] * uv, dim=1, keepdim=True) / uv_sum
-
-    if random_scale:
-        factor = torch.Tensor(f0.shape[0], 1).uniform_(0.8, 1.2).to(f0.device)
-    else:
-        factor = torch.ones(f0.shape[0], 1).to(f0.device)
-    # normalize f0 based on means and factor
-    f0_norm = (f0 - means.unsqueeze(-1)) * factor.unsqueeze(-1)
-    if torch.isnan(f0_norm).any():
-        exit(0)
-    return f0_norm * x_mask
-
-
-def plot_data_to_numpy(x: ndarray, y: ndarray) -> ndarray:
-    global MATPLOTLIB_FLAG
-    if not MATPLOTLIB_FLAG:
-        import matplotlib
-
-        matplotlib.use("Agg")
-        MATPLOTLIB_FLAG = True
-    import matplotlib.pylab as plt
-    import numpy as np
-
-    fig, ax = plt.subplots(figsize=(10, 2))
-    plt.plot(x)
-    plt.plot(y)
-    plt.tight_layout()
-
-    fig.canvas.draw()
-    data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep="")
-    data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
-    plt.close()
-    return data
-
-
-def interpolate_f0(
-    f0: ndarray[Any, dtype[float32]]
-) -> tuple[ndarray[Any, dtype[float32]], ndarray[Any, dtype[float32]]]:
-    data = np.reshape(f0, (f0.size, 1))
-
-    vuv_vector = np.zeros((data.size, 1), dtype=np.float32)
-    vuv_vector[data > 0.0] = 1.0
-    vuv_vector[data <= 0.0] = 0.0
-
-    ip_data = data
-
-    frame_number = data.size
-    last_value = 0.0
-    for i in range(frame_number):
-        if data[i] <= 0.0:
-            j = i + 1
-            for j in range(i + 1, frame_number):
-                if data[j] > 0.0:
-                    break
-            if j < frame_number - 1:
-                if last_value > 0.0:
-                    step = (data[j] - data[i - 1]) / float(j - i)
-                    for k in range(i, j):
-                        ip_data[k] = data[i - 1] + step * (k - i + 1)
-                else:
-                    for k in range(i, j):
-                        ip_data[k] = data[j]
-            else:
-                for k in range(i, frame_number):
-                    ip_data[k] = last_value
-        else:
-            ip_data[i] = data[i]
-            last_value = data[i]
-
-    return ip_data[:, 0], vuv_vector[:, 0]
-
-
-def compute_f0_parselmouth(
-    wav_numpy: ndarray[Any, dtype[float32]],
-    p_len: None | int = None,
-    sampling_rate: int = 44100,
-    hop_length: int = 512,
-):
-    import parselmouth
-
-    x = wav_numpy
-    if p_len is None:
-        p_len = x.shape[0] // hop_length
-    else:
-        assert abs(p_len - x.shape[0] // hop_length) < 4, "pad length error"
-    time_step = hop_length / sampling_rate * 1000
-    f0_min = 50
-    f0_max = 1100
-    f0 = (
-        parselmouth.Sound(x, sampling_rate)
-        .to_pitch_ac(
-            time_step=time_step / 1000,
-            voicing_threshold=0.6,
-            pitch_floor=f0_min,
-            pitch_ceiling=f0_max,
-        )
-        .selected_array["frequency"]
-    )
-
-    pad_size = (p_len - len(f0) + 1) // 2
-    if pad_size > 0 or p_len - len(f0) - pad_size > 0:
-        f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant")
-    return f0
-
-
-def _resize_f0(
-    x: ndarray[Any, dtype[float32]], target_len: int
-) -> ndarray[Any, dtype[float32]]:
-    source = np.array(x)
-    source[source < 0.001] = np.nan
-    target = np.interp(
-        np.arange(0, len(source) * target_len, len(source)) / target_len,
-        np.arange(0, len(source)),
-        source,
-    )
-    res = np.nan_to_num(target)
-    return res
-
-
-def compute_f0_pyworld(
-    wav_numpy: ndarray[Any, dtype[float32]],
-    p_len: None | int = None,
-    sampling_rate: int = 44100,
-    hop_length: int = 512,
-    type_: Literal["dio", "harvest"] = "dio",
-):
-    import pyworld
-
-    if p_len is None:
-        p_len = wav_numpy.shape[0] // hop_length
-    if type_ == "dio":
-        f0, t = pyworld.dio(
-            wav_numpy.astype(np.double),
-            fs=sampling_rate,
-            f0_ceil=f0_max,
-            f0_floor=f0_min,
-            frame_period=1000 * hop_length / sampling_rate,
-        )
-    elif type_ == "harvest":
-        f0, t = pyworld.harvest(
-            wav_numpy.astype(np.double),
-            fs=sampling_rate,
-            f0_ceil=f0_max,
-            f0_floor=f0_min,
-            frame_period=1000 * hop_length / sampling_rate,
-        )
-    f0 = pyworld.stonemask(wav_numpy.astype(np.double), f0, t, sampling_rate)
-    for index, pitch in enumerate(f0):
-        f0[index] = round(pitch, 1)
-    return _resize_f0(f0, p_len)
-
-
-def compute_f0_crepe(
-    wav_numpy: ndarray[Any, dtype[float32]],
-    p_len: None | int = None,
-    sampling_rate: int = 44100,
-    hop_length: int = 512,
-    device: str = "cuda" if torch.cuda.is_available() else "cpu",
-    model: Literal["full", "tiny"] = "full",
-):
-    audio = torch.from_numpy(wav_numpy).to(device, copy=True)
-    audio = torch.unsqueeze(audio, dim=0)
-
-    if audio.ndim == 2 and audio.shape[0] > 1:
-        audio = torch.mean(audio, dim=0, keepdim=True).detach()
-    # (T) -> (1, T)
-    audio = audio.detach()
-
-    pitch: Tensor = torchcrepe.predict(
-        audio,
-        sampling_rate,
-        hop_length,
-        f0_min,
-        f0_max,
-        model,
-        batch_size=hop_length * 2,
-        device=device,
-        pad=True,
-    )
-
-    f0 = pitch.squeeze(0).cpu().numpy()
-    p_len = p_len or wav_numpy.shape[0] // hop_length
-    f0 = _resize_f0(f0, p_len)
-    return f0
-
-
-def compute_f0(
-    wav_numpy: ndarray[Any, dtype[float32]],
-    p_len: None | int = None,
-    sampling_rate: int = 44100,
-    hop_length: int = 512,
-    method: Literal["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"] = "dio",
-    **kwargs,
-):
-    with timer() as t:
-        wav_numpy = wav_numpy.astype(np.float32)
-        wav_numpy /= np.quantile(np.abs(wav_numpy), 0.999)
-        if method in ["dio", "harvest"]:
-            f0 = compute_f0_pyworld(wav_numpy, p_len, sampling_rate, hop_length, method)
-        elif method == "crepe":
-            f0 = compute_f0_crepe(wav_numpy, p_len, sampling_rate, hop_length, **kwargs)
-        elif method == "crepe-tiny":
-            f0 = compute_f0_crepe(
-                wav_numpy, p_len, sampling_rate, hop_length, model="tiny", **kwargs
-            )
-        elif method == "parselmouth":
-            f0 = compute_f0_parselmouth(wav_numpy, p_len, sampling_rate, hop_length)
-        else:
-            raise ValueError(
-                "type must be dio, crepe, crepe-tiny, harvest or parselmouth"
-            )
-    rtf = t.elapsed / (len(wav_numpy) / sampling_rate)
-    LOG.info(f"F0 inference time:       {t.elapsed:.3f}s, RTF: {rtf:.3f}")
-    return f0
-
-
-def f0_to_coarse(f0: torch.Tensor | float):
-    is_torch = isinstance(f0, torch.Tensor)
-    f0_mel = 1127 * (1 + f0 / 700).log() if is_torch else 1127 * np.log(1 + f0 / 700)
-    f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (f0_bin - 2) / (
-        f0_mel_max - f0_mel_min
-    ) + 1
-
-    f0_mel[f0_mel <= 1] = 1
-    f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1
-    f0_coarse = (f0_mel + 0.5).long() if is_torch else np.rint(f0_mel).astype(np.int)
-    assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (
-        f0_coarse.max(),
-        f0_coarse.min(),
-    )
-    return f0_coarse
-
-
 def download_file(
     url: str,
     filepath: Path | str,
@@ -595,33 +331,20 @@ def repeat_expand_2d(content: ndarray, target_len: int) -> ndarray:
     return target
 
 
-class HParams:
-    def __init__(self, **kwargs):
-        for k, v in kwargs.items():
-            if type(v) == dict:
-                v = HParams(**v)
-            self[k] = v
-
-    def keys(self):
-        return self.__dict__.keys()
-
-    def items(self):
-        return self.__dict__.items()
-
-    def values(self):
-        return self.__dict__.values()
-
-    def __len__(self):
-        return len(self.__dict__)
-
-    def __getitem__(self, key):
-        return getattr(self, key)
+def plot_data_to_numpy(x: ndarray, y: ndarray) -> ndarray:
+    import matplotlib
 
-    def __setitem__(self, key, value):
-        return setattr(self, key, value)
+    matplotlib.use("Agg")
+    import matplotlib.pylab as plt
+    import numpy as np
 
-    def __contains__(self, key):
-        return key in self.__dict__
+    fig, ax = plt.subplots(figsize=(10, 2))
+    plt.plot(x)
+    plt.plot(y)
+    plt.tight_layout()
 
-    def __repr__(self):
-        return self.__dict__.__repr__()
+    fig.canvas.draw()
+    data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep="")
+    data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
+    plt.close()
+    return data
diff --git a/src/so_vits_svc_fork/vdecoder/mb_istft/__init__.py b/src/so_vits_svc_fork/utils/__init__.py
similarity index 100%
rename from src/so_vits_svc_fork/vdecoder/mb_istft/__init__.py
rename to src/so_vits_svc_fork/utils/__init__.py
diff --git a/src/so_vits_svc_fork/utils/f0py b/src/so_vits_svc_fork/utils/f0py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/test_main.py b/tests/test_main.py
index dadc4b3f..490a9716 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -10,29 +10,33 @@
 class TestMain(TestCase):
     def test_import(self):
         import so_vits_svc_fork.cluster.train_cluster  # noqa
-        import so_vits_svc_fork.inference_main  # noqa
-        import so_vits_svc_fork.onnx_export  # noqa
-        import so_vits_svc_fork.preprocess_flist_config  # noqa
-        import so_vits_svc_fork.preprocess_hubert_f0  # noqa
-        import so_vits_svc_fork.preprocess_resample  # noqa
-        import so_vits_svc_fork.preprocess_split  # noqa
+        import so_vits_svc_fork.inference.inference_main  # noqa
+        import so_vits_svc_fork.modules.onnx.onnx_export  # noqa
+        import so_vits_svc_fork.preprocessing.preprocess_flist_config  # noqa
+        import so_vits_svc_fork.preprocessing.preprocess_hubert_f0  # noqa
+        import so_vits_svc_fork.preprocessing.preprocess_resample  # noqa
+        import so_vits_svc_fork.preprocessing.preprocess_split  # noqa
         import so_vits_svc_fork.train  # noqa
 
     def test_infer(self):
         if IS_CI:
             raise SkipTest("Skip inference test on CI")
-        from so_vits_svc_fork.inference_main import infer  # noqa
+        from so_vits_svc_fork.inference.inference_main import infer  # noqa
 
         # infer("tests/dataset_raw/34j/1.wav", "tests/configs/config.json", "tests/logs/44k")
 
     def test_preprocess(self):
-        from so_vits_svc_fork.preprocess_resample import preprocess_resample
+        from so_vits_svc_fork.preprocessing.preprocess_resample import (
+            preprocess_resample,
+        )
 
         preprocess_resample(
             "tests/dataset_raw", "tests/dataset/44k", 44100, n_jobs=1 if IS_CI else -1
         )
 
-        from so_vits_svc_fork.preprocess_flist_config import preprocess_config
+        from so_vits_svc_fork.preprocessing.preprocess_flist_config import (
+            preprocess_config,
+        )
 
         preprocess_config(
             "tests/dataset/44k",
@@ -44,7 +48,9 @@ def test_preprocess(self):
 
         if IS_CI:
             raise SkipTest("Skip hubert and f0 test on CI")
-        from so_vits_svc_fork.preprocess_hubert_f0 import preprocess_hubert_f0
+        from so_vits_svc_fork.preprocessing.preprocess_hubert_f0 import (
+            preprocess_hubert_f0,
+        )
 
         preprocess_hubert_f0("tests/dataset/44k", "tests/configs/44k/config.json")
 

From 0116fa4af3acea55b31119b404cffdac3c2e0554 Mon Sep 17 00:00:00 2001
From: 34j <55338215+34j@users.noreply.github.com>
Date: Sat, 1 Apr 2023 13:12:31 +0900
Subject: [PATCH 13/47] refactor: more refactoring

---
 .idea/workspace.xml                           | 49 ++++---------------
 .../hubert/put_hubert_ckpt_here               |  0
 .../modules/decoders/mb_istft/_generators.py  |  4 +-
 src/so_vits_svc_fork/modules/generator.py     | 10 ++--
 .../{ => modules}/hubert/__init__.py          |  0
 .../{ => modules}/hubert/hubert_model.py      |  0
 .../{ => modules}/hubert/hubert_model_onnx.py |  0
 src/so_vits_svc_fork/train.py                 | 18 +++----
 src/so_vits_svc_fork/utils.py                 | 18 ++-----
 src/so_vits_svc_fork/utils/__init__.py        |  0
 src/so_vits_svc_fork/utils/f0py               |  0
 11 files changed, 28 insertions(+), 71 deletions(-)
 delete mode 100644 src/so_vits_svc_fork/hubert/put_hubert_ckpt_here
 rename src/so_vits_svc_fork/{ => modules}/hubert/__init__.py (100%)
 rename src/so_vits_svc_fork/{ => modules}/hubert/hubert_model.py (100%)
 rename src/so_vits_svc_fork/{ => modules}/hubert/hubert_model_onnx.py (100%)
 delete mode 100644 src/so_vits_svc_fork/utils/__init__.py
 delete mode 100644 src/so_vits_svc_fork/utils/f0py

diff --git a/.idea/workspace.xml b/.idea/workspace.xml
index 60a45bf0..2642b5e0 100644
--- a/.idea/workspace.xml
+++ b/.idea/workspace.xml
@@ -2,43 +2,15 @@
 <project version="4">
   <component name="ChangeListManager">
     <list default="true" id="7aacf003-229b-4eb0-80a4-ff105dc3c3d4" name="変更" comment="">
-      <change afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/decoders/f0.py" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/decoders/mb_istft/__init__.py" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/descriminators.py" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/encoders.py" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/flows.py" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/generator.py" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/onnx/__init__.py" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/preprocessing/__init__.py" afterDir="false" />
       <change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/__main__.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/__main__.py" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/data_utils.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/data_utils.py" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/gui.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/gui.py" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/inference/infer_tool.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/inference/infer_tool.py" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/models.py" beforeDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/onnx_export.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/onnx/onnx_export.py" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/onnxexport/__init__.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/f0.py" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/onnxexport/model_onnx.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/onnx/model_onnx.py" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/preprocess_flist_config.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/preprocessing/preprocess_flist_config.py" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/preprocess_hubert_f0.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/preprocessing/preprocess_hubert_f0.py" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/preprocess_resample.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/preprocessing/preprocess_resample.py" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/preprocess_speaker_diarization.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/preprocessing/preprocess_speaker_diarization.py" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/preprocess_split.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/preprocessing/preprocess_split.py" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/preprocess_utils.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/preprocessing/preprocess_utils.py" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/spec_gen.py" beforeDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/train.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/train.py" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/utils./f0py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/utils./f0py" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/utils.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/utils.py" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/vdecoder/__init__.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/hparams.py" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/vdecoder/hifigan/__init__.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/decoders/__init__.py" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/vdecoder/hifigan/models.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/decoders/hifigan/_models.py" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/vdecoder/hifigan/utils.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/decoders/hifigan/_utils.py" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/vdecoder/mb_istft/__init__.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/decoders/hifigan/__init__.py" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/vdecoder/mb_istft/generators.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/decoders/mb_istft/_generators.py" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/vdecoder/mb_istft/loss.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/decoders/mb_istft/_loss.py" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/vdecoder/mb_istft/pqmf.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/decoders/mb_istft/_pqmf.py" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/vdecoder/mb_istft/stft.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/decoders/mb_istft/_stft.py" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/vdecoder/mb_istft/stft_loss.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/decoders/mb_istft/_stft_loss.py" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/hubert/__init__.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/hubert/__init__.py" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/hubert/hubert_model.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/hubert/hubert_model.py" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/hubert/hubert_model_onnx.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/hubert/hubert_model_onnx.py" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/hubert/put_hubert_ckpt_here" beforeDir="false" />
+      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/decoders/mb_istft/_generators.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/decoders/mb_istft/_generators.py" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/generator.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/generator.py" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/utils/__init__.py" beforeDir="false" />
+      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/utils/f0py" beforeDir="false" />
       <change beforePath="$PROJECT_DIR$/tests/dataset_raw/34j/nested/1.wav" beforeDir="false" />
       <change beforePath="$PROJECT_DIR$/tests/dataset_raw/34j/nested/10.wav" beforeDir="false" />
       <change beforePath="$PROJECT_DIR$/tests/dataset_raw/34j/nested/2.wav" beforeDir="false" />
@@ -50,7 +22,6 @@
       <change beforePath="$PROJECT_DIR$/tests/dataset_raw/34j/nested/8.wav" beforeDir="false" />
       <change beforePath="$PROJECT_DIR$/tests/dataset_raw/34j/nested/9.wav" beforeDir="false" />
       <change beforePath="$PROJECT_DIR$/tests/dataset_raw/34j/nested/あ.wav" beforeDir="false" />
-      <change beforePath="$PROJECT_DIR$/tests/test_main.py" beforeDir="false" afterPath="$PROJECT_DIR$/tests/test_main.py" afterDir="false" />
     </list>
     <option name="SHOW_DIALOG" value="false" />
     <option name="HIGHLIGHT_CONFLICTS" value="true" />
@@ -91,10 +62,10 @@
   <component name="RecentsManager">
     <key name="MoveFile.RECENT_KEYS">
       <recent name="C:\Users\m\MyMain\dev\so-vits-svc-fork\src\so_vits_svc_fork\modules" />
+      <recent name="C:\Users\m\MyMain\dev\so-vits-svc-fork\src\so_vits_svc_fork\inference" />
       <recent name="C:\Users\m\MyMain\dev\so-vits-svc-fork\src\so_vits_svc_fork\onnxexport" />
       <recent name="C:\Users\m\MyMain\dev\so-vits-svc-fork\src\so_vits_svc_fork\preprocessing" />
       <recent name="C:\Users\m\MyMain\dev\so-vits-svc-fork\src\so_vits_svc_fork\modules\decoders" />
-      <recent name="C:\Users\m\MyMain\dev\so-vits-svc-fork\src\so_vits_svc_fork\vdecoder\mb_istft\_pqmf.py" />
     </key>
   </component>
   <component name="RunManager" selected="Python.so-vits-svc-fork">
@@ -160,7 +131,7 @@
       <workItem from="1678932243084" duration="593000" />
       <workItem from="1680174456649" duration="1005000" />
       <workItem from="1680251014707" duration="2800000" />
-      <workItem from="1680319074742" duration="1776000" />
+      <workItem from="1680319074742" duration="2743000" />
     </task>
     <servers />
   </component>
diff --git a/src/so_vits_svc_fork/hubert/put_hubert_ckpt_here b/src/so_vits_svc_fork/hubert/put_hubert_ckpt_here
deleted file mode 100644
index e69de29b..00000000
diff --git a/src/so_vits_svc_fork/modules/decoders/mb_istft/_generators.py b/src/so_vits_svc_fork/modules/decoders/mb_istft/_generators.py
index f9b53880..2f28ede0 100644
--- a/src/so_vits_svc_fork/modules/decoders/mb_istft/_generators.py
+++ b/src/so_vits_svc_fork/modules/decoders/mb_istft/_generators.py
@@ -6,8 +6,8 @@
 from torch.nn import functional as F
 from torch.nn.utils import remove_weight_norm, weight_norm
 
-from ...modules import modules
-from ...modules.commons import get_padding, init_weights
+from ....modules import modules
+from ....modules.commons import get_padding, init_weights
 from ._pqmf import PQMF
 from ._stft import TorchSTFT
 
diff --git a/src/so_vits_svc_fork/modules/generator.py b/src/so_vits_svc_fork/modules/generator.py
index c90d7471..949744de 100644
--- a/src/so_vits_svc_fork/modules/generator.py
+++ b/src/so_vits_svc_fork/modules/generator.py
@@ -11,6 +11,11 @@
 from so_vits_svc_fork.modules import commons as commons
 from so_vits_svc_fork.modules.decoders.f0 import F0Decoder
 from so_vits_svc_fork.modules.decoders.hifigan import Generator
+from so_vits_svc_fork.modules.decoders.mb_istft import (
+    Multiband_iSTFT_Generator,
+    Multistream_iSTFT_Generator,
+    iSTFT_Generator,
+)
 from so_vits_svc_fork.modules.encoders import Encoder, TextEncoder
 from so_vits_svc_fork.modules.flows import ResidualCouplingBlock
 
@@ -108,11 +113,6 @@ def __init__(
                 "gen_istft_hop_size": gen_istft_hop_size,
                 "subbands": subbands,
             }
-            from .vdecoder.mb_istft.generators import (
-                Multiband_iSTFT_Generator,
-                Multistream_iSTFT_Generator,
-                iSTFT_Generator,
-            )
 
             # gen_istft_n_fft, gen_istft_hop_size, subbands
             if type_ == "istft":
diff --git a/src/so_vits_svc_fork/hubert/__init__.py b/src/so_vits_svc_fork/modules/hubert/__init__.py
similarity index 100%
rename from src/so_vits_svc_fork/hubert/__init__.py
rename to src/so_vits_svc_fork/modules/hubert/__init__.py
diff --git a/src/so_vits_svc_fork/hubert/hubert_model.py b/src/so_vits_svc_fork/modules/hubert/hubert_model.py
similarity index 100%
rename from src/so_vits_svc_fork/hubert/hubert_model.py
rename to src/so_vits_svc_fork/modules/hubert/hubert_model.py
diff --git a/src/so_vits_svc_fork/hubert/hubert_model_onnx.py b/src/so_vits_svc_fork/modules/hubert/hubert_model_onnx.py
similarity index 100%
rename from src/so_vits_svc_fork/hubert/hubert_model_onnx.py
rename to src/so_vits_svc_fork/modules/hubert/hubert_model_onnx.py
diff --git a/src/so_vits_svc_fork/train.py b/src/so_vits_svc_fork/train.py
index 93ea7c93..513e13ed 100644
--- a/src/so_vits_svc_fork/train.py
+++ b/src/so_vits_svc_fork/train.py
@@ -22,14 +22,12 @@
 
 from . import utils
 from .data_utils import TextAudioCollate, TextAudioSpeakerLoader
+from .hparams import HParams
 from .modules.descriminators import MultiPeriodDiscriminator
 from .modules.generator import SynthesizerTrn
 from .modules.losses import discriminator_loss, feature_loss, generator_loss, kl_loss
 from .modules.mel_processing import mel_spectrogram_torch, spec_to_mel_torch
 
-# os.environ['TORCH_DISTRIBUTED_DEBUG'] = 'INFO'
-
-
 LOG = getLogger(__name__)
 torch.backends.cudnn.benchmark = True
 global_step = 0
@@ -50,7 +48,7 @@ def train(config_path: Path | str, model_path: Path | str):
     os.environ["MASTER_PORT"] = hps.train.port
 
     mp.spawn(
-        run,
+        _run,
         nprocs=n_gpus,
         args=(
             n_gpus,
@@ -59,7 +57,7 @@ def train(config_path: Path | str, model_path: Path | str):
     )
 
 
-def run(rank, n_gpus, hps):
+def _run(rank: int, n_gpus: int, hps: HParams):
     global global_step
     if rank == 0:
         LOG.info(hps)
@@ -158,7 +156,7 @@ def run(rank, n_gpus, hps):
 
     for epoch in trange(epoch_str, hps.train.epochs + 1):
         if rank == 0:
-            train_and_evaluate(
+            _train_and_evaluate(
                 rank,
                 epoch,
                 hps,
@@ -170,7 +168,7 @@ def run(rank, n_gpus, hps):
                 [writer, writer_eval],
             )
         else:
-            train_and_evaluate(
+            _train_and_evaluate(
                 rank,
                 epoch,
                 hps,
@@ -185,7 +183,7 @@ def run(rank, n_gpus, hps):
         scheduler_d.step()
 
 
-def train_and_evaluate(
+def _train_and_evaluate(
     rank, epoch, hps, nets, optims, schedulers, scaler, loaders, writers
 ):
     net_g, net_d = nets
@@ -357,7 +355,7 @@ def train_and_evaluate(
 
             if global_step % hps.train.eval_interval == 0:
                 LOG.info("Saving checkpoints...")
-                evaluate(hps, net_g, eval_loader, writer_eval)
+                _evaluate(hps, net_g, eval_loader, writer_eval)
                 utils.save_checkpoint(
                     net_g,
                     optim_g,
@@ -390,7 +388,7 @@ def train_and_evaluate(
         start_time = now
 
 
-def evaluate(hps, generator, eval_loader, writer_eval):
+def _evaluate(hps, generator, eval_loader, writer_eval):
     generator.eval()
     image_dict = {}
     audio_dict = {}
diff --git a/src/so_vits_svc_fork/utils.py b/src/so_vits_svc_fork/utils.py
index 5e9b860f..8dff73d2 100644
--- a/src/so_vits_svc_fork/utils.py
+++ b/src/so_vits_svc_fork/utils.py
@@ -7,6 +7,8 @@
 from pathlib import Path
 from typing import Any
 
+import matplotlib
+import matplotlib.pylab as plt
 import numpy as np
 import requests
 import torch
@@ -254,15 +256,7 @@ def latest_checkpoint_path(dir_path: Path | str, regex: str = "G_*.pth"):
 
 
 def plot_spectrogram_to_numpy(spectrogram):
-    global MATPLOTLIB_FLAG
-    if not MATPLOTLIB_FLAG:
-        import matplotlib
-
-        matplotlib.use("Agg")
-        MATPLOTLIB_FLAG = True
-    import matplotlib.pylab as plt
-    import numpy as np
-
+    matplotlib.use("Agg")
     fig, ax = plt.subplots(figsize=(10, 2))
     im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none")
     plt.colorbar(im, ax=ax)
@@ -314,7 +308,6 @@ def get_hparams_from_file(config_path: Path | str) -> HParams:
 
 def repeat_expand_2d(content: ndarray, target_len: int) -> ndarray:
     # content : [h, t]
-
     src_len = content.shape[-1]
     target = torch.zeros([content.shape[0], target_len], dtype=torch.float).to(
         content.device
@@ -332,12 +325,7 @@ def repeat_expand_2d(content: ndarray, target_len: int) -> ndarray:
 
 
 def plot_data_to_numpy(x: ndarray, y: ndarray) -> ndarray:
-    import matplotlib
-
     matplotlib.use("Agg")
-    import matplotlib.pylab as plt
-    import numpy as np
-
     fig, ax = plt.subplots(figsize=(10, 2))
     plt.plot(x)
     plt.plot(y)
diff --git a/src/so_vits_svc_fork/utils/__init__.py b/src/so_vits_svc_fork/utils/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/src/so_vits_svc_fork/utils/f0py b/src/so_vits_svc_fork/utils/f0py
deleted file mode 100644
index e69de29b..00000000

From 68fd98252a9ba2395662ef691fe8f37f54db529a Mon Sep 17 00:00:00 2001
From: 34j <55338215+34j@users.noreply.github.com>
Date: Sat, 1 Apr 2023 13:46:25 +0900
Subject: [PATCH 14/47] refactor: refactor a lot

---
 .idea/workspace.xml                            | 17 +++++++----------
 src/so_vits_svc_fork/__main__.py               |  2 +-
 src/so_vits_svc_fork/gui.py                    |  2 +-
 src/so_vits_svc_fork/modules/generator.py      | 18 ++++++++++++++----
 src/so_vits_svc_fork/modules/onnx/__init__.py  |  3 +++
 .../onnx/{onnx_export.py => _export.py}        |  2 +-
 .../modules/onnx/{model_onnx.py => _model.py}  |  0
 .../configs_template/config_template.json      |  0
 src/so_vits_svc_fork/train.py                  | 15 ++++++---------
 tests/test_main.py                             |  2 +-
 10 files changed, 34 insertions(+), 27 deletions(-)
 rename src/so_vits_svc_fork/modules/onnx/{onnx_export.py => _export.py} (96%)
 rename src/so_vits_svc_fork/modules/onnx/{model_onnx.py => _model.py} (100%)
 rename src/so_vits_svc_fork/{ => preprocessing}/configs_template/config_template.json (100%)

diff --git a/.idea/workspace.xml b/.idea/workspace.xml
index 2642b5e0..73a840f7 100644
--- a/.idea/workspace.xml
+++ b/.idea/workspace.xml
@@ -2,15 +2,11 @@
 <project version="4">
   <component name="ChangeListManager">
     <list default="true" id="7aacf003-229b-4eb0-80a4-ff105dc3c3d4" name="変更" comment="">
-      <change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/hubert/__init__.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/hubert/__init__.py" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/hubert/hubert_model.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/hubert/hubert_model.py" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/hubert/hubert_model_onnx.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/hubert/hubert_model_onnx.py" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/hubert/put_hubert_ckpt_here" beforeDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/decoders/mb_istft/_generators.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/decoders/mb_istft/_generators.py" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/generator.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/generator.py" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/utils/__init__.py" beforeDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/utils/f0py" beforeDir="false" />
+      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/__main__.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/__main__.py" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/gui.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/gui.py" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/onnx/__init__.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/onnx/__init__.py" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/onnx/model_onnx.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/onnx/_model.py" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/onnx/onnx_export.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/so_vits_svc_fork/modules/onnx/_export.py" afterDir="false" />
       <change beforePath="$PROJECT_DIR$/tests/dataset_raw/34j/nested/1.wav" beforeDir="false" />
       <change beforePath="$PROJECT_DIR$/tests/dataset_raw/34j/nested/10.wav" beforeDir="false" />
       <change beforePath="$PROJECT_DIR$/tests/dataset_raw/34j/nested/2.wav" beforeDir="false" />
@@ -22,6 +18,7 @@
       <change beforePath="$PROJECT_DIR$/tests/dataset_raw/34j/nested/8.wav" beforeDir="false" />
       <change beforePath="$PROJECT_DIR$/tests/dataset_raw/34j/nested/9.wav" beforeDir="false" />
       <change beforePath="$PROJECT_DIR$/tests/dataset_raw/34j/nested/あ.wav" beforeDir="false" />
+      <change beforePath="$PROJECT_DIR$/tests/test_main.py" beforeDir="false" afterPath="$PROJECT_DIR$/tests/test_main.py" afterDir="false" />
     </list>
     <option name="SHOW_DIALOG" value="false" />
     <option name="HIGHLIGHT_CONFLICTS" value="true" />
@@ -131,7 +128,7 @@
       <workItem from="1678932243084" duration="593000" />
       <workItem from="1680174456649" duration="1005000" />
       <workItem from="1680251014707" duration="2800000" />
-      <workItem from="1680319074742" duration="2743000" />
+      <workItem from="1680319074742" duration="3292000" />
     </task>
     <servers />
   </component>
diff --git a/src/so_vits_svc_fork/__main__.py b/src/so_vits_svc_fork/__main__.py
index e9addd92..1eb098b4 100644
--- a/src/so_vits_svc_fork/__main__.py
+++ b/src/so_vits_svc_fork/__main__.py
@@ -744,7 +744,7 @@ def onnx(input_path: Path, output_path: Path, config_path: Path, device: str) ->
         output_path = output_path / (input_path.stem + ".onnx")
     config_path = Path(config_path)
     device_ = torch.device(device)
-    from so_vits_svc_fork.modules.onnx.onnx_export import onnx_export
+    from so_vits_svc_fork.modules.onnx._export import onnx_export
 
     onnx_export(
         input_path=input_path,
diff --git a/src/so_vits_svc_fork/gui.py b/src/so_vits_svc_fork/gui.py
index ddebc02e..9f419b02 100644
--- a/src/so_vits_svc_fork/gui.py
+++ b/src/so_vits_svc_fork/gui.py
@@ -650,7 +650,7 @@ def apply_preset(name: str) -> None:
                     future.cancel()
                     future = None
             elif event == "onnx_export":
-                from so_vits_svc_fork.modules.onnx.onnx_export import onnx_export
+                from so_vits_svc_fork.modules.onnx._export import onnx_export
 
                 try:
                     onnx_export(
diff --git a/src/so_vits_svc_fork/modules/generator.py b/src/so_vits_svc_fork/modules/generator.py
index 949744de..97f5396f 100644
--- a/src/so_vits_svc_fork/modules/generator.py
+++ b/src/so_vits_svc_fork/modules/generator.py
@@ -1,3 +1,4 @@
+import warnings
 from logging import getLogger
 from typing import Any, Literal, Sequence
 
@@ -5,8 +6,6 @@
 from torch import nn
 
 import so_vits_svc_fork.f0
-
-LOG = getLogger(__name__)
 from so_vits_svc_fork.f0 import f0_to_coarse
 from so_vits_svc_fork.modules import commons as commons
 from so_vits_svc_fork.modules.decoders.f0 import F0Decoder
@@ -19,6 +18,8 @@
 from so_vits_svc_fork.modules.encoders import Encoder, TextEncoder
 from so_vits_svc_fork.modules.flows import ResidualCouplingBlock
 
+LOG = getLogger(__name__)
+
 
 class SynthesizerTrn(nn.Module):
     """
@@ -70,6 +71,15 @@ def __init__(
         self.segment_size = segment_size
         self.gin_channels = gin_channels
         self.ssl_dim = ssl_dim
+        self.n_speakers = n_speakers
+        self.sampling_rate = sampling_rate
+        self.type_ = type_
+        self.gen_istft_n_fft = gen_istft_n_fft
+        self.gen_istft_hop_size = gen_istft_hop_size
+        self.subbands = subbands
+        if kwargs:
+            warnings.warn(f"Unused arguments: {kwargs}")
+
         self.emb_g = nn.Embedding(n_speakers, gin_channels)
 
         self.pre = nn.Conv1d(ssl_dim, hidden_channels, kernel_size=5, padding=2)
@@ -174,9 +184,9 @@ def forward(self, c, f0, uv, spec, g=None, c_lengths=None, spec_lengths=None):
         )
 
         # MB-iSTFT-VITS
-        if self.dec:
+        if self.mb:
             o, o_mb = self.dec(z_slice, g=g)
-        # nsf decoder
+        # HiFi-GAN
         else:
             o = self.dec(z_slice, g=g, f0=pitch_slice)
             o_mb = None
diff --git a/src/so_vits_svc_fork/modules/onnx/__init__.py b/src/so_vits_svc_fork/modules/onnx/__init__.py
index e69de29b..22c85b04 100644
--- a/src/so_vits_svc_fork/modules/onnx/__init__.py
+++ b/src/so_vits_svc_fork/modules/onnx/__init__.py
@@ -0,0 +1,3 @@
+from ._export import onnx_export
+
+__all__ = ["onnx_export"]
diff --git a/src/so_vits_svc_fork/modules/onnx/onnx_export.py b/src/so_vits_svc_fork/modules/onnx/_export.py
similarity index 96%
rename from src/so_vits_svc_fork/modules/onnx/onnx_export.py
rename to src/so_vits_svc_fork/modules/onnx/_export.py
index c6c9bb78..184ed408 100644
--- a/src/so_vits_svc_fork/modules/onnx/onnx_export.py
+++ b/src/so_vits_svc_fork/modules/onnx/_export.py
@@ -5,7 +5,7 @@
 import torch
 
 from so_vits_svc_fork import utils
-from so_vits_svc_fork.modules.onnx.model_onnx import SynthesizerTrn
+from so_vits_svc_fork.modules.onnx._model import SynthesizerTrn
 
 
 def onnx_export(
diff --git a/src/so_vits_svc_fork/modules/onnx/model_onnx.py b/src/so_vits_svc_fork/modules/onnx/_model.py
similarity index 100%
rename from src/so_vits_svc_fork/modules/onnx/model_onnx.py
rename to src/so_vits_svc_fork/modules/onnx/_model.py
diff --git a/src/so_vits_svc_fork/configs_template/config_template.json b/src/so_vits_svc_fork/preprocessing/configs_template/config_template.json
similarity index 100%
rename from src/so_vits_svc_fork/configs_template/config_template.json
rename to src/so_vits_svc_fork/preprocessing/configs_template/config_template.json
diff --git a/src/so_vits_svc_fork/train.py b/src/so_vits_svc_fork/train.py
index 513e13ed..902c11c6 100644
--- a/src/so_vits_svc_fork/train.py
+++ b/src/so_vits_svc_fork/train.py
@@ -50,14 +50,11 @@ def train(config_path: Path | str, model_path: Path | str):
     mp.spawn(
         _run,
         nprocs=n_gpus,
-        args=(
-            n_gpus,
-            hps,
-        ),
+        args=(n_gpus, hps, True),
     )
 
 
-def _run(rank: int, n_gpus: int, hps: HParams):
+def _run(rank: int, n_gpus: int, hps: HParams, reset_optimizer: bool = False):
     global global_step
     if rank == 0:
         LOG.info(hps)
@@ -118,7 +115,7 @@ def _run(rank: int, n_gpus: int, hps: HParams):
     net_g = DDP(net_g, device_ids=[rank])  # , find_unused_parameters=True)
     net_d = DDP(net_d, device_ids=[rank])
 
-    skip_optimizer = False
+    skip_optimizer = reset_optimizer
     try:
         _, _, _, epoch_str = utils.load_checkpoint(
             utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"),
@@ -272,7 +269,7 @@ def _train_and_evaluate(
 
                 # MB-iSTFT-VITS
                 loss_subband = torch.tensor(0.0)
-                if hps.model.type_ == "mb-istft":
+                if hps.model.__dict__.get("type_") == "mb-istft":
                     from .modules.decoders.mb_istft import PQMF, subband_stft_loss
 
                     y_mb = PQMF(y.device, hps.model.subbands).analysis(y)
@@ -296,7 +293,7 @@ def _train_and_evaluate(
                     "melspectrogram": loss_mel.item(),
                     "kl_divergence": loss_kl.item(),
                 }
-                if hps.model.type_ == "mb-istft":
+                if hps.model.__dict__.get("type_") == "mb-istft":
                     losses["subband_stft"] = loss_subband.item()
                 LOG.info(
                     "Train Epoch: {} [{:.0f}%]".format(
@@ -320,7 +317,7 @@ def _train_and_evaluate(
                         "loss/g/lf0": loss_lf0,
                     }
                 )
-                if hps.model.type_ == "mb-istft":
+                if hps.model.__dict__.get("type_") == "mb-istft":
                     scalar_dict["loss/g/subband"] = loss_subband
 
                 # scalar_dict.update({"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)})
diff --git a/tests/test_main.py b/tests/test_main.py
index 490a9716..3015589c 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -11,7 +11,7 @@ class TestMain(TestCase):
     def test_import(self):
         import so_vits_svc_fork.cluster.train_cluster  # noqa
         import so_vits_svc_fork.inference.inference_main  # noqa
-        import so_vits_svc_fork.modules.onnx.onnx_export  # noqa
+        import so_vits_svc_fork.modules.onnx._export  # noqa
         import so_vits_svc_fork.preprocessing.preprocess_flist_config  # noqa
         import so_vits_svc_fork.preprocessing.preprocess_hubert_f0  # noqa
         import so_vits_svc_fork.preprocessing.preprocess_resample  # noqa

From ef9378daecd61658f37f5e58b0f7d8653040b428 Mon Sep 17 00:00:00 2001
From: 34j <55338215+34j@users.noreply.github.com>
Date: Sat, 1 Apr 2023 19:07:36 +0900
Subject: [PATCH 15/47] fix(train): fix reset_optimizer

---
 src/so_vits_svc_fork/__main__.py | 19 +++++++++++++++++--
 src/so_vits_svc_fork/train.py    |  6 ++++--
 2 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/src/so_vits_svc_fork/__main__.py b/src/so_vits_svc_fork/__main__.py
index 1eb098b4..af145551 100644
--- a/src/so_vits_svc_fork/__main__.py
+++ b/src/so_vits_svc_fork/__main__.py
@@ -93,7 +93,20 @@ def cli():
     type=bool,
     help="launch tensorboard",
 )
-def train(config_path: Path, model_path: Path, tensorboard: bool = False):
+@click.option(
+    "-r",
+    "--reset-optimizer",
+    default=False,
+    type=bool,
+    help="reset optimizer",
+    is_flag=True,
+)
+def train(
+    config_path: Path,
+    model_path: Path,
+    tensorboard: bool = False,
+    reset_optimizer: bool = False,
+):
     """Train model
     If D_0.pth or G_0.pth not found, automatically download from hub."""
     from .train import train
@@ -112,7 +125,9 @@ def train(config_path: Path, model_path: Path, tensorboard: bool = False):
         url = tb.launch()
         webbrowser.open(url)
 
-    train(config_path=config_path, model_path=model_path)
+    train(
+        config_path=config_path, model_path=model_path, reset_optimizer=reset_optimizer
+    )
 
 
 @cli.command()
diff --git a/src/so_vits_svc_fork/train.py b/src/so_vits_svc_fork/train.py
index 902c11c6..49be8e61 100644
--- a/src/so_vits_svc_fork/train.py
+++ b/src/so_vits_svc_fork/train.py
@@ -34,7 +34,9 @@
 start_time = time.time()
 
 
-def train(config_path: Path | str, model_path: Path | str):
+def train(
+    config_path: Path | str, model_path: Path | str, reset_optimizer: bool = False
+):
     """Assume Single Node Multi GPUs Training Only"""
     config_path = Path(config_path)
     model_path = Path(model_path)
@@ -50,7 +52,7 @@ def train(config_path: Path | str, model_path: Path | str):
     mp.spawn(
         _run,
         nprocs=n_gpus,
-        args=(n_gpus, hps, True),
+        args=(n_gpus, hps, reset_optimizer),
     )
 
 

From ef2afb64da9574a9d37b6f5d15c9dc662430eede Mon Sep 17 00:00:00 2001
From: 34j <55338215+34j@users.noreply.github.com>
Date: Sat, 1 Apr 2023 19:24:05 +0900
Subject: [PATCH 16/47] fix(train): fix pbar starting and ending, log more

---
 src/so_vits_svc_fork/train.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/so_vits_svc_fork/train.py b/src/so_vits_svc_fork/train.py
index 49be8e61..0466c6a0 100644
--- a/src/so_vits_svc_fork/train.py
+++ b/src/so_vits_svc_fork/train.py
@@ -151,9 +151,14 @@ def _run(rank: int, n_gpus: int, hps: HParams, reset_optimizer: bool = False):
 
     scaler = GradScaler(enabled=hps.train.fp16_run)
 
-    LOG.info("Start training")
+    LOG.info(
+        "Start training..."
+        "Note: You do not need to wait until the progress bar is full."
+    )
 
-    for epoch in trange(epoch_str, hps.train.epochs + 1):
+    for epoch in trange(
+        epoch_str, hps.train.epochs + 1, initial=epoch_str, total=hps.train.epochs
+    ):
         if rank == 0:
             _train_and_evaluate(
                 rank,

From 8940eeb6b394f04fb0d5cc52c0780ff8ce20fffa Mon Sep 17 00:00:00 2001
From: 34j <55338215+34j@users.noreply.github.com>
Date: Sat, 1 Apr 2023 23:32:57 +0900
Subject: [PATCH 17/47] fix(ms-istft): fix conv1d input

---
 src/so_vits_svc_fork/modules/decoders/mb_istft/_generators.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/so_vits_svc_fork/modules/decoders/mb_istft/_generators.py b/src/so_vits_svc_fork/modules/decoders/mb_istft/_generators.py
index 2f28ede0..77f4b88a 100644
--- a/src/so_vits_svc_fork/modules/decoders/mb_istft/_generators.py
+++ b/src/so_vits_svc_fork/modules/decoders/mb_istft/_generators.py
@@ -297,7 +297,9 @@ def __init__(
             updown_filter[k, k, 0] = 1.0
         self.register_buffer("updown_filter", updown_filter)
         self.multistream_conv_post = weight_norm(
-            Conv1d(4, 1, kernel_size=63, bias=False, padding=get_padding(63, 1))
+            Conv1d(
+                self.subbands, 1, kernel_size=63, bias=False, padding=get_padding(63, 1)
+            )
         )
         self.multistream_conv_post.apply(init_weights)
 

From eb0b6926f49e2e8fbc731f7dff615f097961b49a Mon Sep 17 00:00:00 2001
From: 34j <55338215+34j@users.noreply.github.com>
Date: Sat, 1 Apr 2023 23:34:13 +0900
Subject: [PATCH 18/47] chore(config_template): fix default

---
 .../preprocessing/configs_template/config_template.json     | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/so_vits_svc_fork/preprocessing/configs_template/config_template.json b/src/so_vits_svc_fork/preprocessing/configs_template/config_template.json
index 1116c0a8..c3e2712b 100644
--- a/src/so_vits_svc_fork/preprocessing/configs_template/config_template.json
+++ b/src/so_vits_svc_fork/preprocessing/configs_template/config_template.json
@@ -1,13 +1,13 @@
 {
   "train": {
-    "log_interval": 200,
-    "eval_interval": 800,
+    "log_interval": 50,
+    "eval_interval": 100,
     "seed": 1234,
     "epochs": 10000,
     "learning_rate": 0.0001,
     "betas": [0.8, 0.99],
     "eps": 1e-9,
-    "batch_size": 2,
+    "batch_size": 18,
     "fp16_run": false,
     "lr_decay": 0.999875,
     "segment_size": 10240,

From 9298ec237d492b04192d2938be3b9bb05f045a36 Mon Sep 17 00:00:00 2001
From: 34j <55338215+34j@users.noreply.github.com>
Date: Sun, 2 Apr 2023 13:48:26 +0900
Subject: [PATCH 19/47] fix: remove hubert and onnx models

---
 .../modules/hubert/__init__.py                |   0
 .../modules/hubert/hubert_model.py            | 223 ----------
 .../modules/hubert/hubert_model_onnx.py       | 219 ----------
 src/so_vits_svc_fork/modules/onnx/__init__.py |   3 -
 src/so_vits_svc_fork/modules/onnx/_export.py  |  65 ---
 src/so_vits_svc_fork/modules/onnx/_model.py   | 406 ------------------
 6 files changed, 916 deletions(-)
 delete mode 100644 src/so_vits_svc_fork/modules/hubert/__init__.py
 delete mode 100644 src/so_vits_svc_fork/modules/hubert/hubert_model.py
 delete mode 100644 src/so_vits_svc_fork/modules/hubert/hubert_model_onnx.py
 delete mode 100644 src/so_vits_svc_fork/modules/onnx/__init__.py
 delete mode 100644 src/so_vits_svc_fork/modules/onnx/_export.py
 delete mode 100644 src/so_vits_svc_fork/modules/onnx/_model.py

diff --git a/src/so_vits_svc_fork/modules/hubert/__init__.py b/src/so_vits_svc_fork/modules/hubert/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/src/so_vits_svc_fork/modules/hubert/hubert_model.py b/src/so_vits_svc_fork/modules/hubert/hubert_model.py
deleted file mode 100644
index 98c40d56..00000000
--- a/src/so_vits_svc_fork/modules/hubert/hubert_model.py
+++ /dev/null
@@ -1,223 +0,0 @@
-import copy
-import random
-from typing import Optional, Tuple
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as t_func
-from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present
-
-
-class Hubert(nn.Module):
-    def __init__(self, num_label_embeddings: int = 100, mask: bool = True):
-        super().__init__()
-        self._mask = mask
-        self.feature_extractor = FeatureExtractor()
-        self.feature_projection = FeatureProjection()
-        self.positional_embedding = PositionalConvEmbedding()
-        self.norm = nn.LayerNorm(768)
-        self.dropout = nn.Dropout(0.1)
-        self.encoder = TransformerEncoder(
-            nn.TransformerEncoderLayer(
-                768, 12, 3072, activation="gelu", batch_first=True
-            ),
-            12,
-        )
-        self.proj = nn.Linear(768, 256)
-
-        self.masked_spec_embed = nn.Parameter(torch.FloatTensor(768).uniform_())
-        self.label_embedding = nn.Embedding(num_label_embeddings, 256)
-
-    def mask(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
-        mask = None
-        if self.training and self._mask:
-            mask = _compute_mask((x.size(0), x.size(1)), 0.8, 10, x.device, 2)
-            x[mask] = self.masked_spec_embed.to(x.dtype)
-        return x, mask
-
-    def encode(
-        self, x: torch.Tensor, layer: Optional[int] = None
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        x = self.feature_extractor(x)
-        x = self.feature_projection(x.transpose(1, 2))
-        x, mask = self.mask(x)
-        x = x + self.positional_embedding(x)
-        x = self.dropout(self.norm(x))
-        x = self.encoder(x, output_layer=layer)
-        return x, mask
-
-    def logits(self, x: torch.Tensor) -> torch.Tensor:
-        logits = torch.cosine_similarity(
-            x.unsqueeze(2),
-            self.label_embedding.weight.unsqueeze(0).unsqueeze(0),
-            dim=-1,
-        )
-        return logits / 0.1
-
-    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
-        x, mask = self.encode(x)
-        x = self.proj(x)
-        logits = self.logits(x)
-        return logits, mask
-
-
-class HubertSoft(Hubert):
-    def __init__(self):
-        super().__init__()
-
-    @torch.inference_mode()
-    def units(self, wav: torch.Tensor) -> torch.Tensor:
-        wav = t_func.pad(wav, ((400 - 320) // 2, (400 - 320) // 2))
-        x, _ = self.encode(wav)
-        return self.proj(x)
-
-
-class FeatureExtractor(nn.Module):
-    def __init__(self):
-        super().__init__()
-        self.conv0 = nn.Conv1d(1, 512, 10, 5, bias=False)
-        self.norm0 = nn.GroupNorm(512, 512)
-        self.conv1 = nn.Conv1d(512, 512, 3, 2, bias=False)
-        self.conv2 = nn.Conv1d(512, 512, 3, 2, bias=False)
-        self.conv3 = nn.Conv1d(512, 512, 3, 2, bias=False)
-        self.conv4 = nn.Conv1d(512, 512, 3, 2, bias=False)
-        self.conv5 = nn.Conv1d(512, 512, 2, 2, bias=False)
-        self.conv6 = nn.Conv1d(512, 512, 2, 2, bias=False)
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        x = t_func.gelu(self.norm0(self.conv0(x)))
-        x = t_func.gelu(self.conv1(x))
-        x = t_func.gelu(self.conv2(x))
-        x = t_func.gelu(self.conv3(x))
-        x = t_func.gelu(self.conv4(x))
-        x = t_func.gelu(self.conv5(x))
-        x = t_func.gelu(self.conv6(x))
-        return x
-
-
-class FeatureProjection(nn.Module):
-    def __init__(self):
-        super().__init__()
-        self.norm = nn.LayerNorm(512)
-        self.projection = nn.Linear(512, 768)
-        self.dropout = nn.Dropout(0.1)
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        x = self.norm(x)
-        x = self.projection(x)
-        x = self.dropout(x)
-        return x
-
-
-class PositionalConvEmbedding(nn.Module):
-    def __init__(self):
-        super().__init__()
-        self.conv = nn.Conv1d(
-            768,
-            768,
-            kernel_size=128,
-            padding=128 // 2,
-            groups=16,
-        )
-        self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2)
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        x = self.conv(x.transpose(1, 2))
-        x = t_func.gelu(x[:, :, :-1])
-        return x.transpose(1, 2)
-
-
-class TransformerEncoder(nn.Module):
-    def __init__(
-        self, encoder_layer: nn.TransformerEncoderLayer, num_layers: int
-    ) -> None:
-        super().__init__()
-        self.layers = nn.ModuleList(
-            [copy.deepcopy(encoder_layer) for _ in range(num_layers)]
-        )
-        self.num_layers = num_layers
-
-    def forward(
-        self,
-        src: torch.Tensor,
-        mask: torch.Tensor = None,
-        src_key_padding_mask: torch.Tensor = None,
-        output_layer: Optional[int] = None,
-    ) -> torch.Tensor:
-        output = src
-        for layer in self.layers[:output_layer]:
-            output = layer(
-                output, src_mask=mask, src_key_padding_mask=src_key_padding_mask
-            )
-        return output
-
-
-def _compute_mask(
-    shape: Tuple[int, int],
-    mask_prob: float,
-    mask_length: int,
-    device: torch.device,
-    min_masks: int = 0,
-) -> torch.Tensor:
-    batch_size, sequence_length = shape
-
-    if mask_length < 1:
-        raise ValueError("`mask_length` has to be bigger than 0.")
-
-    if mask_length > sequence_length:
-        raise ValueError(
-            "`mask_length` has to be smaller than `sequence_length`, "
-            f"but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`"
-        )
-
-    # compute number of masked spans in batch
-    num_masked_spans = int(mask_prob * sequence_length / mask_length + random.random())
-    num_masked_spans = max(num_masked_spans, min_masks)
-
-    # make sure num masked indices <= sequence_length
-    if num_masked_spans * mask_length > sequence_length:
-        num_masked_spans = sequence_length // mask_length
-
-    # SpecAugment mask to fill
-    mask = torch.zeros((batch_size, sequence_length), device=device, dtype=torch.bool)
-
-    # uniform distribution to sample from, make sure that offset samples are < sequence_length
-    uniform_dist = torch.ones(
-        (batch_size, sequence_length - (mask_length - 1)), device=device
-    )
-
-    # get random indices to mask
-    mask_indices = torch.multinomial(uniform_dist, num_masked_spans)
-
-    # expand masked indices to masked spans
-    mask_indices = (
-        mask_indices.unsqueeze(dim=-1)
-        .expand((batch_size, num_masked_spans, mask_length))
-        .reshape(batch_size, num_masked_spans * mask_length)
-    )
-    offsets = (
-        torch.arange(mask_length, device=device)[None, None, :]
-        .expand((batch_size, num_masked_spans, mask_length))
-        .reshape(batch_size, num_masked_spans * mask_length)
-    )
-    mask_idxs = mask_indices + offsets
-
-    # scatter indices to mask
-    mask = mask.scatter(1, mask_idxs, True)
-
-    return mask
-
-
-def hubert_soft(
-    path: str,
-) -> HubertSoft:
-    r"""HuBERT-Soft from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`.
-    Args:
-        path (str): path of a pretrained model
-    """
-    hubert = HubertSoft()
-    checkpoint = torch.load(path)
-    consume_prefix_in_state_dict_if_present(checkpoint, "module.")
-    hubert.load_state_dict(checkpoint)
-    hubert.eval()
-    return hubert
diff --git a/src/so_vits_svc_fork/modules/hubert/hubert_model_onnx.py b/src/so_vits_svc_fork/modules/hubert/hubert_model_onnx.py
deleted file mode 100644
index 548f599e..00000000
--- a/src/so_vits_svc_fork/modules/hubert/hubert_model_onnx.py
+++ /dev/null
@@ -1,219 +0,0 @@
-import copy
-import random
-from typing import Optional, Tuple
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as t_func
-from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present
-
-
-class Hubert(nn.Module):
-    def __init__(self, num_label_embeddings: int = 100, mask: bool = True):
-        super().__init__()
-        self._mask = mask
-        self.feature_extractor = FeatureExtractor()
-        self.feature_projection = FeatureProjection()
-        self.positional_embedding = PositionalConvEmbedding()
-        self.norm = nn.LayerNorm(768)
-        self.dropout = nn.Dropout(0.1)
-        self.encoder = TransformerEncoder(
-            nn.TransformerEncoderLayer(
-                768, 12, 3072, activation="gelu", batch_first=True
-            ),
-            12,
-        )
-        self.proj = nn.Linear(768, 256)
-
-        self.masked_spec_embed = nn.Parameter(torch.FloatTensor(768).uniform_())
-        self.label_embedding = nn.Embedding(num_label_embeddings, 256)
-
-    def mask(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
-        mask = None
-        if self.training and self._mask:
-            mask = _compute_mask((x.size(0), x.size(1)), 0.8, 10, x.device, 2)
-            x[mask] = self.masked_spec_embed.to(x.dtype)
-        return x, mask
-
-    def encode(
-        self, x: torch.Tensor, layer: Optional[int] = None
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        x = self.feature_extractor(x)
-        x = self.feature_projection(x.transpose(1, 2))
-        x, mask = self.mask(x)
-        x = x + self.positional_embedding(x)
-        x = self.dropout(self.norm(x))
-        x = self.encoder(x, output_layer=layer)
-        return x, mask
-
-    def logits(self, x: torch.Tensor) -> torch.Tensor:
-        logits = torch.cosine_similarity(
-            x.unsqueeze(2),
-            self.label_embedding.weight.unsqueeze(0).unsqueeze(0),
-            dim=-1,
-        )
-        return logits / 0.1
-
-
-class HubertSoft(Hubert):
-    def __init__(self):
-        super().__init__()
-
-    def units(self, wav: torch.Tensor) -> torch.Tensor:
-        wav = t_func.pad(wav, ((400 - 320) // 2, (400 - 320) // 2))
-        x, _ = self.encode(wav)
-        return self.proj(x)
-
-    def forward(self, x):
-        return self.units(x)
-
-
-class FeatureExtractor(nn.Module):
-    def __init__(self):
-        super().__init__()
-        self.conv0 = nn.Conv1d(1, 512, 10, 5, bias=False)
-        self.norm0 = nn.GroupNorm(512, 512)
-        self.conv1 = nn.Conv1d(512, 512, 3, 2, bias=False)
-        self.conv2 = nn.Conv1d(512, 512, 3, 2, bias=False)
-        self.conv3 = nn.Conv1d(512, 512, 3, 2, bias=False)
-        self.conv4 = nn.Conv1d(512, 512, 3, 2, bias=False)
-        self.conv5 = nn.Conv1d(512, 512, 2, 2, bias=False)
-        self.conv6 = nn.Conv1d(512, 512, 2, 2, bias=False)
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        x = t_func.gelu(self.norm0(self.conv0(x)))
-        x = t_func.gelu(self.conv1(x))
-        x = t_func.gelu(self.conv2(x))
-        x = t_func.gelu(self.conv3(x))
-        x = t_func.gelu(self.conv4(x))
-        x = t_func.gelu(self.conv5(x))
-        x = t_func.gelu(self.conv6(x))
-        return x
-
-
-class FeatureProjection(nn.Module):
-    def __init__(self):
-        super().__init__()
-        self.norm = nn.LayerNorm(512)
-        self.projection = nn.Linear(512, 768)
-        self.dropout = nn.Dropout(0.1)
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        x = self.norm(x)
-        x = self.projection(x)
-        x = self.dropout(x)
-        return x
-
-
-class PositionalConvEmbedding(nn.Module):
-    def __init__(self):
-        super().__init__()
-        self.conv = nn.Conv1d(
-            768,
-            768,
-            kernel_size=128,
-            padding=128 // 2,
-            groups=16,
-        )
-        self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2)
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        x = self.conv(x.transpose(1, 2))
-        x = t_func.gelu(x[:, :, :-1])
-        return x.transpose(1, 2)
-
-
-class TransformerEncoder(nn.Module):
-    def __init__(
-        self, encoder_layer: nn.TransformerEncoderLayer, num_layers: int
-    ) -> None:
-        super().__init__()
-        self.layers = nn.ModuleList(
-            [copy.deepcopy(encoder_layer) for _ in range(num_layers)]
-        )
-        self.num_layers = num_layers
-
-    def forward(
-        self,
-        src: torch.Tensor,
-        mask: torch.Tensor = None,
-        src_key_padding_mask: torch.Tensor = None,
-        output_layer: Optional[int] = None,
-    ) -> torch.Tensor:
-        output = src
-        for layer in self.layers[:output_layer]:
-            output = layer(
-                output, src_mask=mask, src_key_padding_mask=src_key_padding_mask
-            )
-        return output
-
-
-def _compute_mask(
-    shape: Tuple[int, int],
-    mask_prob: float,
-    mask_length: int,
-    device: torch.device,
-    min_masks: int = 0,
-) -> torch.Tensor:
-    batch_size, sequence_length = shape
-
-    if mask_length < 1:
-        raise ValueError("`mask_length` has to be bigger than 0.")
-
-    if mask_length > sequence_length:
-        raise ValueError(
-            "`mask_length` has to be smaller than `sequence_length`, "
-            f"but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`"
-        )
-
-    # compute number of masked spans in batch
-    num_masked_spans = int(mask_prob * sequence_length / mask_length + random.random())
-    num_masked_spans = max(num_masked_spans, min_masks)
-
-    # make sure num masked indices <= sequence_length
-    if num_masked_spans * mask_length > sequence_length:
-        num_masked_spans = sequence_length // mask_length
-
-    # SpecAugment mask to fill
-    mask = torch.zeros((batch_size, sequence_length), device=device, dtype=torch.bool)
-
-    # uniform distribution to sample from, make sure that offset samples are < sequence_length
-    uniform_dist = torch.ones(
-        (batch_size, sequence_length - (mask_length - 1)), device=device
-    )
-
-    # get random indices to mask
-    mask_indices = torch.multinomial(uniform_dist, num_masked_spans)
-
-    # expand masked indices to masked spans
-    mask_indices = (
-        mask_indices.unsqueeze(dim=-1)
-        .expand((batch_size, num_masked_spans, mask_length))
-        .reshape(batch_size, num_masked_spans * mask_length)
-    )
-    offsets = (
-        torch.arange(mask_length, device=device)[None, None, :]
-        .expand((batch_size, num_masked_spans, mask_length))
-        .reshape(batch_size, num_masked_spans * mask_length)
-    )
-    mask_idxs = mask_indices + offsets
-
-    # scatter indices to mask
-    mask = mask.scatter(1, mask_idxs, True)
-
-    return mask
-
-
-def hubert_soft(
-    path: str,
-) -> HubertSoft:
-    r"""HuBERT-Soft from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`.
-    Args:
-        path (str): path of a pretrained model
-    """
-    hubert = HubertSoft()
-    checkpoint = torch.load(path)
-    consume_prefix_in_state_dict_if_present(checkpoint, "module.")
-    hubert.load_state_dict(checkpoint)
-    hubert.eval()
-    return hubert
diff --git a/src/so_vits_svc_fork/modules/onnx/__init__.py b/src/so_vits_svc_fork/modules/onnx/__init__.py
deleted file mode 100644
index 22c85b04..00000000
--- a/src/so_vits_svc_fork/modules/onnx/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from ._export import onnx_export
-
-__all__ = ["onnx_export"]
diff --git a/src/so_vits_svc_fork/modules/onnx/_export.py b/src/so_vits_svc_fork/modules/onnx/_export.py
deleted file mode 100644
index 184ed408..00000000
--- a/src/so_vits_svc_fork/modules/onnx/_export.py
+++ /dev/null
@@ -1,65 +0,0 @@
-from __future__ import annotations
-
-from pathlib import Path
-
-import torch
-
-from so_vits_svc_fork import utils
-from so_vits_svc_fork.modules.onnx._model import SynthesizerTrn
-
-
-def onnx_export(
-    input_path: Path | str,
-    output_path: Path | str,
-    config_path: Path | str,
-    device: str | torch.device = "cpu",
-):
-    input_path = Path(input_path)
-    output_path = Path(output_path)
-    config_path = Path(config_path)
-    hps = utils.get_hparams_from_file(config_path.as_posix())
-    SVCVITS = SynthesizerTrn(
-        hps.data.filter_length // 2 + 1,
-        hps.train.segment_size // hps.data.hop_length,
-        **hps.model,
-    )
-    _ = utils.load_checkpoint(input_path.as_posix(), SVCVITS, None)
-    _ = SVCVITS.eval().to(device)
-    for i in SVCVITS.parameters():
-        i.requires_grad = False
-
-    test_hidden_unit = torch.rand(1, 10, 256)
-    test_pitch = torch.rand(1, 10)
-    test_mel2ph = torch.LongTensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]).unsqueeze(0)
-    test_uv = torch.ones(1, 10, dtype=torch.float32)
-    test_noise = torch.randn(1, 192, 10)
-    test_sid = torch.LongTensor([0])
-    input_names = ["c", "f0", "mel2ph", "uv", "noise", "sid"]
-    output_names = [
-        "audio",
-    ]
-
-    torch.onnx.export(
-        SVCVITS,
-        (
-            test_hidden_unit.to(device),
-            test_pitch.to(device),
-            test_mel2ph.to(device),
-            test_uv.to(device),
-            test_noise.to(device),
-            test_sid.to(device),
-        ),
-        output_path.as_posix(),
-        dynamic_axes={
-            "c": [0, 1],
-            "f0": [1],
-            "mel2ph": [1],
-            "uv": [1],
-            "noise": [2],
-        },
-        do_constant_folding=False,
-        opset_version=16,
-        verbose=False,
-        input_names=input_names,
-        output_names=output_names,
-    )
diff --git a/src/so_vits_svc_fork/modules/onnx/_model.py b/src/so_vits_svc_fork/modules/onnx/_model.py
deleted file mode 100644
index c232df89..00000000
--- a/src/so_vits_svc_fork/modules/onnx/_model.py
+++ /dev/null
@@ -1,406 +0,0 @@
-import torch
-from torch import nn
-from torch.nn import Conv1d, Conv2d
-from torch.nn import functional as F
-from torch.nn.utils import spectral_norm, weight_norm
-
-import so_vits_svc_fork.f0
-from so_vits_svc_fork.f0 import f0_to_coarse
-from so_vits_svc_fork.modules import attentions, commons, modules
-from so_vits_svc_fork.modules.commons import get_padding
-from so_vits_svc_fork.modules.decoders.hifigan import Generator
-
-
-class ResidualCouplingBlock(nn.Module):
-    def __init__(
-        self,
-        channels,
-        hidden_channels,
-        kernel_size,
-        dilation_rate,
-        n_layers,
-        n_flows=4,
-        gin_channels=0,
-    ):
-        super().__init__()
-        self.channels = channels
-        self.hidden_channels = hidden_channels
-        self.kernel_size = kernel_size
-        self.dilation_rate = dilation_rate
-        self.n_layers = n_layers
-        self.n_flows = n_flows
-        self.gin_channels = gin_channels
-
-        self.flows = nn.ModuleList()
-        for i in range(n_flows):
-            self.flows.append(
-                modules.ResidualCouplingLayer(
-                    channels,
-                    hidden_channels,
-                    kernel_size,
-                    dilation_rate,
-                    n_layers,
-                    gin_channels=gin_channels,
-                    mean_only=True,
-                )
-            )
-            self.flows.append(modules.Flip())
-
-    def forward(self, x, x_mask, g=None, reverse=False):
-        if not reverse:
-            for flow in self.flows:
-                x, _ = flow(x, x_mask, g=g, reverse=reverse)
-        else:
-            for flow in reversed(self.flows):
-                x = flow(x, x_mask, g=g, reverse=reverse)
-        return x
-
-
-class Encoder(nn.Module):
-    def __init__(
-        self,
-        in_channels,
-        out_channels,
-        hidden_channels,
-        kernel_size,
-        dilation_rate,
-        n_layers,
-        gin_channels=0,
-    ):
-        super().__init__()
-        self.in_channels = in_channels
-        self.out_channels = out_channels
-        self.hidden_channels = hidden_channels
-        self.kernel_size = kernel_size
-        self.dilation_rate = dilation_rate
-        self.n_layers = n_layers
-        self.gin_channels = gin_channels
-
-        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
-        self.enc = modules.WN(
-            hidden_channels,
-            kernel_size,
-            dilation_rate,
-            n_layers,
-            gin_channels=gin_channels,
-        )
-        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
-
-    def forward(self, x, x_lengths, g=None):
-        # print(x.shape,x_lengths.shape)
-        x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
-            x.dtype
-        )
-        x = self.pre(x) * x_mask
-        x = self.enc(x, x_mask, g=g)
-        stats = self.proj(x) * x_mask
-        m, logs = torch.split(stats, self.out_channels, dim=1)
-        z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
-        return z, m, logs, x_mask
-
-
-class TextEncoder(nn.Module):
-    def __init__(
-        self,
-        out_channels,
-        hidden_channels,
-        kernel_size,
-        n_layers,
-        gin_channels=0,
-        filter_channels=None,
-        n_heads=None,
-        p_dropout=None,
-    ):
-        super().__init__()
-        self.out_channels = out_channels
-        self.hidden_channels = hidden_channels
-        self.kernel_size = kernel_size
-        self.n_layers = n_layers
-        self.gin_channels = gin_channels
-        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
-        self.f0_emb = nn.Embedding(256, hidden_channels)
-
-        self.enc_ = attentions.Encoder(
-            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
-        )
-
-    def forward(self, x, x_mask, f0=None, z=None):
-        x = x + self.f0_emb(f0).transpose(1, 2)
-        x = self.enc_(x * x_mask, x_mask)
-        stats = self.proj(x) * x_mask
-        m, logs = torch.split(stats, self.out_channels, dim=1)
-        z = (m + z * torch.exp(logs)) * x_mask
-        return z, m, logs, x_mask
-
-
-class DiscriminatorP(torch.nn.Module):
-    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
-        super().__init__()
-        self.period = period
-        self.use_spectral_norm = use_spectral_norm
-        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
-        self.convs = nn.ModuleList(
-            [
-                norm_f(
-                    Conv2d(
-                        1,
-                        32,
-                        (kernel_size, 1),
-                        (stride, 1),
-                        padding=(get_padding(kernel_size, 1), 0),
-                    )
-                ),
-                norm_f(
-                    Conv2d(
-                        32,
-                        128,
-                        (kernel_size, 1),
-                        (stride, 1),
-                        padding=(get_padding(kernel_size, 1), 0),
-                    )
-                ),
-                norm_f(
-                    Conv2d(
-                        128,
-                        512,
-                        (kernel_size, 1),
-                        (stride, 1),
-                        padding=(get_padding(kernel_size, 1), 0),
-                    )
-                ),
-                norm_f(
-                    Conv2d(
-                        512,
-                        1024,
-                        (kernel_size, 1),
-                        (stride, 1),
-                        padding=(get_padding(kernel_size, 1), 0),
-                    )
-                ),
-                norm_f(
-                    Conv2d(
-                        1024,
-                        1024,
-                        (kernel_size, 1),
-                        1,
-                        padding=(get_padding(kernel_size, 1), 0),
-                    )
-                ),
-            ]
-        )
-        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
-
-    def forward(self, x):
-        fmap = []
-
-        # 1d to 2d
-        b, c, t = x.shape
-        if t % self.period != 0:  # pad first
-            n_pad = self.period - (t % self.period)
-            x = F.pad(x, (0, n_pad), "reflect")
-            t = t + n_pad
-        x = x.view(b, c, t // self.period, self.period)
-
-        for l in self.convs:
-            x = l(x)
-            x = F.leaky_relu(x, modules.LRELU_SLOPE)
-            fmap.append(x)
-        x = self.conv_post(x)
-        fmap.append(x)
-        x = torch.flatten(x, 1, -1)
-
-        return x, fmap
-
-
-class DiscriminatorS(torch.nn.Module):
-    def __init__(self, use_spectral_norm=False):
-        super().__init__()
-        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
-        self.convs = nn.ModuleList(
-            [
-                norm_f(Conv1d(1, 16, 15, 1, padding=7)),
-                norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
-                norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
-                norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
-                norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
-                norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
-            ]
-        )
-        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
-
-    def forward(self, x):
-        fmap = []
-
-        for l in self.convs:
-            x = l(x)
-            x = F.leaky_relu(x, modules.LRELU_SLOPE)
-            fmap.append(x)
-        x = self.conv_post(x)
-        fmap.append(x)
-        x = torch.flatten(x, 1, -1)
-
-        return x, fmap
-
-
-class F0Decoder(nn.Module):
-    def __init__(
-        self,
-        out_channels,
-        hidden_channels,
-        filter_channels,
-        n_heads,
-        n_layers,
-        kernel_size,
-        p_dropout,
-        spk_channels=0,
-    ):
-        super().__init__()
-        self.out_channels = out_channels
-        self.hidden_channels = hidden_channels
-        self.filter_channels = filter_channels
-        self.n_heads = n_heads
-        self.n_layers = n_layers
-        self.kernel_size = kernel_size
-        self.p_dropout = p_dropout
-        self.spk_channels = spk_channels
-
-        self.prenet = nn.Conv1d(hidden_channels, hidden_channels, 3, padding=1)
-        self.decoder = attentions.FFT(
-            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
-        )
-        self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
-        self.f0_prenet = nn.Conv1d(1, hidden_channels, 3, padding=1)
-        self.cond = nn.Conv1d(spk_channels, hidden_channels, 1)
-
-    def forward(self, x, norm_f0, x_mask, spk_emb=None):
-        x = torch.detach(x)
-        if spk_emb is not None:
-            x = x + self.cond(spk_emb)
-        x += self.f0_prenet(norm_f0)
-        x = self.prenet(x) * x_mask
-        x = self.decoder(x * x_mask, x_mask)
-        x = self.proj(x) * x_mask
-        return x
-
-
-class SynthesizerTrn(nn.Module):
-    """
-    Synthesizer for Training
-    """
-
-    def __init__(
-        self,
-        spec_channels,
-        segment_size,
-        inter_channels,
-        hidden_channels,
-        filter_channels,
-        n_heads,
-        n_layers,
-        kernel_size,
-        p_dropout,
-        resblock,
-        resblock_kernel_sizes,
-        resblock_dilation_sizes,
-        upsample_rates,
-        upsample_initial_channel,
-        upsample_kernel_sizes,
-        gin_channels,
-        ssl_dim,
-        n_speakers,
-        sampling_rate=44100,
-        **kwargs
-    ):
-        super().__init__()
-        self.spec_channels = spec_channels
-        self.inter_channels = inter_channels
-        self.hidden_channels = hidden_channels
-        self.filter_channels = filter_channels
-        self.n_heads = n_heads
-        self.n_layers = n_layers
-        self.kernel_size = kernel_size
-        self.p_dropout = p_dropout
-        self.resblock = resblock
-        self.resblock_kernel_sizes = resblock_kernel_sizes
-        self.resblock_dilation_sizes = resblock_dilation_sizes
-        self.upsample_rates = upsample_rates
-        self.upsample_initial_channel = upsample_initial_channel
-        self.upsample_kernel_sizes = upsample_kernel_sizes
-        self.segment_size = segment_size
-        self.gin_channels = gin_channels
-        self.ssl_dim = ssl_dim
-        self.emb_g = nn.Embedding(n_speakers, gin_channels)
-
-        self.pre = nn.Conv1d(ssl_dim, hidden_channels, kernel_size=5, padding=2)
-
-        self.enc_p = TextEncoder(
-            inter_channels,
-            hidden_channels,
-            filter_channels=filter_channels,
-            n_heads=n_heads,
-            n_layers=n_layers,
-            kernel_size=kernel_size,
-            p_dropout=p_dropout,
-        )
-        hps = {
-            "sampling_rate": sampling_rate,
-            "inter_channels": inter_channels,
-            "resblock": resblock,
-            "resblock_kernel_sizes": resblock_kernel_sizes,
-            "resblock_dilation_sizes": resblock_dilation_sizes,
-            "upsample_rates": upsample_rates,
-            "upsample_initial_channel": upsample_initial_channel,
-            "upsample_kernel_sizes": upsample_kernel_sizes,
-            "gin_channels": gin_channels,
-        }
-        self.dec = Generator(h=hps)
-        self.enc_q = Encoder(
-            spec_channels,
-            inter_channels,
-            hidden_channels,
-            5,
-            1,
-            16,
-            gin_channels=gin_channels,
-        )
-        self.flow = ResidualCouplingBlock(
-            inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels
-        )
-        self.f0_decoder = F0Decoder(
-            1,
-            hidden_channels,
-            filter_channels,
-            n_heads,
-            n_layers,
-            kernel_size,
-            p_dropout,
-            spk_channels=gin_channels,
-        )
-        self.emb_uv = nn.Embedding(2, hidden_channels)
-        self.predict_f0 = False
-
-    def forward(self, c, f0, mel2ph, uv, noise=None, g=None):
-        decoder_inp = F.pad(c, [0, 0, 1, 0])
-        mel2ph_ = mel2ph.unsqueeze(2).repeat([1, 1, c.shape[-1]])
-        c = torch.gather(decoder_inp, 1, mel2ph_).transpose(1, 2)  # [B, T, H]
-
-        c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device)
-        g = g.unsqueeze(0)
-        g = self.emb_g(g).transpose(1, 2)
-        x_mask = torch.unsqueeze(commons.sequence_mask(c_lengths, c.size(2)), 1).to(
-            c.dtype
-        )
-        x = self.pre(c) * x_mask + self.emb_uv(uv.long()).transpose(1, 2)
-
-        if self.predict_f0:
-            lf0 = 2595.0 * torch.log10(1.0 + f0.unsqueeze(1) / 700.0) / 500
-            norm_lf0 = so_vits_svc_fork.f0.normalize_f0(
-                lf0, x_mask, uv, random_scale=False
-            )
-            pred_lf0 = self.f0_decoder(x, norm_lf0, x_mask, spk_emb=g)
-            f0 = (700 * (torch.pow(10, pred_lf0 * 500 / 2595) - 1)).squeeze(1)
-
-        z_p, m_p, logs_p, c_mask = self.enc_p(x, x_mask, f0=f0_to_coarse(f0), z=noise)
-        z = self.flow(z_p, c_mask, g=g, reverse=True)
-        o = self.dec(z * c_mask, g=g, f0=f0)
-        return o

From d9231f0870a498b75f4fe6382eadfb3e994ce79e Mon Sep 17 00:00:00 2001
From: 34j <55338215+34j@users.noreply.github.com>
Date: Sun, 2 Apr 2023 14:00:09 +0900
Subject: [PATCH 20/47] fix(utils): update typing

---
 src/so_vits_svc_fork/utils.py | 102 +++++++++++++++++-----------------
 1 file changed, 50 insertions(+), 52 deletions(-)

diff --git a/src/so_vits_svc_fork/utils.py b/src/so_vits_svc_fork/utils.py
index 8dff73d2..28259eb8 100644
--- a/src/so_vits_svc_fork/utils.py
+++ b/src/so_vits_svc_fork/utils.py
@@ -13,6 +13,8 @@
 import requests
 import torch
 from cm_time import timer
+from fairseq import checkpoint_utils
+from fairseq.models.hubert.hubert import HubertModel
 from numpy import ndarray
 from scipy.io.wavfile import read
 from tqdm import tqdm
@@ -28,7 +30,7 @@ def download_file(
     filepath: Path | str,
     chunk_size: int = 4 * 1024,
     tqdm_cls: type = tqdm,
-    **kwargs,
+    **tqdm_kwargs: Any,
 ):
     filepath = Path(filepath)
     filepath.parent.mkdir(parents=True, exist_ok=True)
@@ -43,7 +45,7 @@ def download_file(
         unit="iB",
         unit_scale=True,
         unit_divisor=1024,
-        **kwargs,
+        **tqdm_kwargs,
     ) as pbar:
         for data in resp.iter_content(chunk_size=chunk_size):
             size = f.write(data)
@@ -51,7 +53,7 @@ def download_file(
     temppath.rename(filepath)
 
 
-def ensure_pretrained_model(folder_path: Path, **kwargs) -> None:
+def ensure_pretrained_model(folder_path: Path, **tqdm_kwargs: Any) -> None:
     model_urls = [
         # "https://huggingface.co/innnky/sovits_pretrained/resolve/main/sovits4/G_0.pth",
         "https://huggingface.co/therealvul/so-vits-svc-4.0-init/resolve/main/D_0.pth",
@@ -62,24 +64,26 @@ def ensure_pretrained_model(folder_path: Path, **kwargs) -> None:
         model_path = folder_path / model_url.split("/")[-1]
         if not model_path.exists():
             download_file(
-                model_url, model_path, desc=f"Downloading {model_path.name}", **kwargs
+                model_url,
+                model_path,
+                desc=f"Downloading {model_path.name}",
+                **tqdm_kwargs,
             )
 
 
-def ensure_hubert_model(**kwargs) -> Path:
+def ensure_hubert_model(**tqdm_kwargs: Any) -> Path:
     vec_path = Path("checkpoint_best_legacy_500.pt")
     vec_path.parent.mkdir(parents=True, exist_ok=True)
     if not vec_path.exists():
         # url = "http://obs.cstcloud.cn/share/obs/sankagenkeshi/checkpoint_best_legacy_500.pt"
         # url = "https://huggingface.co/innnky/contentvec/resolve/main/checkpoint_best_legacy_500.pt"
         url = "https://huggingface.co/therealvul/so-vits-svc-4.0-init/resolve/main/checkpoint_best_legacy_500.pt"
-        download_file(url, vec_path, desc="Downloading Hubert model", **kwargs)
+        download_file(url, vec_path, desc="Downloading Hubert model", **tqdm_kwargs)
     return vec_path
 
 
-def get_hubert_model():
+def get_hubert_model() -> HubertModel:
     vec_path = ensure_hubert_model()
-    from fairseq import checkpoint_utils
 
     models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
         [vec_path.as_posix()],
@@ -90,43 +94,32 @@ def get_hubert_model():
     return model
 
 
-def get_hubert_content(hmodel, wav_16k_tensor):
-    with timer() as t:
-        feats = wav_16k_tensor
-        if feats.dim() == 2:  # double channels
-            feats = feats.mean(-1)
-        assert feats.dim() == 1, feats.dim()
-        feats = feats.view(1, -1)
-        padding_mask = torch.BoolTensor(feats.shape).fill_(False)
-        inputs = {
-            "source": feats.to(wav_16k_tensor.device),
-            "padding_mask": padding_mask.to(wav_16k_tensor.device),
-            "output_layer": 9,  # layer 9
-        }
-        with torch.no_grad():
-            logits = hmodel.extract_features(**inputs)
-            feats = hmodel.final_proj(logits[0])
-        res = feats.transpose(1, 2)
-    wav_len = wav_16k_tensor.shape[-1] / 16000
+def get_content(
+    cmodel: HubertModel, audio: torch.Tensor, wrong_legacy_proj: bool = False
+) -> ndarray:
+    with torch.no_grad(), timer() as t:
+        c = cmodel.extract_features(
+            audio.squeeze(1),
+            padding_mask=torch.BoolTensor(audio.shape).fill_(False),
+            output_layer=9,
+        )
+        if wrong_legacy_proj:
+            assert hasattr(cmodel, "final_proj")
+            c = cmodel.final_proj(c[0])
+    c = c.transpose(1, 2)
+    wav_len = audio.shape[-1] / 16000
     LOG.info(
         f"HuBERT inference time  : {t.elapsed:.3f}s, RTF: {t.elapsed / wav_len:.3f}"
     )
-    return res
-
-
-def get_content(cmodel: Any, y: ndarray) -> ndarray:
-    with torch.no_grad():
-        c = cmodel.extract_features(y.squeeze(1))[0]
-    c = c.transpose(1, 2)
     return c
 
 
 def load_checkpoint(
-    checkpoint_path: Any,
-    model: Any,
-    optimizer: Any = None,
+    checkpoint_path: Path | str,
+    model: torch.nn.Module,
+    optimizer: torch.optim.Optimizer | None = None,
     skip_optimizer: bool = False,
-):
+) -> tuple[torch.nn.Module, torch.optim.Optimizer | None, float, int]:
     if not Path(checkpoint_path).is_file():
         raise FileNotFoundError(f"File {checkpoint_path} not found")
     checkpoint_dict = torch.load(checkpoint_path, map_location="cpu")
@@ -146,8 +139,6 @@ def load_checkpoint(
     new_state_dict = {}
     for k, v in state_dict.items():
         try:
-            # assert "dec" in k or "disc" in k
-            # print("load", k)
             new_state_dict[k] = saved_state_dict[k]
             assert saved_state_dict[k].shape == v.shape, (
                 saved_state_dict[k].shape,
@@ -166,7 +157,11 @@ def load_checkpoint(
 
 
 def save_checkpoint(
-    model, optimizer, learning_rate, iteration, checkpoint_path
+    model: torch.nn.Module,
+    optimizer: torch.optim.Optimizer,
+    learning_rate: float,
+    iteration: int,
+    checkpoint_path: Path | str,
 ) -> None:
     LOG.info(
         "Saving model and optimizer state at iteration {} to {}".format(
@@ -190,7 +185,7 @@ def save_checkpoint(
 
 def clean_checkpoints(
     path_to_models: Path | str, n_ckpts_to_keep: int = 2, sort_by_time: bool = True
-):
+) -> None:
     """Freeing up space by deleting saved ckpts
 
     Arguments:
@@ -230,15 +225,18 @@ def clean_checkpoints(
             to_delete.unlink()
 
 
+from torch.utils.tensorboard.writer import SummaryWriter
+
+
 def summarize(
-    writer,
-    global_step,
-    scalars={},
-    histograms={},
-    images={},
-    audios={},
-    audio_sampling_rate=22050,
-):
+    writer: SummaryWriter,
+    global_step: int,
+    scalars: dict[str, float] = {},
+    histograms: dict[str, ndarray] = {},
+    images: dict[str, ndarray] = {},
+    audios: dict[str, ndarray] = {},
+    audio_sampling_rate: int = 22050,
+) -> None:
     for k, v in scalars.items():
         writer.add_scalar(k, v, global_step)
     for k, v in histograms.items():
@@ -249,13 +247,13 @@ def summarize(
         writer.add_audio(k, v, global_step, audio_sampling_rate)
 
 
-def latest_checkpoint_path(dir_path: Path | str, regex: str = "G_*.pth"):
+def latest_checkpoint_path(dir_path: Path | str, regex: str = "G_*.pth") -> Path:
     dir_path = Path(dir_path)
     name_key = lambda p: int(re.match(r"._(\d+)\.pth", p.name).group(1))
     return list(sorted(dir_path.glob(regex), key=name_key))[-1]
 
 
-def plot_spectrogram_to_numpy(spectrogram):
+def plot_spectrogram_to_numpy(spectrogram: ndarray) -> ndarray:
     matplotlib.use("Agg")
     fig, ax = plt.subplots(figsize=(10, 2))
     im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none")
@@ -271,7 +269,7 @@ def plot_spectrogram_to_numpy(spectrogram):
     return data
 
 
-def load_wav_to_torch(full_path: Path | str):
+def load_wav_to_torch(full_path: Path | str) -> tuple[torch.Tensor, int]:
     sampling_rate, data = read(full_path)
     return torch.FloatTensor(data.astype(np.float32)), sampling_rate
 

From 8db979c38dad36ea7193a37a73798a798188b153 Mon Sep 17 00:00:00 2001
From: 34j <55338215+34j@users.noreply.github.com>
Date: Sun, 2 Apr 2023 15:00:28 +0900
Subject: [PATCH 21/47] refactor(utils): refactor a lot

---
 src/so_vits_svc_fork/gui.py                   |  2 +-
 src/so_vits_svc_fork/inference/infer_tool.py  | 70 +++++++++----------
 .../configs_template/config_template.json     |  7 +-
 .../preprocessing/preprocess_hubert_f0.py     | 40 ++++++-----
 src/so_vits_svc_fork/train.py                 |  2 +-
 src/so_vits_svc_fork/utils.py                 | 61 ++++++++++------
 6 files changed, 102 insertions(+), 80 deletions(-)

diff --git a/src/so_vits_svc_fork/gui.py b/src/so_vits_svc_fork/gui.py
index 9f419b02..64d3a4bf 100644
--- a/src/so_vits_svc_fork/gui.py
+++ b/src/so_vits_svc_fork/gui.py
@@ -453,7 +453,7 @@ def update_speaker() -> None:
 
         config_path = Path(values["config_path"])
         if config_path.exists() and config_path.is_file():
-            hp = utils.get_hparams_from_file(values["config_path"])
+            hp = utils.get_hparams(values["config_path"])
             LOG.debug(f"Loaded config from {values['config_path']}")
             window["speaker"].update(
                 values=list(hp.__dict__["spk"].keys()), set_to_index=0
diff --git a/src/so_vits_svc_fork/inference/infer_tool.py b/src/so_vits_svc_fork/inference/infer_tool.py
index b65ece60..3b4debb7 100644
--- a/src/so_vits_svc_fork/inference/infer_tool.py
+++ b/src/so_vits_svc_fork/inference/infer_tool.py
@@ -16,7 +16,6 @@
 from so_vits_svc_fork import cluster, utils
 
 from ..modules.generator import SynthesizerTrn
-from ..utils import HUBERT_SAMPLING_RATE
 
 LOG = getLogger(__name__)
 
@@ -91,39 +90,38 @@ class Svc:
     def __init__(
         self,
         *,
-        net_g_path: str,
-        config_path: str,
+        net_g_path: Path | str,
+        config_path: Path | str,
         device: torch.device | str | None = None,
         cluster_model_path: Path | str | None = None,
         half: bool = False,
     ):
         self.net_g_path = net_g_path
         if device is None:
-            self.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         else:
-            self.dev = torch.device(device)
-        self.net_g_ms = None
-        self.hps_ms = utils.get_hparams_from_file(config_path)
-        self.target_sample = self.hps_ms.data.sampling_rate
-        self.hop_size = self.hps_ms.data.hop_length
-        self.spk2id = self.hps_ms.spk
-        self.hubert_model = utils.get_hubert_model().to(self.dev)
-        self.half = half
+            self.device = torch.device(device)
+        self.hps = utils.get_hparams(config_path)
+        self.target_sample = self.hps.data.sampling_rate
+        self.hop_size = self.hps.data.hop_length
+        self.spk2id = self.hps.spk
+        self.hubert_model = utils.get_hubert_model(self.device)
+        self.dtype = torch.float16 if half else torch.float32
+        self.contentvec_final_proj = self.hps.data.__dict__.get(
+            "contentvec_final_proj", True
+        )
         self.load_model()
         if cluster_model_path is not None and Path(cluster_model_path).exists():
             self.cluster_model = cluster.get_cluster_model(cluster_model_path)
 
     def load_model(self):
-        self.net_g_ms = SynthesizerTrn(
-            self.hps_ms.data.filter_length // 2 + 1,
-            self.hps_ms.train.segment_size // self.hps_ms.data.hop_length,
-            **self.hps_ms.model,
+        self.net_g = SynthesizerTrn(
+            self.hps.data.filter_length // 2 + 1,
+            self.hps.train.segment_size // self.hps.data.hop_length,
+            **self.hps.model,
         )
-        _ = utils.load_checkpoint(self.net_g_path, self.net_g_ms, None)
-        if self.half:
-            _ = self.net_g_ms.half().eval().to(self.dev)
-        else:
-            _ = self.net_g_ms.eval().to(self.dev)
+        _ = utils.load_checkpoint(self.net_g_path, self.net_g, None)
+        _ = self.net_g.eval().to(self.device, dtype=self.dtype)
 
     def get_unit_f0(
         self,
@@ -142,24 +140,26 @@ def get_unit_f0(
             method=f0_method,
         )
         f0, uv = so_vits_svc_fork.f0.interpolate_f0(f0)
-        f0 = torch.FloatTensor(f0)
-        uv = torch.FloatTensor(uv)
+        f0 = torch.as_tensor(f0, dtype=self.dtype, device=self.device)
+        uv = torch.as_tensor(uv, dtype=self.dtype, device=self.device)
         f0 = f0 * 2 ** (tran / 12)
-        f0 = f0.unsqueeze(0).to(self.dev)
-        uv = uv.unsqueeze(0).to(self.dev)
+        f0 = f0.unsqueeze(0)
+        uv = uv.unsqueeze(0)
 
-        wav16k = librosa.resample(
-            audio, orig_sr=self.target_sample, target_sr=HUBERT_SAMPLING_RATE
-        )
-        wav16k = torch.from_numpy(wav16k).to(self.dev)
-        c = utils.get_hubert_content(self.hubert_model, wav_16k_tensor=wav16k)
+        c = utils.get_content(
+            self.hubert_model,
+            audio,
+            self.device,
+            self.target_sample,
+            self.contentvec_final_proj,
+        ).to(self.dtype)
         c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[1])
 
         if cluster_infer_ratio != 0:
             cluster_c = cluster.get_cluster_center_result(
                 self.cluster_model, c.cpu().numpy().T, speaker
             ).T
-            cluster_c = torch.FloatTensor(cluster_c).to(self.dev)
+            cluster_c = torch.FloatTensor(cluster_c).to(self.device)
             c = cluster_infer_ratio * cluster_c + (1 - cluster_infer_ratio) * c
 
         c = c.unsqueeze(0)
@@ -202,21 +202,17 @@ def infer(
         elif len(speaker_candidates) == 0:
             raise ValueError(f"Speaker_id {speaker_id} is not found.")
         speaker = speaker_candidates[0][0]
-        sid = torch.LongTensor([int(speaker_id)]).to(self.dev).unsqueeze(0)
+        sid = torch.LongTensor([int(speaker_id)]).to(self.device).unsqueeze(0)
 
         # get unit f0
         c, f0, uv = self.get_unit_f0(
             audio, transpose, cluster_infer_ratio, speaker, f0_method
         )
-        if self.half:
-            c = c.half()
-            f0 = f0.half()
-            uv = uv.half()
 
         # inference
         with torch.no_grad():
             with timer() as t:
-                audio = self.net_g_ms.infer(
+                audio = self.net_g.infer(
                     c,
                     f0=f0,
                     g=sid,
diff --git a/src/so_vits_svc_fork/preprocessing/configs_template/config_template.json b/src/so_vits_svc_fork/preprocessing/configs_template/config_template.json
index c3e2712b..4bad2141 100644
--- a/src/so_vits_svc_fork/preprocessing/configs_template/config_template.json
+++ b/src/so_vits_svc_fork/preprocessing/configs_template/config_template.json
@@ -34,7 +34,8 @@
     "win_length": 2048,
     "n_mel_channels": 80,
     "mel_fmin": 0.0,
-    "mel_fmax": 22050
+    "mel_fmax": 22050,
+    "contentvec_final_proj": false
   },
   "model": {
     "inter_channels": 192,
@@ -51,7 +52,7 @@
       [1, 3, 5],
       [1, 3, 5]
     ],
-    "upsample_rates": [4, 4],
+    "upsample_rates": [4, 8],
     "upsample_initial_channel": 512,
     "upsample_kernel_sizes": [16, 16],
     "n_layers_q": 3,
@@ -62,7 +63,7 @@
     "type_": "mb-istft",
     "gen_istft_n_fft": 16,
     "gen_istft_hop_size": 4,
-    "subbands": 8
+    "subbands": 4
   },
   "spk": {
     "34j": 0,
diff --git a/src/so_vits_svc_fork/preprocessing/preprocess_hubert_f0.py b/src/so_vits_svc_fork/preprocessing/preprocess_hubert_f0.py
index 026f9236..37ad21be 100644
--- a/src/so_vits_svc_fork/preprocessing/preprocess_hubert_f0.py
+++ b/src/so_vits_svc_fork/preprocessing/preprocess_hubert_f0.py
@@ -13,7 +13,6 @@
 
 import so_vits_svc_fork.f0
 from so_vits_svc_fork import utils
-from so_vits_svc_fork.utils import HUBERT_SAMPLING_RATE
 
 from .preprocess_utils import check_hubert_min_duration
 
@@ -21,6 +20,7 @@
 
 
 def _process_one(
+    *,
     filepath: Path,
     hubert_model,
     sampling_rate: int,
@@ -28,6 +28,7 @@ def _process_one(
     device: Literal["cuda", "cpu"] = "cuda",
     f0_method: Literal["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"] = "dio",
     force_rebuild: bool = False,
+    legacy_final_proj: bool = False,
 ):
     audio, sr = librosa.load(filepath, sr=sampling_rate)
 
@@ -38,11 +39,9 @@ def _process_one(
     # Compute HuBERT content
     soft_path = filepath.parent / (filepath.name + ".soft.pt")
     if (not soft_path.exists()) or force_rebuild:
-        wav16k = librosa.resample(
-            audio, orig_sr=sampling_rate, target_sr=HUBERT_SAMPLING_RATE
+        c = utils.get_content(
+            hubert_model, audio, device, sr=sr, legacy_final_proj=legacy_final_proj
         )
-        wav16k = torch.from_numpy(wav16k).to(device)
-        c = utils.get_hubert_content(hubert_model, wav_16k_tensor=wav16k)
         torch.save(c.cpu(), soft_path)
     else:
         LOG.info(f"Skip {filepath} because {soft_path} exists.")
@@ -60,25 +59,28 @@ def _process_one(
 
 
 def _process_batch(
+    *,
     filepaths: Iterable[Path],
     sampling_rate: int,
     hop_length: int,
     pbar_position: int,
     f0_method: Literal["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"] = "dio",
     force_rebuild: bool = False,
+    legacy_final_proj: bool = False,
 ):
     device = "cuda" if torch.cuda.is_available() else "cpu"
-    hubert_model = utils.get_hubert_model().to(device)
+    hubert_model = utils.get_hubert_model(device)
 
     for filepath in tqdm(filepaths, position=pbar_position):
         _process_one(
-            filepath,
-            hubert_model,
-            sampling_rate,
-            hop_length,
-            device,
-            f0_method,
-            force_rebuild,
+            filepath=filepath,
+            hubert_model=hubert_model,
+            sampling_rate=sampling_rate,
+            hop_length=hop_length,
+            device=device,
+            f0_method=f0_method,
+            force_rebuild=force_rebuild,
+            legacy_final_proj=legacy_final_proj,
         )
 
 
@@ -92,9 +94,7 @@ def preprocess_hubert_f0(
     input_dir = Path(input_dir)
     config_path = Path(config_path)
     utils.ensure_hubert_model()
-    hps = utils.get_hparams_from_file(config_path)
-    sampling_rate = hps.data.sampling_rate
-    hop_length = hps.data.hop_length
+    hps = utils.get_hparams(config_path)
 
     filepaths = list(input_dir.rglob("*.wav"))
     n_jobs = min(cpu_count(), len(filepaths) // 32 + 1, n_jobs)
@@ -102,7 +102,13 @@ def preprocess_hubert_f0(
     filepath_chunks = np.array_split(filepaths, n_jobs)
     Parallel(n_jobs=n_jobs)(
         delayed(_process_batch)(
-            chunk, sampling_rate, hop_length, pbar_position, f0_method, force_rebuild
+            filepaths=chunk,
+            sampling_rate=hps.data.sampling_rate,
+            hop_length=hps.data.hop_length,
+            pbar_position=pbar_position,
+            f0_method=f0_method,
+            force_rebuild=force_rebuild,
+            legacy_final_proj=hps.data.__dict__.get("contentvec_final_proj", True),
         )
         for (pbar_position, chunk) in enumerate(filepath_chunks)
     )
diff --git a/src/so_vits_svc_fork/train.py b/src/so_vits_svc_fork/train.py
index 0466c6a0..691f4115 100644
--- a/src/so_vits_svc_fork/train.py
+++ b/src/so_vits_svc_fork/train.py
@@ -43,7 +43,7 @@ def train(
     if not torch.cuda.is_available():
         raise RuntimeError("CUDA is not available.")
     # utils.ensure_pretrained_model(model_path)
-    hps = utils.get_hparams(config_path, model_path)
+    hps = utils.get_backup_hparams(config_path, model_path)
 
     n_gpus = torch.cuda.device_count()
     os.environ["MASTER_ADDR"] = "localhost"
diff --git a/src/so_vits_svc_fork/utils.py b/src/so_vits_svc_fork/utils.py
index 28259eb8..2a7a64bb 100644
--- a/src/so_vits_svc_fork/utils.py
+++ b/src/so_vits_svc_fork/utils.py
@@ -82,7 +82,7 @@ def ensure_hubert_model(**tqdm_kwargs: Any) -> Path:
     return vec_path
 
 
-def get_hubert_model() -> HubertModel:
+def get_hubert_model(device: torch.device) -> HubertModel:
     vec_path = ensure_hubert_model()
 
     models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
@@ -90,24 +90,40 @@ def get_hubert_model() -> HubertModel:
         suffix="",
     )
     model = models[0]
-    model.eval()
-    return model
+    return model.eval().to(device)
+
+
+import warnings
+
+import torchaudio
 
 
 def get_content(
-    cmodel: HubertModel, audio: torch.Tensor, wrong_legacy_proj: bool = False
-) -> ndarray:
-    with torch.no_grad(), timer() as t:
-        c = cmodel.extract_features(
-            audio.squeeze(1),
-            padding_mask=torch.BoolTensor(audio.shape).fill_(False),
-            output_layer=9,
+    cmodel: HubertModel,
+    audio: torch.Tensor | ndarray[Any, Any],
+    device: torch.device | str,
+    sr: int,
+    legacy_final_proj: bool = False,
+) -> torch.Tensor:
+    print(cmodel.final_proj)
+    audio = torch.as_tensor(audio)
+    if sr != HUBERT_SAMPLING_RATE:
+        audio = torchaudio.transforms.Resample(sr, HUBERT_SAMPLING_RATE)(audio).to(
+            device
         )
-        if wrong_legacy_proj:
+    if audio.ndim == 1:
+        audio = audio.unsqueeze(0)
+    with torch.no_grad(), timer() as t:
+        params = {"output_layer": 9} if legacy_final_proj else {}
+        c: torch.Tensor = cmodel.extract_features(audio, **params)[0]
+        if legacy_final_proj:
+            warnings.warn("legacy_final_proj is deprecated")
             assert hasattr(cmodel, "final_proj")
-            c = cmodel.final_proj(c[0])
-    c = c.transpose(1, 2)
-    wav_len = audio.shape[-1] / 16000
+            assert isinstance(cmodel.final_proj, torch.nn.Module)
+            c = cmodel.final_proj(c)
+        c = c.transpose(1, 2)
+        # print(c.shape)
+    wav_len = audio.shape[-1] / HUBERT_SAMPLING_RATE
     LOG.info(
         f"HuBERT inference time  : {t.elapsed:.3f}s, RTF: {t.elapsed / wav_len:.3f}"
     )
@@ -235,7 +251,7 @@ def summarize(
     histograms: dict[str, ndarray] = {},
     images: dict[str, ndarray] = {},
     audios: dict[str, ndarray] = {},
-    audio_sampling_rate: int = 22050,
+    audio_sampling_rate: int | None = None,
 ) -> None:
     for k, v in scalars.items():
         writer.add_scalar(k, v, global_step)
@@ -244,6 +260,8 @@ def summarize(
     for k, v in images.items():
         writer.add_image(k, v, global_step, dataformats="HWC")
     for k, v in audios.items():
+        if audio_sampling_rate is None:
+            raise ValueError("audio_sampling_rate must be provided")
         writer.add_audio(k, v, global_step, audio_sampling_rate)
 
 
@@ -280,7 +298,9 @@ def load_filepaths_and_text(filename: Path | str, split="|"):
     return filepaths_and_text
 
 
-def get_hparams(config_path: Path, model_path: Path, init: bool = True) -> HParams:
+def get_backup_hparams(
+    config_path: Path, model_path: Path, init: bool = True
+) -> HParams:
     model_path.mkdir(parents=True, exist_ok=True)
     config_save_path = model_path / "config.json"
     if init:
@@ -298,17 +318,17 @@ def get_hparams(config_path: Path, model_path: Path, init: bool = True) -> HPara
     return hparams
 
 
-def get_hparams_from_file(config_path: Path | str) -> HParams:
+def get_hparams(config_path: Path | str) -> HParams:
     config = json.loads(Path(config_path).read_text())
     hparams = HParams(**config)
     return hparams
 
 
-def repeat_expand_2d(content: ndarray, target_len: int) -> ndarray:
+def repeat_expand_2d(content: torch.Tensor, target_len: int) -> torch.Tensor:
     # content : [h, t]
     src_len = content.shape[-1]
-    target = torch.zeros([content.shape[0], target_len], dtype=torch.float).to(
-        content.device
+    target = torch.zeros(
+        [content.shape[0], target_len], dtype=content.dtype, device=content.device
     )
     temp = torch.arange(src_len + 1) * target_len / src_len
     current_pos = 0
@@ -318,7 +338,6 @@ def repeat_expand_2d(content: ndarray, target_len: int) -> ndarray:
         else:
             current_pos += 1
             target[:, i] = content[:, current_pos]
-
     return target
 
 

From 5b9e9af26ed49388debc7e8863664591635e0474 Mon Sep 17 00:00:00 2001
From: 34j <55338215+34j@users.noreply.github.com>
Date: Sun, 2 Apr 2023 15:19:22 +0900
Subject: [PATCH 22/47] refactor: rename inference module files

---
 src/so_vits_svc_fork/__main__.py                              | 4 ++--
 src/so_vits_svc_fork/gui.py                                   | 4 ++--
 src/so_vits_svc_fork/inference/{infer_tool.py => core.py}     | 0
 src/so_vits_svc_fork/inference/{inference_main.py => main.py} | 2 +-
 tests/test_main.py                                            | 4 ++--
 5 files changed, 7 insertions(+), 7 deletions(-)
 rename src/so_vits_svc_fork/inference/{infer_tool.py => core.py} (100%)
 rename src/so_vits_svc_fork/inference/{inference_main.py => main.py} (98%)

diff --git a/src/so_vits_svc_fork/__main__.py b/src/so_vits_svc_fork/__main__.py
index af145551..72423d27 100644
--- a/src/so_vits_svc_fork/__main__.py
+++ b/src/so_vits_svc_fork/__main__.py
@@ -223,7 +223,7 @@ def infer(
     device: Literal["cpu", "cuda"] = "cuda" if torch.cuda.is_available() else "cpu",
 ):
     """Inference"""
-    from so_vits_svc_fork.inference.inference_main import infer
+    from so_vits_svc_fork.inference.main import infer
 
     if not auto_predict_f0:
         LOG.warning(
@@ -382,7 +382,7 @@ def vc(
     passthrough_original: bool = False,
 ) -> None:
     """Realtime inference from microphone"""
-    from so_vits_svc_fork.inference.inference_main import realtime
+    from so_vits_svc_fork.inference.main import realtime
 
     if auto_predict_f0:
         LOG.warning(
diff --git a/src/so_vits_svc_fork/gui.py b/src/so_vits_svc_fork/gui.py
index 64d3a4bf..ee1214f4 100644
--- a/src/so_vits_svc_fork/gui.py
+++ b/src/so_vits_svc_fork/gui.py
@@ -548,7 +548,7 @@ def apply_preset(name: str) -> None:
             elif event == "config_path":
                 update_speaker()
             elif event == "infer":
-                from so_vits_svc_fork.inference.inference_main import infer
+                from so_vits_svc_fork.inference.main import infer
 
                 input_path = Path(values["input_path"])
                 output_path = (
@@ -600,7 +600,7 @@ def apply_preset(name: str) -> None:
                 _, _, input_device_indices, output_device_indices = get_devices(
                     update=False
                 )
-                from so_vits_svc_fork.inference.inference_main import realtime
+                from so_vits_svc_fork.inference.main import realtime
 
                 if future:
                     LOG.info("Canceling previous task")
diff --git a/src/so_vits_svc_fork/inference/infer_tool.py b/src/so_vits_svc_fork/inference/core.py
similarity index 100%
rename from src/so_vits_svc_fork/inference/infer_tool.py
rename to src/so_vits_svc_fork/inference/core.py
diff --git a/src/so_vits_svc_fork/inference/inference_main.py b/src/so_vits_svc_fork/inference/main.py
similarity index 98%
rename from src/so_vits_svc_fork/inference/inference_main.py
rename to src/so_vits_svc_fork/inference/main.py
index 90b98b22..85852b26 100644
--- a/src/so_vits_svc_fork/inference/inference_main.py
+++ b/src/so_vits_svc_fork/inference/main.py
@@ -10,7 +10,7 @@
 import torch
 from cm_time import timer
 
-from so_vits_svc_fork.inference.infer_tool import RealtimeVC, RealtimeVC2, Svc
+from so_vits_svc_fork.inference.core import RealtimeVC, RealtimeVC2, Svc
 
 LOG = getLogger(__name__)
 
diff --git a/tests/test_main.py b/tests/test_main.py
index 3015589c..d5c3e0dc 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -10,7 +10,7 @@
 class TestMain(TestCase):
     def test_import(self):
         import so_vits_svc_fork.cluster.train_cluster  # noqa
-        import so_vits_svc_fork.inference.inference_main  # noqa
+        import so_vits_svc_fork.inference.main  # noqa
         import so_vits_svc_fork.modules.onnx._export  # noqa
         import so_vits_svc_fork.preprocessing.preprocess_flist_config  # noqa
         import so_vits_svc_fork.preprocessing.preprocess_hubert_f0  # noqa
@@ -21,7 +21,7 @@ def test_import(self):
     def test_infer(self):
         if IS_CI:
             raise SkipTest("Skip inference test on CI")
-        from so_vits_svc_fork.inference.inference_main import infer  # noqa
+        from so_vits_svc_fork.inference.main import infer  # noqa
 
         # infer("tests/dataset_raw/34j/1.wav", "tests/configs/config.json", "tests/logs/44k")
 

From aa501495eaedd0e034c3910e25ca8c13f657d1d9 Mon Sep 17 00:00:00 2001
From: 34j <55338215+34j@users.noreply.github.com>
Date: Sun, 2 Apr 2023 15:32:15 +0900
Subject: [PATCH 23/47] refactor(train): refactor train

---
 src/so_vits_svc_fork/train.py | 38 +++++++++++++++++++++++++----------
 1 file changed, 27 insertions(+), 11 deletions(-)

diff --git a/src/so_vits_svc_fork/train.py b/src/so_vits_svc_fork/train.py
index 691f4115..fbd06a5c 100644
--- a/src/so_vits_svc_fork/train.py
+++ b/src/so_vits_svc_fork/train.py
@@ -9,6 +9,7 @@
 import torch
 import torch.distributed as dist
 import torch.multiprocessing as mp
+from torch import nn
 from torch.cuda.amp import GradScaler, autocast
 from torch.nn import functional as F
 from torch.nn.parallel import DistributedDataParallel as DDP
@@ -164,23 +165,23 @@ def _run(rank: int, n_gpus: int, hps: HParams, reset_optimizer: bool = False):
                 rank,
                 epoch,
                 hps,
-                [net_g, net_d],
-                [optim_g, optim_d],
-                [scheduler_g, scheduler_d],
+                (net_g, net_d),
+                (optim_g, optim_d),
+                (scheduler_g, scheduler_d),
                 scaler,
-                [train_loader, eval_loader],
-                [writer, writer_eval],
+                (train_loader, eval_loader),
+                (writer, writer_eval),
             )
         else:
             _train_and_evaluate(
                 rank,
                 epoch,
                 hps,
-                [net_g, net_d],
-                [optim_g, optim_d],
-                [scheduler_g, scheduler_d],
+                (net_g, net_d),
+                (optim_g, optim_d),
+                (scheduler_g, scheduler_d),
                 scaler,
-                [train_loader, None],
+                (train_loader, None),
                 None,
             )
         scheduler_g.step()
@@ -188,7 +189,17 @@ def _run(rank: int, n_gpus: int, hps: HParams, reset_optimizer: bool = False):
 
 
 def _train_and_evaluate(
-    rank, epoch, hps, nets, optims, schedulers, scaler, loaders, writers
+    rank: int,
+    epoch: int,
+    hps: HParams,
+    nets: tuple[nn.Module, nn.Module],
+    optims: tuple[torch.optim.Optimizer, torch.optim.Optimizer],
+    schedulers: tuple[
+        torch.optim.lr_scheduler.ExponentialLR, torch.optim.lr_scheduler.ExponentialLR
+    ],
+    scaler: GradScaler,
+    loaders: tuple[DataLoader, DataLoader | None],
+    writers: None | tuple[SummaryWriter, SummaryWriter],
 ):
     net_g, net_d = nets
     optim_g, optim_d = optims
@@ -392,7 +403,12 @@ def _train_and_evaluate(
         start_time = now
 
 
-def _evaluate(hps, generator, eval_loader, writer_eval):
+def _evaluate(
+    hps: HParams,
+    generator: torch.nn.Module,
+    eval_loader: torch.utils.data.DataLoader,
+    writer_eval: SummaryWriter,
+) -> None:
     generator.eval()
     image_dict = {}
     audio_dict = {}

From d8eec61b1649815dfc0b71899e2c34eb2017c987 Mon Sep 17 00:00:00 2001
From: 34j <55338215+34j@users.noreply.github.com>
Date: Sun, 2 Apr 2023 15:32:45 +0900
Subject: [PATCH 24/47] fix(utils): remove unused print

---
 src/so_vits_svc_fork/utils.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/so_vits_svc_fork/utils.py b/src/so_vits_svc_fork/utils.py
index 2a7a64bb..4a57d049 100644
--- a/src/so_vits_svc_fork/utils.py
+++ b/src/so_vits_svc_fork/utils.py
@@ -105,7 +105,6 @@ def get_content(
     sr: int,
     legacy_final_proj: bool = False,
 ) -> torch.Tensor:
-    print(cmodel.final_proj)
     audio = torch.as_tensor(audio)
     if sr != HUBERT_SAMPLING_RATE:
         audio = torchaudio.transforms.Resample(sr, HUBERT_SAMPLING_RATE)(audio).to(

From 2d89a9e8289287c2ffdf6211049e58903439bb7b Mon Sep 17 00:00:00 2001
From: 34j <55338215+34j@users.noreply.github.com>
Date: Sun, 2 Apr 2023 15:34:19 +0900
Subject: [PATCH 25/47] refactor: generator to synthesizer

---
 src/so_vits_svc_fork/inference/core.py                         | 2 +-
 src/so_vits_svc_fork/modules/{generator.py => synthesizers.py} | 0
 src/so_vits_svc_fork/train.py                                  | 2 +-
 3 files changed, 2 insertions(+), 2 deletions(-)
 rename src/so_vits_svc_fork/modules/{generator.py => synthesizers.py} (100%)

diff --git a/src/so_vits_svc_fork/inference/core.py b/src/so_vits_svc_fork/inference/core.py
index 3b4debb7..9ba6c395 100644
--- a/src/so_vits_svc_fork/inference/core.py
+++ b/src/so_vits_svc_fork/inference/core.py
@@ -15,7 +15,7 @@
 import so_vits_svc_fork.f0
 from so_vits_svc_fork import cluster, utils
 
-from ..modules.generator import SynthesizerTrn
+from ..modules.synthesizers import SynthesizerTrn
 
 LOG = getLogger(__name__)
 
diff --git a/src/so_vits_svc_fork/modules/generator.py b/src/so_vits_svc_fork/modules/synthesizers.py
similarity index 100%
rename from src/so_vits_svc_fork/modules/generator.py
rename to src/so_vits_svc_fork/modules/synthesizers.py
diff --git a/src/so_vits_svc_fork/train.py b/src/so_vits_svc_fork/train.py
index fbd06a5c..5d03e921 100644
--- a/src/so_vits_svc_fork/train.py
+++ b/src/so_vits_svc_fork/train.py
@@ -25,9 +25,9 @@
 from .data_utils import TextAudioCollate, TextAudioSpeakerLoader
 from .hparams import HParams
 from .modules.descriminators import MultiPeriodDiscriminator
-from .modules.generator import SynthesizerTrn
 from .modules.losses import discriminator_loss, feature_loss, generator_loss, kl_loss
 from .modules.mel_processing import mel_spectrogram_torch, spec_to_mel_torch
+from .modules.synthesizers import SynthesizerTrn
 
 LOG = getLogger(__name__)
 torch.backends.cudnn.benchmark = True

From c675a31869d0d53abd46d065a0cf8c0c5b8bd61b Mon Sep 17 00:00:00 2001
From: 34j <55338215+34j@users.noreply.github.com>
Date: Sun, 2 Apr 2023 15:57:12 +0900
Subject: [PATCH 26/47] fix(config_template): fix ssl_dim

---
 .../preprocessing/configs_template/config_template.json  | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/src/so_vits_svc_fork/preprocessing/configs_template/config_template.json b/src/so_vits_svc_fork/preprocessing/configs_template/config_template.json
index 4bad2141..72f47b5b 100644
--- a/src/so_vits_svc_fork/preprocessing/configs_template/config_template.json
+++ b/src/so_vits_svc_fork/preprocessing/configs_template/config_template.json
@@ -58,15 +58,12 @@
     "n_layers_q": 3,
     "use_spectral_norm": false,
     "gin_channels": 256,
-    "ssl_dim": 256,
+    "ssl_dim": 768,
     "n_speakers": 200,
-    "type_": "mb-istft",
+    "type_": "ms-istft",
     "gen_istft_n_fft": 16,
     "gen_istft_hop_size": 4,
     "subbands": 4
   },
-  "spk": {
-    "34j": 0,
-    "kiritan": 1
-  }
+  "spk": {}
 }

From 4103b390fcf610821fbf1c81ca9d2025d3b7af79 Mon Sep 17 00:00:00 2001
From: 34j <55338215+34j@users.noreply.github.com>
Date: Sun, 2 Apr 2023 16:07:24 +0900
Subject: [PATCH 27/47] feat: make config selectable

---
 src/so_vits_svc_fork/__main__.py              | 12 ++++
 .../quickvc.json}                             |  0
 .../so-vits-svc-4.0v1-legacy.json             | 60 ++++++++++++++++++
 .../config_templates/so-vits-svc-4.0v1.json   | 62 +++++++++++++++++++
 .../preprocessing/preprocess_flist_config.py  |  6 +-
 tests/test_main.py                            |  3 +-
 6 files changed, 141 insertions(+), 2 deletions(-)
 rename src/so_vits_svc_fork/preprocessing/{configs_template/config_template.json => config_templates/quickvc.json} (100%)
 create mode 100644 src/so_vits_svc_fork/preprocessing/config_templates/so-vits-svc-4.0v1-legacy.json
 create mode 100644 src/so_vits_svc_fork/preprocessing/config_templates/so-vits-svc-4.0v1.json

diff --git a/src/so_vits_svc_fork/__main__.py b/src/so_vits_svc_fork/__main__.py
index 72423d27..f63e3261 100644
--- a/src/so_vits_svc_fork/__main__.py
+++ b/src/so_vits_svc_fork/__main__.py
@@ -485,6 +485,9 @@ def pre_resample(
     )
 
 
+from so_vits_svc_fork.preprocessing.preprocess_flist_config import CONFIG_TEMPLATE_DIR
+
+
 @cli.command()
 @click.option(
     "-i",
@@ -507,10 +510,18 @@ def pre_resample(
     default=Path("./configs/44k/config.json"),
     help="path to config",
 )
+@click.option(
+    "-t",
+    "--config-type",
+    type=click.Choice([x.stem for x in CONFIG_TEMPLATE_DIR.rglob("*.json")]),
+    default="so-vits-svc-4.0v1",
+    help="config type",
+)
 def pre_config(
     input_dir: Path,
     filelist_path: Path,
     config_path: Path,
+    config_type: str,
 ):
     """Preprocessing part 2: config"""
     from so_vits_svc_fork.preprocessing.preprocess_flist_config import preprocess_config
@@ -524,6 +535,7 @@ def pre_config(
         val_list_path=filelist_path / "val.txt",
         test_list_path=filelist_path / "test.txt",
         config_path=config_path,
+        config_name=config_type,
     )
 
 
diff --git a/src/so_vits_svc_fork/preprocessing/configs_template/config_template.json b/src/so_vits_svc_fork/preprocessing/config_templates/quickvc.json
similarity index 100%
rename from src/so_vits_svc_fork/preprocessing/configs_template/config_template.json
rename to src/so_vits_svc_fork/preprocessing/config_templates/quickvc.json
diff --git a/src/so_vits_svc_fork/preprocessing/config_templates/so-vits-svc-4.0v1-legacy.json b/src/so_vits_svc_fork/preprocessing/config_templates/so-vits-svc-4.0v1-legacy.json
new file mode 100644
index 00000000..45852762
--- /dev/null
+++ b/src/so_vits_svc_fork/preprocessing/config_templates/so-vits-svc-4.0v1-legacy.json
@@ -0,0 +1,60 @@
+{
+  "train": {
+    "log_interval": 200,
+    "eval_interval": 800,
+    "seed": 1234,
+    "epochs": 10000,
+    "learning_rate": 0.0001,
+    "betas": [0.8, 0.99],
+    "eps": 1e-9,
+    "batch_size": 6,
+    "fp16_run": false,
+    "lr_decay": 0.999875,
+    "segment_size": 10240,
+    "init_lr_ratio": 1,
+    "warmup_epochs": 0,
+    "c_mel": 45,
+    "c_kl": 1.0,
+    "use_sr": true,
+    "max_speclen": 512,
+    "port": "8001",
+    "keep_ckpts": 3
+  },
+  "data": {
+    "training_files": "filelists/44k/train.txt",
+    "validation_files": "filelists/44k/val.txt",
+    "max_wav_value": 32768.0,
+    "sampling_rate": 44100,
+    "filter_length": 2048,
+    "hop_length": 512,
+    "win_length": 2048,
+    "n_mel_channels": 80,
+    "mel_fmin": 0.0,
+    "mel_fmax": 22050
+  },
+  "model": {
+    "inter_channels": 192,
+    "hidden_channels": 192,
+    "filter_channels": 768,
+    "n_heads": 2,
+    "n_layers": 6,
+    "kernel_size": 3,
+    "p_dropout": 0.1,
+    "resblock": "1",
+    "resblock_kernel_sizes": [3, 7, 11],
+    "resblock_dilation_sizes": [
+      [1, 3, 5],
+      [1, 3, 5],
+      [1, 3, 5]
+    ],
+    "upsample_rates": [8, 8, 2, 2, 2],
+    "upsample_initial_channel": 512,
+    "upsample_kernel_sizes": [16, 16, 4, 4, 4],
+    "n_layers_q": 3,
+    "use_spectral_norm": false,
+    "gin_channels": 256,
+    "ssl_dim": 256,
+    "n_speakers": 200
+  },
+  "spk": {}
+}
diff --git a/src/so_vits_svc_fork/preprocessing/config_templates/so-vits-svc-4.0v1.json b/src/so_vits_svc_fork/preprocessing/config_templates/so-vits-svc-4.0v1.json
new file mode 100644
index 00000000..1bded47c
--- /dev/null
+++ b/src/so_vits_svc_fork/preprocessing/config_templates/so-vits-svc-4.0v1.json
@@ -0,0 +1,62 @@
+{
+  "train": {
+    "log_interval": 200,
+    "eval_interval": 800,
+    "seed": 1234,
+    "epochs": 10000,
+    "learning_rate": 0.0001,
+    "betas": [0.8, 0.99],
+    "eps": 1e-9,
+    "batch_size": 6,
+    "fp16_run": false,
+    "lr_decay": 0.999875,
+    "segment_size": 10240,
+    "init_lr_ratio": 1,
+    "warmup_epochs": 0,
+    "c_mel": 45,
+    "c_kl": 1.0,
+    "use_sr": true,
+    "max_speclen": 512,
+    "port": "8001",
+    "keep_ckpts": 3
+  },
+  "data": {
+    "training_files": "filelists/44k/train.txt",
+    "validation_files": "filelists/44k/val.txt",
+    "max_wav_value": 32768.0,
+    "sampling_rate": 44100,
+    "filter_length": 2048,
+    "hop_length": 512,
+    "win_length": 2048,
+    "n_mel_channels": 80,
+    "mel_fmin": 0.0,
+    "mel_fmax": 22050,
+    "contentvec_final_proj": false
+  },
+  "model": {
+    "inter_channels": 192,
+    "hidden_channels": 192,
+    "filter_channels": 768,
+    "n_heads": 2,
+    "n_layers": 6,
+    "kernel_size": 3,
+    "p_dropout": 0.1,
+    "resblock": "1",
+    "resblock_kernel_sizes": [3, 7, 11],
+    "resblock_dilation_sizes": [
+      [1, 3, 5],
+      [1, 3, 5],
+      [1, 3, 5]
+    ],
+    "upsample_rates": [8, 8, 2, 2, 2],
+    "upsample_initial_channel": 512,
+    "upsample_kernel_sizes": [16, 16, 4, 4, 4],
+    "n_layers_q": 3,
+    "use_spectral_norm": false,
+    "gin_channels": 256,
+    "ssl_dim": 768,
+    "n_speakers": 200,
+    "type_": "hifi-gan"
+  },
+  "spk": {}
+}
diff --git a/src/so_vits_svc_fork/preprocessing/preprocess_flist_config.py b/src/so_vits_svc_fork/preprocessing/preprocess_flist_config.py
index 5db387b0..a6642bee 100644
--- a/src/so_vits_svc_fork/preprocessing/preprocess_flist_config.py
+++ b/src/so_vits_svc_fork/preprocessing/preprocess_flist_config.py
@@ -11,6 +11,7 @@
 from tqdm import tqdm
 
 LOG = getLogger(__name__)
+CONFIG_TEMPLATE_DIR = Path(__file__).parent / "config_templates"
 
 
 def preprocess_config(
@@ -19,6 +20,7 @@ def preprocess_config(
     val_list_path: Path | str,
     test_list_path: Path | str,
     config_path: Path | str,
+    config_name: str,
 ):
     input_dir = Path(input_dir)
     train_list_path = Path(train_list_path)
@@ -76,7 +78,9 @@ def preprocess_config(
     config = deepcopy(
         json.loads(
             (
-                Path(__file__).parent / "configs_template" / "config_template.json"
+                CONFIG_TEMPLATE_DIR / f"{config_name}.json"
+                if not config_name.endswith(".json")
+                else config_name
             ).read_text()
         )
     )
diff --git a/tests/test_main.py b/tests/test_main.py
index d5c3e0dc..a7acfcb4 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -43,7 +43,8 @@ def test_preprocess(self):
             "tests/filelists/train.txt",
             "tests/filelists/val.txt",
             "tests/filelists/test.txt",
-            "tests/configs/config.json",
+            "tests/configs/44k/config.json",
+            "so-vits-svc-4.0v1",
         )
 
         if IS_CI:

From d425f1af72bb947b391e7b54b13118508580018c Mon Sep 17 00:00:00 2001
From: 34j <55338215+34j@users.noreply.github.com>
Date: Sun, 2 Apr 2023 16:25:33 +0900
Subject: [PATCH 28/47] feat(preprocess_hubert): automatically decide n_jobs

---
 src/so_vits_svc_fork/__main__.py                 |  2 +-
 .../preprocessing/preprocess_hubert_f0.py        | 12 +++++++++---
 src/so_vits_svc_fork/utils.py                    | 16 +++++++++++++++-
 3 files changed, 25 insertions(+), 5 deletions(-)

diff --git a/src/so_vits_svc_fork/__main__.py b/src/so_vits_svc_fork/__main__.py
index f63e3261..a3b12bb7 100644
--- a/src/so_vits_svc_fork/__main__.py
+++ b/src/so_vits_svc_fork/__main__.py
@@ -558,7 +558,7 @@ def pre_config(
     "-n",
     "--n-jobs",
     type=int,
-    default=4,
+    default=None,
     help="number of jobs (optimal value may depend on your VRAM capacity and audio duration per file)",
 )
 @click.option(
diff --git a/src/so_vits_svc_fork/preprocessing/preprocess_hubert_f0.py b/src/so_vits_svc_fork/preprocessing/preprocess_hubert_f0.py
index 37ad21be..cf32a40c 100644
--- a/src/so_vits_svc_fork/preprocessing/preprocess_hubert_f0.py
+++ b/src/so_vits_svc_fork/preprocessing/preprocess_hubert_f0.py
@@ -8,15 +8,17 @@
 import librosa
 import numpy as np
 import torch
-from joblib import Parallel, cpu_count, delayed
+from joblib import Parallel, delayed
 from tqdm import tqdm
 
 import so_vits_svc_fork.f0
 from so_vits_svc_fork import utils
 
+from ..utils import get_total_gpu_memory
 from .preprocess_utils import check_hubert_min_duration
 
 LOG = getLogger(__name__)
+HUBERT_MEMORY = 1250
 
 
 def _process_one(
@@ -87,7 +89,7 @@ def _process_batch(
 def preprocess_hubert_f0(
     input_dir: Path | str,
     config_path: Path | str,
-    n_jobs: int = 4,
+    n_jobs: int | None = None,
     f0_method: Literal["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"] = "dio",
     force_rebuild: bool = False,
 ):
@@ -95,9 +97,13 @@ def preprocess_hubert_f0(
     config_path = Path(config_path)
     utils.ensure_hubert_model()
     hps = utils.get_hparams(config_path)
+    if n_jobs is None:
+        memory = get_total_gpu_memory("free")
+        n_jobs = memory // HUBERT_MEMORY
+        LOG.info(f"n_jobs automatically set to {n_jobs}, memory: {memory} MiB")
 
     filepaths = list(input_dir.rglob("*.wav"))
-    n_jobs = min(cpu_count(), len(filepaths) // 32 + 1, n_jobs)
+    n_jobs = min(len(filepaths) // 16 + 1, n_jobs)
     shuffle(filepaths)
     filepath_chunks = np.array_split(filepaths, n_jobs)
     Parallel(n_jobs=n_jobs)(
diff --git a/src/so_vits_svc_fork/utils.py b/src/so_vits_svc_fork/utils.py
index 4a57d049..1efcbaf7 100644
--- a/src/so_vits_svc_fork/utils.py
+++ b/src/so_vits_svc_fork/utils.py
@@ -2,10 +2,11 @@
 
 import json
 import re
+import subprocess
 from itertools import groupby
 from logging import getLogger
 from pathlib import Path
-from typing import Any
+from typing import Any, Literal, Sequence
 
 import matplotlib
 import matplotlib.pylab as plt
@@ -352,3 +353,16 @@ def plot_data_to_numpy(x: ndarray, y: ndarray) -> ndarray:
     data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
     plt.close()
     return data
+
+
+def get_gpu_memory(type_: Literal["total", "free", "used"]) -> Sequence[int]:
+    command = f"nvidia-smi --query-gpu=memory.{type_} --format=csv"
+    memory_free_info = (
+        subprocess.check_output(command.split()).decode("ascii").split("\n")[:-1][1:]
+    )
+    memory_free_values = [int(x.split()[0]) for i, x in enumerate(memory_free_info)]
+    return memory_free_values
+
+
+def get_total_gpu_memory(type_: Literal["total", "free", "used"]) -> int:
+    return sum(get_gpu_memory(type_))

From 094fec5e7d8a520edf7f3961c646344d26d1201e Mon Sep 17 00:00:00 2001
From: 34j <55338215+34j@users.noreply.github.com>
Date: Sun, 2 Apr 2023 16:50:08 +0900
Subject: [PATCH 29/47] refactor: refactor checkpoint path

---
 src/so_vits_svc_fork/train.py | 45 +++++++++++++++++++----------------
 src/so_vits_svc_fork/utils.py |  7 ++++--
 2 files changed, 29 insertions(+), 23 deletions(-)

diff --git a/src/so_vits_svc_fork/train.py b/src/so_vits_svc_fork/train.py
index 5d03e921..453c681c 100644
--- a/src/so_vits_svc_fork/train.py
+++ b/src/so_vits_svc_fork/train.py
@@ -115,31 +115,34 @@ def _run(rank: int, n_gpus: int, hps: HParams, reset_optimizer: bool = False):
         betas=hps.train.betas,
         eps=hps.train.eps,
     )
-    net_g = DDP(net_g, device_ids=[rank])  # , find_unused_parameters=True)
+    net_g = DDP(net_g, device_ids=[rank])
     net_d = DDP(net_d, device_ids=[rank])
 
-    skip_optimizer = reset_optimizer
-    try:
-        _, _, _, epoch_str = utils.load_checkpoint(
-            utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"),
-            net_g,
-            optim_g,
-            skip_optimizer,
-        )
-        _, _, _, epoch_str = utils.load_checkpoint(
-            utils.latest_checkpoint_path(hps.model_dir, "D_*.pth"),
-            net_d,
-            optim_d,
-            skip_optimizer,
-        )
-        epoch_str = max(epoch_str, 1)
-        global_step = (epoch_str - 1) * len(train_loader)
-    except Exception as e:
-        LOG.exception(e)
-        LOG.info("No checkpoint found, start from scratch")
+    latest_g_path = utils.latest_checkpoint_path(hps.model_dir, "G_*.pth")
+    latest_d_path = utils.latest_checkpoint_path(hps.model_dir, "D_*.pth")
+    if latest_g_path is not None and latest_d_path is not None:
+        try:
+            _, _, _, epoch_str = utils.load_checkpoint(
+                latest_g_path,
+                net_g,
+                optim_g,
+                reset_optimizer,
+            )
+            _, _, _, epoch_str = utils.load_checkpoint(
+                latest_d_path,
+                net_d,
+                optim_d,
+                reset_optimizer,
+            )
+            epoch_str = max(epoch_str, 1)
+            global_step = (epoch_str - 1) * len(train_loader)
+        except Exception as e:
+            raise RuntimeError("Failed to load checkpoint") from e
+    else:
+        LOG.warning("No checkpoint found. Start from scratch.")
         epoch_str = 1
         global_step = 0
-    if skip_optimizer:
+    if reset_optimizer:
         epoch_str = 1
         global_step = 0
 
diff --git a/src/so_vits_svc_fork/utils.py b/src/so_vits_svc_fork/utils.py
index 1efcbaf7..c6516fc2 100644
--- a/src/so_vits_svc_fork/utils.py
+++ b/src/so_vits_svc_fork/utils.py
@@ -265,10 +265,13 @@ def summarize(
         writer.add_audio(k, v, global_step, audio_sampling_rate)
 
 
-def latest_checkpoint_path(dir_path: Path | str, regex: str = "G_*.pth") -> Path:
+def latest_checkpoint_path(dir_path: Path | str, regex: str = "G_*.pth") -> Path | None:
     dir_path = Path(dir_path)
     name_key = lambda p: int(re.match(r"._(\d+)\.pth", p.name).group(1))
-    return list(sorted(dir_path.glob(regex), key=name_key))[-1]
+    paths = list(sorted(dir_path.glob(regex), key=name_key))
+    if len(paths) == 0:
+        return None
+    return paths[-1]
 
 
 def plot_spectrogram_to_numpy(spectrogram: ndarray) -> ndarray:

From 3f2afab4799fe4530b25e61b17b00a8d725ce548 Mon Sep 17 00:00:00 2001
From: 34j <55338215+34j@users.noreply.github.com>
Date: Sun, 2 Apr 2023 16:50:44 +0900
Subject: [PATCH 30/47] chore(config_template): update intervals and
 batch_sizes

---
 .../preprocessing/config_templates/quickvc.json             | 4 ++--
 .../preprocessing/config_templates/so-vits-svc-4.0v1.json   | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/so_vits_svc_fork/preprocessing/config_templates/quickvc.json b/src/so_vits_svc_fork/preprocessing/config_templates/quickvc.json
index 72f47b5b..a583731e 100644
--- a/src/so_vits_svc_fork/preprocessing/config_templates/quickvc.json
+++ b/src/so_vits_svc_fork/preprocessing/config_templates/quickvc.json
@@ -1,7 +1,7 @@
 {
   "train": {
-    "log_interval": 50,
-    "eval_interval": 100,
+    "log_interval": 100,
+    "eval_interval": 200,
     "seed": 1234,
     "epochs": 10000,
     "learning_rate": 0.0001,
diff --git a/src/so_vits_svc_fork/preprocessing/config_templates/so-vits-svc-4.0v1.json b/src/so_vits_svc_fork/preprocessing/config_templates/so-vits-svc-4.0v1.json
index 1bded47c..d4c8d46f 100644
--- a/src/so_vits_svc_fork/preprocessing/config_templates/so-vits-svc-4.0v1.json
+++ b/src/so_vits_svc_fork/preprocessing/config_templates/so-vits-svc-4.0v1.json
@@ -1,13 +1,13 @@
 {
   "train": {
-    "log_interval": 200,
-    "eval_interval": 800,
+    "log_interval": 100,
+    "eval_interval": 200,
     "seed": 1234,
     "epochs": 10000,
     "learning_rate": 0.0001,
     "betas": [0.8, 0.99],
     "eps": 1e-9,
-    "batch_size": 6,
+    "batch_size": 18,
     "fp16_run": false,
     "lr_decay": 0.999875,
     "segment_size": 10240,

From e989a05c52f7c1a2119228042375f4a24d7e3bda Mon Sep 17 00:00:00 2001
From: 34j <55338215+34j@users.noreply.github.com>
Date: Sun, 2 Apr 2023 17:23:04 +0900
Subject: [PATCH 31/47] refactor: refactor ensure_pretrained

---
 src/so_vits_svc_fork/gui.py                   |   8 +-
 src/so_vits_svc_fork/hparams.py               |   7 +-
 .../preprocessing/preprocess_hubert_f0.py     |   2 +-
 src/so_vits_svc_fork/train.py                 |  10 +-
 src/so_vits_svc_fork/utils.py                 | 109 ++++++++++++------
 5 files changed, 88 insertions(+), 48 deletions(-)

diff --git a/src/so_vits_svc_fork/gui.py b/src/so_vits_svc_fork/gui.py
index ee1214f4..a436fcb1 100644
--- a/src/so_vits_svc_fork/gui.py
+++ b/src/so_vits_svc_fork/gui.py
@@ -11,7 +11,7 @@
 from pebble import ProcessFuture, ProcessPool
 from tqdm.tk import tqdm_tk
 
-from .utils import ensure_hubert_model
+from .utils import ensure_pretrained_model
 
 GUI_DEFAULT_PRESETS_PATH = Path(__file__).parent / "default_gui_presets.json"
 GUI_PRESETS_PATH = Path("./user_gui_presets.json").absolute()
@@ -85,16 +85,16 @@ def get_devices(
 
 def main():
     try:
-        ensure_hubert_model(tqdm_cls=tqdm_tk)
+        ensure_pretrained_model(".", "contentvec", tqdm_cls=tqdm_tk)
     except Exception as e:
         LOG.exception(e)
         LOG.info("Trying tqdm.std...")
         try:
-            ensure_hubert_model()
+            ensure_pretrained_model(".", "contentvec")
         except Exception as e:
             LOG.exception(e)
             try:
-                ensure_hubert_model(disable=True)
+                ensure_pretrained_model(".", "contentvec", disabled=True)
             except Exception as e:
                 LOG.exception(e)
                 LOG.error(
diff --git a/src/so_vits_svc_fork/hparams.py b/src/so_vits_svc_fork/hparams.py
index 27e56c82..6307042c 100644
--- a/src/so_vits_svc_fork/hparams.py
+++ b/src/so_vits_svc_fork/hparams.py
@@ -1,8 +1,10 @@
 from __future__ import annotations
 
+from typing import Any
+
 
 class HParams:
-    def __init__(self, **kwargs):
+    def __init__(self, **kwargs: Any) -> None:
         for k, v in kwargs.items():
             if type(v) == dict:
                 v = HParams(**v)
@@ -17,6 +19,9 @@ def items(self):
     def values(self):
         return self.__dict__.values()
 
+    def get(self, key: str, default: Any = None):
+        return self.__dict__.get(key, default)
+
     def __len__(self):
         return len(self.__dict__)
 
diff --git a/src/so_vits_svc_fork/preprocessing/preprocess_hubert_f0.py b/src/so_vits_svc_fork/preprocessing/preprocess_hubert_f0.py
index cf32a40c..b0034a93 100644
--- a/src/so_vits_svc_fork/preprocessing/preprocess_hubert_f0.py
+++ b/src/so_vits_svc_fork/preprocessing/preprocess_hubert_f0.py
@@ -95,7 +95,7 @@ def preprocess_hubert_f0(
 ):
     input_dir = Path(input_dir)
     config_path = Path(config_path)
-    utils.ensure_hubert_model()
+    utils.ensure_pretrained_model(".", "contentvec")
     hps = utils.get_hparams(config_path)
     if n_jobs is None:
         memory = get_total_gpu_memory("free")
diff --git a/src/so_vits_svc_fork/train.py b/src/so_vits_svc_fork/train.py
index 453c681c..c8272882 100644
--- a/src/so_vits_svc_fork/train.py
+++ b/src/so_vits_svc_fork/train.py
@@ -43,9 +43,9 @@ def train(
     model_path = Path(model_path)
     if not torch.cuda.is_available():
         raise RuntimeError("CUDA is not available.")
-    # utils.ensure_pretrained_model(model_path)
-    hps = utils.get_backup_hparams(config_path, model_path)
 
+    hps = utils.get_backup_hparams(config_path, model_path)
+    utils.ensure_pretrained_model(model_path, hps.model.get("type_", "hifi-gan"))
     n_gpus = torch.cuda.device_count()
     os.environ["MASTER_ADDR"] = "localhost"
     os.environ["MASTER_PORT"] = hps.train.port
@@ -290,7 +290,7 @@ def _train_and_evaluate(
 
                 # MB-iSTFT-VITS
                 loss_subband = torch.tensor(0.0)
-                if hps.model.__dict__.get("type_") == "mb-istft":
+                if hps.model.get("type_") == "mb-istft":
                     from .modules.decoders.mb_istft import PQMF, subband_stft_loss
 
                     y_mb = PQMF(y.device, hps.model.subbands).analysis(y)
@@ -314,7 +314,7 @@ def _train_and_evaluate(
                     "melspectrogram": loss_mel.item(),
                     "kl_divergence": loss_kl.item(),
                 }
-                if hps.model.__dict__.get("type_") == "mb-istft":
+                if hps.model.get("type_") == "mb-istft":
                     losses["subband_stft"] = loss_subband.item()
                 LOG.info(
                     "Train Epoch: {} [{:.0f}%]".format(
@@ -338,7 +338,7 @@ def _train_and_evaluate(
                         "loss/g/lf0": loss_lf0,
                     }
                 )
-                if hps.model.__dict__.get("type_") == "mb-istft":
+                if hps.model.get("type_") == "mb-istft":
                     scalar_dict["loss/g/subband"] = loss_subband
 
                 # scalar_dict.update({"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)})
diff --git a/src/so_vits_svc_fork/utils.py b/src/so_vits_svc_fork/utils.py
index c6516fc2..0e1eff4e 100644
--- a/src/so_vits_svc_fork/utils.py
+++ b/src/so_vits_svc_fork/utils.py
@@ -29,65 +29,100 @@
 def download_file(
     url: str,
     filepath: Path | str,
-    chunk_size: int = 4 * 1024,
+    chunk_size: int = 64 * 1024,
     tqdm_cls: type = tqdm,
+    skip_if_exists: bool = False,
+    overwrite: bool = False,
     **tqdm_kwargs: Any,
 ):
+    if skip_if_exists is True and overwrite is True:
+        raise ValueError("skip_if_exists and overwrite cannot be both True")
     filepath = Path(filepath)
     filepath.parent.mkdir(parents=True, exist_ok=True)
     temppath = filepath.parent / f"{filepath.name}.download"
     if filepath.exists():
-        raise FileExistsError(f"{filepath} already exists")
+        if skip_if_exists:
+            return
+        elif not overwrite:
+            filepath.unlink()
+        else:
+            raise FileExistsError(f"{filepath} already exists")
     temppath.unlink(missing_ok=True)
     resp = requests.get(url, stream=True)
     total = int(resp.headers.get("content-length", 0))
-    with temppath.open("wb") as f, tqdm_cls(
-        total=total,
-        unit="iB",
-        unit_scale=True,
-        unit_divisor=1024,
-        **tqdm_kwargs,
-    ) as pbar:
+    kwargs = (
+        dict(
+            total=total,
+            unit="iB",
+            unit_scale=True,
+            unit_divisor=1024,
+            desc=f"Downloading {filepath.name}",
+        )
+        | tqdm_kwargs
+    )
+    with temppath.open("wb") as f, tqdm_cls(**kwargs) as pbar:
         for data in resp.iter_content(chunk_size=chunk_size):
             size = f.write(data)
             pbar.update(size)
     temppath.rename(filepath)
 
 
-def ensure_pretrained_model(folder_path: Path, **tqdm_kwargs: Any) -> None:
-    model_urls = [
-        # "https://huggingface.co/innnky/sovits_pretrained/resolve/main/sovits4/G_0.pth",
-        "https://huggingface.co/therealvul/so-vits-svc-4.0-init/resolve/main/D_0.pth",
-        # "https://huggingface.co/innnky/sovits_pretrained/resolve/main/sovits4/D_0.pth",
-        "https://huggingface.co/therealvul/so-vits-svc-4.0-init/resolve/main/G_0.pth",
-    ]
-    for model_url in model_urls:
-        model_path = folder_path / model_url.split("/")[-1]
-        if not model_path.exists():
-            download_file(
-                model_url,
-                model_path,
-                desc=f"Downloading {model_path.name}",
-                **tqdm_kwargs,
+PRETRAINED_MODEL_URLS = {
+    "hifi-gan": [
+        [
+            "https://huggingface.co/therealvul/so-vits-svc-4.0-init/resolve/main/D_0.pth",
+            "https://huggingface.co/therealvul/so-vits-svc-4.0-init/resolve/main/G_0.pth",
+        ],
+        [
+            "https://huggingface.co/Himawari00/so-vits-svc4.0-pretrain-models/resolve/main/D_0.pth",
+            "https://huggingface.co/Himawari00/so-vits-svc4.0-pretrain-models/resolve/main/G_0.pth",
+        ],
+    ],
+    "contentvec": [
+        [
+            "https://huggingface.co/therealvul/so-vits-svc-4.0-init/resolve/main/checkpoint_best_legacy_500.pt"
+        ],
+        [
+            "https://huggingface.co/Himawari00/so-vits-svc4.0-pretrain-models/resolve/main/checkpoint_best_legacy_500.pt"
+        ],
+        [
+            "http://obs.cstcloud.cn/share/obs/sankagenkeshi/checkpoint_best_legacy_500.pt"
+        ],
+    ],
+}
+from joblib import Parallel, delayed
+
+
+def ensure_pretrained_model(
+    folder_path: Path | str, type_: str, **tqdm_kwargs: Any
+) -> tuple[Path, ...] | None:
+    folder_path = Path(folder_path)
+    models_candidates = PRETRAINED_MODEL_URLS.get(type_, None)
+    if models_candidates is None:
+        LOG.warning(f"Unknown pretrained model type: {type_}")
+        return
+    for model_urls in models_candidates:
+        paths = [folder_path / model_url.split("/")[-1] for model_url in model_urls]
+        try:
+            Parallel(n_jobs=len(paths))(
+                [
+                    delayed(download_file)(
+                        url, path, position=i, skip_if_exists=True, **tqdm_kwargs
+                    )
+                    for i, (url, path) in enumerate(zip(model_urls, paths))
+                ]
             )
-
-
-def ensure_hubert_model(**tqdm_kwargs: Any) -> Path:
-    vec_path = Path("checkpoint_best_legacy_500.pt")
-    vec_path.parent.mkdir(parents=True, exist_ok=True)
-    if not vec_path.exists():
-        # url = "http://obs.cstcloud.cn/share/obs/sankagenkeshi/checkpoint_best_legacy_500.pt"
-        # url = "https://huggingface.co/innnky/contentvec/resolve/main/checkpoint_best_legacy_500.pt"
-        url = "https://huggingface.co/therealvul/so-vits-svc-4.0-init/resolve/main/checkpoint_best_legacy_500.pt"
-        download_file(url, vec_path, desc="Downloading Hubert model", **tqdm_kwargs)
-    return vec_path
+            return tuple(paths)
+        except Exception as e:
+            LOG.exception(e)
+    return
 
 
 def get_hubert_model(device: torch.device) -> HubertModel:
-    vec_path = ensure_hubert_model()
+    (path,) = ensure_pretrained_model(Path("."), "contentvec")
 
     models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
-        [vec_path.as_posix()],
+        [path.as_posix()],
         suffix="",
     )
     model = models[0]

From 155267a5536d2762e1f87bf6848d001e31ae5c33 Mon Sep 17 00:00:00 2001
From: 34j <55338215+34j@users.noreply.github.com>
Date: Sun, 2 Apr 2023 17:31:23 +0900
Subject: [PATCH 32/47] fix: remove onnx support

---
 src/so_vits_svc_fork/__main__.py | 1 +
 src/so_vits_svc_fork/gui.py      | 7 ++++---
 tests/test_main.py               | 3 ++-
 3 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/so_vits_svc_fork/__main__.py b/src/so_vits_svc_fork/__main__.py
index a3b12bb7..6c22caea 100644
--- a/src/so_vits_svc_fork/__main__.py
+++ b/src/so_vits_svc_fork/__main__.py
@@ -760,6 +760,7 @@ def clean():
     help="device to use",
 )
 def onnx(input_path: Path, output_path: Path, config_path: Path, device: str) -> None:
+    raise NotImplementedError("ONNX export is not yet supported")
     """Export model to onnx"""
     input_path = Path(input_path)
     if input_path.is_dir():
diff --git a/src/so_vits_svc_fork/gui.py b/src/so_vits_svc_fork/gui.py
index a436fcb1..406e8ff7 100644
--- a/src/so_vits_svc_fork/gui.py
+++ b/src/so_vits_svc_fork/gui.py
@@ -433,7 +433,7 @@ def main():
                 sg.Button("(Re)Start Voice Changer", key="start_vc"),
                 sg.Button("Stop Voice Changer", key="stop_vc"),
                 sg.Push(),
-                sg.Button("ONNX Export", key="onnx_export"),
+                # sg.Button("ONNX Export", key="onnx_export"),
             ],
         ]
     )
@@ -650,9 +650,10 @@ def apply_preset(name: str) -> None:
                     future.cancel()
                     future = None
             elif event == "onnx_export":
-                from so_vits_svc_fork.modules.onnx._export import onnx_export
-
                 try:
+                    raise NotImplementedError("ONNX export is not implemented yet.")
+                    from so_vits_svc_fork.modules.onnx._export import onnx_export
+
                     onnx_export(
                         input_path=Path(values["model_path"]),
                         output_path=Path(values["model_path"]).with_suffix(".onnx"),
diff --git a/tests/test_main.py b/tests/test_main.py
index a7acfcb4..36985dbf 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -11,7 +11,8 @@ class TestMain(TestCase):
     def test_import(self):
         import so_vits_svc_fork.cluster.train_cluster  # noqa
         import so_vits_svc_fork.inference.main  # noqa
-        import so_vits_svc_fork.modules.onnx._export  # noqa
+
+        # import so_vits_svc_fork.modules.onnx._export  # noqa
         import so_vits_svc_fork.preprocessing.preprocess_flist_config  # noqa
         import so_vits_svc_fork.preprocessing.preprocess_hubert_f0  # noqa
         import so_vits_svc_fork.preprocessing.preprocess_resample  # noqa

From 6f0ec1ebaaaa6b4f4f46eb4ed7829e157e9daa21 Mon Sep 17 00:00:00 2001
From: 34j <55338215+34j@users.noreply.github.com>
Date: Sun, 2 Apr 2023 18:56:12 +0900
Subject: [PATCH 33/47] fix(utils): skip optimizer if failed

---
 src/so_vits_svc_fork/utils.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/so_vits_svc_fork/utils.py b/src/so_vits_svc_fork/utils.py
index 0e1eff4e..c84ffbb8 100644
--- a/src/so_vits_svc_fork/utils.py
+++ b/src/so_vits_svc_fork/utils.py
@@ -181,7 +181,11 @@ def load_checkpoint(
         and not skip_optimizer
         and checkpoint_dict["optimizer"] is not None
     ):
-        optimizer.load_state_dict(checkpoint_dict["optimizer"])
+        try:
+            optimizer.load_state_dict(checkpoint_dict["optimizer"])
+        except Exception as e:
+            LOG.exception(e)
+            LOG.warning("Failed to load optimizer state")
     saved_state_dict = checkpoint_dict["model"]
     if hasattr(model, "module"):
         state_dict = model.module.state_dict()

From 1e085e2fa5173918b42830a468b88175b26e6acf Mon Sep 17 00:00:00 2001
From: 34j <55338215+34j@users.noreply.github.com>
Date: Sun, 2 Apr 2023 18:56:42 +0900
Subject: [PATCH 34/47] feat(synthesizers): allow ssl_dim = None

---
 src/so_vits_svc_fork/modules/synthesizers.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/so_vits_svc_fork/modules/synthesizers.py b/src/so_vits_svc_fork/modules/synthesizers.py
index 97f5396f..06dfda67 100644
--- a/src/so_vits_svc_fork/modules/synthesizers.py
+++ b/src/so_vits_svc_fork/modules/synthesizers.py
@@ -82,7 +82,10 @@ def __init__(
 
         self.emb_g = nn.Embedding(n_speakers, gin_channels)
 
-        self.pre = nn.Conv1d(ssl_dim, hidden_channels, kernel_size=5, padding=2)
+        if ssl_dim is None:
+            self.pre = nn.LazyConv1d(hidden_channels, kernel_size=5, padding=2)
+        else:
+            self.pre = nn.Conv1d(ssl_dim, hidden_channels, kernel_size=5, padding=2)
 
         self.enc_p = TextEncoder(
             inter_channels,

From 61e885e2727212f1a3c4a0d70d111f29587c959f Mon Sep 17 00:00:00 2001
From: 34j <55338215+34j@users.noreply.github.com>
Date: Sun, 2 Apr 2023 22:50:37 +0900
Subject: [PATCH 35/47] refactor: move hifigan discriminators

---
 .../modules/decoders/hifigan/_models.py       | 161 +-----------------
 .../modules/descriminators.py                 |  35 +++-
 2 files changed, 36 insertions(+), 160 deletions(-)

diff --git a/src/so_vits_svc_fork/modules/decoders/hifigan/_models.py b/src/so_vits_svc_fork/modules/decoders/hifigan/_models.py
index 07ae35b5..92b5bd06 100644
--- a/src/so_vits_svc_fork/modules/decoders/hifigan/_models.py
+++ b/src/so_vits_svc_fork/modules/decoders/hifigan/_models.py
@@ -4,8 +4,8 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d
-from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm
+from torch.nn import Conv1d, ConvTranspose1d
+from torch.nn.utils import remove_weight_norm, weight_norm
 
 from ._utils import get_padding, init_weights
 
@@ -442,160 +442,3 @@ def remove_weight_norm(self):
             l.remove_weight_norm()
         remove_weight_norm(self.conv_pre)
         remove_weight_norm(self.conv_post)
-
-
-class DiscriminatorP(torch.nn.Module):
-    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
-        super().__init__()
-        self.period = period
-        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
-        self.convs = nn.ModuleList(
-            [
-                norm_f(
-                    Conv2d(
-                        1,
-                        32,
-                        (kernel_size, 1),
-                        (stride, 1),
-                        padding=(get_padding(5, 1), 0),
-                    )
-                ),
-                norm_f(
-                    Conv2d(
-                        32,
-                        128,
-                        (kernel_size, 1),
-                        (stride, 1),
-                        padding=(get_padding(5, 1), 0),
-                    )
-                ),
-                norm_f(
-                    Conv2d(
-                        128,
-                        512,
-                        (kernel_size, 1),
-                        (stride, 1),
-                        padding=(get_padding(5, 1), 0),
-                    )
-                ),
-                norm_f(
-                    Conv2d(
-                        512,
-                        1024,
-                        (kernel_size, 1),
-                        (stride, 1),
-                        padding=(get_padding(5, 1), 0),
-                    )
-                ),
-                norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))),
-            ]
-        )
-        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
-
-    def forward(self, x):
-        fmap = []
-
-        # 1d to 2d
-        b, c, t = x.shape
-        if t % self.period != 0:  # pad first
-            n_pad = self.period - (t % self.period)
-            x = F.pad(x, (0, n_pad), "reflect")
-            t = t + n_pad
-        x = x.view(b, c, t // self.period, self.period)
-
-        for l in self.convs:
-            x = l(x)
-            x = F.leaky_relu(x, LRELU_SLOPE)
-            fmap.append(x)
-        x = self.conv_post(x)
-        fmap.append(x)
-        x = torch.flatten(x, 1, -1)
-
-        return x, fmap
-
-
-class MultiPeriodDiscriminator(torch.nn.Module):
-    def __init__(self, periods=None):
-        super().__init__()
-        self.periods = periods if periods is not None else [2, 3, 5, 7, 11]
-        self.discriminators = nn.ModuleList()
-        for period in self.periods:
-            self.discriminators.append(DiscriminatorP(period))
-
-    def forward(self, y, y_hat):
-        y_d_rs = []
-        y_d_gs = []
-        fmap_rs = []
-        fmap_gs = []
-        for i, d in enumerate(self.discriminators):
-            y_d_r, fmap_r = d(y)
-            y_d_g, fmap_g = d(y_hat)
-            y_d_rs.append(y_d_r)
-            fmap_rs.append(fmap_r)
-            y_d_gs.append(y_d_g)
-            fmap_gs.append(fmap_g)
-
-        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
-
-
-class DiscriminatorS(torch.nn.Module):
-    def __init__(self, use_spectral_norm=False):
-        super().__init__()
-        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
-        self.convs = nn.ModuleList(
-            [
-                norm_f(Conv1d(1, 128, 15, 1, padding=7)),
-                norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)),
-                norm_f(Conv1d(128, 256, 41, 2, groups=16, padding=20)),
-                norm_f(Conv1d(256, 512, 41, 4, groups=16, padding=20)),
-                norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)),
-                norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)),
-                norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
-            ]
-        )
-        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
-
-    def forward(self, x):
-        fmap = []
-        for l in self.convs:
-            x = l(x)
-            x = F.leaky_relu(x, LRELU_SLOPE)
-            fmap.append(x)
-        x = self.conv_post(x)
-        fmap.append(x)
-        x = torch.flatten(x, 1, -1)
-
-        return x, fmap
-
-
-class MultiScaleDiscriminator(torch.nn.Module):
-    def __init__(self):
-        super().__init__()
-        self.discriminators = nn.ModuleList(
-            [
-                DiscriminatorS(use_spectral_norm=True),
-                DiscriminatorS(),
-                DiscriminatorS(),
-            ]
-        )
-        self.meanpools = nn.ModuleList(
-            [AvgPool1d(4, 2, padding=2), AvgPool1d(4, 2, padding=2)]
-        )
-
-    def forward(self, y, y_hat):
-        y_d_rs = []
-        y_d_gs = []
-        fmap_rs = []
-        fmap_gs = []
-        for i, d in enumerate(self.discriminators):
-            if i != 0:
-                y = self.meanpools[i - 1](y)
-                y_hat = self.meanpools[i - 1](y_hat)
-            y_d_r, fmap_r = d(y)
-            y_d_g, fmap_g = d(y_hat)
-            y_d_rs.append(y_d_r)
-            fmap_rs.append(fmap_r)
-            y_d_gs.append(y_d_g)
-            fmap_gs.append(fmap_g)
-
-        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
diff --git a/src/so_vits_svc_fork/modules/descriminators.py b/src/so_vits_svc_fork/modules/descriminators.py
index dbffd86b..a59b1e5f 100644
--- a/src/so_vits_svc_fork/modules/descriminators.py
+++ b/src/so_vits_svc_fork/modules/descriminators.py
@@ -1,6 +1,6 @@
 import torch
 from torch import nn
-from torch.nn import Conv1d, Conv2d
+from torch.nn import AvgPool1d, Conv1d, Conv2d
 from torch.nn import functional as F
 from torch.nn.utils import spectral_norm, weight_norm
 
@@ -142,3 +142,36 @@ def forward(self, y, y_hat):
             fmap_gs.append(fmap_g)
 
         return y_d_rs, y_d_gs, fmap_rs, fmap_gs
+
+
+class MultiScaleDiscriminator(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.discriminators = nn.ModuleList(
+            [
+                DiscriminatorS(use_spectral_norm=True),
+                DiscriminatorS(),
+                DiscriminatorS(),
+            ]
+        )
+        self.meanpools = nn.ModuleList(
+            [AvgPool1d(4, 2, padding=2), AvgPool1d(4, 2, padding=2)]
+        )
+
+    def forward(self, y, y_hat):
+        y_d_rs = []
+        y_d_gs = []
+        fmap_rs = []
+        fmap_gs = []
+        for i, d in enumerate(self.discriminators):
+            if i != 0:
+                y = self.meanpools[i - 1](y)
+                y_hat = self.meanpools[i - 1](y_hat)
+            y_d_r, fmap_r = d(y)
+            y_d_g, fmap_g = d(y_hat)
+            y_d_rs.append(y_d_r)
+            fmap_rs.append(fmap_r)
+            y_d_gs.append(y_d_g)
+            fmap_gs.append(fmap_g)
+
+        return y_d_rs, y_d_gs, fmap_rs, fmap_gs

From 88aaa3ae953747f714228f66bd064a1231e7b195 Mon Sep 17 00:00:00 2001
From: 34j <55338215+34j@users.noreply.github.com>
Date: Sun, 2 Apr 2023 22:55:21 +0900
Subject: [PATCH 36/47] refactor(hifigan): remove duplicated or unused code

---
 .../modules/decoders/hifigan/_models.py       | 137 +-----------------
 .../modules/decoders/hifigan/_utils.py        |  56 -------
 2 files changed, 2 insertions(+), 191 deletions(-)

diff --git a/src/so_vits_svc_fork/modules/decoders/hifigan/_models.py b/src/so_vits_svc_fork/modules/decoders/hifigan/_models.py
index 92b5bd06..4bc5d1ef 100644
--- a/src/so_vits_svc_fork/modules/decoders/hifigan/_models.py
+++ b/src/so_vits_svc_fork/modules/decoders/hifigan/_models.py
@@ -7,147 +7,14 @@
 from torch.nn import Conv1d, ConvTranspose1d
 from torch.nn.utils import remove_weight_norm, weight_norm
 
-from ._utils import get_padding, init_weights
+from ...modules import ResBlock1, ResBlock2
+from ._utils import init_weights
 
 LOG = getLogger(__name__)
 
 LRELU_SLOPE = 0.1
 
 
-class ResBlock1(torch.nn.Module):
-    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
-        super().__init__()
-        self.h = h
-        self.convs1 = nn.ModuleList(
-            [
-                weight_norm(
-                    Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=dilation[0],
-                        padding=get_padding(kernel_size, dilation[0]),
-                    )
-                ),
-                weight_norm(
-                    Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=dilation[1],
-                        padding=get_padding(kernel_size, dilation[1]),
-                    )
-                ),
-                weight_norm(
-                    Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=dilation[2],
-                        padding=get_padding(kernel_size, dilation[2]),
-                    )
-                ),
-            ]
-        )
-        self.convs1.apply(init_weights)
-
-        self.convs2 = nn.ModuleList(
-            [
-                weight_norm(
-                    Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=1,
-                        padding=get_padding(kernel_size, 1),
-                    )
-                ),
-                weight_norm(
-                    Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=1,
-                        padding=get_padding(kernel_size, 1),
-                    )
-                ),
-                weight_norm(
-                    Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=1,
-                        padding=get_padding(kernel_size, 1),
-                    )
-                ),
-            ]
-        )
-        self.convs2.apply(init_weights)
-
-    def forward(self, x):
-        for c1, c2 in zip(self.convs1, self.convs2):
-            xt = F.leaky_relu(x, LRELU_SLOPE)
-            xt = c1(xt)
-            xt = F.leaky_relu(xt, LRELU_SLOPE)
-            xt = c2(xt)
-            x = xt + x
-        return x
-
-    def remove_weight_norm(self):
-        for l in self.convs1:
-            remove_weight_norm(l)
-        for l in self.convs2:
-            remove_weight_norm(l)
-
-
-class ResBlock2(torch.nn.Module):
-    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)):
-        super().__init__()
-        self.h = h
-        self.convs = nn.ModuleList(
-            [
-                weight_norm(
-                    Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=dilation[0],
-                        padding=get_padding(kernel_size, dilation[0]),
-                    )
-                ),
-                weight_norm(
-                    Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=dilation[1],
-                        padding=get_padding(kernel_size, dilation[1]),
-                    )
-                ),
-            ]
-        )
-        self.convs.apply(init_weights)
-
-    def forward(self, x):
-        for c in self.convs:
-            xt = F.leaky_relu(x, LRELU_SLOPE)
-            xt = c(xt)
-            x = xt + x
-        return x
-
-    def remove_weight_norm(self):
-        for l in self.convs:
-            remove_weight_norm(l)
-
-
 def padDiff(x):
     return F.pad(
         F.pad(x, (0, 0, -1, 1), "constant", 0) - x, (0, 0, 0, -1), "constant", 0
diff --git a/src/so_vits_svc_fork/modules/decoders/hifigan/_utils.py b/src/so_vits_svc_fork/modules/decoders/hifigan/_utils.py
index 5839f9e4..8f862c11 100644
--- a/src/so_vits_svc_fork/modules/decoders/hifigan/_utils.py
+++ b/src/so_vits_svc_fork/modules/decoders/hifigan/_utils.py
@@ -1,71 +1,15 @@
-import glob
-import os
 from logging import getLogger
 
 # matplotlib.use("Agg")
-import matplotlib.pylab as plt
-import torch
-from torch.nn.utils import weight_norm
 
 LOG = getLogger(__name__)
 
 
-def plot_spectrogram(spectrogram):
-    fig, ax = plt.subplots(figsize=(10, 2))
-    im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none")
-    plt.colorbar(im, ax=ax)
-
-    fig.canvas.draw()
-    plt.close()
-
-    return fig
-
-
 def init_weights(m, mean=0.0, std=0.01):
     classname = m.__class__.__name__
     if classname.find("Conv") != -1:
         m.weight.data.normal_(mean, std)
 
 
-def apply_weight_norm(m):
-    classname = m.__class__.__name__
-    if classname.find("Conv") != -1:
-        weight_norm(m)
-
-
 def get_padding(kernel_size, dilation=1):
     return int((kernel_size * dilation - dilation) / 2)
-
-
-def load_checkpoint(filepath, device):
-    assert os.path.isfile(filepath)
-    LOG.info(f"Loading '{filepath}'")
-    checkpoint_dict = torch.load(filepath, map_location=device)
-    LOG.info("Complete.")
-    return checkpoint_dict
-
-
-def save_checkpoint(filepath, obj):
-    LOG.info(f"Saving checkpoint to {filepath}")
-    torch.save(obj, filepath)
-    LOG.info("Complete.")
-
-
-def del_old_checkpoints(cp_dir, prefix, n_models=2):
-    pattern = os.path.join(cp_dir, prefix + "????????")
-    cp_list = glob.glob(pattern)  # get checkpoint paths
-    cp_list = sorted(cp_list)  # sort by iter
-    if len(cp_list) > n_models:  # if more than n_models models are found
-        for cp in cp_list[
-            :-n_models
-        ]:  # delete the oldest models other than last n_models
-            open(cp, "w").close()  # empty file contents
-            os.unlink(cp)  # delete file (move to trash when using Colab)
-
-
-def scan_checkpoint(cp_dir, prefix):
-    pattern = os.path.join(cp_dir, prefix + "????????")
-    cp_list = glob.glob(pattern)
-    if len(cp_list) == 0:
-        return None
-    return sorted(cp_list)[-1]

From f074993eb1006adb35e530589033d5e26a0ac915 Mon Sep 17 00:00:00 2001
From: 34j <55338215+34j@users.noreply.github.com>
Date: Mon, 3 Apr 2023 10:57:23 +0900
Subject: [PATCH 37/47] fix(synthesizers): fix default subbands

---
 src/so_vits_svc_fork/modules/synthesizers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/so_vits_svc_fork/modules/synthesizers.py b/src/so_vits_svc_fork/modules/synthesizers.py
index 06dfda67..0f796215 100644
--- a/src/so_vits_svc_fork/modules/synthesizers.py
+++ b/src/so_vits_svc_fork/modules/synthesizers.py
@@ -50,7 +50,7 @@ def __init__(
         type_: Literal["hifi-gan", "istft", "ms-istft", "mb-istft"] = "hifi-gan",
         gen_istft_n_fft: int = 16,
         gen_istft_hop_size: int = 4,
-        subbands: int = 8,
+        subbands: int = 4,
         **kwargs: Any,
     ):
         super().__init__()

From d747bea2bbca94964baf8e77c144575d7f2dbfb9 Mon Sep 17 00:00:00 2001
From: 34j <55338215+34j@users.noreply.github.com>
Date: Mon, 3 Apr 2023 10:57:59 +0900
Subject: [PATCH 38/47] perf(inference): add torch.cuda.empty_cache

---
 src/so_vits_svc_fork/inference/core.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/so_vits_svc_fork/inference/core.py b/src/so_vits_svc_fork/inference/core.py
index 9ba6c395..fbbcd794 100644
--- a/src/so_vits_svc_fork/inference/core.py
+++ b/src/so_vits_svc_fork/inference/core.py
@@ -224,10 +224,8 @@ def infer(
             LOG.info(
                 f"Inferece time: {t.elapsed:.2f}s, RTF: {t.elapsed / audio_duration:.2f}"
             )
-        return audio, audio.shape[-1]
-
-    def clear_empty(self):
         torch.cuda.empty_cache()
+        return audio, audio.shape[-1]
 
     def infer_silence(
         self,
@@ -300,6 +298,9 @@ def infer_silence(
                 # fade_len = int(self.target_sample * fade_seconds)
                 # _audio[:fade_len] = _audio[:fade_len] * np.linspace(0, 1, fade_len)
                 # _audio[-fade_len:] = _audio[-fade_len:] * np.linspace(1, 0, fade_len)
+
+                # empty cache
+                torch.cuda.empty_cache()
             result_audio = np.concatenate([result_audio, audio_chunk_infer])
         result_audio = result_audio[: audio.shape[0]]
         return result_audio

From 83e3e0390e511dfbef01e6e566ca66b6025a15ef Mon Sep 17 00:00:00 2001
From: 34j <55338215+34j@users.noreply.github.com>
Date: Mon, 3 Apr 2023 11:05:02 +0900
Subject: [PATCH 39/47] chore(config): update quickvc config

---
 .../preprocessing/config_templates/quickvc.json        | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/so_vits_svc_fork/preprocessing/config_templates/quickvc.json b/src/so_vits_svc_fork/preprocessing/config_templates/quickvc.json
index a583731e..dbef0a69 100644
--- a/src/so_vits_svc_fork/preprocessing/config_templates/quickvc.json
+++ b/src/so_vits_svc_fork/preprocessing/config_templates/quickvc.json
@@ -19,9 +19,9 @@
     "max_speclen": 512,
     "port": "8001",
     "keep_ckpts": 3,
-    "fft_sizes": [384, 683, 171],
-    "hop_sizes": [30, 60, 10],
-    "win_lengths": [150, 300, 60],
+    "fft_sizes": [768, 1366, 342],
+    "hop_sizes": [60, 120, 20],
+    "win_lengths": [300, 600, 120],
     "window": "hann_window"
   },
   "data": {
@@ -52,9 +52,9 @@
       [1, 3, 5],
       [1, 3, 5]
     ],
-    "upsample_rates": [4, 8],
+    "upsample_rates": [8, 4],
     "upsample_initial_channel": 512,
-    "upsample_kernel_sizes": [16, 16],
+    "upsample_kernel_sizes": [32, 16],
     "n_layers_q": 3,
     "use_spectral_norm": false,
     "gin_channels": 256,

From 623aedda194a14618e53b952e64e381112d0e0ce Mon Sep 17 00:00:00 2001
From: 34j <55338215+34j@users.noreply.github.com>
Date: Mon, 3 Apr 2023 11:08:38 +0900
Subject: [PATCH 40/47] fix(hifigan): fix ResBlock initialization

---
 src/so_vits_svc_fork/modules/decoders/hifigan/_models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/so_vits_svc_fork/modules/decoders/hifigan/_models.py b/src/so_vits_svc_fork/modules/decoders/hifigan/_models.py
index 4bc5d1ef..5161ed2e 100644
--- a/src/so_vits_svc_fork/modules/decoders/hifigan/_models.py
+++ b/src/so_vits_svc_fork/modules/decoders/hifigan/_models.py
@@ -265,7 +265,7 @@ def __init__(self, h):
             for j, (k, d) in enumerate(
                 zip(h["resblock_kernel_sizes"], h["resblock_dilation_sizes"])
             ):
-                self.resblocks.append(resblock(h, ch, k, d))
+                self.resblocks.append(resblock(ch, k, d))
 
         self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
         self.ups.apply(init_weights)

From ba95f1c306f42e4446808bb31477cae36022b064 Mon Sep 17 00:00:00 2001
From: 34j <55338215+34j@users.noreply.github.com>
Date: Mon, 3 Apr 2023 11:21:23 +0900
Subject: [PATCH 41/47] docs(readme): update README.md

---
 README.md | 62 +++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 47 insertions(+), 15 deletions(-)

diff --git a/README.md b/README.md
index 93f4777c..c8e9e337 100644
--- a/README.md
+++ b/README.md
@@ -37,13 +37,15 @@ A fork of [`so-vits-svc`](https://github.com/svc-develop-team/so-vits-svc) with
 ## Features not available in the original repo
 
 - **Realtime voice conversion** (enhanced in v1.1.0)
+- Integrates [`QuickVC`](https://github.com/quickvc/QuickVC-VoiceConversion)
+- Fixed misuse of `ContentVec` in the original repository.[^c]
 - More accurate pitch estimation using CREPE
-- GUI available
-- Unified command-line interface (no need to run Python scripts)
+- GUI and unified CLI available
 - Ready to use just by installing with `pip`.
-- Automatically download pretrained base model and HuBERT model
+- Automatically download pretrained models
 - Code completely formatted with black, isort, autoflake etc.
-- Other minor differences
+
+[^c]: [#206](https://github.com/34j/so-vits-svc-fork/issues/206)
 
 ## Installation
 
@@ -53,9 +55,32 @@ A fork of [`so-vits-svc`](https://github.com/svc-develop-team/so-vits-svc) with
   <img src="https://img.shields.io/badge/.bat-download-blue?style=flat-square&logo=windows" alt="Download .bat">
 </a>
 
-### [Creating a Virtual Environment](https://github.com/34j/so-vits-svc-fork/wiki#creating-a-virtual-environment)
+<details>
+  <summary>Creating a virtual environment</summary>
+  Windows:
+
+```shell
+py -3.10 -m venv venv
+venv\Scripts\activate
+```
+
+Linux/MacOS:
+
+```shell
+python3.10 -m venv venv
+source venv/bin/activate
+```
+
+Anaconda:
 
-### Install
+```shell
+conda create -n so-vits-svc-fork python=3.10 pip
+conda activate so-vits-svc-fork
+```
+
+Installing without creating a virtual environment may cause a `PermissionError` if Python is installed in Program Files, etc.
+
+</details>
 
 Install this via pip (or your favourite package manager that uses pip):
 
@@ -65,11 +90,14 @@ pip install -U torch torchaudio --index-url https://download.pytorch.org/whl/cu1
 pip install -U so-vits-svc-fork
 ```
 
-- If no GPU is available, simply remove `pip install -U torch torchaudio --index-url https://download.pytorch.org/whl/cu117`.
-- If you are using an AMD GPU on Linux, replace `--index-url https://download.pytorch.org/whl/cu117` with `--index-url https://download.pytorch.org/whl/rocm5.4.2`. AMD GPUs are not supported on Windows (#120).
-- If `fairseq` raises an error:
-  - If it prompts [`Microsoft C++ Build Tools`](https://visualstudio.microsoft.com/visual-cpp-build-tools/) is not installed. please install it.
-  - If it prompts that some dll is missing, reinstalling `Microsoft Visual C++ 2022` and `Windows SDK` may help.
+<details>
+  <summary>Notes</summary>
+  - If no GPU is available, simply remove `pip install -U torch torchaudio --index-url https://download.pytorch.org/whl/cu117`.
+  - If you are using an AMD GPU on Linux, replace `--index-url https://download.pytorch.org/whl/cu117` with `--index-url https://download.pytorch.org/whl/rocm5.4.2`. AMD GPUs are not supported on Windows (#120).
+  - If `fairseq` raises an error:
+    - If it prompts [`Microsoft C++ Build Tools`](https://visualstudio.microsoft.com/visual-cpp-build-tools/) is not installed. please install it.
+    - If it prompts that some dll is missing, reinstalling `Microsoft Visual C++ 2022` and `Windows SDK` may help.
+</details>
 
 ### Update
 
@@ -95,18 +123,22 @@ svcg
 
 #### CLI
 
-- Realtime (from microphone)
+<details>
+  <summary>CLI</summary>
+  - Realtime (from microphone)
 
 ```shell
-svc vc --model-path <model-path>
+svc vc
 ```
 
 - File
 
 ```shell
-svc --model-path <model-path> source.wav
+svc infer source.wav
 ```
 
+</details>
+
 [Pretrained models](https://huggingface.co/models?search=so-vits-svc-4.0) are available on HuggingFace.
 
 #### Notes
@@ -130,7 +162,7 @@ svc --model-path <model-path> source.wav
 [![Open In Paperspace](https://img.shields.io/badge/Open%20in-Paperspace-blue?style=flat-square&logo=paperspace)](https://console.paperspace.com/github/34j/so-vits-svc-fork-paperspace/blob/main/so-vits-svc-fork-4.0-paperspace.ipynb)
 [![Paperspace Referral](<https://img.shields.io/badge/Referral%20($10)-9VJN74I-blue?style=flat-square&logo=paperspace>)](https://www.paperspace.com/?r=9VJN74I)[^p]
 
-If you do not have access to a GPU with more than 10 GB of VRAM, the free plan of Google Colab is recommended for light users and the Pro/Growth plan of Paperspace is recommended for heavy users. Conversely, if you have a high-end GPU, the use of cloud services is not recommended.
+If you do not have access to a GPU with more than 10 GB of VRAM, the free plan of Google Colab is recommended for light users and the Pro/Growth plan of Paperspace is recommended for heavy users. Conversely, if you have access to a high-end GPU, the use of cloud services is not recommended.
 
 [^p]: If you register a referral code and then add a payment method, you may save about $5 on your first month's monthly billing. Note that both referral rewards are Paperspace credits and not cash. It was a tough decision but inserted because debugging and training the initial model requires a large amount of computing power and the developer is a student.
 

From 76fe3d7efb002b8af324cdd599e4c16b7b0f4a41 Mon Sep 17 00:00:00 2001
From: 34j <55338215+34j@users.noreply.github.com>
Date: Mon, 3 Apr 2023 11:25:34 +0900
Subject: [PATCH 42/47] docs(readme): update README.md

---
 README.md | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index c8e9e337..a41961b4 100644
--- a/README.md
+++ b/README.md
@@ -57,7 +57,8 @@ A fork of [`so-vits-svc`](https://github.com/svc-develop-team/so-vits-svc) with
 
 <details>
   <summary>Creating a virtual environment</summary>
-  Windows:
+
+Windows:
 
 ```shell
 py -3.10 -m venv venv
@@ -92,12 +93,13 @@ pip install -U so-vits-svc-fork
 
 <details>
   <summary>Notes</summary>
-  - If no GPU is available, simply remove `pip install -U torch torchaudio --index-url https://download.pytorch.org/whl/cu117`.
-  - If you are using an AMD GPU on Linux, replace `--index-url https://download.pytorch.org/whl/cu117` with `--index-url https://download.pytorch.org/whl/rocm5.4.2`. AMD GPUs are not supported on Windows (#120).
-  - If `fairseq` raises an error:
-    - If it prompts [`Microsoft C++ Build Tools`](https://visualstudio.microsoft.com/visual-cpp-build-tools/) is not installed. please install it.
-    - If it prompts that some dll is missing, reinstalling `Microsoft Visual C++ 2022` and `Windows SDK` may help.
-</details>
+
+- If no GPU is available, simply remove `pip install -U torch torchaudio --index-url https://download.pytorch.org/whl/cu117`.
+- If you are using an AMD GPU on Linux, replace `--index-url https://download.pytorch.org/whl/cu117` with `--index-url https://download.pytorch.org/whl/rocm5.4.2`. AMD GPUs are not supported on Windows (#120).
+- If `fairseq` raises an error:
+  - If it prompts [`Microsoft C++ Build Tools`](https://visualstudio.microsoft.com/visual-cpp-build-tools/) is not installed. please install it.
+  - If it prompts that some dll is missing, reinstalling `Microsoft Visual C++ 2022` and `Windows SDK` may help.
+  </details>
 
 ### Update
 
@@ -125,7 +127,8 @@ svcg
 
 <details>
   <summary>CLI</summary>
-  - Realtime (from microphone)
+
+- Realtime (from microphone)
 
 ```shell
 svc vc
@@ -181,7 +184,7 @@ svc train -t
 
 - Dataset audio duration per file should be <~ 10s or VRAM will run out.
 - To change the f0 inference method to CREPE, replace `svc pre-hubert` with `svc pre-hubert -fm crepe`. You may need to reduce `--n-jobs` due to performance issues.
-- It is recommended to change the batch_size in `config.json` before the `train` command to match the VRAM capacity. The default value is optimized for Tesla T4 (16GB VRAM), but training is possible without that much VRAM.
+- It is recommended to increase the `batch_size` as much as possible in `config.json` before the `train` command to match the VRAM capacity.
 - Silence removal and volume normalization are automatically performed (as in the upstream repo) and are not required.
 
 ### Further help

From b97c70d7af5966f79f1367aac70696c1916f76f2 Mon Sep 17 00:00:00 2001
From: 34j <55338215+34j@users.noreply.github.com>
Date: Mon, 3 Apr 2023 11:28:34 +0900
Subject: [PATCH 43/47] docs(readme): update README.md

---
 README.md | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index a41961b4..b87697c1 100644
--- a/README.md
+++ b/README.md
@@ -95,7 +95,7 @@ pip install -U so-vits-svc-fork
   <summary>Notes</summary>
 
 - If no GPU is available, simply remove `pip install -U torch torchaudio --index-url https://download.pytorch.org/whl/cu117`.
-- If you are using an AMD GPU on Linux, replace `--index-url https://download.pytorch.org/whl/cu117` with `--index-url https://download.pytorch.org/whl/rocm5.4.2`. AMD GPUs are not supported on Windows (#120).
+- If you are using an AMD GPU on Linux, replace `--index-url https://download.pytorch.org/whl/cu117` with `--index-url https://download.pytorch.org/whl/rocm5.4.2`. AMD GPUs are not supported on Windows ([#120](https://github.com/34j/so-vits-svc-fork/issues/120)).
 - If `fairseq` raises an error:
   - If it prompts [`Microsoft C++ Build Tools`](https://visualstudio.microsoft.com/visual-cpp-build-tools/) is not installed. please install it.
   - If it prompts that some dll is missing, reinstalling `Microsoft Visual C++ 2022` and `Windows SDK` may help.
@@ -125,9 +125,6 @@ svcg
 
 #### CLI
 
-<details>
-  <summary>CLI</summary>
-
 - Realtime (from microphone)
 
 ```shell
@@ -140,8 +137,6 @@ svc vc
 svc infer source.wav
 ```
 
-</details>
-
 [Pretrained models](https://huggingface.co/models?search=so-vits-svc-4.0) are available on HuggingFace.
 
 #### Notes
@@ -154,8 +149,8 @@ svc infer source.wav
 #### Before training
 
 - If your dataset has BGM, please remove the BGM using software such as [Ultimate Vocal Remover](https://ultimatevocalremover.com/). `3_HP-Vocal-UVR.pth` or `UVR-MDX-NET Main` is recommended. [^1]
-- If your dataset is a long audio file with multiple speakers, use `svc sd` to split the dataset into multiple files (using `pyannote.audio`). Further manual classification may be necessary due to accuracy issues. If speakers speak with a variety of speech styles, set --min-speakers larger than the actual number of speakers. Due to unresolved dependencies, please install `pyannote.audio` manually: `pip install pyannote-audio`.
 - If your dataset is a long audio file with a single speaker, use `svc split` to split the dataset into multiple files (using `librosa`).
+- If your dataset is a long audio file with multiple speakers, use `svc sd` to split the dataset into multiple files (using `pyannote.audio`). Further manual classification may be necessary due to accuracy issues. If speakers speak with a variety of speech styles, set --min-speakers larger than the actual number of speakers. Due to unresolved dependencies, please install `pyannote.audio` manually: `pip install pyannote-audio`.
 
 [^1]: https://ytpmv.info/how-to-use-uvr/
 

From 033ca785693962c6380685179a8bc2ae2cf20de7 Mon Sep 17 00:00:00 2001
From: 34j <55338215+34j@users.noreply.github.com>
Date: Mon, 3 Apr 2023 11:29:54 +0900
Subject: [PATCH 44/47] docs(readme): update README.md

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index b87697c1..c3f29d3d 100644
--- a/README.md
+++ b/README.md
@@ -55,6 +55,8 @@ A fork of [`so-vits-svc`](https://github.com/svc-develop-team/so-vits-svc) with
   <img src="https://img.shields.io/badge/.bat-download-blue?style=flat-square&logo=windows" alt="Download .bat">
 </a>
 
+### Manual installation
+
 <details>
   <summary>Creating a virtual environment</summary>
 

From 774cd33e67a71736001488f2d2d7c1cb9439321e Mon Sep 17 00:00:00 2001
From: 34j <55338215+34j@users.noreply.github.com>
Date: Mon, 3 Apr 2023 11:32:25 +0900
Subject: [PATCH 45/47] docs(readme): update README.md

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index c3f29d3d..9b0a4967 100644
--- a/README.md
+++ b/README.md
@@ -39,10 +39,10 @@ A fork of [`so-vits-svc`](https://github.com/svc-develop-team/so-vits-svc) with
 - **Realtime voice conversion** (enhanced in v1.1.0)
 - Integrates [`QuickVC`](https://github.com/quickvc/QuickVC-VoiceConversion)
 - Fixed misuse of `ContentVec` in the original repository.[^c]
-- More accurate pitch estimation using CREPE
+- More accurate pitch estimation using [`CREPE`](https://github.com/marl/crepe/).
 - GUI and unified CLI available
 - Ready to use just by installing with `pip`.
-- Automatically download pretrained models
+- Automatically download pretrained models.
 - Code completely formatted with black, isort, autoflake etc.
 
 [^c]: [#206](https://github.com/34j/so-vits-svc-fork/issues/206)

From 229f9ba59554e22cb0b7fb070f8060be9f161722 Mon Sep 17 00:00:00 2001
From: 34j <55338215+34j@users.noreply.github.com>
Date: Mon, 3 Apr 2023 11:38:52 +0900
Subject: [PATCH 46/47] docs(readme): update README.md

---
 README.md | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 9b0a4967..96f172e5 100644
--- a/README.md
+++ b/README.md
@@ -89,15 +89,15 @@ Install this via pip (or your favourite package manager that uses pip):
 
 ```shell
 python -m pip install -U pip setuptools wheel
-pip install -U torch torchaudio --index-url https://download.pytorch.org/whl/cu117
+pip install -U torch torchaudio --index-url https://download.pytorch.org/whl/cu118
 pip install -U so-vits-svc-fork
 ```
 
 <details>
   <summary>Notes</summary>
 
-- If no GPU is available, simply remove `pip install -U torch torchaudio --index-url https://download.pytorch.org/whl/cu117`.
-- If you are using an AMD GPU on Linux, replace `--index-url https://download.pytorch.org/whl/cu117` with `--index-url https://download.pytorch.org/whl/rocm5.4.2`. AMD GPUs are not supported on Windows ([#120](https://github.com/34j/so-vits-svc-fork/issues/120)).
+- If no GPU is available, simply remove `pip install -U torch torchaudio --index-url https://download.pytorch.org/whl/cu118`.
+- If you are using an AMD GPU on Linux, replace `--index-url https://download.pytorch.org/whl/cu118` with `--index-url https://download.pytorch.org/whl/rocm5.4.2`. AMD GPUs are not supported on Windows ([#120](https://github.com/34j/so-vits-svc-fork/issues/120)).
 - If `fairseq` raises an error:
   - If it prompts [`Microsoft C++ Build Tools`](https://visualstudio.microsoft.com/visual-cpp-build-tools/) is not installed. please install it.
   - If it prompts that some dll is missing, reinstalling `Microsoft Visual C++ 2022` and `Windows SDK` may help.
@@ -180,8 +180,9 @@ svc train -t
 #### Notes
 
 - Dataset audio duration per file should be <~ 10s or VRAM will run out.
-- To change the f0 inference method to CREPE, replace `svc pre-hubert` with `svc pre-hubert -fm crepe`. You may need to reduce `--n-jobs` due to performance issues.
 - It is recommended to increase the `batch_size` as much as possible in `config.json` before the `train` command to match the VRAM capacity.
+- To use `CREPE`, replace `svc pre-hubert` with `svc pre-hubert -fm crepe`.
+- To use `QuickVC`, replace `svc pre-config` with `svc pre-config -t quickvc`.
 - Silence removal and volume normalization are automatically performed (as in the upstream repo) and are not required.
 
 ### Further help

From 17fa86c12738273988ef00523f923505ea98a9db Mon Sep 17 00:00:00 2001
From: 34j <55338215+34j@users.noreply.github.com>
Date: Mon, 3 Apr 2023 11:40:56 +0900
Subject: [PATCH 47/47] fix(pre_hubert): fix vram estimation

---
 src/so_vits_svc_fork/preprocessing/preprocess_hubert_f0.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/so_vits_svc_fork/preprocessing/preprocess_hubert_f0.py b/src/so_vits_svc_fork/preprocessing/preprocess_hubert_f0.py
index b0034a93..3c7618a5 100644
--- a/src/so_vits_svc_fork/preprocessing/preprocess_hubert_f0.py
+++ b/src/so_vits_svc_fork/preprocessing/preprocess_hubert_f0.py
@@ -18,7 +18,8 @@
 from .preprocess_utils import check_hubert_min_duration
 
 LOG = getLogger(__name__)
-HUBERT_MEMORY = 1250
+HUBERT_MEMORY = 1600
+HUBERT_MEMORY_CREPE = 2600
 
 
 def _process_one(
@@ -99,7 +100,9 @@ def preprocess_hubert_f0(
     hps = utils.get_hparams(config_path)
     if n_jobs is None:
         memory = get_total_gpu_memory("free")
-        n_jobs = memory // HUBERT_MEMORY
+        n_jobs = memory // (
+            HUBERT_MEMORY_CREPE if f0_method == "crepe" else HUBERT_MEMORY
+        )
         LOG.info(f"n_jobs automatically set to {n_jobs}, memory: {memory} MiB")
 
     filepaths = list(input_dir.rglob("*.wav"))