voicepaw · 34j · Apr 8, 2023 · Apr 7, 2023 · Apr 7, 2023 · Apr 7, 2023
diff --git a/.gitignore b/.gitignore
@@ -151,6 +151,8 @@ tests/**/*.download
 tests/**/*.lab
 tests/**/*.pdf
 tests/**/*.csv
+tests/**/*.ckpt
+tests/**/*.yaml
 *.tfevents.*
 *.pt
 user_gui_presets.json
diff --git a/README.md b/README.md
@@ -41,6 +41,7 @@ A fork of [`so-vits-svc`](https://github.com/svc-develop-team/so-vits-svc) with
 - Fixed misuse of `ContentVec` in the original repository.[^c]
 - More accurate pitch estimation using [`CREPE`](https://github.com/marl/crepe/).
 - GUI and unified CLI available
+- ~2x faster training
 - Ready to use just by installing with `pip`.
 - Automatically download pretrained models.
 - Code completely formatted with black, isort, autoflake etc.

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -63,6 +63,8 @@ pysimplegui = ">=4.6"
 pebble = ">=5.0"
 torchcrepe = ">=0.0.17"
 unidecode = "^1.3.6"
+lightning = "^2.0.1"
+fastapi = "<0.89"
 
 [tool.poetry.group.dev.dependencies]
 pre-commit = ">=3"

diff --git a/src/so_vits_svc_fork/__main__.py b/src/so_vits_svc_fork/__main__.py
@@ -10,6 +10,7 @@
 import torch
 
 from so_vits_svc_fork import __version__
+from so_vits_svc_fork.utils import get_optimal_device
 
 LOG = getLogger(__name__)
 
@@ -190,7 +191,7 @@ def train(
     "-d",
     "--device",
     type=str,
-    default="cuda" if torch.cuda.is_available() else "cpu",
+    default=get_optimal_device(),
     help="device",
 )
 @click.option("-ch", "--chunk-seconds", type=float, default=0.5, help="chunk seconds")
@@ -220,7 +221,7 @@ def infer(
     pad_seconds: float = 0.5,
     chunk_seconds: float = 0.5,
     absolute_thresh: bool = False,
-    device: Literal["cpu", "cuda"] = "cuda" if torch.cuda.is_available() else "cpu",
+    device: str | torch.device = get_optimal_device(),
 ):
     """Inference"""
     from so_vits_svc_fork.inference.main import infer
@@ -339,7 +340,7 @@ def infer(
     "-d",
     "--device",
     type=str,
-    default="cuda" if torch.cuda.is_available() else "cpu",
+    default=get_optimal_device(),
     help="device",
 )
 @click.option("-s", "--speaker", type=str, default=None, help="speaker name")
@@ -378,7 +379,7 @@ def vc(
     version: int,
     input_device: int | str | None,
     output_device: int | str | None,
-    device: Literal["cpu", "cuda"],
+    device: torch.device | str,
     passthrough_original: bool = False,
 ) -> None:
     """Realtime inference from microphone"""
@@ -759,7 +760,9 @@ def clean():
     default="cpu",
     help="device to use",
 )
-def onnx(input_path: Path, output_path: Path, config_path: Path, device: str) -> None:
+def onnx(
+    input_path: Path, output_path: Path, config_path: Path, device: torch.device | str
+) -> None:
     raise NotImplementedError("ONNX export is not yet supported")
     """Export model to onnx"""
     input_path = Path(input_path)

diff --git a/src/so_vits_svc_fork/data_utils.py → src/so_vits_svc_fork/dataset.py b/src/so_vits_svc_fork/data_utils.py → src/so_vits_svc_fork/dataset.py
@@ -5,13 +5,14 @@
 from typing import Sequence
 
 import torch
+import torch.nn as nn
 import torch.nn.functional as F
-import torch.utils.data
+from torch.utils.data import Dataset
 
 from .hparams import HParams
 
 
-class TextAudioSpeakerLoader(torch.utils.data.Dataset):
+class TextAudioDataset(Dataset):
     def __init__(self, hps: HParams, is_validation: bool = False):
         self.datapaths = [
             Path(x).parent / (Path(x).name + ".data.pt")
@@ -59,8 +60,8 @@ def _pad_stack(array: Sequence[torch.Tensor]) -> torch.Tensor:
     return torch.stack(x_padded)
 
 
-class TextAudioCollate:
-    def __call__(
+class TextAudioCollate(nn.Module):
+    def forward(
         self, batch: Sequence[dict[str, torch.Tensor]]
     ) -> tuple[torch.Tensor, ...]:
         batch = [b for b in batch if b is not None]

diff --git a/src/so_vits_svc_fork/f0.py b/src/so_vits_svc_fork/f0.py
@@ -10,6 +10,8 @@
 from numpy import dtype, float32, ndarray
 from torch import FloatTensor, Tensor
 
+from so_vits_svc_fork.utils import get_optimal_device
+
 LOG = getLogger(__name__)
 
 
@@ -154,7 +156,7 @@ def compute_f0_crepe(
     p_len: None | int = None,
     sampling_rate: int = 44100,
     hop_length: int = 512,
-    device: str = "cuda" if torch.cuda.is_available() else "cpu",
+    device: str | torch.device = get_optimal_device(),
     model: Literal["full", "tiny"] = "full",
 ):
     audio = torch.from_numpy(wav_numpy).to(device, copy=True)

diff --git a/src/so_vits_svc_fork/gui.py b/src/so_vits_svc_fork/gui.py
@@ -11,7 +11,7 @@
 from pebble import ProcessFuture, ProcessPool
 from tqdm.tk import tqdm_tk
 
-from .utils import ensure_pretrained_model
+from .utils import ensure_pretrained_model, get_optimal_device
 
 GUI_DEFAULT_PRESETS_PATH = Path(__file__).parent / "default_gui_presets.json"
 GUI_PRESETS_PATH = Path("./user_gui_presets.json").absolute()
@@ -411,21 +411,14 @@ def main():
             [
                 sg.Checkbox(
                     key="use_gpu",
-                    default=(
-                        torch.cuda.is_available() or torch.backends.mps.is_available()
-                    ),
+                    default=get_optimal_device() != torch.device("cpu"),
                     text="Use GPU"
                     + (
                         " (not available; if your device has GPU, make sure you installed PyTorch with CUDA support)"
-                        if not (
-                            torch.cuda.is_available()
-                            or torch.backends.mps.is_available()
-                        )
+                        if get_optimal_device() == torch.device("cpu")
                         else ""
                     ),
-                    disabled=not (
-                        torch.cuda.is_available() or torch.backends.mps.is_available()
-                    ),
+                    disabled=get_optimal_device() == torch.device("cpu"),
                 )
             ],
             [
@@ -579,15 +572,7 @@ def apply_preset(name: str) -> None:
                         pad_seconds=values["pad_seconds"],
                         chunk_seconds=values["chunk_seconds"],
                         absolute_thresh=values["absolute_thresh"],
-                        device="cpu"
-                        if not values["use_gpu"]
-                        else (
-                            "cuda"
-                            if torch.cuda.is_available()
-                            else "mps"
-                            if torch.backends.mps.is_available()
-                            else "cpu"
-                        ),
+                        device="cpu" if not values["use_gpu"] else get_optimal_device(),
                     )
                     if values["auto_play"]:
                         pool.schedule(play_audio, args=[output_path])
@@ -641,7 +626,7 @@ def apply_preset(name: str) -> None:
                         output_device=output_device_indices[
                             window["output_device"].widget.current()
                         ],
-                        device="cuda" if values["use_gpu"] else "cpu",
+                        device=get_optimal_device() if values["use_gpu"] else "cpu",
                         passthrough_original=values["passthrough_original"],
                     ),
                 )

diff --git a/src/so_vits_svc_fork/inference/core.py b/src/so_vits_svc_fork/inference/core.py
@@ -16,6 +16,7 @@
 from so_vits_svc_fork import cluster, utils
 
 from ..modules.synthesizers import SynthesizerTrn
+from ..utils import get_optimal_device
 
 LOG = getLogger(__name__)
 
@@ -98,7 +99,7 @@ def __init__(
     ):
         self.net_g_path = net_g_path
         if device is None:
-            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+            self.device = get_optimal_device()
         else:
             self.device = torch.device(device)
         self.hps = utils.get_hparams(config_path)

diff --git a/src/so_vits_svc_fork/inference/main.py b/src/so_vits_svc_fork/inference/main.py
@@ -11,6 +11,7 @@
 from cm_time import timer
 
 from so_vits_svc_fork.inference.core import RealtimeVC, RealtimeVC2, Svc
+from so_vits_svc_fork.utils import get_optimal_device
 
 LOG = getLogger(__name__)
 
@@ -35,7 +36,7 @@ def infer(
     pad_seconds: float = 0.5,
     chunk_seconds: float = 0.5,
     absolute_thresh: bool = False,
-    device: Literal["cpu", "cuda"] = "cuda" if torch.cuda.is_available() else "cpu",
+    device: str | torch.device = get_optimal_device(),
 ):
     model_path = Path(model_path)
     output_path = Path(output_path)
@@ -94,7 +95,7 @@ def realtime(
     version: int = 2,
     input_device: int | str | None = None,
     output_device: int | str | None = None,
-    device: Literal["cpu", "cuda"] = "cuda" if torch.cuda.is_available() else "cpu",
+    device: str | torch.device = get_optimal_device(),
     passthrough_original: bool = False,
 ):
     import sounddevice as sd

diff --git a/src/so_vits_svc_fork/modules/decoders/hifigan/__init__.py b/src/so_vits_svc_fork/modules/decoders/hifigan/__init__.py
@@ -1,3 +1,3 @@
-from ._models import Generator
+from ._models import NSFHifiGANGenerator
 
-__all__ = ["Generator"]
+__all__ = ["NSFHifiGANGenerator"]
diff --git a/src/so_vits_svc_fork/modules/decoders/hifigan/_models.py b/src/so_vits_svc_fork/modules/decoders/hifigan/_models.py
@@ -214,7 +214,7 @@ def forward(self, x):
         return sine_merge, noise, uv
 
 
-class Generator(torch.nn.Module):
+class NSFHifiGANGenerator(torch.nn.Module):
     def __init__(self, h):
         super().__init__()
         self.h = h

diff --git a/src/so_vits_svc_fork/modules/decoders/mb_istft/_generators.py b/src/so_vits_svc_fork/modules/decoders/mb_istft/_generators.py
@@ -360,7 +360,7 @@ def forward(self, x, g=None):
 
         y_mb_hat = F.conv_transpose1d(
             y_mb_hat,
-            self.updown_filter.cuda(x.device) * self.subbands,
+            self.updown_filter.to(x.device) * self.subbands,
             stride=self.subbands,
         )
 

diff --git a/src/so_vits_svc_fork/modules/decoders/mb_istft/_pqmf.py b/src/so_vits_svc_fork/modules/decoders/mb_istft/_pqmf.py
@@ -85,17 +85,15 @@ def __init__(self, device, subbands=8, taps=62, cutoff_ratio=0.15, beta=9.0):
             )
 
         # convert to tensor
-        analysis_filter = torch.from_numpy(h_analysis).float().unsqueeze(1).cuda(device)
-        synthesis_filter = (
-            torch.from_numpy(h_synthesis).float().unsqueeze(0).cuda(device)
-        )
+        analysis_filter = torch.from_numpy(h_analysis).float().unsqueeze(1).to(device)
+        synthesis_filter = torch.from_numpy(h_synthesis).float().unsqueeze(0).to(device)
 
         # register coefficients as buffer
         self.register_buffer("analysis_filter", analysis_filter)
         self.register_buffer("synthesis_filter", synthesis_filter)
 
         # filter for downsampling & upsampling
-        updown_filter = torch.zeros((subbands, subbands, subbands)).float().cuda(device)
+        updown_filter = torch.zeros((subbands, subbands, subbands)).float().to(device)
         for k in range(subbands):
             updown_filter[k, k, 0] = 1.0
         self.register_buffer("updown_filter", updown_filter)

diff --git a/src/so_vits_svc_fork/modules/decoders/mb_istft/_stft.py b/src/so_vits_svc_fork/modules/decoders/mb_istft/_stft.py
@@ -182,11 +182,7 @@ def inverse(self, magnitude, phase):
             window_sum = torch.autograd.Variable(
                 torch.from_numpy(window_sum), requires_grad=False
             )
-            window_sum = (
-                window_sum.to(inverse_transform.device())
-                if magnitude.is_cuda
-                else window_sum
-            )
+            window_sum = window_sum.to(inverse_transform.device)
             inverse_transform[:, :, approx_nonzero_indices] /= window_sum[
                 approx_nonzero_indices
             ]

diff --git a/src/so_vits_svc_fork/modules/synthesizers.py b/src/so_vits_svc_fork/modules/synthesizers.py
@@ -9,7 +9,7 @@
 from so_vits_svc_fork.f0 import f0_to_coarse
 from so_vits_svc_fork.modules import commons as commons
 from so_vits_svc_fork.modules.decoders.f0 import F0Decoder
-from so_vits_svc_fork.modules.decoders.hifigan import Generator
+from so_vits_svc_fork.modules.decoders.hifigan import NSFHifiGANGenerator
 from so_vits_svc_fork.modules.decoders.mb_istft import (
     Multiband_iSTFT_Generator,
     Multistream_iSTFT_Generator,
@@ -110,7 +110,7 @@ def __init__(
                 "upsample_kernel_sizes": upsample_kernel_sizes,
                 "gin_channels": gin_channels,
             }
-            self.dec = Generator(h=hps)
+            self.dec = NSFHifiGANGenerator(h=hps)
             self.mb = False
         else:
             hps = {

diff --git a/src/so_vits_svc_fork/preprocessing/preprocess_hubert_f0.py b/src/so_vits_svc_fork/preprocessing/preprocess_hubert_f0.py
@@ -18,7 +18,7 @@
 
 from ..hparams import HParams
 from ..modules.mel_processing import spec_to_mel_torch, spectrogram_torch
-from ..utils import get_total_gpu_memory
+from ..utils import get_optimal_device, get_total_gpu_memory
 from .preprocess_utils import check_hubert_min_duration
 
 LOG = getLogger(__name__)
@@ -30,7 +30,7 @@ def _process_one(
     *,
     filepath: Path,
     content_model: HubertModel,
-    device: Literal["cuda", "cpu"] = "cuda",
+    device: torch.device | str = get_optimal_device(),
     f0_method: Literal["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"] = "dio",
     force_rebuild: bool = False,
     hps: HParams,
@@ -54,7 +54,7 @@ def _process_one(
     uv = torch.from_numpy(uv).float()
 
     # Compute HuBERT content
-    audio = torch.from_numpy(audio).float().cuda()
+    audio = torch.from_numpy(audio).float().to(device)
     c = utils.get_content(
         content_model,
         audio,
@@ -102,7 +102,7 @@ def _process_one(
 
 
 def _process_batch(filepaths: Iterable[Path], pbar_position: int, **kwargs):
-    content_model = utils.get_hubert_model("cuda")
+    content_model = utils.get_hubert_model(get_optimal_device())
 
     for filepath in tqdm(filepaths, position=pbar_position):
         _process_one(