Skip to content

Commit

Permalink
feat: migrate to lightning (#246)
Browse files Browse the repository at this point in the history
  • Loading branch information
34j authored Apr 8, 2023
1 parent e3f3c16 commit 824ecbd
Show file tree
Hide file tree
Showing 19 changed files with 1,815 additions and 1,136 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,8 @@ tests/**/*.download
tests/**/*.lab
tests/**/*.pdf
tests/**/*.csv
tests/**/*.ckpt
tests/**/*.yaml
*.tfevents.*
*.pt
user_gui_presets.json
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ A fork of [`so-vits-svc`](https://github.com/svc-develop-team/so-vits-svc) with
- Fixed misuse of `ContentVec` in the original repository.[^c]
- More accurate pitch estimation using [`CREPE`](https://github.com/marl/crepe/).
- GUI and unified CLI available.
- ~2x faster training.
- Ready to use just by installing with `pip`.
- Automatically download pretrained models.
- Code completely formatted with black, isort, autoflake etc.
Expand Down
2,117 changes: 1,420 additions & 697 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@ pysimplegui = ">=4.6"
pebble = ">=5.0"
torchcrepe = ">=0.0.17"
unidecode = "^1.3.6"
lightning = "^2.0.1"
fastapi = "<0.89"

[tool.poetry.group.dev.dependencies]
pre-commit = ">=3"
Expand Down
13 changes: 8 additions & 5 deletions src/so_vits_svc_fork/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import torch

from so_vits_svc_fork import __version__
from so_vits_svc_fork.utils import get_optimal_device

LOG = getLogger(__name__)

Expand Down Expand Up @@ -190,7 +191,7 @@ def train(
"-d",
"--device",
type=str,
default="cuda" if torch.cuda.is_available() else "cpu",
default=get_optimal_device(),
help="device",
)
@click.option("-ch", "--chunk-seconds", type=float, default=0.5, help="chunk seconds")
Expand Down Expand Up @@ -220,7 +221,7 @@ def infer(
pad_seconds: float = 0.5,
chunk_seconds: float = 0.5,
absolute_thresh: bool = False,
device: Literal["cpu", "cuda"] = "cuda" if torch.cuda.is_available() else "cpu",
device: str | torch.device = get_optimal_device(),
):
"""Inference"""
from so_vits_svc_fork.inference.main import infer
Expand Down Expand Up @@ -339,7 +340,7 @@ def infer(
"-d",
"--device",
type=str,
default="cuda" if torch.cuda.is_available() else "cpu",
default=get_optimal_device(),
help="device",
)
@click.option("-s", "--speaker", type=str, default=None, help="speaker name")
Expand Down Expand Up @@ -378,7 +379,7 @@ def vc(
version: int,
input_device: int | str | None,
output_device: int | str | None,
device: Literal["cpu", "cuda"],
device: torch.device,
passthrough_original: bool = False,
) -> None:
"""Realtime inference from microphone"""
Expand Down Expand Up @@ -759,7 +760,9 @@ def clean():
default="cpu",
help="device to use",
)
def onnx(input_path: Path, output_path: Path, config_path: Path, device: str) -> None:
def onnx(
input_path: Path, output_path: Path, config_path: Path, device: torch.device | str
) -> None:
raise NotImplementedError("ONNX export is not yet supported")
"""Export model to onnx"""
input_path = Path(input_path)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,14 @@
from typing import Sequence

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data
from torch.utils.data import Dataset

from .hparams import HParams


class TextAudioSpeakerLoader(torch.utils.data.Dataset):
class TextAudioDataset(Dataset):
def __init__(self, hps: HParams, is_validation: bool = False):
self.datapaths = [
Path(x).parent / (Path(x).name + ".data.pt")
Expand Down Expand Up @@ -59,8 +60,8 @@ def _pad_stack(array: Sequence[torch.Tensor]) -> torch.Tensor:
return torch.stack(x_padded)


class TextAudioCollate:
def __call__(
class TextAudioCollate(nn.Module):
def forward(
self, batch: Sequence[dict[str, torch.Tensor]]
) -> tuple[torch.Tensor, ...]:
batch = [b for b in batch if b is not None]
Expand Down
4 changes: 3 additions & 1 deletion src/so_vits_svc_fork/f0.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
from numpy import dtype, float32, ndarray
from torch import FloatTensor, Tensor

from so_vits_svc_fork.utils import get_optimal_device

LOG = getLogger(__name__)


Expand Down Expand Up @@ -154,7 +156,7 @@ def compute_f0_crepe(
p_len: None | int = None,
sampling_rate: int = 44100,
hop_length: int = 512,
device: str = "cuda" if torch.cuda.is_available() else "cpu",
device: str | torch.device = get_optimal_device(),
model: Literal["full", "tiny"] = "full",
):
audio = torch.from_numpy(wav_numpy).to(device, copy=True)
Expand Down
27 changes: 6 additions & 21 deletions src/so_vits_svc_fork/gui.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from pebble import ProcessFuture, ProcessPool
from tqdm.tk import tqdm_tk

from .utils import ensure_pretrained_model
from .utils import ensure_pretrained_model, get_optimal_device

GUI_DEFAULT_PRESETS_PATH = Path(__file__).parent / "default_gui_presets.json"
GUI_PRESETS_PATH = Path("./user_gui_presets.json").absolute()
Expand Down Expand Up @@ -411,21 +411,14 @@ def main():
[
sg.Checkbox(
key="use_gpu",
default=(
torch.cuda.is_available() or torch.backends.mps.is_available()
),
default=get_optimal_device() != torch.device("cpu"),
text="Use GPU"
+ (
" (not available; if your device has GPU, make sure you installed PyTorch with CUDA support)"
if not (
torch.cuda.is_available()
or torch.backends.mps.is_available()
)
if get_optimal_device() == torch.device("cpu")
else ""
),
disabled=not (
torch.cuda.is_available() or torch.backends.mps.is_available()
),
disabled=get_optimal_device() == torch.device("cpu"),
)
],
[
Expand Down Expand Up @@ -579,15 +572,7 @@ def apply_preset(name: str) -> None:
pad_seconds=values["pad_seconds"],
chunk_seconds=values["chunk_seconds"],
absolute_thresh=values["absolute_thresh"],
device="cpu"
if not values["use_gpu"]
else (
"cuda"
if torch.cuda.is_available()
else "mps"
if torch.backends.mps.is_available()
else "cpu"
),
device="cpu" if not values["use_gpu"] else get_optimal_device(),
)
if values["auto_play"]:
pool.schedule(play_audio, args=[output_path])
Expand Down Expand Up @@ -641,7 +626,7 @@ def apply_preset(name: str) -> None:
output_device=output_device_indices[
window["output_device"].widget.current()
],
device="cuda" if values["use_gpu"] else "cpu",
device=get_optimal_device() if values["use_gpu"] else "cpu",
passthrough_original=values["passthrough_original"],
),
)
Expand Down
3 changes: 2 additions & 1 deletion src/so_vits_svc_fork/inference/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from so_vits_svc_fork import cluster, utils

from ..modules.synthesizers import SynthesizerTrn
from ..utils import get_optimal_device

LOG = getLogger(__name__)

Expand Down Expand Up @@ -98,7 +99,7 @@ def __init__(
):
self.net_g_path = net_g_path
if device is None:
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
self.device = (get_optimal_device(),)
else:
self.device = torch.device(device)
self.hps = utils.get_hparams(config_path)
Expand Down
5 changes: 3 additions & 2 deletions src/so_vits_svc_fork/inference/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from cm_time import timer

from so_vits_svc_fork.inference.core import RealtimeVC, RealtimeVC2, Svc
from so_vits_svc_fork.utils import get_optimal_device

LOG = getLogger(__name__)

Expand All @@ -35,7 +36,7 @@ def infer(
pad_seconds: float = 0.5,
chunk_seconds: float = 0.5,
absolute_thresh: bool = False,
device: Literal["cpu", "cuda"] = "cuda" if torch.cuda.is_available() else "cpu",
device: str | torch.device = get_optimal_device(),
):
model_path = Path(model_path)
output_path = Path(output_path)
Expand Down Expand Up @@ -94,7 +95,7 @@ def realtime(
version: int = 2,
input_device: int | str | None = None,
output_device: int | str | None = None,
device: Literal["cpu", "cuda"] = "cuda" if torch.cuda.is_available() else "cpu",
device: str | torch.device = get_optimal_device(),
passthrough_original: bool = False,
):
import sounddevice as sd
Expand Down
4 changes: 2 additions & 2 deletions src/so_vits_svc_fork/modules/decoders/hifigan/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
from ._models import Generator
from ._models import NSFHifiGANGenerator

__all__ = ["Generator"]
__all__ = ["NSFHifiGANGenerator"]
2 changes: 1 addition & 1 deletion src/so_vits_svc_fork/modules/decoders/hifigan/_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ def forward(self, x):
return sine_merge, noise, uv


class Generator(torch.nn.Module):
class NSFHifiGANGenerator(torch.nn.Module):
def __init__(self, h):
super().__init__()
self.h = h
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -360,7 +360,7 @@ def forward(self, x, g=None):

y_mb_hat = F.conv_transpose1d(
y_mb_hat,
self.updown_filter.cuda(x.device) * self.subbands,
self.updown_filter.to(x.device) * self.subbands,
stride=self.subbands,
)

Expand Down
8 changes: 3 additions & 5 deletions src/so_vits_svc_fork/modules/decoders/mb_istft/_pqmf.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,17 +85,15 @@ def __init__(self, device, subbands=8, taps=62, cutoff_ratio=0.15, beta=9.0):
)

# convert to tensor
analysis_filter = torch.from_numpy(h_analysis).float().unsqueeze(1).cuda(device)
synthesis_filter = (
torch.from_numpy(h_synthesis).float().unsqueeze(0).cuda(device)
)
analysis_filter = torch.from_numpy(h_analysis).float().unsqueeze(1).to(device)
synthesis_filter = torch.from_numpy(h_synthesis).float().unsqueeze(0).to(device)

# register coefficients as buffer
self.register_buffer("analysis_filter", analysis_filter)
self.register_buffer("synthesis_filter", synthesis_filter)

# filter for downsampling & upsampling
updown_filter = torch.zeros((subbands, subbands, subbands)).float().cuda(device)
updown_filter = torch.zeros((subbands, subbands, subbands)).float().to(device)
for k in range(subbands):
updown_filter[k, k, 0] = 1.0
self.register_buffer("updown_filter", updown_filter)
Expand Down
6 changes: 1 addition & 5 deletions src/so_vits_svc_fork/modules/decoders/mb_istft/_stft.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,11 +182,7 @@ def inverse(self, magnitude, phase):
window_sum = torch.autograd.Variable(
torch.from_numpy(window_sum), requires_grad=False
)
window_sum = (
window_sum.to(inverse_transform.device())
if magnitude.is_cuda
else window_sum
)
window_sum = window_sum.to(inverse_transform.device())
inverse_transform[:, :, approx_nonzero_indices] /= window_sum[
approx_nonzero_indices
]
Expand Down
4 changes: 2 additions & 2 deletions src/so_vits_svc_fork/modules/synthesizers.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from so_vits_svc_fork.f0 import f0_to_coarse
from so_vits_svc_fork.modules import commons as commons
from so_vits_svc_fork.modules.decoders.f0 import F0Decoder
from so_vits_svc_fork.modules.decoders.hifigan import Generator
from so_vits_svc_fork.modules.decoders.hifigan import NSFHifiGANGenerator
from so_vits_svc_fork.modules.decoders.mb_istft import (
Multiband_iSTFT_Generator,
Multistream_iSTFT_Generator,
Expand Down Expand Up @@ -110,7 +110,7 @@ def __init__(
"upsample_kernel_sizes": upsample_kernel_sizes,
"gin_channels": gin_channels,
}
self.dec = Generator(h=hps)
self.dec = NSFHifiGANGenerator(h=hps)
self.mb = False
else:
hps = {
Expand Down
8 changes: 4 additions & 4 deletions src/so_vits_svc_fork/preprocessing/preprocess_hubert_f0.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

from ..hparams import HParams
from ..modules.mel_processing import spec_to_mel_torch, spectrogram_torch
from ..utils import get_total_gpu_memory
from ..utils import get_optimal_device, get_total_gpu_memory
from .preprocess_utils import check_hubert_min_duration

LOG = getLogger(__name__)
Expand All @@ -30,7 +30,7 @@ def _process_one(
*,
filepath: Path,
content_model: HubertModel,
device: Literal["cuda", "cpu"] = "cuda",
device: torch.device | str = get_optimal_device(),
f0_method: Literal["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"] = "dio",
force_rebuild: bool = False,
hps: HParams,
Expand All @@ -54,7 +54,7 @@ def _process_one(
uv = torch.from_numpy(uv).float()

# Compute HuBERT content
audio = torch.from_numpy(audio).float().cuda()
audio = torch.from_numpy(audio).float().to(device)
c = utils.get_content(
content_model,
audio,
Expand Down Expand Up @@ -102,7 +102,7 @@ def _process_one(


def _process_batch(filepaths: Iterable[Path], pbar_position: int, **kwargs):
content_model = utils.get_hubert_model("cuda")
content_model = utils.get_hubert_model(get_optimal_device())

for filepath in tqdm(filepaths, position=pbar_position):
_process_one(
Expand Down
Loading

0 comments on commit 824ecbd

Please sign in to comment.