Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: migrate to lightning #246

Merged
merged 19 commits into from
Apr 8, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,8 @@ tests/**/*.download
tests/**/*.lab
tests/**/*.pdf
tests/**/*.csv
tests/**/*.ckpt
tests/**/*.yaml
*.tfevents.*
*.pt
user_gui_presets.json
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ A fork of [`so-vits-svc`](https://github.com/svc-develop-team/so-vits-svc) with
- Fixed misuse of `ContentVec` in the original repository.[^c]
- More accurate pitch estimation using [`CREPE`](https://github.com/marl/crepe/).
- GUI and unified CLI available
- ~2x faster training
- Ready to use just by installing with `pip`.
- Automatically download pretrained models.
- Code completely formatted with black, isort, autoflake etc.
Expand Down
2,117 changes: 1,420 additions & 697 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@ pysimplegui = ">=4.6"
pebble = ">=5.0"
torchcrepe = ">=0.0.17"
unidecode = "^1.3.6"
lightning = "^2.0.1"
fastapi = "<0.89"

[tool.poetry.group.dev.dependencies]
pre-commit = ">=3"
Expand Down
13 changes: 8 additions & 5 deletions src/so_vits_svc_fork/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import torch

from so_vits_svc_fork import __version__
from so_vits_svc_fork.utils import get_optimal_device

LOG = getLogger(__name__)

Expand Down Expand Up @@ -190,7 +191,7 @@ def train(
"-d",
"--device",
type=str,
default="cuda" if torch.cuda.is_available() else "cpu",
default=get_optimal_device(),
help="device",
)
@click.option("-ch", "--chunk-seconds", type=float, default=0.5, help="chunk seconds")
Expand Down Expand Up @@ -220,7 +221,7 @@ def infer(
pad_seconds: float = 0.5,
chunk_seconds: float = 0.5,
absolute_thresh: bool = False,
device: Literal["cpu", "cuda"] = "cuda" if torch.cuda.is_available() else "cpu",
device: str | torch.device = get_optimal_device(),
):
"""Inference"""
from so_vits_svc_fork.inference.main import infer
Expand Down Expand Up @@ -339,7 +340,7 @@ def infer(
"-d",
"--device",
type=str,
default="cuda" if torch.cuda.is_available() else "cpu",
default=get_optimal_device(),
help="device",
)
@click.option("-s", "--speaker", type=str, default=None, help="speaker name")
Expand Down Expand Up @@ -378,7 +379,7 @@ def vc(
version: int,
input_device: int | str | None,
output_device: int | str | None,
device: Literal["cpu", "cuda"],
device: torch.device,
passthrough_original: bool = False,
) -> None:
"""Realtime inference from microphone"""
Expand Down Expand Up @@ -759,7 +760,9 @@ def clean():
default="cpu",
help="device to use",
)
def onnx(input_path: Path, output_path: Path, config_path: Path, device: str) -> None:
def onnx(
input_path: Path, output_path: Path, config_path: Path, device: torch.device | str
) -> None:
raise NotImplementedError("ONNX export is not yet supported")
"""Export model to onnx"""
input_path = Path(input_path)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,14 @@
from typing import Sequence

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data
from torch.utils.data import Dataset

from .hparams import HParams


class TextAudioSpeakerLoader(torch.utils.data.Dataset):
class TextAudioDataset(Dataset):
def __init__(self, hps: HParams, is_validation: bool = False):
self.datapaths = [
Path(x).parent / (Path(x).name + ".data.pt")
Expand Down Expand Up @@ -59,8 +60,8 @@ def _pad_stack(array: Sequence[torch.Tensor]) -> torch.Tensor:
return torch.stack(x_padded)


class TextAudioCollate:
def __call__(
class TextAudioCollate(nn.Module):
def forward(
self, batch: Sequence[dict[str, torch.Tensor]]
) -> tuple[torch.Tensor, ...]:
batch = [b for b in batch if b is not None]
Expand Down
4 changes: 3 additions & 1 deletion src/so_vits_svc_fork/f0.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
from numpy import dtype, float32, ndarray
from torch import FloatTensor, Tensor

from so_vits_svc_fork.utils import get_optimal_device

LOG = getLogger(__name__)


Expand Down Expand Up @@ -154,7 +156,7 @@ def compute_f0_crepe(
p_len: None | int = None,
sampling_rate: int = 44100,
hop_length: int = 512,
device: str = "cuda" if torch.cuda.is_available() else "cpu",
device: str | torch.device = get_optimal_device(),
model: Literal["full", "tiny"] = "full",
):
audio = torch.from_numpy(wav_numpy).to(device, copy=True)
Expand Down
27 changes: 6 additions & 21 deletions src/so_vits_svc_fork/gui.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from pebble import ProcessFuture, ProcessPool
from tqdm.tk import tqdm_tk

from .utils import ensure_pretrained_model
from .utils import ensure_pretrained_model, get_optimal_device

GUI_DEFAULT_PRESETS_PATH = Path(__file__).parent / "default_gui_presets.json"
GUI_PRESETS_PATH = Path("./user_gui_presets.json").absolute()
Expand Down Expand Up @@ -411,21 +411,14 @@ def main():
[
sg.Checkbox(
key="use_gpu",
default=(
torch.cuda.is_available() or torch.backends.mps.is_available()
),
default=get_optimal_device() != torch.device("cpu"),
text="Use GPU"
+ (
" (not available; if your device has GPU, make sure you installed PyTorch with CUDA support)"
if not (
torch.cuda.is_available()
or torch.backends.mps.is_available()
)
if get_optimal_device() == torch.device("cpu")
else ""
),
disabled=not (
torch.cuda.is_available() or torch.backends.mps.is_available()
),
disabled=get_optimal_device() == torch.device("cpu"),
)
],
[
Expand Down Expand Up @@ -579,15 +572,7 @@ def apply_preset(name: str) -> None:
pad_seconds=values["pad_seconds"],
chunk_seconds=values["chunk_seconds"],
absolute_thresh=values["absolute_thresh"],
device="cpu"
if not values["use_gpu"]
else (
"cuda"
if torch.cuda.is_available()
else "mps"
if torch.backends.mps.is_available()
else "cpu"
),
device="cpu" if not values["use_gpu"] else get_optimal_device(),
)
if values["auto_play"]:
pool.schedule(play_audio, args=[output_path])
Expand Down Expand Up @@ -641,7 +626,7 @@ def apply_preset(name: str) -> None:
output_device=output_device_indices[
window["output_device"].widget.current()
],
device="cuda" if values["use_gpu"] else "cpu",
device=get_optimal_device() if values["use_gpu"] else "cpu",
passthrough_original=values["passthrough_original"],
),
)
Expand Down
3 changes: 2 additions & 1 deletion src/so_vits_svc_fork/inference/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from so_vits_svc_fork import cluster, utils

from ..modules.synthesizers import SynthesizerTrn
from ..utils import get_optimal_device

LOG = getLogger(__name__)

Expand Down Expand Up @@ -98,7 +99,7 @@ def __init__(
):
self.net_g_path = net_g_path
if device is None:
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
self.device = (get_optimal_device(),)
else:
self.device = torch.device(device)
self.hps = utils.get_hparams(config_path)
Expand Down
5 changes: 3 additions & 2 deletions src/so_vits_svc_fork/inference/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from cm_time import timer

from so_vits_svc_fork.inference.core import RealtimeVC, RealtimeVC2, Svc
from so_vits_svc_fork.utils import get_optimal_device

LOG = getLogger(__name__)

Expand All @@ -35,7 +36,7 @@ def infer(
pad_seconds: float = 0.5,
chunk_seconds: float = 0.5,
absolute_thresh: bool = False,
device: Literal["cpu", "cuda"] = "cuda" if torch.cuda.is_available() else "cpu",
device: str | torch.device = get_optimal_device(),
):
model_path = Path(model_path)
output_path = Path(output_path)
Expand Down Expand Up @@ -94,7 +95,7 @@ def realtime(
version: int = 2,
input_device: int | str | None = None,
output_device: int | str | None = None,
device: Literal["cpu", "cuda"] = "cuda" if torch.cuda.is_available() else "cpu",
device: str | torch.device = get_optimal_device(),
passthrough_original: bool = False,
):
import sounddevice as sd
Expand Down
4 changes: 2 additions & 2 deletions src/so_vits_svc_fork/modules/decoders/hifigan/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
from ._models import Generator
from ._models import NSFHifiGANGenerator

__all__ = ["Generator"]
__all__ = ["NSFHifiGANGenerator"]
2 changes: 1 addition & 1 deletion src/so_vits_svc_fork/modules/decoders/hifigan/_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ def forward(self, x):
return sine_merge, noise, uv


class Generator(torch.nn.Module):
class NSFHifiGANGenerator(torch.nn.Module):
def __init__(self, h):
super().__init__()
self.h = h
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -360,7 +360,7 @@ def forward(self, x, g=None):

y_mb_hat = F.conv_transpose1d(
y_mb_hat,
self.updown_filter.cuda(x.device) * self.subbands,
self.updown_filter.to(x.device) * self.subbands,
stride=self.subbands,
)

Expand Down
8 changes: 3 additions & 5 deletions src/so_vits_svc_fork/modules/decoders/mb_istft/_pqmf.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,17 +85,15 @@ def __init__(self, device, subbands=8, taps=62, cutoff_ratio=0.15, beta=9.0):
)

# convert to tensor
analysis_filter = torch.from_numpy(h_analysis).float().unsqueeze(1).cuda(device)
synthesis_filter = (
torch.from_numpy(h_synthesis).float().unsqueeze(0).cuda(device)
)
analysis_filter = torch.from_numpy(h_analysis).float().unsqueeze(1).to(device)
synthesis_filter = torch.from_numpy(h_synthesis).float().unsqueeze(0).to(device)

# register coefficients as buffer
self.register_buffer("analysis_filter", analysis_filter)
self.register_buffer("synthesis_filter", synthesis_filter)

# filter for downsampling & upsampling
updown_filter = torch.zeros((subbands, subbands, subbands)).float().cuda(device)
updown_filter = torch.zeros((subbands, subbands, subbands)).float().to(device)
for k in range(subbands):
updown_filter[k, k, 0] = 1.0
self.register_buffer("updown_filter", updown_filter)
Expand Down
6 changes: 1 addition & 5 deletions src/so_vits_svc_fork/modules/decoders/mb_istft/_stft.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,11 +182,7 @@ def inverse(self, magnitude, phase):
window_sum = torch.autograd.Variable(
torch.from_numpy(window_sum), requires_grad=False
)
window_sum = (
window_sum.to(inverse_transform.device())
if magnitude.is_cuda
else window_sum
)
window_sum = window_sum.to(inverse_transform.device())
inverse_transform[:, :, approx_nonzero_indices] /= window_sum[
approx_nonzero_indices
]
Expand Down
4 changes: 2 additions & 2 deletions src/so_vits_svc_fork/modules/synthesizers.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from so_vits_svc_fork.f0 import f0_to_coarse
from so_vits_svc_fork.modules import commons as commons
from so_vits_svc_fork.modules.decoders.f0 import F0Decoder
from so_vits_svc_fork.modules.decoders.hifigan import Generator
from so_vits_svc_fork.modules.decoders.hifigan import NSFHifiGANGenerator
from so_vits_svc_fork.modules.decoders.mb_istft import (
Multiband_iSTFT_Generator,
Multistream_iSTFT_Generator,
Expand Down Expand Up @@ -110,7 +110,7 @@ def __init__(
"upsample_kernel_sizes": upsample_kernel_sizes,
"gin_channels": gin_channels,
}
self.dec = Generator(h=hps)
self.dec = NSFHifiGANGenerator(h=hps)
self.mb = False
else:
hps = {
Expand Down
8 changes: 4 additions & 4 deletions src/so_vits_svc_fork/preprocessing/preprocess_hubert_f0.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

from ..hparams import HParams
from ..modules.mel_processing import spec_to_mel_torch, spectrogram_torch
from ..utils import get_total_gpu_memory
from ..utils import get_optimal_device, get_total_gpu_memory
from .preprocess_utils import check_hubert_min_duration

LOG = getLogger(__name__)
Expand All @@ -30,7 +30,7 @@ def _process_one(
*,
filepath: Path,
content_model: HubertModel,
device: Literal["cuda", "cpu"] = "cuda",
device: torch.device | str = get_optimal_device(),
f0_method: Literal["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"] = "dio",
force_rebuild: bool = False,
hps: HParams,
Expand All @@ -54,7 +54,7 @@ def _process_one(
uv = torch.from_numpy(uv).float()

# Compute HuBERT content
audio = torch.from_numpy(audio).float().cuda()
audio = torch.from_numpy(audio).float().to(device)
c = utils.get_content(
content_model,
audio,
Expand Down Expand Up @@ -102,7 +102,7 @@ def _process_one(


def _process_batch(filepaths: Iterable[Path], pbar_position: int, **kwargs):
content_model = utils.get_hubert_model("cuda")
content_model = utils.get_hubert_model(get_optimal_device())

for filepath in tqdm(filepaths, position=pbar_position):
_process_one(
Expand Down
Loading