Skip to content

Commit

Permalink
feat: realtime inference (#2)
Browse files Browse the repository at this point in the history
  • Loading branch information
34j authored Mar 17, 2023
1 parent acfe2c1 commit 4dea1ae
Show file tree
Hide file tree
Showing 7 changed files with 475 additions and 235 deletions.
35 changes: 35 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,13 +43,20 @@ pip install so-vits-svc-fork

## Features not available in the original repo

- **Realtime voice conversion**
- Unified command-line interface (no need to run Python scripts)
- Ready to use just by installing with `pip`.
- Automatically download pretrained base model and HuBERT model
- Code completely formatted with black, isort, autoflake etc.

## Usage

### Realtime Voice conversion

```shell
svc vc --model-path <model-path>
```

### Training

Colab notebook: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/34j/so-vits-svc-fork/blob/main/notebooks/so-vits-svc-fork-4.0.ipynb)
Expand All @@ -69,6 +76,34 @@ svc train
svc --model-path <model-path> source.wav
```

For more details, run `svc -h` or `svc <subcommand> -h`.

```shell
svc -h
Usage: svc [OPTIONS] COMMAND [ARGS]...

so-vits-svc allows any folder structure for training data. However, it is
recommended to place the training data in the following structure:

dataset_raw/{speaker_name}/{wav_name}.wav

To train a model, run pre-resample, pre-config, pre-hubert, train. To infer
a model, run infer.

Options:
-h, --help Show this message and exit.

Commands:
clean Clean up files, only useful if you are using the default...
infer Inference
onnx Export model to onnx
pre-config Preprocessing part 2: config
pre-hubert Preprocessing part 3: hubert
pre-resample Preprocessing part 1: resample
train Train model
vc Realtime inference from microphone
```
## Contributors ✨
Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/docs/en/emoji-key)):
Expand Down
20 changes: 17 additions & 3 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ rich = "*"
tqdm-joblib = "*"
tensorboardx = "*"
pyinputplus = "*"
cm-time = "^0.1.2"

[tool.poetry.group.dev.dependencies]
pre-commit = ">=3"
Expand Down
136 changes: 130 additions & 6 deletions src/so_vits_svc_fork/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,15 @@
@click.group()
def cli():
"""so-vits-svc allows any folder structure for training data.
However, it is recommended to place the training data in the following structure:
However, the following folder structure is recommended.
dataset_raw/{speaker_name}/{wav_name}.wav
When training: dataset_raw/{speaker_name}/{wav_name}.wav
When inference: configs/44k/config.json, logs/44k/G_XXXX.pth
If the folder structure is followed, you DO NOT NEED TO SPECIFY model path, config path, etc.
(The latest model will be automatically loaded.)
To train a model, run pre-resample, pre-config, pre-hubert, train.
To infer a model, run infer.
"""
Expand All @@ -62,7 +67,8 @@ def cli():
default=Path("./logs/44k"),
)
def train(config_path: Path, model_path: Path):
"""Train model"""
"""Train model
If D_0.pth or G_0.pth not found, automatically download from hub."""
from .train import main

config_path = Path(config_path)
Expand All @@ -87,7 +93,7 @@ def train(config_path: Path, model_path: Path):
"-m",
"--model_path",
type=click.Path(exists=True),
default=Path("./logs/44k/G_800.pth"),
default=Path("./logs/44k/"),
help="path to model",
)
@click.option(
Expand All @@ -107,7 +113,7 @@ def train(config_path: Path, model_path: Path):
@click.option("-t", "--transpose", type=int, default=0, help="transpose")
@click.option("-d", "--db_thresh", type=int, default=-40, help="db thresh")
@click.option(
"-a", "--auto_predict_f0", type=bool, default=False, help="auto predict f0"
"-a", "--auto_predict_f0", type=bool, default=True, help="auto predict f0"
)
@click.option(
"-r", "--cluster_infer_ratio", type=float, default=0, help="cluster infer ratio"
Expand Down Expand Up @@ -139,11 +145,20 @@ def infer(
"""Inference"""
from .inference_main import infer

if not auto_predict_f0:
LOG.warning(
f"auto_predict_f0 = False, transpose = {transpose}. If you want to change the pitch, please set transpose."
"Generally transpose = 0 does not work because your voice pitch and target voice pitch are different."
)

input_path = Path(input_path)
if output_path is None:
output_path = input_path.parent / f"{input_path.stem}.out{input_path.suffix}"
output_path = Path(output_path)
model_path = Path(model_path)
if model_path.is_dir():
model_path = list(sorted(model_path.glob("*.pth")))[-1]
LOG.info(f"Since model_path is a directory, use {model_path}")
config_path = Path(config_path)
if cluster_model_path is not None:
cluster_model_path = Path(cluster_model_path)
Expand All @@ -164,6 +179,114 @@ def infer(
)


@cli.command()
@click.option(
"-m",
"--model_path",
type=click.Path(exists=True),
default=Path("./logs/44k/"),
help="path to model",
)
@click.option(
"-c",
"--config_path",
type=click.Path(exists=True),
default=Path("./configs/44k/config.json"),
help="path to config",
)
@click.option(
"-k",
"--cluster_model_path",
type=click.Path(exists=True),
default=None,
help="path to cluster model",
)
@click.option("-t", "--transpose", type=int, default=12, help="transpose")
@click.option(
"-a",
"--auto_predict_f0",
type=bool,
default=False,
help="auto predict f0 (not recommended for realtime since voice pitch will not be stable)",
)
@click.option(
"-r", "--cluster_infer_ratio", type=float, default=0, help="cluster infer ratio"
)
@click.option("-n", "--noise_scale", type=float, default=0.4, help="noise scale")
@click.option("-d", "--db_thresh", type=int, default=-20, help="db thresh")
@click.option("-p", "--pad_seconds", type=float, default=0.02, help="pad seconds")
@click.option(
"-c",
"--crossfade_seconds",
type=float,
default=0.01,
help="crossfade seconds",
)
@click.option("-b", "--block_seconds", type=float, default=1, help="block seconds")
@click.option(
"-d",
"--device",
type=str,
default="cuda" if torch.cuda.is_available() else "cpu",
help="device",
)
@click.option("-s", "--speaker", type=str, default=None, help="speaker name")
def vc(
# paths
model_path: Path,
config_path: Path,
# svc config
speaker: str,
cluster_model_path: Path | None,
transpose: int,
auto_predict_f0: bool,
cluster_infer_ratio: float,
noise_scale: float,
# slice config
db_thresh: int,
pad_seconds: float,
# realtime config
crossfade_seconds: float,
block_seconds: float,
device: Literal["cpu", "cuda"],
) -> None:
"""Realtime inference from microphone"""
from .inference_main import realtime

if auto_predict_f0:
LOG.warning(
"auto_predict_f0 = True in realtime inference will cause unstable voice pitch, use with caution"
)
else:
LOG.warning(
f"auto_predict_f0 = False, transpose = {transpose}. If you want to change the pitch, please change the transpose value."
"Generally transpose = 0 does not work because your voice pitch and target voice pitch are different."
)
model_path = Path(model_path)
config_path = Path(config_path)
if cluster_model_path is not None:
cluster_model_path = Path(cluster_model_path)
if model_path.is_dir():
model_path = list(sorted(model_path.glob("*.pth")))[-1]
LOG.info(f"Since model_path is a directory, use {model_path}")

realtime(
model_path=model_path,
config_path=config_path,
speaker=speaker,
cluster_model_path=cluster_model_path,
transpose=transpose,
auto_predict_f0=auto_predict_f0,
cluster_infer_ratio=cluster_infer_ratio,
noise_scale=noise_scale,
crossfade_seconds=crossfade_seconds,
block_seconds=block_seconds,
db_thresh=db_thresh,
pad_seconds=pad_seconds,
device=device,
)


@click.help_option("--help", "-h")
@cli.command()
@click.option(
Expand Down Expand Up @@ -250,7 +373,8 @@ def pre_config(
default=Path("./configs/44k/config.json"),
)
def pre_hubert(input_dir: Path, config_path: Path) -> None:
"""Preprocessing part 3: hubert"""
"""Preprocessing part 3: hubert
If the HuBERT model is not found, it will be downloaded automatically."""
from .preprocess_hubert_f0 import preprocess_hubert_f0

input_dir = Path(input_dir)
Expand Down
Loading

0 comments on commit 4dea1ae

Please sign in to comment.