Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: realtime inference #2

Merged
merged 4 commits into from
Mar 17, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,13 +43,20 @@ pip install so-vits-svc-fork

## Features not available in the original repo

- **Realtime voice conversion**
- Unified command-line interface (no need to run Python scripts)
- Ready to use just by installing with `pip`.
- Automatically download pretrained base model and HuBERT model
- Code completely formatted with black, isort, autoflake etc.

## Usage

### Realtime voice conversion

```shell
svc vc --model_path <model-path>
```

### Training

Colab notebook: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/34j/so-vits-svc-fork/blob/main/notebooks/so-vits-svc-fork-4.0.ipynb)
Expand All @@ -69,6 +76,34 @@ svc train
svc --model-path <model-path> source.wav
```

For more details, run `svc -h` or `svc <subcommand> -h`.

```shell
svc -h
Usage: svc [OPTIONS] COMMAND [ARGS]...

so-vits-svc allows any folder structure for training data. However, it is
recommended to place the training data in the following structure:

dataset_raw/{speaker_name}/{wav_name}.wav

To train a model, run pre-resample, pre-config, pre-hubert, train. To infer
a model, run infer.

Options:
-h, --help Show this message and exit.

Commands:
clean Clean up files, only useful if you are using the default...
infer Inference
onnx Export model to onnx
pre-config Preprocessing part 2: config
pre-hubert Preprocessing part 3: hubert
pre-resample Preprocessing part 1: resample
train Train model
vc Realtime inference from microphone
```

## Contributors ✨

Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/docs/en/emoji-key)):
Expand Down
20 changes: 17 additions & 3 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ rich = "*"
tqdm-joblib = "*"
tensorboardx = "*"
pyinputplus = "*"
cm-time = "^0.1.2"

[tool.poetry.group.dev.dependencies]
pre-commit = ">=3"
Expand Down
136 changes: 130 additions & 6 deletions src/so_vits_svc_fork/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,15 @@
@click.group()
def cli():
"""so-vits-svc allows any folder structure for training data.
However, it is recommended to place the training data in the following structure:
However, the following folder structure is recommended.

dataset_raw/{speaker_name}/{wav_name}.wav
When training: dataset_raw/{speaker_name}/{wav_name}.wav


When inference: configs/44k/config.json, logs/44k/G_XXXX.pth

If the folder structure is followed, you DO NOT NEED TO SPECIFY model path, config path, etc.
(The latest model will be automatically loaded.)
To train a model, run pre-resample, pre-config, pre-hubert, train.
To infer a model, run infer.
"""
Expand All @@ -62,7 +67,8 @@ def cli():
default=Path("./logs/44k"),
)
def train(config_path: Path, model_path: Path):
"""Train model"""
"""Train model
If D_0.pth or G_0.pth not found, automatically download from hub."""
from .train import main

config_path = Path(config_path)
Expand All @@ -87,7 +93,7 @@ def train(config_path: Path, model_path: Path):
"-m",
"--model_path",
type=click.Path(exists=True),
default=Path("./logs/44k/G_800.pth"),
default=Path("./logs/44k/"),
help="path to model",
)
@click.option(
Expand All @@ -107,7 +113,7 @@ def train(config_path: Path, model_path: Path):
@click.option("-t", "--transpose", type=int, default=0, help="transpose")
@click.option("-d", "--db_thresh", type=int, default=-40, help="db thresh")
@click.option(
"-a", "--auto_predict_f0", type=bool, default=False, help="auto predict f0"
"-a", "--auto_predict_f0", type=bool, default=True, help="auto predict f0"
)
@click.option(
"-r", "--cluster_infer_ratio", type=float, default=0, help="cluster infer ratio"
Expand Down Expand Up @@ -139,11 +145,20 @@ def infer(
"""Inference"""
from .inference_main import infer

if not auto_predict_f0:
LOG.warning(
f"auto_predict_f0 = False, transpose = {transpose}. If you want to change the pitch, please set transpose."
"Generally transpose = 0 does not work because your voice pitch and target voice pitch are different."
)

input_path = Path(input_path)
if output_path is None:
output_path = input_path.parent / f"{input_path.stem}.out{input_path.suffix}"
output_path = Path(output_path)
model_path = Path(model_path)
if model_path.is_dir():
model_path = list(sorted(model_path.glob("*.pth")))[-1]
LOG.info(f"Since model_path is a directory, use {model_path}")
config_path = Path(config_path)
if cluster_model_path is not None:
cluster_model_path = Path(cluster_model_path)
Expand All @@ -164,6 +179,114 @@ def infer(
)


def _latest_checkpoint(model_dir: Path) -> Path:
    """Pick the most recent checkpoint file inside ``model_dir``.

    Files named ``G_*.pth`` are preferred; if none exist, any ``*.pth`` file is
    considered. Candidates are ranked by the integer at the end of the file
    stem, so ``G_1000.pth`` ranks above ``G_800.pth`` (a plain lexicographic
    sort would order them the other way round).

    Raises:
        click.BadParameter: if the directory contains no ``.pth`` file.
    """
    import re  # local import so this block does not rely on file-level imports

    candidates = list(model_dir.glob("G_*.pth")) or list(model_dir.glob("*.pth"))
    if not candidates:
        raise click.BadParameter(
            f"no .pth checkpoint found in {model_dir}",
            param_hint="--model_path",
        )

    def rank(path):
        # Files without a trailing number sort first; the name breaks ties so
        # the result is deterministic.
        trailing_number = re.search(r"(\d+)$", path.stem)
        step = int(trailing_number.group(1)) if trailing_number else -1
        return step, path.name

    return max(candidates, key=rank)


# Same "-h" alias the neighbouring commands declare, so `svc vc -h` works.
@click.help_option("--help", "-h")
@cli.command()
@click.option(
    "-m",
    "--model_path",
    type=click.Path(exists=True),
    default=Path("./logs/44k/"),
    help="path to model",
)
@click.option(
    "-c",
    "--config_path",
    type=click.Path(exists=True),
    default=Path("./configs/44k/config.json"),
    help="path to config",
)
@click.option(
    "-k",
    "--cluster_model_path",
    type=click.Path(exists=True),
    default=None,
    help="path to cluster model",
)
@click.option("-t", "--transpose", type=int, default=12, help="transpose")
@click.option(
    "-a",
    "--auto_predict_f0",
    type=bool,
    default=False,
    help="auto predict f0 (not recommended for realtime since voice pitch will not be stable)",
)
@click.option(
    "-r", "--cluster_infer_ratio", type=float, default=0, help="cluster infer ratio"
)
@click.option("-n", "--noise_scale", type=float, default=0.4, help="noise scale")
@click.option("-d", "--db_thresh", type=int, default=-20, help="db thresh")
@click.option("-p", "--pad_seconds", type=float, default=0.02, help="pad seconds")
# NOTE: "-c" is already taken by --config_path above; a second "-c" here would
# shadow it, so the crossfade option gets its own short flag.
@click.option(
    "-cr",
    "--crossfade_seconds",
    type=float,
    default=0.01,
    help="crossfade seconds",
)
@click.option("-b", "--block_seconds", type=float, default=1, help="block seconds")
# NOTE: "-d" is already taken by --db_thresh above (as in `infer`), so the
# device option gets its own short flag.
@click.option(
    "-de",
    "--device",
    type=str,
    default="cuda" if torch.cuda.is_available() else "cpu",
    help="device",
)
@click.option("-s", "--speaker", type=str, default=None, help="speaker name")
def vc(
    # paths
    model_path: Path,
    config_path: Path,
    # svc config
    speaker: str,
    cluster_model_path: Path | None,
    transpose: int,
    auto_predict_f0: bool,
    cluster_infer_ratio: float,
    noise_scale: float,
    # slice config
    db_thresh: int,
    pad_seconds: float,
    # realtime config
    crossfade_seconds: float,
    block_seconds: float,
    device: Literal["cpu", "cuda"],
) -> None:
    """Realtime inference from microphone

    If --model_path points to a directory, the checkpoint with the highest
    step number in that directory is used.
    """
    from .inference_main import realtime

    if auto_predict_f0:
        LOG.warning(
            "auto_predict_f0 = True in realtime inference will cause unstable voice pitch, use with caution"
        )
    else:
        # The two literals are concatenated, so the first one must end with a
        # space to keep the sentences apart in the log output.
        LOG.warning(
            f"auto_predict_f0 = False, transpose = {transpose}. If you want to change the pitch, please change the transpose value. "
            "Generally transpose = 0 does not work because your voice pitch and target voice pitch are different."
        )

    # click hands paths over as plain strings; normalise them to Path objects.
    model_path = Path(model_path)
    config_path = Path(config_path)
    if cluster_model_path is not None:
        cluster_model_path = Path(cluster_model_path)

    if model_path.is_dir():
        model_path = _latest_checkpoint(model_path)
        LOG.info(f"Since model_path is a directory, use {model_path}")

    realtime(
        model_path=model_path,
        config_path=config_path,
        speaker=speaker,
        cluster_model_path=cluster_model_path,
        transpose=transpose,
        auto_predict_f0=auto_predict_f0,
        cluster_infer_ratio=cluster_infer_ratio,
        noise_scale=noise_scale,
        crossfade_seconds=crossfade_seconds,
        block_seconds=block_seconds,
        db_thresh=db_thresh,
        pad_seconds=pad_seconds,
        device=device,
    )


@click.help_option("--help", "-h")
@cli.command()
@click.option(
Expand Down Expand Up @@ -250,7 +373,8 @@ def pre_config(
default=Path("./configs/44k/config.json"),
)
def pre_hubert(input_dir: Path, config_path: Path) -> None:
"""Preprocessing part 3: hubert"""
"""Preprocessing part 3: hubert
If the HuBERT model is not found, it will be downloaded automatically."""
from .preprocess_hubert_f0 import preprocess_hubert_f0

input_dir = Path(input_dir)
Expand Down
Loading