feat: realtime inference (#2)

voicepaw · Mar 17, 2023 · 4dea1ae · 4dea1ae
1 parent acfe2c1
commit 4dea1ae
Show file tree

Hide file tree

Showing 7 changed files with 475 additions and 235 deletions.
diff --git a/README.md b/README.md
@@ -43,13 +43,20 @@ pip install so-vits-svc-fork
 
 ## Features not available in the original repo
 
+- **Realtime voice conversion**
 - Unified command-line interface (no need to run Python scripts)
 - Ready to use just by installing with `pip`.
 - Automatically download pretrained base model and HuBERT model
 - Code completely formatted with black, isort, autoflake etc.
 
 ## Usage
 
+### Realtime Voice conversion
+
+```shell
+svc vc --model-path <model-path>
+```
+
 ### Training
 
 Colab notebook: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/34j/so-vits-svc-fork/blob/main/notebooks/so-vits-svc-fork-4.0.ipynb)
@@ -69,6 +76,34 @@ svc train
 svc --model-path <model-path> source.wav
 ```
 
+For more details, run `svc -h` or `svc <subcommand> -h`.
+
+```shell
+svc -h
+Usage: svc [OPTIONS] COMMAND [ARGS]...
+
+  so-vits-svc allows any folder structure for training data. However, it is
+  recommended to place the training data in the following structure:
+
+      dataset_raw/{speaker_name}/{wav_name}.wav
+
+  To train a model, run pre-resample, pre-config, pre-hubert, train. To infer
+  a model, run infer.
+
+Options:
+  -h, --help  Show this message and exit.
+
+Commands:
+  clean         Clean up files, only useful if you are using the default...
+  infer         Inference
+  onnx          Export model to onnx
+  pre-config    Preprocessing part 2: config
+  pre-hubert    Preprocessing part 3: hubert
+  pre-resample  Preprocessing part 1: resample
+  train         Train model
+  vc            Realtime inference from microphone
+```
+
 ## Contributors ✨
 
 Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/docs/en/emoji-key)):

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -54,6 +54,7 @@ rich = "*"
 tqdm-joblib = "*"
 tensorboardx = "*"
 pyinputplus = "*"
+cm-time = "^0.1.2"
 
 [tool.poetry.group.dev.dependencies]
 pre-commit = ">=3"

diff --git a/src/so_vits_svc_fork/__main__.py b/src/so_vits_svc_fork/__main__.py
@@ -36,10 +36,15 @@
 @click.group()
 def cli():
     """so-vits-svc allows any folder structure for training data.
-    However, it is recommended to place the training data in the following structure:
+    However, the following folder structure is recommended.
 
-        dataset_raw/{speaker_name}/{wav_name}.wav
+        When training: dataset_raw/{speaker_name}/{wav_name}.wav
 
+
+        When inference: configs/44k/config.json, logs/44k/G_XXXX.pth
+
+    If the folder structure is followed, you DO NOT NEED TO SPECIFY model path, config path, etc.
+    (The latest model will be automatically loaded.)
     To train a model, run pre-resample, pre-config, pre-hubert, train.
     To infer a model, run infer.
     """
@@ -62,7 +67,8 @@ def cli():
     default=Path("./logs/44k"),
 )
 def train(config_path: Path, model_path: Path):
-    """Train model"""
+    """Train model
+    If D_0.pth or G_0.pth not found, automatically download from hub."""
     from .train import main
 
     config_path = Path(config_path)
@@ -87,7 +93,7 @@ def train(config_path: Path, model_path: Path):
     "-m",
     "--model_path",
     type=click.Path(exists=True),
-    default=Path("./logs/44k/G_800.pth"),
+    default=Path("./logs/44k/"),
     help="path to model",
 )
 @click.option(
@@ -107,7 +113,7 @@ def train(config_path: Path, model_path: Path):
 @click.option("-t", "--transpose", type=int, default=0, help="transpose")
 @click.option("-d", "--db_thresh", type=int, default=-40, help="db thresh")
 @click.option(
-    "-a", "--auto_predict_f0", type=bool, default=False, help="auto predict f0"
+    "-a", "--auto_predict_f0", type=bool, default=True, help="auto predict f0"
 )
 @click.option(
     "-r", "--cluster_infer_ratio", type=float, default=0, help="cluster infer ratio"
@@ -139,11 +145,20 @@ def infer(
     """Inference"""
     from .inference_main import infer
 
+    if not auto_predict_f0:
+        LOG.warning(
+            f"auto_predict_f0 = False, transpose = {transpose}. If you want to change the pitch, please set transpose."
+            "Generally transpose = 0 does not work because your voice pitch and target voice pitch are different."
+        )
+
     input_path = Path(input_path)
     if output_path is None:
         output_path = input_path.parent / f"{input_path.stem}.out{input_path.suffix}"
     output_path = Path(output_path)
     model_path = Path(model_path)
+    if model_path.is_dir():
+        model_path = list(sorted(model_path.glob("*.pth")))[-1]
+        LOG.info(f"Since model_path is a directory, use {model_path}")
     config_path = Path(config_path)
     if cluster_model_path is not None:
         cluster_model_path = Path(cluster_model_path)
@@ -164,6 +179,114 @@ def infer(
     )
 
 
+@cli.command()
+@click.option(
+    "-m",
+    "--model_path",
+    type=click.Path(exists=True),
+    default=Path("./logs/44k/"),
+    help="path to model",
+)
+@click.option(
+    "-c",
+    "--config_path",
+    type=click.Path(exists=True),
+    default=Path("./configs/44k/config.json"),
+    help="path to config",
+)
+@click.option(
+    "-k",
+    "--cluster_model_path",
+    type=click.Path(exists=True),
+    default=None,
+    help="path to cluster model",
+)
+@click.option("-t", "--transpose", type=int, default=12, help="transpose")
+@click.option(
+    "-a",
+    "--auto_predict_f0",
+    type=bool,
+    default=False,
+    help="auto predict f0 (not recommended for realtime since voice pitch will not be stable)",
+)
+@click.option(
+    "-r", "--cluster_infer_ratio", type=float, default=0, help="cluster infer ratio"
+)
+@click.option("-n", "--noise_scale", type=float, default=0.4, help="noise scale")
+@click.option("-d", "--db_thresh", type=int, default=-20, help="db thresh")
+@click.option("-p", "--pad_seconds", type=float, default=0.02, help="pad seconds")
+@click.option(
+    "-c",
+    "--crossfade_seconds",
+    type=float,
+    default=0.01,
+    help="crossfade seconds",
+)
+@click.option("-b", "--block_seconds", type=float, default=1, help="block seconds")
+@click.option(
+    "-d",
+    "--device",
+    type=str,
+    default="cuda" if torch.cuda.is_available() else "cpu",
+    help="device",
+)
+@click.option("-s", "--speaker", type=str, default=None, help="speaker name")
+def vc(
+    # paths
+    model_path: Path,
+    config_path: Path,
+    # svc config
+    speaker: str,
+    cluster_model_path: Path | None,
+    transpose: int,
+    auto_predict_f0: bool,
+    cluster_infer_ratio: float,
+    noise_scale: float,
+    # slice config
+    db_thresh: int,
+    pad_seconds: float,
+    # realtime config
+    crossfade_seconds: float,
+    block_seconds: float,
+    device: Literal["cpu", "cuda"],
+) -> None:
+    """Realtime inference from microphone"""
+    from .inference_main import realtime
+
+    if auto_predict_f0:
+        LOG.warning(
+            "auto_predict_f0 = True in realtime inference will cause unstable voice pitch, use with caution"
+        )
+    else:
+        LOG.warning(
+            f"auto_predict_f0 = False, transpose = {transpose}. If you want to change the pitch, please change the transpose value."
+            "Generally transpose = 0 does not work because your voice pitch and target voice pitch are different."
+        )
+    model_path = Path(model_path)
+    config_path = Path(config_path)
+    if cluster_model_path is not None:
+        cluster_model_path = Path(cluster_model_path)
+    if model_path.is_dir():
+        model_path = list(sorted(model_path.glob("*.pth")))[-1]
+        LOG.info(f"Since model_path is a directory, use {model_path}")
+
+    realtime(
+        model_path=model_path,
+        config_path=config_path,
+        speaker=speaker,
+        cluster_model_path=cluster_model_path,
+        transpose=transpose,
+        auto_predict_f0=auto_predict_f0,
+        cluster_infer_ratio=cluster_infer_ratio,
+        noise_scale=noise_scale,
+        crossfade_seconds=crossfade_seconds,
+        block_seconds=block_seconds,
+        db_thresh=db_thresh,
+        pad_seconds=pad_seconds,
+        device=device,
+    )
+
+
 @click.help_option("--help", "-h")
 @cli.command()
 @click.option(
@@ -250,7 +373,8 @@ def pre_config(
     default=Path("./configs/44k/config.json"),
 )
 def pre_hubert(input_dir: Path, config_path: Path) -> None:
-    """Preprocessing part 3: hubert"""
+    """Preprocessing part 3: hubert
+    If the HuBERT model is not found, it will be downloaded automatically."""
     from .preprocess_hubert_f0 import preprocess_hubert_f0
 
     input_dir = Path(input_dir)