voicepaw · 34j · Mar 25, 2023 · Mar 24, 2023 · Mar 24, 2023 · Mar 24, 2023
diff --git a/README.md b/README.md
@@ -119,13 +119,14 @@ Place your dataset like `dataset_raw/{speaker_id}/**/{wav_file}.{any_format}` (s
 ```shell
 svc pre-resample
 svc pre-config
-svc pre-hubert
+svc pre-hubert -fm dio
 svc train
 ```
 
 #### Notes
 
 - Dataset audio duration per file should be <~ 10s or VRAM will run out.
+- To use CREPE for f0 inference instead of DIO, replace `svc pre-hubert -fm dio` with `svc pre-hubert -fm crepe`. If you run into performance issues with CREPE, reduce `--n-jobs`.
 - It is recommended to change the batch_size in `config.json` before the `train` command to match the VRAM capacity. As tested, the default requires about 14 GB.
 
 ### Further help

diff --git a/notebooks/so-vits-svc-fork-4.0.ipynb b/notebooks/so-vits-svc-fork-4.0.ipynb
@@ -155,7 +155,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "!svc pre-hubert"
+    "F0_METHOD = \"dio\" #@param [\"crepe\", \"crepe-tiny\", \"parselmouth\", \"dio\", \"harvest\"]\n",
+    "!svc pre-hubert -fm {F0_METHOD}"
    ]
   },
   {

diff --git a/src/so_vits_svc_fork/__main__.py b/src/so_vits_svc_fork/__main__.py
@@ -281,7 +281,7 @@ def infer(
     "-fm",
     "--f0-method",
     type=click.Choice(["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"]),
-    default="crepe",
+    default="dio",
     help="f0 prediction method",
 )
 @click.option("-p", "--pad-seconds", type=float, default=0.02, help="pad seconds")
@@ -490,6 +490,7 @@ def pre_config(
 @click.option(
     "-n",
     "--n_jobs",
+    "--n-jobs",
     type=int,
     default=4,
     help="number of jobs (optimal value may depend on your VRAM capacity and audio duration per file)",
@@ -505,7 +506,7 @@ def pre_config(
     "-fm",
     "--f0-method",
     type=click.Choice(["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"]),
-    default="crepe",
+    default="dio",
 )
 def pre_hubert(
     input_dir: Path,

diff --git a/src/so_vits_svc_fork/default_gui_presets.json b/src/so_vits_svc_fork/default_gui_presets.json
@@ -75,7 +75,7 @@
     "silence_threshold": -35.0,
     "transpose": 0.0,
     "auto_predict_f0": true,
-    "f0_method": "crepe",
+    "f0_method": "dio",
     "cluster_infer_ratio": 0.0,
     "noise_scale": 0.4,
     "pad_seconds": 0.1,

diff --git a/src/so_vits_svc_fork/gui.py b/src/so_vits_svc_fork/gui.py
@@ -34,7 +34,8 @@ def load_presets() -> dict:
         json.loads(GUI_PRESETS_PATH.read_text()) if GUI_PRESETS_PATH.exists() else {}
     )
     # prioriy: defaults > users
-    return {**defaults, **users}
+    # key order: defaults first, then users; the trailing **defaults makes default values win on conflicts
+    return {**defaults, **users, **defaults}
 
 
 def add_preset(name: str, preset: dict) -> dict:

diff --git a/src/so_vits_svc_fork/inference/infer_tool.py b/src/so_vits_svc_fork/inference/infer_tool.py
@@ -130,7 +130,7 @@ def get_unit_f0(
         speaker: int | str,
         f0_method: Literal[
             "crepe", "crepe-tiny", "parselmouth", "dio", "harvest"
-        ] = "crepe",
+        ] = "dio",
     ):
         f0 = utils.compute_f0(
             audio,
@@ -172,7 +172,7 @@ def infer(
         noise_scale: float = 0.4,
         f0_method: Literal[
             "crepe", "crepe-tiny", "parselmouth", "dio", "harvest"
-        ] = "crepe",
+        ] = "dio",
     ) -> tuple[torch.Tensor, int]:
         audio = audio.astype(np.float32)
         # get speaker id
@@ -240,7 +240,7 @@ def infer_silence(
         noise_scale: float = 0.4,
         f0_method: Literal[
             "crepe", "crepe-tiny", "parselmouth", "dio", "harvest"
-        ] = "crepe",
+        ] = "dio",
         # slice config
         db_thresh: int = -40,
         pad_seconds: float = 0.5,
@@ -459,7 +459,7 @@ def infer(
         noise_scale: float = 0.4,
         f0_method: Literal[
             "crepe", "crepe-tiny", "parselmouth", "dio", "harvest"
-        ] = "crepe",
+        ] = "dio",
         # slice config
         db_thresh: int = -40,
         pad_seconds: float = 0.5,
@@ -519,7 +519,7 @@ def process(
         noise_scale: float = 0.4,
         f0_method: Literal[
             "crepe", "crepe-tiny", "parselmouth", "dio", "harvest"
-        ] = "crepe",
+        ] = "dio",
         # slice config
         db_thresh: int = -40,
         chunk_seconds: float = 0.5,

diff --git a/src/so_vits_svc_fork/inference_main.py b/src/so_vits_svc_fork/inference_main.py
@@ -29,9 +29,7 @@ def infer(
     auto_predict_f0: bool = False,
     cluster_infer_ratio: float = 0,
     noise_scale: float = 0.4,
-    f0_method: Literal[
-        "crepe", "crepe-tiny", "parselmouth", "dio", "harvest"
-    ] = "crepe",
+    f0_method: Literal["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"] = "dio",
     # slice config
     db_thresh: int = -40,
     pad_seconds: float = 0.5,
@@ -83,9 +81,7 @@ def realtime(
     auto_predict_f0: bool = False,
     cluster_infer_ratio: float = 0,
     noise_scale: float = 0.4,
-    f0_method: Literal[
-        "crepe", "crepe-tiny", "parselmouth", "dio", "harvest"
-    ] = "crepe",
+    f0_method: Literal["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"] = "dio",
     # slice config
     db_thresh: int = -40,
     pad_seconds: float = 0.5,

diff --git a/src/so_vits_svc_fork/preprocess_hubert_f0.py b/src/so_vits_svc_fork/preprocess_hubert_f0.py
@@ -23,9 +23,7 @@ def _process_one(
     sampling_rate: int,
     hop_length: int,
     device: Literal["cuda", "cpu"] = "cuda",
-    f0_method: Literal[
-        "crepe", "crepe-tiny", "parselmouth", "dio", "harvest"
-    ] = "crepe",
+    f0_method: Literal["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"] = "dio",
     force_rebuild: bool = False,
 ):
     wav, sr = librosa.load(filepath, sr=sampling_rate)
@@ -59,9 +57,7 @@ def _process_batch(
     sampling_rate: int,
     hop_length: int,
     pbar_position: int,
-    f0_method: Literal[
-        "crepe", "crepe-tiny", "parselmouth", "dio", "harvest"
-    ] = "crepe",
+    f0_method: Literal["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"] = "dio",
     force_rebuild: bool = False,
 ):
     device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -83,9 +79,7 @@ def preprocess_hubert_f0(
     input_dir: Path | str,
     config_path: Path | str,
     n_jobs: int = 4,
-    f0_method: Literal[
-        "crepe", "crepe-tiny", "parselmouth", "dio", "harvest"
-    ] = "crepe",
+    f0_method: Literal["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"] = "dio",
     force_rebuild: bool = False,
 ):
     input_dir = Path(input_dir)

diff --git a/src/so_vits_svc_fork/utils.py b/src/so_vits_svc_fork/utils.py
@@ -243,7 +243,7 @@ def compute_f0(
     p_len: None | int = None,
     sampling_rate: int = 44100,
     hop_length: int = 512,
-    method: Literal["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"] = "crepe",
+    method: Literal["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"] = "dio",
     **kwargs,
 ):
     with timer() as t:
@@ -260,7 +260,9 @@ def compute_f0(
         elif method == "parselmouth":
             f0 = compute_f0_parselmouth(wav_numpy, p_len, sampling_rate, hop_length)
         else:
-            raise ValueError("type must be dio, crepe, harvest or parselmouth")
+            raise ValueError(
+                "type must be dio, crepe, crepe-tiny, harvest or parselmouth"
+            )
     rtf = t.elapsed / (len(wav_numpy) / sampling_rate)
     LOG.info(f"F0 inference time:       {t.elapsed:.3f}s, RTF: {rtf:.3f}")
     return f0