From 668c8e1f18cefb0ebd2fb2f1d6572ce4d37d1102 Mon Sep 17 00:00:00 2001
From: 34j <55338215+34j@users.noreply.github.com>
Date: Sun, 19 Mar 2023 13:17:27 +0900
Subject: [PATCH] perf(preprocessing): better performance (#12)

Co-authored-by: gconway <gconway@g.hmc.edu>
---
 .gitignore                                    |  5 +-
 src/so_vits_svc_fork/__main__.py              | 25 +++++-
 .../preprocess_flist_config.py                |  2 +-
 src/so_vits_svc_fork/preprocess_hubert_f0.py  | 89 ++++++++++++-------
 src/so_vits_svc_fork/preprocess_resample.py   | 19 +++-
 src/so_vits_svc_fork/utils.py                 |  5 +-
 6 files changed, 106 insertions(+), 39 deletions(-)

diff --git a/.gitignore b/.gitignore
index f19c12fb..1d6c2d24 100644
--- a/.gitignore
+++ b/.gitignore
@@ -141,12 +141,15 @@ cython_debug/
 
 # additional files
 tests/**/*.wav
-!tests/dataset_raw/**/*.wav
+!tests/dataset_raw/34j/*.wav
 tests/**/*.npy
 tests/**/*.pt
 tests/**/*.txt
 tests/**/*.json
 tests/**/*.pth
 tests/**/*.download
+tests/**/*.lab
+tests/**/*.pdf
+tests/**/*.csv
 *.tfevents.*
 *.pt
diff --git a/src/so_vits_svc_fork/__main__.py b/src/so_vits_svc_fork/__main__.py
index d3bda208..57b14e2e 100644
--- a/src/so_vits_svc_fork/__main__.py
+++ b/src/so_vits_svc_fork/__main__.py
@@ -437,14 +437,35 @@ def pre_config(
     help="path to config",
     default=Path("./configs/44k/config.json"),
 )
-def pre_hubert(input_dir: Path, config_path: Path) -> None:
+@click.option(
+    "-n",
+    "--n_jobs",
+    type=int,
+    default=4,
+    help="number of jobs (optimal value may depend on your VRAM capacity and audio duration per file)",
+)
+@click.option(
+    "-f",
+    "--force_rebuild",
+    type=bool,
+    default=True,
+    help="force rebuild existing preprocessed files",
+)
+def pre_hubert(
+    input_dir: Path, config_path: Path, n_jobs: int, force_rebuild: bool
+) -> None:
     """Preprocessing part 3: hubert
     If the HuBERT model is not found, it will be downloaded automatically."""
     from .preprocess_hubert_f0 import preprocess_hubert_f0
 
     input_dir = Path(input_dir)
     config_path = Path(config_path)
-    preprocess_hubert_f0(input_dir=input_dir, config_path=config_path)
+    preprocess_hubert_f0(
+        input_dir=input_dir,
+        config_path=config_path,
+        n_jobs=n_jobs,
+        force_rebuild=force_rebuild,
+    )
 
 
 @cli.command
diff --git a/src/so_vits_svc_fork/preprocess_flist_config.py b/src/so_vits_svc_fork/preprocess_flist_config.py
index edec1a0f..b7d79596 100644
--- a/src/so_vits_svc_fork/preprocess_flist_config.py
+++ b/src/so_vits_svc_fork/preprocess_flist_config.py
@@ -55,7 +55,7 @@ def preprocess_config(
         shuffle(paths)
         if len(paths) <= 4:
             raise ValueError(
-                f"too few files in {input_dir / speaker} (expected at least 4)."
+                f"too few files in {input_dir / speaker} (expected at least 5)."
             )
         train += paths[2:-2]
         val += paths[:2]
diff --git a/src/so_vits_svc_fork/preprocess_hubert_f0.py b/src/so_vits_svc_fork/preprocess_hubert_f0.py
index 304cb3ff..870d96fb 100644
--- a/src/so_vits_svc_fork/preprocess_hubert_f0.py
+++ b/src/so_vits_svc_fork/preprocess_hubert_f0.py
@@ -17,41 +17,70 @@
 LOG = getLogger(__name__)
 
 
-def preprocess_hubert_f0(input_dir: Path | str, config_path: Path | str):
+def _process_one(
+    filepath: Path,
+    hubert_model,
+    sampling_rate: int,
+    hop_length: int,
+    device: Literal["cuda", "cpu"] = "cuda",
+    force_rebuild: bool = False,
+):
+    wav, sr = librosa.load(filepath, sr=sampling_rate)  # NOTE(review): decoded even when both outputs below end up skipped
+    soft_path = filepath.parent / (filepath.name + ".soft.pt")  # "<filename>.soft.pt" next to the input file
+    if not soft_path.exists() or force_rebuild:
+        wav16k = librosa.resample(
+            wav, orig_sr=sampling_rate, target_sr=HUBERT_SAMPLING_RATE
+        )
+        wav16k = torch.from_numpy(wav16k).to(device)
+        c = utils.get_hubert_content(hubert_model, wav_16k_tensor=wav16k)
+        torch.save(c.cpu(), soft_path)  # moved to CPU before being written to disk
+    else:
+        LOG.info(f"Skip {filepath} because {soft_path} exists.")
+    f0_path = filepath.parent / (filepath.name + ".f0.npy")  # "<filename>.f0.npy" next to the input file
+    if not f0_path.exists() or force_rebuild:
+        f0 = utils.compute_f0_dio(
+            wav, sampling_rate=sampling_rate, hop_length=hop_length
+        )
+        np.save(f0_path, f0)
+    else:
+        LOG.info(f"Skip {filepath} because {f0_path} exists.")
+    torch.cuda.empty_cache()  # called once per file, after both outputs have been handled
+
+
+def _process_batch(
+    filepaths: Iterable[Path],
+    sampling_rate: int,
+    hop_length: int,
+    pbar_position: int,
+    force_rebuild: bool = False,
+):
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    hubert_model = utils.get_hubert_model().to(device)  # loaded once per batch and reused for every file in it
+
+    for filepath in tqdm(filepaths, position=pbar_position):
+        _process_one(filepath, hubert_model, sampling_rate, hop_length, device, force_rebuild)  # forward the flag; it was previously dropped here
+
+
+def preprocess_hubert_f0(
+    input_dir: Path | str,
+    config_path: Path | str,
+    n_jobs: int = 4,
+    force_rebuild: bool = False,
+):
     input_dir = Path(input_dir)
     config_path = Path(config_path)
-    utils.get_hubert_model()
+    utils.ensure_hubert_model()
     hps = utils.get_hparams_from_file(config_path)
     sampling_rate = hps.data.sampling_rate
     hop_length = hps.data.hop_length
 
-    def _process_one(filepath: Path, hmodel, device: Literal["cuda", "cpu"] = "cuda"):
-        wav, sr = librosa.load(filepath, sr=sampling_rate)
-        soft_path = filepath.parent / (filepath.name + ".soft.pt")
-        if not soft_path.exists():
-            wav16k = librosa.resample(
-                wav, orig_sr=sampling_rate, target_sr=HUBERT_SAMPLING_RATE
-            )
-            wav16k = torch.from_numpy(wav16k).to(device)
-            c = utils.get_hubert_content(hmodel, wav_16k_tensor=wav16k)
-            torch.save(c.cpu(), soft_path)
-        f0_path = filepath.parent / (filepath.name + ".f0.npy")
-        if not f0_path.exists():
-            f0 = utils.compute_f0_dio(
-                wav, sampling_rate=sampling_rate, hop_length=hop_length
-            )
-            np.save(f0_path, f0)
-
-    def _process_batch(filepaths: Iterable[Path]):
-        LOG.info("Loading hubert model...")
-        device = "cuda" if torch.cuda.is_available() else "cpu"
-        hmodel = utils.get_hubert_model().to(device)
-        LOG.info("Hubert model loaded.")
-        for filepath in tqdm(filepaths):
-            _process_one(filepath, hmodel, device)
-
-    filepaths = list(input_dir.glob("**/*.wav"))
-    n_jobs = min(cpu_count(), len(filepaths) // 32 + 1, 8)
+    filepaths = list(input_dir.rglob("*.wav"))
+    n_jobs = max(1, min(cpu_count(), len(filepaths) // 32 + 1, n_jobs))  # clamp: array_split rejects a non-positive section count
     shuffle(filepaths)
     filepath_chunks = np.array_split(filepaths, n_jobs)
-    Parallel(n_jobs=n_jobs)(delayed(_process_batch)(chunk) for chunk in filepath_chunks)
+    Parallel(n_jobs=n_jobs)(
+        delayed(_process_batch)(
+            chunk, sampling_rate, hop_length, pbar_position, force_rebuild
+        )
+        for (pbar_position, chunk) in enumerate(filepath_chunks)  # chunk index doubles as the tqdm bar position
+    )
diff --git a/src/so_vits_svc_fork/preprocess_resample.py b/src/so_vits_svc_fork/preprocess_resample.py
index c3ad6163..24a1082a 100644
--- a/src/so_vits_svc_fork/preprocess_resample.py
+++ b/src/so_vits_svc_fork/preprocess_resample.py
@@ -1,13 +1,17 @@
 from __future__ import annotations
 
+from logging import getLogger
 from pathlib import Path
 
+import audioread.exceptions
 import librosa
 import numpy as np
 import soundfile
 from joblib import Parallel, delayed
 from tqdm_joblib import tqdm_joblib
 
+LOG = getLogger(__name__)
+
 # input_dir and output_dir exists.
 # write code to convert input dir audio files to output dir audio files,
 # without changing folder structure. Use joblib to parallelize.
@@ -27,7 +31,15 @@ def preprocess_resample(
 
     def preprocess_one(input_path: Path, output_path: Path) -> None:
         """Preprocess one audio file."""
-        audio, sr = librosa.load(input_path)
+
+        try:
+            audio, sr = librosa.load(input_path)
+
+        # Audioread is the last backend it will attempt, so this is the exception thrown on failure
+        except audioread.exceptions.NoBackendError as e:
+            # Failure due to attempting to load a file that is not audio, so return early
+            LOG.warning(f"Failed to load {input_path} due to {e}")
+            return
 
         # Trim silence
         audio, _ = librosa.effects.trim(audio, top_db=20)
@@ -43,9 +55,10 @@ def preprocess_one(input_path: Path, output_path: Path) -> None:
         soundfile.write(output_path, audio, samplerate=sampling_rate, subtype="PCM_16")
 
     in_and_out_paths = []
-    for in_path in input_dir.rglob("*.wav"):
-        out_path = output_dir / in_path.relative_to(input_dir)
+    for in_path in input_dir.rglob("*.*"):
+        out_path = output_dir / in_path.relative_to(input_dir).with_suffix(".wav")
         out_path.parent.mkdir(parents=True, exist_ok=True)
         in_and_out_paths.append((in_path, out_path))
+
     with tqdm_joblib(desc="Preprocessing", total=len(in_and_out_paths)):
         Parallel(n_jobs=-1)(delayed(preprocess_one)(*args) for args in in_and_out_paths)
diff --git a/src/so_vits_svc_fork/utils.py b/src/so_vits_svc_fork/utils.py
index 361c4c49..d91864eb 100644
--- a/src/so_vits_svc_fork/utils.py
+++ b/src/so_vits_svc_fork/utils.py
@@ -238,8 +238,9 @@ def ensure_pretrained_model(folder_path: Path) -> None:
             download_file(model_url, model_path, desc=f"Downloading {model_path.name}")
 
 
-def ensure_hurbert_model() -> Path:
+def ensure_hubert_model() -> Path:
     vec_path = Path("checkpoint_best_legacy_500.pt")
+    vec_path.parent.mkdir(parents=True, exist_ok=True)
     if not vec_path.exists():
         # url = "http://obs.cstcloud.cn/share/obs/sankagenkeshi/checkpoint_best_legacy_500.pt"
         # url = "https://huggingface.co/innnky/contentvec/resolve/main/checkpoint_best_legacy_500.pt"
@@ -249,7 +250,7 @@ def ensure_hurbert_model() -> Path:
 
 
 def get_hubert_model():
-    vec_path = ensure_hurbert_model()
+    vec_path = ensure_hubert_model()
     from fairseq import checkpoint_utils
 
     models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(