From 6217eda0ec3bac27e408fcd0466a6b658cf718c5 Mon Sep 17 00:00:00 2001
From: 34j <55338215+34j@users.noreply.github.com>
Date: Sun, 26 Mar 2023 22:10:03 +0900
Subject: [PATCH] perf(preprocess): specify samplerate to reduce memory usage
 (#137)

---
 src/so_vits_svc_fork/__main__.py              |  6 ++++++
 src/so_vits_svc_fork/preprocess_resample.py   | 21 +++++++------------
 .../preprocess_speaker_diarization.py         |  6 +++++-
 src/so_vits_svc_fork/preprocess_split.py      |  5 ++++-
 4 files changed, 22 insertions(+), 16 deletions(-)

diff --git a/src/so_vits_svc_fork/__main__.py b/src/so_vits_svc_fork/__main__.py
index 395da5c0..50d75594 100644
--- a/src/so_vits_svc_fork/__main__.py
+++ b/src/so_vits_svc_fork/__main__.py
@@ -552,6 +552,7 @@ def pre_hubert(
 @click.option(
     "-t", "--huggingface-token", type=str, default=None, help="huggingface token"
 )
+@click.option("-s", "--sr", type=int, default=44100, help="sampling rate")
 def pre_sd(
     input_dir: Path | str,
     output_dir: Path | str,
@@ -559,6 +560,7 @@ def pre_sd(
     max_speakers: int,
     huggingface_token: str | None,
     n_jobs: int,
+    sr: int,
 ):
     """Speech diarization using pyannote.audio"""
     if huggingface_token is None:
@@ -585,6 +587,7 @@ def pre_sd(
         max_speakers=max_speakers,
         huggingface_token=huggingface_token,
         n_jobs=n_jobs,
+        sr=sr,
     )
 
 
@@ -613,6 +616,7 @@ def pre_sd(
 @click.option("-d", "--top-db", type=float, default=30, help="top db")
 @click.option("-f", "--frame-seconds", type=float, default=1, help="frame seconds")
 @click.option("-h", "--hop-seconds", type=float, default=0.3, help="hop seconds")
+@click.option("-s", "--sr", type=int, default=44100, help="sample rate")
 def pre_split(
     input_dir: Path | str,
     output_dir: Path | str,
@@ -620,6 +624,7 @@ def pre_split(
     frame_seconds: float,
     hop_seconds: float,
     n_jobs: int,
+    sr: int,
 ):
     """Split audio files into multiple files"""
     from .preprocess_split import preprocess_split
@@ -631,6 +636,7 @@ def pre_split(
         frame_seconds=frame_seconds,
         hop_seconds=hop_seconds,
         n_jobs=n_jobs,
+        sr=sr,
     )
 
 
diff --git a/src/so_vits_svc_fork/preprocess_resample.py b/src/so_vits_svc_fork/preprocess_resample.py
index b1ca37a1..f8d16f78 100644
--- a/src/so_vits_svc_fork/preprocess_resample.py
+++ b/src/so_vits_svc_fork/preprocess_resample.py
@@ -6,9 +6,7 @@
 from typing import Iterable
 
 import librosa
-import numpy as np
 import soundfile
-import soundfile as sf
 from joblib import Parallel, delayed
 from tqdm_joblib import tqdm_joblib
 
@@ -47,11 +45,11 @@ def is_relative_to(path: Path, *other):
         return False
 
 
-def _preprocess_one(input_path: Path, output_path: Path, sampling_rate: int) -> None:
+def _preprocess_one(input_path: Path, output_path: Path, sr: int) -> None:
     """Preprocess one audio file."""
 
     try:
-        audio, sr = sf.read(input_path, dtype="float32")
+        audio, sr = librosa.load(input_path, sr=sr, mono=True)
 
     # Audioread is the last backend it will attempt, so this is the exception thrown on failure
     except Exception as e:
@@ -59,18 +57,13 @@ def _preprocess_one(input_path: Path, output_path: Path, sampling_rate: int) ->
         LOG.warning(f"Failed to load {input_path} due to {e}")
         return
 
+    # Adjust volume ("or 1.0" keeps all-silent input from becoming NaN via 0/0)
+    audio /= max(audio.max(), -audio.min()) or 1.0
+
     # Trim silence
     audio, _ = librosa.effects.trim(audio, top_db=20)
 
-    # Adjust volume
-    peak = np.abs(audio).max()
-    if peak > 1.0:
-        audio = 0.98 * audio / peak
-
-    # Resample
-    audio = librosa.resample(audio, orig_sr=sr, target_sr=sampling_rate)
-    audio /= max(audio.max(), -audio.min())
-    soundfile.write(output_path, audio, samplerate=sampling_rate, subtype="PCM_16")
+    soundfile.write(output_path, audio, samplerate=sr, subtype="PCM_16")
 
 
 def preprocess_resample(
@@ -109,6 +102,6 @@ def preprocess_resample(
 
     with tqdm_joblib(desc="Preprocessing", total=len(in_and_out_paths)):
         Parallel(n_jobs=n_jobs)(
-            delayed(_preprocess_one)(*args, sampling_rate=sampling_rate)
+            delayed(_preprocess_one)(*args, sr=sampling_rate)
             for args in in_and_out_paths
         )
diff --git a/src/so_vits_svc_fork/preprocess_speaker_diarization.py b/src/so_vits_svc_fork/preprocess_speaker_diarization.py
index 43989777..bd0be494 100644
--- a/src/so_vits_svc_fork/preprocess_speaker_diarization.py
+++ b/src/so_vits_svc_fork/preprocess_speaker_diarization.py
@@ -2,6 +2,7 @@
 from logging import getLogger
 from pathlib import Path
 
+import librosa
 import soundfile as sf
 import torch
 from joblib import Parallel, delayed
@@ -15,13 +16,14 @@
 def _process_one(
     input_path: Path,
     output_dir: Path,
+    sr: int,
     *,
     min_speakers: int = 1,
     max_speakers: int = 1,
     huggingface_token: str | None = None,
 ) -> None:
     try:
-        audio, sr = sf.read(input_path, dtype="float32")
+        audio, sr = librosa.load(input_path, sr=sr, mono=True)
     except Exception as e:
         LOG.warning(f"Failed to read {input_path}: {e}")
         return
@@ -59,6 +61,7 @@ def _process_one(
 def preprocess_speaker_diarization(
     input_dir: Path | str,
     output_dir: Path | str,
+    sr: int,
     *,
     min_speakers: int = 1,
     max_speakers: int = 1,
@@ -79,6 +82,7 @@ def preprocess_speaker_diarization(
             delayed(_process_one)(
                 input_path,
                 output_dir / input_path.relative_to(input_dir).parent / input_path.stem,
+                sr,
                 max_speakers=max_speakers,
                 min_speakers=min_speakers,
                 huggingface_token=huggingface_token,
diff --git a/src/so_vits_svc_fork/preprocess_split.py b/src/so_vits_svc_fork/preprocess_split.py
index 3d8546c7..7e76ed6a 100644
--- a/src/so_vits_svc_fork/preprocess_split.py
+++ b/src/so_vits_svc_fork/preprocess_split.py
@@ -14,13 +14,14 @@
 def _process_one(
     input_path: Path,
     output_dir: Path,
+    sr: int,
     *,
     top_db: int = 30,
     frame_seconds: float = 0.5,
     hop_seconds: float = 0.1,
 ):
     try:
-        audio, sr = sf.read(input_path, dtype="float32")
+        audio, sr = librosa.load(input_path, sr=sr, mono=True)
     except Exception as e:
         LOG.warning(f"Failed to read {input_path}: {e}")
         return
@@ -43,6 +44,7 @@ def _process_one(
 def preprocess_split(
     input_dir: Path | str,
     output_dir: Path | str,
+    sr: int,
     *,
     top_db: int = 30,
     frame_seconds: float = 0.5,
@@ -58,6 +60,7 @@ def preprocess_split(
             delayed(_process_one)(
                 input_path,
                 output_dir / input_path.relative_to(input_dir).parent,
+                sr,
                 top_db=top_db,
                 frame_seconds=frame_seconds,
                 hop_seconds=hop_seconds,