From 0af1e13a468ad282266a595b8d3c77d62aa938dc Mon Sep 17 00:00:00 2001
From: 34j <55338215+34j@users.noreply.github.com>
Date: Sun, 26 Mar 2023 19:00:44 +0900
Subject: [PATCH] fix(preprocess): fix dtype in sf.read() to save memory and fix preprocess_resample (#132)

---
 src/so_vits_svc_fork/__main__.py              | 16 +++++-
 src/so_vits_svc_fork/preprocess_resample.py   | 57 ++++++++++---------
 .../preprocess_speaker_diarization.py         |  2 +-
 src/so_vits_svc_fork/preprocess_split.py      |  2 +-
 4 files changed, 46 insertions(+), 31 deletions(-)

diff --git a/src/so_vits_svc_fork/__main__.py b/src/so_vits_svc_fork/__main__.py
index 7f603349..876c3e04 100644
--- a/src/so_vits_svc_fork/__main__.py
+++ b/src/so_vits_svc_fork/__main__.py
@@ -434,14 +434,26 @@ def vc(
     help="path to output dir",
 )
 @click.option("-s", "--sampling-rate", type=int, default=44100, help="sampling rate")
-def pre_resample(input_dir: Path, output_dir: Path, sampling_rate: int) -> None:
+@click.option(
+    "-n",
+    "--n-jobs",
+    type=int,
+    default=-1,
+    help="number of jobs (optimal value may depend on your RAM capacity and audio duration per file)",
+)
+def pre_resample(
+    input_dir: Path, output_dir: Path, sampling_rate: int, n_jobs: int
+) -> None:
     """Preprocessing part 1: resample"""
     from .preprocess_resample import preprocess_resample
 
     input_dir = Path(input_dir)
     output_dir = Path(output_dir)
     preprocess_resample(
-        input_dir=input_dir, output_dir=output_dir, sampling_rate=sampling_rate
+        input_dir=input_dir,
+        output_dir=output_dir,
+        sampling_rate=sampling_rate,
+        n_jobs=n_jobs,
     )
 
 
diff --git a/src/so_vits_svc_fork/preprocess_resample.py b/src/so_vits_svc_fork/preprocess_resample.py
index 480042b1..b1ca37a1 100644
--- a/src/so_vits_svc_fork/preprocess_resample.py
+++ b/src/so_vits_svc_fork/preprocess_resample.py
@@ -5,7 +5,6 @@
 from pathlib import Path
 from typing import Iterable
 
-import audioread.exceptions
 import librosa
 import numpy as np
 import soundfile
@@ -48,37 +47,38 @@ def is_relative_to(path: Path, *other):
         return False
 
 
-def preprocess_resample(
-    input_dir: Path | str, output_dir: Path | str, sampling_rate: int
-) -> None:
-    input_dir = Path(input_dir)
-    output_dir = Path(output_dir)
-    """Preprocess audio files in input_dir and save them to output_dir."""
+def _preprocess_one(input_path: Path, output_path: Path, sampling_rate: int) -> None:
+    """Preprocess one audio file."""
+
+    try:
+        audio, sr = sf.read(input_path, dtype="float32")
 
-    def preprocess_one(input_path: Path, output_path: Path) -> None:
-        """Preprocess one audio file."""
+    # sf.read() can fail in many ways, so catch broadly rather than one backend-specific error
+    except Exception as e:
+        # Failure due to attempting to load a file that is not audio, so return early
+        LOG.warning(f"Failed to load {input_path} due to {e}")
+        return
 
-        try:
-            audio, sr = sf.read(input_path)
+    # Trim silence
+    audio, _ = librosa.effects.trim(audio, top_db=20)
 
-        # Audioread is the last backend it will attempt, so this is the exception thrown on failure
-        except audioread.exceptions.NoBackendError as e:
-            # Failure due to attempting to load a file that is not audio, so return early
-            LOG.warning(f"Failed to load {input_path} due to {e}")
-            return
+    # Adjust volume
+    peak = np.abs(audio).max()
+    if peak > 1.0:
+        audio = 0.98 * audio / peak
 
-        # Trim silence
-        audio, _ = librosa.effects.trim(audio, top_db=20)
+    # Resample
+    audio = librosa.resample(audio, orig_sr=sr, target_sr=sampling_rate)
+    audio /= max(audio.max(), -audio.min(), 1e-8)  # floor avoids 0/0 -> NaN on silence
+    soundfile.write(output_path, audio, samplerate=sampling_rate, subtype="PCM_16")
 
-        # Adjust volume
-        peak = np.abs(audio).max()
-        if peak > 1.0:
-            audio = 0.98 * audio / peak
 
-        # Resample
-        audio = librosa.resample(audio, orig_sr=sr, target_sr=sampling_rate)
-        audio /= max(audio.max(), -audio.min())
-        soundfile.write(output_path, audio, samplerate=sampling_rate, subtype="PCM_16")
+def preprocess_resample(
+    input_dir: Path | str, output_dir: Path | str, sampling_rate: int, n_jobs: int = -1
+) -> None:
+    """Preprocess audio files in input_dir and save them to output_dir."""
+    input_dir = Path(input_dir)
+    output_dir = Path(output_dir)
 
     in_paths = []
     out_paths = []
@@ -108,4 +108,7 @@ def preprocess_one(input_path: Path, output_path: Path) -> None:
     in_and_out_paths = list(zip(in_paths, out_paths))
 
     with tqdm_joblib(desc="Preprocessing", total=len(in_and_out_paths)):
-        Parallel(n_jobs=-1)(delayed(preprocess_one)(*args) for args in in_and_out_paths)
+        Parallel(n_jobs=n_jobs)(
+            delayed(_preprocess_one)(*args, sampling_rate=sampling_rate)
+            for args in in_and_out_paths
+        )
diff --git a/src/so_vits_svc_fork/preprocess_speaker_diarization.py b/src/so_vits_svc_fork/preprocess_speaker_diarization.py
index 360cf5f8..43989777 100644
--- a/src/so_vits_svc_fork/preprocess_speaker_diarization.py
+++ b/src/so_vits_svc_fork/preprocess_speaker_diarization.py
@@ -21,7 +21,7 @@ def _process_one(
     huggingface_token: str | None = None,
 ) -> None:
     try:
-        audio, sr = sf.read(input_path)
+        audio, sr = sf.read(input_path, dtype="float32")
     except Exception as e:
         LOG.warning(f"Failed to read {input_path}: {e}")
         return
diff --git a/src/so_vits_svc_fork/preprocess_split.py b/src/so_vits_svc_fork/preprocess_split.py
index 540c1943..3d8546c7 100644
--- a/src/so_vits_svc_fork/preprocess_split.py
+++ b/src/so_vits_svc_fork/preprocess_split.py
@@ -20,7 +20,7 @@ def _process_one(
     hop_seconds: float = 0.1,
 ):
     try:
-        audio, sr = sf.read(input_path)
+        audio, sr = sf.read(input_path, dtype="float32")
     except Exception as e:
         LOG.warning(f"Failed to read {input_path}: {e}")
         return