From 6217eda0ec3bac27e408fcd0466a6b658cf718c5 Mon Sep 17 00:00:00 2001 From: 34j <55338215+34j@users.noreply.github.com> Date: Sun, 26 Mar 2023 22:10:03 +0900 Subject: [PATCH] perf(preprocess): specify samplerate to reduce memory usage (#137) --- src/so_vits_svc_fork/__main__.py | 6 ++++++ src/so_vits_svc_fork/preprocess_resample.py | 21 +++++++------------ .../preprocess_speaker_diarization.py | 6 +++++- src/so_vits_svc_fork/preprocess_split.py | 5 ++++- 4 files changed, 22 insertions(+), 16 deletions(-) diff --git a/src/so_vits_svc_fork/__main__.py b/src/so_vits_svc_fork/__main__.py index 395da5c0..50d75594 100644 --- a/src/so_vits_svc_fork/__main__.py +++ b/src/so_vits_svc_fork/__main__.py @@ -552,6 +552,7 @@ def pre_hubert( @click.option( "-t", "--huggingface-token", type=str, default=None, help="huggingface token" ) +@click.option("-s", "--sr", type=int, default=44100, help="sampling rate") def pre_sd( input_dir: Path | str, output_dir: Path | str, @@ -559,6 +560,7 @@ def pre_sd( max_speakers: int, huggingface_token: str | None, n_jobs: int, + sr: int, ): """Speech diarization using pyannote.audio""" if huggingface_token is None: @@ -585,6 +587,7 @@ def pre_sd( max_speakers=max_speakers, huggingface_token=huggingface_token, n_jobs=n_jobs, + sr=sr, ) @@ -613,6 +616,7 @@ def pre_sd( @click.option("-d", "--top-db", type=float, default=30, help="top db") @click.option("-f", "--frame-seconds", type=float, default=1, help="frame seconds") @click.option("-h", "--hop-seconds", type=float, default=0.3, help="hop seconds") +@click.option("-s", "--sr", type=int, default=44100, help="sample rate") def pre_split( input_dir: Path | str, output_dir: Path | str, @@ -620,6 +624,7 @@ def pre_split( frame_seconds: float, hop_seconds: float, n_jobs: int, + sr: int, ): """Split audio files into multiple files""" from .preprocess_split import preprocess_split @@ -631,6 +636,7 @@ def pre_split( frame_seconds=frame_seconds, hop_seconds=hop_seconds, n_jobs=n_jobs, + sr=sr, ) diff --git a/src/so_vits_svc_fork/preprocess_resample.py b/src/so_vits_svc_fork/preprocess_resample.py index b1ca37a1..f8d16f78 100644 --- a/src/so_vits_svc_fork/preprocess_resample.py +++ b/src/so_vits_svc_fork/preprocess_resample.py @@ -6,9 +6,7 @@ from typing import Iterable import librosa -import numpy as np import soundfile -import soundfile as sf from joblib import Parallel, delayed from tqdm_joblib import tqdm_joblib @@ -47,11 +45,11 @@ def is_relative_to(path: Path, *other): return False -def _preprocess_one(input_path: Path, output_path: Path, sampling_rate: int) -> None: +def _preprocess_one(input_path: Path, output_path: Path, sr: int) -> None: """Preprocess one audio file.""" try: - audio, sr = sf.read(input_path, dtype="float32") + audio, sr = librosa.load(input_path, sr=sr, mono=True) # Audioread is the last backend it will attempt, so this is the exception thrown on failure except Exception as e: @@ -59,18 +57,13 @@ def _preprocess_one(input_path: Path, output_path: Path, sampling_rate: int) -> LOG.warning(f"Failed to load {input_path} due to {e}") return + # Adjust volume + audio /= max(audio.max(), -audio.min()) + # Trim silence audio, _ = librosa.effects.trim(audio, top_db=20) - # Adjust volume - peak = np.abs(audio).max() - if peak > 1.0: - audio = 0.98 * audio / peak - - # Resample - audio = librosa.resample(audio, orig_sr=sr, target_sr=sampling_rate) - audio /= max(audio.max(), -audio.min()) - soundfile.write(output_path, audio, samplerate=sampling_rate, subtype="PCM_16") + soundfile.write(output_path, audio, samplerate=sr, subtype="PCM_16") def preprocess_resample( @@ -109,6 +102,6 @@ def preprocess_resample( with tqdm_joblib(desc="Preprocessing", total=len(in_and_out_paths)): Parallel(n_jobs=n_jobs)( - delayed(_preprocess_one)(*args, sampling_rate=sampling_rate) + delayed(_preprocess_one)(*args, sr=sampling_rate) for args in in_and_out_paths ) diff --git a/src/so_vits_svc_fork/preprocess_speaker_diarization.py b/src/so_vits_svc_fork/preprocess_speaker_diarization.py index 43989777..bd0be494 100644 --- a/src/so_vits_svc_fork/preprocess_speaker_diarization.py +++ b/src/so_vits_svc_fork/preprocess_speaker_diarization.py @@ -2,6 +2,7 @@ from logging import getLogger from pathlib import Path +import librosa import soundfile as sf import torch from joblib import Parallel, delayed @@ -15,13 +16,14 @@ def _process_one( input_path: Path, output_dir: Path, + sr: int, *, min_speakers: int = 1, max_speakers: int = 1, huggingface_token: str | None = None, ) -> None: try: - audio, sr = sf.read(input_path, dtype="float32") + audio, sr = librosa.load(input_path, sr=sr, mono=True) except Exception as e: LOG.warning(f"Failed to read {input_path}: {e}") return @@ -59,6 +61,7 @@ def _process_one( def preprocess_speaker_diarization( input_dir: Path | str, output_dir: Path | str, + sr: int, *, min_speakers: int = 1, max_speakers: int = 1, @@ -79,6 +82,7 @@ def preprocess_speaker_diarization( delayed(_process_one)( input_path, output_dir / input_path.relative_to(input_dir).parent / input_path.stem, + sr, max_speakers=max_speakers, min_speakers=min_speakers, huggingface_token=huggingface_token, diff --git a/src/so_vits_svc_fork/preprocess_split.py b/src/so_vits_svc_fork/preprocess_split.py index 3d8546c7..7e76ed6a 100644 --- a/src/so_vits_svc_fork/preprocess_split.py +++ b/src/so_vits_svc_fork/preprocess_split.py @@ -14,13 +14,14 @@ def _process_one( input_path: Path, output_dir: Path, + sr: int, *, top_db: int = 30, frame_seconds: float = 0.5, hop_seconds: float = 0.1, ): try: - audio, sr = sf.read(input_path, dtype="float32") + audio, sr = librosa.load(input_path, sr=sr, mono=True) except Exception as e: LOG.warning(f"Failed to read {input_path}: {e}") return @@ -43,6 +44,7 @@ def _process_one( def preprocess_split( input_dir: Path | str, output_dir: Path | str, + sr: int, *, top_db: int = 30, frame_seconds: float = 0.5, @@ -58,6 +60,7 @@ def preprocess_split( delayed(_process_one)( input_path, output_dir / input_path.relative_to(input_dir).parent, + sr, top_db=top_db, frame_seconds=frame_seconds, hop_seconds=hop_seconds,