Skip to content

Commit

Permalink
perf(preprocess): specify samplerate to reduce memory usage (#137)
Browse files (browse the repository at this point in the history)
  • Loading branch information
34j authored Mar 26, 2023
1 parent f4d0acf commit 6217eda
Show file tree
Hide file tree
Showing 4 changed files with 22 additions and 16 deletions.
6 changes: 6 additions & 0 deletions src/so_vits_svc_fork/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -552,13 +552,15 @@ def pre_hubert(
@click.option(
"-t", "--huggingface-token", type=str, default=None, help="huggingface token"
)
@click.option("-s", "--sr", type=int, default=44100, help="sampling rate")
def pre_sd(
input_dir: Path | str,
output_dir: Path | str,
min_speakers: int,
max_speakers: int,
huggingface_token: str | None,
n_jobs: int,
sr: int,
):
"""Speech diarization using pyannote.audio"""
if huggingface_token is None:
Expand All @@ -585,6 +587,7 @@ def pre_sd(
max_speakers=max_speakers,
huggingface_token=huggingface_token,
n_jobs=n_jobs,
sr=sr,
)


Expand Down Expand Up @@ -613,13 +616,15 @@ def pre_sd(
@click.option("-d", "--top-db", type=float, default=30, help="top db")
@click.option("-f", "--frame-seconds", type=float, default=1, help="frame seconds")
@click.option("-h", "--hop-seconds", type=float, default=0.3, help="hop seconds")
@click.option("-s", "--sr", type=int, default=44100, help="sample rate")
def pre_split(
input_dir: Path | str,
output_dir: Path | str,
top_db: int,
frame_seconds: float,
hop_seconds: float,
n_jobs: int,
sr: int,
):
"""Split audio files into multiple files"""
from .preprocess_split import preprocess_split
Expand All @@ -631,6 +636,7 @@ def pre_split(
frame_seconds=frame_seconds,
hop_seconds=hop_seconds,
n_jobs=n_jobs,
sr=sr,
)


Expand Down
21 changes: 7 additions & 14 deletions src/so_vits_svc_fork/preprocess_resample.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,7 @@
from typing import Iterable

import librosa
import numpy as np
import soundfile
import soundfile as sf
from joblib import Parallel, delayed
from tqdm_joblib import tqdm_joblib

Expand Down Expand Up @@ -47,30 +45,25 @@ def is_relative_to(path: Path, *other):
return False


def _preprocess_one(input_path: Path, output_path: Path, sampling_rate: int) -> None:
def _preprocess_one(input_path: Path, output_path: Path, sr: int) -> None:
"""Preprocess one audio file."""

try:
audio, sr = sf.read(input_path, dtype="float32")
audio, sr = librosa.load(input_path, sr=sr, mono=True)

# Audioread is the last backend it will attempt, so this is the exception thrown on failure
except Exception as e:
# Failure due to attempting to load a file that is not audio, so return early
LOG.warning(f"Failed to load {input_path} due to {e}")
return

# Adjust volume
audio /= max(audio.max(), -audio.min())

# Trim silence
audio, _ = librosa.effects.trim(audio, top_db=20)

# Adjust volume
peak = np.abs(audio).max()
if peak > 1.0:
audio = 0.98 * audio / peak

# Resample
audio = librosa.resample(audio, orig_sr=sr, target_sr=sampling_rate)
audio /= max(audio.max(), -audio.min())
soundfile.write(output_path, audio, samplerate=sampling_rate, subtype="PCM_16")
soundfile.write(output_path, audio, samplerate=sr, subtype="PCM_16")


def preprocess_resample(
Expand Down Expand Up @@ -109,6 +102,6 @@ def preprocess_resample(

with tqdm_joblib(desc="Preprocessing", total=len(in_and_out_paths)):
Parallel(n_jobs=n_jobs)(
delayed(_preprocess_one)(*args, sampling_rate=sampling_rate)
delayed(_preprocess_one)(*args, sr=sampling_rate)
for args in in_and_out_paths
)
6 changes: 5 additions & 1 deletion src/so_vits_svc_fork/preprocess_speaker_diarization.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from logging import getLogger
from pathlib import Path

import librosa
import soundfile as sf
import torch
from joblib import Parallel, delayed
Expand All @@ -15,13 +16,14 @@
def _process_one(
input_path: Path,
output_dir: Path,
sr: int,
*,
min_speakers: int = 1,
max_speakers: int = 1,
huggingface_token: str | None = None,
) -> None:
try:
audio, sr = sf.read(input_path, dtype="float32")
audio, sr = librosa.load(input_path, sr=sr, mono=True)
except Exception as e:
LOG.warning(f"Failed to read {input_path}: {e}")
return
Expand Down Expand Up @@ -59,6 +61,7 @@ def _process_one(
def preprocess_speaker_diarization(
input_dir: Path | str,
output_dir: Path | str,
sr: int,
*,
min_speakers: int = 1,
max_speakers: int = 1,
Expand All @@ -79,6 +82,7 @@ def preprocess_speaker_diarization(
delayed(_process_one)(
input_path,
output_dir / input_path.relative_to(input_dir).parent / input_path.stem,
sr,
max_speakers=max_speakers,
min_speakers=min_speakers,
huggingface_token=huggingface_token,
Expand Down
5 changes: 4 additions & 1 deletion src/so_vits_svc_fork/preprocess_split.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,14 @@
def _process_one(
input_path: Path,
output_dir: Path,
sr: int,
*,
top_db: int = 30,
frame_seconds: float = 0.5,
hop_seconds: float = 0.1,
):
try:
audio, sr = sf.read(input_path, dtype="float32")
audio, sr = librosa.load(input_path, sr=sr, mono=True)
except Exception as e:
LOG.warning(f"Failed to read {input_path}: {e}")
return
Expand All @@ -43,6 +44,7 @@ def _process_one(
def preprocess_split(
input_dir: Path | str,
output_dir: Path | str,
sr: int,
*,
top_db: int = 30,
frame_seconds: float = 0.5,
Expand All @@ -58,6 +60,7 @@ def preprocess_split(
delayed(_process_one)(
input_path,
output_dir / input_path.relative_to(input_dir).parent,
sr,
top_db=top_db,
frame_seconds=frame_seconds,
hop_seconds=hop_seconds,
Expand Down

0 comments on commit 6217eda

Please sign in to comment.