🐛 fix apply_prosody

lenML · Jun 24, 2024 · 7fa55d9 · 7fa55d9
1 parent bd5e6eb
commit 7fa55d9
Show file tree

Hide file tree

Showing 3 changed files with 63 additions and 49 deletions.
diff --git a/modules/utils/audio.py b/modules/utils/audio.py
@@ -4,11 +4,12 @@
 import numpy as np
 import soundfile as sf
 from pydub import AudioSegment, effects
+import librosa
 
 INT16_MAX = np.iinfo(np.int16).max
 
 
-def audio_to_int16(audio_data):
+def audio_to_int16(audio_data: np.ndarray) -> np.ndarray:
     if (
         audio_data.dtype == np.float32
         or audio_data.dtype == np.float64
@@ -19,6 +20,23 @@ def audio_to_int16(audio_data):
     return audio_data
 
 
+def pydub_to_np(audio: AudioSegment) -> tuple[int, np.ndarray]:
+    """
+    Converts pydub audio segment into np.float32 of shape [duration_in_seconds*sample_rate, channels],
+    where each value is in range [-1.0, 1.0].
+    Returns tuple (audio_np_array, sample_rate).
+    """
+    nd_array = np.array(audio.get_array_of_samples(), dtype=np.float32)
+    if audio.channels != 1:
+        nd_array = nd_array.reshape((-1, audio.channels))
+    nd_array = nd_array / (1 << (8 * audio.sample_width - 1))
+
+    return (
+        audio.frame_rate,
+        nd_array,
+    )
+
+
 def audiosegment_to_librosawav(audiosegment: AudioSegment) -> np.ndarray:
     """
     Converts pydub audio segment into np.float32 of shape [duration_in_seconds*sample_rate, channels],
@@ -34,29 +52,24 @@ def audiosegment_to_librosawav(audiosegment: AudioSegment) -> np.ndarray:
     return fp_arr
 
 
-def pydub_to_np(audio: AudioSegment) -> tuple[int, np.ndarray]:
-    """
-    Converts pydub audio segment into np.float32 of shape [duration_in_seconds*sample_rate, channels],
-    where each value is in range [-1.0, 1.0].
-    Returns tuple (audio_np_array, sample_rate).
-    """
-    return (
-        audio.frame_rate,
-        np.array(audio.get_array_of_samples(), dtype=np.float32).reshape(
-            (-1, audio.channels)
-        )
-        / (1 << (8 * audio.sample_width - 1)),
-    )
-
-
-def ndarray_to_segment(ndarray: np.ndarray, frame_rate: int) -> AudioSegment:
+def ndarray_to_segment(
+    ndarray: np.ndarray, frame_rate: int, sample_width: int = None, channels: int = None
+) -> AudioSegment:
     buffer = BytesIO()
-    sf.write(buffer, ndarray, frame_rate, format="wav")
+    sf.write(buffer, ndarray, frame_rate, format="wav", subtype="PCM_16")
     buffer.seek(0)
-    sound = AudioSegment.from_wav(
-        buffer,
+    sound: AudioSegment = AudioSegment.from_wav(buffer)
+
+    if sample_width is None:
+        sample_width = sound.sample_width
+    if channels is None:
+        channels = sound.channels
+
+    return (
+        sound.set_frame_rate(frame_rate)
+        .set_sample_width(sample_width)
+        .set_channels(channels)
     )
-    return sound
 
 
 def apply_prosody_to_audio_segment(
@@ -66,42 +79,41 @@ def apply_prosody_to_audio_segment(
     pitch: int = 0,
     sr: int = 24000,
 ) -> AudioSegment:
-    # Adjust rate (speed)
-    if rate != 1:
-        audio_segment = effects.speedup(audio_segment, playback_speed=rate)
+    audio_data = audiosegment_to_librosawav(audio_segment)
 
-    # Adjust volume
-    if volume != 0:
-        audio_segment = audio_segment + volume
+    audio_data = apply_prosody_to_audio_data(audio_data, rate, volume, pitch, sr)
 
-    # Adjust pitch
-    if pitch != 0:
-        audio_segment = audio_segment._spawn(
-            audio_segment.raw_data,
-            overrides={
-                "frame_rate": int(audio_segment.frame_rate * (2.0 ** (pitch / 12.0)))
-            },
-        ).set_frame_rate(sr)
+    audio_segment = ndarray_to_segment(
+        audio_data, sr, audio_segment.sample_width, audio_segment.channels
+    )
 
     return audio_segment
 
 
+# FIXME: 使用 librosa.effects 会有音质损失
 def apply_prosody_to_audio_data(
     audio_data: np.ndarray,
     rate: float = 1,
     volume: float = 0,
     pitch: int = 0,
     sr: int = 24000,
 ) -> np.ndarray:
-    audio_segment = ndarray_to_segment(audio_data, sr)
+    if audio_data.ndim != 1:
+        audio_data = audio_data.mean(axis=1)
 
-    audio_segment = apply_prosody_to_audio_segment(
-        audio_segment, rate=rate, volume=volume, pitch=pitch, sr=sr
-    )
+    # Adjust volume
+    if volume != 0:
+        audio_data = audio_data + volume
+
+    # Adjust rate (speed)
+    if rate != 1:
+        audio_data = librosa.effects.time_stretch(audio_data, rate=rate)
 
-    processed_audio_data = np.array(audio_segment.get_array_of_samples())
+    # Adjust pitch
+    if pitch != 0:
+        audio_data = librosa.effects.pitch_shift(audio_data, sr=sr, n_steps=pitch)
 
-    return processed_audio_data
+    return audio_data
 
 
 def apply_normalize(
@@ -124,11 +136,11 @@ def apply_normalize(
     input_sound = AudioSegment.from_mp3(input_file)
 
     for time_factor in time_stretch_factors:
-        output_wav = f"time_stretched_{int(time_factor * 100)}.wav"
-        sound = time_stretch(input_sound, time_factor)
-        sound.export(output_wav, format="wav")
+        output_wav = f"{input_file}_time_{time_factor}.wav"
+        output_sound = apply_prosody_to_audio_segment(input_sound, rate=time_factor)
+        output_sound.export(output_wav, format="wav")
 
     for pitch_factor in pitch_shift_factors:
-        output_wav = f"pitch_shifted_{int(pitch_factor * 100)}.wav"
-        sound = pitch_shift(input_sound, pitch_factor)
-        sound.export(output_wav, format="wav")
+        output_wav = f"{input_file}_pitch_{pitch_factor}.wav"
+        output_sound = apply_prosody_to_audio_segment(input_sound, pitch=pitch_factor)
+        output_sound.export(output_wav, format="wav")
diff --git a/requirements.docker.txt b/requirements.docker.txt
@@ -25,4 +25,5 @@ mistune==3.0.2
 cn2an
 # audio_denoiser
 python-box
-ftfy
+ftfy
+librosa
diff --git a/requirements.txt b/requirements.txt
@@ -24,4 +24,5 @@ mistune==3.0.2
 cn2an
 # audio_denoiser
 python-box
-ftfy
+ftfy
+librosa
-Original file line number
+Diff line change
@@ Expand Up / @@ -25,4 +25,5 @@ mistune==3.0.2 @@
     cn2an
     # audio_denoiser
     python-box
-    ftfy
+    ftfy
+    librosa