Skip to content

Commit

Permalink
🐛 fix apply_prosody
Browse files Browse the repository at this point in the history
  • Loading branch information
zhzLuke96 committed Jun 24, 2024
1 parent bd5e6eb commit 7fa55d9
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 49 deletions.
106 changes: 59 additions & 47 deletions modules/utils/audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,12 @@
import numpy as np
import soundfile as sf
from pydub import AudioSegment, effects
import librosa

INT16_MAX = np.iinfo(np.int16).max


def audio_to_int16(audio_data):
def audio_to_int16(audio_data: np.ndarray) -> np.ndarray:
if (
audio_data.dtype == np.float32
or audio_data.dtype == np.float64
Expand All @@ -19,6 +20,23 @@ def audio_to_int16(audio_data):
return audio_data


def pydub_to_np(audio: AudioSegment) -> tuple[int, np.ndarray]:
    """
    Convert a pydub audio segment into a float32 numpy array.

    Each sample is divided by 2 ** (8 * sample_width - 1), i.e. full scale
    for the segment's sample width, so values fall in the range [-1.0, 1.0).

    Returns a tuple (sample_rate, audio_np_array) -- the frame rate comes
    first. A mono segment yields a 1-D array of shape [n_samples]; for any
    other channel count the flat sample array is reshaped to
    [n_frames, channels].
    """
    samples = np.array(audio.get_array_of_samples(), dtype=np.float32)

    # Only multi-channel audio is reshaped; mono stays one-dimensional.
    if audio.channels != 1:
        samples = samples.reshape((-1, audio.channels))

    # Scale integer PCM values by the full-scale magnitude of the sample width.
    full_scale = 1 << (8 * audio.sample_width - 1)
    samples = samples / full_scale

    return (
        audio.frame_rate,
        samples,
    )


def audiosegment_to_librosawav(audiosegment: AudioSegment) -> np.ndarray:
"""
Converts pydub audio segment into np.float32 of shape [duration_in_seconds*sample_rate, channels],
Expand All @@ -34,29 +52,24 @@ def audiosegment_to_librosawav(audiosegment: AudioSegment) -> np.ndarray:
return fp_arr


def pydub_to_np(audio: AudioSegment) -> tuple[int, np.ndarray]:
    """
    Convert a pydub audio segment into a float32 numpy array of shape
    [n_frames, channels].

    Each sample is divided by 2 ** (8 * sample_width - 1), i.e. full scale
    for the segment's sample width, so values fall in the range [-1.0, 1.0).

    Returns a tuple (sample_rate, audio_np_array) -- the frame rate comes
    first.
    """
    return (
        audio.frame_rate,
        np.array(audio.get_array_of_samples(), dtype=np.float32).reshape(
            (-1, audio.channels)
        )
        / (1 << (8 * audio.sample_width - 1)),
    )


def ndarray_to_segment(
    ndarray: np.ndarray, frame_rate: int, sample_width: int = None, channels: int = None
) -> AudioSegment:
    """
    Build a pydub AudioSegment from a numpy array of audio samples.

    The array is written to an in-memory WAV file as 16-bit PCM and then
    decoded again with ``AudioSegment.from_wav``.

    Args:
        ndarray: Audio samples to encode.
        frame_rate: Sample rate used when writing the WAV data and set on
            the returned segment.
        sample_width: Bytes per sample for the returned segment. When None,
            the width of the decoded segment is kept.
        channels: Channel count for the returned segment. When None, the
            channel count of the decoded segment is kept.

    Returns:
        The decoded segment with the requested frame rate, sample width and
        channel count applied.
    """
    buffer = BytesIO()
    sf.write(buffer, ndarray, frame_rate, format="wav", subtype="PCM_16")
    # Rewind so the WAV reader starts at the header that was just written.
    buffer.seek(0)
    sound: AudioSegment = AudioSegment.from_wav(buffer)

    # Fall back to whatever the decoded WAV reports when not overridden.
    if sample_width is None:
        sample_width = sound.sample_width
    if channels is None:
        channels = sound.channels

    return (
        sound.set_frame_rate(frame_rate)
        .set_sample_width(sample_width)
        .set_channels(channels)
    )


def apply_prosody_to_audio_segment(
Expand All @@ -66,42 +79,41 @@ def apply_prosody_to_audio_segment(
pitch: int = 0,
sr: int = 24000,
) -> AudioSegment:
# Adjust rate (speed)
if rate != 1:
audio_segment = effects.speedup(audio_segment, playback_speed=rate)
audio_data = audiosegment_to_librosawav(audio_segment)

# Adjust volume
if volume != 0:
audio_segment = audio_segment + volume
audio_data = apply_prosody_to_audio_data(audio_data, rate, volume, pitch, sr)

# Adjust pitch
if pitch != 0:
audio_segment = audio_segment._spawn(
audio_segment.raw_data,
overrides={
"frame_rate": int(audio_segment.frame_rate * (2.0 ** (pitch / 12.0)))
},
).set_frame_rate(sr)
audio_segment = ndarray_to_segment(
audio_data, sr, audio_segment.sample_width, audio_segment.channels
)

return audio_segment


# FIXME: using librosa.effects causes some loss of audio quality
def apply_prosody_to_audio_data(
    audio_data: np.ndarray,
    rate: float = 1,
    volume: float = 0,
    pitch: int = 0,
    sr: int = 24000,
) -> np.ndarray:
    """
    Apply volume, speed and pitch adjustments to raw audio samples.

    Args:
        audio_data: Audio samples. Input that is not 1-D is averaged over
            axis 1 first (assumes a [n_samples, channels] layout -- TODO
            confirm against callers).
        rate: Playback speed factor passed to ``librosa.effects.time_stretch``;
            1 leaves the duration unchanged.
        volume: Gain in decibels, matching pydub's ``segment + dB``
            semantics; 0 leaves the level unchanged.
        pitch: Pitch shift passed as ``n_steps`` to
            ``librosa.effects.pitch_shift``; 0 leaves the pitch unchanged.
        sr: Sample rate of ``audio_data``, used for pitch shifting.

    Returns:
        The processed 1-D sample array.
    """
    if audio_data.ndim != 1:
        audio_data = audio_data.mean(axis=1)

    # Adjust volume: convert the dB gain to a linear amplitude factor.
    # Adding the dB value to the samples would only introduce a DC offset.
    if volume != 0:
        audio_data = audio_data * (10.0 ** (volume / 20.0))

    # Adjust rate (speed)
    if rate != 1:
        audio_data = librosa.effects.time_stretch(audio_data, rate=rate)

    # Adjust pitch
    if pitch != 0:
        audio_data = librosa.effects.pitch_shift(audio_data, sr=sr, n_steps=pitch)

    return audio_data


def apply_normalize(
Expand All @@ -124,11 +136,11 @@ def apply_normalize(
input_sound = AudioSegment.from_mp3(input_file)

for time_factor in time_stretch_factors:
output_wav = f"time_stretched_{int(time_factor * 100)}.wav"
sound = time_stretch(input_sound, time_factor)
sound.export(output_wav, format="wav")
output_wav = f"{input_file}_time_{time_factor}.wav"
output_sound = apply_prosody_to_audio_segment(input_sound, rate=time_factor)
output_sound.export(output_wav, format="wav")

for pitch_factor in pitch_shift_factors:
output_wav = f"pitch_shifted_{int(pitch_factor * 100)}.wav"
sound = pitch_shift(input_sound, pitch_factor)
sound.export(output_wav, format="wav")
output_wav = f"{input_file}_pitch_{pitch_factor}.wav"
output_sound = apply_prosody_to_audio_segment(input_sound, pitch=pitch_factor)
output_sound.export(output_wav, format="wav")
3 changes: 2 additions & 1 deletion requirements.docker.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,5 @@ mistune==3.0.2
cn2an
# audio_denoiser
python-box
ftfy
ftfy
librosa
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,5 @@ mistune==3.0.2
cn2an
# audio_denoiser
python-box
ftfy
ftfy
librosa

0 comments on commit 7fa55d9

Please sign in to comment.