🐛 remove rubberband-cli dependencies #68

- 不再依赖 rubberband-cli，兼容 Windows 系统
lenML · Jun 24, 2024 · 1cd34c3 · 1cd34c3
1 parent ff9c7c0
commit 1cd34c3
Show file tree

Hide file tree

Showing 7 changed files with 41 additions and 69 deletions.
diff --git a/docs/dependencies.md b/docs/dependencies.md
@@ -82,7 +82,6 @@ git clone https://github.com/lenML/ChatTTS-Forge.git --depth=1
 音频后处理操作（如加速、减速、提高音量等）依赖以下库：
 
 - **ffmpeg** 或 **libav**（推荐使用 ffmpeg）
-- **rubberband-cli**（仅 Linux 环境需要）
 
 ### 安装 ffmpeg
 
@@ -96,7 +95,6 @@ brew install ffmpeg
 
 ```bash
 apt-get install ffmpeg libavcodec-extra
-apt-get install rubberband-cli
 ```
 
 **Windows**:

diff --git a/modules/SynthesizeSegments.py b/modules/SynthesizeSegments.py
@@ -17,7 +17,7 @@
 from modules.speaker import Speaker
 from modules.ssml_parser.SSMLParser import SSMLBreak, SSMLContext, SSMLSegment
 from modules.utils import rng
-from modules.utils.audio import pitch_shift, time_stretch
+from modules.utils.audio import apply_prosody_to_audio_segment
 
 logger = logging.getLogger(__name__)
 
@@ -67,21 +67,6 @@ def combine_audio_segments(audio_segments: list[AudioSegment]) -> AudioSegment:
     return combined_audio
 
 
-def apply_prosody(
-    audio_segment: AudioSegment, rate: float, volume: float, pitch: float
-) -> AudioSegment:
-    if rate != 1:
-        audio_segment = time_stretch(audio_segment, rate)
-
-    if volume != 0:
-        audio_segment += volume
-
-    if pitch != 0:
-        audio_segment = pitch_shift(audio_segment, pitch)
-
-    return audio_segment
-
-
 def to_number(value, t, default=0):
     try:
         number = t(value)
@@ -228,7 +213,9 @@ def append_eos(text: str):
                 pitch = float(segment.get("pitch", "0"))
 
                 audio_segment = audio_data_to_segment(audio_data, sr)
-                audio_segment = apply_prosody(audio_segment, rate, volume, pitch)
+                audio_segment = apply_prosody_to_audio_segment(
+                    audio_segment, rate=rate, volume=volume, pitch=pitch
+                )
                 # compare by Box object
                 original_index = src_segments.index(segment)
                 audio_segments[original_index] = audio_segment

diff --git a/modules/devices/devices.py b/modules/devices/devices.py
@@ -131,6 +131,9 @@ def reset_device():
     global dtype_gpt
     global dtype_decoder
 
+    if config.runtime_env_vars.use_cpu is None:
+        config.runtime_env_vars.use_cpu = []
+
     if "all" in config.runtime_env_vars.use_cpu and not config.runtime_env_vars.no_half:
         logger.warning(
             "Cannot use half precision with CPU, using full precision instead"

diff --git a/modules/utils/audio.py b/modules/utils/audio.py
@@ -2,7 +2,6 @@
 from io import BytesIO
 
 import numpy as np
-import pyrubberband as pyrb
 import soundfile as sf
 from pydub import AudioSegment, effects
 
@@ -50,7 +49,7 @@ def pydub_to_np(audio: AudioSegment) -> tuple[int, np.ndarray]:
     )
 
 
-def ndarray_to_segment(ndarray, frame_rate):
+def ndarray_to_segment(ndarray: np.ndarray, frame_rate: int) -> AudioSegment:
     buffer = BytesIO()
     sf.write(buffer, ndarray, frame_rate, format="wav")
     buffer.seek(0)
@@ -60,58 +59,49 @@ def ndarray_to_segment(ndarray, frame_rate):
     return sound
 
 
-def time_stretch(input_segment: AudioSegment, time_factor: float) -> AudioSegment:
-    """
-    factor range -> [0.2,10]
-    """
-    time_factor = np.clip(time_factor, 0.2, 10)
-    sr = input_segment.frame_rate
-    y = audiosegment_to_librosawav(input_segment)
-    y_stretch = pyrb.time_stretch(y, sr, time_factor)
-
-    sound = ndarray_to_segment(
-        y_stretch,
-        frame_rate=sr,
-    )
-    return sound
+def apply_prosody_to_audio_segment(
+    audio_segment: AudioSegment,
+    rate: float = 1,
+    volume: float = 0,
+    pitch: int = 0,
+    sr: int = 24000,
+) -> AudioSegment:
+    # Adjust rate (speed)
+    if rate != 1:
+        audio_segment = effects.speedup(audio_segment, playback_speed=rate)
 
+    # Adjust volume
+    if volume != 0:
+        audio_segment = audio_segment + volume
 
-def pitch_shift(
-    input_segment: AudioSegment,
-    pitch_shift_factor: float,
-) -> AudioSegment:
-    """
-    factor range -> [-12,12]
-    """
-    pitch_shift_factor = np.clip(pitch_shift_factor, -12, 12)
-    sr = input_segment.frame_rate
-    y = audiosegment_to_librosawav(input_segment)
-    y_shift = pyrb.pitch_shift(y, sr, pitch_shift_factor)
-
-    sound = ndarray_to_segment(
-        y_shift,
-        frame_rate=sr,
-    )
-    return sound
+    # Adjust pitch
+    if pitch != 0:
+        audio_segment = audio_segment._spawn(
+            audio_segment.raw_data,
+            overrides={
+                "frame_rate": int(audio_segment.frame_rate * (2.0 ** (pitch / 12.0)))
+            },
+        ).set_frame_rate(sr)
+
+    return audio_segment
 
 
 def apply_prosody_to_audio_data(
     audio_data: np.ndarray,
     rate: float = 1,
     volume: float = 0,
-    pitch: float = 0,
+    pitch: int = 0,
     sr: int = 24000,
 ) -> np.ndarray:
-    if rate != 1:
-        audio_data = pyrb.time_stretch(audio_data, sr=sr, rate=rate)
+    audio_segment = ndarray_to_segment(audio_data, sr)
 
-    if volume != 0:
-        audio_data = audio_data * volume
+    audio_segment = apply_prosody_to_audio_segment(
+        audio_segment, rate=rate, volume=volume, pitch=pitch, sr=sr
+    )
 
-    if pitch != 0:
-        audio_data = pyrb.pitch_shift(audio_data, sr=sr, n_steps=pitch)
+    processed_audio_data = np.array(audio_segment.get_array_of_samples())
 
-    return audio_data
+    return processed_audio_data
 
 
 def apply_normalize(

diff --git a/requirements.dev.txt b/requirements.dev.txt
@@ -102,7 +102,6 @@ Pygments==2.18.0
 pynvml==11.5.0
 pyparsing==3.1.2
 pypinyin==0.51.0
-pyrubberband==0.3.0
 PySoundFile==0.9.0.post1
 pytest==8.2.2
 pytest-cov==5.0.0
@@ -158,5 +157,4 @@ watchfiles==0.22.0
 websockets==11.0.3
 Werkzeug==3.0.3
 zhon==2.0.2
-ftfy==6.2.0
-langdetect==1.0.9
+ftfy==6.2.0
diff --git a/requirements.docker.txt b/requirements.docker.txt
@@ -4,7 +4,6 @@ lxml
 pydub
 fastapi
 soundfile
-pyrubberband
 omegaconf
 pypinyin
 pandas
@@ -26,5 +25,4 @@ mistune==3.0.2
 cn2an
 # audio_denoiser
 python-box
-ftfy
-langdetect
+ftfy
diff --git a/requirements.txt b/requirements.txt
@@ -4,7 +4,6 @@ lxml
 pydub
 fastapi
 soundfile
-pyrubberband
 omegaconf
 pypinyin
 vocos
@@ -25,5 +24,4 @@ mistune==3.0.2
 cn2an
 # audio_denoiser
 python-box
-ftfy
-langdetect
+ftfy