From 5c2788e04f3debfa8bafd8a2e2371dde30f38d4d Mon Sep 17 00:00:00 2001
From: zhzluke96 <zhzluke96@outlook.com>
Date: Sun, 9 Jun 2024 19:07:33 +0800
Subject: [PATCH] =?UTF-8?q?:sparkles:=20SSML=20=E6=94=AF=E6=8C=81=20enhanc?=
 =?UTF-8?q?er?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 modules/utils/audio.py            |  6 +++++-
 modules/webui/ssml/podcast_tab.py | 23 +++++++++++++++++------
 modules/webui/ssml/spliter_tab.py | 11 +++++++++--
 modules/webui/ssml/ssml_tab.py    |  8 +++++++-
 modules/webui/webui_utils.py      | 16 ++++++++++++++--
 5 files changed, 52 insertions(+), 12 deletions(-)
diff --git a/modules/utils/audio.py b/modules/utils/audio.py
index c16a14b..48f38c5 100644
--- a/modules/utils/audio.py
+++ b/modules/utils/audio.py
@@ -19,7 +19,11 @@ def audio_to_int16(audio_data):
     return audio_data
 
 
-def audiosegment_to_librosawav(audiosegment):
+def audiosegment_to_librosawav(audiosegment: AudioSegment) -> np.ndarray:
+    """
+    Converts a pydub audio segment into an np.float32 array of shape
+    [duration_in_seconds * sample_rate, channels], where each value is in the range [-1.0, 1.0].
+    """
     channel_sounds = audiosegment.split_to_mono()
     samples = [s.get_array_of_samples() for s in channel_sounds]
 
diff --git a/modules/webui/ssml/podcast_tab.py b/modules/webui/ssml/podcast_tab.py
index fc076f1..fc550c1 100644
--- a/modules/webui/ssml/podcast_tab.py
+++ b/modules/webui/ssml/podcast_tab.py
@@ -53,7 +53,7 @@
 # NOTE: 因为 text_normalize 需要使用 tokenizer
 @torch.inference_mode()
 @spaces.GPU
-def merge_dataframe_to_ssml(df: pd.DataFrame):
+def merge_dataframe_to_ssml(msg, spk, style, df: pd.DataFrame):
     ssml = ""
     indent = " " * 2
 
@@ -70,7 +70,8 @@ def merge_dataframe_to_ssml(df: pd.DataFrame):
         ssml += ">\n"
         ssml += f"{indent}{indent}{text_normalize(text)}\n"
         ssml += f"{indent}</voice>\n"
-    return f"<speak version='0.1'>\n{ssml}</speak>"
+    # Echo the inputs back unchanged in order to trigger the loading effect
+    return msg, spk, style, f"<speak version='0.1'>\n{ssml}</speak>"
 
 
 def create_ssml_podcast_tab(ssml_input: gr.Textbox, tabs1: gr.Tabs, tabs2: gr.Tabs):
@@ -163,11 +164,14 @@ def clear_message():
             columns=["index", "speaker", "text", "style"],
         )
 
-    def send_to_ssml(sheet: pd.DataFrame):
+    def send_to_ssml(msg, spk, style, sheet: pd.DataFrame):
         if sheet.empty:
             return gr.Error("Please add some text to the script table.")
-        ssml = merge_dataframe_to_ssml(sheet)
+        msg, spk, style, ssml = merge_dataframe_to_ssml(msg, spk, style, sheet)
         return [
+            msg,
+            spk,
+            style,
             gr.Textbox(value=ssml),
             gr.Tabs(selected="ssml"),
             gr.Tabs(selected="ssml.editor"),
@@ -194,6 +198,13 @@ def send_to_ssml(sheet: pd.DataFrame):
     )
     send_to_ssml_btn.click(
         send_to_ssml,
-        inputs=[script_table],
-        outputs=[ssml_input, tabs1, tabs2],
+        inputs=[msg, spk_input_dropdown, style_input_dropdown, script_table],
+        outputs=[
+            msg,
+            spk_input_dropdown,
+            style_input_dropdown,
+            ssml_input,
+            tabs1,
+            tabs2,
+        ],
     )
diff --git a/modules/webui/ssml/spliter_tab.py b/modules/webui/ssml/spliter_tab.py
index 8b61cd1..14f3a40 100644
--- a/modules/webui/ssml/spliter_tab.py
+++ b/modules/webui/ssml/spliter_tab.py
@@ -35,7 +35,8 @@ def merge_dataframe_to_ssml(dataframe, spk, style, seed):
         ssml += ">\n"
         ssml += f"{indent}{indent}{text_normalize(row.iloc[1])}\n"
         ssml += f"{indent}</voice>\n"
-    return f"<speak version='0.1'>\n{ssml}</speak>"
+    # Echo the inputs back unchanged in order to trigger the loading effect
+    return dataframe, spk, style, seed, f"<speak version='0.1'>\n{ssml}</speak>"
 
 
 # 长文本处理
@@ -153,7 +154,13 @@ def create_spliter_tab(ssml_input, tabs1, tabs2):
             style_input_dropdown,
             infer_seed_input,
         ],
-        outputs=[ssml_input],
+        outputs=[
+            long_text_output,
+            spk_input_text,
+            style_input_dropdown,
+            infer_seed_input,
+            ssml_input,
+        ],
     )
 
     def change_tab():
diff --git a/modules/webui/ssml/ssml_tab.py b/modules/webui/ssml/ssml_tab.py
index 31307c1..736380b 100644
--- a/modules/webui/ssml/ssml_tab.py
+++ b/modules/webui/ssml/ssml_tab.py
@@ -37,6 +37,12 @@ def create_ssml_interface():
                     maximum=webui_config.max_batch_size,
                     step=1,
                 )
+
+            with gr.Group():
+                gr.Markdown("💪🏼Enhance")
+                enable_enhance = gr.Checkbox(value=True, label="Enable Enhance")
+                enable_de_noise = gr.Checkbox(value=False, label="Enable De-noise")
+
             with gr.Group():
                 gr.Markdown("🎄Examples")
                 gr.Examples(
@@ -48,7 +54,7 @@ def create_ssml_interface():
 
     ssml_button.click(
         synthesize_ssml,
-        inputs=[ssml_input, batch_size_input],
+        inputs=[ssml_input, batch_size_input, enable_enhance, enable_de_noise],
         outputs=ssml_output,
     )
 
diff --git a/modules/webui/webui_utils.py b/modules/webui/webui_utils.py
index 4387816..3ddd4b9 100644
--- a/modules/webui/webui_utils.py
+++ b/modules/webui/webui_utils.py
@@ -107,7 +107,12 @@ def apply_audio_enhance(audio_data, sr, enable_denoise, enable_enhance):
 
 @torch.inference_mode()
 @spaces.GPU
-def synthesize_ssml(ssml: str, batch_size=4):
+def synthesize_ssml(
+    ssml: str,
+    batch_size=4,
+    enable_enhance=False,
+    enable_denoise=False,
+):
     try:
         batch_size = int(batch_size)
     except Exception:
@@ -130,7 +135,14 @@ def synthesize_ssml(ssml: str, batch_size=4):
     audio_segments = synthesize.synthesize_segments(segments)
     combined_audio = combine_audio_segments(audio_segments)
 
-    sr, audio_data = audio.pydub_to_np(combined_audio)
+    sr = combined_audio.frame_rate
+    audio_data, sr = apply_audio_enhance(
+        audio.audiosegment_to_librosawav(combined_audio),
+        sr,
+        enable_denoise,
+        enable_enhance,
+    )
+
     # NOTE: 这里必须要加，不然 gradio 没法解析成 mp3 格式
     audio_data = audio.audio_to_int16(audio_data)