From 5c2788e04f3debfa8bafd8a2e2371dde30f38d4d Mon Sep 17 00:00:00 2001 From: zhzluke96 Date: Sun, 9 Jun 2024 19:07:33 +0800 Subject: [PATCH] =?UTF-8?q?:sparkles:=20SSML=20=E6=94=AF=E6=8C=81=20enhanc?= =?UTF-8?q?er?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- modules/utils/audio.py | 6 +++++- modules/webui/ssml/podcast_tab.py | 23 +++++++++++++++++------ modules/webui/ssml/spliter_tab.py | 11 +++++++++-- modules/webui/ssml/ssml_tab.py | 8 +++++++- modules/webui/webui_utils.py | 16 ++++++++++++++-- 5 files changed, 52 insertions(+), 12 deletions(-) diff --git a/modules/utils/audio.py b/modules/utils/audio.py index c16a14b..48f38c5 100644 --- a/modules/utils/audio.py +++ b/modules/utils/audio.py @@ -19,7 +19,11 @@ def audio_to_int16(audio_data): return audio_data -def audiosegment_to_librosawav(audiosegment): +def audiosegment_to_librosawav(audiosegment: AudioSegment) -> np.ndarray: + """ + Converts pydub audio segment into np.float32 of shape [duration_in_seconds*sample_rate, channels], + where each value is in range [-1.0, 1.0]. + """ channel_sounds = audiosegment.split_to_mono() samples = [s.get_array_of_samples() for s in channel_sounds] diff --git a/modules/webui/ssml/podcast_tab.py b/modules/webui/ssml/podcast_tab.py index fc076f1..fc550c1 100644 --- a/modules/webui/ssml/podcast_tab.py +++ b/modules/webui/ssml/podcast_tab.py @@ -53,7 +53,7 @@ # NOTE: 因为 text_normalize 需要使用 tokenizer @torch.inference_mode() @spaces.GPU -def merge_dataframe_to_ssml(df: pd.DataFrame): +def merge_dataframe_to_ssml(msg, spk, style, df: pd.DataFrame): ssml = "" indent = " " * 2 @@ -70,7 +70,8 @@ def merge_dataframe_to_ssml(df: pd.DataFrame): ssml += ">\n" ssml += f"{indent}{indent}{text_normalize(text)}\n" ssml += f"{indent}\n" - return f"\n{ssml}" + # 原封不动输出回去是为了触发 loadding 效果 + return msg, spk, style, f"\n{ssml}" def create_ssml_podcast_tab(ssml_input: gr.Textbox, tabs1: gr.Tabs, tabs2: gr.Tabs): @@ -163,11 +164,14 @@ def clear_message(): columns=["index", "speaker", "text", "style"], ) - def send_to_ssml(sheet: pd.DataFrame): + def send_to_ssml(msg, spk, style, sheet: pd.DataFrame): if sheet.empty: return gr.Error("Please add some text to the script table.") - ssml = merge_dataframe_to_ssml(sheet) + msg, spk, style, ssml = merge_dataframe_to_ssml(msg, spk, style, sheet) return [ + msg, + spk, + style, gr.Textbox(value=ssml), gr.Tabs(selected="ssml"), gr.Tabs(selected="ssml.editor"), @@ -194,6 +198,13 @@ def send_to_ssml(sheet: pd.DataFrame): ) send_to_ssml_btn.click( send_to_ssml, - inputs=[script_table], - outputs=[ssml_input, tabs1, tabs2], + inputs=[msg, spk_input_dropdown, style_input_dropdown, script_table], + outputs=[ + msg, + spk_input_dropdown, + style_input_dropdown, + ssml_input, + tabs1, + tabs2, + ], ) diff --git a/modules/webui/ssml/spliter_tab.py b/modules/webui/ssml/spliter_tab.py index 8b61cd1..14f3a40 100644 --- a/modules/webui/ssml/spliter_tab.py +++ b/modules/webui/ssml/spliter_tab.py @@ -35,7 +35,8 @@ def merge_dataframe_to_ssml(dataframe, spk, style, seed): ssml += ">\n" ssml += f"{indent}{indent}{text_normalize(row.iloc[1])}\n" ssml += f"{indent}\n" - return f"\n{ssml}" + # 原封不动输出回去是为了触发 loadding 效果 + return dataframe, spk, style, seed, f"\n{ssml}" # 长文本处理 @@ -153,7 +154,13 @@ def create_spliter_tab(ssml_input, tabs1, tabs2): style_input_dropdown, infer_seed_input, ], - outputs=[ssml_input], + outputs=[ + long_text_output, + spk_input_text, + style_input_dropdown, + infer_seed_input, + ssml_input, + ], ) def change_tab(): diff --git a/modules/webui/ssml/ssml_tab.py b/modules/webui/ssml/ssml_tab.py index 31307c1..736380b 100644 --- a/modules/webui/ssml/ssml_tab.py +++ b/modules/webui/ssml/ssml_tab.py @@ -37,6 +37,12 @@ def create_ssml_interface(): maximum=webui_config.max_batch_size, step=1, ) + + with gr.Group(): + gr.Markdown("💪🏼Enhance") + enable_enhance = gr.Checkbox(value=True, label="Enable Enhance") + enable_de_noise = gr.Checkbox(value=False, label="Enable De-noise") + with gr.Group(): gr.Markdown("🎄Examples") gr.Examples( @@ -48,7 +54,7 @@ def create_ssml_interface(): ssml_button.click( synthesize_ssml, - inputs=[ssml_input, batch_size_input], + inputs=[ssml_input, batch_size_input, enable_enhance, enable_de_noise], outputs=ssml_output, ) diff --git a/modules/webui/webui_utils.py b/modules/webui/webui_utils.py index 4387816..3ddd4b9 100644 --- a/modules/webui/webui_utils.py +++ b/modules/webui/webui_utils.py @@ -107,7 +107,12 @@ def apply_audio_enhance(audio_data, sr, enable_denoise, enable_enhance): @torch.inference_mode() @spaces.GPU -def synthesize_ssml(ssml: str, batch_size=4): +def synthesize_ssml( + ssml: str, + batch_size=4, + enable_enhance=False, + enable_denoise=False, +): try: batch_size = int(batch_size) except Exception: @@ -130,7 +135,14 @@ def synthesize_ssml(ssml: str, batch_size=4): audio_segments = synthesize.synthesize_segments(segments) combined_audio = combine_audio_segments(audio_segments) - sr, audio_data = audio.pydub_to_np(combined_audio) + sr = combined_audio.frame_rate + audio_data, sr = apply_audio_enhance( + audio.audiosegment_to_librosawav(combined_audio), + sr, + enable_denoise, + enable_enhance, + ) + # NOTE: 这里必须要加,不然 gradio 没法解析成 mp3 格式 audio_data = audio.audio_to_int16(audio_data)