Skip to content

Commit

Permalink
✨ SSML 支持 enhancer
Browse files Browse the repository at this point in the history
  • Loading branch information
zhzLuke96 committed Jun 9, 2024
1 parent d1a8dae commit 5c2788e
Show file tree
Hide file tree
Showing 5 changed files with 52 additions and 12 deletions.
6 changes: 5 additions & 1 deletion modules/utils/audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,11 @@ def audio_to_int16(audio_data):
return audio_data


def audiosegment_to_librosawav(audiosegment):
def audiosegment_to_librosawav(audiosegment: AudioSegment) -> np.ndarray:
"""
Converts pydub audio segment into np.float32 of shape [duration_in_seconds*sample_rate, channels],
where each value is in range [-1.0, 1.0].
"""
channel_sounds = audiosegment.split_to_mono()
samples = [s.get_array_of_samples() for s in channel_sounds]

Expand Down
23 changes: 17 additions & 6 deletions modules/webui/ssml/podcast_tab.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@
# NOTE: 因为 text_normalize 需要使用 tokenizer
@torch.inference_mode()
@spaces.GPU
def merge_dataframe_to_ssml(df: pd.DataFrame):
def merge_dataframe_to_ssml(msg, spk, style, df: pd.DataFrame):
ssml = ""
indent = " " * 2

Expand All @@ -70,7 +70,8 @@ def merge_dataframe_to_ssml(df: pd.DataFrame):
ssml += ">\n"
ssml += f"{indent}{indent}{text_normalize(text)}\n"
ssml += f"{indent}</voice>\n"
return f"<speak version='0.1'>\n{ssml}</speak>"
# 原封不动输出回去是为了触发 loading 效果
return msg, spk, style, f"<speak version='0.1'>\n{ssml}</speak>"


def create_ssml_podcast_tab(ssml_input: gr.Textbox, tabs1: gr.Tabs, tabs2: gr.Tabs):
Expand Down Expand Up @@ -163,11 +164,14 @@ def clear_message():
columns=["index", "speaker", "text", "style"],
)

def send_to_ssml(sheet: pd.DataFrame):
def send_to_ssml(msg, spk, style, sheet: pd.DataFrame):
if sheet.empty:
return gr.Error("Please add some text to the script table.")
ssml = merge_dataframe_to_ssml(sheet)
msg, spk, style, ssml = merge_dataframe_to_ssml(msg, spk, style, sheet)
return [
msg,
spk,
style,
gr.Textbox(value=ssml),
gr.Tabs(selected="ssml"),
gr.Tabs(selected="ssml.editor"),
Expand All @@ -194,6 +198,13 @@ def send_to_ssml(sheet: pd.DataFrame):
)
send_to_ssml_btn.click(
send_to_ssml,
inputs=[script_table],
outputs=[ssml_input, tabs1, tabs2],
inputs=[msg, spk_input_dropdown, style_input_dropdown, script_table],
outputs=[
msg,
spk_input_dropdown,
style_input_dropdown,
ssml_input,
tabs1,
tabs2,
],
)
11 changes: 9 additions & 2 deletions modules/webui/ssml/spliter_tab.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@ def merge_dataframe_to_ssml(dataframe, spk, style, seed):
ssml += ">\n"
ssml += f"{indent}{indent}{text_normalize(row.iloc[1])}\n"
ssml += f"{indent}</voice>\n"
return f"<speak version='0.1'>\n{ssml}</speak>"
# 原封不动输出回去是为了触发 loading 效果
return dataframe, spk, style, seed, f"<speak version='0.1'>\n{ssml}</speak>"


# 长文本处理
Expand Down Expand Up @@ -153,7 +154,13 @@ def create_spliter_tab(ssml_input, tabs1, tabs2):
style_input_dropdown,
infer_seed_input,
],
outputs=[ssml_input],
outputs=[
long_text_output,
spk_input_text,
style_input_dropdown,
infer_seed_input,
ssml_input,
],
)

def change_tab():
Expand Down
8 changes: 7 additions & 1 deletion modules/webui/ssml/ssml_tab.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,12 @@ def create_ssml_interface():
maximum=webui_config.max_batch_size,
step=1,
)

with gr.Group():
gr.Markdown("💪🏼Enhance")
enable_enhance = gr.Checkbox(value=True, label="Enable Enhance")
enable_de_noise = gr.Checkbox(value=False, label="Enable De-noise")

with gr.Group():
gr.Markdown("🎄Examples")
gr.Examples(
Expand All @@ -48,7 +54,7 @@ def create_ssml_interface():

ssml_button.click(
synthesize_ssml,
inputs=[ssml_input, batch_size_input],
inputs=[ssml_input, batch_size_input, enable_enhance, enable_de_noise],
outputs=ssml_output,
)

Expand Down
16 changes: 14 additions & 2 deletions modules/webui/webui_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,12 @@ def apply_audio_enhance(audio_data, sr, enable_denoise, enable_enhance):

@torch.inference_mode()
@spaces.GPU
def synthesize_ssml(ssml: str, batch_size=4):
def synthesize_ssml(
ssml: str,
batch_size=4,
enable_enhance=False,
enable_denoise=False,
):
try:
batch_size = int(batch_size)
except Exception:
Expand All @@ -130,7 +135,14 @@ def synthesize_ssml(ssml: str, batch_size=4):
audio_segments = synthesize.synthesize_segments(segments)
combined_audio = combine_audio_segments(audio_segments)

sr, audio_data = audio.pydub_to_np(combined_audio)
sr = combined_audio.frame_rate
audio_data, sr = apply_audio_enhance(
audio.audiosegment_to_librosawav(combined_audio),
sr,
enable_denoise,
enable_enhance,
)

# NOTE: 这里必须要加,不然 gradio 没法解析成 mp3 格式
audio_data = audio.audio_to_int16(audio_data)

Expand Down

0 comments on commit 5c2788e

Please sign in to comment.