Skip to content

Commit

Permalink
🐛 fix webui
Browse files: browse the repository at this point in the history
  • Loading branch information
zhzLuke96 committed Jun 3, 2024
1 parent 881b694 commit b44156f
Show file tree
Hide file tree
Showing 4 changed files with 95 additions and 58 deletions.
2 changes: 1 addition & 1 deletion modules/api/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def calc_spk_style(spk: str | int, style: str | int):

if type(style) == int or type(style) == float:
raise ParamsTypeError("The style parameter cannot be a number.")
elif type(style) == str:
elif type(style) == str and style != "":
if style.isdigit():
raise ParamsTypeError("The style parameter cannot be a number.")
else:
Expand Down
4 changes: 2 additions & 2 deletions modules/generate_audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,10 @@ def generate_audio(
if isinstance(spk, int):
with SeedContext(spk):
params_infer_code["spk_emb"] = chat_tts.sample_random_speaker()
logger.debug("spk", spk)
logger.debug(("spk", spk))
elif isinstance(spk, Speaker):
params_infer_code["spk_emb"] = spk.emb
logger.debug("spk", spk.name)
logger.debug(("spk", spk.name))

logger.debug(
{
Expand Down
7 changes: 4 additions & 3 deletions modules/utils/normalization.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,9 +81,10 @@ def replace(match):

result = pattern.sub(replace, text)

if is_end:
# 加这个是为了防止吞字
result = ensure_suffix(result, "[v_break]", "。。。[v_break]。。。")
# NOTE: 加了会有杂音...
# if is_end:
# 加这个是为了防止吞字
# result = ensure_suffix(result, "[uv_break]", "。。。[uv_break]。。。")

return result

Expand Down
140 changes: 88 additions & 52 deletions webui.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
import logging

logging.basicConfig(level=logging.DEBUG)


import gradio as gr
import io

Expand All @@ -14,6 +19,7 @@
from modules.utils.normalization import text_normalize
from modules import refiner


torch._dynamo.config.cache_size_limit = 64
torch._dynamo.config.suppress_errors = True
torch.set_float32_matmul_precision("high")
Expand Down Expand Up @@ -56,6 +62,9 @@ def tts_generate(
prefix,
style,
):
if style == "*auto":
style = None

params = calc_spk_style(spk=spk, style=style)

spk = params.get("spk", spk)
Expand All @@ -82,8 +91,8 @@ def tts_generate(


@torch.inference_mode()
def refine_text(text: str):
return refiner.refine_text(text)
def refine_text(text: str, prompt: str):
return refiner.refine_text(text, prompt=prompt)


def read_local_readme():
Expand All @@ -96,138 +105,138 @@ def read_local_readme():
# 演示示例文本
sample_texts = [
{
"text": "天气预报显示,今天会有小雨,请大家出门时记得带伞。降温的天气也提醒我们要适时添衣保暖",
"text": "天气预报显示,今天会有小雨,请大家出门时记得带伞。降温的天气也提醒我们要适时添衣保暖 [lbreak]",
},
{
"text": "公司的年度总结会议将在下周三举行,请各部门提前准备好相关材料,确保会议顺利进行",
"text": "公司的年度总结会议将在下周三举行,请各部门提前准备好相关材料,确保会议顺利进行 [lbreak]",
},
{
"text": "今天的午餐菜单包括烤鸡、沙拉和蔬菜汤,大家可以根据自己的口味选择适合的菜品",
"text": "今天的午餐菜单包括烤鸡、沙拉和蔬菜汤,大家可以根据自己的口味选择适合的菜品 [lbreak]",
},
{
"text": "请注意,电梯将在下午两点进行例行维护,预计需要一个小时的时间,请大家在此期间使用楼梯",
"text": "请注意,电梯将在下午两点进行例行维护,预计需要一个小时的时间,请大家在此期间使用楼梯 [lbreak]",
},
{
"text": "图书馆新到了一批书籍,涵盖了文学、科学和历史等多个领域,欢迎大家前来借阅",
"text": "图书馆新到了一批书籍,涵盖了文学、科学和历史等多个领域,欢迎大家前来借阅 [lbreak]",
},
{
"text": "电影中梁朝伟扮演的陈永仁的编号27149",
"text": "电影中梁朝伟扮演的陈永仁的编号27149 [lbreak]",
},
{
"text": "这块黄金重达324.75克",
"text": "这块黄金重达324.75克 [lbreak]",
},
{
"text": "我们班的最高总分为583分",
"text": "我们班的最高总分为583分 [lbreak]",
},
{
"text": "12~23",
"text": "12~23 [lbreak]",
},
{
"text": "-1.5~2",
"text": "-1.5~2 [lbreak]",
},
{
"text": "她出生于86年8月18日,她弟弟出生于1995年3月1日",
"text": "她出生于86年8月18日,她弟弟出生于1995年3月1日 [lbreak]",
},
{
"text": "等会请在12:05请通知我",
"text": "等会请在12:05请通知我 [lbreak]",
},
{
"text": "今天的最低气温达到-10°C",
"text": "今天的最低气温达到-10°C [lbreak]",
},
{
"text": "现场有7/12的观众投出了赞成票",
"text": "现场有7/12的观众投出了赞成票 [lbreak]",
},
{
"text": "明天有62%的概率降雨",
"text": "明天有62%的概率降雨 [lbreak]",
},
{
"text": "随便来几个价格12块5,34.5元,20.1万",
"text": "随便来几个价格12块5,34.5元,20.1万 [lbreak]",
},
{
"text": "这是固话0421-33441122",
"text": "这是固话0421-33441122 [lbreak]",
},
{
"text": "这是手机+86 18544139121",
"text": "这是手机+86 18544139121 [lbreak]",
},
]

ssml_example1 = """
<speak version="0.1">
<voice spk="Bob" style="narration-relaxed">
下面是一个 ChatTTS 用于合成多角色多情感的有声书示例
下面是一个 ChatTTS 用于合成多角色多情感的有声书示例[lbreak]
</voice>
<voice spk="Bob" style="narration-relaxed">
黛玉冷笑道:
黛玉冷笑道:[lbreak]
</voice>
<voice spk="female2" style="angry">
我说呢 [uv_break] ,亏了绊住,不然,早就飞起来了
我说呢 [uv_break] ,亏了绊住,不然,早就飞起来了[lbreak]
</voice>
<voice spk="Bob" style="narration-relaxed">
宝玉道:
宝玉道:[lbreak]
</voice>
<voice spk="Alice" style="unfriendly">
“只许和你玩 [uv_break] ,替你解闷。不过偶然到他那里,就说这些闲话。”
“只许和你玩 [uv_break] ,替你解闷。不过偶然到他那里,就说这些闲话。”[lbreak]
</voice>
<voice spk="female2" style="angry">
“好没意思的话![uv_break] 去不去,关我什么事儿? 又没叫你替我解闷儿 [uv_break],还许你不理我呢”
“好没意思的话![uv_break] 去不去,关我什么事儿? 又没叫你替我解闷儿 [uv_break],还许你不理我呢” [lbreak]
</voice>
<voice spk="Bob" style="narration-relaxed">
说着,便赌气回房去了
说着,便赌气回房去了 [lbreak]
</voice>
</speak>
"""
ssml_example2 = """
<speak version="0.1">
<voice spk="Bob" style="narration-relaxed">
使用 prosody 控制生成文本的语速语调和音量,示例如下
使用 prosody 控制生成文本的语速语调和音量,示例如下 [lbreak]
<prosody>
无任何限制将会继承父级voice配置进行生成
无任何限制将会继承父级voice配置进行生成 [lbreak]
</prosody>
<prosody rate="1.5">
设置 rate 大于1表示加速,小于1为减速
设置 rate 大于1表示加速,小于1为减速 [lbreak]
</prosody>
<prosody pitch="6">
设置 pitch 调整音调,设置为6表示提高6个半音
设置 pitch 调整音调,设置为6表示提高6个半音 [lbreak]
</prosody>
<prosody volume="2">
设置 volume 调整音量,设置为2表示提高2个分贝
设置 volume 调整音量,设置为2表示提高2个分贝 [lbreak]
</prosody>
在 voice 中无prosody包裹的文本即为默认生成状态下的语音
在 voice 中无prosody包裹的文本即为默认生成状态下的语音 [lbreak]
</voice>
</speak>
"""
ssml_example3 = """
<speak version="0.1">
<voice spk="Bob" style="narration-relaxed">
使用 break 标签将会简单的
使用 break 标签将会简单的 [lbreak]
<break time="500" />
插入一段空白到生成结果中
插入一段空白到生成结果中 [lbreak]
</voice>
</speak>
"""

ssml_example4 = """
<speak version="0.1">
<voice spk="Bob" style="excited">
temperature for sampling (may be overridden by style or speaker)
temperature for sampling (may be overridden by style or speaker) [lbreak]
<break time="500" />
温度值用于采样,这个值有可能被 style 或者 speaker 覆盖
温度值用于采样,这个值有可能被 style 或者 speaker 覆盖 [lbreak]
<break time="500" />
temperature for sampling ,这个值有可能被 style 或者 speaker 覆盖
temperature for sampling ,这个值有可能被 style 或者 speaker 覆盖 [lbreak]
<break time="500" />
温度值用于采样,(may be overridden by style or speaker)
温度值用于采样,(may be overridden by style or speaker) [lbreak]
</voice>
</speak>
"""

default_ssml = """
<speak version="0.1">
<voice spk="Bob" seed="-1" style="narration-relaxed">
这里是一个简单的 SSML 示例
这里是一个简单的 SSML 示例 [lbreak]
</voice>
</speak>
"""
Expand All @@ -251,6 +260,15 @@ def create_interface():
"""

with gr.Blocks(js=js_func) as demo:
css = """
<style>
.big-button {
height: 80px;
}
</style>
"""

gr.HTML(css)
with gr.Tabs():
with gr.TabItem("TTS"):
with gr.Row():
Expand Down Expand Up @@ -278,20 +296,12 @@ def create_interface():
)

with gr.Row():
style_input_text = gr.Textbox(
label="Style (Text or Seed)", value="-1"
)
style_input_dropdown = gr.Dropdown(
choices=styles,
label="Choose Style",
interactive=True,
value="*auto",
)
style_input_dropdown.change(
fn=lambda x: x.startswith("*") and "-1" or x,
inputs=[style_input_dropdown],
outputs=[style_input_text],
)
infer_seed_input = gr.Number(value=-1, label="Inference Seed")
use_decoder_input = gr.Checkbox(value=True, label="Use Decoder")
prompt1_input = gr.Textbox(label="Prompt 1")
Expand All @@ -305,9 +315,35 @@ def create_interface():
lines=10,
placeholder="输入文本或选择示例",
)
with gr.Row():
contorl_tokens = [
"[laugh]",
"[uv_break]",
"[v_break]",
"[lbreak]",
]

for tk in contorl_tokens:
t_btn = gr.Button(tk)
t_btn.click(
lambda text, tk=tk: text + " " + tk,
inputs=[text_input],
outputs=[text_input],
)
with gr.Column(scale=1):
refine_prompt_input = gr.Textbox(
label="Refine Prompt",
value="[oral_2][laugh_0][break_6]",
)
refine_button = gr.Button("✍️Refine Text")
tts_button = gr.Button("🔊Generate Audio")
# TODO 分割句子,使用当前配置拼接为SSML,然后发送到SSML tab
# send_button = gr.Button("📩Split and send to SSML")

tts_button = gr.Button(
"🔊Generate Audio",
variant="primary",
elem_classes="big-button",
)

sample_dropdown = gr.Dropdown(
choices=[sample["text"] for sample in sample_texts],
Expand All @@ -325,7 +361,7 @@ def create_interface():

refine_button.click(
refine_text,
inputs=[text_input],
inputs=[text_input, refine_prompt_input],
outputs=[text_input],
)

Expand All @@ -342,7 +378,7 @@ def create_interface():
prompt1_input,
prompt2_input,
prefix_input,
style_input_text,
style_input_dropdown,
],
outputs=tts_output,
)
Expand All @@ -353,7 +389,7 @@ def create_interface():
lines=10,
value=default_ssml,
)
ssml_button = gr.Button("🔊Synthesize SSML")
ssml_button = gr.Button("🔊Synthesize SSML", variant="primary")
ssml_output = gr.Audio(label="Generated Audio")

ssml_button.click(
Expand Down

0 comments on commit b44156f

Please sign in to comment.