🐛 fix webui

lenML · Jun 3, 2024 · b44156f · b44156f
1 parent 881b694
commit b44156f
Show file tree

Hide file tree

Showing 4 changed files with 95 additions and 58 deletions.
diff --git a/modules/api/utils.py b/modules/api/utils.py
@@ -74,7 +74,7 @@ def calc_spk_style(spk: str | int, style: str | int):
 
     if type(style) == int or type(style) == float:
         raise ParamsTypeError("The style parameter cannot be a number.")
-    elif type(style) == str:
+    elif type(style) == str and style != "":
         if style.isdigit():
             raise ParamsTypeError("The style parameter cannot be a number.")
         else:

diff --git a/modules/generate_audio.py b/modules/generate_audio.py
@@ -38,10 +38,10 @@ def generate_audio(
     if isinstance(spk, int):
         with SeedContext(spk):
             params_infer_code["spk_emb"] = chat_tts.sample_random_speaker()
-        logger.debug("spk", spk)
+        logger.debug(("spk", spk))
     elif isinstance(spk, Speaker):
         params_infer_code["spk_emb"] = spk.emb
-        logger.debug("spk", spk.name)
+        logger.debug(("spk", spk.name))
 
     logger.debug(
         {

diff --git a/modules/utils/normalization.py b/modules/utils/normalization.py
@@ -81,9 +81,10 @@ def replace(match):
 
     result = pattern.sub(replace, text)
 
-    if is_end:
-        # 加这个是为了防止吞字
-        result = ensure_suffix(result, "[v_break]", "。。。[v_break]。。。")
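+    # NOTE: disabled because appending the suffix introduces audible noise...
+    # if is_end:
+    # This was added to keep trailing characters from being swallowed
+    # result = ensure_suffix(result, "[uv_break]", "。。。[uv_break]。。。")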
 
     return result
 

diff --git a/webui.py b/webui.py
@@ -1,3 +1,8 @@
+import logging
+
+logging.basicConfig(level=logging.DEBUG)
+
+
 import gradio as gr
 import io
 
@@ -14,6 +19,7 @@
 from modules.utils.normalization import text_normalize
 from modules import refiner
 
+
 torch._dynamo.config.cache_size_limit = 64
 torch._dynamo.config.suppress_errors = True
 torch.set_float32_matmul_precision("high")
@@ -56,6 +62,9 @@ def tts_generate(
     prefix,
     style,
 ):
+    if style == "*auto":
+        style = None
+
     params = calc_spk_style(spk=spk, style=style)
 
     spk = params.get("spk", spk)
@@ -82,8 +91,8 @@ def tts_generate(
 
 
 @torch.inference_mode()
-def refine_text(text: str):
-    return refiner.refine_text(text)
+def refine_text(text: str, prompt: str):
+    return refiner.refine_text(text, prompt=prompt)
 
 
 def read_local_readme():
@@ -96,138 +105,138 @@ def read_local_readme():
 # 演示示例文本
 sample_texts = [
     {
-        "text": "天气预报显示，今天会有小雨，请大家出门时记得带伞。降温的天气也提醒我们要适时添衣保暖。",
+        "text": "天气预报显示，今天会有小雨，请大家出门时记得带伞。降温的天气也提醒我们要适时添衣保暖 [lbreak]",
     },
     {
-        "text": "公司的年度总结会议将在下周三举行，请各部门提前准备好相关材料，确保会议顺利进行。",
+        "text": "公司的年度总结会议将在下周三举行，请各部门提前准备好相关材料，确保会议顺利进行 [lbreak]",
     },
     {
-        "text": "今天的午餐菜单包括烤鸡、沙拉和蔬菜汤，大家可以根据自己的口味选择适合的菜品。",
+        "text": "今天的午餐菜单包括烤鸡、沙拉和蔬菜汤，大家可以根据自己的口味选择适合的菜品 [lbreak]",
     },
     {
-        "text": "请注意，电梯将在下午两点进行例行维护，预计需要一个小时的时间，请大家在此期间使用楼梯。",
+        "text": "请注意，电梯将在下午两点进行例行维护，预计需要一个小时的时间，请大家在此期间使用楼梯 [lbreak]",
     },
     {
-        "text": "图书馆新到了一批书籍，涵盖了文学、科学和历史等多个领域，欢迎大家前来借阅。",
+        "text": "图书馆新到了一批书籍，涵盖了文学、科学和历史等多个领域，欢迎大家前来借阅 [lbreak]",
     },
     {
-        "text": "电影中梁朝伟扮演的陈永仁的编号27149",
+        "text": "电影中梁朝伟扮演的陈永仁的编号27149 [lbreak]",
     },
     {
-        "text": "这块黄金重达324.75克",
+        "text": "这块黄金重达324.75克 [lbreak]",
     },
     {
-        "text": "我们班的最高总分为583分",
+        "text": "我们班的最高总分为583分 [lbreak]",
     },
     {
-        "text": "12~23",
+        "text": "12~23 [lbreak]",
     },
     {
-        "text": "-1.5~2",
+        "text": "-1.5~2 [lbreak]",
     },
     {
-        "text": "她出生于86年8月18日，她弟弟出生于1995年3月1日",
+        "text": "她出生于86年8月18日，她弟弟出生于1995年3月1日 [lbreak]",
     },
     {
-        "text": "等会请在12:05请通知我",
+        "text": "等会请在12:05请通知我 [lbreak]",
     },
     {
-        "text": "今天的最低气温达到-10°C",
+        "text": "今天的最低气温达到-10°C [lbreak]",
     },
     {
-        "text": "现场有7/12的观众投出了赞成票",
+        "text": "现场有7/12的观众投出了赞成票 [lbreak]",
     },
     {
-        "text": "明天有62％的概率降雨",
+        "text": "明天有62％的概率降雨 [lbreak]",
     },
     {
-        "text": "随便来几个价格12块5，34.5元，20.1万",
+        "text": "随便来几个价格12块5，34.5元，20.1万 [lbreak]",
     },
     {
-        "text": "这是固话0421-33441122",
+        "text": "这是固话0421-33441122 [lbreak]",
     },
     {
-        "text": "这是手机+86 18544139121",
+        "text": "这是手机+86 18544139121 [lbreak]",
     },
 ]
 
 ssml_example1 = """
 <speak version="0.1">
     <voice spk="Bob" style="narration-relaxed">
-        下面是一个 ChatTTS 用于合成多角色多情感的有声书示例
+        下面是一个 ChatTTS 用于合成多角色多情感的有声书示例[lbreak]
     </voice>
     <voice spk="Bob" style="narration-relaxed">
-        黛玉冷笑道：
+        黛玉冷笑道：[lbreak]
     </voice>
     <voice spk="female2" style="angry">
-        我说呢 [uv_break] ，亏了绊住，不然，早就飞起来了。
+        我说呢 [uv_break] ，亏了绊住，不然，早就飞起来了[lbreak]
     </voice>
     <voice spk="Bob" style="narration-relaxed">
-        宝玉道：
+        宝玉道：[lbreak]
     </voice>
     <voice spk="Alice" style="unfriendly">
-        “只许和你玩 [uv_break] ，替你解闷。不过偶然到他那里，就说这些闲话。”
+        “只许和你玩 [uv_break] ，替你解闷。不过偶然到他那里，就说这些闲话。”[lbreak]
     </voice>
     <voice spk="female2" style="angry">
-        “好没意思的话！[uv_break] 去不去，关我什么事儿？ 又没叫你替我解闷儿 [uv_break]，还许你不理我呢”
+        “好没意思的话！[uv_break] 去不去，关我什么事儿？ 又没叫你替我解闷儿 [uv_break]，还许你不理我呢” [lbreak]
     </voice>
     <voice spk="Bob" style="narration-relaxed">
-        说着，便赌气回房去了。
+        说着，便赌气回房去了 [lbreak]
     </voice>
 </speak>
 """
 ssml_example2 = """
 <speak version="0.1">
     <voice spk="Bob" style="narration-relaxed">
-        使用 prosody 控制生成文本的语速语调和音量，示例如下
+        使用 prosody 控制生成文本的语速语调和音量，示例如下 [lbreak]
 
         <prosody>
-            无任何限制将会继承父级voice配置进行生成
+            无任何限制将会继承父级voice配置进行生成 [lbreak]
         </prosody>
         <prosody rate="1.5">
-            设置 rate 大于1表示加速，小于1为减速
+            设置 rate 大于1表示加速，小于1为减速 [lbreak]
         </prosody>
         <prosody pitch="6">
-            设置 pitch 调整音调，设置为6表示提高6个半音
+            设置 pitch 调整音调，设置为6表示提高6个半音 [lbreak]
         </prosody>
         <prosody volume="2">
-            设置 volume 调整音量，设置为2表示提高2个分贝
+            设置 volume 调整音量，设置为2表示提高2个分贝 [lbreak]
         </prosody>
 
-        在 voice 中无prosody包裹的文本即为默认生成状态下的语音
+        在 voice 中无prosody包裹的文本即为默认生成状态下的语音 [lbreak]
     </voice>
 </speak>
 """
 ssml_example3 = """
 <speak version="0.1">
     <voice spk="Bob" style="narration-relaxed">
-        使用 break 标签将会简单的
+        使用 break 标签将会简单的 [lbreak]
         
         <break time="500" />
 
-        插入一段空白到生成结果中 
+        插入一段空白到生成结果中 [lbreak]
     </voice>
 </speak>
 """
 
 ssml_example4 = """
 <speak version="0.1">
     <voice spk="Bob" style="excited">
-        temperature for sampling (may be overridden by style or speaker)
+        temperature for sampling (may be overridden by style or speaker) [lbreak]
         <break time="500" />
-        温度值用于采样，这个值有可能被 style 或者 speaker 覆盖 
+        温度值用于采样，这个值有可能被 style 或者 speaker 覆盖  [lbreak]
         <break time="500" />
-        temperature for sampling ，这个值有可能被 style 或者 speaker 覆盖 
+        temperature for sampling ，这个值有可能被 style 或者 speaker 覆盖  [lbreak]
         <break time="500" />
-        温度值用于采样，(may be overridden by style or speaker)
+        温度值用于采样，(may be overridden by style or speaker) [lbreak]
     </voice>
 </speak>
 """
 
 default_ssml = """
 <speak version="0.1">
   <voice spk="Bob" seed="-1" style="narration-relaxed">
-    这里是一个简单的 SSML 示例。 
+    这里是一个简单的 SSML 示例 [lbreak] 
   </voice>
 </speak>
 """
@@ -251,6 +260,15 @@ def create_interface():
     """
 
     with gr.Blocks(js=js_func) as demo:
+        css = """
+        <style>
+        .big-button {
+            height: 80px;
+        }
+        </style>
+        """
+
+        gr.HTML(css)
         with gr.Tabs():
             with gr.TabItem("TTS"):
                 with gr.Row():
@@ -278,20 +296,12 @@ def create_interface():
                             )
 
                         with gr.Row():
-                            style_input_text = gr.Textbox(
-                                label="Style (Text or Seed)", value="-1"
-                            )
                             style_input_dropdown = gr.Dropdown(
                                 choices=styles,
                                 label="Choose Style",
                                 interactive=True,
                                 value="*auto",
                             )
-                            style_input_dropdown.change(
-                                fn=lambda x: x.startswith("*") and "-1" or x,
-                                inputs=[style_input_dropdown],
-                                outputs=[style_input_text],
-                            )
                         infer_seed_input = gr.Number(value=-1, label="Inference Seed")
                         use_decoder_input = gr.Checkbox(value=True, label="Use Decoder")
                         prompt1_input = gr.Textbox(label="Prompt 1")
@@ -305,9 +315,35 @@ def create_interface():
                                     lines=10,
                                     placeholder="输入文本或选择示例",
                                 )
+                                with gr.Row():
+                                    contorl_tokens = [
+                                        "[laugh]",
+                                        "[uv_break]",
+                                        "[v_break]",
+                                        "[lbreak]",
+                                    ]
+
+                                    for tk in contorl_tokens:
+                                        t_btn = gr.Button(tk)
+                                        t_btn.click(
+                                            lambda text, tk=tk: text + " " + tk,
+                                            inputs=[text_input],
+                                            outputs=[text_input],
+                                        )
                             with gr.Column(scale=1):
+                                refine_prompt_input = gr.Textbox(
+                                    label="Refine Prompt",
+                                    value="[oral_2][laugh_0][break_6]",
+                                )
                                 refine_button = gr.Button("✍️Refine Text")
-                                tts_button = gr.Button("🔊Generate Audio")
+                                # TODO: split the text into sentences, assemble them into SSML using the current settings, then send the result to the SSML tab
+                                # send_button = gr.Button("📩Split and send to SSML")
+
+                                tts_button = gr.Button(
+                                    "🔊Generate Audio",
+                                    variant="primary",
+                                    elem_classes="big-button",
+                                )
 
                         sample_dropdown = gr.Dropdown(
                             choices=[sample["text"] for sample in sample_texts],
@@ -325,7 +361,7 @@ def create_interface():
 
                 refine_button.click(
                     refine_text,
-                    inputs=[text_input],
+                    inputs=[text_input, refine_prompt_input],
                     outputs=[text_input],
                 )
 
@@ -342,7 +378,7 @@ def create_interface():
                         prompt1_input,
                         prompt2_input,
                         prefix_input,
-                        style_input_text,
+                        style_input_dropdown,
                     ],
                     outputs=tts_output,
                 )
@@ -353,7 +389,7 @@ def create_interface():
                     lines=10,
                     value=default_ssml,
                 )
-                ssml_button = gr.Button("🔊Synthesize SSML")
+                ssml_button = gr.Button("🔊Synthesize SSML", variant="primary")
                 ssml_output = gr.Audio(label="Generated Audio")
 
                 ssml_button.click(