diff --git a/modules/denoise.py b/modules/denoise.py
new file mode 100644
index 0000000..1025daf
--- /dev/null
+++ b/modules/denoise.py
@@ -0,0 +1,7 @@
+from audio_denoiser.AudioDenoiser import AudioDenoiser
+import torch
+import torchaudio
+
+
+class TTSAudioDenoiser:
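+    # Placeholder: intended to wrap audio_denoiser.AudioDenoiser for denoising synthesized speech; not implemented yet.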
+ pass
diff --git a/modules/hf.py b/modules/hf.py
new file mode 100644
index 0000000..f1ea400
--- /dev/null
+++ b/modules/hf.py
@@ -0,0 +1,14 @@
+# Compatibility shim for Hugging Face Spaces: fall back to a no-op GPU decorator when the spaces package is unavailable.
+
+try:
+ import spaces
+except ImportError:
+
+ class NoneSpaces:
+ def __init__(self):
+ pass
+
+ def GPU(self, fn):
+ return fn
+
+ spaces = NoneSpaces()
diff --git a/modules/webui/__init__.py b/modules/webui/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/modules/webui/app.py b/modules/webui/app.py
new file mode 100644
index 0000000..3d0439b
--- /dev/null
+++ b/modules/webui/app.py
@@ -0,0 +1,110 @@
+import logging
+import os
+
+import torch
+import gradio as gr
+
+from modules import config
+
+from modules.webui.tts_tab import create_tts_interface
+from modules.webui.ssml_tab import create_ssml_interface
+from modules.webui.spliter_tab import create_spliter_tab
+from modules.webui.speaker_tab import create_speaker_panel
+from modules.webui.readme_tab import create_readme_tab
+
+logger = logging.getLogger(__name__)
+
+logging.basicConfig(
+ level=os.getenv("LOG_LEVEL", "INFO"),
+ format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+)
+
+
+def webui_init():
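+    # One-time process setup before the Gradio app is launched.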
+    # fix: when a system proxy is enabled on Windows, requests to these local hosts must bypass it
+ os.environ["NO_PROXY"] = "localhost,127.0.0.1,0.0.0.0"
+
+ torch._dynamo.config.cache_size_limit = 64
+ torch._dynamo.config.suppress_errors = True
+ torch.set_float32_matmul_precision("high")
+
+ logger.info("WebUI module initialized")
+
+
+def create_app_footer():
+ gradio_version = gr.__version__
+ git_tag = config.versions.git_tag
+ git_commit = config.versions.git_commit
+ git_branch = config.versions.git_branch
+ python_version = config.versions.python_version
+ torch_version = config.versions.torch_version
+
+ config.versions.gradio_version = gradio_version
+
+ gr.Markdown(
+ f"""
+🍦 [ChatTTS-Forge](https://github.com/lenML/ChatTTS-Forge)
+version: [{git_tag}](https://github.com/lenML/ChatTTS-Forge/commit/{git_commit}) | branch: `{git_branch}` | python: `{python_version}` | torch: `{torch_version}`
+ """
+ )
+
+
+def create_interface():
+
+ js_func = """
+ function refresh() {
+ const url = new URL(window.location);
+
+ if (url.searchParams.get('__theme') !== 'dark') {
+ url.searchParams.set('__theme', 'dark');
+ window.location.href = url.href;
+ }
+ }
+ """
+
+ head_js = """
+
+ """
+
+ with gr.Blocks(js=js_func, head=head_js, title="ChatTTS Forge WebUI") as demo:
+ css = """
+
+ """
+
+ gr.HTML(css)
+ with gr.Tabs() as tabs:
+ with gr.TabItem("TTS"):
+ create_tts_interface()
+
+ with gr.TabItem("SSML", id="ssml"):
+ ssml_input = create_ssml_interface()
+
+ with gr.TabItem("Spilter"):
+ create_spliter_tab(ssml_input, tabs=tabs)
+
+ if config.runtime_env_vars.webui_experimental:
+ with gr.TabItem("Speaker"):
+ create_speaker_panel()
+ with gr.TabItem("Denoise"):
+ gr.Markdown("🚧 Under construction")
+ with gr.TabItem("Inpainting"):
+ gr.Markdown("🚧 Under construction")
+ with gr.TabItem("ASR"):
+ gr.Markdown("🚧 Under construction")
+
+ with gr.TabItem("README"):
+ create_readme_tab()
+
+ create_app_footer()
+ return demo
diff --git a/modules/webui/asr_tab.py b/modules/webui/asr_tab.py
new file mode 100644
index 0000000..e69de29
diff --git a/modules/webui/denoise_tab.py b/modules/webui/denoise_tab.py
new file mode 100644
index 0000000..bf01965
--- /dev/null
+++ b/modules/webui/denoise_tab.py
@@ -0,0 +1,5 @@
+import gradio as gr
+
+
+def create_denoise_tab():
+ pass
diff --git a/modules/webui/examples.py b/modules/webui/examples.py
new file mode 100644
index 0000000..eefc860
--- /dev/null
+++ b/modules/webui/examples.py
@@ -0,0 +1,164 @@
+example_texts = [
+ {
+ "text": "大🍌,一条大🍌,嘿,你的感觉真的很奇妙 [lbreak]",
+ },
+ {"text": "Big 🍌, a big 🍌, hey, your feeling is really wonderful [lbreak]"},
+ {
+ "text": """
+# 这是 markdown 标题
+
+```
+代码块将跳过
+```
+
+- **文本标准化**:
+ - **Markdown**: 自动检测处理 markdown 格式文本。
+ - **数字转写**: 自动将数字转为模型可识别的文本。
+ - **Emoji 适配**: 自动翻译 emoji 为可读文本。
+ - **基于分词器**: 基于 tokenizer 预处理文本,覆盖模型所有不支持字符范围。
+ - **中英文识别**: 适配英文环境。
+ """
+ },
+ {
+ "text": "天气预报显示,今天会有小雨,请大家出门时记得带伞。降温的天气也提醒我们要适时添衣保暖 [lbreak]",
+ },
+ {
+ "text": "公司的年度总结会议将在下周三举行,请各部门提前准备好相关材料,确保会议顺利进行 [lbreak]",
+ },
+ {
+ "text": "今天的午餐菜单包括烤鸡、沙拉和蔬菜汤,大家可以根据自己的口味选择适合的菜品 [lbreak]",
+ },
+ {
+ "text": "请注意,电梯将在下午两点进行例行维护,预计需要一个小时的时间,请大家在此期间使用楼梯 [lbreak]",
+ },
+ {
+ "text": "图书馆新到了一批书籍,涵盖了文学、科学和历史等多个领域,欢迎大家前来借阅 [lbreak]",
+ },
+ {
+ "text": "电影中梁朝伟扮演的陈永仁的编号27149 [lbreak]",
+ },
+ {
+ "text": "这块黄金重达324.75克 [lbreak]",
+ },
+ {
+ "text": "我们班的最高总分为583分 [lbreak]",
+ },
+ {
+ "text": "12~23 [lbreak]",
+ },
+ {
+ "text": "-1.5~2 [lbreak]",
+ },
+ {
+ "text": "她出生于86年8月18日,她弟弟出生于1995年3月1日 [lbreak]",
+ },
+ {
+ "text": "等会请在12:05请通知我 [lbreak]",
+ },
+ {
+ "text": "今天的最低气温达到-10°C [lbreak]",
+ },
+ {
+ "text": "现场有7/12的观众投出了赞成票 [lbreak]",
+ },
+ {
+ "text": "明天有62%的概率降雨 [lbreak]",
+ },
+ {
+ "text": "随便来几个价格12块5,34.5元,20.1万 [lbreak]",
+ },
+ {
+ "text": "这是固话0421-33441122 [lbreak]",
+ },
+ {
+ "text": "这是手机+86 18544139121 [lbreak]",
+ },
+]
+
+ssml_example1 = """
+
+
+ 下面是一个 ChatTTS 用于合成多角色多情感的有声书示例[lbreak]
+
+
+ 黛玉冷笑道:[lbreak]
+
+
+ 我说呢 [uv_break] ,亏了绊住,不然,早就飞起来了[lbreak]
+
+
+ 宝玉道:[lbreak]
+
+
+ “只许和你玩 [uv_break] ,替你解闷。不过偶然到他那里,就说这些闲话。”[lbreak]
+
+
+ “好没意思的话![uv_break] 去不去,关我什么事儿? 又没叫你替我解闷儿 [uv_break],还许你不理我呢” [lbreak]
+
+
+ 说着,便赌气回房去了 [lbreak]
+
+
+"""
+ssml_example2 = """
+
+
+ 使用 prosody 控制生成文本的语速语调和音量,示例如下 [lbreak]
+
+
+ 无任何限制将会继承父级voice配置进行生成 [lbreak]
+
+
+ 设置 rate 大于1表示加速,小于1为减速 [lbreak]
+
+
+ 设置 pitch 调整音调,设置为6表示提高6个半音 [lbreak]
+
+
+ 设置 volume 调整音量,设置为2表示提高2个分贝 [lbreak]
+
+
+ 在 voice 中无prosody包裹的文本即为默认生成状态下的语音 [lbreak]
+
+
+"""
+ssml_example3 = """
+
+
+ 使用 break 标签将会简单的 [lbreak]
+
+
+
+ 插入一段空白到生成结果中 [lbreak]
+
+
+"""
+
+ssml_example4 = """
+
+
+ temperature for sampling (may be overridden by style or speaker) [lbreak]
+
+ 温度值用于采样,这个值有可能被 style 或者 speaker 覆盖 [lbreak]
+
+ temperature for sampling ,这个值有可能被 style 或者 speaker 覆盖 [lbreak]
+
+ 温度值用于采样,(may be overridden by style or speaker) [lbreak]
+
+
+"""
+
+ssml_examples = [
+ ssml_example1,
+ ssml_example2,
+ ssml_example3,
+ ssml_example4,
+]
+
+default_ssml = """
+
+
+ 这里是一个简单的 SSML 示例 [lbreak]
+
+
+"""
diff --git a/modules/webui/readme_tab.py b/modules/webui/readme_tab.py
new file mode 100644
index 0000000..cb933ba
--- /dev/null
+++ b/modules/webui/readme_tab.py
@@ -0,0 +1,13 @@
+import gradio as gr
+
+
+def read_local_readme():
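+    # Return README.md starting from its first Markdown heading, dropping any preamble before it.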
+ with open("README.md", "r", encoding="utf-8") as file:
+ content = file.read()
+ content = content[content.index("# ") :]
+ return content
+
+
+def create_readme_tab():
+ readme_content = read_local_readme()
+ gr.Markdown(readme_content)
diff --git a/modules/webui/speaker_tab.py b/modules/webui/speaker_tab.py
new file mode 100644
index 0000000..df306f2
--- /dev/null
+++ b/modules/webui/speaker_tab.py
@@ -0,0 +1,13 @@
+import gradio as gr
+
+from modules.webui.webui_utils import get_speakers
+
+
+# Show four pickers (a b c d); one or more speakers can be selected, auditioned, and exported
+def create_speaker_panel():
+ speakers = get_speakers()
+
+ def get_speaker_show_name(spk):
+ pass
+
+ gr.Markdown("🚧 Under construction")
diff --git a/modules/webui/spliter_tab.py b/modules/webui/spliter_tab.py
new file mode 100644
index 0000000..0fedbcb
--- /dev/null
+++ b/modules/webui/spliter_tab.py
@@ -0,0 +1,168 @@
+import gradio as gr
+import torch
+from modules.normalization import text_normalize
+from modules.webui.webui_utils import (
+ get_speakers,
+ get_styles,
+ split_long_text,
+ synthesize_ssml,
+)
+from modules.webui import webui_config
+from modules.webui.examples import ssml_examples, default_ssml
+
+
+def merge_dataframe_to_ssml(dataframe, spk, style, seed):
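+    # Build a <speak> document with one <voice> element per DataFrame row, carrying spk/style/seed attributes when set.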
+ if style == "*auto":
+ style = None
+ if spk == "-1" or spk == -1:
+ spk = None
+ if seed == -1 or seed == "-1":
+ seed = None
+
+ ssml = ""
+ indent = " " * 2
+
+ for i, row in dataframe.iterrows():
+ ssml += f"{indent}\n"
+ ssml += f"{indent}{indent}{text_normalize(row[1])}\n"
+ ssml += f"{indent}\n"
+ return f"\n{ssml}"
+
+
+# Long text handling
+# Accepts long text, splits it with the chosen method, and the stitched SSML can be sent to the SSML tab
+# Splits on the full stop (。); the resulting sentences are shown in the data table
+def create_spliter_tab(ssml_input, tabs):
+ speakers = get_speakers()
+
+ def get_speaker_show_name(spk):
+ if spk.gender == "*" or spk.gender == "":
+ return spk.name
+ return f"{spk.gender} : {spk.name}"
+
+ speaker_names = ["*random"] + [
+ get_speaker_show_name(speaker) for speaker in speakers
+ ]
+
+ styles = ["*auto"] + [s.get("name") for s in get_styles()]
+
+ with gr.Row():
+ with gr.Column(scale=1):
+            # pick speaker, style and seed
+ with gr.Group():
+ gr.Markdown("🗣️Speaker")
+ spk_input_text = gr.Textbox(
+ label="Speaker (Text or Seed)",
+ value="female2",
+ show_label=False,
+ )
+ spk_input_dropdown = gr.Dropdown(
+ choices=speaker_names,
+ interactive=True,
+ value="female : female2",
+ show_label=False,
+ )
+ spk_rand_button = gr.Button(
+ value="🎲",
+ variant="secondary",
+ )
+ with gr.Group():
+ gr.Markdown("🎭Style")
+ style_input_dropdown = gr.Dropdown(
+ choices=styles,
+ interactive=True,
+ show_label=False,
+ value="*auto",
+ )
+ with gr.Group():
+ gr.Markdown("🗣️Seed")
+ infer_seed_input = gr.Number(
+ value=42,
+ label="Inference Seed",
+ show_label=False,
+ minimum=-1,
+ maximum=2**32 - 1,
+ )
+ infer_seed_rand_button = gr.Button(
+ value="🎲",
+ variant="secondary",
+ )
+
+ send_btn = gr.Button("📩Send to SSML", variant="primary")
+
+ with gr.Column(scale=3):
+ with gr.Group():
+ gr.Markdown("📝Long Text Input")
+ gr.Markdown("- 此页面用于处理超长文本")
+ gr.Markdown("- 切割后,可以选择说话人、风格、seed,然后发送到SSML")
+ long_text_input = gr.Textbox(
+ label="Long Text Input",
+ lines=10,
+ placeholder="输入长文本",
+ elem_id="long-text-input",
+ show_label=False,
+ )
+ long_text_split_button = gr.Button("🔪Split Text")
+
+ with gr.Row():
+ with gr.Column(scale=3):
+ with gr.Group():
+ gr.Markdown("🎨Output")
+ long_text_output = gr.DataFrame(
+ headers=["index", "text", "length"],
+ datatype=["number", "str", "number"],
+ elem_id="long-text-output",
+ interactive=False,
+ wrap=True,
+ value=[],
+ )
+
+ spk_input_dropdown.change(
+        fn=lambda x: "-1" if x.startswith("*") else x.split(":")[-1].strip(),
+ inputs=[spk_input_dropdown],
+ outputs=[spk_input_text],
+ )
+ spk_rand_button.click(
+ lambda x: int(torch.randint(0, 2**32 - 1, (1,)).item()),
+ inputs=[spk_input_text],
+ outputs=[spk_input_text],
+ )
+ infer_seed_rand_button.click(
+ lambda x: int(torch.randint(0, 2**32 - 1, (1,)).item()),
+ inputs=[infer_seed_input],
+ outputs=[infer_seed_input],
+ )
+ long_text_split_button.click(
+ split_long_text,
+ inputs=[long_text_input],
+ outputs=[long_text_output],
+ )
+
+ send_btn.click(
+ merge_dataframe_to_ssml,
+ inputs=[
+ long_text_output,
+ spk_input_text,
+ style_input_dropdown,
+ infer_seed_input,
+ ],
+ outputs=[ssml_input],
+ )
+
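+    # Returning gr.Tabs(selected="ssml") switches the active tab to the SSML tab after sending.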
+ def change_tab():
+ return gr.Tabs(selected="ssml")
+
+ send_btn.click(change_tab, inputs=[], outputs=[tabs])
diff --git a/modules/webui/ssml_tab.py b/modules/webui/ssml_tab.py
new file mode 100644
index 0000000..9979e25
--- /dev/null
+++ b/modules/webui/ssml_tab.py
@@ -0,0 +1,55 @@
+import gradio as gr
+from modules.webui.webui_utils import (
+ synthesize_ssml,
+)
+from modules.webui import webui_config
+from modules.webui.examples import ssml_examples, default_ssml
+
+
+def create_ssml_interface():
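+    # SSML tab: raw SSML input with examples, a batch-size slider, and a synthesize button wired to synthesize_ssml.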
+ with gr.Row():
+ with gr.Column(scale=3):
+ with gr.Group():
+ gr.Markdown("📝SSML Input")
+ gr.Markdown(f"- 最长{webui_config.ssml_max:,}字符,超过会被截断")
+ gr.Markdown("- 尽量保证使用相同的 seed")
+ gr.Markdown(
+ "- 关于SSML可以看这个 [文档](https://github.com/lenML/ChatTTS-Forge/blob/main/docs/SSML.md)"
+ )
+ ssml_input = gr.Textbox(
+ label="SSML Input",
+ lines=10,
+ value=default_ssml,
+ placeholder="输入 SSML 或选择示例",
+ elem_id="ssml_input",
+ show_label=False,
+ )
+ ssml_button = gr.Button("🔊Synthesize SSML", variant="primary")
+ with gr.Column(scale=1):
+ with gr.Group():
+                # parameters
+ gr.Markdown("🎛️Parameters")
+ # batch size
+ batch_size_input = gr.Slider(
+ label="Batch Size",
+ value=4,
+ minimum=1,
+ maximum=webui_config.max_batch_size,
+ step=1,
+ )
+ with gr.Group():
+ gr.Markdown("🎄Examples")
+ gr.Examples(
+ examples=ssml_examples,
+ inputs=[ssml_input],
+ )
+
+ ssml_output = gr.Audio(label="Generated Audio")
+
+ ssml_button.click(
+ synthesize_ssml,
+ inputs=[ssml_input, batch_size_input],
+ outputs=ssml_output,
+ )
+
+ return ssml_input
diff --git a/modules/webui/tts_tab.py b/modules/webui/tts_tab.py
new file mode 100644
index 0000000..b378ddf
--- /dev/null
+++ b/modules/webui/tts_tab.py
@@ -0,0 +1,248 @@
+import gradio as gr
+import torch
+from modules.webui.webui_utils import (
+ get_speakers,
+ get_styles,
+ refine_text,
+ tts_generate,
+)
+from modules.webui import webui_config
+from modules.webui.examples import example_texts
+from modules import config
+
+
+def create_tts_interface():
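+    # Main TTS tab: sampling sliders, style/speaker/seed pickers, prompt fields, text input with control tokens, a refiner, and the output audio.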
+ speakers = get_speakers()
+
+ def get_speaker_show_name(spk):
+ if spk.gender == "*" or spk.gender == "":
+ return spk.name
+ return f"{spk.gender} : {spk.name}"
+
+ speaker_names = ["*random"] + [
+ get_speaker_show_name(speaker) for speaker in speakers
+ ]
+
+ styles = ["*auto"] + [s.get("name") for s in get_styles()]
+
+ history = []
+
+ with gr.Row():
+ with gr.Column(scale=1):
+ with gr.Group():
+ gr.Markdown("🎛️Sampling")
+ temperature_input = gr.Slider(
+ 0.01, 2.0, value=0.3, step=0.01, label="Temperature"
+ )
+ top_p_input = gr.Slider(0.1, 1.0, value=0.7, step=0.1, label="Top P")
+ top_k_input = gr.Slider(1, 50, value=20, step=1, label="Top K")
+ batch_size_input = gr.Slider(
+ 1,
+ webui_config.max_batch_size,
+ value=4,
+ step=1,
+ label="Batch Size",
+ )
+
+ with gr.Row():
+ with gr.Group():
+ gr.Markdown("🎭Style")
+ gr.Markdown("- 后缀为 `_p` 表示带prompt,效果更强但是影响质量")
+ style_input_dropdown = gr.Dropdown(
+ choices=styles,
+ # label="Choose Style",
+ interactive=True,
+ show_label=False,
+ value="*auto",
+ )
+ with gr.Row():
+ with gr.Group():
+ gr.Markdown("🗣️Speaker")
+ with gr.Tabs():
+ with gr.Tab(label="Pick"):
+ spk_input_text = gr.Textbox(
+ label="Speaker (Text or Seed)",
+ value="female2",
+ show_label=False,
+ )
+ spk_input_dropdown = gr.Dropdown(
+ choices=speaker_names,
+ # label="Choose Speaker",
+ interactive=True,
+ value="female : female2",
+ show_label=False,
+ )
+ spk_rand_button = gr.Button(
+ value="🎲",
+ # tooltip="Random Seed",
+ variant="secondary",
+ )
+ spk_input_dropdown.change(
+                            fn=lambda x: (
+                                "-1" if x.startswith("*") else x.split(":")[-1].strip()
+                            ),
+ inputs=[spk_input_dropdown],
+ outputs=[spk_input_text],
+ )
+ spk_rand_button.click(
+ lambda x: str(torch.randint(0, 2**32 - 1, (1,)).item()),
+ inputs=[spk_input_text],
+ outputs=[spk_input_text],
+ )
+
+ if config.runtime_env_vars.webui_experimental:
+ with gr.Tab(label="Upload"):
+ spk_input_upload = gr.File(label="Speaker (Upload)")
+                            # TODO: load the uploaded speaker file
+ # spk_input_upload.change(
+ # fn=lambda x: x.read().decode("utf-8"),
+ # inputs=[spk_input_upload],
+ # outputs=[spk_input_text],
+ # )
+ with gr.Group():
+ gr.Markdown("💃Inference Seed")
+ infer_seed_input = gr.Number(
+ value=42,
+ label="Inference Seed",
+ show_label=False,
+ minimum=-1,
+ maximum=2**32 - 1,
+ )
+ infer_seed_rand_button = gr.Button(
+ value="🎲",
+ # tooltip="Random Seed",
+ variant="secondary",
+ )
+ use_decoder_input = gr.Checkbox(
+ value=True, label="Use Decoder", visible=False
+ )
+ with gr.Group():
+ gr.Markdown("🔧Prompt engineering")
+ prompt1_input = gr.Textbox(label="Prompt 1")
+ prompt2_input = gr.Textbox(label="Prompt 2")
+ prefix_input = gr.Textbox(label="Prefix")
+
+ if config.runtime_env_vars.webui_experimental:
+ prompt_audio = gr.File(label="prompt_audio")
+
+ infer_seed_rand_button.click(
+ lambda x: int(torch.randint(0, 2**32 - 1, (1,)).item()),
+ inputs=[infer_seed_input],
+ outputs=[infer_seed_input],
+ )
+ with gr.Column(scale=3):
+ with gr.Row():
+ with gr.Column(scale=4):
+ with gr.Group():
+ input_title = gr.Markdown(
+ "📝Text Input",
+ elem_id="input-title",
+ )
+ gr.Markdown(
+ f"- 字数限制{webui_config.tts_max:,}字,超过部分截断"
+ )
+ gr.Markdown("- 如果尾字吞字不读,可以试试结尾加上 `[lbreak]`")
+ gr.Markdown(
+ "- If the input text is all in English, it is recommended to check disable_normalize"
+ )
+ text_input = gr.Textbox(
+ show_label=False,
+ label="Text to Speech",
+ lines=10,
+ placeholder="输入文本或选择示例",
+ elem_id="text-input",
+ )
+                    # TODO: live character count; simple to implement, but it triggers the loading overlay and needs a backend round trip
+ # text_input.change(
+ # fn=lambda x: (
+ # f"📝Text Input ({len(x)} char)"
+ # if x
+ # else (
+ # "📝Text Input (0 char)"
+ # if not x
+ # else "📝Text Input (0 char)"
+ # )
+ # ),
+ # inputs=[text_input],
+ # outputs=[input_title],
+ # )
+ with gr.Row():
+                    control_tokens = [
+ "[laugh]",
+ "[uv_break]",
+ "[v_break]",
+ "[lbreak]",
+ ]
+
+                    for tk in control_tokens:
+ t_btn = gr.Button(tk)
+ t_btn.click(
+ lambda text, tk=tk: text + " " + tk,
+ inputs=[text_input],
+ outputs=[text_input],
+ )
+ with gr.Column(scale=1):
+ with gr.Group():
+ gr.Markdown("🎶Refiner")
+ refine_prompt_input = gr.Textbox(
+ label="Refine Prompt",
+ value="[oral_2][laugh_0][break_6]",
+ )
+ refine_button = gr.Button("✍️Refine Text")
+                    # TODO: split the text into sentences, stitch them into SSML with the current settings, then send to the SSML tab
+ # send_button = gr.Button("📩Split and send to SSML")
+
+ with gr.Group():
+ gr.Markdown("🔊Generate")
+ disable_normalize_input = gr.Checkbox(
+ value=False, label="Disable Normalize"
+ )
+ tts_button = gr.Button(
+ "🔊Generate Audio",
+ variant="primary",
+ elem_classes="big-button",
+ )
+
+ with gr.Group():
+ gr.Markdown("🎄Examples")
+ sample_dropdown = gr.Dropdown(
+ choices=[sample["text"] for sample in example_texts],
+ show_label=False,
+ value=None,
+ interactive=True,
+ )
+ sample_dropdown.change(
+ fn=lambda x: x,
+ inputs=[sample_dropdown],
+ outputs=[text_input],
+ )
+
+ with gr.Group():
+ gr.Markdown("🎨Output")
+ tts_output = gr.Audio(label="Generated Audio")
+
+ refine_button.click(
+ refine_text,
+ inputs=[text_input, refine_prompt_input],
+ outputs=[text_input],
+ )
+
+ tts_button.click(
+ tts_generate,
+ inputs=[
+ text_input,
+ temperature_input,
+ top_p_input,
+ top_k_input,
+ spk_input_text,
+ infer_seed_input,
+ use_decoder_input,
+ prompt1_input,
+ prompt2_input,
+ prefix_input,
+ style_input_dropdown,
+ disable_normalize_input,
+ batch_size_input,
+ ],
+ outputs=tts_output,
+ )
diff --git a/modules/webui/webui_config.py b/modules/webui/webui_config.py
new file mode 100644
index 0000000..64b64d7
--- /dev/null
+++ b/modules/webui/webui_config.py
@@ -0,0 +1,4 @@
+tts_max = 1000
+ssml_max = 1000
+spliter_threshold = 100
+max_batch_size = 8
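+# Defaults only: webui.py overrides these at startup (tts_max_len / ssml_max_len / max_batch_size arguments).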
diff --git a/modules/webui/webui_utils.py b/modules/webui/webui_utils.py
new file mode 100644
index 0000000..241d3de
--- /dev/null
+++ b/modules/webui/webui_utils.py
@@ -0,0 +1,169 @@
+import os
+import logging
+import sys
+
+import numpy as np
+
+from modules.devices import devices
+from modules.synthesize_audio import synthesize_audio
+from modules.hf import spaces
+from modules.webui import webui_config
+
+logging.basicConfig(
+ level=os.getenv("LOG_LEVEL", "INFO"),
+ format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+)
+
+
+import gradio as gr
+
+import torch
+
+from modules.ssml import parse_ssml
+from modules.SynthesizeSegments import SynthesizeSegments, combine_audio_segments
+
+from modules.speaker import speaker_mgr
+from modules.data import styles_mgr
+
+from modules.api.utils import calc_spk_style
+import modules.generate_audio as generate
+
+from modules.normalization import text_normalize
+from modules import refiner, config
+
+from modules.utils import env, audio
+from modules.SentenceSplitter import SentenceSplitter
+
+
+def get_speakers():
+ return speaker_mgr.list_speakers()
+
+
+def get_styles():
+ return styles_mgr.list_items()
+
+
+def segments_length_limit(segments, total_max: int):
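+    # Keep segments until the accumulated text length exceeds total_max; the remaining segments are dropped.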
+ ret_segments = []
+ total_len = 0
+ for seg in segments:
+ if "text" not in seg:
+ continue
+ total_len += len(seg["text"])
+ if total_len > total_max:
+ break
+ ret_segments.append(seg)
+ return ret_segments
+
+
+@torch.inference_mode()
+@spaces.GPU
+def synthesize_ssml(ssml: str, batch_size=4):
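+    # Parse the SSML into segments, enforce the length limit, synthesize them in batches, and return the combined audio.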
+ try:
+ batch_size = int(batch_size)
+ except Exception:
+ batch_size = 8
+
+ ssml = ssml.strip()
+
+ if ssml == "":
+ return None
+
+ segments = parse_ssml(ssml)
+ max_len = webui_config.ssml_max
+ segments = segments_length_limit(segments, max_len)
+
+ if len(segments) == 0:
+ return None
+
+ synthesize = SynthesizeSegments(batch_size=batch_size)
+ audio_segments = synthesize.synthesize_segments(segments)
+ combined_audio = combine_audio_segments(audio_segments)
+
+ return audio.pydub_to_np(combined_audio)
+
+
+@torch.inference_mode()
+@spaces.GPU
+def tts_generate(
+ text,
+ temperature,
+ top_p,
+ top_k,
+ spk,
+ infer_seed,
+ use_decoder,
+ prompt1,
+ prompt2,
+ prefix,
+ style,
+ disable_normalize=False,
+ batch_size=4,
+):
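+    # Resolve speaker/style presets, clamp the seed, optionally normalize the text, then synthesize and return int16 PCM audio.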
+ try:
+ batch_size = int(batch_size)
+ except Exception:
+ batch_size = 4
+
+ max_len = webui_config.tts_max
+ text = text.strip()[0:max_len]
+
+ if text == "":
+ return None
+
+ if style == "*auto":
+ style = None
+
+ if isinstance(top_k, float):
+ top_k = int(top_k)
+
+ params = calc_spk_style(spk=spk, style=style)
+ spk = params.get("spk", spk)
+
+ infer_seed = infer_seed or params.get("seed", infer_seed)
+ temperature = temperature or params.get("temperature", temperature)
+ prefix = prefix or params.get("prefix", prefix)
+ prompt1 = prompt1 or params.get("prompt1", "")
+ prompt2 = prompt2 or params.get("prompt2", "")
+
+ infer_seed = np.clip(infer_seed, -1, 2**32 - 1, out=None, dtype=np.int64)
+ infer_seed = int(infer_seed)
+
+ if not disable_normalize:
+ text = text_normalize(text)
+
+ sample_rate, audio_data = synthesize_audio(
+ text=text,
+ temperature=temperature,
+ top_P=top_p,
+ top_K=top_k,
+ spk=spk,
+ infer_seed=infer_seed,
+ use_decoder=use_decoder,
+ prompt1=prompt1,
+ prompt2=prompt2,
+ prefix=prefix,
+ batch_size=batch_size,
+ )
+
+ audio_data = audio.audio_to_int16(audio_data)
+ return sample_rate, audio_data
+
+
+@torch.inference_mode()
+@spaces.GPU
+def refine_text(text: str, prompt: str):
+ text = text_normalize(text)
+ return refiner.refine_text(text, prompt=prompt)
+
+
+@torch.inference_mode()
+@spaces.GPU
+def split_long_text(long_text_input):
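+    # Split the input into sentences using the configured threshold, normalize each one, and return [index, text, length] rows.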
+ spliter = SentenceSplitter(webui_config.spliter_threshold)
+ sentences = spliter.parse(long_text_input)
+ sentences = [text_normalize(s) for s in sentences]
+ data = []
+ for i, text in enumerate(sentences):
+ data.append([i, text, len(text)])
+ return data
diff --git a/webui.py b/webui.py
index 2735f98..d3fc23c 100644
--- a/webui.py
+++ b/webui.py
@@ -1,857 +1,10 @@
-try:
- import spaces
-except:
-
- class NoneSpaces:
- def __init__(self):
- pass
-
- def GPU(self, fn):
- return fn
-
- spaces = NoneSpaces()
-
import os
-import logging
-import sys
-
-import numpy as np
-
from modules.devices import devices
-from modules.synthesize_audio import synthesize_audio
-
-logging.basicConfig(
- level=os.getenv("LOG_LEVEL", "INFO"),
- format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
-)
-
-
-import gradio as gr
-
-import torch
-
-from modules.ssml import parse_ssml
-from modules.SynthesizeSegments import SynthesizeSegments, combine_audio_segments
-
-from modules.speaker import speaker_mgr
-from modules.data import styles_mgr
-
-from modules.api.utils import calc_spk_style
-import modules.generate_audio as generate
-
-from modules.normalization import text_normalize
-from modules import refiner, config
-
-from modules.utils import env, audio
-from modules.SentenceSplitter import SentenceSplitter
-
-# fix: If the system proxy is enabled in the Windows system, you need to skip these
-os.environ["NO_PROXY"] = "localhost,127.0.0.1,0.0.0.0"
-
-torch._dynamo.config.cache_size_limit = 64
-torch._dynamo.config.suppress_errors = True
-torch.set_float32_matmul_precision("high")
-
-webui_config = {
- "tts_max": 1000,
- "ssml_max": 5000,
- "spliter_threshold": 100,
- "max_batch_size": 8,
-}
-
-
-def get_speakers():
- return speaker_mgr.list_speakers()
-
-
-def get_styles():
- return styles_mgr.list_items()
-
-
-def segments_length_limit(segments, total_max: int):
- ret_segments = []
- total_len = 0
- for seg in segments:
- if "text" not in seg:
- continue
- total_len += len(seg["text"])
- if total_len > total_max:
- break
- ret_segments.append(seg)
- return ret_segments
-
-
-@torch.inference_mode()
-@spaces.GPU
-def synthesize_ssml(ssml: str, batch_size=4):
- try:
- batch_size = int(batch_size)
- except Exception:
- batch_size = 8
-
- ssml = ssml.strip()
-
- if ssml == "":
- return None
-
- segments = parse_ssml(ssml)
- max_len = webui_config["ssml_max"]
- segments = segments_length_limit(segments, max_len)
-
- if len(segments) == 0:
- return None
-
- synthesize = SynthesizeSegments(batch_size=batch_size)
- audio_segments = synthesize.synthesize_segments(segments)
- combined_audio = combine_audio_segments(audio_segments)
-
- return audio.pydub_to_np(combined_audio)
-
-
-@torch.inference_mode()
-@spaces.GPU
-def tts_generate(
- text,
- temperature,
- top_p,
- top_k,
- spk,
- infer_seed,
- use_decoder,
- prompt1,
- prompt2,
- prefix,
- style,
- disable_normalize=False,
- batch_size=4,
-):
- try:
- batch_size = int(batch_size)
- except Exception:
- batch_size = 4
-
- max_len = webui_config["tts_max"]
- text = text.strip()[0:max_len]
-
- if text == "":
- return None
-
- if style == "*auto":
- style = None
-
- if isinstance(top_k, float):
- top_k = int(top_k)
-
- params = calc_spk_style(spk=spk, style=style)
- spk = params.get("spk", spk)
-
- infer_seed = infer_seed or params.get("seed", infer_seed)
- temperature = temperature or params.get("temperature", temperature)
- prefix = prefix or params.get("prefix", prefix)
- prompt1 = prompt1 or params.get("prompt1", "")
- prompt2 = prompt2 or params.get("prompt2", "")
-
- infer_seed = np.clip(infer_seed, -1, 2**32 - 1, out=None, dtype=np.int64)
- infer_seed = int(infer_seed)
-
- if not disable_normalize:
- text = text_normalize(text)
-
- sample_rate, audio_data = synthesize_audio(
- text=text,
- temperature=temperature,
- top_P=top_p,
- top_K=top_k,
- spk=spk,
- infer_seed=infer_seed,
- use_decoder=use_decoder,
- prompt1=prompt1,
- prompt2=prompt2,
- prefix=prefix,
- batch_size=batch_size,
- )
-
- audio_data = audio.audio_to_int16(audio_data)
- return sample_rate, audio_data
-
-
-@torch.inference_mode()
-@spaces.GPU
-def refine_text(text: str, prompt: str):
- text = text_normalize(text)
- return refiner.refine_text(text, prompt=prompt)
-
-
-def read_local_readme():
- with open("README.md", "r", encoding="utf-8") as file:
- content = file.read()
- content = content[content.index("# ") :]
- return content
-
-
-# 演示示例文本
-sample_texts = [
- {
- "text": "大🍌,一条大🍌,嘿,你的感觉真的很奇妙 [lbreak]",
- },
- {
- "text": "天气预报显示,今天会有小雨,请大家出门时记得带伞。降温的天气也提醒我们要适时添衣保暖 [lbreak]",
- },
- {
- "text": "公司的年度总结会议将在下周三举行,请各部门提前准备好相关材料,确保会议顺利进行 [lbreak]",
- },
- {
- "text": "今天的午餐菜单包括烤鸡、沙拉和蔬菜汤,大家可以根据自己的口味选择适合的菜品 [lbreak]",
- },
- {
- "text": "请注意,电梯将在下午两点进行例行维护,预计需要一个小时的时间,请大家在此期间使用楼梯 [lbreak]",
- },
- {
- "text": "图书馆新到了一批书籍,涵盖了文学、科学和历史等多个领域,欢迎大家前来借阅 [lbreak]",
- },
- {
- "text": "电影中梁朝伟扮演的陈永仁的编号27149 [lbreak]",
- },
- {
- "text": "这块黄金重达324.75克 [lbreak]",
- },
- {
- "text": "我们班的最高总分为583分 [lbreak]",
- },
- {
- "text": "12~23 [lbreak]",
- },
- {
- "text": "-1.5~2 [lbreak]",
- },
- {
- "text": "她出生于86年8月18日,她弟弟出生于1995年3月1日 [lbreak]",
- },
- {
- "text": "等会请在12:05请通知我 [lbreak]",
- },
- {
- "text": "今天的最低气温达到-10°C [lbreak]",
- },
- {
- "text": "现场有7/12的观众投出了赞成票 [lbreak]",
- },
- {
- "text": "明天有62%的概率降雨 [lbreak]",
- },
- {
- "text": "随便来几个价格12块5,34.5元,20.1万 [lbreak]",
- },
- {
- "text": "这是固话0421-33441122 [lbreak]",
- },
- {
- "text": "这是手机+86 18544139121 [lbreak]",
- },
-]
-
-ssml_example1 = """
-
-
- 下面是一个 ChatTTS 用于合成多角色多情感的有声书示例[lbreak]
-
-
- 黛玉冷笑道:[lbreak]
-
-
- 我说呢 [uv_break] ,亏了绊住,不然,早就飞起来了[lbreak]
-
-
- 宝玉道:[lbreak]
-
-
- “只许和你玩 [uv_break] ,替你解闷。不过偶然到他那里,就说这些闲话。”[lbreak]
-
-
- “好没意思的话![uv_break] 去不去,关我什么事儿? 又没叫你替我解闷儿 [uv_break],还许你不理我呢” [lbreak]
-
-
- 说着,便赌气回房去了 [lbreak]
-
-
-"""
-ssml_example2 = """
-
-
- 使用 prosody 控制生成文本的语速语调和音量,示例如下 [lbreak]
-
-
- 无任何限制将会继承父级voice配置进行生成 [lbreak]
-
-
- 设置 rate 大于1表示加速,小于1为减速 [lbreak]
-
-
- 设置 pitch 调整音调,设置为6表示提高6个半音 [lbreak]
-
-
- 设置 volume 调整音量,设置为2表示提高2个分贝 [lbreak]
-
-
- 在 voice 中无prosody包裹的文本即为默认生成状态下的语音 [lbreak]
-
-
-"""
-ssml_example3 = """
-
-
- 使用 break 标签将会简单的 [lbreak]
-
-
-
- 插入一段空白到生成结果中 [lbreak]
-
-
-"""
-
-ssml_example4 = """
-
-
- temperature for sampling (may be overridden by style or speaker) [lbreak]
-
- 温度值用于采样,这个值有可能被 style 或者 speaker 覆盖 [lbreak]
-
- temperature for sampling ,这个值有可能被 style 或者 speaker 覆盖 [lbreak]
-
- 温度值用于采样,(may be overridden by style or speaker) [lbreak]
-
-
-"""
-
-default_ssml = """
-
-
- 这里是一个简单的 SSML 示例 [lbreak]
-
-
-"""
-
-
-def create_tts_interface():
- speakers = get_speakers()
-
- def get_speaker_show_name(spk):
- if spk.gender == "*" or spk.gender == "":
- return spk.name
- return f"{spk.gender} : {spk.name}"
-
- speaker_names = ["*random"] + [
- get_speaker_show_name(speaker) for speaker in speakers
- ]
-
- styles = ["*auto"] + [s.get("name") for s in get_styles()]
-
- history = []
-
- with gr.Row():
- with gr.Column(scale=1):
- with gr.Group():
- gr.Markdown("🎛️Sampling")
- temperature_input = gr.Slider(
- 0.01, 2.0, value=0.3, step=0.01, label="Temperature"
- )
- top_p_input = gr.Slider(0.1, 1.0, value=0.7, step=0.1, label="Top P")
- top_k_input = gr.Slider(1, 50, value=20, step=1, label="Top K")
- batch_size_input = gr.Slider(
- 1,
- webui_config["max_batch_size"],
- value=4,
- step=1,
- label="Batch Size",
- )
-
- with gr.Row():
- with gr.Group():
- gr.Markdown("🎭Style")
- gr.Markdown("- 后缀为 `_p` 表示带prompt,效果更强但是影响质量")
- style_input_dropdown = gr.Dropdown(
- choices=styles,
- # label="Choose Style",
- interactive=True,
- show_label=False,
- value="*auto",
- )
- with gr.Row():
- with gr.Group():
- gr.Markdown("🗣️Speaker (Name or Seed)")
- spk_input_text = gr.Textbox(
- label="Speaker (Text or Seed)",
- value="female2",
- show_label=False,
- )
- spk_input_dropdown = gr.Dropdown(
- choices=speaker_names,
- # label="Choose Speaker",
- interactive=True,
- value="female : female2",
- show_label=False,
- )
- spk_rand_button = gr.Button(
- value="🎲",
- # tooltip="Random Seed",
- variant="secondary",
- )
- spk_input_dropdown.change(
- fn=lambda x: x.startswith("*")
- and "-1"
- or x.split(":")[-1].strip(),
- inputs=[spk_input_dropdown],
- outputs=[spk_input_text],
- )
- spk_rand_button.click(
- lambda x: str(torch.randint(0, 2**32 - 1, (1,)).item()),
- inputs=[spk_input_text],
- outputs=[spk_input_text],
- )
- with gr.Group():
- gr.Markdown("💃Inference Seed")
- infer_seed_input = gr.Number(
- value=42,
- label="Inference Seed",
- show_label=False,
- minimum=-1,
- maximum=2**32 - 1,
- )
- infer_seed_rand_button = gr.Button(
- value="🎲",
- # tooltip="Random Seed",
- variant="secondary",
- )
- use_decoder_input = gr.Checkbox(
- value=True, label="Use Decoder", visible=False
- )
- with gr.Group():
- gr.Markdown("🔧Prompt engineering")
- prompt1_input = gr.Textbox(label="Prompt 1")
- prompt2_input = gr.Textbox(label="Prompt 2")
- prefix_input = gr.Textbox(label="Prefix")
-
- infer_seed_rand_button.click(
- lambda x: int(torch.randint(0, 2**32 - 1, (1,)).item()),
- inputs=[infer_seed_input],
- outputs=[infer_seed_input],
- )
- with gr.Column(scale=3):
- with gr.Row():
- with gr.Column(scale=4):
- with gr.Group():
- input_title = gr.Markdown(
- "📝Text Input",
- elem_id="input-title",
- )
- gr.Markdown(
- f"- 字数限制{webui_config['tts_max']:,}字,超过部分截断"
- )
- gr.Markdown("- 如果尾字吞字不读,可以试试结尾加上 `[lbreak]`")
- gr.Markdown(
- "- If the input text is all in English, it is recommended to check disable_normalize"
- )
- text_input = gr.Textbox(
- show_label=False,
- label="Text to Speech",
- lines=10,
- placeholder="输入文本或选择示例",
- elem_id="text-input",
- )
- # TODO 字数统计,其实实现很好写,但是就是会触发loading...并且还要和后端交互...
- # text_input.change(
- # fn=lambda x: (
- # f"📝Text Input ({len(x)} char)"
- # if x
- # else (
- # "📝Text Input (0 char)"
- # if not x
- # else "📝Text Input (0 char)"
- # )
- # ),
- # inputs=[text_input],
- # outputs=[input_title],
- # )
- with gr.Row():
- contorl_tokens = [
- "[laugh]",
- "[uv_break]",
- "[v_break]",
- "[lbreak]",
- ]
-
- for tk in contorl_tokens:
- t_btn = gr.Button(tk)
- t_btn.click(
- lambda text, tk=tk: text + " " + tk,
- inputs=[text_input],
- outputs=[text_input],
- )
- with gr.Column(scale=1):
- with gr.Group():
- gr.Markdown("🎶Refiner")
- refine_prompt_input = gr.Textbox(
- label="Refine Prompt",
- value="[oral_2][laugh_0][break_6]",
- )
- refine_button = gr.Button("✍️Refine Text")
- # TODO 分割句子,使用当前配置拼接为SSML,然后发送到SSML tab
- # send_button = gr.Button("📩Split and send to SSML")
-
- with gr.Group():
- gr.Markdown("🔊Generate")
- disable_normalize_input = gr.Checkbox(
- value=False, label="Disable Normalize"
- )
- tts_button = gr.Button(
- "🔊Generate Audio",
- variant="primary",
- elem_classes="big-button",
- )
-
- with gr.Group():
- gr.Markdown("🎄Examples")
- sample_dropdown = gr.Dropdown(
- choices=[sample["text"] for sample in sample_texts],
- show_label=False,
- value=None,
- interactive=True,
- )
- sample_dropdown.change(
- fn=lambda x: x,
- inputs=[sample_dropdown],
- outputs=[text_input],
- )
-
- with gr.Group():
- gr.Markdown("🎨Output")
- tts_output = gr.Audio(label="Generated Audio")
-
- refine_button.click(
- refine_text,
- inputs=[text_input, refine_prompt_input],
- outputs=[text_input],
- )
-
- tts_button.click(
- tts_generate,
- inputs=[
- text_input,
- temperature_input,
- top_p_input,
- top_k_input,
- spk_input_text,
- infer_seed_input,
- use_decoder_input,
- prompt1_input,
- prompt2_input,
- prefix_input,
- style_input_dropdown,
- disable_normalize_input,
- batch_size_input,
- ],
- outputs=tts_output,
- )
-
-
-def create_ssml_interface():
- examples = [
- ssml_example1,
- ssml_example2,
- ssml_example3,
- ssml_example4,
- ]
-
- with gr.Row():
- with gr.Column(scale=3):
- with gr.Group():
- gr.Markdown("📝SSML Input")
- gr.Markdown(f"- 最长{webui_config['ssml_max']:,}字符,超过会被截断")
- gr.Markdown("- 尽量保证使用相同的 seed")
- gr.Markdown(
- "- 关于SSML可以看这个 [文档](https://github.com/lenML/ChatTTS-Forge/blob/main/docs/SSML.md)"
- )
- ssml_input = gr.Textbox(
- label="SSML Input",
- lines=10,
- value=default_ssml,
- placeholder="输入 SSML 或选择示例",
- elem_id="ssml_input",
- show_label=False,
- )
- ssml_button = gr.Button("🔊Synthesize SSML", variant="primary")
- with gr.Column(scale=1):
- with gr.Group():
- # 参数
- gr.Markdown("🎛️Parameters")
- # batch size
- batch_size_input = gr.Slider(
- label="Batch Size",
- value=4,
- minimum=1,
- maximum=webui_config["max_batch_size"],
- step=1,
- )
- with gr.Group():
- gr.Markdown("🎄Examples")
- gr.Examples(
- examples=examples,
- inputs=[ssml_input],
- )
-
- ssml_output = gr.Audio(label="Generated Audio")
-
- ssml_button.click(
- synthesize_ssml,
- inputs=[ssml_input, batch_size_input],
- outputs=ssml_output,
- )
-
- return ssml_input
-
-
-# NOTE: 这个其实是需要GPU的...但是spaces会自动卸载,所以不太好使,具体处理在text_normalize中兼容
-# @spaces.GPU
-def split_long_text(long_text_input):
- spliter = SentenceSplitter(webui_config["spliter_threshold"])
- sentences = spliter.parse(long_text_input)
- sentences = [text_normalize(s) for s in sentences]
- data = []
- for i, text in enumerate(sentences):
- data.append([i, text, len(text)])
- return data
-
-
-def merge_dataframe_to_ssml(dataframe, spk, style, seed):
- if style == "*auto":
- style = None
- if spk == "-1" or spk == -1:
- spk = None
- if seed == -1 or seed == "-1":
- seed = None
-
- ssml = ""
- indent = " " * 2
-
- for i, row in dataframe.iterrows():
- ssml += f"{indent}\n"
- ssml += f"{indent}{indent}{text_normalize(row[1])}\n"
- ssml += f"{indent}\n"
- return f"\n{ssml}"
-
-
-# 长文本处理
-# 可以输入长文本,并选择切割方法,切割之后可以将拼接的SSML发送到SSML tab
-# 根据 。 句号切割,切割之后显示到 data table
-def create_long_content_tab(ssml_input, tabs):
- speakers = get_speakers()
-
- def get_speaker_show_name(spk):
- if spk.gender == "*" or spk.gender == "":
- return spk.name
- return f"{spk.gender} : {spk.name}"
-
- speaker_names = ["*random"] + [
- get_speaker_show_name(speaker) for speaker in speakers
- ]
-
- styles = ["*auto"] + [s.get("name") for s in get_styles()]
-
- with gr.Row():
- with gr.Column(scale=1):
- # 选择说话人 选择风格 选择seed
- with gr.Group():
- gr.Markdown("🗣️Speaker")
- spk_input_text = gr.Textbox(
- label="Speaker (Text or Seed)",
- value="female2",
- show_label=False,
- )
- spk_input_dropdown = gr.Dropdown(
- choices=speaker_names,
- interactive=True,
- value="female : female2",
- show_label=False,
- )
- spk_rand_button = gr.Button(
- value="🎲",
- variant="secondary",
- )
- with gr.Group():
- gr.Markdown("🎭Style")
- style_input_dropdown = gr.Dropdown(
- choices=styles,
- interactive=True,
- show_label=False,
- value="*auto",
- )
- with gr.Group():
- gr.Markdown("🗣️Seed")
- infer_seed_input = gr.Number(
- value=42,
- label="Inference Seed",
- show_label=False,
- minimum=-1,
- maximum=2**32 - 1,
- )
- infer_seed_rand_button = gr.Button(
- value="🎲",
- variant="secondary",
- )
-
- send_btn = gr.Button("📩Send to SSML", variant="primary")
-
- with gr.Column(scale=3):
- with gr.Group():
- gr.Markdown("📝Long Text Input")
- gr.Markdown("- 此页面用于处理超长文本")
- gr.Markdown("- 切割后,可以选择说话人、风格、seed,然后发送到SSML")
- long_text_input = gr.Textbox(
- label="Long Text Input",
- lines=10,
- placeholder="输入长文本",
- elem_id="long-text-input",
- show_label=False,
- )
- long_text_split_button = gr.Button("🔪Split Text")
-
- with gr.Row():
- with gr.Column(scale=3):
- with gr.Group():
- gr.Markdown("🎨Output")
- long_text_output = gr.DataFrame(
- headers=["index", "text", "length"],
- datatype=["number", "str", "number"],
- elem_id="long-text-output",
- interactive=False,
- wrap=True,
- value=[],
- )
-
- spk_input_dropdown.change(
- fn=lambda x: x.startswith("*") and "-1" or x.split(":")[-1].strip(),
- inputs=[spk_input_dropdown],
- outputs=[spk_input_text],
- )
- spk_rand_button.click(
- lambda x: int(torch.randint(0, 2**32 - 1, (1,)).item()),
- inputs=[spk_input_text],
- outputs=[spk_input_text],
- )
- infer_seed_rand_button.click(
- lambda x: int(torch.randint(0, 2**32 - 1, (1,)).item()),
- inputs=[infer_seed_input],
- outputs=[infer_seed_input],
- )
- long_text_split_button.click(
- split_long_text,
- inputs=[long_text_input],
- outputs=[long_text_output],
- )
-
- infer_seed_rand_button.click(
- lambda x: int(torch.randint(0, 2**32 - 1, (1,)).item()),
- inputs=[infer_seed_input],
- outputs=[infer_seed_input],
- )
-
- send_btn.click(
- merge_dataframe_to_ssml,
- inputs=[
- long_text_output,
- spk_input_text,
- style_input_dropdown,
- infer_seed_input,
- ],
- outputs=[ssml_input],
- )
-
- def change_tab():
- return gr.Tabs(selected="ssml")
-
- send_btn.click(change_tab, inputs=[], outputs=[tabs])
-
-
-def create_readme_tab():
- readme_content = read_local_readme()
- gr.Markdown(readme_content)
-
-
-def create_app_footer():
- gradio_version = gr.__version__
- git_tag = config.versions.git_tag
- git_commit = config.versions.git_commit
- git_branch = config.versions.git_branch
- python_version = config.versions.python_version
- torch_version = config.versions.torch_version
-
- config.versions.gradio_version = gradio_version
-
- gr.Markdown(
- f"""
-🍦 [ChatTTS-Forge](https://github.com/lenML/ChatTTS-Forge)
-version: [{git_tag}](https://github.com/lenML/ChatTTS-Forge/commit/{git_commit}) | branch: `{git_branch}` | python: `{python_version}` | torch: `{torch_version}`
- """
- )
-
-
-def create_interface():
-
- js_func = """
- function refresh() {
- const url = new URL(window.location);
-
- if (url.searchParams.get('__theme') !== 'dark') {
- url.searchParams.set('__theme', 'dark');
- window.location.href = url.href;
- }
- }
- """
-
- head_js = """
-
- """
-
- with gr.Blocks(js=js_func, head=head_js, title="ChatTTS Forge WebUI") as demo:
- css = """
-
- """
-
- gr.HTML(css)
- with gr.Tabs() as tabs:
- with gr.TabItem("TTS"):
- create_tts_interface()
-
- with gr.TabItem("SSML", id="ssml"):
- ssml_input = create_ssml_interface()
-
- with gr.TabItem("Long Text"):
- create_long_content_tab(ssml_input, tabs=tabs)
-
- with gr.TabItem("README"):
- create_readme_tab()
-
- create_app_footer()
- return demo
-
+from modules.utils import env
+from modules.webui import webui_config
+from modules.webui.app import webui_init, create_interface
+from modules import generate_audio
+from modules import config
if __name__ == "__main__":
import argparse
@@ -914,6 +67,12 @@ def create_interface():
type=str.lower,
)
parser.add_argument("--compile", action="store_true", help="Enable model compile")
+    # webui experimental features
+ parser.add_argument(
+ "--webui_experimental",
+ action="store_true",
+ help="Enable webui_experimental features",
+ )
args = parser.parse_args()
@@ -934,20 +93,23 @@ def get_and_update_env(*args):
device_id = get_and_update_env(args, "device_id", None, str)
use_cpu = get_and_update_env(args, "use_cpu", [], list)
compile = get_and_update_env(args, "compile", False, bool)
+ webui_experimental = get_and_update_env(args, "webui_experimental", False, bool)
- webui_config["tts_max"] = get_and_update_env(args, "tts_max_len", 1000, int)
- webui_config["ssml_max"] = get_and_update_env(args, "ssml_max_len", 5000, int)
- webui_config["max_batch_size"] = get_and_update_env(args, "max_batch_size", 8, int)
+ webui_config.tts_max = get_and_update_env(args, "tts_max_len", 1000, int)
+ webui_config.ssml_max = get_and_update_env(args, "ssml_max_len", 5000, int)
+ webui_config.max_batch_size = get_and_update_env(args, "max_batch_size", 8, int)
demo = create_interface()
if auth:
auth = tuple(auth.split(":"))
- generate.setup_lru_cache()
+ generate_audio.setup_lru_cache()
devices.reset_device()
devices.first_time_calculation()
+ webui_init()
+
demo.queue().launch(
server_name=server_name,
server_port=server_port,