diff --git a/modules/denoise.py b/modules/denoise.py new file mode 100644 index 0000000..1025daf --- /dev/null +++ b/modules/denoise.py @@ -0,0 +1,7 @@ +from audio_denoiser.AudioDenoiser import AudioDenoiser +import torch +import torchaudio + + +class TTSAudioDenoiser: + pass diff --git a/modules/hf.py b/modules/hf.py new file mode 100644 index 0000000..f1ea400 --- /dev/null +++ b/modules/hf.py @@ -0,0 +1,14 @@ +# 给huggingface space写的兼容代码 + +try: + import spaces +except: + + class NoneSpaces: + def __init__(self): + pass + + def GPU(self, fn): + return fn + + spaces = NoneSpaces() diff --git a/modules/webui/__init__.py b/modules/webui/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/modules/webui/app.py b/modules/webui/app.py new file mode 100644 index 0000000..3d0439b --- /dev/null +++ b/modules/webui/app.py @@ -0,0 +1,110 @@ +import logging +import os + +import torch +import gradio as gr + +from modules import config + +from modules.webui.tts_tab import create_tts_interface +from modules.webui.ssml_tab import create_ssml_interface +from modules.webui.spliter_tab import create_spliter_tab +from modules.webui.speaker_tab import create_speaker_panel +from modules.webui.readme_tab import create_readme_tab + +logger = logging.getLogger(__name__) + +logging.basicConfig( + level=os.getenv("LOG_LEVEL", "INFO"), + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", +) + + +def webui_init(): + # fix: If the system proxy is enabled in the Windows system, you need to skip these + os.environ["NO_PROXY"] = "localhost,127.0.0.1,0.0.0.0" + + torch._dynamo.config.cache_size_limit = 64 + torch._dynamo.config.suppress_errors = True + torch.set_float32_matmul_precision("high") + + logger.info("WebUI module initialized") + + +def create_app_footer(): + gradio_version = gr.__version__ + git_tag = config.versions.git_tag + git_commit = config.versions.git_commit + git_branch = config.versions.git_branch + python_version = config.versions.python_version + torch_version = config.versions.torch_version + + config.versions.gradio_version = gradio_version + + gr.Markdown( + f""" +🍦 [ChatTTS-Forge](https://github.com/lenML/ChatTTS-Forge) +version: [{git_tag}](https://github.com/lenML/ChatTTS-Forge/commit/{git_commit}) | branch: `{git_branch}` | python: `{python_version}` | torch: `{torch_version}` + """ + ) + + +def create_interface(): + + js_func = """ + function refresh() { + const url = new URL(window.location); + + if (url.searchParams.get('__theme') !== 'dark') { + url.searchParams.set('__theme', 'dark'); + window.location.href = url.href; + } + } + """ + + head_js = """ + + """ + + with gr.Blocks(js=js_func, head=head_js, title="ChatTTS Forge WebUI") as demo: + css = """ + + """ + + gr.HTML(css) + with gr.Tabs() as tabs: + with gr.TabItem("TTS"): + create_tts_interface() + + with gr.TabItem("SSML", id="ssml"): + ssml_input = create_ssml_interface() + + with gr.TabItem("Spilter"): + create_spliter_tab(ssml_input, tabs=tabs) + + if config.runtime_env_vars.webui_experimental: + with gr.TabItem("Speaker"): + create_speaker_panel() + with gr.TabItem("Denoise"): + gr.Markdown("🚧 Under construction") + with gr.TabItem("Inpainting"): + gr.Markdown("🚧 Under construction") + with gr.TabItem("ASR"): + gr.Markdown("🚧 Under construction") + + with gr.TabItem("README"): + create_readme_tab() + + create_app_footer() + return demo diff --git a/modules/webui/asr_tab.py b/modules/webui/asr_tab.py new file mode 100644 index 0000000..e69de29 diff --git a/modules/webui/denoise_tab.py b/modules/webui/denoise_tab.py new file mode 100644 index 0000000..bf01965 --- /dev/null +++ b/modules/webui/denoise_tab.py @@ -0,0 +1,5 @@ +import gradio as gr + + +def create_denoise_tab(): + pass diff --git a/modules/webui/examples.py b/modules/webui/examples.py new file mode 100644 index 0000000..eefc860 --- /dev/null +++ b/modules/webui/examples.py @@ -0,0 +1,164 @@ +example_texts = [ + { + "text": "大🍌,一条大🍌,嘿,你的感觉真的很奇妙 [lbreak]", + }, + {"text": "Big 🍌, a big 🍌, hey, your feeling is really wonderful [lbreak]"}, + { + "text": """ +# 这是 markdown 标题 + +``` +代码块将跳过 +``` + +- **文本标准化**: + - **Markdown**: 自动检测处理 markdown 格式文本。 + - **数字转写**: 自动将数字转为模型可识别的文本。 + - **Emoji 适配**: 自动翻译 emoji 为可读文本。 + - **基于分词器**: 基于 tokenizer 预处理文本,覆盖模型所有不支持字符范围。 + - **中英文识别**: 适配英文环境。 + """ + }, + { + "text": "天气预报显示,今天会有小雨,请大家出门时记得带伞。降温的天气也提醒我们要适时添衣保暖 [lbreak]", + }, + { + "text": "公司的年度总结会议将在下周三举行,请各部门提前准备好相关材料,确保会议顺利进行 [lbreak]", + }, + { + "text": "今天的午餐菜单包括烤鸡、沙拉和蔬菜汤,大家可以根据自己的口味选择适合的菜品 [lbreak]", + }, + { + "text": "请注意,电梯将在下午两点进行例行维护,预计需要一个小时的时间,请大家在此期间使用楼梯 [lbreak]", + }, + { + "text": "图书馆新到了一批书籍,涵盖了文学、科学和历史等多个领域,欢迎大家前来借阅 [lbreak]", + }, + { + "text": "电影中梁朝伟扮演的陈永仁的编号27149 [lbreak]", + }, + { + "text": "这块黄金重达324.75克 [lbreak]", + }, + { + "text": "我们班的最高总分为583分 [lbreak]", + }, + { + "text": "12~23 [lbreak]", + }, + { + "text": "-1.5~2 [lbreak]", + }, + { + "text": "她出生于86年8月18日,她弟弟出生于1995年3月1日 [lbreak]", + }, + { + "text": "等会请在12:05请通知我 [lbreak]", + }, + { + "text": "今天的最低气温达到-10°C [lbreak]", + }, + { + "text": "现场有7/12的观众投出了赞成票 [lbreak]", + }, + { + "text": "明天有62%的概率降雨 [lbreak]", + }, + { + "text": "随便来几个价格12块5,34.5元,20.1万 [lbreak]", + }, + { + "text": "这是固话0421-33441122 [lbreak]", + }, + { + "text": "这是手机+86 18544139121 [lbreak]", + }, +] + +ssml_example1 = """ + + + 下面是一个 ChatTTS 用于合成多角色多情感的有声书示例[lbreak] + + + 黛玉冷笑道:[lbreak] + + + 我说呢 [uv_break] ,亏了绊住,不然,早就飞起来了[lbreak] + + + 宝玉道:[lbreak] + + + “只许和你玩 [uv_break] ,替你解闷。不过偶然到他那里,就说这些闲话。”[lbreak] + + + “好没意思的话![uv_break] 去不去,关我什么事儿? 又没叫你替我解闷儿 [uv_break],还许你不理我呢” [lbreak] + + + 说着,便赌气回房去了 [lbreak] + + +""" +ssml_example2 = """ + + + 使用 prosody 控制生成文本的语速语调和音量,示例如下 [lbreak] + + + 无任何限制将会继承父级voice配置进行生成 [lbreak] + + + 设置 rate 大于1表示加速,小于1为减速 [lbreak] + + + 设置 pitch 调整音调,设置为6表示提高6个半音 [lbreak] + + + 设置 volume 调整音量,设置为2表示提高2个分贝 [lbreak] + + + 在 voice 中无prosody包裹的文本即为默认生成状态下的语音 [lbreak] + + +""" +ssml_example3 = """ + + + 使用 break 标签将会简单的 [lbreak] + + + + 插入一段空白到生成结果中 [lbreak] + + +""" + +ssml_example4 = """ + + + temperature for sampling (may be overridden by style or speaker) [lbreak] + + 温度值用于采样,这个值有可能被 style 或者 speaker 覆盖 [lbreak] + + temperature for sampling ,这个值有可能被 style 或者 speaker 覆盖 [lbreak] + + 温度值用于采样,(may be overridden by style or speaker) [lbreak] + + +""" + +ssml_examples = [ + ssml_example1, + ssml_example2, + ssml_example3, + ssml_example4, +] + +default_ssml = """ + + + 这里是一个简单的 SSML 示例 [lbreak] + + +""" diff --git a/modules/webui/readme_tab.py b/modules/webui/readme_tab.py new file mode 100644 index 0000000..cb933ba --- /dev/null +++ b/modules/webui/readme_tab.py @@ -0,0 +1,13 @@ +import gradio as gr + + +def read_local_readme(): + with open("README.md", "r", encoding="utf-8") as file: + content = file.read() + content = content[content.index("# ") :] + return content + + +def create_readme_tab(): + readme_content = read_local_readme() + gr.Markdown(readme_content) diff --git a/modules/webui/speaker_tab.py b/modules/webui/speaker_tab.py new file mode 100644 index 0000000..df306f2 --- /dev/null +++ b/modules/webui/speaker_tab.py @@ -0,0 +1,13 @@ +import gradio as gr + +from modules.webui.webui_utils import get_speakers + + +# 显示 a b c d 四个选择框,选择一个或多个,然后可以试音,并导出 +def create_speaker_panel(): + speakers = get_speakers() + + def get_speaker_show_name(spk): + pass + + gr.Markdown("🚧 Under construction") diff --git a/modules/webui/spliter_tab.py b/modules/webui/spliter_tab.py new file mode 100644 index 0000000..0fedbcb --- /dev/null +++ b/modules/webui/spliter_tab.py @@ -0,0 +1,168 @@ +import gradio as gr +import torch +from modules.normalization import text_normalize +from modules.webui.webui_utils import ( + get_speakers, + get_styles, + split_long_text, + synthesize_ssml, +) +from modules.webui import webui_config +from modules.webui.examples import ssml_examples, default_ssml + + +def merge_dataframe_to_ssml(dataframe, spk, style, seed): + if style == "*auto": + style = None + if spk == "-1" or spk == -1: + spk = None + if seed == -1 or seed == "-1": + seed = None + + ssml = "" + indent = " " * 2 + + for i, row in dataframe.iterrows(): + ssml += f"{indent}\n" + return f"\n{ssml}" + + +# 长文本处理 +# 可以输入长文本,并选择切割方法,切割之后可以将拼接的SSML发送到SSML tab +# 根据 。 句号切割,切割之后显示到 data table +def create_spliter_tab(ssml_input, tabs): + speakers = get_speakers() + + def get_speaker_show_name(spk): + if spk.gender == "*" or spk.gender == "": + return spk.name + return f"{spk.gender} : {spk.name}" + + speaker_names = ["*random"] + [ + get_speaker_show_name(speaker) for speaker in speakers + ] + + styles = ["*auto"] + [s.get("name") for s in get_styles()] + + with gr.Row(): + with gr.Column(scale=1): + # 选择说话人 选择风格 选择seed + with gr.Group(): + gr.Markdown("🗣️Speaker") + spk_input_text = gr.Textbox( + label="Speaker (Text or Seed)", + value="female2", + show_label=False, + ) + spk_input_dropdown = gr.Dropdown( + choices=speaker_names, + interactive=True, + value="female : female2", + show_label=False, + ) + spk_rand_button = gr.Button( + value="🎲", + variant="secondary", + ) + with gr.Group(): + gr.Markdown("🎭Style") + style_input_dropdown = gr.Dropdown( + choices=styles, + interactive=True, + show_label=False, + value="*auto", + ) + with gr.Group(): + gr.Markdown("🗣️Seed") + infer_seed_input = gr.Number( + value=42, + label="Inference Seed", + show_label=False, + minimum=-1, + maximum=2**32 - 1, + ) + infer_seed_rand_button = gr.Button( + value="🎲", + variant="secondary", + ) + + send_btn = gr.Button("📩Send to SSML", variant="primary") + + with gr.Column(scale=3): + with gr.Group(): + gr.Markdown("📝Long Text Input") + gr.Markdown("- 此页面用于处理超长文本") + gr.Markdown("- 切割后,可以选择说话人、风格、seed,然后发送到SSML") + long_text_input = gr.Textbox( + label="Long Text Input", + lines=10, + placeholder="输入长文本", + elem_id="long-text-input", + show_label=False, + ) + long_text_split_button = gr.Button("🔪Split Text") + + with gr.Row(): + with gr.Column(scale=3): + with gr.Group(): + gr.Markdown("🎨Output") + long_text_output = gr.DataFrame( + headers=["index", "text", "length"], + datatype=["number", "str", "number"], + elem_id="long-text-output", + interactive=False, + wrap=True, + value=[], + ) + + spk_input_dropdown.change( + fn=lambda x: x.startswith("*") and "-1" or x.split(":")[-1].strip(), + inputs=[spk_input_dropdown], + outputs=[spk_input_text], + ) + spk_rand_button.click( + lambda x: int(torch.randint(0, 2**32 - 1, (1,)).item()), + inputs=[spk_input_text], + outputs=[spk_input_text], + ) + infer_seed_rand_button.click( + lambda x: int(torch.randint(0, 2**32 - 1, (1,)).item()), + inputs=[infer_seed_input], + outputs=[infer_seed_input], + ) + long_text_split_button.click( + split_long_text, + inputs=[long_text_input], + outputs=[long_text_output], + ) + + infer_seed_rand_button.click( + lambda x: int(torch.randint(0, 2**32 - 1, (1,)).item()), + inputs=[infer_seed_input], + outputs=[infer_seed_input], + ) + + send_btn.click( + merge_dataframe_to_ssml, + inputs=[ + long_text_output, + spk_input_text, + style_input_dropdown, + infer_seed_input, + ], + outputs=[ssml_input], + ) + + def change_tab(): + return gr.Tabs(selected="ssml") + + send_btn.click(change_tab, inputs=[], outputs=[tabs]) diff --git a/modules/webui/ssml_tab.py b/modules/webui/ssml_tab.py new file mode 100644 index 0000000..9979e25 --- /dev/null +++ b/modules/webui/ssml_tab.py @@ -0,0 +1,55 @@ +import gradio as gr +from modules.webui.webui_utils import ( + synthesize_ssml, +) +from modules.webui import webui_config +from modules.webui.examples import ssml_examples, default_ssml + + +def create_ssml_interface(): + with gr.Row(): + with gr.Column(scale=3): + with gr.Group(): + gr.Markdown("📝SSML Input") + gr.Markdown(f"- 最长{webui_config.ssml_max:,}字符,超过会被截断") + gr.Markdown("- 尽量保证使用相同的 seed") + gr.Markdown( + "- 关于SSML可以看这个 [文档](https://github.com/lenML/ChatTTS-Forge/blob/main/docs/SSML.md)" + ) + ssml_input = gr.Textbox( + label="SSML Input", + lines=10, + value=default_ssml, + placeholder="输入 SSML 或选择示例", + elem_id="ssml_input", + show_label=False, + ) + ssml_button = gr.Button("🔊Synthesize SSML", variant="primary") + with gr.Column(scale=1): + with gr.Group(): + # 参数 + gr.Markdown("🎛️Parameters") + # batch size + batch_size_input = gr.Slider( + label="Batch Size", + value=4, + minimum=1, + maximum=webui_config.max_batch_size, + step=1, + ) + with gr.Group(): + gr.Markdown("🎄Examples") + gr.Examples( + examples=ssml_examples, + inputs=[ssml_input], + ) + + ssml_output = gr.Audio(label="Generated Audio") + + ssml_button.click( + synthesize_ssml, + inputs=[ssml_input, batch_size_input], + outputs=ssml_output, + ) + + return ssml_input diff --git a/modules/webui/tts_tab.py b/modules/webui/tts_tab.py new file mode 100644 index 0000000..b378ddf --- /dev/null +++ b/modules/webui/tts_tab.py @@ -0,0 +1,248 @@ +import gradio as gr +import torch +from modules.webui.webui_utils import ( + get_speakers, + get_styles, + refine_text, + tts_generate, +) +from modules.webui import webui_config +from modules.webui.examples import example_texts +from modules import config + + +def create_tts_interface(): + speakers = get_speakers() + + def get_speaker_show_name(spk): + if spk.gender == "*" or spk.gender == "": + return spk.name + return f"{spk.gender} : {spk.name}" + + speaker_names = ["*random"] + [ + get_speaker_show_name(speaker) for speaker in speakers + ] + + styles = ["*auto"] + [s.get("name") for s in get_styles()] + + history = [] + + with gr.Row(): + with gr.Column(scale=1): + with gr.Group(): + gr.Markdown("🎛️Sampling") + temperature_input = gr.Slider( + 0.01, 2.0, value=0.3, step=0.01, label="Temperature" + ) + top_p_input = gr.Slider(0.1, 1.0, value=0.7, step=0.1, label="Top P") + top_k_input = gr.Slider(1, 50, value=20, step=1, label="Top K") + batch_size_input = gr.Slider( + 1, + webui_config.max_batch_size, + value=4, + step=1, + label="Batch Size", + ) + + with gr.Row(): + with gr.Group(): + gr.Markdown("🎭Style") + gr.Markdown("- 后缀为 `_p` 表示带prompt,效果更强但是影响质量") + style_input_dropdown = gr.Dropdown( + choices=styles, + # label="Choose Style", + interactive=True, + show_label=False, + value="*auto", + ) + with gr.Row(): + with gr.Group(): + gr.Markdown("🗣️Speaker") + with gr.Tabs(): + with gr.Tab(label="Pick"): + spk_input_text = gr.Textbox( + label="Speaker (Text or Seed)", + value="female2", + show_label=False, + ) + spk_input_dropdown = gr.Dropdown( + choices=speaker_names, + # label="Choose Speaker", + interactive=True, + value="female : female2", + show_label=False, + ) + spk_rand_button = gr.Button( + value="🎲", + # tooltip="Random Seed", + variant="secondary", + ) + spk_input_dropdown.change( + fn=lambda x: x.startswith("*") + and "-1" + or x.split(":")[-1].strip(), + inputs=[spk_input_dropdown], + outputs=[spk_input_text], + ) + spk_rand_button.click( + lambda x: str(torch.randint(0, 2**32 - 1, (1,)).item()), + inputs=[spk_input_text], + outputs=[spk_input_text], + ) + + if config.runtime_env_vars.webui_experimental: + with gr.Tab(label="Upload"): + spk_input_upload = gr.File(label="Speaker (Upload)") + # TODO 读取 speaker + # spk_input_upload.change( + # fn=lambda x: x.read().decode("utf-8"), + # inputs=[spk_input_upload], + # outputs=[spk_input_text], + # ) + with gr.Group(): + gr.Markdown("💃Inference Seed") + infer_seed_input = gr.Number( + value=42, + label="Inference Seed", + show_label=False, + minimum=-1, + maximum=2**32 - 1, + ) + infer_seed_rand_button = gr.Button( + value="🎲", + # tooltip="Random Seed", + variant="secondary", + ) + use_decoder_input = gr.Checkbox( + value=True, label="Use Decoder", visible=False + ) + with gr.Group(): + gr.Markdown("🔧Prompt engineering") + prompt1_input = gr.Textbox(label="Prompt 1") + prompt2_input = gr.Textbox(label="Prompt 2") + prefix_input = gr.Textbox(label="Prefix") + + if config.runtime_env_vars.webui_experimental: + prompt_audio = gr.File(label="prompt_audio") + + infer_seed_rand_button.click( + lambda x: int(torch.randint(0, 2**32 - 1, (1,)).item()), + inputs=[infer_seed_input], + outputs=[infer_seed_input], + ) + with gr.Column(scale=3): + with gr.Row(): + with gr.Column(scale=4): + with gr.Group(): + input_title = gr.Markdown( + "📝Text Input", + elem_id="input-title", + ) + gr.Markdown( + f"- 字数限制{webui_config.tts_max:,}字,超过部分截断" + ) + gr.Markdown("- 如果尾字吞字不读,可以试试结尾加上 `[lbreak]`") + gr.Markdown( + "- If the input text is all in English, it is recommended to check disable_normalize" + ) + text_input = gr.Textbox( + show_label=False, + label="Text to Speech", + lines=10, + placeholder="输入文本或选择示例", + elem_id="text-input", + ) + # TODO 字数统计,其实实现很好写,但是就是会触发loading...并且还要和后端交互... + # text_input.change( + # fn=lambda x: ( + # f"📝Text Input ({len(x)} char)" + # if x + # else ( + # "📝Text Input (0 char)" + # if not x + # else "📝Text Input (0 char)" + # ) + # ), + # inputs=[text_input], + # outputs=[input_title], + # ) + with gr.Row(): + contorl_tokens = [ + "[laugh]", + "[uv_break]", + "[v_break]", + "[lbreak]", + ] + + for tk in contorl_tokens: + t_btn = gr.Button(tk) + t_btn.click( + lambda text, tk=tk: text + " " + tk, + inputs=[text_input], + outputs=[text_input], + ) + with gr.Column(scale=1): + with gr.Group(): + gr.Markdown("🎶Refiner") + refine_prompt_input = gr.Textbox( + label="Refine Prompt", + value="[oral_2][laugh_0][break_6]", + ) + refine_button = gr.Button("✍️Refine Text") + # TODO 分割句子,使用当前配置拼接为SSML,然后发送到SSML tab + # send_button = gr.Button("📩Split and send to SSML") + + with gr.Group(): + gr.Markdown("🔊Generate") + disable_normalize_input = gr.Checkbox( + value=False, label="Disable Normalize" + ) + tts_button = gr.Button( + "🔊Generate Audio", + variant="primary", + elem_classes="big-button", + ) + + with gr.Group(): + gr.Markdown("🎄Examples") + sample_dropdown = gr.Dropdown( + choices=[sample["text"] for sample in example_texts], + show_label=False, + value=None, + interactive=True, + ) + sample_dropdown.change( + fn=lambda x: x, + inputs=[sample_dropdown], + outputs=[text_input], + ) + + with gr.Group(): + gr.Markdown("🎨Output") + tts_output = gr.Audio(label="Generated Audio") + + refine_button.click( + refine_text, + inputs=[text_input, refine_prompt_input], + outputs=[text_input], + ) + + tts_button.click( + tts_generate, + inputs=[ + text_input, + temperature_input, + top_p_input, + top_k_input, + spk_input_text, + infer_seed_input, + use_decoder_input, + prompt1_input, + prompt2_input, + prefix_input, + style_input_dropdown, + disable_normalize_input, + batch_size_input, + ], + outputs=tts_output, + ) diff --git a/modules/webui/webui_config.py b/modules/webui/webui_config.py new file mode 100644 index 0000000..64b64d7 --- /dev/null +++ b/modules/webui/webui_config.py @@ -0,0 +1,4 @@ +tts_max = 1000 +ssml_max = 1000 +spliter_threshold = 100 +max_batch_size = 8 diff --git a/modules/webui/webui_utils.py b/modules/webui/webui_utils.py new file mode 100644 index 0000000..241d3de --- /dev/null +++ b/modules/webui/webui_utils.py @@ -0,0 +1,169 @@ +import os +import logging +import sys + +import numpy as np + +from modules.devices import devices +from modules.synthesize_audio import synthesize_audio +from modules.hf import spaces +from modules.webui import webui_config + +logging.basicConfig( + level=os.getenv("LOG_LEVEL", "INFO"), + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", +) + + +import gradio as gr + +import torch + +from modules.ssml import parse_ssml +from modules.SynthesizeSegments import SynthesizeSegments, combine_audio_segments + +from modules.speaker import speaker_mgr +from modules.data import styles_mgr + +from modules.api.utils import calc_spk_style +import modules.generate_audio as generate + +from modules.normalization import text_normalize +from modules import refiner, config + +from modules.utils import env, audio +from modules.SentenceSplitter import SentenceSplitter + + +def get_speakers(): + return speaker_mgr.list_speakers() + + +def get_styles(): + return styles_mgr.list_items() + + +def segments_length_limit(segments, total_max: int): + ret_segments = [] + total_len = 0 + for seg in segments: + if "text" not in seg: + continue + total_len += len(seg["text"]) + if total_len > total_max: + break + ret_segments.append(seg) + return ret_segments + + +@torch.inference_mode() +@spaces.GPU +def synthesize_ssml(ssml: str, batch_size=4): + try: + batch_size = int(batch_size) + except Exception: + batch_size = 8 + + ssml = ssml.strip() + + if ssml == "": + return None + + segments = parse_ssml(ssml) + max_len = webui_config.ssml_max + segments = segments_length_limit(segments, max_len) + + if len(segments) == 0: + return None + + synthesize = SynthesizeSegments(batch_size=batch_size) + audio_segments = synthesize.synthesize_segments(segments) + combined_audio = combine_audio_segments(audio_segments) + + return audio.pydub_to_np(combined_audio) + + +@torch.inference_mode() +@spaces.GPU +def tts_generate( + text, + temperature, + top_p, + top_k, + spk, + infer_seed, + use_decoder, + prompt1, + prompt2, + prefix, + style, + disable_normalize=False, + batch_size=4, +): + try: + batch_size = int(batch_size) + except Exception: + batch_size = 4 + + max_len = webui_config.tts_max + text = text.strip()[0:max_len] + + if text == "": + return None + + if style == "*auto": + style = None + + if isinstance(top_k, float): + top_k = int(top_k) + + params = calc_spk_style(spk=spk, style=style) + spk = params.get("spk", spk) + + infer_seed = infer_seed or params.get("seed", infer_seed) + temperature = temperature or params.get("temperature", temperature) + prefix = prefix or params.get("prefix", prefix) + prompt1 = prompt1 or params.get("prompt1", "") + prompt2 = prompt2 or params.get("prompt2", "") + + infer_seed = np.clip(infer_seed, -1, 2**32 - 1, out=None, dtype=np.int64) + infer_seed = int(infer_seed) + + if not disable_normalize: + text = text_normalize(text) + + sample_rate, audio_data = synthesize_audio( + text=text, + temperature=temperature, + top_P=top_p, + top_K=top_k, + spk=spk, + infer_seed=infer_seed, + use_decoder=use_decoder, + prompt1=prompt1, + prompt2=prompt2, + prefix=prefix, + batch_size=batch_size, + ) + + audio_data = audio.audio_to_int16(audio_data) + return sample_rate, audio_data + + +@torch.inference_mode() +@spaces.GPU +def refine_text(text: str, prompt: str): + text = text_normalize(text) + return refiner.refine_text(text, prompt=prompt) + + +@torch.inference_mode() +@spaces.GPU +def split_long_text(long_text_input): + spliter = SentenceSplitter(webui_config.spliter_threshold) + sentences = spliter.parse(long_text_input) + sentences = [text_normalize(s) for s in sentences] + data = [] + for i, text in enumerate(sentences): + data.append([i, text, len(text)]) + return data diff --git a/webui.py b/webui.py index 2735f98..d3fc23c 100644 --- a/webui.py +++ b/webui.py @@ -1,857 +1,10 @@ -try: - import spaces -except: - - class NoneSpaces: - def __init__(self): - pass - - def GPU(self, fn): - return fn - - spaces = NoneSpaces() - import os -import logging -import sys - -import numpy as np - from modules.devices import devices -from modules.synthesize_audio import synthesize_audio - -logging.basicConfig( - level=os.getenv("LOG_LEVEL", "INFO"), - format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", -) - - -import gradio as gr - -import torch - -from modules.ssml import parse_ssml -from modules.SynthesizeSegments import SynthesizeSegments, combine_audio_segments - -from modules.speaker import speaker_mgr -from modules.data import styles_mgr - -from modules.api.utils import calc_spk_style -import modules.generate_audio as generate - -from modules.normalization import text_normalize -from modules import refiner, config - -from modules.utils import env, audio -from modules.SentenceSplitter import SentenceSplitter - -# fix: If the system proxy is enabled in the Windows system, you need to skip these -os.environ["NO_PROXY"] = "localhost,127.0.0.1,0.0.0.0" - -torch._dynamo.config.cache_size_limit = 64 -torch._dynamo.config.suppress_errors = True -torch.set_float32_matmul_precision("high") - -webui_config = { - "tts_max": 1000, - "ssml_max": 5000, - "spliter_threshold": 100, - "max_batch_size": 8, -} - - -def get_speakers(): - return speaker_mgr.list_speakers() - - -def get_styles(): - return styles_mgr.list_items() - - -def segments_length_limit(segments, total_max: int): - ret_segments = [] - total_len = 0 - for seg in segments: - if "text" not in seg: - continue - total_len += len(seg["text"]) - if total_len > total_max: - break - ret_segments.append(seg) - return ret_segments - - -@torch.inference_mode() -@spaces.GPU -def synthesize_ssml(ssml: str, batch_size=4): - try: - batch_size = int(batch_size) - except Exception: - batch_size = 8 - - ssml = ssml.strip() - - if ssml == "": - return None - - segments = parse_ssml(ssml) - max_len = webui_config["ssml_max"] - segments = segments_length_limit(segments, max_len) - - if len(segments) == 0: - return None - - synthesize = SynthesizeSegments(batch_size=batch_size) - audio_segments = synthesize.synthesize_segments(segments) - combined_audio = combine_audio_segments(audio_segments) - - return audio.pydub_to_np(combined_audio) - - -@torch.inference_mode() -@spaces.GPU -def tts_generate( - text, - temperature, - top_p, - top_k, - spk, - infer_seed, - use_decoder, - prompt1, - prompt2, - prefix, - style, - disable_normalize=False, - batch_size=4, -): - try: - batch_size = int(batch_size) - except Exception: - batch_size = 4 - - max_len = webui_config["tts_max"] - text = text.strip()[0:max_len] - - if text == "": - return None - - if style == "*auto": - style = None - - if isinstance(top_k, float): - top_k = int(top_k) - - params = calc_spk_style(spk=spk, style=style) - spk = params.get("spk", spk) - - infer_seed = infer_seed or params.get("seed", infer_seed) - temperature = temperature or params.get("temperature", temperature) - prefix = prefix or params.get("prefix", prefix) - prompt1 = prompt1 or params.get("prompt1", "") - prompt2 = prompt2 or params.get("prompt2", "") - - infer_seed = np.clip(infer_seed, -1, 2**32 - 1, out=None, dtype=np.int64) - infer_seed = int(infer_seed) - - if not disable_normalize: - text = text_normalize(text) - - sample_rate, audio_data = synthesize_audio( - text=text, - temperature=temperature, - top_P=top_p, - top_K=top_k, - spk=spk, - infer_seed=infer_seed, - use_decoder=use_decoder, - prompt1=prompt1, - prompt2=prompt2, - prefix=prefix, - batch_size=batch_size, - ) - - audio_data = audio.audio_to_int16(audio_data) - return sample_rate, audio_data - - -@torch.inference_mode() -@spaces.GPU -def refine_text(text: str, prompt: str): - text = text_normalize(text) - return refiner.refine_text(text, prompt=prompt) - - -def read_local_readme(): - with open("README.md", "r", encoding="utf-8") as file: - content = file.read() - content = content[content.index("# ") :] - return content - - -# 演示示例文本 -sample_texts = [ - { - "text": "大🍌,一条大🍌,嘿,你的感觉真的很奇妙 [lbreak]", - }, - { - "text": "天气预报显示,今天会有小雨,请大家出门时记得带伞。降温的天气也提醒我们要适时添衣保暖 [lbreak]", - }, - { - "text": "公司的年度总结会议将在下周三举行,请各部门提前准备好相关材料,确保会议顺利进行 [lbreak]", - }, - { - "text": "今天的午餐菜单包括烤鸡、沙拉和蔬菜汤,大家可以根据自己的口味选择适合的菜品 [lbreak]", - }, - { - "text": "请注意,电梯将在下午两点进行例行维护,预计需要一个小时的时间,请大家在此期间使用楼梯 [lbreak]", - }, - { - "text": "图书馆新到了一批书籍,涵盖了文学、科学和历史等多个领域,欢迎大家前来借阅 [lbreak]", - }, - { - "text": "电影中梁朝伟扮演的陈永仁的编号27149 [lbreak]", - }, - { - "text": "这块黄金重达324.75克 [lbreak]", - }, - { - "text": "我们班的最高总分为583分 [lbreak]", - }, - { - "text": "12~23 [lbreak]", - }, - { - "text": "-1.5~2 [lbreak]", - }, - { - "text": "她出生于86年8月18日,她弟弟出生于1995年3月1日 [lbreak]", - }, - { - "text": "等会请在12:05请通知我 [lbreak]", - }, - { - "text": "今天的最低气温达到-10°C [lbreak]", - }, - { - "text": "现场有7/12的观众投出了赞成票 [lbreak]", - }, - { - "text": "明天有62%的概率降雨 [lbreak]", - }, - { - "text": "随便来几个价格12块5,34.5元,20.1万 [lbreak]", - }, - { - "text": "这是固话0421-33441122 [lbreak]", - }, - { - "text": "这是手机+86 18544139121 [lbreak]", - }, -] - -ssml_example1 = """ - - - 下面是一个 ChatTTS 用于合成多角色多情感的有声书示例[lbreak] - - - 黛玉冷笑道:[lbreak] - - - 我说呢 [uv_break] ,亏了绊住,不然,早就飞起来了[lbreak] - - - 宝玉道:[lbreak] - - - “只许和你玩 [uv_break] ,替你解闷。不过偶然到他那里,就说这些闲话。”[lbreak] - - - “好没意思的话![uv_break] 去不去,关我什么事儿? 又没叫你替我解闷儿 [uv_break],还许你不理我呢” [lbreak] - - - 说着,便赌气回房去了 [lbreak] - - -""" -ssml_example2 = """ - - - 使用 prosody 控制生成文本的语速语调和音量,示例如下 [lbreak] - - - 无任何限制将会继承父级voice配置进行生成 [lbreak] - - - 设置 rate 大于1表示加速,小于1为减速 [lbreak] - - - 设置 pitch 调整音调,设置为6表示提高6个半音 [lbreak] - - - 设置 volume 调整音量,设置为2表示提高2个分贝 [lbreak] - - - 在 voice 中无prosody包裹的文本即为默认生成状态下的语音 [lbreak] - - -""" -ssml_example3 = """ - - - 使用 break 标签将会简单的 [lbreak] - - - - 插入一段空白到生成结果中 [lbreak] - - -""" - -ssml_example4 = """ - - - temperature for sampling (may be overridden by style or speaker) [lbreak] - - 温度值用于采样,这个值有可能被 style 或者 speaker 覆盖 [lbreak] - - temperature for sampling ,这个值有可能被 style 或者 speaker 覆盖 [lbreak] - - 温度值用于采样,(may be overridden by style or speaker) [lbreak] - - -""" - -default_ssml = """ - - - 这里是一个简单的 SSML 示例 [lbreak] - - -""" - - -def create_tts_interface(): - speakers = get_speakers() - - def get_speaker_show_name(spk): - if spk.gender == "*" or spk.gender == "": - return spk.name - return f"{spk.gender} : {spk.name}" - - speaker_names = ["*random"] + [ - get_speaker_show_name(speaker) for speaker in speakers - ] - - styles = ["*auto"] + [s.get("name") for s in get_styles()] - - history = [] - - with gr.Row(): - with gr.Column(scale=1): - with gr.Group(): - gr.Markdown("🎛️Sampling") - temperature_input = gr.Slider( - 0.01, 2.0, value=0.3, step=0.01, label="Temperature" - ) - top_p_input = gr.Slider(0.1, 1.0, value=0.7, step=0.1, label="Top P") - top_k_input = gr.Slider(1, 50, value=20, step=1, label="Top K") - batch_size_input = gr.Slider( - 1, - webui_config["max_batch_size"], - value=4, - step=1, - label="Batch Size", - ) - - with gr.Row(): - with gr.Group(): - gr.Markdown("🎭Style") - gr.Markdown("- 后缀为 `_p` 表示带prompt,效果更强但是影响质量") - style_input_dropdown = gr.Dropdown( - choices=styles, - # label="Choose Style", - interactive=True, - show_label=False, - value="*auto", - ) - with gr.Row(): - with gr.Group(): - gr.Markdown("🗣️Speaker (Name or Seed)") - spk_input_text = gr.Textbox( - label="Speaker (Text or Seed)", - value="female2", - show_label=False, - ) - spk_input_dropdown = gr.Dropdown( - choices=speaker_names, - # label="Choose Speaker", - interactive=True, - value="female : female2", - show_label=False, - ) - spk_rand_button = gr.Button( - value="🎲", - # tooltip="Random Seed", - variant="secondary", - ) - spk_input_dropdown.change( - fn=lambda x: x.startswith("*") - and "-1" - or x.split(":")[-1].strip(), - inputs=[spk_input_dropdown], - outputs=[spk_input_text], - ) - spk_rand_button.click( - lambda x: str(torch.randint(0, 2**32 - 1, (1,)).item()), - inputs=[spk_input_text], - outputs=[spk_input_text], - ) - with gr.Group(): - gr.Markdown("💃Inference Seed") - infer_seed_input = gr.Number( - value=42, - label="Inference Seed", - show_label=False, - minimum=-1, - maximum=2**32 - 1, - ) - infer_seed_rand_button = gr.Button( - value="🎲", - # tooltip="Random Seed", - variant="secondary", - ) - use_decoder_input = gr.Checkbox( - value=True, label="Use Decoder", visible=False - ) - with gr.Group(): - gr.Markdown("🔧Prompt engineering") - prompt1_input = gr.Textbox(label="Prompt 1") - prompt2_input = gr.Textbox(label="Prompt 2") - prefix_input = gr.Textbox(label="Prefix") - - infer_seed_rand_button.click( - lambda x: int(torch.randint(0, 2**32 - 1, (1,)).item()), - inputs=[infer_seed_input], - outputs=[infer_seed_input], - ) - with gr.Column(scale=3): - with gr.Row(): - with gr.Column(scale=4): - with gr.Group(): - input_title = gr.Markdown( - "📝Text Input", - elem_id="input-title", - ) - gr.Markdown( - f"- 字数限制{webui_config['tts_max']:,}字,超过部分截断" - ) - gr.Markdown("- 如果尾字吞字不读,可以试试结尾加上 `[lbreak]`") - gr.Markdown( - "- If the input text is all in English, it is recommended to check disable_normalize" - ) - text_input = gr.Textbox( - show_label=False, - label="Text to Speech", - lines=10, - placeholder="输入文本或选择示例", - elem_id="text-input", - ) - # TODO 字数统计,其实实现很好写,但是就是会触发loading...并且还要和后端交互... - # text_input.change( - # fn=lambda x: ( - # f"📝Text Input ({len(x)} char)" - # if x - # else ( - # "📝Text Input (0 char)" - # if not x - # else "📝Text Input (0 char)" - # ) - # ), - # inputs=[text_input], - # outputs=[input_title], - # ) - with gr.Row(): - contorl_tokens = [ - "[laugh]", - "[uv_break]", - "[v_break]", - "[lbreak]", - ] - - for tk in contorl_tokens: - t_btn = gr.Button(tk) - t_btn.click( - lambda text, tk=tk: text + " " + tk, - inputs=[text_input], - outputs=[text_input], - ) - with gr.Column(scale=1): - with gr.Group(): - gr.Markdown("🎶Refiner") - refine_prompt_input = gr.Textbox( - label="Refine Prompt", - value="[oral_2][laugh_0][break_6]", - ) - refine_button = gr.Button("✍️Refine Text") - # TODO 分割句子,使用当前配置拼接为SSML,然后发送到SSML tab - # send_button = gr.Button("📩Split and send to SSML") - - with gr.Group(): - gr.Markdown("🔊Generate") - disable_normalize_input = gr.Checkbox( - value=False, label="Disable Normalize" - ) - tts_button = gr.Button( - "🔊Generate Audio", - variant="primary", - elem_classes="big-button", - ) - - with gr.Group(): - gr.Markdown("🎄Examples") - sample_dropdown = gr.Dropdown( - choices=[sample["text"] for sample in sample_texts], - show_label=False, - value=None, - interactive=True, - ) - sample_dropdown.change( - fn=lambda x: x, - inputs=[sample_dropdown], - outputs=[text_input], - ) - - with gr.Group(): - gr.Markdown("🎨Output") - tts_output = gr.Audio(label="Generated Audio") - - refine_button.click( - refine_text, - inputs=[text_input, refine_prompt_input], - outputs=[text_input], - ) - - tts_button.click( - tts_generate, - inputs=[ - text_input, - temperature_input, - top_p_input, - top_k_input, - spk_input_text, - infer_seed_input, - use_decoder_input, - prompt1_input, - prompt2_input, - prefix_input, - style_input_dropdown, - disable_normalize_input, - batch_size_input, - ], - outputs=tts_output, - ) - - -def create_ssml_interface(): - examples = [ - ssml_example1, - ssml_example2, - ssml_example3, - ssml_example4, - ] - - with gr.Row(): - with gr.Column(scale=3): - with gr.Group(): - gr.Markdown("📝SSML Input") - gr.Markdown(f"- 最长{webui_config['ssml_max']:,}字符,超过会被截断") - gr.Markdown("- 尽量保证使用相同的 seed") - gr.Markdown( - "- 关于SSML可以看这个 [文档](https://github.com/lenML/ChatTTS-Forge/blob/main/docs/SSML.md)" - ) - ssml_input = gr.Textbox( - label="SSML Input", - lines=10, - value=default_ssml, - placeholder="输入 SSML 或选择示例", - elem_id="ssml_input", - show_label=False, - ) - ssml_button = gr.Button("🔊Synthesize SSML", variant="primary") - with gr.Column(scale=1): - with gr.Group(): - # 参数 - gr.Markdown("🎛️Parameters") - # batch size - batch_size_input = gr.Slider( - label="Batch Size", - value=4, - minimum=1, - maximum=webui_config["max_batch_size"], - step=1, - ) - with gr.Group(): - gr.Markdown("🎄Examples") - gr.Examples( - examples=examples, - inputs=[ssml_input], - ) - - ssml_output = gr.Audio(label="Generated Audio") - - ssml_button.click( - synthesize_ssml, - inputs=[ssml_input, batch_size_input], - outputs=ssml_output, - ) - - return ssml_input - - -# NOTE: 这个其实是需要GPU的...但是spaces会自动卸载,所以不太好使,具体处理在text_normalize中兼容 -# @spaces.GPU -def split_long_text(long_text_input): - spliter = SentenceSplitter(webui_config["spliter_threshold"]) - sentences = spliter.parse(long_text_input) - sentences = [text_normalize(s) for s in sentences] - data = [] - for i, text in enumerate(sentences): - data.append([i, text, len(text)]) - return data - - -def merge_dataframe_to_ssml(dataframe, spk, style, seed): - if style == "*auto": - style = None - if spk == "-1" or spk == -1: - spk = None - if seed == -1 or seed == "-1": - seed = None - - ssml = "" - indent = " " * 2 - - for i, row in dataframe.iterrows(): - ssml += f"{indent}\n" - return f"\n{ssml}" - - -# 长文本处理 -# 可以输入长文本,并选择切割方法,切割之后可以将拼接的SSML发送到SSML tab -# 根据 。 句号切割,切割之后显示到 data table -def create_long_content_tab(ssml_input, tabs): - speakers = get_speakers() - - def get_speaker_show_name(spk): - if spk.gender == "*" or spk.gender == "": - return spk.name - return f"{spk.gender} : {spk.name}" - - speaker_names = ["*random"] + [ - get_speaker_show_name(speaker) for speaker in speakers - ] - - styles = ["*auto"] + [s.get("name") for s in get_styles()] - - with gr.Row(): - with gr.Column(scale=1): - # 选择说话人 选择风格 选择seed - with gr.Group(): - gr.Markdown("🗣️Speaker") - spk_input_text = gr.Textbox( - label="Speaker (Text or Seed)", - value="female2", - show_label=False, - ) - spk_input_dropdown = gr.Dropdown( - choices=speaker_names, - interactive=True, - value="female : female2", - show_label=False, - ) - spk_rand_button = gr.Button( - value="🎲", - variant="secondary", - ) - with gr.Group(): - gr.Markdown("🎭Style") - style_input_dropdown = gr.Dropdown( - choices=styles, - interactive=True, - show_label=False, - value="*auto", - ) - with gr.Group(): - gr.Markdown("🗣️Seed") - infer_seed_input = gr.Number( - value=42, - label="Inference Seed", - show_label=False, - minimum=-1, - maximum=2**32 - 1, - ) - infer_seed_rand_button = gr.Button( - value="🎲", - variant="secondary", - ) - - send_btn = gr.Button("📩Send to SSML", variant="primary") - - with gr.Column(scale=3): - with gr.Group(): - gr.Markdown("📝Long Text Input") - gr.Markdown("- 此页面用于处理超长文本") - gr.Markdown("- 切割后,可以选择说话人、风格、seed,然后发送到SSML") - long_text_input = gr.Textbox( - label="Long Text Input", - lines=10, - placeholder="输入长文本", - elem_id="long-text-input", - show_label=False, - ) - long_text_split_button = gr.Button("🔪Split Text") - - with gr.Row(): - with gr.Column(scale=3): - with gr.Group(): - gr.Markdown("🎨Output") - long_text_output = gr.DataFrame( - headers=["index", "text", "length"], - datatype=["number", "str", "number"], - elem_id="long-text-output", - interactive=False, - wrap=True, - value=[], - ) - - spk_input_dropdown.change( - fn=lambda x: x.startswith("*") and "-1" or x.split(":")[-1].strip(), - inputs=[spk_input_dropdown], - outputs=[spk_input_text], - ) - spk_rand_button.click( - lambda x: int(torch.randint(0, 2**32 - 1, (1,)).item()), - inputs=[spk_input_text], - outputs=[spk_input_text], - ) - infer_seed_rand_button.click( - lambda x: int(torch.randint(0, 2**32 - 1, (1,)).item()), - inputs=[infer_seed_input], - outputs=[infer_seed_input], - ) - long_text_split_button.click( - split_long_text, - inputs=[long_text_input], - outputs=[long_text_output], - ) - - infer_seed_rand_button.click( - lambda x: int(torch.randint(0, 2**32 - 1, (1,)).item()), - inputs=[infer_seed_input], - outputs=[infer_seed_input], - ) - - send_btn.click( - merge_dataframe_to_ssml, - inputs=[ - long_text_output, - spk_input_text, - style_input_dropdown, - infer_seed_input, - ], - outputs=[ssml_input], - ) - - def change_tab(): - return gr.Tabs(selected="ssml") - - send_btn.click(change_tab, inputs=[], outputs=[tabs]) - - -def create_readme_tab(): - readme_content = read_local_readme() - gr.Markdown(readme_content) - - -def create_app_footer(): - gradio_version = gr.__version__ - git_tag = config.versions.git_tag - git_commit = config.versions.git_commit - git_branch = config.versions.git_branch - python_version = config.versions.python_version - torch_version = config.versions.torch_version - - config.versions.gradio_version = gradio_version - - gr.Markdown( - f""" -🍦 [ChatTTS-Forge](https://github.com/lenML/ChatTTS-Forge) -version: [{git_tag}](https://github.com/lenML/ChatTTS-Forge/commit/{git_commit}) | branch: `{git_branch}` | python: `{python_version}` | torch: `{torch_version}` - """ - ) - - -def create_interface(): - - js_func = """ - function refresh() { - const url = new URL(window.location); - - if (url.searchParams.get('__theme') !== 'dark') { - url.searchParams.set('__theme', 'dark'); - window.location.href = url.href; - } - } - """ - - head_js = """ - - """ - - with gr.Blocks(js=js_func, head=head_js, title="ChatTTS Forge WebUI") as demo: - css = """ - - """ - - gr.HTML(css) - with gr.Tabs() as tabs: - with gr.TabItem("TTS"): - create_tts_interface() - - with gr.TabItem("SSML", id="ssml"): - ssml_input = create_ssml_interface() - - with gr.TabItem("Long Text"): - create_long_content_tab(ssml_input, tabs=tabs) - - with gr.TabItem("README"): - create_readme_tab() - - create_app_footer() - return demo - +from modules.utils import env +from modules.webui import webui_config +from modules.webui.app import webui_init, create_interface +from modules import generate_audio +from modules import config if __name__ == "__main__": import argparse @@ -914,6 +67,12 @@ def create_interface(): type=str.lower, ) parser.add_argument("--compile", action="store_true", help="Enable model compile") + # webui_Experimental + parser.add_argument( + "--webui_experimental", + action="store_true", + help="Enable webui_experimental features", + ) args = parser.parse_args() @@ -934,20 +93,23 @@ def get_and_update_env(*args): device_id = get_and_update_env(args, "device_id", None, str) use_cpu = get_and_update_env(args, "use_cpu", [], list) compile = get_and_update_env(args, "compile", False, bool) + webui_experimental = get_and_update_env(args, "webui_experimental", False, bool) - webui_config["tts_max"] = get_and_update_env(args, "tts_max_len", 1000, int) - webui_config["ssml_max"] = get_and_update_env(args, "ssml_max_len", 5000, int) - webui_config["max_batch_size"] = get_and_update_env(args, "max_batch_size", 8, int) + webui_config.tts_max = get_and_update_env(args, "tts_max_len", 1000, int) + webui_config.ssml_max = get_and_update_env(args, "ssml_max_len", 5000, int) + webui_config.max_batch_size = get_and_update_env(args, "max_batch_size", 8, int) demo = create_interface() if auth: auth = tuple(auth.split(":")) - generate.setup_lru_cache() + generate_audio.setup_lru_cache() devices.reset_device() devices.first_time_calculation() + webui_init() + demo.queue().launch( server_name=server_name, server_port=server_port,