From fdd443b10f099e089f09d7cc471279f4609b2751 Mon Sep 17 00:00:00 2001 From: matatonic Date: Mon, 1 Jul 2024 19:43:32 -0400 Subject: [PATCH] 0.17.1 --- README.md | 5 +++++ add_voice.py | 4 ++-- requirements-rocm.txt | 1 + speech.py | 6 +++++- voice_to_speaker.default.yaml | 3 ++- 5 files changed, 15 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 8c73f1a..f775da3 100644 --- a/README.md +++ b/README.md @@ -30,6 +30,11 @@ If you find a better voice match for `tts-1` or `tts-1-hd`, please let me know s ## Recent Changes +Version 0.17.1, 2024-07-01 + +* fix ROCm (add langdetect to requirements-rocm.txt) +* Fix zh-cn for xtts + Version 0.17.0, 2024-07-01 * Automatic language detection, thanks [@RodolfoCastanheira](https://github.com/RodolfoCastanheira) diff --git a/add_voice.py b/add_voice.py index bb86ac5..7b2c1f8 100755 --- a/add_voice.py +++ b/add_voice.py @@ -11,8 +11,8 @@ parser.add_argument('sample', action='store', help="Set the wav sample file") parser.add_argument('-n', '--name', action='store', help="Set the name for the voice (by default will use the WAV file name)") -parser.add_argument('-l', '--language', action='store', default="en", help="Set the language for the voice", - choices=['en', 'es', 'fr', 'de', 'it', 'pt', 'pl', 'tr', 'ru', 'nl', 'cs', 'ar', 'zh-cn', 'ja', 'hu', 'ko']) +parser.add_argument('-l', '--language', action='store', default="auto", help="Set the language for the voice", + choices=['auto', 'en', 'es', 'fr', 'de', 'it', 'pt', 'pl', 'tr', 'ru', 'nl', 'cs', 'ar', 'zh-cn', 'ja', 'hu', 'ko', 'hi']) parser.add_argument('--openai-model', action='store', default="tts-1-hd", help="Set the openai model for the voice") parser.add_argument('--xtts-model', action='store', default="xtts", help="Set the xtts model for the voice (if using a custom model, also set model_path)") parser.add_argument('--model-path', action='store', default=None, help="Set the path for a custom xtts model") diff --git a/requirements-rocm.txt b/requirements-rocm.txt index a47e4e8..2dcb36f 100644 --- a/requirements-rocm.txt +++ b/requirements-rocm.txt @@ -3,6 +3,7 @@ uvicorn loguru piper-tts coqui-tts +langdetect # Creating an environment where deepspeed works is complex, for now it will be disabled by default. #deepspeed torch; --index-url https://download.pytorch.org/whl/rocm5.7; sys_platform == "linux" diff --git a/speech.py b/speech.py index 0cb5cd3..687a8fc 100755 --- a/speech.py +++ b/speech.py @@ -296,7 +296,11 @@ async def generate_speech(request: GenerateSpeechRequest): hf_generate_kwargs['enable_text_splitting'] = hf_generate_kwargs.get('enable_text_splitting', True) # change the default to true if hf_generate_kwargs['enable_text_splitting']: - all_text = split_sentence(input_text, language, xtts.xtts.tokenizer.char_limits[language]) + if language == 'zh-cn': + split_lang = 'zh' + else: + split_lang = language + all_text = split_sentence(input_text, split_lang, xtts.xtts.tokenizer.char_limits[split_lang]) else: all_text = [input_text] diff --git a/voice_to_speaker.default.yaml b/voice_to_speaker.default.yaml index 0604830..afce1a6 100644 --- a/voice_to_speaker.default.yaml +++ b/voice_to_speaker.default.yaml @@ -46,8 +46,9 @@ tts-1-hd: model: xtts speaker: voices/shimmer.wav me: - model: xtts_v2.0.2 # you can specify different xtts version + model: xtts_v2.0.2 # you can specify an older xtts version speaker: voices/me.wav # this could be you + language: auto enable_text_splitting: True length_penalty: 1.0 repetition_penalty: 10