Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Create a separate version for XTTS with HiFi-GAN #3036

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions TTS/.models.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,19 @@
"multi-dataset": {
"xtts_v1": {
"description": "XTTS-v1 by Coqui with 13 languages and cross-language voice cloning.",
"hf_url": [
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/model.pth",
"https://huggingface.co/coqui/XTTS-v1/resolve/xtts_v1/config.json",
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/vocab.json"
],
"default_vocoder": null,
"commit": "e5140314",
"license": "CPML",
"contact": "info@coqui.ai",
"tos_required": true
},
"xtts_v1.1": {
"description": "XTTS-v1.1 by Coqui with 13 languages and cross-language voice cloning with faster inference and streaming support.",
"hf_url": [
"https://huggingface.co/coqui/XTTS-v1/resolve/hifigan/model.pth",
"https://huggingface.co/coqui/XTTS-v1/resolve/hifigan/config.json",
Expand Down
4 changes: 2 additions & 2 deletions TTS/tts/models/xtts.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@ class XttsArgs(Coqpit):
clvp_checkpoint (str, optional): The checkpoint for the ConditionalLatentVariablePerseq model. Defaults to None.
decoder_checkpoint (str, optional): The checkpoint for the DiffTTS model. Defaults to None.
num_chars (int, optional): The maximum number of characters to generate. Defaults to 255.
use_hifigan (bool, optional): Whether to use hifigan or diffusion + univnet as a decoder. Defaults to True.
use_hifigan (bool, optional): Whether to use hifigan or diffusion + univnet as a decoder. Defaults to False.

For GPT model:
ar_max_audio_tokens (int, optional): The maximum mel tokens for the autoregressive model. Defaults to 604.
Expand Down Expand Up @@ -238,7 +238,7 @@ class XttsArgs(Coqpit):
clvp_checkpoint: str = None
decoder_checkpoint: str = None
num_chars: int = 255
use_hifigan: bool = True
use_hifigan: bool = False

# XTTS GPT Encoder params
tokenizer_file: str = ""
Expand Down
2 changes: 1 addition & 1 deletion TTS/utils/manage.py
Original file line number Diff line number Diff line change
Expand Up @@ -364,7 +364,7 @@ def download_model(self, model_name):
output_model_path = output_path
output_config_path = None
if (
model not in ["tortoise-v2", "bark", "xtts_v1"] and "fairseq" not in model_name
model not in ["tortoise-v2", "bark", "xtts_v1", "xtts_v1.1"] and "fairseq" not in model_name
): # TODO:This is stupid but don't care for now.
output_model_path, output_config_path = self._find_files(output_path)
# update paths in the config.json
Expand Down
4 changes: 2 additions & 2 deletions docs/source/models/xtts.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ You can also mail us at info@coqui.ai.

```python
from TTS.api import TTS
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v1", gpu=True)
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v1.1", gpu=True)

# generate speech by cloning a voice using default settings
tts.tts_to_file(text="It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
Expand All @@ -45,7 +45,7 @@ tts.tts_to_file(text="It took me quite a long time to develop a voice, and now t
#### 🐸TTS Command line

```console
tts --model_name tts_models/multilingual/multi-dataset/xtts_v1 \
tts --model_name tts_models/multilingual/multi-dataset/xtts_v1.1 \
--text "Bugün okula gitmek istemiyorum." \
--speaker_wav /path/to/target/speaker.wav \
--language_idx tr \
Expand Down
2 changes: 1 addition & 1 deletion tests/zoo_tests/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
MODELS_WITH_SEP_TESTS = [
"tts_models/multilingual/multi-dataset/bark",
"tts_models/en/multi-dataset/tortoise-v2",
"tts_models/multilingual/multi-dataset/xtts_v1",
"tts_models/multilingual/multi-dataset/xtts_v1"
]


Expand Down
Loading