Commit

Merge pull request #3081 from coqui-ai/dev
v0.17.9
erogol authored Oct 19, 2023
2 parents df2422e + bf68848 commit f0faed9
Showing 17 changed files with 311 additions and 207 deletions.
29 changes: 21 additions & 8 deletions README.md
@@ -146,7 +146,7 @@ Underlined "TTS*" and "Judy*" are **internal** 🐸TTS models that are not relea
You can also help us implement more models.

## Installation
-🐸TTS is tested on Ubuntu 18.04 with **python >= 3.7, < 3.11.**.
+🐸TTS is tested on Ubuntu 18.04 with **python >= 3.9, < 3.12.**.

If you are only interested in [synthesizing speech](https://tts.readthedocs.io/en/latest/inference.html) with the released 🐸TTS models, installing from PyPI is the easiest option.

@@ -198,17 +198,18 @@ from TTS.api import TTS
# Get device
device = "cuda" if torch.cuda.is_available() else "cpu"

-# List available 🐸TTS models and choose the first one
-model_name = TTS().list_models()[0]
+# List available 🐸TTS models
+print(TTS().list_models())

# Init TTS
-tts = TTS(model_name).to(device)
+tts = TTS("tts_models/multilingual/multi-dataset/xtts_v1").to(device)

# Run TTS
-# ❗ Since this model is multi-speaker and multi-lingual, we must set the target speaker and the language
-# Text to speech with a numpy output
-wav = tts.tts("This is a test! This is also a test!!", speaker=tts.speakers[0], language=tts.languages[0])
+# ❗ Since this model is multi-lingual voice cloning model, we must set the target speaker_wav and language
+# Text to speech list of amplitude values as output
+wav = tts.tts(text="Hello world!", speaker_wav="my/cloning/audio.wav", language="en")
# Text to speech to a file
tts.tts_to_file(text="Hello world!", speaker=tts.speakers[0], language=tts.languages[0], file_path="output.wav")
tts.tts_to_file(text="Hello world!", speaker_wav="my/cloning/audio.wav", language="en", file_path="output.wav")
```
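The updated example returns the waveform as a plain list of amplitude values rather than writing a file. As a hypothetical follow-on (not part of this commit), one way to persist that output yourself, assuming the `soundfile` package and XTTS's 24 kHz output rate:

```python
# Sketch only: `soundfile` and the 24000 Hz sample rate are assumptions,
# not taken from this diff.
import soundfile as sf

# `wav` is the list of amplitude values returned by tts.tts(...) above
sf.write("cloned.wav", wav, samplerate=24000)
```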

#### Running a single speaker model
@@ -347,6 +348,18 @@ If you don't specify any models, then it uses LJSpeech based English model.
$ tts --text "Text for TTS" --out_path output/path/speech.wav
```
+- Run TTS and pipe out the generated TTS wav file data:
+```
+$ tts --text "Text for TTS" --pipe_out --out_path output/path/speech.wav | aplay
+```
+- Run TTS and define speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0:
+```
+$ tts --text "Text for TTS" --model_name "coqui_studio/<language>/<dataset>/<model_name>" --speed 1.2 --out_path output/path/speech.wav
+```
- Run a TTS model with its default vocoder model:
```
8 changes: 4 additions & 4 deletions TTS/.models.json
@@ -5,9 +5,9 @@
"xtts_v1": {
"description": "XTTS-v1 by Coqui with 13 languages and cross-language voice cloning.",
"hf_url": [
"https://huggingface.co/coqui/XTTS-v1/resolve/hifigan/model.pth",
"https://huggingface.co/coqui/XTTS-v1/resolve/hifigan/config.json",
"https://huggingface.co/coqui/XTTS-v1/resolve/hifigan/vocab.json"
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/hifigan/model.pth",
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/hifigan/config.json",
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/hifigan/vocab.json",
],
"default_vocoder": null,
"commit": "e5140314",
@@ -917,4 +917,4 @@
}
}
}
-}
+}
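The `hf_url` entries above now point at Coqui's Scarf gateway instead of huggingface.co directly. As a minimal sketch of what a client resolving that list might do, assuming `requests` and a local target directory (the actual downloader in the TTS model manager is not shown in this excerpt):

```python
# Hypothetical fetch loop over the hf_url list above; `requests` usage and
# the "xtts_v1" output directory are assumptions, not taken from this diff.
import os
import requests

hf_url = [
    "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/hifigan/model.pth",
    "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/hifigan/config.json",
    "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/hifigan/vocab.json",
]

os.makedirs("xtts_v1", exist_ok=True)
for url in hf_url:
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    filename = url.rsplit("/", 1)[-1]
    with open(os.path.join("xtts_v1", filename), "wb") as f:
        f.write(response.content)
```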
2 changes: 1 addition & 1 deletion TTS/VERSION
@@ -1 +1 @@
-0.17.8
+0.17.9
18 changes: 15 additions & 3 deletions TTS/api.py
@@ -112,7 +112,6 @@ def is_multi_lingual(self):
return self.synthesizer.tts_model.language_manager.num_languages > 1
return False

-
@property
def speakers(self):
if not self.is_multi_speaker:
@@ -265,6 +264,7 @@ def tts_coqui_studio(
language: str = None,
emotion: str = None,
speed: float = 1.0,
+pipe_out = None,
file_path: str = None,
) -> Union[np.ndarray, str]:
"""Convert text to speech using Coqui Studio models. Use `CS_API` class if you are only interested in the API.
@@ -281,6 +281,8 @@ with "V1" model. Defaults to None.
with "V1" model. Defaults to None.
speed (float, optional):
Speed of the speech. Defaults to 1.0.
+pipe_out (BytesIO, optional):
+    Flag to stdout the generated TTS wav file for shell pipe.
file_path (str, optional):
Path to save the output file. When None it returns the `np.ndarray` of waveform. Defaults to None.
@@ -294,6 +296,7 @@
speaker_name=speaker_name,
language=language,
speed=speed,
+pipe_out=pipe_out,
emotion=emotion,
file_path=file_path,
)[0]
@@ -356,6 +359,7 @@ def tts_to_file(
speaker_wav: str = None,
emotion: str = None,
speed: float = 1.0,
+pipe_out = None,
file_path: str = "output.wav",
**kwargs,
):
@@ -377,6 +381,8 @@
Emotion to use for 🐸Coqui Studio models. Defaults to "Neutral".
speed (float, optional):
Speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0. Defaults to None.
+pipe_out (BytesIO, optional):
+    Flag to stdout the generated TTS wav file for shell pipe.
file_path (str, optional):
Output file path. Defaults to "output.wav".
kwargs (dict, optional):
@@ -386,10 +392,16 @@

if self.csapi is not None:
return self.tts_coqui_studio(
-text=text, speaker_name=speaker, language=language, emotion=emotion, speed=speed, file_path=file_path
+text=text,
+speaker_name=speaker,
+language=language,
+emotion=emotion,
+speed=speed,
+file_path=file_path,
+pipe_out=pipe_out,
)
wav = self.tts(text=text, speaker=speaker, language=language, speaker_wav=speaker_wav, **kwargs)
-self.synthesizer.save_wav(wav=wav, path=file_path)
+self.synthesizer.save_wav(wav=wav, path=file_path, pipe_out=pipe_out)
return file_path

def voice_conversion(
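The api.py changes thread the new `pipe_out` argument from `tts_to_file` down to `save_wav`, mirroring the `tts --pipe_out ... | aplay` CLI example earlier in this diff. A hypothetical call from Python, assuming `pipe_out` accepts a writable binary stream such as `sys.stdout.buffer` (the docstring above only describes it as a flag for shell piping, so the exact expected type is an assumption):

```python
# Sketch only: passing sys.stdout.buffer is an assumption; this diff does
# not show what stream type save_wav() ultimately expects for pipe_out.
import sys

from TTS.api import TTS

tts = TTS("tts_models/multilingual/multi-dataset/xtts_v1")
tts.tts_to_file(
    text="Hello world!",
    speaker_wav="my/cloning/audio.wav",
    language="en",
    file_path="output.wav",
    pipe_out=sys.stdout.buffer,  # stream the WAV bytes to stdout for `| aplay`
)
```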
