Commit

Merge pull request #3081 from coqui-ai/dev
v0.17.9
erogol authored Oct 19, 2023
2 parents df2422e + bf68848 commit f0faed9
Showing 17 changed files with 311 additions and 207 deletions.
29 changes: 21 additions & 8 deletions README.md
@@ -146,7 +146,7 @@ Underlined "TTS*" and "Judy*" are **internal** 🐸TTS models that are not relea
You can also help us implement more models.

## Installation
-🐸TTS is tested on Ubuntu 18.04 with **python >= 3.7, < 3.11.**.
+🐸TTS is tested on Ubuntu 18.04 with **python >= 3.9, < 3.12.**.

If you are only interested in [synthesizing speech](https://tts.readthedocs.io/en/latest/inference.html) with the released 🐸TTS models, installing from PyPI is the easiest option.

@@ -198,17 +198,18 @@ from TTS.api import TTS
# Get device
device = "cuda" if torch.cuda.is_available() else "cpu"

-# List available 🐸TTS models and choose the first one
-model_name = TTS().list_models()[0]
+# List available 🐸TTS models
+print(TTS().list_models())

# Init TTS
-tts = TTS(model_name).to(device)
+tts = TTS("tts_models/multilingual/multi-dataset/xtts_v1").to(device)

# Run TTS
-# ❗ Since this model is multi-speaker and multi-lingual, we must set the target speaker and the language
-# Text to speech with a numpy output
-wav = tts.tts("This is a test! This is also a test!!", speaker=tts.speakers[0], language=tts.languages[0])
+# ❗ Since this model is multi-lingual voice cloning model, we must set the target speaker_wav and language
+# Text to speech list of amplitude values as output
+wav = tts.tts(text="Hello world!", speaker_wav="my/cloning/audio.wav", language="en")
# Text to speech to a file
tts.tts_to_file(text="Hello world!", speaker=tts.speakers[0], language=tts.languages[0], file_path="output.wav")
tts.tts_to_file(text="Hello world!", speaker_wav="my/cloning/audio.wav", language="en", file_path="output.wav")
```
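The updated example returns the waveform as a plain list of amplitude values rather than writing a file. As a hypothetical follow-on (not part of this commit), one way to persist that output yourself, assuming the `soundfile` package and XTTS's 24 kHz output rate:

```python
# Sketch only: `soundfile` and the 24000 Hz sample rate are assumptions,
# not taken from this diff.
import soundfile as sf

# `wav` is the list of amplitude values returned by tts.tts(...) above
sf.write("cloned.wav", wav, samplerate=24000)
```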

#### Running a single speaker model
@@ -347,6 +348,18 @@ If you don't specify any models, then it uses LJSpeech based English model.
$ tts --text "Text for TTS" --out_path output/path/speech.wav
```
+- Run TTS and pipe out the generated TTS wav file data:
+```
+$ tts --text "Text for TTS" --pipe_out --out_path output/path/speech.wav | aplay
+```
+- Run TTS and define speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0:
+```
+$ tts --text "Text for TTS" --model_name "coqui_studio/<language>/<dataset>/<model_name>" --speed 1.2 --out_path output/path/speech.wav
+```
- Run a TTS model with its default vocoder model:
```
8 changes: 4 additions & 4 deletions TTS/.models.json
@@ -5,9 +5,9 @@
"xtts_v1": {
"description": "XTTS-v1 by Coqui with 13 languages and cross-language voice cloning.",
"hf_url": [
"https://huggingface.co/coqui/XTTS-v1/resolve/hifigan/model.pth",
"https://huggingface.co/coqui/XTTS-v1/resolve/hifigan/config.json",
"https://huggingface.co/coqui/XTTS-v1/resolve/hifigan/vocab.json"
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/hifigan/model.pth",
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/hifigan/config.json",
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/hifigan/vocab.json",
],
"default_vocoder": null,
"commit": "e5140314",
@@ -917,4 +917,4 @@
}
}
}
-}
+}
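The `hf_url` entries above now point at Coqui's Scarf gateway instead of huggingface.co directly. As a minimal sketch of what a client resolving that list might do, assuming `requests` and a local target directory (the actual downloader in the TTS model manager is not shown in this excerpt):

```python
# Hypothetical fetch loop over the hf_url list above; `requests` usage and
# the "xtts_v1" output directory are assumptions, not taken from this diff.
import os
import requests

hf_url = [
    "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/hifigan/model.pth",
    "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/hifigan/config.json",
    "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/hifigan/vocab.json",
]

os.makedirs("xtts_v1", exist_ok=True)
for url in hf_url:
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    filename = url.rsplit("/", 1)[-1]
    with open(os.path.join("xtts_v1", filename), "wb") as f:
        f.write(response.content)
```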
2 changes: 1 addition & 1 deletion TTS/VERSION
@@ -1 +1 @@
-0.17.8
+0.17.9
18 changes: 15 additions & 3 deletions TTS/api.py
@@ -112,7 +112,6 @@ def is_multi_lingual(self):
return self.synthesizer.tts_model.language_manager.num_languages > 1
return False

-
@property
def speakers(self):
if not self.is_multi_speaker:
@@ -265,6 +264,7 @@ def tts_coqui_studio(
language: str = None,
emotion: str = None,
speed: float = 1.0,
+pipe_out = None,
file_path: str = None,
) -> Union[np.ndarray, str]:
"""Convert text to speech using Coqui Studio models. Use `CS_API` class if you are only interested in the API.
@@ -281,6 +281,8 @@ with "V1" model. Defaults to None.
with "V1" model. Defaults to None.
speed (float, optional):
Speed of the speech. Defaults to 1.0.
+pipe_out (BytesIO, optional):
+    Flag to stdout the generated TTS wav file for shell pipe.
file_path (str, optional):
Path to save the output file. When None it returns the `np.ndarray` of waveform. Defaults to None.
@@ -294,6 +296,7 @@
speaker_name=speaker_name,
language=language,
speed=speed,
+pipe_out=pipe_out,
emotion=emotion,
file_path=file_path,
)[0]
@@ -356,6 +359,7 @@ def tts_to_file(
speaker_wav: str = None,
emotion: str = None,
speed: float = 1.0,
+pipe_out = None,
file_path: str = "output.wav",
**kwargs,
):
@@ -377,6 +381,8 @@
Emotion to use for 🐸Coqui Studio models. Defaults to "Neutral".
speed (float, optional):
Speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0. Defaults to None.
+pipe_out (BytesIO, optional):
+    Flag to stdout the generated TTS wav file for shell pipe.
file_path (str, optional):
Output file path. Defaults to "output.wav".
kwargs (dict, optional):
@@ -386,10 +392,16 @@

if self.csapi is not None:
return self.tts_coqui_studio(
-text=text, speaker_name=speaker, language=language, emotion=emotion, speed=speed, file_path=file_path
+text=text,
+speaker_name=speaker,
+language=language,
+emotion=emotion,
+speed=speed,
+file_path=file_path,
+pipe_out=pipe_out,
)
wav = self.tts(text=text, speaker=speaker, language=language, speaker_wav=speaker_wav, **kwargs)
-self.synthesizer.save_wav(wav=wav, path=file_path)
+self.synthesizer.save_wav(wav=wav, path=file_path, pipe_out=pipe_out)
return file_path

def voice_conversion(
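The api.py changes thread the new `pipe_out` argument from `tts_to_file` down to `save_wav`, mirroring the `tts --pipe_out ... | aplay` CLI example earlier in this diff. A hypothetical call from Python, assuming `pipe_out` accepts a writable binary stream such as `sys.stdout.buffer` (the docstring above only describes it as a flag for shell piping, so the exact expected type is an assumption):

```python
# Sketch only: passing sys.stdout.buffer is an assumption; this diff does
# not show what stream type save_wav() ultimately expects for pipe_out.
import sys

from TTS.api import TTS

tts = TTS("tts_models/multilingual/multi-dataset/xtts_v1")
tts.tts_to_file(
    text="Hello world!",
    speaker_wav="my/cloning/audio.wav",
    language="en",
    file_path="output.wav",
    pipe_out=sys.stdout.buffer,  # stream the WAV bytes to stdout for `| aplay`
)
```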
