Base changes to write audio to stdout instead of playing it

Considering a conversion that pipes the WAV data to another program, such as aplay, for audio playback. This code is incomplete; it is being shared to get feedback before proceeding with the implementation.
coqui-ai · Oct 6, 2023 · f1b1f4a · f1b1f4a
1 parent 403ae73
commit f1b1f4a
Show file tree

Hide file tree

Showing 3 changed files with 185 additions and 186 deletions.
diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py
@@ -3,6 +3,7 @@
 
 import argparse
 import sys
+import contextlib
 from argparse import RawTextHelpFormatter
 
 # pylint: disable=redefined-outer-name, unused-argument
@@ -241,7 +242,7 @@ def main():
         default=None,
     )
     parser.add_argument(
-        "--play",
+        "--pipe_out",
         help="Play the generated TTS wav.",
         type=str2bool,
         nargs="?",
@@ -361,179 +362,177 @@ def main():
     if not any(check_args):
         parser.parse_args(["-h"])
 
-    # Late-import to make things load faster
-    from TTS.api import TTS
-    from TTS.utils.manage import ModelManager
-    from TTS.utils.synthesizer import Synthesizer
-
-    # load model manager
-    path = Path(__file__).parent / "../.models.json"
-    manager = ModelManager(path, progress_bar=args.progress_bar)
-    api = TTS()
-
-    tts_path = None
-    tts_config_path = None
-    speakers_file_path = None
-    language_ids_file_path = None
-    vocoder_path = None
-    vocoder_config_path = None
-    encoder_path = None
-    encoder_config_path = None
-    vc_path = None
-    vc_config_path = None
-    model_dir = None
-
-    # CASE1 #list : list pre-trained TTS models
-    if args.list_models:
-        manager.add_cs_api_models(api.list_models())
-        manager.list_models()
-        sys.exit()
-
-    # CASE2 #info : model info for pre-trained TTS models
-    if args.model_info_by_idx:
-        model_query = args.model_info_by_idx
-        manager.model_info_by_idx(model_query)
-        sys.exit()
-
-    if args.model_info_by_name:
-        model_query_full_name = args.model_info_by_name
-        manager.model_info_by_full_name(model_query_full_name)
-        sys.exit()
-
-    # CASE3: TTS with coqui studio models
-    if "coqui_studio" in args.model_name:
-        print(" > Using 🐸Coqui Studio model: ", args.model_name)
-        api = TTS(model_name=args.model_name, cs_api_model=args.cs_model)
-        api.tts_to_file(
-            text=args.text,
-            emotion=args.emotion,
-            file_path=args.out_path,
-            language=args.language,
-            speed=args.speed,
-            play=args.play,
-        )
-        if args.play:
-            print(" > Played TTS wav")
-        print(" > Saving output to ", args.out_path)
-        return
-
-    # CASE4: load pre-trained model paths
-    if args.model_name is not None and not args.model_path:
-        model_path, config_path, model_item = manager.download_model(args.model_name)
-        # tts model
-        if model_item["model_type"] == "tts_models":
-            tts_path = model_path
-            tts_config_path = config_path
-            if "default_vocoder" in model_item:
-                args.vocoder_name = model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name
-
-        # voice conversion model
-        if model_item["model_type"] == "voice_conversion_models":
-            vc_path = model_path
-            vc_config_path = config_path
-
-        # tts model with multiple files to be loaded from the directory path
-        if model_item.get("author", None) == "fairseq" or isinstance(model_item["model_url"], list):
-            model_dir = model_path
-            tts_path = None
-            tts_config_path = None
-            args.vocoder_name = None
-
-    # load vocoder
-    if args.vocoder_name is not None and not args.vocoder_path:
-        vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)
-
-    # CASE5: set custom model paths
-    if args.model_path is not None:
-        tts_path = args.model_path
-        tts_config_path = args.config_path
-        speakers_file_path = args.speakers_file_path
-        language_ids_file_path = args.language_ids_file_path
-
-    if args.vocoder_path is not None:
-        vocoder_path = args.vocoder_path
-        vocoder_config_path = args.vocoder_config_path
-
-    if args.encoder_path is not None:
-        encoder_path = args.encoder_path
-        encoder_config_path = args.encoder_config_path
-
-    device = args.device
-    if args.use_cuda:
-        device = "cuda"
-
-    # load models
-    synthesizer = Synthesizer(
-        tts_path,
-        tts_config_path,
-        speakers_file_path,
-        language_ids_file_path,
-        vocoder_path,
-        vocoder_config_path,
-        encoder_path,
-        encoder_config_path,
-        vc_path,
-        vc_config_path,
-        model_dir,
-        args.voice_dir,
-    ).to(device)
-
-    # query speaker ids of a multi-speaker model.
-    if args.list_speaker_idxs:
-        print(
-            " > Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model."
-        )
-        print(synthesizer.tts_model.speaker_manager.name_to_id)
-        return
-
-    # query langauge ids of a multi-lingual model.
-    if args.list_language_idxs:
-        print(
-            " > Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model."
-        )
-        print(synthesizer.tts_model.language_manager.name_to_id)
-        return
-
-    # check the arguments against a multi-speaker model.
-    if synthesizer.tts_speakers_file and (not args.speaker_idx and not args.speaker_wav):
-        print(
-            " [!] Looks like you use a multi-speaker model. Define `--speaker_idx` to "
-            "select the target speaker. You can list the available speakers for this model by `--list_speaker_idxs`."
-        )
-        return
-
-    # RUN THE SYNTHESIS
-    if args.text:
-        print(" > Text: {}".format(args.text))
-
-    # kick it
-    if tts_path is not None:
-        wav = synthesizer.tts(
-            args.text,
-            speaker_name=args.speaker_idx,
-            language_name=args.language_idx,
-            speaker_wav=args.speaker_wav,
-            reference_wav=args.reference_wav,
-            style_wav=args.capacitron_style_wav,
-            style_text=args.capacitron_style_text,
-            reference_speaker_name=args.reference_speaker_idx,
-        )
-    elif vc_path is not None:
-        wav = synthesizer.voice_conversion(
-            source_wav=args.source_wav,
-            target_wav=args.target_wav,
-        )
-    elif model_dir is not None:
-        wav = synthesizer.tts(
-            args.text, speaker_name=args.speaker_idx, language_name=args.language_idx, speaker_wav=args.speaker_wav
-        )
-
-    if args.play:
-        print(" > Playing TTS wav")
-
-    # save the results
-    print(" > Saving output to {}".format(args.out_path))
-    synthesizer.save_wav(wav, args.out_path, play=args.play)
+    pipe_out = sys.stdout if args.pipe_out else None
+
+    with contextlib.redirect_stdout(None if args.pipe_out else sys.stdout):
+        # Late-import to make things load faster
+        from TTS.api import TTS
+        from TTS.utils.manage import ModelManager
+        from TTS.utils.synthesizer import Synthesizer
+
+        # load model manager
+        path = Path(__file__).parent / "../.models.json"
+        manager = ModelManager(path, progress_bar=args.progress_bar)
+        api = TTS()
+
+        tts_path = None
+        tts_config_path = None
+        speakers_file_path = None
+        language_ids_file_path = None
+        vocoder_path = None
+        vocoder_config_path = None
+        encoder_path = None
+        encoder_config_path = None
+        vc_path = None
+        vc_config_path = None
+        model_dir = None
+
+        # CASE1 #list : list pre-trained TTS models
+        if args.list_models:
+            manager.add_cs_api_models(api.list_models())
+            manager.list_models()
+            sys.exit()
+
+        # CASE2 #info : model info for pre-trained TTS models
+        if args.model_info_by_idx:
+            model_query = args.model_info_by_idx
+            manager.model_info_by_idx(model_query)
+            sys.exit()
+
+        if args.model_info_by_name:
+            model_query_full_name = args.model_info_by_name
+            manager.model_info_by_full_name(model_query_full_name)
+            sys.exit()
+
+        # CASE3: TTS with coqui studio models
+        if "coqui_studio" in args.model_name:
+            print(" > Using 🐸Coqui Studio model: ", args.model_name)
+            api = TTS(model_name=args.model_name, cs_api_model=args.cs_model)
+            api.tts_to_file(
+                text=args.text,
+                emotion=args.emotion,
+                file_path=args.out_path,
+                language=args.language,
+                speed=args.speed,
+                play=pipe_out,
+            )
+            print(" > Saving output to ", args.out_path)
+            return
+
+        # CASE4: load pre-trained model paths
+        if args.model_name is not None and not args.model_path:
+            model_path, config_path, model_item = manager.download_model(args.model_name)
+            # tts model
+            if model_item["model_type"] == "tts_models":
+                tts_path = model_path
+                tts_config_path = config_path
+                if "default_vocoder" in model_item:
+                    args.vocoder_name = model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name
+
+            # voice conversion model
+            if model_item["model_type"] == "voice_conversion_models":
+                vc_path = model_path
+                vc_config_path = config_path
+
+            # tts model with multiple files to be loaded from the directory path
+            if model_item.get("author", None) == "fairseq" or isinstance(model_item["model_url"], list):
+                model_dir = model_path
+                tts_path = None
+                tts_config_path = None
+                args.vocoder_name = None
+
+        # load vocoder
+        if args.vocoder_name is not None and not args.vocoder_path:
+            vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)
+
+        # CASE5: set custom model paths
+        if args.model_path is not None:
+            tts_path = args.model_path
+            tts_config_path = args.config_path
+            speakers_file_path = args.speakers_file_path
+            language_ids_file_path = args.language_ids_file_path
+
+        if args.vocoder_path is not None:
+            vocoder_path = args.vocoder_path
+            vocoder_config_path = args.vocoder_config_path
+
+        if args.encoder_path is not None:
+            encoder_path = args.encoder_path
+            encoder_config_path = args.encoder_config_path
+
+        device = args.device
+        if args.use_cuda:
+            device = "cuda"
+
+        # load models
+        synthesizer = Synthesizer(
+            tts_path,
+            tts_config_path,
+            speakers_file_path,
+            language_ids_file_path,
+            vocoder_path,
+            vocoder_config_path,
+            encoder_path,
+            encoder_config_path,
+            vc_path,
+            vc_config_path,
+            model_dir,
+            args.voice_dir,
+        ).to(device)
+
+        # query speaker ids of a multi-speaker model.
+        if args.list_speaker_idxs:
+            print(
+                " > Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model."
+            )
+            print(synthesizer.tts_model.speaker_manager.name_to_id)
+            return
+
+        # query langauge ids of a multi-lingual model.
+        if args.list_language_idxs:
+            print(
+                " > Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model."
+            )
+            print(synthesizer.tts_model.language_manager.name_to_id)
+            return
+
+        # check the arguments against a multi-speaker model.
+        if synthesizer.tts_speakers_file and (not args.speaker_idx and not args.speaker_wav):
+            print(
+                " [!] Looks like you use a multi-speaker model. Define `--speaker_idx` to "
+                "select the target speaker. You can list the available speakers for this model by `--list_speaker_idxs`."
+            )
+            return
+
+        # RUN THE SYNTHESIS
+        if args.text:
+            print(" > Text: {}".format(args.text))
+
+        # kick it
+        if tts_path is not None:
+            wav = synthesizer.tts(
+                args.text,
+                speaker_name=args.speaker_idx,
+                language_name=args.language_idx,
+                speaker_wav=args.speaker_wav,
+                reference_wav=args.reference_wav,
+                style_wav=args.capacitron_style_wav,
+                style_text=args.capacitron_style_text,
+                reference_speaker_name=args.reference_speaker_idx,
+            )
+        elif vc_path is not None:
+            wav = synthesizer.voice_conversion(
+                source_wav=args.source_wav,
+                target_wav=args.target_wav,
+            )
+        elif model_dir is not None:
+            wav = synthesizer.tts(
+                args.text, speaker_name=args.speaker_idx, language_name=args.language_idx, speaker_wav=args.speaker_wav
+            )
+
+        # save the results
+        print(" > Saving output to {}".format(args.out_path))
+        synthesizer.save_wav(wav, args.out_path, play=pipe_out)
 
 
 if __name__ == "__main__":