Skip to content

Commit

Permalink
base changes to use stdout instead of play audio
Browse files Browse the repository at this point in the history
Considering conversion to pipe wav data for audio playback with ohter program
like aplay.

This is incomplete code. Using to get feedback before proceeding with
implementation.
  • Loading branch information
David-bfg committed Oct 6, 2023
1 parent 403ae73 commit f1b1f4a
Show file tree
Hide file tree
Showing 3 changed files with 185 additions and 186 deletions.
347 changes: 173 additions & 174 deletions TTS/bin/synthesize.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import argparse
import sys
import contextlib
from argparse import RawTextHelpFormatter

# pylint: disable=redefined-outer-name, unused-argument
Expand Down Expand Up @@ -241,7 +242,7 @@ def main():
default=None,
)
parser.add_argument(
"--play",
"--pipe_out",
help="Play the generated TTS wav.",
type=str2bool,
nargs="?",
Expand Down Expand Up @@ -361,179 +362,177 @@ def main():
if not any(check_args):
parser.parse_args(["-h"])

# Late-import to make things load faster
from TTS.api import TTS
from TTS.utils.manage import ModelManager
from TTS.utils.synthesizer import Synthesizer

# load model manager
path = Path(__file__).parent / "../.models.json"
manager = ModelManager(path, progress_bar=args.progress_bar)
api = TTS()

tts_path = None
tts_config_path = None
speakers_file_path = None
language_ids_file_path = None
vocoder_path = None
vocoder_config_path = None
encoder_path = None
encoder_config_path = None
vc_path = None
vc_config_path = None
model_dir = None

# CASE1 #list : list pre-trained TTS models
if args.list_models:
manager.add_cs_api_models(api.list_models())
manager.list_models()
sys.exit()

# CASE2 #info : model info for pre-trained TTS models
if args.model_info_by_idx:
model_query = args.model_info_by_idx
manager.model_info_by_idx(model_query)
sys.exit()

if args.model_info_by_name:
model_query_full_name = args.model_info_by_name
manager.model_info_by_full_name(model_query_full_name)
sys.exit()

# CASE3: TTS with coqui studio models
if "coqui_studio" in args.model_name:
print(" > Using 🐸Coqui Studio model: ", args.model_name)
api = TTS(model_name=args.model_name, cs_api_model=args.cs_model)
api.tts_to_file(
text=args.text,
emotion=args.emotion,
file_path=args.out_path,
language=args.language,
speed=args.speed,
play=args.play,
)
if args.play:
print(" > Played TTS wav")
print(" > Saving output to ", args.out_path)
return

# CASE4: load pre-trained model paths
if args.model_name is not None and not args.model_path:
model_path, config_path, model_item = manager.download_model(args.model_name)
# tts model
if model_item["model_type"] == "tts_models":
tts_path = model_path
tts_config_path = config_path
if "default_vocoder" in model_item:
args.vocoder_name = model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name

# voice conversion model
if model_item["model_type"] == "voice_conversion_models":
vc_path = model_path
vc_config_path = config_path

# tts model with multiple files to be loaded from the directory path
if model_item.get("author", None) == "fairseq" or isinstance(model_item["model_url"], list):
model_dir = model_path
tts_path = None
tts_config_path = None
args.vocoder_name = None

# load vocoder
if args.vocoder_name is not None and not args.vocoder_path:
vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)

# CASE5: set custom model paths
if args.model_path is not None:
tts_path = args.model_path
tts_config_path = args.config_path
speakers_file_path = args.speakers_file_path
language_ids_file_path = args.language_ids_file_path

if args.vocoder_path is not None:
vocoder_path = args.vocoder_path
vocoder_config_path = args.vocoder_config_path

if args.encoder_path is not None:
encoder_path = args.encoder_path
encoder_config_path = args.encoder_config_path

device = args.device
if args.use_cuda:
device = "cuda"

# load models
synthesizer = Synthesizer(
tts_path,
tts_config_path,
speakers_file_path,
language_ids_file_path,
vocoder_path,
vocoder_config_path,
encoder_path,
encoder_config_path,
vc_path,
vc_config_path,
model_dir,
args.voice_dir,
).to(device)

# query speaker ids of a multi-speaker model.
if args.list_speaker_idxs:
print(
" > Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model."
)
print(synthesizer.tts_model.speaker_manager.name_to_id)
return

# query langauge ids of a multi-lingual model.
if args.list_language_idxs:
print(
" > Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model."
)
print(synthesizer.tts_model.language_manager.name_to_id)
return

# check the arguments against a multi-speaker model.
if synthesizer.tts_speakers_file and (not args.speaker_idx and not args.speaker_wav):
print(
" [!] Looks like you use a multi-speaker model. Define `--speaker_idx` to "
"select the target speaker. You can list the available speakers for this model by `--list_speaker_idxs`."
)
return

# RUN THE SYNTHESIS
if args.text:
print(" > Text: {}".format(args.text))

# kick it
if tts_path is not None:
wav = synthesizer.tts(
args.text,
speaker_name=args.speaker_idx,
language_name=args.language_idx,
speaker_wav=args.speaker_wav,
reference_wav=args.reference_wav,
style_wav=args.capacitron_style_wav,
style_text=args.capacitron_style_text,
reference_speaker_name=args.reference_speaker_idx,
)
elif vc_path is not None:
wav = synthesizer.voice_conversion(
source_wav=args.source_wav,
target_wav=args.target_wav,
)
elif model_dir is not None:
wav = synthesizer.tts(
args.text, speaker_name=args.speaker_idx, language_name=args.language_idx, speaker_wav=args.speaker_wav
)

if args.play:
print(" > Playing TTS wav")

# save the results
print(" > Saving output to {}".format(args.out_path))
synthesizer.save_wav(wav, args.out_path, play=args.play)
pipe_out = sys.stdout if args.pipe_out else None

with contextlib.redirect_stdout(None if args.pipe_out else sys.stdout):
# Late-import to make things load faster
from TTS.api import TTS
from TTS.utils.manage import ModelManager
from TTS.utils.synthesizer import Synthesizer

# load model manager
path = Path(__file__).parent / "../.models.json"
manager = ModelManager(path, progress_bar=args.progress_bar)
api = TTS()

tts_path = None
tts_config_path = None
speakers_file_path = None
language_ids_file_path = None
vocoder_path = None
vocoder_config_path = None
encoder_path = None
encoder_config_path = None
vc_path = None
vc_config_path = None
model_dir = None

# CASE1 #list : list pre-trained TTS models
if args.list_models:
manager.add_cs_api_models(api.list_models())
manager.list_models()
sys.exit()

# CASE2 #info : model info for pre-trained TTS models
if args.model_info_by_idx:
model_query = args.model_info_by_idx
manager.model_info_by_idx(model_query)
sys.exit()

if args.model_info_by_name:
model_query_full_name = args.model_info_by_name
manager.model_info_by_full_name(model_query_full_name)
sys.exit()

# CASE3: TTS with coqui studio models
if "coqui_studio" in args.model_name:
print(" > Using 🐸Coqui Studio model: ", args.model_name)
api = TTS(model_name=args.model_name, cs_api_model=args.cs_model)
api.tts_to_file(
text=args.text,
emotion=args.emotion,
file_path=args.out_path,
language=args.language,
speed=args.speed,
play=pipe_out,
)
print(" > Saving output to ", args.out_path)
return

# CASE4: load pre-trained model paths
if args.model_name is not None and not args.model_path:
model_path, config_path, model_item = manager.download_model(args.model_name)
# tts model
if model_item["model_type"] == "tts_models":
tts_path = model_path
tts_config_path = config_path
if "default_vocoder" in model_item:
args.vocoder_name = model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name

# voice conversion model
if model_item["model_type"] == "voice_conversion_models":
vc_path = model_path
vc_config_path = config_path

# tts model with multiple files to be loaded from the directory path
if model_item.get("author", None) == "fairseq" or isinstance(model_item["model_url"], list):
model_dir = model_path
tts_path = None
tts_config_path = None
args.vocoder_name = None

# load vocoder
if args.vocoder_name is not None and not args.vocoder_path:
vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)

# CASE5: set custom model paths
if args.model_path is not None:
tts_path = args.model_path
tts_config_path = args.config_path
speakers_file_path = args.speakers_file_path
language_ids_file_path = args.language_ids_file_path

if args.vocoder_path is not None:
vocoder_path = args.vocoder_path
vocoder_config_path = args.vocoder_config_path

if args.encoder_path is not None:
encoder_path = args.encoder_path
encoder_config_path = args.encoder_config_path

device = args.device
if args.use_cuda:
device = "cuda"

# load models
synthesizer = Synthesizer(
tts_path,
tts_config_path,
speakers_file_path,
language_ids_file_path,
vocoder_path,
vocoder_config_path,
encoder_path,
encoder_config_path,
vc_path,
vc_config_path,
model_dir,
args.voice_dir,
).to(device)

# query speaker ids of a multi-speaker model.
if args.list_speaker_idxs:
print(
" > Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model."
)
print(synthesizer.tts_model.speaker_manager.name_to_id)
return

# query langauge ids of a multi-lingual model.
if args.list_language_idxs:
print(
" > Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model."
)
print(synthesizer.tts_model.language_manager.name_to_id)
return

# check the arguments against a multi-speaker model.
if synthesizer.tts_speakers_file and (not args.speaker_idx and not args.speaker_wav):
print(
" [!] Looks like you use a multi-speaker model. Define `--speaker_idx` to "
"select the target speaker. You can list the available speakers for this model by `--list_speaker_idxs`."
)
return

# RUN THE SYNTHESIS
if args.text:
print(" > Text: {}".format(args.text))

# kick it
if tts_path is not None:
wav = synthesizer.tts(
args.text,
speaker_name=args.speaker_idx,
language_name=args.language_idx,
speaker_wav=args.speaker_wav,
reference_wav=args.reference_wav,
style_wav=args.capacitron_style_wav,
style_text=args.capacitron_style_text,
reference_speaker_name=args.reference_speaker_idx,
)
elif vc_path is not None:
wav = synthesizer.voice_conversion(
source_wav=args.source_wav,
target_wav=args.target_wav,
)
elif model_dir is not None:
wav = synthesizer.tts(
args.text, speaker_name=args.speaker_idx, language_name=args.language_idx, speaker_wav=args.speaker_wav
)

# save the results
print(" > Saving output to {}".format(args.out_path))
synthesizer.save_wav(wav, args.out_path, play=pipe_out)


if __name__ == "__main__":
Expand Down
Loading

0 comments on commit f1b1f4a

Please sign in to comment.