From c1f6dc0ef08c6bf4d6fd37af00908a22eaf1d48a Mon Sep 17 00:00:00 2001 From: KoljaB Date: Tue, 10 Dec 2024 22:58:23 +0100 Subject: [PATCH] added 48 kHz azure and styletts voice change --- README.md | 10 +- RealtimeTTS/__init__.py | 4 +- RealtimeTTS/engines/__init__.py | 4 +- RealtimeTTS/engines/azure_engine.py | 24 +++- RealtimeTTS/engines/style_engine.py | 164 +++++++++++++++++++++++++--- setup.py | 2 +- tests/azure_test.py | 22 ++++ tests/style_test.py | 86 +++++++++++---- 8 files changed, 265 insertions(+), 51 deletions(-) create mode 100644 tests/azure_test.py diff --git a/README.md b/README.md index 33dcc53..18d62e5 100644 --- a/README.md +++ b/README.md @@ -58,7 +58,7 @@ Let me know if you need any adjustments or additional languages! ## Updates -Latest Version: v0.4.19 +Latest Version: v0.4.20 Introducing StyleTTS2 engine: @@ -665,14 +665,8 @@ While the source of this library is open-source, the usage of many of the engine Kolja Beigel Email: kolja.beigel@web.de - -
-
-GitHub
-
-Back to Top
-
+
\ No newline at end of file
diff --git a/RealtimeTTS/__init__.py b/RealtimeTTS/__init__.py
index ca7cdc8..be3208f 100644
--- a/RealtimeTTS/__init__.py
+++ b/RealtimeTTS/__init__.py
@@ -49,6 +49,6 @@
 EdgeEngine, EdgeVoice = None, None
 
 try:
-    from .engines import StyleTTSEngine  # noqa: F401
+    from .engines import StyleTTSEngine, StyleTTSVoice  # noqa: F401
 except ImportError:
-    StyleTTSEngine = None
\ No newline at end of file
+    StyleTTSEngine, StyleTTSVoice = None, None
\ No newline at end of file
diff --git a/RealtimeTTS/engines/__init__.py b/RealtimeTTS/engines/__init__.py
index ed69c75..c9d6622 100644
--- a/RealtimeTTS/engines/__init__.py
+++ b/RealtimeTTS/engines/__init__.py
@@ -48,6 +48,6 @@
 EdgeEngine, EdgeVoice = None, None
 
 try:
-    from .style_engine import StyleTTSEngine  # noqa: F401
+    from .style_engine import StyleTTSEngine, StyleTTSVoice  # noqa: F401
 except ImportError as e:
-    StyleTTSEngine = None
\ No newline at end of file
+    StyleTTSEngine, StyleTTSVoice = None, None
\ No newline at end of file
diff --git a/RealtimeTTS/engines/azure_engine.py b/RealtimeTTS/engines/azure_engine.py
index a1f3715..7517e7e 100644
--- a/RealtimeTTS/engines/azure_engine.py
+++ b/RealtimeTTS/engines/azure_engine.py
@@ -1,4 +1,5 @@
 import azure.cognitiveservices.speech as tts
+from azure.cognitiveservices.speech import SpeechSynthesisOutputFormat
 from .base_engine import BaseEngine
 from typing import Union
 import requests
@@ -49,6 +50,16 @@ def _extract_voice_language(locale):
 
 
 class AzureEngine(BaseEngine):
+    SUPPORTED_AUDIO_FORMATS = {
+        "riff-16khz-16bit-mono-pcm": 16000,
+        "riff-24khz-16bit-mono-pcm": 24000,
+        "riff-48khz-16bit-mono-pcm": 48000,
+    }
+    AUDIO_FORMAT_MAP = {
+        "riff-16khz-16bit-mono-pcm": SpeechSynthesisOutputFormat.Riff16Khz16BitMonoPcm,
+        "riff-24khz-16bit-mono-pcm": SpeechSynthesisOutputFormat.Riff24Khz16BitMonoPcm,
+        "riff-48khz-16bit-mono-pcm": SpeechSynthesisOutputFormat.Riff48Khz16BitMonoPcm,
+    }
     def __init__(
         self,
         speech_key: str = "",
@@ -56,6 +67,7 @@ def __init__(
         voice: str = "en-US-AshleyNeural",
         rate: float = 0.0,
         pitch: float = 0.0,
+        audio_format: str = "riff-16khz-16bit-mono-pcm",
     ):
         """
         Initializes an azure voice realtime text to speech engine object.
@@ -66,8 +78,17 @@ def __init__(
             voice (str, optional): Voice name. Defaults to "en-US-AshleyNeural".
             rate (float, optional): Speech speed as a percentage. Defaults to "0.0". Indicating the relative change.
             pitch (float, optional): Speech pitch as a percentage. Defaults to "0.0". Indicating the relative change.
+            audio_format (str, optional): Audio format for output. Defaults to "riff-16khz-16bit-mono-pcm". Must be one of these supported formats: "riff-16khz-16bit-mono-pcm", "riff-24khz-16bit-mono-pcm", "riff-48khz-16bit-mono-pcm".
+
+        Raises:
+            ValueError: If the provided audio_format is not supported.
         """
+        if audio_format not in self.SUPPORTED_AUDIO_FORMATS:
+            raise ValueError(
+                f"Invalid audio_format '{audio_format}'. Supported formats are: {list(self.SUPPORTED_AUDIO_FORMATS.keys())}"
+            )
+        self.audio_format = audio_format
+        self.sample_rate = self.SUPPORTED_AUDIO_FORMATS[audio_format]
         self.speech_key = speech_key
         self.service_region = service_region
         self.language = voice[:5]
@@ -138,7 +159,7 @@ def get_stream_info(self):
             - Channels (int): The number of audio channels. 1 represents mono audio.
-            - Sample Rate (int): The sample rate of the audio in Hz. 16000 represents 16kHz sample rate.
+            - Sample Rate (int): The sample rate of the audio in Hz, matching the configured audio_format.
""" - return pyaudio.paInt16, 1, 16000 + return pyaudio.paInt16, 1, self.sample_rate def synthesize(self, text: str) -> bool: """ @@ -152,6 +173,7 @@ def synthesize(self, text: str) -> bool: speech_config = tts.SpeechConfig( subscription=self.speech_key, region=self.service_region ) + speech_config.set_speech_synthesis_output_format(self.AUDIO_FORMAT_MAP[self.audio_format]) stream_callback = PushAudioOutputStreamSampleCallback(self.queue) push_stream = tts.audio.PushAudioOutputStream(stream_callback) stream_config = tts.audio.AudioOutputConfig(stream=push_stream) diff --git a/RealtimeTTS/engines/style_engine.py b/RealtimeTTS/engines/style_engine.py index 2c330c1..91e5235 100644 --- a/RealtimeTTS/engines/style_engine.py +++ b/RealtimeTTS/engines/style_engine.py @@ -4,18 +4,59 @@ import torch import sys import os +import gc +import time +from numba import cuda + +class StyleTTSVoice: + def __init__(self, + model_config_path: str, + model_checkpoint_path: str, + ref_audio_path: str): + """ + Represents a StyleTTS voice configuration. + + Args: + model_config_path (str): Path to the StyleTTS model configuration file. + model_checkpoint_path (str): Path to the StyleTTS model checkpoint file. + ref_audio_path (str): Path to the reference audio file for extracting style. + """ + self.model_config_path = model_config_path + self.model_checkpoint_path = model_checkpoint_path + self.ref_audio_path = ref_audio_path + + def __str__(self): + """ + String representation of the StyleTTS voice configuration. + """ + return ( + f"StyleTTSVoice(" + f"Config: {self.model_config_path}, " + f"Checkpoint: {self.model_checkpoint_path}, " + f"Reference Audio: {self.ref_audio_path})" + ) + + def __repr__(self): + """ + Detailed representation of the StyleTTS voice configuration. + """ + return ( + f"StyleTTSVoice:\n" + f" Model Config Path: {self.model_config_path}\n" + f" Model Checkpoint Path: {self.model_checkpoint_path}\n" + f" Reference Audio Path: {self.ref_audio_path}" + ) class StyleTTSEngine(BaseEngine): def __init__(self, style_root: str, - model_config_path: str, - model_checkpoint_path: str, - ref_audio_path: str, # path to reference audio for style + voice: StyleTTSVoice, device: str = 'cuda', alpha: float = 0.3, beta: float = 0.7, diffusion_steps: int = 5, - embedding_scale: float = 1.0): + embedding_scale: float = 1.0, + cuda_reset_delay: float = 0.0): # Delay after resetting CUDA device """ Initializes the StyleTTS engine with customizable parameters. @@ -66,18 +107,24 @@ def __init__(self, - A higher scale (e.g., 1.2 or 1.5) strengthens the alignment with the text and reference, potentially enhancing style adherence and expressiveness. - A very high scale might introduce artifacts or unnatural audio, so fine-tuning is recommended. + + cuda_reset_delay (float): Time in seconds to wait after resetting the CUDA device. 
""" self.device = device if torch.cuda.is_available() else 'cpu' self.style_root = style_root.replace("\\", "/") - self.model_config_path = model_config_path.replace("\\", "/") - self.model_checkpoint_path = model_checkpoint_path.replace("\\", "/") - self.ref_audio_path = ref_audio_path + + # Use the properties from the StyleTTSVoice instance + self.voice = voice + self.model_config_path = self.voice.model_config_path.replace("\\", "/") + self.model_checkpoint_path = self.voice.model_checkpoint_path.replace("\\", "/") + self.ref_audio_path = self.voice.ref_audio_path # Parameters for synthesis self.alpha = alpha self.beta = beta self.diffusion_steps = diffusion_steps self.embedding_scale = embedding_scale + self.cuda_reset_delay = cuda_reset_delay # Store the delay parameter # Add the root directory to sys.path sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), self.style_root))) @@ -90,20 +137,83 @@ def __init__(self, def post_init(self): self.engine_name = "styletts" - def get_stream_info(self): + def unload_model(self): """ - Returns the PyAudio stream configuration: - - Format: pyaudio.paInt16 (16-bit) - - Channels: 1 (mono) - - Sample Rate: 24000 Hz + Unloads the current model and clears VRAM to prevent memory leaks. + + Steps: + 1. Move models to CPU to ensure PyTorch releases GPU memory. + 2. Delete references to the model and other components to allow garbage collection. + 3. Trigger garbage collection and clear the CUDA memory cache. """ + # Move models to CPU first + if hasattr(self, 'model'): + for key in self.model: + self.model[key].to('cpu') + # Explanation: Moving models to the CPU ensures that all tensors allocated on the GPU + # are detached from the GPU's memory. If a model is directly deleted while still residing + # on the GPU, PyTorch may not fully release its VRAM due to lingering device-side context. + + # Delete references + if hasattr(self, 'model'): + del self.model # Remove the main model + if hasattr(self, 'sampler'): + del self.sampler # Remove the diffusion sampler + if hasattr(self, 'text_aligner'): + del self.text_aligner # Remove the ASR-based text aligner + if hasattr(self, 'pitch_extractor'): + del self.pitch_extractor # Remove the pitch extraction model + if hasattr(self, 'plbert'): + del self.plbert # Remove the pre-trained BERT model used for prosody + + # Force garbage collection and try to free cache + gc.collect() + torch.cuda.empty_cache() + # Explanation: After removing references, garbage collection ensures that + # Python clears any remaining objects that might still hold references to GPU memory. + # `torch.cuda.empty_cache()` clears PyTorch's internal GPU memory management cache, + # freeing up VRAM for the next model or process. + + def set_model_config_path(self, new_path: str): + self.unload_model() + self.model_config_path = new_path.replace("\\", "/") + self.load_model() + print(f"Model config updated to: {new_path}") + + def set_model_checkpoint_path(self, new_path: str): + self.unload_model() + self.model_checkpoint_path = new_path.replace("\\", "/") + self.load_model() + print(f"Model checkpoint updated to: {new_path}") + + def set_ref_audio_path(self, new_path: str): + # Updating the reference audio doesn't require unloading the model. + # We're just recomputing style embeddings. 
+        self.ref_audio_path = new_path
+        self.compute_reference_style(self.ref_audio_path)
+        print(f"Reference audio updated to: {new_path}")
+
+    def set_all_parameters(self, model_config_path: str, model_checkpoint_path: str, ref_audio_path: str):
+        """
+        Updates model config, checkpoint, and reference audio simultaneously,
+        reloading the model only once.
+        """
+        self.unload_model()  # Unload the previous model
+        self.model_config_path = model_config_path.replace("\\", "/")
+        self.model_checkpoint_path = model_checkpoint_path.replace("\\", "/")
+        self.ref_audio_path = ref_audio_path
+        self.load_model()  # Reload the new model with updated config and checkpoint
+        self.compute_reference_style(self.ref_audio_path)  # Recompute style embeddings
+        print(f"Updated all parameters:\n - Model config: {model_config_path}\n - Model checkpoint: {model_checkpoint_path}\n - Reference audio: {ref_audio_path}")
+
+    def get_stream_info(self):
+        """Returns the PyAudio stream configuration: 16-bit mono audio at 24000 Hz."""
         import pyaudio
         return pyaudio.paInt16, 1, 24000
 
     def synthesize(self, text: str) -> bool:
         """
         Synthesizes text to audio stream using the loaded StyleTTS model.
-        
+
         Args:
             text (str): Text to synthesize.
         """
@@ -186,7 +296,7 @@ def load_model(self):
             state_dict = params[key]
             new_state_dict = OrderedDict()
             for k, v in state_dict.items():
-                name = k[7:]  # remove `module.`
+                name = k[7:]  # strip the "module." prefix
                 new_state_dict[name] = v
             self.model[key].load_state_dict(new_state_dict, strict=False)
         _ = [self.model[key].eval() for key in self.model]
@@ -198,8 +308,8 @@
         # Initialize phonemizer
         self.global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us',
-                                                            preserve_punctuation=True,
-                                                            with_stress=True)
+                                                                   preserve_punctuation=True,
+                                                                   with_stress=True)
 
         # Initialize diffusion sampler
         self.sampler = DiffusionSampler(
@@ -241,7 +351,7 @@ def inference(self, text: str,
                   embedding_scale: float = 1.0) -> np.ndarray:
         """
         Run inference with given parameters and return audio waveform.
-        
+
         Args:
             text (str): Text to synthesize.
             alpha (float): Timbre blending factor.
@@ -322,3 +432,25 @@
         waveform = waveform[..., :-50]
 
         return waveform
+
+    def get_voices(self):
+        """
+        Returns the voices available to the StyleTTS engine.
+        StyleTTS voices are defined by local file paths rather than an
+        installed-voice registry, so an empty list is returned.
+        """
+        return []
+
+    def set_voice(self, voice: StyleTTSVoice):
+        """
+        Sets the voice to be used for speech synthesis.
+        """
+        if isinstance(voice, StyleTTSVoice):
+            self.voice = voice
+            self.set_all_parameters(
+                model_config_path=voice.model_config_path,
+                model_checkpoint_path=voice.model_checkpoint_path,
+                ref_audio_path=voice.ref_audio_path,
+            )
+        else:
+            raise TypeError("voice must be a StyleTTSVoice instance")
diff --git a/setup.py b/setup.py
index 3f06ca1..0b3bbdf 100644
--- a/setup.py
+++ b/setup.py
@@ -1,4 +1,4 @@
-current_version = "0.4.19"
+current_version = "0.4.20"
 
 import setuptools
 
diff --git a/tests/azure_test.py b/tests/azure_test.py
new file mode 100644
index 0000000..b635917
--- /dev/null
+++ b/tests/azure_test.py
@@ -0,0 +1,21 @@
+if __name__ == "__main__":
+    import os
+    from RealtimeTTS import TextToAudioStream, AzureEngine
+
+    def dummy_generator():
+        yield "Hey guys! These here are realtime spoken sentences based on Azure text synthesis. "
+        yield "With a neural Azure voice. So every spoken sentence sounds unique."
+
+    # Credentials are read from environment variables; request 48 kHz output.
+    engine = AzureEngine(
+        os.environ["AZURE_SPEECH_KEY"],
+        os.environ["AZURE_SPEECH_REGION"],
+        audio_format="riff-48khz-16bit-mono-pcm"
+    )
+
+    stream = TextToAudioStream(engine)
+
+    print("Starting to play stream")
+    stream.feed(dummy_generator()).play(log_synthesized_text=True)
+
+    engine.shutdown()
diff --git a/tests/style_test.py b/tests/style_test.py
index a869f17..e40173a 100644
--- a/tests/style_test.py
+++ b/tests/style_test.py
@@ -1,33 +1,79 @@
 if __name__ == "__main__":
-    from RealtimeTTS import TextToAudioStream, StyleTTSEngine
+    from RealtimeTTS import TextToAudioStream, StyleTTSEngine, StyleTTSVoice
 
-    def dummy_generator():
-        yield "Close your eyes for a moment... can you hear it? "
-        yield "That’s not just a voice — it’s StyleTTS2, turning words into experiences with depth, charm, and a hint of allure. "
-        yield "Every word feels intentional, like it was crafted just for you. "
-        yield "Here’s the exciting part: it’s joining the RealtimeTTS library soon. "
-        yield "Prepare yourself for real-time expressive speech that’s as authentic as the moment you’re in. "
+    def dummy_generator_1():
+        yield "This is the first voice model speaking. "
+        yield "The elegance of the style and its flow is simply captivating. "
+        yield "We’ll soon switch to another model. "
 
-    # adjust these paths to your local setup (stylett2 installation folder, model config, model checkpoint, reference audio)
+    def dummy_generator_2():
+        yield "And here we are! "
+        yield "You’re now listening to the second voice model, with a different style and tone. "
+        yield "It’s fascinating how StyleTTS can adapt seamlessly. "
+
+    def dummy_generator_3():
+        yield "Welcome back again! "
+        yield "We’re testing the third voice model now. "
+        yield "The transition between styles is smooth and effortless. "
+
+    # Adjust these paths to your local setup
     styletts_root = "D:/Dev/StyleTTS_Realtime/StyleTTS2"
-    model_config_path = "D:/Dev/StyleTTS_Realtime/StyleTTS2/Models/Nicole/config.yml"
-    model_checkpoint_path = "D:/Dev/StyleTTS_Realtime/StyleTTS2/Models/Nicole/epoch_2nd_00036.pth"
-    ref_audio_path = "D:/Dev/StyleTTS_Realtime/RealtimeTTS/tests/nicole.wav"
-
+
+    # Create StyleTTSVoice instances for the three models
+    voice_1 = StyleTTSVoice(
+        model_config_path="D:/Data/Models/style/Nicole/config.yml",
+        model_checkpoint_path="D:/Data/Models/style/Nicole/epoch_2nd_00036.pth",
+        ref_audio_path="D:/Data/Models/style/Nicole/file___1_file___1_segment_98.wav"
+    )
+
+    voice_2 = StyleTTSVoice(
+        model_config_path="D:/Data/Models/style/LongLasi/LongLasi_config.yml",
+        model_checkpoint_path="D:/Data/Models/style/LongLasi/epoch_2nd_00047.pth",
+        ref_audio_path="D:/Data/Models/style/LongLasi/file___1_file___1_segment_116.wav"
+    )
+
+    voice_3 = StyleTTSVoice(
+        model_config_path="D:/Data/Models/style/ExtLasi/ExcLasi_config.yml",
+        model_checkpoint_path="D:/Data/Models/style/ExtLasi/epoch_2nd_00039.pth",
+        ref_audio_path="D:/Data/Models/style/ExtLasi/file___1_file___1_segment_33.wav"
+    )
+
+    # Initialize the engine with the first voice
     engine = StyleTTSEngine(
         style_root=styletts_root,
-        model_config_path=model_config_path,
-        model_checkpoint_path=model_checkpoint_path,
-        ref_audio_path=ref_audio_path,
+        voice=voice_1,  # Pass the first StyleTTSVoice instance
         alpha=0.3,
-        beta=0.7,
-        diffusion_steps=50,
-        embedding_scale=1,)
+        beta=1.0,
+        diffusion_steps=10,
+        embedding_scale=1.0,
+        cuda_reset_delay=0.0,  # Custom delay for CUDA reset
+    )
 
+    # Create a TextToAudioStream with the engine
     stream = TextToAudioStream(engine)
 
-    print("Starting to play stream")
-    stream.feed(dummy_generator())
+    # Play with the first model
+    print("Playing with the first model...")
+    stream.feed(dummy_generator_1())
+    stream.play(log_synthesized_text=True)
+
+    # Switch to the second voice at runtime
+    print("\nSwitching to the second model...")
+    engine.set_voice(voice_2)  # Use set_voice to update the voice configuration
+
+    # Play with the second model
+    print("Playing with the second model...")
+    stream.feed(dummy_generator_2())
+    stream.play(log_synthesized_text=True)
+
+    # Switch to the third voice
+    print("\nSwitching to the third model...")
+    engine.set_voice(voice_3)  # Switch to the third voice configuration
+
+    # Play with the third model
+    print("Playing with the third model...")
+    stream.feed(dummy_generator_3())
     stream.play(log_synthesized_text=True)
 
+    # Shutdown the engine
     engine.shutdown()