From 94d7437e59ca651f3a25ca9ecdac15a4a7bc7e9d Mon Sep 17 00:00:00 2001 From: KoljaB Date: Tue, 1 Oct 2024 18:57:37 +0200 Subject: [PATCH 1/3] faster transcription --- RealtimeSTT/audio_recorder.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/RealtimeSTT/audio_recorder.py b/RealtimeSTT/audio_recorder.py index 6c20f66..a262c66 100644 --- a/RealtimeSTT/audio_recorder.py +++ b/RealtimeSTT/audio_recorder.py @@ -160,7 +160,7 @@ def __init__(self, initial_prompt: Optional[Union[str, Iterable[int]]] = None, suppress_tokens: Optional[List[int]] = [-1], log_transcription_time: bool = False, - early_transcription_on_silence: bool = True + early_transcription_on_silence: bool = False ): """ Initializes an audio recorder and transcription @@ -343,13 +343,14 @@ def __init__(self, from the transcription output. - log_transcription_time (bool, default=False): Logs processing time of main model transcription - - early_transcription_on_silence (bool, default=True): If True, the + - early_transcription_on_silence (bool, default=False): If True, the system will immediately transcribe audio when silence is detected. If silence lasts longer than post_speech_silence_duration, the recording is stopped, and the transcription is submitted. If voice activity resumes within this period, the transcription is discarded. Results in faster final transcriptions to the cost - of some unnecessary final transcriptions + of some unnecessary final transcriptions. Recommended only + when transcription occurs fast enough (strong GPU required). Raises: Exception: Errors related to initializing transcription From ba4d24bd8d3aef808bb5875bd57fb887a0d22ecc Mon Sep 17 00:00:00 2001 From: KoljaB Date: Tue, 1 Oct 2024 19:01:21 +0200 Subject: [PATCH 2/3] updated tests --- tests/realtimestt_test.py | 46 +++++++++++++++++++++++++++++---------- tests/simple_test.py | 24 +++++++++++++++++--- 2 files changed, 55 insertions(+), 15 deletions(-) diff --git a/tests/realtimestt_test.py b/tests/realtimestt_test.py index 33c9a13..3b16831 100644 --- a/tests/realtimestt_test.py +++ b/tests/realtimestt_test.py @@ -1,10 +1,21 @@ -from RealtimeSTT import AudioToTextRecorder -from colorama import Fore, Back, Style -import colorama -import os - if __name__ == '__main__': + EXTENDED_LOGGING = False + + import os + import sys + if os.name == "nt" and (3, 8) <= sys.version_info < (3, 99): + from torchaudio._extension.utils import _init_dll_path + _init_dll_path() + + if EXTENDED_LOGGING: + import logging + logging.basicConfig(level=logging.DEBUG) + + from RealtimeSTT import AudioToTextRecorder + from colorama import Fore, Back, Style + import colorama + print("Initializing RealtimeSTT test...") colorama.init() @@ -36,23 +47,34 @@ def process_text(text): recorder_config = { 'spinner': False, 'model': 'large-v2', - 'silero_sensitivity': 0.4, - 'webrtc_sensitivity': 2, + 'realtime_model_type': 'tiny', + 'language': 'en', + 'silero_sensitivity': 0.05, + 'webrtc_sensitivity': 3, 'post_speech_silence_duration': 0.4, 'min_length_of_recording': 0, 'min_gap_between_recordings': 0, - 'enable_realtime_transcription': True, - 'realtime_processing_pause': 0.2, - 'realtime_model_type': 'tiny', + 'enable_realtime_transcription': False, + 'realtime_processing_pause': 0, 'on_realtime_transcription_update': text_detected, 'silero_deactivity_detection': True, + 'min_length_of_recording': 0.5, + 'early_transcription_on_silence': False } + # Conditionally add logging level if EXTENDED_LOGGING is True + if EXTENDED_LOGGING: + recorder_config['level'] = logging.DEBUG + recorder = AudioToTextRecorder(**recorder_config) clear_console() print("Say something...", end="", flush=True) - while True: - recorder.text(process_text) + + try: + while (True): + recorder.text(process_text) + except KeyboardInterrupt: + print("Exiting application due to keyboard interrupt") diff --git a/tests/simple_test.py b/tests/simple_test.py index e6069ab..282edc8 100644 --- a/tests/simple_test.py +++ b/tests/simple_test.py @@ -1,6 +1,24 @@ -from RealtimeSTT import AudioToTextRecorder if __name__ == '__main__': - recorder = AudioToTextRecorder(spinner=False, model="tiny.en", language="en") + + import os + import sys + if os.name == "nt" and (3, 8) <= sys.version_info < (3, 99): + from torchaudio._extension.utils import _init_dll_path + _init_dll_path() + + from RealtimeSTT import AudioToTextRecorder + + recorder = AudioToTextRecorder( + spinner=False, + silero_sensitivity=0.01, + model="tiny.en", + language="en", + ) print("Say something...") - while (True): print(recorder.text(), end=" ", flush=True) \ No newline at end of file + + try: + while (True): + print("Detected text: " + recorder.text()) + except KeyboardInterrupt: + print("Exiting application due to keyboard interrupt") From 62e5e164988f5040f723922ac6453d2e07496667 Mon Sep 17 00:00:00 2001 From: KoljaB Date: Tue, 1 Oct 2024 19:25:51 +0200 Subject: [PATCH 3/3] update tests --- tests/realtimestt_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/realtimestt_test.py b/tests/realtimestt_test.py index 3b16831..097a4b7 100644 --- a/tests/realtimestt_test.py +++ b/tests/realtimestt_test.py @@ -54,7 +54,7 @@ def process_text(text): 'post_speech_silence_duration': 0.4, 'min_length_of_recording': 0, 'min_gap_between_recordings': 0, - 'enable_realtime_transcription': False, + 'enable_realtime_transcription': True, 'realtime_processing_pause': 0, 'on_realtime_transcription_update': text_detected, 'silero_deactivity_detection': True,