#13, #14, #20

KoljaB · Jan 29, 2024 · 8d0ba91 · 8d0ba91
1 parent 3d7143e
commit 8d0ba91
Show file tree

Hide file tree

Showing 5 changed files with 100 additions and 25 deletions.
diff --git a/README.md b/README.md
@@ -186,7 +186,7 @@ recorder = AudioToTextRecorder(on_recording_start=my_start_callback,
 
 ### Feed chunks
 
-If you don't want to use the local microphone set use_microphone parameter to false and provide raw PCM audiochunks in 16-bit mono with this method:
+If you don't want to use the local microphone set use_microphone parameter to false and provide raw PCM audiochunks in 16-bit mono (samplerate 16000) with this method:
 
 ```python
 recorder.feed_audio(audio_chunk)
@@ -253,7 +253,13 @@ When you initialize the `AudioToTextRecorder` class, you have various options to
     - Options: 'tiny', 'tiny.en', 'base', 'base.en', 'small', 'small.en', 'medium', 'medium.en', 'large-v1', 'large-v2'.
     - Note: If a size is provided, the model will be downloaded from the Hugging Face Hub.
 
-- **language** (str, default=""): Language code for transcription. If left empty, the model will try to auto-detect the language.
+- **language** (str, default=""): Language code for transcription. If left empty, the model will try to auto-detect the language. Supported language codes are listed in [Whisper Tokenizer library](https://github.com/openai/whisper/blob/main/whisper/tokenizer.py).
+
+- **compute_type** (str, default="default"): Specifies the type of computation to be used for transcription. See [Whisper Quantization](https://opennmt.net/CTranslate2/quantization.html)
+
+- **input_device_index** (int, default=0): Audio Input Device Index to use.
+
+- **gpu_device_index** (int, default=0): GPU Device Index to use. The model can also be loaded on multiple GPUs by passing a list of IDs (e.g. [0, 1, 2, 3]).
 
 - **on_recording_start**: A callable function triggered when recording starts.
 

diff --git a/RealtimeSTT/audio_recorder.py b/RealtimeSTT/audio_recorder.py
@@ -26,7 +26,8 @@
 
 """
 
-from multiprocessing import Process, Pipe, Event, Manager
+import torch.multiprocessing as mp
+from typing import List, Union
 import faster_whisper
 import collections
 import numpy as np
@@ -43,6 +44,7 @@
 import time
 import os
 import re
+import gc
 
 INIT_MODEL_TRANSCRIPTION = "tiny"
 INIT_MODEL_TRANSCRIPTION_REALTIME = "tiny"
@@ -74,6 +76,9 @@ class AudioToTextRecorder:
     def __init__(self,
                  model: str = INIT_MODEL_TRANSCRIPTION,
                  language: str = "",
+                 compute_type: str = "default",
+                 input_device_index: int = 0,
+                 gpu_device_index: Union[int, List[int]] = 0,
                  on_recording_start=None,
                  on_recording_stop=None,
                  on_transcription_start=None,
@@ -136,6 +141,16 @@ def __init__(self,
         - language (str, default=""): Language code for speech-to-text engine.
             If not specified, the model will attempt to detect the language
             automatically.
+        - compute_type (str, default="default"): Specifies the type of
+            computation to be used for transcription.
+            See https://opennmt.net/CTranslate2/quantization.html.
+        - input_device_index (int, default=0): The index of the audio input
+            device to use.
+        - gpu_device_index (int, default=0): Device ID to use.
+            The model can also be loaded on multiple GPUs by passing a list of
+            IDs (e.g. [0, 1, 2, 3]). In that case, multiple transcriptions can
+            run in parallel when transcribe() is called from multiple Python
+            threads
         - on_recording_start (callable, default=None): Callback function to be
             called when recording of audio to be transcripted starts.
         - on_recording_stop (callable, default=None): Callback function to be
@@ -238,6 +253,9 @@ def __init__(self,
         """
 
         self.language = language
+        self.compute_type = compute_type
+        self.input_device_index = input_device_index
+        self.gpu_device_index = gpu_device_index
         self.wake_words = wake_words
         self.wake_word_activation_delay = wake_word_activation_delay
         self.wake_word_timeout = wake_word_timeout
@@ -247,6 +265,7 @@ def __init__(self,
         self.ensure_sentence_ends_with_period = (
             ensure_sentence_ends_with_period
         )
+        self.use_microphone = use_microphone
         self.min_gap_between_recordings = min_gap_between_recordings
         self.min_length_of_recording = min_length_of_recording
         self.pre_recording_buffer_duration = pre_recording_buffer_duration
@@ -272,8 +291,7 @@ def __init__(self,
         self.allowed_latency_limit = ALLOWED_LATENCY_LIMIT
 
         self.level = level
-        manager = Manager()
-        self.audio_queue = manager.Queue()
+        self.audio_queue = mp.Queue()
         self.buffer_size = BUFFER_SIZE
         self.sample_rate = SAMPLE_RATE
         self.recording_start_time = 0
@@ -323,20 +341,30 @@ def __init__(self,
         logger.addHandler(console_handler)
 
         self.is_shut_down = False
-        self.shutdown_event = Event()
+        self.shutdown_event = mp.Event()
 
         logging.info("Starting RealTimeSTT")
 
-        # Start transcription process
-        self.interrupt_stop_event = Event()
-        self.main_transcription_ready_event = Event()
-        self.parent_transcription_pipe, child_transcription_pipe = Pipe()
-
-        self.transcript_process = Process(
+        # Start transcription worker process
+        try:
+            # Only set the start method if it hasn't been set already
+            if mp.get_start_method(allow_none=True) is None:
+                mp.set_start_method("spawn")
+        except RuntimeError as e:
+            print("Start method has already been set. Details:", e)
+
+        self.interrupt_stop_event = mp.Event()
+        self.was_interrupted = mp.Event()
+        self.main_transcription_ready_event = mp.Event()
+        self.parent_transcription_pipe, child_transcription_pipe = mp.Pipe()
+
+        self.transcript_process = mp.Process(
             target=AudioToTextRecorder._transcription_worker,
             args=(
                 child_transcription_pipe,
                 model,
+                self.compute_type,
+                self.gpu_device_index,
                 self.main_transcription_ready_event,
                 self.shutdown_event,
                 self.interrupt_stop_event
@@ -346,12 +374,13 @@ def __init__(self,
 
         # Start audio data reading process
         if use_microphone:
-            self.reader_process = Process(
+            self.reader_process = mp.Process(
                 target=AudioToTextRecorder._audio_data_worker,
                 args=(
                     self.audio_queue,
                     self.sample_rate,
                     self.buffer_size,
+                    self.input_device_index,
                     self.shutdown_event,
                     self.interrupt_stop_event
                 )
@@ -366,7 +395,9 @@ def __init__(self,
                              )
                 self.realtime_model_type = faster_whisper.WhisperModel(
                     model_size_or_path=self.realtime_model_type,
-                    device='cuda' if torch.cuda.is_available() else 'cpu'
+                    device='cuda' if torch.cuda.is_available() else 'cpu',
+                    compute_type=self.compute_type,
+                    device_index=self.gpu_device_index
                 )
 
             except Exception as e:
@@ -473,12 +504,11 @@ def __init__(self,
 
         logging.debug('RealtimeSTT initialization completed successfully')
 
-        print(f"buffer_size: {self.buffer_size}")
-        print(f"samplerate: {self.sample_rate}")
-
     @staticmethod
     def _transcription_worker(conn,
                               model_path,
+                              compute_type,
+                              gpu_device_index,
                               ready_event,
                               shutdown_event,
                               interrupt_stop_event):
@@ -499,6 +529,9 @@ def _transcription_worker(conn,
               for receiving audio data and sending transcription results.
             model_path (str): The path to the pre-trained faster_whisper model
               for transcription.
+            compute_type (str): Specifies the type of computation to be used
+                for transcription.
+            gpu_device_index (int): Device ID to use.
             ready_event (threading.Event): An event that is set when the
               transcription model is successfully initialized and ready.
             shutdown_event (threading.Event): An event that, when set,
@@ -516,7 +549,9 @@ def _transcription_worker(conn,
         try:
             model = faster_whisper.WhisperModel(
                 model_size_or_path=model_path,
-                device='cuda' if torch.cuda.is_available() else 'cpu'
+                device='cuda' if torch.cuda.is_available() else 'cpu',
+                compute_type=compute_type,
+                device_index=gpu_device_index
             )
 
         except Exception as e:
@@ -563,6 +598,7 @@ def _transcription_worker(conn,
     def _audio_data_worker(audio_queue,
                            sample_rate,
                            buffer_size,
+                           input_device_index,
                            shutdown_event,
                            interrupt_stop_event):
         """
@@ -583,6 +619,7 @@ def _audio_data_worker(audio_queue,
             sample_rate (int): The sample rate of the audio input stream.
             buffer_size (int): The size of the buffer used in the audio
               input stream.
+            input_device_index (int): The index of the audio input device
             shutdown_event (threading.Event): An event that, when set, signals
               this worker method to terminate.
 
@@ -600,7 +637,8 @@ def _audio_data_worker(audio_queue,
                                           format=pyaudio.paInt16,
                                           channels=1,
                                           input=True,
-                                          frames_per_buffer=buffer_size
+                                          frames_per_buffer=buffer_size,
+                                          input_device_index=input_device_index,
                                           )
 
         except Exception as e:
@@ -647,6 +685,20 @@ def _audio_data_worker(audio_queue,
             stream.close()
             audio_interface.terminate()
 
+    def wakeup(self):
+        """
+        If in wake work modus, wake up as if a wake word was spoken.
+        """
+        self.listen_start = time.time()
+
+    def abort(self):
+        self.start_recording_on_voice_activity = False
+        self.stop_recording_on_voice_deactivity = False
+        self._set_state("inactive")
+        self.interrupt_stop_event.set()
+        self.was_interrupted.wait()
+        self.was_interrupted.clear()
+
     def wait_audio(self):
         """
         Waits for the start and completion of the audio recording process.
@@ -671,7 +723,7 @@ def wait_audio(self):
 
             # Wait until recording starts
             while not self.interrupt_stop_event.is_set():
-                if self.start_recording_event.wait(timeout=0.5):
+                if self.start_recording_event.wait(timeout=0.02):
                     break
 
         # If recording is ongoing, wait for voice inactivity
@@ -681,7 +733,7 @@ def wait_audio(self):
 
             # Wait until recording stops
             while not self.interrupt_stop_event.is_set():
-                if (self.stop_recording_event.wait(timeout=0.5)):
+                if (self.stop_recording_event.wait(timeout=0.02)):
                     break
 
         # Convert recorded frames to the appropriate audio format.
@@ -756,9 +808,14 @@ def text(self,
             str: The transcription of the recorded audio
         """
 
+        self.interrupt_stop_event.clear()
+        self.was_interrupted.clear()
+
         self.wait_audio()
 
         if self.is_shut_down or self.interrupt_stop_event.is_set():
+            if self.interrupt_stop_event.is_set():
+                self.was_interrupted.set()
             return ""
 
         if on_transcription_finished:
@@ -873,7 +930,8 @@ def shutdown(self):
 
         logging.debug('Terminating reader process')
         # Give it some time to finish the loop and cleanup.
-        self.reader_process.join(timeout=10)
+        if self.use_microphone:
+            self.reader_process.join(timeout=10)
 
         if self.reader_process.is_alive():
             logging.warning("Reader process did not terminate "
@@ -896,6 +954,12 @@ def shutdown(self):
         if self.realtime_thread:
             self.realtime_thread.join()
 
+        if self.enable_realtime_transcription:
+            if self.realtime_model_type:
+                del self.realtime_model_type
+                self.realtime_model_type = None
+        gc.collect()
+
     def _recording_worker(self):
         """
         The main worker method which constantly monitors the audio

diff --git a/install_with_gpu_support.bat b/install_with_gpu_support.bat
@@ -1,2 +1,2 @@
-pip install torch==2.1.1+cu118 torchaudio==2.1.1+cu118 --index-url https://download.pytorch.org/whl/cu118
+pip install torch==2.1.2+cu118 torchaudio==2.1.2+cu118 --index-url https://download.pytorch.org/whl/cu118
 pip install -r requirements-gpu.txt
diff --git a/requirements.txt b/requirements.txt
@@ -3,5 +3,5 @@ faster-whisper==0.10.0
 pvporcupine==1.9.5
 webrtcvad==2.0.10
 halo==0.0.31
-torch==2.1.1
-torchaudio==2.1.1
+torch==2.1.2
+torchaudio==2.1.2
diff --git a/requirements_raw.txt b/requirements_raw.txt
@@ -0,0 +1,5 @@
+PyAudio
+faster-whisper==0.10.0
+pvporcupine==1.9.5
+webrtcvad
+halo