Skip to content

Commit

Permalink
#13, #14, #20
Browse files Browse the repository at this point in the history
  • Loading branch information
KoljaB committed Jan 29, 2024
1 parent 3d7143e commit 8d0ba91
Show file tree
Hide file tree
Showing 5 changed files with 100 additions and 25 deletions.
10 changes: 8 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ recorder = AudioToTextRecorder(on_recording_start=my_start_callback,
### Feed chunks
If you don't want to use the local microphone set use_microphone parameter to false and provide raw PCM audiochunks in 16-bit mono with this method:
If you don't want to use the local microphone set use_microphone parameter to false and provide raw PCM audiochunks in 16-bit mono (samplerate 16000) with this method:
```python
recorder.feed_audio(audio_chunk)
Expand Down Expand Up @@ -253,7 +253,13 @@ When you initialize the `AudioToTextRecorder` class, you have various options to
- Options: 'tiny', 'tiny.en', 'base', 'base.en', 'small', 'small.en', 'medium', 'medium.en', 'large-v1', 'large-v2'.
- Note: If a size is provided, the model will be downloaded from the Hugging Face Hub.
- **language** (str, default=""): Language code for transcription. If left empty, the model will try to auto-detect the language.
- **language** (str, default=""): Language code for transcription. If left empty, the model will try to auto-detect the language. Supported language codes are listed in [Whisper Tokenizer library](https://github.com/openai/whisper/blob/main/whisper/tokenizer.py).
- **compute_type** (str, default="default"): Specifies the type of computation to be used for transcription. See [Whisper Quantization](https://opennmt.net/CTranslate2/quantization.html)
- **input_device_index** (int, default=0): Audio Input Device Index to use.
- **gpu_device_index** (int, default=0): GPU Device Index to use. The model can also be loaded on multiple GPUs by passing a list of IDs (e.g. [0, 1, 2, 3]).
- **on_recording_start**: A callable function triggered when recording starts.
Expand Down
104 changes: 84 additions & 20 deletions RealtimeSTT/audio_recorder.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@
"""

from multiprocessing import Process, Pipe, Event, Manager
import torch.multiprocessing as mp
from typing import List, Union
import faster_whisper
import collections
import numpy as np
Expand All @@ -43,6 +44,7 @@
import time
import os
import re
import gc

INIT_MODEL_TRANSCRIPTION = "tiny"
INIT_MODEL_TRANSCRIPTION_REALTIME = "tiny"
Expand Down Expand Up @@ -74,6 +76,9 @@ class AudioToTextRecorder:
def __init__(self,
model: str = INIT_MODEL_TRANSCRIPTION,
language: str = "",
compute_type: str = "default",
input_device_index: int = 0,
gpu_device_index: Union[int, List[int]] = 0,
on_recording_start=None,
on_recording_stop=None,
on_transcription_start=None,
Expand Down Expand Up @@ -136,6 +141,16 @@ def __init__(self,
- language (str, default=""): Language code for speech-to-text engine.
If not specified, the model will attempt to detect the language
automatically.
- compute_type (str, default="default"): Specifies the type of
computation to be used for transcription.
See https://opennmt.net/CTranslate2/quantization.html.
- input_device_index (int, default=0): The index of the audio input
device to use.
- gpu_device_index (int, default=0): Device ID to use.
The model can also be loaded on multiple GPUs by passing a list of
IDs (e.g. [0, 1, 2, 3]). In that case, multiple transcriptions can
run in parallel when transcribe() is called from multiple Python
threads
- on_recording_start (callable, default=None): Callback function to be
called when recording of audio to be transcripted starts.
- on_recording_stop (callable, default=None): Callback function to be
Expand Down Expand Up @@ -238,6 +253,9 @@ def __init__(self,
"""

self.language = language
self.compute_type = compute_type
self.input_device_index = input_device_index
self.gpu_device_index = gpu_device_index
self.wake_words = wake_words
self.wake_word_activation_delay = wake_word_activation_delay
self.wake_word_timeout = wake_word_timeout
Expand All @@ -247,6 +265,7 @@ def __init__(self,
self.ensure_sentence_ends_with_period = (
ensure_sentence_ends_with_period
)
self.use_microphone = use_microphone
self.min_gap_between_recordings = min_gap_between_recordings
self.min_length_of_recording = min_length_of_recording
self.pre_recording_buffer_duration = pre_recording_buffer_duration
Expand All @@ -272,8 +291,7 @@ def __init__(self,
self.allowed_latency_limit = ALLOWED_LATENCY_LIMIT

self.level = level
manager = Manager()
self.audio_queue = manager.Queue()
self.audio_queue = mp.Queue()
self.buffer_size = BUFFER_SIZE
self.sample_rate = SAMPLE_RATE
self.recording_start_time = 0
Expand Down Expand Up @@ -323,20 +341,30 @@ def __init__(self,
logger.addHandler(console_handler)

self.is_shut_down = False
self.shutdown_event = Event()
self.shutdown_event = mp.Event()

logging.info("Starting RealTimeSTT")

# Start transcription process
self.interrupt_stop_event = Event()
self.main_transcription_ready_event = Event()
self.parent_transcription_pipe, child_transcription_pipe = Pipe()

self.transcript_process = Process(
# Start transcription worker process
try:
# Only set the start method if it hasn't been set already
if mp.get_start_method(allow_none=True) is None:
mp.set_start_method("spawn")
except RuntimeError as e:
print("Start method has already been set. Details:", e)

self.interrupt_stop_event = mp.Event()
self.was_interrupted = mp.Event()
self.main_transcription_ready_event = mp.Event()
self.parent_transcription_pipe, child_transcription_pipe = mp.Pipe()

self.transcript_process = mp.Process(
target=AudioToTextRecorder._transcription_worker,
args=(
child_transcription_pipe,
model,
self.compute_type,
self.gpu_device_index,
self.main_transcription_ready_event,
self.shutdown_event,
self.interrupt_stop_event
Expand All @@ -346,12 +374,13 @@ def __init__(self,

# Start audio data reading process
if use_microphone:
self.reader_process = Process(
self.reader_process = mp.Process(
target=AudioToTextRecorder._audio_data_worker,
args=(
self.audio_queue,
self.sample_rate,
self.buffer_size,
self.input_device_index,
self.shutdown_event,
self.interrupt_stop_event
)
Expand All @@ -366,7 +395,9 @@ def __init__(self,
)
self.realtime_model_type = faster_whisper.WhisperModel(
model_size_or_path=self.realtime_model_type,
device='cuda' if torch.cuda.is_available() else 'cpu'
device='cuda' if torch.cuda.is_available() else 'cpu',
compute_type=self.compute_type,
device_index=self.gpu_device_index
)

except Exception as e:
Expand Down Expand Up @@ -473,12 +504,11 @@ def __init__(self,

logging.debug('RealtimeSTT initialization completed successfully')

print(f"buffer_size: {self.buffer_size}")
print(f"samplerate: {self.sample_rate}")

@staticmethod
def _transcription_worker(conn,
model_path,
compute_type,
gpu_device_index,
ready_event,
shutdown_event,
interrupt_stop_event):
Expand All @@ -499,6 +529,9 @@ def _transcription_worker(conn,
for receiving audio data and sending transcription results.
model_path (str): The path to the pre-trained faster_whisper model
for transcription.
compute_type (str): Specifies the type of computation to be used
for transcription.
gpu_device_index (int): Device ID to use.
ready_event (threading.Event): An event that is set when the
transcription model is successfully initialized and ready.
shutdown_event (threading.Event): An event that, when set,
Expand All @@ -516,7 +549,9 @@ def _transcription_worker(conn,
try:
model = faster_whisper.WhisperModel(
model_size_or_path=model_path,
device='cuda' if torch.cuda.is_available() else 'cpu'
device='cuda' if torch.cuda.is_available() else 'cpu',
compute_type=compute_type,
device_index=gpu_device_index
)

except Exception as e:
Expand Down Expand Up @@ -563,6 +598,7 @@ def _transcription_worker(conn,
def _audio_data_worker(audio_queue,
sample_rate,
buffer_size,
input_device_index,
shutdown_event,
interrupt_stop_event):
"""
Expand All @@ -583,6 +619,7 @@ def _audio_data_worker(audio_queue,
sample_rate (int): The sample rate of the audio input stream.
buffer_size (int): The size of the buffer used in the audio
input stream.
input_device_index (int): The index of the audio input device
shutdown_event (threading.Event): An event that, when set, signals
this worker method to terminate.
Expand All @@ -600,7 +637,8 @@ def _audio_data_worker(audio_queue,
format=pyaudio.paInt16,
channels=1,
input=True,
frames_per_buffer=buffer_size
frames_per_buffer=buffer_size,
input_device_index=input_device_index,
)

except Exception as e:
Expand Down Expand Up @@ -647,6 +685,20 @@ def _audio_data_worker(audio_queue,
stream.close()
audio_interface.terminate()

def wakeup(self):
"""
If in wake work modus, wake up as if a wake word was spoken.
"""
self.listen_start = time.time()

def abort(self):
self.start_recording_on_voice_activity = False
self.stop_recording_on_voice_deactivity = False
self._set_state("inactive")
self.interrupt_stop_event.set()
self.was_interrupted.wait()
self.was_interrupted.clear()

def wait_audio(self):
"""
Waits for the start and completion of the audio recording process.
Expand All @@ -671,7 +723,7 @@ def wait_audio(self):

# Wait until recording starts
while not self.interrupt_stop_event.is_set():
if self.start_recording_event.wait(timeout=0.5):
if self.start_recording_event.wait(timeout=0.02):
break

# If recording is ongoing, wait for voice inactivity
Expand All @@ -681,7 +733,7 @@ def wait_audio(self):

# Wait until recording stops
while not self.interrupt_stop_event.is_set():
if (self.stop_recording_event.wait(timeout=0.5)):
if (self.stop_recording_event.wait(timeout=0.02)):
break

# Convert recorded frames to the appropriate audio format.
Expand Down Expand Up @@ -756,9 +808,14 @@ def text(self,
str: The transcription of the recorded audio
"""

self.interrupt_stop_event.clear()
self.was_interrupted.clear()

self.wait_audio()

if self.is_shut_down or self.interrupt_stop_event.is_set():
if self.interrupt_stop_event.is_set():
self.was_interrupted.set()
return ""

if on_transcription_finished:
Expand Down Expand Up @@ -873,7 +930,8 @@ def shutdown(self):

logging.debug('Terminating reader process')
# Give it some time to finish the loop and cleanup.
self.reader_process.join(timeout=10)
if self.use_microphone:
self.reader_process.join(timeout=10)

if self.reader_process.is_alive():
logging.warning("Reader process did not terminate "
Expand All @@ -896,6 +954,12 @@ def shutdown(self):
if self.realtime_thread:
self.realtime_thread.join()

if self.enable_realtime_transcription:
if self.realtime_model_type:
del self.realtime_model_type
self.realtime_model_type = None
gc.collect()

def _recording_worker(self):
"""
The main worker method which constantly monitors the audio
Expand Down
2 changes: 1 addition & 1 deletion install_with_gpu_support.bat
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
pip install torch==2.1.1+cu118 torchaudio==2.1.1+cu118 --index-url https://download.pytorch.org/whl/cu118
pip install torch==2.1.2+cu118 torchaudio==2.1.2+cu118 --index-url https://download.pytorch.org/whl/cu118
pip install -r requirements-gpu.txt
4 changes: 2 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,5 @@ faster-whisper==0.10.0
pvporcupine==1.9.5
webrtcvad==2.0.10
halo==0.0.31
torch==2.1.1
torchaudio==2.1.1
torch==2.1.2
torchaudio==2.1.2
5 changes: 5 additions & 0 deletions requirements_raw.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
PyAudio
faster-whisper==0.10.0
pvporcupine==1.9.5
webrtcvad
halo

0 comments on commit 8d0ba91

Please sign in to comment.