diff --git a/extensions/whisper_stt/requirements.txt b/extensions/whisper_stt/requirements.txt
new file mode 100644
index 0000000000..770c38bba8
--- /dev/null
+++ b/extensions/whisper_stt/requirements.txt
@@ -0,0 +1,4 @@
+git+https://github.com/Uberi/speech_recognition.git@010382b
+openai-whisper
+soundfile
+ffmpeg
diff --git a/extensions/whisper_stt/script.py b/extensions/whisper_stt/script.py
new file mode 100644
index 0000000000..6ef60c57a5
--- /dev/null
+++ b/extensions/whisper_stt/script.py
@@ -0,0 +1,54 @@
+import gradio as gr
+import speech_recognition as sr
+
+input_hijack = {
+    'state': False,
+    'value': ["", ""]
+}
+
+
+def do_stt(audio, text_state=""):
+    transcription = ""
+    r = sr.Recognizer()
+
+    # Convert to AudioData
+    audio_data = sr.AudioData(sample_rate=audio[0], frame_data=audio[1], sample_width=4)
+
+    try:
+        transcription = r.recognize_whisper(audio_data, language="english", model="base.en")
+    except sr.UnknownValueError:
+        print("Whisper could not understand audio")
+    except sr.RequestError as e:
+        print("Could not request results from Whisper", e)
+
+    input_hijack.update({"state": True, "value": [transcription, transcription]})
+
+    text_state += transcription + " "
+    return text_state, text_state
+
+
+def update_hijack(val):
+    input_hijack.update({"state": True, "value": [val, val]})
+    return val
+
+
+def auto_transcribe(audio, audio_auto, text_state=""):
+    if audio is None:
+        return "", ""
+    if audio_auto:
+        return do_stt(audio, text_state)
+    return "", ""
+
+
+def ui():
+    tr_state = gr.State(value="")
+    output_transcription = gr.Textbox(label="STT-Input",
+                                      placeholder="Speech Preview. Click \"Generate\" to send",
+                                      interactive=True)
+    output_transcription.change(fn=update_hijack, inputs=[output_transcription], outputs=[tr_state])
+    audio_auto = gr.Checkbox(label="Auto-Transcribe", value=True)
+    with gr.Row():
+        audio = gr.Audio(source="microphone")
+        audio.change(fn=auto_transcribe, inputs=[audio, audio_auto, tr_state], outputs=[output_transcription, tr_state])
+        transcribe_button = gr.Button(value="Transcribe")
+        transcribe_button.click(do_stt, inputs=[audio, tr_state], outputs=[output_transcription, tr_state])
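
For reviewers unfamiliar with the `input_hijack` convention: the host application polls loaded extension modules for this attribute and, when `state` is `True`, substitutes the two stored strings (the internal text and the visible text) for the user's typed input, then resets the flag so the hijack fires only once. A minimal sketch of that consumer side, assuming a hypothetical `loaded_extensions` list in place of the app's real extension registry; only the shape of the hook is taken from this PR:

```python
# Sketch of the consumer side of the input_hijack protocol.
# loaded_extensions is a hypothetical stand-in for the host app's
# actual extension registry.
def check_input_hijack(text, visible_text, loaded_extensions):
    for extension in loaded_extensions:
        hijack = getattr(extension, 'input_hijack', None)
        if hijack and hijack.get('state'):
            hijack['state'] = False  # one-shot: reset so it fires only once
            text, visible_text = hijack['value']
    return text, visible_text
```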
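
`do_stt` can also be exercised without the UI by faking the `(sample_rate, numpy_array)` tuple that `gr.Audio(source="microphone")` emits. A minimal sketch, assuming a local `sample.wav` exists (the file name is illustrative, not part of this PR); `soundfile` is already pulled in by requirements.txt, and `dtype="int32"` matches the `sample_width=4` that `do_stt` passes to `sr.AudioData`:

```python
# Standalone smoke test for do_stt, run from extensions/whisper_stt/.
# sample.wav is a hypothetical local recording, not part of this PR.
import soundfile as sf

from script import do_stt

data, sample_rate = sf.read("sample.wav", dtype="int32")
if data.ndim > 1:
    data = data[:, 0]  # downmix to mono, matching a single-channel mic capture

preview, state = do_stt((sample_rate, data), text_state="")
print(preview)
```

Note that the first call downloads the `base.en` Whisper checkpoint, so expect a delay on the initial run.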