-
Notifications
You must be signed in to change notification settings - Fork 0
/
skull-gpt-voice-stream.py
188 lines (160 loc) · 7.28 KB
/
skull-gpt-voice-stream.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
import os
import openai
import dotenv
import serial
from elevenlabs import set_api_key, play, generate, stream, Voice, VoiceSettings
import serial.tools.list_ports
import speech_recognition as sr
# This version uses threading to stream GPT respones,
# collect them in batches, send them to ElevenLabs and stream the audio
# all concurrently. This is an attempt to make the most efficient way to stream audio from GPT.
# Under development, gotta tyne batch sizes and number or threads
# (one for collecting openai responses in queued batches,
# one for sending batches to Eleven labs,
# one for taking ready audios from a queue and streaming it)
def find_arduino_port():
ports = list(serial.tools.list_ports.comports())
for p in ports:
if 'USB2.0-Serial' in p.description:
return p.device
return None
# Load .env and set API keyssay hi
dotenv.load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")
ELEVENLABS_API = os.getenv("ELEVENLABS_API")
set_api_key(ELEVENLABS_API)
SKULL_SYSTEM_PROMPT = """You are an evil bro skull. It is halloween season.
You are very sarcastic and you try to trick user all the time. You also write limerics.
But you are also a bro. You are aggressive.
You also have a stupid scorpion friend, but scorpion does not talk.
You can sometimes forward user questions to the skorpion. Keep your answers short.
You were build from a skul from Ace Hardware, 10$ walgreens speaker, an arduino, and a couple of toy electric parts."""
# Come up with good prank for this hackathon
# Write a limeric about scorpion
import threading
from queue import Queue
import time
def collect_openai_responses(response_openai, queue, messages, user_input_time):
collected_messages, messages_batch = [], []
sentence_counter = 0
for chunk in response_openai:
if chunk['choices'][0]['delta'].get('role') == 'assistant':
print(f"Time to get first chunk: {time.time() - user_input_time}")
chunk_delta = chunk['choices'][0]['delta']
collected_messages.append(chunk_delta)
messages_batch.append(chunk_delta)
if any(punct in chunk_delta.get('content', '') for punct in ".!?"):
sentence_counter += 1
# this is a not very good hack to send first couple of sentences to Eleven Labs while we wait for the rest of the text
# (because ideally we need to mess with eleven labs generate
# to start generating a soon as first couple sentences emerge,
# and i dont immediatelly know how to do that now)
# seems like 2 is optimal on a wework wi-fi
if sentence_counter == 2 or chunk['choices'][0].get('finish_reason') == "stop":
batch_reply_content = ''.join(m.get('content', '') for m in messages_batch)
#print(f"Skull: ###{batch_reply_content}###")
queue.put(batch_reply_content)
messages_batch.clear()
sentence_counter += 1000
# Combine chunks into full text
full_reply_content = ''.join([m.get('content', '') for m in collected_messages])
end_time_openai = time.time()
print(f"Time to collect openai responses: {end_time_openai - user_input_time}")
messages.append({"role": "assistant", "content": full_reply_content})
def generate_and_stream_audio(queue, user_input_time, arduino=None):
while True:
text = queue.get() # get text from the queue
if text is None: break # None is the signal to stop
start_got_first_chunk = time.time()
# I have a feeling here that `generate` function is already optimised to stream audio effitiently
# i have to measure time between user input and the first audio chunk with and without batching
# Convert text to speech and stream
audio_stream = generate(
text=text,
voice=Voice(
voice_id='NuVRT1lw1hsKBhV4OtOv', # batman
settings=VoiceSettings(stability=0.9, similarity_boost=0.75, style=0.0, use_speaker_boost=False)
),
model="eleven_monolingual_v1",
stream=True,
latency=4 # max latency optimisation
)
end_time_first_audio = time.time()
print(f"Time to generate audio: {end_time_first_audio - user_input_time}\n",
f"Time from first chunk to first audio: {end_time_first_audio - start_got_first_chunk}")
if arduino:
arduino.write(b'g')
stream(audio_stream)
if arduino:
arduino.write(b's')
queue.task_done()
def recognize_speech():
recognizer = sr.Recognizer()
#recognizer.energy_threshold = 2000
recognizer.pause_threshold = 0.5
mic = sr.Microphone()
with mic as source:
recognizer.adjust_for_ambient_noise(source, duration = 1)
audio = recognizer.listen(source, phrase_time_limit=5)
try:
return recognizer.recognize_google(audio, language='en-US', show_all=False)
except sr.UnknownValueError:
return None # Return None if audio is not understood
except sr.RequestError:
return "Could not request results; check network"
def main(arduino=None):
messages = [
{"role": "system", "content": SKULL_SYSTEM_PROMPT}
]
try:
while True:
print("Listening...")
user_input = recognize_speech()
if user_input is None: # Skip the loop if no speech recognized
print("No speech recognized. Skipping.")
continue
print(f"You: {user_input}")
user_input_time = time.time()
messages.append({"role": "user", "content": user_input})
response_openai = openai.ChatCompletion.create(
model="gpt-3.5-turbo",
messages=messages,
stream=True
)
# Create a queue to communicate between threads
message_queue = Queue()
# Start the thread that collects OpenAI responses
openai_thread = threading.Thread(target=collect_openai_responses,
args=(response_openai, message_queue, messages, user_input_time))
openai_thread.start()
# Start the thread that generates and streams audio
audio_thread = threading.Thread(target=generate_and_stream_audio,
args=(message_queue, user_input_time, arduino))
audio_thread.start()
# Wait for the OpenAI response collection to finish
openai_thread.join()
# Signal the audio thread to stop
message_queue.put(None)
# Wait for the audio thread to finish
audio_thread.join()
except KeyboardInterrupt:
print("Exiting...")
if arduino:
arduino.write(b's')
arduino.close()
except Exception as e:
print(f"An error occurred: {e}")
if arduino:
arduino.write(b's')
arduino.close()
if __name__ == "__main__":
try:
arduino_port = find_arduino_port()
arduino = serial.Serial(arduino_port, 9600)
while arduino.readline().decode('ascii').strip() != "READY":
pass
print(f"Arduino connected: {arduino.name}")
main(arduino)
except Exception as e:
print(f"Failed to connect to Arduino: {e}")
main()