speech.py
import numpy as np
import speech_recognition as sr
import random
import pyttsx3
import os
from failing import errorloc
########################################################################
######################## FUNCTIONS ########################
########################################################################
## Basics
path_hidden_files = "/Users/ivan/.ssh/"
apikey_file_name = ".apikey_watson"
apiurl_file_name = ".apiurl_watson"
def initConv():
    """
    Initialise the recogniser and microphone.
    """
    recognizer = sr.Recognizer()
    microphone = sr.Microphone()
    print("Microphone and recognizer initialised")
    return [recognizer, microphone]

def initSpeak():
    engine = pyttsx3.init()
    return engine

def speak(engine, text):
    try:
        engine.say(text)
        # print('START speaking')
        engine.runAndWait()
        # print('DONE speaking')
    except Exception as e:
        errorloc(e)

def say(text):
    # use the macOS built-in `say` command
    try:
        os.system(f"say {text}")
    except Exception as e:
        errorloc(e)

def listenConv(recognizer, microphone, speaker, PROMPT_LIMIT=5):
    for j in range(PROMPT_LIMIT):
        speech = recognize_speech_from_mic(recognizer, microphone)
        if speech["transcription"]:
            break
        if not speech["success"]:
            break
        print(speech)
        listen_again = "I didn't catch that, what did you say?"
        speak(speaker, listen_again)
    return speech

# if there was an error, stop the game
def error_conv(speech):
    if speech["error"]:
        raise Exception("ERROR: {}".format(speech["error"]))

def checkCorrect(speech, word):
    guess_is_correct = speech["transcription"].lower() == word.lower()
    return guess_is_correct

def sayCorrect(guess_is_correct):
    if guess_is_correct:
        print("Correct.")
    else:
        print("Please. Try again.\n")

#################################################################
####################### IBM Watson ##############################
#################################################################
try:
    import pyaudio
except ImportError:
    pass
from ibm_watson import SpeechToTextV1
from ibm_watson.websocket import RecognizeCallback, AudioSource
from threading import Thread
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
from queue import Queue, Full

###############################################
#### Initialize queue to store the recordings #
###############################################
# Create an instance of AudioSource
CHUNK = 1024
BUF_MAX_SIZE = CHUNK * 10

def audioInstance(BUF_MAX_SIZE=BUF_MAX_SIZE, CHUNK=CHUNK):
    q = Queue(maxsize=int(round(BUF_MAX_SIZE / CHUNK)))
    audio_source = AudioSource(q, True, True)
    return [audio_source, q]

###############################################
#### Prepare Speech to Text Service ###########
###############################################
def getHiddenValue(path, file_name):
    with open(f"{path}/{file_name}", 'r') as file:
        hidden_value = file.read()
    return hidden_value

# initialize the speech to text service
def initSpeech2Text():
    # read the API key from the hidden credentials file
    apikey = getHiddenValue(path_hidden_files, apikey_file_name)
    # remove the last character (trailing newline)
    apikey = apikey[:-1]
    authenticator = IAMAuthenticator(apikey)
    speech_to_text = SpeechToTextV1(authenticator=authenticator)
    # read the service URL from the hidden credentials file
    api_url = getHiddenValue(path_hidden_files, apiurl_file_name)
    # remove the last character (trailing newline)
    api_url = api_url[:-1]
    speech_to_text.set_service_url(api_url)
    return speech_to_text

# define the callback for the speech to text service
class MyRecognizeCallback(RecognizeCallback):
    def __init__(self, websocket):
        RecognizeCallback.__init__(self)
        self.websocket = websocket

    def on_transcription(self, transcript):
        pass

    def on_connected(self):
        print("Connection was successful")

    def on_error(self, error):
        print("Error received: {}".format(error))

    def on_inactivity_timeout(self, error):
        print("Inactivity timeout: {}".format(error))

    def on_listening(self):
        print("\nSERVICE IS LISTENING\n")

    def on_hypothesis(self, hypothesis):
        self.websocket.hypothesis = hypothesis
        # print(hypothesis)

    def on_data(self, data):
        self.websocket.listened = data["results"][0]["alternatives"][0]["transcript"]
        try:
            self.websocket.confidence = data["results"][0]["alternatives"][0]["confidence"]
        except (KeyError, IndexError):
            # interim results may not carry a confidence score
            print("no confidence score for:", self.websocket.listened)
        if any(x in self.websocket.listened for x in ["yes", "yeah"]):
            self.websocket.answered = 1
        elif any(x in self.websocket.listened for x in ["no"]):
            self.websocket.answered = 0

    def on_close(self):
        print("Connection closed")

# this class initiates the recognize service and passes in the AudioSource
class YesNotoRecognition():
    def __init__(self, speech_to_text, audio_source):
        self.answered = None
        self.listened = None
        self.confidence = None
        self.hypothesis = None
        mycallback = MyRecognizeCallback(self)
        self.speech_to_text = speech_to_text
        self.audio_source = audio_source
        self.mycallback = mycallback

    def run(self):
        self.speech_to_text.recognize_using_websocket(
            audio=self.audio_source,
            content_type="audio/l16; rate=44100",
            recognize_callback=self.mycallback,
            interim_results=True,
        )
        print("Speech recognition off")

###############################################
#### Prepare for recording using Pyaudio ######
###############################################
# Variables for recording the speech
try:
    FORMAT = pyaudio.paInt16
except NameError:
    # pyaudio was not imported above
    pass
CHANNELS = 1
RATE = 44100

def startAudioWatson():
    # instantiate pyaudio
    audio = pyaudio.PyAudio()
    return audio

# define callback for pyaudio to store the recording in the queue
def openStream(audio, q):
    class pyaudioCallback():
        def __init__(self):
            self.audio = audio
            self.q = q
            self.frames = []

        def run(self, in_data, frame_count, time_info, status):
            try:
                # don't block the audio callback; drop data if the queue is full
                q.put(in_data, block=False)
                self.frames.append(in_data)
            except Full:
                pass  # discard
            return (None, pyaudio.paContinue)

    pyaudio_callback = pyaudioCallback()
    stream = audio.open(
        format=FORMAT,
        channels=CHANNELS,
        rate=RATE,
        input=True,
        frames_per_buffer=CHUNK,
        stream_callback=pyaudio_callback.run,
        start=False,
    )
    return stream, pyaudio_callback.frames

def terminateSpeechRecognition(stream, audio, audio_source):
    stream.stop_stream()
    stream.close()
    audio.terminate()
    audio_source.completed_recording()
    print("Speech recognition terminated")

##### Functions of functions
def testingSpeechCov(words, repeats):
    recognizer, microphone = initConv()
    speaker = initSpeak()  # listenConv needs a pyttsx3 engine for the retry prompt
    list_ws = words * repeats
    random.shuffle(list_ws)
    for i in np.arange(len(list_ws)):
        temp_word = list_ws[i]
        print(f"Say: {temp_word}")
        speech = listenConv(recognizer, microphone, speaker)
        error_conv(speech)
        print("You said: {}".format(speech["transcription"]))
        bo_co = checkCorrect(speech, temp_word)
        sayCorrect(bo_co)

def recognize_speech_from_mic(recognizer, microphone):
    """Transcribe speech recorded from `microphone`.

    Returns a dictionary with three keys:
    "success":       a boolean indicating whether or not the API request was
                     successful
    "error":         `None` if no error occurred, otherwise a string containing
                     an error message if the API could not be reached or the
                     speech was unrecognizable
    "transcription": `None` if the speech could not be transcribed,
                     otherwise a string containing the transcribed text
    """
    # check that the recognizer and microphone arguments are of the appropriate type
    if not isinstance(recognizer, sr.Recognizer):
        raise TypeError("`recognizer` must be a `Recognizer` instance")
    if not isinstance(microphone, sr.Microphone):
        raise TypeError("`microphone` must be a `Microphone` instance")

    # adjust the recognizer sensitivity to ambient noise and record audio
    # from the microphone
    with microphone as source:
        recognizer.adjust_for_ambient_noise(source)
        audio = recognizer.listen(source)

    # set up the response object
    response = {"success": True, "error": None, "transcription": None}

    # try recognizing the speech in the recording;
    # if a RequestError or UnknownValueError exception is caught,
    # update the response object accordingly
    try:
        response["transcription"] = recognizer.recognize_sphinx(audio)
    except sr.RequestError:
        # API was unreachable or unresponsive
        response["success"] = False
        response["error"] = "API unavailable"
    except sr.UnknownValueError:
        # speech was unintelligible
        response["error"] = "Unable to recognize speech"
    return response
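
# Hedged usage example (an illustrative entry point, not present as such in
# this module): how the response dictionary returned by
# recognize_speech_from_mic is typically consumed.
if __name__ == "__main__":
    recognizer, microphone = initConv()
    response = recognize_speech_from_mic(recognizer, microphone)
    if not response["success"]:
        print("Request failed: {}".format(response["error"]))
    elif response["transcription"] is None:
        print("Could not understand the audio: {}".format(response["error"]))
    else:
        print("You said: {}".format(response["transcription"]))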