Skip to content

Commit

Permalink
fix: migrated samples to speech 2.0.0 (#78)
Browse files Browse the repository at this point in the history
* I updated the comment in the transcribe_async file to reflect the time limitations on local files for long_running_recognize

* I updated the comment in the transcribe_async file to reflect the time limitations on local files for long_running_recognize

* docs: I updated the comment in the transcribe_async file to reflect the time limitations on local files for long_running_recognize

* chore: I updated the comments in the transcribe_async file to reflect the time limitations on local files for long_running_recognize

* fix: resolved conflicts

* fix: migrated samples to speech 2.0.0

* fix: migrated to speech 2.0.0

* fix: fixed lint issues
  • Loading branch information
b-loved-dreamer authored and dandhlee committed Feb 9, 2023
1 parent 97f39e7 commit 6d0395d
Show file tree
Hide file tree
Showing 18 changed files with 182 additions and 208 deletions.
2 changes: 1 addition & 1 deletion speech/microphone/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
google-cloud-speech==1.3.2
google-cloud-speech==2.0.0
pyaudio==0.2.11
six==1.15.0

80 changes: 35 additions & 45 deletions speech/microphone/transcribe_streaming_infinite.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,9 @@
SAMPLE_RATE = 16000
CHUNK_SIZE = int(SAMPLE_RATE / 10) # 100ms

RED = "\033[0;31m"
GREEN = "\033[0;32m"
YELLOW = "\033[0;33m"
RED = '\033[0;31m'
GREEN = '\033[0;32m'
YELLOW = '\033[0;33m'


def get_current_time():
Expand Down Expand Up @@ -123,14 +123,12 @@ def generator(self):
if self.bridging_offset > self.final_request_end_time:
self.bridging_offset = self.final_request_end_time

chunks_from_ms = round(
(self.final_request_end_time - self.bridging_offset)
/ chunk_time
)
chunks_from_ms = round((self.final_request_end_time -
self.bridging_offset) / chunk_time)

self.bridging_offset = round(
(len(self.last_audio_input) - chunks_from_ms) * chunk_time
)
self.bridging_offset = (round((
len(self.last_audio_input) - chunks_from_ms)
* chunk_time))

for i in range(chunks_from_ms, len(self.last_audio_input)):
data.append(self.last_audio_input[i])
Expand Down Expand Up @@ -159,7 +157,7 @@ def generator(self):
except queue.Empty:
break

yield b"".join(data)
yield b''.join(data)


def listen_print_loop(responses, stream):
Expand Down Expand Up @@ -195,45 +193,42 @@ def listen_print_loop(responses, stream):
transcript = result.alternatives[0].transcript

result_seconds = 0
result_nanos = 0
result_micros = 0

if result.result_end_time.seconds:
result_seconds = result.result_end_time.seconds

if result.result_end_time.nanos:
result_nanos = result.result_end_time.nanos
if result.result_end_time.microseconds:
result_micros = result.result_end_time.microseconds

stream.result_end_time = int((result_seconds * 1000) + (result_nanos / 1000000))
stream.result_end_time = int((result_seconds * 1000) + (result_micros / 1000))

corrected_time = (
stream.result_end_time
- stream.bridging_offset
+ (STREAMING_LIMIT * stream.restart_counter)
)
corrected_time = (stream.result_end_time - stream.bridging_offset
+ (STREAMING_LIMIT * stream.restart_counter))
# Display interim results, but with a carriage return at the end of the
# line, so subsequent lines will overwrite them.

if result.is_final:

sys.stdout.write(GREEN)
sys.stdout.write("\033[K")
sys.stdout.write(str(corrected_time) + ": " + transcript + "\n")
sys.stdout.write('\033[K')
sys.stdout.write(str(corrected_time) + ': ' + transcript + '\n')

stream.is_final_end_time = stream.result_end_time
stream.last_transcript_was_final = True

# Exit recognition if any of the transcribed phrases could be
# one of our keywords.
if re.search(r"\b(exit|quit)\b", transcript, re.I):
if re.search(r'\b(exit|quit)\b', transcript, re.I):
sys.stdout.write(YELLOW)
sys.stdout.write("Exiting...\n")
sys.stdout.write('Exiting...\n')
stream.closed = True
break

else:
sys.stdout.write(RED)
sys.stdout.write("\033[K")
sys.stdout.write(str(corrected_time) + ": " + transcript + "\r")
sys.stdout.write('\033[K')
sys.stdout.write(str(corrected_time) + ': ' + transcript + '\r')

stream.last_transcript_was_final = False

Expand All @@ -245,39 +240,34 @@ def main():
config = speech.RecognitionConfig(
encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
sample_rate_hertz=SAMPLE_RATE,
language_code="en-US",
max_alternatives=1,
)
language_code='en-US',
max_alternatives=1)
streaming_config = speech.StreamingRecognitionConfig(
config=config, interim_results=True
)
config=config,
interim_results=True)

mic_manager = ResumableMicrophoneStream(SAMPLE_RATE, CHUNK_SIZE)
print(mic_manager.chunk_size)
sys.stdout.write(YELLOW)
sys.stdout.write('\nListening, say "Quit" or "Exit" to stop.\n\n')
sys.stdout.write("End (ms) Transcript Results/Status\n")
sys.stdout.write("=====================================================\n")
sys.stdout.write('End (ms) Transcript Results/Status\n')
sys.stdout.write('=====================================================\n')

with mic_manager as stream:

while not stream.closed:
sys.stdout.write(YELLOW)
sys.stdout.write(
"\n" + str(STREAMING_LIMIT * stream.restart_counter) + ": NEW REQUEST\n"
)
sys.stdout.write('\n' + str(
STREAMING_LIMIT * stream.restart_counter) + ': NEW REQUEST\n')

stream.audio_input = []
audio_generator = stream.generator()

requests = (
speech.StreamingRecognizeRequest(audio_content=content)
for content in audio_generator
)
requests = (speech.StreamingRecognizeRequest(
audio_content=content)for content in audio_generator)

responses = client.streaming_recognize(
requests=requests, config=streaming_config
)
responses = client.streaming_recognize(streaming_config,
requests)

# Now, put the transcription responses to use.
listen_print_loop(responses, stream)
Expand All @@ -291,11 +281,11 @@ def main():
stream.restart_counter = stream.restart_counter + 1

if not stream.last_transcript_was_final:
sys.stdout.write("\n")
sys.stdout.write('\n')
stream.new_stream = True


if __name__ == "__main__":
if __name__ == '__main__':

main()

Expand Down
39 changes: 16 additions & 23 deletions speech/microphone/transcribe_streaming_mic.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
import sys

from google.cloud import speech

import pyaudio
from six.moves import queue

Expand All @@ -42,7 +43,6 @@

class MicrophoneStream(object):
"""Opens a recording stream as a generator yielding the audio chunks."""

def __init__(self, rate, chunk):
self._rate = rate
self._chunk = chunk
Expand All @@ -57,10 +57,8 @@ def __enter__(self):
format=pyaudio.paInt16,
# The API currently only supports 1-channel (mono) audio
# https://goo.gl/z757pE
channels=1,
rate=self._rate,
input=True,
frames_per_buffer=self._chunk,
channels=1, rate=self._rate,
input=True, frames_per_buffer=self._chunk,
# Run the audio stream asynchronously to fill the buffer object.
# This is necessary so that the input device's buffer doesn't
# overflow while the calling thread makes network requests, etc.
Expand Down Expand Up @@ -105,7 +103,7 @@ def generator(self):
except queue.Empty:
break

yield b"".join(data)
yield b''.join(data)


def listen_print_loop(responses):
Expand Down Expand Up @@ -143,10 +141,10 @@ def listen_print_loop(responses):
#
# If the previous result was longer than this one, we need to print
# some extra spaces to overwrite the previous result
overwrite_chars = " " * (num_chars_printed - len(transcript))
overwrite_chars = ' ' * (num_chars_printed - len(transcript))

if not result.is_final:
sys.stdout.write(transcript + overwrite_chars + "\r")
sys.stdout.write(transcript + overwrite_chars + '\r')
sys.stdout.flush()

num_chars_printed = len(transcript)
Expand All @@ -156,8 +154,8 @@ def listen_print_loop(responses):

# Exit recognition if any of the transcribed phrases could be
# one of our keywords.
if re.search(r"\b(exit|quit)\b", transcript, re.I):
print("Exiting..")
if re.search(r'\b(exit|quit)\b', transcript, re.I):
print('Exiting..')
break

num_chars_printed = 0
Expand All @@ -166,33 +164,28 @@ def listen_print_loop(responses):
def main():
# See http://g.co/cloud/speech/docs/languages
# for a list of supported languages.
language_code = "en-US" # a BCP-47 language tag
language_code = 'en-US' # a BCP-47 language tag

client = speech.SpeechClient()
config = speech.RecognitionConfig(
encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
sample_rate_hertz=RATE,
language_code=language_code,
)
language_code=language_code)
streaming_config = speech.StreamingRecognitionConfig(
config=config, interim_results=True
)
config=config,
interim_results=True)

with MicrophoneStream(RATE, CHUNK) as stream:
audio_generator = stream.generator()
requests = (
speech.StreamingRecognizeRequest(audio_content=content)
for content in audio_generator
)
requests = (speech.StreamingRecognizeRequest(audio_content=content)
for content in audio_generator)

responses = client.streaming_recognize(
requests=requests, config=streaming_config
)
responses = client.streaming_recognize(streaming_config, requests)

# Now, put the transcription responses to use.
listen_print_loop(responses)


if __name__ == "__main__":
if __name__ == '__main__':
main()
# [END speech_transcribe_streaming_mic]
19 changes: 8 additions & 11 deletions speech/microphone/transcribe_streaming_mic_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

import mock

RESOURCES = os.path.join(os.path.dirname(__file__), "resources")
RESOURCES = os.path.join(os.path.dirname(__file__), 'resources')


class MockPyAudio(object):
Expand All @@ -32,9 +32,8 @@ def open(self, stream_callback, rate, *args, **kwargs):
self.rate = rate
self.closed = threading.Event()
self.stream_thread = threading.Thread(
target=self.stream_audio,
args=(self.audio_filename, stream_callback, self.closed),
)
target=self.stream_audio, args=(
self.audio_filename, stream_callback, self.closed))
self.stream_thread.start()
return self

Expand All @@ -48,25 +47,23 @@ def terminate(self):
pass

def stream_audio(self, audio_filename, callback, closed, num_frames=512):
with open(audio_filename, "rb") as audio_file:
with open(audio_filename, 'rb') as audio_file:
while not closed.is_set():
# Approximate realtime by sleeping for the appropriate time for
# the requested number of frames
time.sleep(num_frames / float(self.rate))
# Audio is 16-bit samples, whereas a Python byte is 8 bits, so each frame takes 2 bytes
num_bytes = 2 * num_frames
chunk = audio_file.read(num_bytes) or b"\0" * num_bytes
chunk = audio_file.read(num_bytes) or b'\0' * num_bytes
callback(chunk, None, None, None)


@mock.patch.dict(
"sys.modules",
pyaudio=mock.MagicMock(PyAudio=MockPyAudio(os.path.join(RESOURCES, "quit.raw"))),
)
@mock.patch.dict('sys.modules', pyaudio=mock.MagicMock(
PyAudio=MockPyAudio(os.path.join(RESOURCES, 'quit.raw'))))
def test_main(capsys):
import transcribe_streaming_mic

transcribe_streaming_mic.main()
out, err = capsys.readouterr()

assert re.search(r"quit", out, re.DOTALL | re.I)
assert re.search(r'quit', out, re.DOTALL | re.I)
Loading

0 comments on commit 6d0395d

Please sign in to comment.