Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor recognize_google #721

Merged
merged 18 commits into from
Dec 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/unittests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,4 +35,5 @@ jobs:
python -m pip install .
- name: Test with unittest
run: |
python -m doctest speech_recognition/recognizers/google.py -v
python -m unittest discover --verbose
1 change: 0 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
#!/usr/bin/env python3

import sys
import os
import stat

Expand Down
90 changes: 12 additions & 78 deletions speech_recognition/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
import time
import uuid
import wave
from typing import TYPE_CHECKING
from urllib.error import HTTPError, URLError
from urllib.parse import urlencode
from urllib.request import Request, urlopen
Expand All @@ -39,10 +38,6 @@
UnknownValueError,
WaitTimeoutError,
)
from .recognizers import whisper

if TYPE_CHECKING:
from .recognizers.google import Alternative, Result

__author__ = "Anthony Zhang (Uberi)"
__version__ = "3.10.0"
Expand Down Expand Up @@ -675,77 +670,6 @@ def recognize_sphinx(self, audio_data, language="en-US", keyword_entries=None, g
if hypothesis is not None: return hypothesis.hypstr
raise UnknownValueError() # no transcriptions available

def recognize_google(self, audio_data, key=None, language="en-US", pfilter=0, show_all=False, with_confidence=False):
    """
    Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Google Speech Recognition API.

    The Google Speech Recognition API key is specified by ``key``. If not specified, it uses a generic key that works out of the box. This should generally be used for personal or testing purposes only, as it **may be revoked by Google at any time**.

    To obtain your own API key, simply follow the steps on the `API Keys <http://www.chromium.org/developers/how-tos/api-keys>`__ page at the Chromium Developers site. In the Google Developers Console, Google Speech Recognition is listed as "Speech API".

    The recognition language is determined by ``language``, an RFC5646 language tag like ``"en-US"`` (US English) or ``"fr-FR"`` (International French), defaulting to US English. A list of supported language tags can be found in this `StackOverflow answer <http://stackoverflow.com/a/14302134>`__.

    The profanity filter level can be adjusted with ``pfilter``: 0 - No filter, 1 - Only shows the first character and replaces the rest with asterisks. The default is level 0.

    Returns the most likely transcription if ``show_all`` is false (the default); if ``with_confidence`` is also true, returns a ``(transcription, confidence)`` tuple instead, where the confidence defaults to 0.5 when the API did not report one. If ``show_all`` is true, returns the raw API response as a JSON dictionary (or an empty list when the response contained no result).

    Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection.
    """
    assert isinstance(audio_data, AudioData), "``audio_data`` must be audio data"
    assert key is None or isinstance(key, str), "``key`` must be ``None`` or a string"
    assert isinstance(language, str), "``language`` must be a string"

    flac_data = audio_data.get_flac_data(
        convert_rate=None if audio_data.sample_rate >= 8000 else 8000,  # audio samples must be at least 8 kHz
        convert_width=2  # audio samples must be 16-bit
    )
    if key is None: key = "AIzaSyBOti4mM-6x9WDnZIjIeyEU21OpBXqWBgw"
    url = "http://www.google.com/speech-api/v2/recognize?{}".format(urlencode({
        "client": "chromium",
        "lang": language,
        "key": key,
        "pFilter": pfilter
    }))
    request = Request(url, data=flac_data, headers={"Content-Type": "audio/x-flac; rate={}".format(audio_data.sample_rate)})

    # obtain audio transcription results
    try:
        response = urlopen(request, timeout=self.operation_timeout)
    except HTTPError as e:
        raise RequestError("recognition request failed: {}".format(e.reason))
    except URLError as e:
        raise RequestError("recognition connection failed: {}".format(e.reason))
    # close the connection as soon as the body has been read
    with response:
        response_text = response.read().decode("utf-8")

    # ignore any blank blocks: the first line with a non-empty "result" list wins
    actual_result = []
    for line in response_text.split("\n"):
        if not line: continue
        result: list[Result] = json.loads(line)["result"]
        if len(result) != 0:
            actual_result = result[0]
            break

    # return results
    if show_all:
        return actual_result

    if not isinstance(actual_result, dict) or len(actual_result.get("alternative", [])) == 0: raise UnknownValueError()

    alternatives = actual_result["alternative"]
    # ``alternatives`` is a list of dicts, so the confidence key has to be looked up on each
    # alternative (testing ``"confidence" in alternatives`` is always false).
    scored_alternatives = [alternative for alternative in alternatives if "confidence" in alternative]
    if scored_alternatives:
        # return alternative with highest confidence score
        best_hypothesis: Alternative = max(scored_alternatives, key=lambda alternative: alternative["confidence"])
    else:
        # when there is no confidence available, we arbitrarily choose the first hypothesis.
        best_hypothesis: Alternative = alternatives[0]
    if "transcript" not in best_hypothesis: raise UnknownValueError()
    # https://cloud.google.com/speech-to-text/docs/basics#confidence-values
    # "Your code should not require the confidence field as it is not guaranteed to be accurate, or even set, in any of the results."
    confidence = best_hypothesis.get("confidence", 0.5)
    if with_confidence:
        return best_hypothesis["transcript"], confidence
    return best_hypothesis["transcript"]

def recognize_google_cloud(self, audio_data, credentials_json=None, language="en-US", preferred_phrases=None, show_all=False):
"""
Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Google Cloud Speech API.
Expand Down Expand Up @@ -1512,8 +1436,6 @@ def recognize_whisper(self, audio_data, model="base", show_dict=False, load_opti
return result
else:
return result["text"]

recognize_whisper_api = whisper.recognize_whisper_api

def recognize_vosk(self, audio_data, language='en'):
from vosk import KaldiRecognizer, Model
Expand Down Expand Up @@ -1562,6 +1484,18 @@ def flush(self, *args, **kwargs):
return self._file.flush(*args, **kwargs)


# setup.py runs ``import speech_recognition`` while pip is still installing the package,
# i.e. before the dependencies of the recognizer modules are available, so importing
# them here can fail. In that case the recognizer methods below are simply not attached.
try:
    from .recognizers import google, whisper
except ImportError:  # also covers ModuleNotFoundError, which is a subclass of ImportError
    pass
else:
    Recognizer.recognize_google = google.recognize_legacy
    Recognizer.recognize_whisper_api = whisper.recognize_whisper_api


# ===============================
# backwards compatibility shims
# ===============================
Expand Down
233 changes: 232 additions & 1 deletion speech_recognition/recognizers/google.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,16 @@
from __future__ import annotations

from typing import TypedDict
import json
from typing import Dict, Literal, TypedDict
from urllib.error import HTTPError, URLError
from urllib.parse import urlencode
from urllib.request import Request, urlopen

from typing_extensions import NotRequired

from speech_recognition.audio import AudioData
from speech_recognition.exceptions import RequestError, UnknownValueError


class Alternative(TypedDict):
transcript: str
Expand All @@ -18,3 +25,227 @@ class Result(TypedDict):
class GoogleResponse(TypedDict):
    """Shape of one JSON document (one line) of the speech API response text."""

    # Recognition results carried by this line; may be an empty list.
    result: list[Result]
    # Optional; appears next to a non-empty ``result`` in the sample response
    # used by the ``OutputParser.convert_to_result`` doctest.
    result_index: NotRequired[int]


# Accepted values of the ``pFilter`` query parameter: 0 - no filter,
# 1 - only shows the first character and replaces the rest with asterisks.
ProfanityFilterLevel = Literal[0, 1]
# HTTP header name -> header value, as handed to ``urllib.request.Request``.
RequestHeaders = Dict[str, str]


class RequestBuilder:
    """Build the HTTP request that submits audio to the Google speech API.

    The builder is configured once with the API ``key``, the recognition
    ``language`` and the profanity ``filter_level``; ``build`` then turns an
    ``AudioData`` instance into a ready-to-send ``urllib.request.Request``.
    """

    # NOTE(review): plain HTTP, so the API key and the audio are sent
    # unencrypted -- confirm whether an HTTPS endpoint can be used instead.
    endpoint = "http://www.google.com/speech-api/v2/recognize"

    def __init__(
        self, *, key: str, language: str, filter_level: ProfanityFilterLevel
    ) -> None:
        self.key = key
        self.language = language
        self.filter_level = filter_level

    def build(self, audio_data: AudioData) -> Request:
        """Return the request (URL, headers and FLAC body) for ``audio_data``.

        Raises ``ValueError`` if ``audio_data`` is not an ``AudioData``.
        """
        if not isinstance(audio_data, AudioData):
            raise ValueError("``audio_data`` must be audio data")

        url = self.build_url()
        headers = self.build_headers(audio_data)
        flac_data = self.build_data(audio_data)
        request = Request(url, data=flac_data, headers=headers)
        return request

    def build_url(self) -> str:
        """Return the endpoint URL with the query string for this builder.

        >>> builder = RequestBuilder(key="awesome-key", language="en-US", filter_level=0)
        >>> builder.build_url()
        'http://www.google.com/speech-api/v2/recognize?client=chromium&lang=en-US&key=awesome-key&pFilter=0'
        """
        params = urlencode(
            {
                "client": "chromium",
                "lang": self.language,
                "key": self.key,
                "pFilter": self.filter_level,
            }
        )
        return f"{self.endpoint}?{params}"

    def build_headers(self, audio_data: AudioData) -> RequestHeaders:
        """Return the request headers, advertising the audio's sample rate.

        >>> builder = RequestBuilder(key="", language="", filter_level=1)
        >>> audio_data = AudioData(b"", 16_000, 1)
        >>> builder.build_headers(audio_data)
        {'Content-Type': 'audio/x-flac; rate=16000'}
        """
        rate = audio_data.sample_rate
        headers = {"Content-Type": f"audio/x-flac; rate={rate}"}
        return headers

    def build_data(self, audio_data: AudioData) -> bytes:
        """Return ``audio_data`` encoded as FLAC for the request body."""
        flac_data = audio_data.get_flac_data(
            convert_rate=self.to_convert_rate(audio_data.sample_rate),
            convert_width=2,  # audio samples must be 16-bit
        )
        return flac_data

    @staticmethod
    def to_convert_rate(sample_rate: int) -> int | None:
        """Audio samples must be at least 8 kHz

        Returns ``None`` (no rate conversion requested) when ``sample_rate``
        is already 8000 or more, otherwise 8000.

        >>> RequestBuilder.to_convert_rate(16_000)
        >>> RequestBuilder.to_convert_rate(8_000)
        >>> RequestBuilder.to_convert_rate(7_999)
        8000
        """
        return None if sample_rate >= 8000 else 8000


def create_request_builder(
    *,
    key: str | None = None,
    language: str = "en-US",
    filter_level: ProfanityFilterLevel = 0,
) -> RequestBuilder:
    """Validate the arguments and return a configured ``RequestBuilder``.

    ``language`` is checked first, then ``key``; a ``ValueError`` is raised
    for the first one that has the wrong type. When ``key`` is ``None`` the
    built-in generic API key is used.
    """
    language_ok = isinstance(language, str)
    if not language_ok:
        raise ValueError("``language`` must be a string")

    key_ok = key is None or isinstance(key, str)
    if not key_ok:
        raise ValueError("``key`` must be ``None`` or a string")

    generic_key = "AIzaSyBOti4mM-6x9WDnZIjIeyEU21OpBXqWBgw"
    api_key = generic_key if key is None else key
    return RequestBuilder(
        key=api_key, language=language, filter_level=filter_level
    )


class OutputParser:
    """Turn the response text of the Google speech API into the caller-facing value.

    ``show_all`` selects the raw first result instead of a transcript.
    ``with_confidence`` makes the transcript come back as a
    ``(transcript, confidence)`` tuple.
    """

    def __init__(self, *, show_all: bool, with_confidence: bool) -> None:
        self.show_all = show_all
        self.with_confidence = with_confidence

    def parse(self, response_text: str):
        """Return the recognition outcome contained in ``response_text``.

        With ``show_all`` the first non-empty result is returned untouched, or
        an empty list when the response holds no result at all; this mode does
        not raise ``UnknownValueError``. Otherwise the best transcript is
        returned (paired with its confidence if ``with_confidence`` is set)
        and ``UnknownValueError`` is raised when nothing was recognized.

        >>> OutputParser(show_all=True, with_confidence=False).parse('{"result":[]}')
        []
        >>> OutputParser(show_all=True, with_confidence=False).parse('{"result":[{"foo": "bar"}]}')
        {'foo': 'bar'}
        >>> OutputParser(show_all=False, with_confidence=True).parse('{"result":[{"alternative":[{"transcript":"hello"}]}]}')
        ('hello', 0.5)
        """
        if self.show_all:
            # Raw mode hands back whatever the API sent, without validating it.
            first_result = self._first_result(response_text)
            return [] if first_result is None else first_result

        actual_result = self.convert_to_result(response_text)
        best_hypothesis = self.find_best_hypothesis(
            actual_result["alternative"]
        )
        # https://cloud.google.com/speech-to-text/docs/basics#confidence-values
        # "Your code should not require the confidence field as it is not guaranteed to be accurate, or even set, in any of the results."
        confidence = best_hypothesis.get("confidence", 0.5)
        if self.with_confidence:
            return best_hypothesis["transcript"], confidence
        return best_hypothesis["transcript"]

    @staticmethod
    def _first_result(response_text: str) -> Result | None:
        """Return the first entry of the first non-empty ``result`` list, or ``None``."""
        # The response is one JSON document per line; ignore any blank blocks.
        for line in response_text.split("\n"):
            if not line:
                continue
            result: list[Result] = json.loads(line)["result"]
            if result:
                return result[0]
        return None

    @staticmethod
    def convert_to_result(response_text: str) -> Result:
        r"""Return the first result of ``response_text`` that carries alternatives.

        Raises ``UnknownValueError`` when there is no result, or when the first
        result has no (or an empty) ``alternative`` list.

        >>> response_text = '''{"result":[]}
        ... {"result":[{"alternative":[{"transcript":"one two three","confidence":0.49585345},{"transcript":"1 2","confidence":0.42899391}],"final":true}],"result_index":0}
        ... '''
        >>> OutputParser.convert_to_result(response_text)
        {'alternative': [{'transcript': 'one two three', 'confidence': 0.49585345}, {'transcript': '1 2', 'confidence': 0.42899391}], 'final': True}

        >>> OutputParser.convert_to_result("")
        Traceback (most recent call last):
          ...
        speech_recognition.exceptions.UnknownValueError
        >>> OutputParser.convert_to_result('\n{"result":[]}')
        Traceback (most recent call last):
          ...
        speech_recognition.exceptions.UnknownValueError
        >>> OutputParser.convert_to_result('{"result":[{"foo": "bar"}]}')
        Traceback (most recent call last):
          ...
        speech_recognition.exceptions.UnknownValueError
        >>> OutputParser.convert_to_result('{"result":[{"alternative": []}]}')
        Traceback (most recent call last):
          ...
        speech_recognition.exceptions.UnknownValueError
        """
        first_result = OutputParser._first_result(response_text)
        if first_result is None or len(first_result.get("alternative", [])) == 0:
            raise UnknownValueError()
        return first_result

    @staticmethod
    def find_best_hypothesis(alternatives: list[Alternative]) -> Alternative:
        """Return the alternative with the highest confidence score.

        Alternatives without a ``confidence`` key are not ranked; when none of
        them has one, the first alternative is returned. Raises
        ``UnknownValueError`` if ``alternatives`` is empty or the chosen
        alternative has no ``transcript``.

        >>> alternatives = [{"transcript": "one two three", "confidence": 0.42899391}, {"transcript": "1 2", "confidence": 0.49585345}]
        >>> OutputParser.find_best_hypothesis(alternatives)
        {'transcript': '1 2', 'confidence': 0.49585345}

        >>> OutputParser.find_best_hypothesis([{"transcript": "hello"}, {"transcript": "hallo"}])
        {'transcript': 'hello'}

        >>> alternatives = [{"confidence": 0.49585345}]
        >>> OutputParser.find_best_hypothesis(alternatives)
        Traceback (most recent call last):
          ...
        speech_recognition.exceptions.UnknownValueError
        """
        if not alternatives:
            raise UnknownValueError()

        # ``alternatives`` is a list of dicts, so the key must be looked up on
        # each element rather than on the list itself.
        scored = [
            alternative
            for alternative in alternatives
            if "confidence" in alternative
        ]
        if scored:
            best_hypothesis: Alternative = max(
                scored,
                key=lambda alternative: alternative["confidence"],
            )
        else:
            # when there is no confidence available, we arbitrarily choose the first hypothesis.
            best_hypothesis = alternatives[0]
        if "transcript" not in best_hypothesis:
            raise UnknownValueError()
        return best_hypothesis


def obtain_transcription(request: Request, timeout: int) -> str:
    """Send ``request`` and return the response body decoded as UTF-8.

    ``timeout`` is passed straight to ``urlopen``. Failures to open the URL
    are re-raised as ``RequestError`` (with the original exception chained):
    ``HTTPError`` as "recognition request failed", any other ``URLError`` as
    "recognition connection failed".
    """
    try:
        response = urlopen(request, timeout=timeout)
    except HTTPError as e:
        raise RequestError(
            "recognition request failed: {}".format(e.reason)
        ) from e
    except URLError as e:
        raise RequestError(
            "recognition connection failed: {}".format(e.reason)
        ) from e
    # Release the connection as soon as the body has been read.
    with response:
        return response.read().decode("utf-8")


def recognize_legacy(
    recognizer,
    audio_data: AudioData,
    key: str | None = None,
    language: str = "en-US",
    pfilter: ProfanityFilterLevel = 0,
    show_all: bool = False,
    with_confidence: bool = False,
):
    """
    Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Google Speech Recognition API.

    ``key`` is the Google Speech Recognition API key. When it is omitted, a generic key that works out of the box is used; rely on it for personal or testing purposes only, because it **may be revoked by Google at any time**.

    To obtain your own API key, follow the steps on the `API Keys <http://www.chromium.org/developers/how-tos/api-keys>`__ page at the Chromium Developers site. In the Google Developers Console, Google Speech Recognition is listed as "Speech API".

    ``language`` selects the recognition language. It is an RFC5646 language tag like ``"en-US"`` (US English) or ``"fr-FR"`` (International French) and defaults to US English. A list of supported language tags can be found in this `StackOverflow answer <http://stackoverflow.com/a/14302134>`__.

    ``pfilter`` sets the profanity filter level: 0 - No filter, 1 - Only shows the first character and replaces the rest with asterisks. The default is level 0.

    Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the raw API response as a JSON dictionary.

    Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection.
    """
    # 1. describe the HTTP request for this audio clip
    builder = create_request_builder(
        key=key, language=language, filter_level=pfilter
    )
    http_request = builder.build(audio_data)

    # 2. send it, honouring the recognizer's operation timeout
    raw_text = obtain_transcription(
        http_request, timeout=recognizer.operation_timeout
    )

    # 3. convert the response text into the value the caller asked for
    parser = OutputParser(show_all=show_all, with_confidence=with_confidence)
    return parser.parse(raw_text)
Empty file added tests/recognizers/__init__.py
Empty file.
Loading