Merge pull request #2426 from daspecster/add-speech-async

Add speech asynchronous recognize support.
googleapis · Sep 28, 2016 · 5a0e492 · 5a0e492
2 parents a7cb215 + 08e9e03
commit 5a0e492
Show file tree

Hide file tree

Showing 19 changed files with 815 additions and 71 deletions.
diff --git a/docs/index.rst b/docs/index.rst
@@ -173,6 +173,10 @@
 
   speech-usage
   Client <speech-client>
+  speech-encoding
+  speech-metadata
+  speech-operation
+  speech-transcript
 
 .. toctree::
   :maxdepth: 0

diff --git a/docs/speech-client.rst b/docs/speech-client.rst
@@ -1,5 +1,5 @@
 Speech Client
-================
+=============
 
 .. automodule:: google.cloud.speech.client
   :members:

diff --git a/docs/speech-encoding.rst b/docs/speech-encoding.rst
@@ -0,0 +1,7 @@
+Speech Encoding
+===============
+
+.. automodule:: google.cloud.speech.encoding
+  :members:
+  :undoc-members:
+  :show-inheritance:
diff --git a/docs/speech-metadata.rst b/docs/speech-metadata.rst
@@ -0,0 +1,7 @@
+Speech Metadata
+===============
+
+.. automodule:: google.cloud.speech.metadata
+  :members:
+  :undoc-members:
+  :show-inheritance:
diff --git a/docs/speech-operation.rst b/docs/speech-operation.rst
@@ -0,0 +1,7 @@
+Speech Operation
+================
+
+.. automodule:: google.cloud.speech.operation
+  :members:
+  :undoc-members:
+  :show-inheritance:
diff --git a/docs/speech-transcript.rst b/docs/speech-transcript.rst
@@ -0,0 +1,7 @@
+Speech Transcript
+=================
+
+.. automodule:: google.cloud.speech.transcript
+  :members:
+  :undoc-members:
+  :show-inheritance:
diff --git a/docs/speech-usage.rst b/docs/speech-usage.rst
@@ -2,7 +2,8 @@ Using the API
 =============
 
 The `Google Speech`_ API enables developers to convert audio to text.
-The API recognizes over 80 languages and variants, to support your global user base.
+The API recognizes over 80 languages and variants, to support your global user
+base.
 
 .. warning::
 
@@ -30,11 +31,41 @@ create an instance of :class:`~google.cloud.speech.client.Client`.
      >>> client = speech.Client()
 
 
+Asynchronous Recognition
+------------------------
+
+The :meth:`~google.cloud.speech.Client.async_recognize` method sends audio data
+to the Speech API and initiates a Long Running Operation. Using this operation,
+you can periodically poll for recognition results. Use asynchronous requests
+for audio data of any duration up to 80 minutes.
+
+See: `Speech Asynchronous Recognize`_
+
+
+  .. code-block:: python
+
+      >>> import time
+      >>> operation = client.async_recognize(
+      ...     None, 'gs://my-bucket/recording.flac',
+      ...     'FLAC', 16000, max_alternatives=2)
+      >>> retry_count = 100
+      >>> while retry_count > 0 and not operation.complete:
+      ...     retry_count -= 1
+      ...     time.sleep(10)
+      ...     operation.poll()  # API call
+      >>> operation.complete
+      True
+      >>> operation.results[0].transcript
+      'how old is the Brooklyn Bridge'
+      >>> operation.results[0].confidence
+      0.98267895
+
+
 Synchronous Recognition
 -----------------------
 
-The :meth:`~google.cloud.speech.Client.sync_recognize` method converts speech data to text
-and returns alternative text transcriptons.
+The :meth:`~google.cloud.speech.Client.sync_recognize` method converts speech
+data to text and returns alternative text transcriptions.
 
   .. code-block:: python
 
@@ -53,3 +84,4 @@ and returns alternative text transcriptons.
      confidence: 0
 
 .. _sync_recognize: https://cloud.google.com/speech/reference/rest/v1beta1/speech/syncrecognize
+.. _Speech Asynchronous Recognize: https://cloud.google.com/speech/reference/rest/v1beta1/speech/asyncrecognize
diff --git a/speech/google/cloud/speech/client.py b/speech/google/cloud/speech/client.py
@@ -19,30 +19,8 @@
 from google.cloud._helpers import _to_bytes
 from google.cloud import client as client_module
 from google.cloud.speech.connection import Connection
-
-
-class Encoding(object):
-    """Audio encoding types.
-
-    See:
-    https://cloud.google.com/speech/reference/rest/v1beta1/\
-    RecognitionConfig#AudioEncoding
-    """
-
-    LINEAR16 = 'LINEAR16'
-    """LINEAR16 encoding type."""
-
-    FLAC = 'FLAC'
-    """FLAC encoding type."""
-
-    MULAW = 'MULAW'
-    """MULAW encoding type."""
-
-    AMR = 'AMR'
-    """AMR encoding type."""
-
-    AMR_WB = 'AMR_WB'
-    """AMR_WB encoding type."""
+from google.cloud.speech.encoding import Encoding
+from google.cloud.speech.operation import Operation
 
 
 class Client(client_module.Client):
@@ -68,6 +46,81 @@ class Client(client_module.Client):
 
     _connection_class = Connection
 
+    def async_recognize(self, content, source_uri, encoding, sample_rate,
+                        language_code=None, max_alternatives=None,
+                        profanity_filter=None, speech_context=None):
+        """Asynchronous Recognize request to Google Speech API.
+
+        .. _async_recognize: https://cloud.google.com/speech/reference/\
+                             rest/v1beta1/speech/asyncrecognize
+
+        See `async_recognize`_.
+
+        :type content: bytes
+        :param content: Byte stream of audio.
+
+        :type source_uri: str
+        :param source_uri: URI that points to a file that contains audio
+                           data bytes as specified in RecognitionConfig.
+                           Currently, only Google Cloud Storage URIs are
+                           supported, which must be specified in the following
+                           format: ``gs://bucket_name/object_name``.
+
+        :type encoding: str
+        :param encoding: encoding of audio data sent in all RecognitionAudio
+                         messages, can be one of: :attr:`~.Encoding.LINEAR16`,
+                         :attr:`~.Encoding.FLAC`, :attr:`~.Encoding.MULAW`,
+                         :attr:`~.Encoding.AMR`, :attr:`~.Encoding.AMR_WB`
+
+        :type sample_rate: int
+        :param sample_rate: Sample rate in Hertz of the audio data sent in all
+                            requests. Valid values are: 8000-48000. For best
+                            results, set the sampling rate of the audio source
+                            to 16000 Hz. If that's not possible, use the
+                            native sample rate of the audio source (instead of
+                            re-sampling).
+
+        :type language_code: str
+        :param language_code: (Optional) The language of the supplied audio as
+                              BCP-47 language tag. Example: ``'en-GB'``.
+                              If omitted, defaults to ``'en-US'``.
+
+        :type max_alternatives: int
+        :param max_alternatives: (Optional) Maximum number of recognition
+                                 hypotheses to be returned. The server may
+                                 return fewer than maxAlternatives.
+                                 Valid values are 0-30. A value of 0 or 1
+                                 will return a maximum of 1. Defaults to 1
+
+        :type profanity_filter: bool
+        :param profanity_filter: If True, the server will attempt to filter
+                                 out profanities, replacing all but the
+                                 initial character in each filtered word with
+                                 asterisks, e.g. ``'f***'``. If False or
+                                 omitted, profanities won't be filtered out.
+
+        :type speech_context: list
+        :param speech_context: A list of strings (max 50) containing words and
+                               phrases "hints" so that the speech recognition
+                               is more likely to recognize them. This can be
+                               used to improve the accuracy for specific words
+                               and phrases. This can also be used to add new
+                               words to the vocabulary of the recognizer.
+
+        :rtype: :class:`~google.cloud.speech.operation.Operation`
+        :returns: ``Operation`` for asynchronous request to Google Speech API.
+        """
+
+        data = _build_request_data(content, source_uri, encoding,
+                                   sample_rate, language_code,
+                                   max_alternatives, profanity_filter,
+                                   speech_context)
+
+        api_response = self.connection.api_request(
+            method='POST', path='speech:asyncrecognize', data=data)
+
+        return Operation.from_api_repr(self, api_response)
+
     def sync_recognize(self, content, source_uri, encoding, sample_rate,
                        language_code=None, max_alternatives=None,
                        profanity_filter=None, speech_context=None):
@@ -139,44 +192,115 @@ def sync_recognize(self, content, source_uri, encoding, sample_rate,
                     between 0 and 1.
         """
 
-        if content is None and source_uri is None:
-            raise ValueError('content and source_uri cannot be both '
-                             'equal to None')
-
-        if content is not None and source_uri is not None:
-            raise ValueError('content and source_uri cannot be both '
-                             'different from None')
+        data = _build_request_data(content, source_uri, encoding,
+                                   sample_rate, language_code,
+                                   max_alternatives, profanity_filter,
+                                   speech_context)
 
-        if encoding is None:
-            raise ValueError('encoding cannot be None')
-        if sample_rate is None:
-            raise ValueError('sample_rate cannot be None')
+        api_response = self.connection.api_request(
+            method='POST', path='speech:syncrecognize', data=data)
 
-        if content is not None:
-            audio = {'content': b64encode(_to_bytes(content))}
+        if len(api_response['results']) == 1:
+            return api_response['results'][0]['alternatives']
         else:
-            audio = {'uri': source_uri}
+            raise ValueError('result in api should have length 1')
 
-        config = {'encoding': encoding, 'sampleRate': sample_rate}
 
-        if language_code is not None:
-            config['languageCode'] = language_code
-        if max_alternatives is not None:
-            config['maxAlternatives'] = max_alternatives
-        if profanity_filter is not None:
-            config['profanityFilter'] = profanity_filter
-        if speech_context is not None:
-            config['speechContext'] = {'phrases': speech_context}
+def _build_request_data(content, source_uri, encoding, sample_rate,
+                        language_code=None, max_alternatives=None,
+                        profanity_filter=None, speech_context=None):
+    """Builds the request data before making API request.
+
+    :type content: bytes
+    :param content: Byte stream of audio.
+
+    :type source_uri: str
+    :param source_uri: URI that points to a file that contains audio
+                       data bytes as specified in RecognitionConfig.
+                       Currently, only Google Cloud Storage URIs are
+                       supported, which must be specified in the following
+                       format: ``gs://bucket_name/object_name``.
+
+    :type encoding: str
+    :param encoding: encoding of audio data sent in all RecognitionAudio
+                     messages, can be one of: :attr:`~.Encoding.LINEAR16`,
+                     :attr:`~.Encoding.FLAC`, :attr:`~.Encoding.MULAW`,
+                     :attr:`~.Encoding.AMR`, :attr:`~.Encoding.AMR_WB`
+
+    :type sample_rate: int
+    :param sample_rate: Sample rate in Hertz of the audio data sent in all
+                        requests. Valid values are: 8000-48000. For best
+                        results, set the sampling rate of the audio source
+                        to 16000 Hz. If that's not possible, use the
+                        native sample rate of the audio source (instead of
+                        re-sampling).
+
+    :type language_code: str
+    :param language_code: (Optional) The language of the supplied audio as
+                          BCP-47 language tag. Example: ``'en-GB'``.
+                          If omitted, defaults to ``'en-US'``.
+
+    :type max_alternatives: int
+    :param max_alternatives: (Optional) Maximum number of recognition
+                             hypotheses to be returned. The server may
+                             return fewer than maxAlternatives.
+                             Valid values are 0-30. A value of 0 or 1
+                             will return a maximum of 1. Defaults to 1
+
+    :type profanity_filter: bool
+    :param profanity_filter: If True, the server will attempt to filter
+                             out profanities, replacing all but the
+                             initial character in each filtered word with
+                             asterisks, e.g. ``'f***'``. If False or
+                             omitted, profanities won't be filtered out.
+
+    :type speech_context: list
+    :param speech_context: A list of strings (max 50) containing words and
+                           phrases "hints" so that the speech recognition
+                           is more likely to recognize them. This can be
+                           used to improve the accuracy for specific words
+                           and phrases. This can also be used to add new
+                           words to the vocabulary of the recognizer.
+
+    :rtype: dict
+    :returns: Dictionary with required data for Google Speech API.
+    """
+    if content is None and source_uri is None:
+        raise ValueError('content and source_uri cannot be both '
+                         'equal to None')
 
-        data = {
-            'audio': audio,
-            'config': config,
-        }
+    if content is not None and source_uri is not None:
+        raise ValueError('content and source_uri cannot be both '
+                         'different from None')
 
-        api_response = self.connection.api_request(
-            method='POST', path='syncrecognize', data=data)
+    if encoding is None:
+        raise ValueError('encoding cannot be None')
 
-        if len(api_response['results']) == 1:
-            return api_response['results'][0]['alternatives']
-        else:
-            raise ValueError('result in api should have length 1')
+    encoding_value = getattr(Encoding, encoding)
+
+    if sample_rate is None:
+        raise ValueError('sample_rate cannot be None')
+
+    if content is not None:
+        audio = {'content': b64encode(_to_bytes(content))}
+    else:
+        audio = {'uri': source_uri}
+
+    config = {'encoding': encoding_value,
+              'sampleRate': sample_rate}
+
+    if language_code is not None:
+        config['languageCode'] = language_code
+    if max_alternatives is not None:
+        config['maxAlternatives'] = max_alternatives
+    if profanity_filter is not None:
+        config['profanityFilter'] = profanity_filter
+    if speech_context is not None:
+        config['speechContext'] = {'phrases': speech_context}
+
+    data = {
+        'audio': audio,
+        'config': config,
+    }
+
+    return data
diff --git a/speech/google/cloud/speech/connection.py b/speech/google/cloud/speech/connection.py
@@ -26,7 +26,7 @@ class Connection(base_connection.JSONConnection):
     API_VERSION = 'v1beta1'
     """The version of the API, used in building the API call's URL."""
 
-    API_URL_TEMPLATE = '{api_base_url}/{api_version}/speech:{path}'
+    API_URL_TEMPLATE = '{api_base_url}/{api_version}/{path}'
     """A template for the URL of a particular API call."""
 
     SCOPE = ('https://www.googleapis.com/auth/cloud-platform',)