feat: add recognition metadata (#352)
yoshi-automation authored and JustinBeckwith committed May 14, 2019
1 parent 60c7ceb commit 641d812
Showing 3 changed files with 373 additions and 5 deletions.
@@ -19,9 +19,7 @@ package google.cloud.speech.v1;

import "google/api/annotations.proto";
import "google/longrunning/operations.proto";
import "google/protobuf/any.proto";
import "google/protobuf/duration.proto";
import "google/protobuf/empty.proto";
import "google/protobuf/timestamp.proto";
import "google/rpc/status.proto";

@@ -278,6 +276,9 @@ message RecognitionConfig {
// premium feature.
bool enable_automatic_punctuation = 11;

// *Optional* Metadata regarding this request.
RecognitionMetadata metadata = 9;

// *Optional* Which model to select for the given request. Select the model
// best suited to your domain to get best results. If a model is not
// explicitly specified, then we auto-select a model based on the parameters
@@ -330,6 +331,133 @@ message RecognitionConfig {
bool use_enhanced = 14;
}

// Description of audio data to be recognized.
message RecognitionMetadata {
// Use case categories that the audio recognition request can be described
// by.
enum InteractionType {
// Use case is either unknown or is something other than one of the other
// values below.
INTERACTION_TYPE_UNSPECIFIED = 0;

// Multiple people in a conversation or discussion. For example, in a
// meeting with two or more people actively participating. Typically
// all the primary people speaking would be in the same room (if not,
// see PHONE_CALL).
DISCUSSION = 1;

// One or more persons lecturing or presenting to others, mostly
// uninterrupted.
PRESENTATION = 2;

// A phone-call or video-conference in which two or more people, who are
// not in the same room, are actively participating.
PHONE_CALL = 3;

// A recorded message intended for another person to listen to.
VOICEMAIL = 4;

// Professionally produced audio (e.g. a TV show or podcast).
PROFESSIONALLY_PRODUCED = 5;

// Transcribe spoken questions and queries into text.
VOICE_SEARCH = 6;

// Transcribe voice commands, such as for controlling a device.
VOICE_COMMAND = 7;

// Transcribe speech to text to create a written document, such as a
// text-message, email or report.
DICTATION = 8;
}

// The use case most closely describing the audio content to be recognized.
InteractionType interaction_type = 1;

// The industry vertical to which this speech recognition request most
// closely applies. This is most indicative of the topics contained
// in the audio. Use the 6-digit NAICS code to identify the industry
// vertical - see https://www.naics.com/search/.
uint32 industry_naics_code_of_audio = 3;

// Enumerates the types of capture settings describing an audio file.
enum MicrophoneDistance {
// Audio type is not known.
MICROPHONE_DISTANCE_UNSPECIFIED = 0;

// The audio was captured from a closely placed microphone, e.g. a phone,
// dictaphone, or handheld microphone. Generally, the speaker is within
// 1 meter of the microphone.
NEARFIELD = 1;

// The speaker is within 3 meters of the microphone.
MIDFIELD = 2;

// The speaker is more than 3 meters away from the microphone.
FARFIELD = 3;
}

// The audio type that most closely describes the audio being recognized.
MicrophoneDistance microphone_distance = 4;

// The original media the speech was recorded on.
enum OriginalMediaType {
// Unknown original media type.
ORIGINAL_MEDIA_TYPE_UNSPECIFIED = 0;

// The speech data is an audio recording.
AUDIO = 1;

// The speech data originally recorded on a video.
VIDEO = 2;
}

// The original media the speech was recorded on.
OriginalMediaType original_media_type = 5;

// The type of device the speech was recorded with.
enum RecordingDeviceType {
// The recording device is unknown.
RECORDING_DEVICE_TYPE_UNSPECIFIED = 0;

// Speech was recorded on a smartphone.
SMARTPHONE = 1;

// Speech was recorded using a personal computer or tablet.
PC = 2;

// Speech was recorded over a phone line.
PHONE_LINE = 3;

// Speech was recorded in a vehicle.
VEHICLE = 4;

// Speech was recorded outdoors.
OTHER_OUTDOOR_DEVICE = 5;

// Speech was recorded indoors.
OTHER_INDOOR_DEVICE = 6;
}

// The type of device the speech was recorded with.
RecordingDeviceType recording_device_type = 6;

// The device used to make the recording. Examples: 'Nexus 5X',
// 'Polycom SoundStation IP 6000', 'POTS', 'VoIP', or
// 'Cardioid Microphone'.
string recording_device_name = 7;

// Mime type of the original audio file. For example `audio/m4a`,
// `audio/x-alaw-basic`, `audio/mp3`, `audio/3gpp`.
// A list of possible audio mime types is maintained at
// http://www.iana.org/assignments/media-types/media-types.xhtml#audio
string original_mime_type = 8;

// Description of the content, e.g. "Recordings of federal supreme court
// hearings from 2012".
string audio_topic = 10;
}
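
For context on how the new message plugs into a request, here is a minimal usage sketch against the 2019-era Python client (google-cloud-speech 1.x), where generated messages live under speech_v1.types and enums under speech_v1.enums. The field and enum names mirror the proto above; the NAICS code, bucket URI, and audio details are illustrative assumptions, not part of this commit.

# Hypothetical sketch: attaching the new RecognitionMetadata to a
# RecognitionConfig. Values marked "illustrative" are assumptions.
from google.cloud import speech_v1 as speech
from google.cloud.speech_v1 import enums, types

client = speech.SpeechClient()

metadata = types.RecognitionMetadata(
    interaction_type=enums.RecognitionMetadata.InteractionType.PHONE_CALL,
    industry_naics_code_of_audio=518210,  # illustrative NAICS code (data hosting)
    microphone_distance=enums.RecognitionMetadata.MicrophoneDistance.NEARFIELD,
    original_media_type=enums.RecognitionMetadata.OriginalMediaType.AUDIO,
    recording_device_type=enums.RecognitionMetadata.RecordingDeviceType.PHONE_LINE,
    original_mime_type="audio/x-alaw-basic",  # describes the original recording
    audio_topic="customer support call",      # illustrative description
)

config = types.RecognitionConfig(
    encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=8000,
    language_code="en-US",
    metadata=metadata,  # RecognitionConfig.metadata, field 9 added in this commit
)
audio = types.RecognitionAudio(uri="gs://my-bucket/call.raw")  # hypothetical URI

response = client.recognize(config, audio)
for result in response.results:
    print(result.alternatives[0].transcript)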

// Provides "hints" to the speech recognizer to favor specific words and phrases
// in the results.
message SpeechContext {
@@ -504,10 +632,20 @@ message StreamingRecognitionResult {
// The default of 0.0 is a sentinel value indicating `stability` was not set.
float stability = 3;

// Output only. Time offset of the end of this result relative to the
// beginning of the audio.
google.protobuf.Duration result_end_time = 4;

// For multi-channel audio, this is the channel number corresponding to the
// recognized result for the audio from that channel.
// For audio_channel_count = N, its output values can range from '1' to 'N'.
int32 channel_tag = 5;

// Output only. The
// [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag of the
// language in this result. This language code was detected to have the most
// likelihood of being spoken in the audio.
string language_code = 6;
}
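
And a rough sketch of surfacing the new StreamingRecognitionResult fields (result_end_time, channel_tag, language_code) through the same client's streaming helper; the config values, chunking, and file path are assumptions for illustration only.

# Hypothetical sketch: reading the new streaming-result fields with the
# v1 streaming_recognize helper. File path and config values are made up.
from google.cloud import speech_v1 as speech
from google.cloud.speech_v1 import enums, types

client = speech.SpeechClient()

streaming_config = types.StreamingRecognitionConfig(
    config=types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
        audio_channel_count=2,
        enable_separate_recognition_per_channel=True,
    ),
    interim_results=False,
)

def audio_requests(path, chunk_size=32 * 1024):
    # Yield the raw audio in chunks, as the streaming API expects.
    with open(path, "rb") as audio_file:
        while True:
            chunk = audio_file.read(chunk_size)
            if not chunk:
                return
            yield types.StreamingRecognizeRequest(audio_content=chunk)

responses = client.streaming_recognize(streaming_config, audio_requests("call.raw"))
for response in responses:
    for result in response.results:
        if not result.is_final:
            continue
        # result_end_time is a protobuf Duration measured from the start of
        # the audio; language_code is the BCP-47 tag detected for this result.
        end_seconds = result.result_end_time.seconds + result.result_end_time.nanos / 1e9
        print("[{}] channel {} ends at {:.2f}s: {}".format(
            result.language_code,
            result.channel_tag,
            end_seconds,
            result.alternatives[0].transcript,
        ))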

// A speech recognition result corresponding to a portion of the audio.