feat: add recognition metadata (#352)
yoshi-automation authored and JustinBeckwith committed May 14, 2019
1 parent 60c7ceb commit 641d812
Showing 3 changed files with 373 additions and 5 deletions.
@@ -19,9 +19,7 @@ package google.cloud.speech.v1;

import "google/api/annotations.proto";
import "google/longrunning/operations.proto";
import "google/protobuf/any.proto";
import "google/protobuf/duration.proto";
import "google/protobuf/empty.proto";
import "google/protobuf/timestamp.proto";
import "google/rpc/status.proto";

@@ -278,6 +276,9 @@ message RecognitionConfig {
// premium feature.
bool enable_automatic_punctuation = 11;

// *Optional* Metadata regarding this request.
RecognitionMetadata metadata = 9;

// *Optional* Which model to select for the given request. Select the model
// best suited to your domain to get best results. If a model is not
// explicitly specified, then we auto-select a model based on the parameters
@@ -330,6 +331,133 @@ message RecognitionConfig {
bool use_enhanced = 14;
}

// Description of audio data to be recognized.
message RecognitionMetadata {
// Use case categories that the audio recognition request can be described
// by.
enum InteractionType {
// Use case is either unknown or is something other than one of the other
// values below.
INTERACTION_TYPE_UNSPECIFIED = 0;

// Multiple people in a conversation or discussion. For example, in a
// meeting with two or more people actively participating. Typically
// all the primary people speaking would be in the same room (if not,
// see PHONE_CALL).
DISCUSSION = 1;

// One or more persons lecturing or presenting to others, mostly
// uninterrupted.
PRESENTATION = 2;

// A phone-call or video-conference in which two or more people, who are
// not in the same room, are actively participating.
PHONE_CALL = 3;

// A recorded message intended for another person to listen to.
VOICEMAIL = 4;

// Professionally produced audio (e.g. a TV show or podcast).
PROFESSIONALLY_PRODUCED = 5;

// Transcribe spoken questions and queries into text.
VOICE_SEARCH = 6;

// Transcribe voice commands, such as for controlling a device.
VOICE_COMMAND = 7;

// Transcribe speech to text to create a written document, such as a
// text-message, email or report.
DICTATION = 8;
}

// The use case most closely describing the audio content to be recognized.
InteractionType interaction_type = 1;

// The industry vertical to which this speech recognition request most
// closely applies. This is most indicative of the topics contained
// in the audio. Use the 6-digit NAICS code to identify the industry
// vertical - see https://www.naics.com/search/.
uint32 industry_naics_code_of_audio = 3;

// Enumerates the types of capture settings describing an audio file.
enum MicrophoneDistance {
// Audio type is not known.
MICROPHONE_DISTANCE_UNSPECIFIED = 0;

// The audio was captured from a closely placed microphone, e.g. a phone,
// dictaphone, or handheld microphone. Generally, the speaker is within
// 1 meter of the microphone.
NEARFIELD = 1;

// The speaker is within 3 meters of the microphone.
MIDFIELD = 2;

// The speaker is more than 3 meters away from the microphone.
FARFIELD = 3;
}

// The audio type that most closely describes the audio being recognized.
MicrophoneDistance microphone_distance = 4;

// The original media the speech was recorded on.
enum OriginalMediaType {
// Unknown original media type.
ORIGINAL_MEDIA_TYPE_UNSPECIFIED = 0;

// The speech data is an audio recording.
AUDIO = 1;

// The speech data originally recorded on a video.
VIDEO = 2;
}

// The original media the speech was recorded on.
OriginalMediaType original_media_type = 5;

// The type of device the speech was recorded with.
enum RecordingDeviceType {
// The recording device is unknown.
RECORDING_DEVICE_TYPE_UNSPECIFIED = 0;

// Speech was recorded on a smartphone.
SMARTPHONE = 1;

// Speech was recorded using a personal computer or tablet.
PC = 2;

// Speech was recorded over a phone line.
PHONE_LINE = 3;

// Speech was recorded in a vehicle.
VEHICLE = 4;

// Speech was recorded outdoors.
OTHER_OUTDOOR_DEVICE = 5;

// Speech was recorded indoors.
OTHER_INDOOR_DEVICE = 6;
}

// The type of device the speech was recorded with.
RecordingDeviceType recording_device_type = 6;

// The device used to make the recording. Examples: 'Nexus 5X',
// 'Polycom SoundStation IP 6000', 'POTS', 'VoIP', or
// 'Cardioid Microphone'.
string recording_device_name = 7;

// Mime type of the original audio file. For example `audio/m4a`,
// `audio/x-alaw-basic`, `audio/mp3`, `audio/3gpp`.
// A list of possible audio mime types is maintained at
// http://www.iana.org/assignments/media-types/media-types.xhtml#audio
string original_mime_type = 8;

// Description of the content, e.g. "Recordings of federal supreme court
// hearings from 2012".
string audio_topic = 10;
}
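
For context on how the new message plugs into a request, here is a minimal usage sketch against the 2019-era Python client (google-cloud-speech 1.x), where generated messages live under speech_v1.types and enums under speech_v1.enums. The field and enum names mirror the proto above; the NAICS code, bucket URI, and audio details are illustrative assumptions, not part of this commit.

# Hypothetical sketch: attaching the new RecognitionMetadata to a
# RecognitionConfig. Values marked "illustrative" are assumptions.
from google.cloud import speech_v1 as speech
from google.cloud.speech_v1 import enums, types

client = speech.SpeechClient()

metadata = types.RecognitionMetadata(
    interaction_type=enums.RecognitionMetadata.InteractionType.PHONE_CALL,
    industry_naics_code_of_audio=518210,  # illustrative NAICS code (data hosting)
    microphone_distance=enums.RecognitionMetadata.MicrophoneDistance.NEARFIELD,
    original_media_type=enums.RecognitionMetadata.OriginalMediaType.AUDIO,
    recording_device_type=enums.RecognitionMetadata.RecordingDeviceType.PHONE_LINE,
    original_mime_type="audio/x-alaw-basic",  # describes the original recording
    audio_topic="customer support call",      # illustrative description
)

config = types.RecognitionConfig(
    encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=8000,
    language_code="en-US",
    metadata=metadata,  # RecognitionConfig.metadata, field 9 added in this commit
)
audio = types.RecognitionAudio(uri="gs://my-bucket/call.raw")  # hypothetical URI

response = client.recognize(config, audio)
for result in response.results:
    print(result.alternatives[0].transcript)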

// Provides "hints" to the speech recognizer to favor specific words and phrases
// in the results.
message SpeechContext {
@@ -504,10 +632,20 @@ message StreamingRecognitionResult {
// The default of 0.0 is a sentinel value indicating `stability` was not set.
float stability = 3;

// Output only. Time offset of the end of this result relative to the
// beginning of the audio.
google.protobuf.Duration result_end_time = 4;

// For multi-channel audio, this is the channel number corresponding to the
// recognized result for the audio from that channel.
// For audio_channel_count = N, its output values can range from '1' to 'N'.
int32 channel_tag = 5;

// Output only. The
// [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag of the
// language in this result. This language code was detected to have the most
// likelihood of being spoken in the audio.
string language_code = 6;
}
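
And a rough sketch of surfacing the new StreamingRecognitionResult fields (result_end_time, channel_tag, language_code) through the same client's streaming helper; the config values, chunking, and file path are assumptions for illustration only.

# Hypothetical sketch: reading the new streaming-result fields with the
# v1 streaming_recognize helper. File path and config values are made up.
from google.cloud import speech_v1 as speech
from google.cloud.speech_v1 import enums, types

client = speech.SpeechClient()

streaming_config = types.StreamingRecognitionConfig(
    config=types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
        audio_channel_count=2,
        enable_separate_recognition_per_channel=True,
    ),
    interim_results=False,
)

def audio_requests(path, chunk_size=32 * 1024):
    # Yield the raw audio in chunks, as the streaming API expects.
    with open(path, "rb") as audio_file:
        while True:
            chunk = audio_file.read(chunk_size)
            if not chunk:
                return
            yield types.StreamingRecognizeRequest(audio_content=chunk)

responses = client.streaming_recognize(streaming_config, audio_requests("call.raw"))
for response in responses:
    for result in response.results:
        if not result.is_final:
            continue
        # result_end_time is a protobuf Duration measured from the start of
        # the audio; language_code is the BCP-47 tag detected for this result.
        end_seconds = result.result_end_time.seconds + result.result_end_time.nanos / 1e9
        print("[{}] channel {} ends at {:.2f}s: {}".format(
            result.language_code,
            result.channel_tag,
            end_seconds,
            result.alternatives[0].transcript,
        ))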

// A speech recognition result corresponding to a portion of the audio.