From ac39fe0ea1dd1967b909e708348ffb8076aea7a5 Mon Sep 17 00:00:00 2001 From: SDKAuto Date: Wed, 20 Sep 2023 17:53:08 +0000 Subject: [PATCH] CodeGen from PR 25734 in Azure/azure-rest-api-specs Azure OpenAI: minimal, partial specification for Whisper transcription/translation (#25734) * minimalistic whisper .tsp definitions * merge, format, remove client.tsp changes for mvp simplicity * speculative example JSON update for string response types * restore header traits for swagger hints * review pass, prioritize object response for OpenAPI v2 * PR feedback: fully distinguish transcription/translation models --- .../azure/ai/openai/OpenAIAsyncClient.java | 5 +- .../com/azure/ai/openai/OpenAIClient.java | 5 +- .../ai/openai/models/AudioTranslation.java | 119 ++++++++ .../openai/models/AudioTranslationFormat.java | 65 +++++ .../models/AudioTranslationOptions.java | 46 +-- .../models/AudioTranslationSegment.java | 261 ++++++++++++++++++ sdk/openai/azure-ai-openai/tsp-location.yaml | 6 +- 7 files changed, 477 insertions(+), 30 deletions(-) create mode 100644 sdk/openai/azure-ai-openai/src/main/java/com/azure/ai/openai/models/AudioTranslation.java create mode 100644 sdk/openai/azure-ai-openai/src/main/java/com/azure/ai/openai/models/AudioTranslationFormat.java create mode 100644 sdk/openai/azure-ai-openai/src/main/java/com/azure/ai/openai/models/AudioTranslationSegment.java diff --git a/sdk/openai/azure-ai-openai/src/main/java/com/azure/ai/openai/OpenAIAsyncClient.java b/sdk/openai/azure-ai-openai/src/main/java/com/azure/ai/openai/OpenAIAsyncClient.java index b95e243439bdb..7866ef6a0513e 100644 --- a/sdk/openai/azure-ai-openai/src/main/java/com/azure/ai/openai/OpenAIAsyncClient.java +++ b/sdk/openai/azure-ai-openai/src/main/java/com/azure/ai/openai/OpenAIAsyncClient.java @@ -14,6 +14,7 @@ import com.azure.ai.openai.models.AudioTranscription; import com.azure.ai.openai.models.AudioTranscriptionFormat; import com.azure.ai.openai.models.AudioTranscriptionOptions; +import 
com.azure.ai.openai.models.AudioTranslation; import com.azure.ai.openai.models.AudioTranslationOptions; import com.azure.ai.openai.models.ChatCompletions; import com.azure.ai.openai.models.ChatCompletionsOptions; @@ -1261,14 +1262,14 @@ public Mono<String> getAudioTranscriptionAsPlainText( */ @Generated @ServiceMethod(returns = ReturnType.SINGLE) - public Mono<AudioTranscription> getAudioTranslationAsResponseObject( + public Mono<AudioTranslation> getAudioTranslationAsResponseObject( String deploymentOrModelName, AudioTranslationOptions audioTranslationOptions) { // Generated convenience method for getAudioTranslationAsResponseObjectWithResponse RequestOptions requestOptions = new RequestOptions(); return getAudioTranslationAsResponseObjectWithResponse( deploymentOrModelName, BinaryData.fromObject(audioTranslationOptions), requestOptions) .flatMap(FluxUtil::toMono) - .map(protocolMethodData -> protocolMethodData.toObject(AudioTranscription.class)); + .map(protocolMethodData -> protocolMethodData.toObject(AudioTranslation.class)); } /** diff --git a/sdk/openai/azure-ai-openai/src/main/java/com/azure/ai/openai/OpenAIClient.java b/sdk/openai/azure-ai-openai/src/main/java/com/azure/ai/openai/OpenAIClient.java index d38b041de3301..2bb1951d4d809 100644 --- a/sdk/openai/azure-ai-openai/src/main/java/com/azure/ai/openai/OpenAIClient.java +++ b/sdk/openai/azure-ai-openai/src/main/java/com/azure/ai/openai/OpenAIClient.java @@ -12,6 +12,7 @@ import com.azure.ai.openai.models.AudioTranscription; import com.azure.ai.openai.models.AudioTranscriptionFormat; import com.azure.ai.openai.models.AudioTranscriptionOptions; +import com.azure.ai.openai.models.AudioTranslation; import com.azure.ai.openai.models.AudioTranslationOptions; import com.azure.ai.openai.models.ChatCompletions; import com.azure.ai.openai.models.ChatCompletionsOptions; @@ -1242,14 +1243,14 @@ public String getAudioTranscriptionAsPlainText( */ @Generated @ServiceMethod(returns = ReturnType.SINGLE) - public AudioTranscription getAudioTranslationAsResponseObject( 
+ public AudioTranslation getAudioTranslationAsResponseObject( String deploymentOrModelName, AudioTranslationOptions audioTranslationOptions) { // Generated convenience method for getAudioTranslationAsResponseObjectWithResponse RequestOptions requestOptions = new RequestOptions(); return getAudioTranslationAsResponseObjectWithResponse( deploymentOrModelName, BinaryData.fromObject(audioTranslationOptions), requestOptions) .getValue() - .toObject(AudioTranscription.class); + .toObject(AudioTranslation.class); } /** diff --git a/sdk/openai/azure-ai-openai/src/main/java/com/azure/ai/openai/models/AudioTranslation.java b/sdk/openai/azure-ai-openai/src/main/java/com/azure/ai/openai/models/AudioTranslation.java new file mode 100644 index 0000000000000..e428de21efdcc --- /dev/null +++ b/sdk/openai/azure-ai-openai/src/main/java/com/azure/ai/openai/models/AudioTranslation.java @@ -0,0 +1,119 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. +// Code generated by Microsoft (R) AutoRest Code Generator. + +package com.azure.ai.openai.models; + +import com.azure.core.annotation.Generated; +import com.azure.core.annotation.Immutable; +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import java.time.Duration; +import java.util.List; + +/** Result information for an operation that translated spoken audio into written text. */ +@Immutable +public final class AudioTranslation { + /* + * The translated text for the provided audio data. + */ + @Generated + @JsonProperty(value = "text") + private String text; + + /* + * The label that describes which operation type generated the accompanying response data. + */ + @Generated + @JsonProperty(value = "task") + private AudioTaskLabel task; + + /* + * The spoken language that was detected in the translated audio data. + * This is expressed as a two-letter ISO-639-1 language code like 'en' or 'fr'. 
+ */ + @Generated + @JsonProperty(value = "language") + private String language; + + /* + * The total duration of the audio processed to produce accompanying translation information. + */ + @Generated + @JsonProperty(value = "duration") + private Double duration; + + /* + * A collection of information about the timing, probabilities, and other detail of each processed audio segment. + */ + @Generated + @JsonProperty(value = "segments") + private List<AudioTranslationSegment> segments; + + /** + * Creates an instance of AudioTranslation class. + * + * @param text the text value to set. + */ + @Generated + @JsonCreator + private AudioTranslation(@JsonProperty(value = "text") String text) { + this.text = text; + } + + /** + * Get the text property: The translated text for the provided audio data. + * + * @return the text value. + */ + @Generated + public String getText() { + return this.text; + } + + /** + * Get the task property: The label that describes which operation type generated the accompanying response data. + * + * @return the task value. + */ + @Generated + public AudioTaskLabel getTask() { + return this.task; + } + + /** + * Get the language property: The spoken language that was detected in the translated audio data. This is expressed + * as a two-letter ISO-639-1 language code like 'en' or 'fr'. + * + * @return the language value. + */ + @Generated + public String getLanguage() { + return this.language; + } + + /** + * Get the duration property: The total duration of the audio processed to produce accompanying translation + * information. + * + * @return the duration value. + */ + @Generated + public Duration getDuration() { + if (this.duration == null) { + return null; + } + return Duration.ofNanos((long) (this.duration * 1000_000_000L)); + } + + /** + * Get the segments property: A collection of information about the timing, probabilities, and other detail of each + * processed audio segment. + * + * @return the segments value. 
+ */ + @Generated + public List<AudioTranslationSegment> getSegments() { + return this.segments; + } +} diff --git a/sdk/openai/azure-ai-openai/src/main/java/com/azure/ai/openai/models/AudioTranslationFormat.java b/sdk/openai/azure-ai-openai/src/main/java/com/azure/ai/openai/models/AudioTranslationFormat.java new file mode 100644 index 0000000000000..786fc9399f34e --- /dev/null +++ b/sdk/openai/azure-ai-openai/src/main/java/com/azure/ai/openai/models/AudioTranslationFormat.java @@ -0,0 +1,65 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. +// Code generated by Microsoft (R) AutoRest Code Generator. + +package com.azure.ai.openai.models; + +import com.azure.core.annotation.Generated; +import com.azure.core.util.ExpandableStringEnum; +import com.fasterxml.jackson.annotation.JsonCreator; +import java.util.Collection; + +/** Defines available options for the underlying response format of output translation information. */ +public final class AudioTranslationFormat extends ExpandableStringEnum<AudioTranslationFormat> { + /** Use a response body that is a JSON object containing a single 'text' field for the translation. */ + @Generated public static final AudioTranslationFormat JSON = fromString("json"); + + /** + * Use a response body that is a JSON object containing translation text along with timing, segments, and other + * metadata. + */ + @Generated public static final AudioTranslationFormat VERBOSE_JSON = fromString("verbose_json"); + + /** Use a response body that is plain text containing the raw, unannotated translation. */ + @Generated public static final AudioTranslationFormat TEXT = fromString("text"); + + /** Use a response body that is plain text in SubRip (SRT) format that also includes timing information. */ + @Generated public static final AudioTranslationFormat SRT = fromString("srt"); + + /** + * Use a response body that is plain text in Web Video Text Tracks (VTT) format that also includes timing + * information. 
+ */ + @Generated public static final AudioTranslationFormat VTT = fromString("vtt"); + + /** + * Creates a new instance of AudioTranslationFormat value. + * + * @deprecated Use the {@link #fromString(String)} factory method. + */ + @Generated + @Deprecated + public AudioTranslationFormat() {} + + /** + * Creates or finds a AudioTranslationFormat from its string representation. + * + * @param name a name to look for. + * @return the corresponding AudioTranslationFormat. + */ + @Generated + @JsonCreator + public static AudioTranslationFormat fromString(String name) { + return fromString(name, AudioTranslationFormat.class); + } + + /** + * Gets known AudioTranslationFormat values. + * + * @return known AudioTranslationFormat values. + */ + @Generated + public static Collection<AudioTranslationFormat> values() { + return values(AudioTranslationFormat.class); + } +} diff --git a/sdk/openai/azure-ai-openai/src/main/java/com/azure/ai/openai/models/AudioTranslationOptions.java b/sdk/openai/azure-ai-openai/src/main/java/com/azure/ai/openai/models/AudioTranslationOptions.java index 65f7b1f873ad0..8f883813827bf 100644 --- a/sdk/openai/azure-ai-openai/src/main/java/com/azure/ai/openai/models/AudioTranslationOptions.java +++ b/sdk/openai/azure-ai-openai/src/main/java/com/azure/ai/openai/models/AudioTranslationOptions.java @@ -14,7 +14,7 @@ public final class AudioTranslationOptions { /* - * The audio data to transcribe. This must be the binary content of a file in one of the supported media formats: + * The audio data to translate. This must be the binary content of a file in one of the supported media formats: * flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, webm. */ @Generated @@ -22,12 +22,12 @@ public final class AudioTranslationOptions { private byte[] file; /* - * The requested format of the transcription response data, which will influence the content and detail of the + * The requested format of the translation response data, which will influence the content and detail of the * result. 
*/ @Generated @JsonProperty(value = "response_format") - private AudioTranscriptionFormat responseFormat; + private AudioTranslationFormat responseFormat; /* * An optional hint to guide the model's style or continue from a prior audio segment. The written language of the @@ -49,7 +49,7 @@ public final class AudioTranslationOptions { private Double temperature; /* - * The model to use for this transcription request. + * The model to use for this translation request. */ @Generated @JsonProperty(value = "model") @@ -67,7 +67,7 @@ public AudioTranslationOptions(@JsonProperty(value = "file") byte[] file) { } /** - * Get the file property: The audio data to transcribe. This must be the binary content of a file in one of the + * Get the file property: The audio data to translate. This must be the binary content of a file in one of the * supported media formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, webm. * * @return the file value. @@ -78,29 +78,16 @@ public byte[] getFile() { } /** - * Get the responseFormat property: The requested format of the transcription response data, which will influence - * the content and detail of the result. + * Get the responseFormat property: The requested format of the translation response data, which will influence the + * content and detail of the result. * * @return the responseFormat value. */ @Generated - public AudioTranscriptionFormat getResponseFormat() { + public AudioTranslationFormat getResponseFormat() { return this.responseFormat; } - /** - * Set the responseFormat property: The requested format of the transcription response data, which will influence - * the content and detail of the result. - * - * @param responseFormat the responseFormat value to set. - * @return the AudioTranslationOptions object itself. 
- */ - @Generated - public AudioTranslationOptions setResponseFormat(AudioTranscriptionFormat responseFormat) { - this.responseFormat = responseFormat; - return this; - } - /** * Get the prompt property: An optional hint to guide the model's style or continue from a prior audio segment. The * written language of the prompt should match the primary spoken language of the audio data. @@ -152,7 +139,7 @@ public AudioTranslationOptions setTemperature(Double temperature) { } /** - * Get the model property: The model to use for this transcription request. + * Get the model property: The model to use for this translation request. * * @return the model value. */ @@ -162,7 +149,7 @@ public String getModel() { } /** - * Set the model property: The model to use for this transcription request. + * Set the model property: The model to use for this translation request. * * @param model the model value to set. * @return the AudioTranslationOptions object itself. @@ -172,4 +159,17 @@ public AudioTranslationOptions setModel(String model) { this.model = model; return this; } + + /** + * Set the responseFormat property: The requested format of the translation response data, which will influence the + * content and detail of the result. + * + * @param responseFormat the responseFormat value to set. + * @return the AudioTranslationOptions object itself. + */ + @Generated + public AudioTranslationOptions setResponseFormat(AudioTranslationFormat responseFormat) { + this.responseFormat = responseFormat; + return this; + } } diff --git a/sdk/openai/azure-ai-openai/src/main/java/com/azure/ai/openai/models/AudioTranslationSegment.java b/sdk/openai/azure-ai-openai/src/main/java/com/azure/ai/openai/models/AudioTranslationSegment.java new file mode 100644 index 0000000000000..46cdb7ac89f64 --- /dev/null +++ b/sdk/openai/azure-ai-openai/src/main/java/com/azure/ai/openai/models/AudioTranslationSegment.java @@ -0,0 +1,261 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. 
+// Licensed under the MIT License. +// Code generated by Microsoft (R) AutoRest Code Generator. + +package com.azure.ai.openai.models; + +import com.azure.core.annotation.Generated; +import com.azure.core.annotation.Immutable; +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import java.time.Duration; +import java.util.List; + +/** + * Extended information about a single segment of translated audio data. Segments generally represent roughly 5-10 + * seconds of speech. Segment boundaries typically occur between words but not necessarily sentences. + */ +@Immutable +public final class AudioTranslationSegment { + /* + * The 0-based index of this segment within a translation. + */ + @Generated + @JsonProperty(value = "id") + private int id; + + /* + * The time at which this segment started relative to the beginning of the translated audio. + */ + @Generated + @JsonProperty(value = "start") + private double start; + + /* + * The time at which this segment ended relative to the beginning of the translated audio. + */ + @Generated + @JsonProperty(value = "end") + private double end; + + /* + * The translated text that was part of this audio segment. + */ + @Generated + @JsonProperty(value = "text") + private String text; + + /* + * The temperature score associated with this audio segment. + */ + @Generated + @JsonProperty(value = "temperature") + private double temperature; + + /* + * The average log probability associated with this audio segment. + */ + @Generated + @JsonProperty(value = "avg_logprob") + private double avgLogprob; + + /* + * The compression ratio of this audio segment. + */ + @Generated + @JsonProperty(value = "compression_ratio") + private double compressionRatio; + + /* + * The probability of no speech detection within this audio segment. 
+ */ + @Generated + @JsonProperty(value = "no_speech_prob") + private double noSpeechProb; + + /* + * The token IDs matching the translated text in this audio segment. + */ + @Generated + @JsonProperty(value = "tokens") + private List<Integer> tokens; + + /* + * The seek position associated with the processing of this audio segment. + * Seek positions are expressed as hundredths of seconds. + * The model may process several segments from a single seek position, so while the seek position will never + * represent + * a later time than the segment's start, the segment's start may represent a significantly later time than the + * segment's associated seek position. + */ + @Generated + @JsonProperty(value = "seek") + private int seek; + + /** + * Creates an instance of AudioTranslationSegment class. + * + * @param id the id value to set. + * @param start the start value to set. + * @param end the end value to set. + * @param text the text value to set. + * @param temperature the temperature value to set. + * @param avgLogprob the avgLogprob value to set. + * @param compressionRatio the compressionRatio value to set. + * @param noSpeechProb the noSpeechProb value to set. + * @param tokens the tokens value to set. + * @param seek the seek value to set. 
+ */ + @Generated + private AudioTranslationSegment( + int id, + Duration start, + Duration end, + String text, + double temperature, + double avgLogprob, + double compressionRatio, + double noSpeechProb, + List<Integer> tokens, + int seek) { + this.id = id; + this.start = (double) start.toNanos() / 1000_000_000L; + this.end = (double) end.toNanos() / 1000_000_000L; + this.text = text; + this.temperature = temperature; + this.avgLogprob = avgLogprob; + this.compressionRatio = compressionRatio; + this.noSpeechProb = noSpeechProb; + this.tokens = tokens; + this.seek = seek; + } + + @Generated + @JsonCreator + private AudioTranslationSegment( + @JsonProperty(value = "id") int id, + @JsonProperty(value = "start") double start, + @JsonProperty(value = "end") double end, + @JsonProperty(value = "text") String text, + @JsonProperty(value = "temperature") double temperature, + @JsonProperty(value = "avg_logprob") double avgLogprob, + @JsonProperty(value = "compression_ratio") double compressionRatio, + @JsonProperty(value = "no_speech_prob") double noSpeechProb, + @JsonProperty(value = "tokens") List<Integer> tokens, + @JsonProperty(value = "seek") int seek) { + this( + id, + Duration.ofNanos((long) (start * 1000_000_000L)), + Duration.ofNanos((long) (end * 1000_000_000L)), + text, + temperature, + avgLogprob, + compressionRatio, + noSpeechProb, + tokens, + seek); + } + + /** + * Get the id property: The 0-based index of this segment within a translation. + * + * @return the id value. + */ + @Generated + public int getId() { + return this.id; + } + + /** + * Get the start property: The time at which this segment started relative to the beginning of the translated audio. + * + * @return the start value. + */ + @Generated + public Duration getStart() { + return Duration.ofNanos((long) (this.start * 1000_000_000L)); + } + + /** + * Get the end property: The time at which this segment ended relative to the beginning of the translated audio. + * + * @return the end value. 
+ */ + @Generated + public Duration getEnd() { + return Duration.ofNanos((long) (this.end * 1000_000_000L)); + } + + /** + * Get the text property: The translated text that was part of this audio segment. + * + * @return the text value. + */ + @Generated + public String getText() { + return this.text; + } + + /** + * Get the temperature property: The temperature score associated with this audio segment. + * + * @return the temperature value. + */ + @Generated + public double getTemperature() { + return this.temperature; + } + + /** + * Get the avgLogprob property: The average log probability associated with this audio segment. + * + * @return the avgLogprob value. + */ + @Generated + public double getAvgLogprob() { + return this.avgLogprob; + } + + /** + * Get the compressionRatio property: The compression ratio of this audio segment. + * + * @return the compressionRatio value. + */ + @Generated + public double getCompressionRatio() { + return this.compressionRatio; + } + + /** + * Get the noSpeechProb property: The probability of no speech detection within this audio segment. + * + * @return the noSpeechProb value. + */ + @Generated + public double getNoSpeechProb() { + return this.noSpeechProb; + } + + /** + * Get the tokens property: The token IDs matching the translated text in this audio segment. + * + * @return the tokens value. + */ + @Generated + public List<Integer> getTokens() { + return this.tokens; + } + + /** + * Get the seek property: The seek position associated with the processing of this audio segment. Seek positions are + * expressed as hundredths of seconds. The model may process several segments from a single seek position, so while + * the seek position will never represent a later time than the segment's start, the segment's start may represent a + * significantly later time than the segment's associated seek position. + * + * @return the seek value. 
+ */ + @Generated + public int getSeek() { + return this.seek; + } +} diff --git a/sdk/openai/azure-ai-openai/tsp-location.yaml b/sdk/openai/azure-ai-openai/tsp-location.yaml index bc4052dd97e4d..376b199e9d8b8 100644 --- a/sdk/openai/azure-ai-openai/tsp-location.yaml +++ b/sdk/openai/azure-ai-openai/tsp-location.yaml @@ -1,5 +1,5 @@ directory: specification/cognitiveservices/OpenAI.Inference -additionalDirectories: - - specification/cognitiveservices/OpenAI.Authoring -commit: dd2d1e8957ac6654272137e8d5874eacafd80a5f repo: Azure/azure-rest-api-specs +commit: a66833cbdebb0574ba012f814ab271a382a7c500 +additionalDirectories: [] +