diff --git a/Sources/SpeechToTextV1/Models/SupportedFeatures.swift b/Sources/SpeechToTextV1/Models/SupportedFeatures.swift index c7f82df6d..74a78d7c1 100644 --- a/Sources/SpeechToTextV1/Models/SupportedFeatures.swift +++ b/Sources/SpeechToTextV1/Models/SupportedFeatures.swift @@ -1,5 +1,5 @@ /** - * (C) Copyright IBM Corp. 2018, 2020. + * (C) Copyright IBM Corp. 2021. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -35,10 +35,17 @@ public struct SupportedFeatures: Codable, Equatable { */ public var speakerLabels: Bool + /** + Indicates whether the `low_latency` parameter can be used with a next-generation language model. The field is + returned only for next-generation models. Previous-generation models do not support the `low_latency` parameter. + */ + public var lowLatency: Bool? + // Map each property name to the key that shall be used for encoding/decoding. private enum CodingKeys: String, CodingKey { case customLanguageModel = "custom_language_model" case speakerLabels = "speaker_labels" + case lowLatency = "low_latency" } } diff --git a/Sources/SpeechToTextV1/SpeechToText.swift b/Sources/SpeechToTextV1/SpeechToText.swift index 1130269fd..96792f6ff 100644 --- a/Sources/SpeechToTextV1/SpeechToText.swift +++ b/Sources/SpeechToTextV1/SpeechToText.swift @@ -1,5 +1,5 @@ /** - * (C) Copyright IBM Corp. 2016, 2020. + * (C) Copyright IBM Corp. 2021. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,7 +15,7 @@ **/ /** - * IBM OpenAPI SDK Code Generator Version: 99-SNAPSHOT-be3b4618-20201221-123327 + * IBM OpenAPI SDK Code Generator Version: 99-SNAPSHOT-902c9336-20210507-162723 **/ // swiftlint:disable file_length @@ -31,9 +31,13 @@ public typealias WatsonResponse = RestResponse /** The IBM Watson™ Speech to Text service provides APIs that use IBM's speech-recognition capabilities to produce transcripts of spoken audio. The service can transcribe speech from various languages and audio formats. In addition to - basic transcription, the service can produce detailed information about many different aspects of the audio. For most - languages, the service supports two sampling rates, broadband and narrowband. It returns all JSON response content in - the UTF-8 character set. + basic transcription, the service can produce detailed information about many different aspects of the audio. It returns + all JSON response content in the UTF-8 character set. + The service supports two types of models: previous-generation models that include the terms `Broadband` and + `Narrowband` in their names, and beta next-generation models that include the terms `Multimedia` and `Telephony` in + their names. Broadband and multimedia models have minimum sampling rates of 16 kHz. Narrowband and telephony models + have minimum sampling rates of 8 kHz. The beta next-generation models currently support fewer languages and features, + but they offer high throughput and greater transcription accuracy. For speech recognition, the service supports synchronous and asynchronous HTTP Representational State Transfer (REST) interfaces. It also supports a WebSocket interface that provides a full-duplex, low-latency communication channel: Clients send requests and audio to the service and receive results over a single connection asynchronously. @@ -42,8 +46,8 @@ public typealias WatsonResponse = RestResponse characteristics of your audio. 
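Reviewer note on the `SupportedFeatures` change above: because `lowLatency` is declared optional, the struct keeps decoding payloads from previous-generation models that omit `low_latency`. A minimal sketch, assuming the `SpeechToTextV1` module is imported and using hand-written JSON fragments in place of a real model description:

```swift
import Foundation
import SpeechToTextV1

// Illustrative payloads only; a real `supported_features` object arrives as part
// of a model description returned by the service.
let nextGen = #"{"custom_language_model": false, "speaker_labels": true, "low_latency": true}"#
let previousGen = #"{"custom_language_model": true, "speaker_labels": true}"#

for json in [nextGen, previousGen] {
    // `low_latency` maps to the optional `lowLatency` property, so decoding still
    // succeeds for previous-generation models that omit the field.
    let features = try! JSONDecoder().decode(SupportedFeatures.self, from: Data(json.utf8))
    print("low latency supported:", features.lowLatency ?? false)
}
```

Treating `nil` as `false` matches the documentation above, which says the field is returned only for next-generation models.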
For language model customization, the service also supports grammars. A grammar is a formal language specification that lets you restrict the phrases that the service can recognize. Language model customization and acoustic model customization are generally available for production use with all - language models that are generally available. Grammars are beta functionality for all language models that support - language model customization. + previous-generation models that are generally available. Grammars are beta functionality for all previous-generation + models that support language model customization. Next-generation models do not support customization at this time. */ public class SpeechToText { @@ -149,7 +153,7 @@ public class SpeechToText { Lists all language models that are available for use with the service. The information includes the name of the model and its minimum sampling rate in Hertz, among other things. The ordering of the list of models can change from call to call; do not rely on an alphabetized or static list of models. - **See also:** [Languages and models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models#models). + **See also:** [Listing models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models-list). - parameter headers: A dictionary of request headers to be sent with this request. - parameter completionHandler: A function executed when the request completes with a successful result or error @@ -193,7 +197,7 @@ public class SpeechToText { Gets information for a single specified language model that is available for use with the service. The information includes the name of the model and its minimum sampling rate in Hertz, among other things. - **See also:** [Languages and models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models#models). + **See also:** [Listing models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models-list). - parameter modelID: The identifier of the model in the form of its name from the output of the **Get a model** method. (**Note:** The model `ar-AR_BroadbandModel` is deprecated; use `ar-MS_BroadbandModel` instead.). @@ -288,8 +292,30 @@ public class SpeechToText { models, at least 16 kHz; for narrowband models, at least 8 kHz. If the sampling rate of the audio is higher than the minimum required rate, the service down-samples the audio to the appropriate rate. If the sampling rate of the audio is lower than the minimum required rate, the request fails. - **See also:** [Audio - formats](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-audio-formats#audio-formats). + **See also:** [Supported audio + formats](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-audio-formats). + ### Next-generation models + **Note:** The next-generation language models are beta functionality. They support a limited number of languages + and features at this time. The supported languages, models, and features will increase with future releases. + The service supports next-generation `Multimedia` (16 kHz) and `Telephony` (8 kHz) models for many languages. + Next-generation models have higher throughput than the service's previous generation of `Broadband` and + `Narrowband` models. When you use next-generation models, the service can return transcriptions more quickly and + also provide noticeably better transcription accuracy. + You specify a next-generation model by using the `model` query parameter, as you do a previous-generation model. 
+ Next-generation models support the same request headers as previous-generation models, but they support only the + following additional query parameters: + * `background_audio_suppression` + * `inactivity_timeout` + * `profanity_filter` + * `redaction` + * `smart_formatting` + * `speaker_labels` + * `speech_detector_sensitivity` + * `timestamps` + Many next-generation models also support the beta `low_latency` parameter, which is not available with + previous-generation models. + **See also:** [Next-generation languages and + models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models-ng). ### Multipart speech recognition **Note:** The Watson SDKs do not support multipart speech recognition. The HTTP `POST` method of the service also supports multipart speech recognition. With multipart requests, you pass @@ -307,23 +333,25 @@ public class SpeechToText { format, see **Audio formats (content types)** in the method description. - parameter model: The identifier of the model that is to be used for the recognition request. (**Note:** The model `ar-AR_BroadbandModel` is deprecated; use `ar-MS_BroadbandModel` instead.) See [Languages and - models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models#models). + models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models) and [Next-generation languages and + models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models-ng). - parameter languageCustomizationID: The customization ID (GUID) of a custom language model that is to be used with the recognition request. The base model of the specified custom language model must match the model specified with the `model` parameter. You must make the request with credentials for the instance of the service - that owns the custom model. By default, no custom language model is used. See [Custom - models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-input#custom-input). + that owns the custom model. By default, no custom language model is used. See [Using a custom language model for + speech recognition](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-languageUse). **Note:** Use this parameter instead of the deprecated `customization_id` parameter. - parameter acousticCustomizationID: The customization ID (GUID) of a custom acoustic model that is to be used with the recognition request. The base model of the specified custom acoustic model must match the model specified with the `model` parameter. You must make the request with credentials for the instance of the service - that owns the custom model. By default, no custom acoustic model is used. See [Custom - models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-input#custom-input). + that owns the custom model. By default, no custom acoustic model is used. See [Using a custom acoustic model for + speech recognition](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-acousticUse). - parameter baseModelVersion: The version of the specified base model that is to be used with the recognition request. Multiple versions of a base model can exist when a model is updated for internal improvements. The parameter is intended primarily for use with custom models that have been upgraded for a new base model. The - default value depends on whether the parameter is used with or without a custom model. See [Base model - version](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-input#version). 
+ default value depends on whether the parameter is used with or without a custom model. See [Making speech + recognition requests with upgraded custom + models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-custom-upgrade-use#custom-upgrade-use-recognition). - parameter customizationWeight: If you specify the customization ID (GUID) of a custom language model with the recognition request, the customization weight tells the service how much weight to give to words from the custom language model compared to those from the base model for the current request. @@ -333,7 +361,8 @@ public class SpeechToText { The default value yields the best performance in general. Assign a higher value if your audio makes frequent use of OOV words from the custom model. Use caution when setting the weight: a higher value can improve the accuracy of phrases from the custom model's domain, but it can negatively affect performance on non-domain phrases. - See [Custom models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-input#custom-input). + See [Using customization + weight](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-languageUse#weight). - parameter inactivityTimeout: The time in seconds after which, if only silence (no speech) is detected in streaming audio, the connection is closed with a 400 error. The parameter is useful for stopping audio submission from a live microphone when a user simply walks away. Use `-1` for infinity. See [Inactivity @@ -345,52 +374,56 @@ public class SpeechToText { You can spot a maximum of 1000 keywords with a single request. A single keyword can have a maximum length of 1024 characters, though the maximum effective length for double-byte languages might be shorter. Keywords are case-insensitive. - See [Keyword spotting](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#keyword_spotting). + See [Keyword spotting](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-spotting#keyword-spotting). - parameter keywordsThreshold: A confidence value that is the lower bound for spotting a keyword. A word is considered to match a keyword if its confidence is greater than or equal to the threshold. Specify a probability between 0.0 and 1.0. If you specify a threshold, you must also specify one or more keywords. The service performs no keyword spotting if you omit either parameter. See [Keyword - spotting](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#keyword_spotting). + spotting](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-spotting#keyword-spotting). - parameter maxAlternatives: The maximum number of alternative transcripts that the service is to return. By default, the service returns a single transcript. If you specify a value of `0`, the service uses the default value, `1`. See [Maximum - alternatives](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#max_alternatives). + alternatives](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-metadata#max-alternatives). - parameter wordAlternativesThreshold: A confidence value that is the lower bound for identifying a hypothesis as a possible word alternative (also known as "Confusion Networks"). An alternative word is considered if its confidence is greater than or equal to the threshold. Specify a probability between 0.0 and 1.0. By default, the service computes no alternative words. 
See [Word - alternatives](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#word_alternatives). + alternatives](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-spotting#word-alternatives). - parameter wordConfidence: If `true`, the service returns a confidence measure in the range of 0.0 to 1.0 for each word. By default, the service returns no word confidence scores. See [Word - confidence](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#word_confidence). + confidence](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-metadata#word-confidence). - parameter timestamps: If `true`, the service returns time alignment for each word. By default, no timestamps are returned. See [Word - timestamps](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#word_timestamps). + timestamps](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-metadata#word-timestamps). - parameter profanityFilter: If `true`, the service filters profanity from all output except for keyword results by replacing inappropriate words with a series of asterisks. Set the parameter to `false` to return results with - no censoring. Applies to US English transcription only. See [Profanity - filtering](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#profanity_filter). + no censoring. Applies to US English and Japanese transcription only. See [Profanity + filtering](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-formatting#profanity-filtering). - parameter smartFormatting: If `true`, the service converts dates, times, series of digits and numbers, phone numbers, currency values, and internet addresses into more readable, conventional representations in the final transcript of a recognition request. For US English, the service also converts certain keyword strings to punctuation symbols. By default, the service performs no smart formatting. **Note:** Applies to US English, Japanese, and Spanish transcription only. - See [Smart formatting](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#smart_formatting). + See [Smart + formatting](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-formatting#smart-formatting). - parameter speakerLabels: If `true`, the response includes labels that identify which words were spoken by which participants in a multi-person exchange. By default, the service returns no speaker labels. Setting `speaker_labels` to `true` forces the `timestamps` parameter to be `true`, regardless of whether you specify `false` for the parameter. - **Note:** Applies to US English, Australian English, German, Japanese, Korean, and Spanish (both broadband and - narrowband models) and UK English (narrowband model) transcription only. - See [Speaker labels](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#speaker_labels). + * For previous-generation models, can be used for US English, Australian English, German, Japanese, Korean, and + Spanish (both broadband and narrowband models) and UK English (narrowband model) transcription only. + * For next-generation models, can be used for English (Australian, UK, and US), German, and Spanish transcription + only. + Restrictions and limitations apply to the use of speaker labels for both types of models. See [Speaker + labels](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-speaker-labels). 
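To make the speaker-labels behaviour just described concrete (in particular, that `speaker_labels` forces `timestamps` on), here is a hedged call sketch. It assumes an already-authenticated `SpeechToText` instance named `speechToText` and some `audioData: Data`; the concrete result type behind `WatsonResponse` is not shown in this diff, so the handler only prints the raw result. The remaining recognize parameters continue below.

```swift
// Minimal sketch, assuming `speechToText` is an authenticated SpeechToText
// instance and `audioData: Data` holds the audio to transcribe.
speechToText.recognize(
    audio: audioData,
    contentType: "audio/wav",
    model: "en-US_BroadbandModel",
    timestamps: true,       // speaker_labels forces timestamps on in any case
    speakerLabels: true
) { response, error in
    if let error = error {
        print("recognize failed: \(error)")
        return
    }
    // The result type is outside this diff, so just print the raw result.
    print(response?.result as Any)
}
```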
- parameter customizationID: **Deprecated.** Use the `language_customization_id` parameter to specify the customization ID (GUID) of a custom language model that is to be used with the recognition request. Do not specify both parameters with a request. - parameter grammarName: The name of a grammar that is to be used with the recognition request. If you specify a grammar, you must also use the `language_customization_id` parameter to specify the name of the custom language model for which the grammar is defined. The service recognizes only strings that are recognized by the specified - grammar; it does not recognize other custom words from the model's words resource. See - [Grammars](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-input#grammars-input). + grammar; it does not recognize other custom words from the model's words resource. See [Using a grammar for + speech recognition](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-grammarUse). - parameter redaction: If `true`, the service redacts, or masks, numeric data from final transcripts. The feature redacts any number that has three or more consecutive digits by replacing each digit with an `X` character. It is intended to redact sensitive numeric data, such as credit card numbers. By default, the service performs no @@ -400,11 +433,12 @@ public class SpeechToText { the `keywords` and `keywords_threshold` parameters) and returns only a single final transcript (forces the `max_alternatives` parameter to be `1`). **Note:** Applies to US English, Japanese, and Korean transcription only. - See [Numeric redaction](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#redaction). + See [Numeric + redaction](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-formatting#numeric-redaction). - parameter audioMetrics: If `true`, requests detailed information about the signal characteristics of the input audio. The service returns audio metrics with the final transcription results. By default, the service returns no audio metrics. - See [Audio metrics](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-metrics#audio_metrics). + See [Audio metrics](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-metrics#audio-metrics). - parameter endOfPhraseSilenceTime: If `true`, specifies the duration of the pause interval at which the service splits a transcript into multiple final results. If the service detects pauses or extended silence before it reaches the end of the audio stream, its response can include multiple final results. Silence indicates a point @@ -415,14 +449,14 @@ public class SpeechToText { parameter. The default pause interval for most languages is 0.8 seconds; the default for Chinese is 0.6 seconds. See [End of phrase silence - time](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#silence_time). + time](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-parsing#silence-time). - parameter splitTranscriptAtPhraseEnd: If `true`, directs the service to split the transcript into multiple final results based on semantic features of the input, for example, at the conclusion of meaningful phrases such as sentences. The service bases its understanding of semantic features on the base language model that you use with a request. Custom language models and grammars can also influence how and where the service splits a transcript. By default, the service splits transcripts based solely on the pause interval. 
See [Split transcript at phrase - end](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#split_transcript). + end](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-parsing#split-transcript). - parameter speechDetectorSensitivity: The sensitivity of speech activity detection that the service is to perform. Use the parameter to suppress word insertions from music, coughing, and other non-speech events. The service biases the audio it passes for speech recognition by evaluating the input audio against prior models of @@ -431,8 +465,8 @@ public class SpeechToText { * 0.0 suppresses all audio (no speech is transcribed). * 0.5 (the default) provides a reasonable compromise for the level of sensitivity. * 1.0 suppresses no audio (speech detection sensitivity is disabled). - The values increase on a monotonic curve. See [Speech Activity - Detection](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-input#detection). + The values increase on a monotonic curve. See [Speech detector + sensitivity](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#detection-parameters-sensitivity). - parameter backgroundAudioSuppression: The level to which the service is to suppress background audio based on its volume to prevent it from being transcribed as speech. Use the parameter to suppress side conversations or background noise. @@ -440,8 +474,20 @@ public class SpeechToText { * 0.0 (the default) provides no suppression (background audio suppression is disabled). * 0.5 provides a reasonable level of audio suppression for general usage. * 1.0 suppresses all audio (no audio is transcribed). - The values increase on a monotonic curve. See [Speech Activity - Detection](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-input#detection). + The values increase on a monotonic curve. See [Background audio + suppression](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#detection-parameters-suppression). + - parameter lowLatency: If `true` for next-generation `Multimedia` and `Telephony` models that support low + latency, directs the service to produce results even more quickly than it usually does. Next-generation models + produce transcription results faster than previous-generation models. The `low_latency` parameter causes the + models to produce results even more quickly, though the results might be less accurate when the parameter is + used. + **Note:** The parameter is beta functionality. It is not available for previous-generation `Broadband` and + `Narrowband` models. It is available only for some next-generation models. + * For a list of next-generation models that support low latency, see [Supported language + models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models-ng#models-ng-supported) for + next-generation models. + * For more information about the `low_latency` parameter, see [Low + latency](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-interim#low-latency). - parameter headers: A dictionary of request headers to be sent with this request. - parameter completionHandler: A function executed when the request completes with a successful result or error */ @@ -471,6 +517,7 @@ public class SpeechToText { splitTranscriptAtPhraseEnd: Bool? = nil, speechDetectorSensitivity: Double? = nil, backgroundAudioSuppression: Double? = nil, + lowLatency: Bool? = nil, headers: [String: String]? = nil, completionHandler: @escaping (WatsonResponse?, WatsonError?) 
-> Void) { @@ -582,6 +629,10 @@ public class SpeechToText { let queryParameter = URLQueryItem(name: "background_audio_suppression", value: "\(backgroundAudioSuppression)") queryParameters.append(queryParameter) } + if let lowLatency = lowLatency { + let queryParameter = URLQueryItem(name: "low_latency", value: "\(lowLatency)") + queryParameters.append(queryParameter) + } // construct REST request @@ -806,15 +857,38 @@ public class SpeechToText { models, at least 16 kHz; for narrowband models, at least 8 kHz. If the sampling rate of the audio is higher than the minimum required rate, the service down-samples the audio to the appropriate rate. If the sampling rate of the audio is lower than the minimum required rate, the request fails. - **See also:** [Audio - formats](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-audio-formats#audio-formats). + **See also:** [Supported audio + formats](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-audio-formats). + ### Next-generation models + **Note:** The next-generation language models are beta functionality. They support a limited number of languages + and features at this time. The supported languages, models, and features will increase with future releases. + The service supports next-generation `Multimedia` (16 kHz) and `Telephony` (8 kHz) models for many languages. + Next-generation models have higher throughput than the service's previous generation of `Broadband` and + `Narrowband` models. When you use next-generation models, the service can return transcriptions more quickly and + also provide noticeably better transcription accuracy. + You specify a next-generation model by using the `model` query parameter, as you do a previous-generation model. + Next-generation models support the same request headers as previous-generation models, but they support only the + following additional query parameters: + * `background_audio_suppression` + * `inactivity_timeout` + * `profanity_filter` + * `redaction` + * `smart_formatting` + * `speaker_labels` + * `speech_detector_sensitivity` + * `timestamps` + Many next-generation models also support the beta `low_latency` parameter, which is not available with + previous-generation models. + **See also:** [Next-generation languages and + models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models-ng). - parameter audio: The audio to transcribe. - parameter contentType: The format (MIME type) of the audio. For more information about specifying an audio format, see **Audio formats (content types)** in the method description. - parameter model: The identifier of the model that is to be used for the recognition request. (**Note:** The model `ar-AR_BroadbandModel` is deprecated; use `ar-MS_BroadbandModel` instead.) See [Languages and - models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models#models). + models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models) and [Next-generation languages and + models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models-ng). - parameter callbackURL: A URL to which callback notifications are to be sent. The URL must already be successfully allowlisted by using the **Register a callback** method. You can include the same callback URL with any number of job creation requests. Omit the parameter to poll the service for job completion and results. 
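With the `low_latency` query item wired up above, opting a synchronous recognize call into low latency might look like the following sketch. The `en-US_Telephony` name follows the next-generation naming convention described earlier; `speechToText` and `audioData` are the same assumed placeholders as before. The asynchronous job parameters continue below.

```swift
// Hedged sketch: opt a next-generation Telephony model into low latency.
// Leaving `lowLatency` nil (the default) means no `low_latency` query item is
// sent, which is what previous-generation models require.
speechToText.recognize(
    audio: audioData,
    contentType: "audio/mp3",
    model: "en-US_Telephony",
    smartFormatting: true,
    lowLatency: true
) { response, error in
    guard error == nil else {
        print("recognize failed: \(String(describing: error))")
        return
    }
    print(response?.result as Any)
}
```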
@@ -843,19 +917,20 @@ public class SpeechToText { - parameter languageCustomizationID: The customization ID (GUID) of a custom language model that is to be used with the recognition request. The base model of the specified custom language model must match the model specified with the `model` parameter. You must make the request with credentials for the instance of the service - that owns the custom model. By default, no custom language model is used. See [Custom - models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-input#custom-input). + that owns the custom model. By default, no custom language model is used. See [Using a custom language model for + speech recognition](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-languageUse). **Note:** Use this parameter instead of the deprecated `customization_id` parameter. - parameter acousticCustomizationID: The customization ID (GUID) of a custom acoustic model that is to be used with the recognition request. The base model of the specified custom acoustic model must match the model specified with the `model` parameter. You must make the request with credentials for the instance of the service - that owns the custom model. By default, no custom acoustic model is used. See [Custom - models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-input#custom-input). + that owns the custom model. By default, no custom acoustic model is used. See [Using a custom acoustic model for + speech recognition](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-acousticUse). - parameter baseModelVersion: The version of the specified base model that is to be used with the recognition request. Multiple versions of a base model can exist when a model is updated for internal improvements. The parameter is intended primarily for use with custom models that have been upgraded for a new base model. The - default value depends on whether the parameter is used with or without a custom model. See [Base model - version](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-input#version). + default value depends on whether the parameter is used with or without a custom model. See [Making speech + recognition requests with upgraded custom + models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-custom-upgrade-use#custom-upgrade-use-recognition). - parameter customizationWeight: If you specify the customization ID (GUID) of a custom language model with the recognition request, the customization weight tells the service how much weight to give to words from the custom language model compared to those from the base model for the current request. @@ -865,7 +940,8 @@ public class SpeechToText { The default value yields the best performance in general. Assign a higher value if your audio makes frequent use of OOV words from the custom model. Use caution when setting the weight: a higher value can improve the accuracy of phrases from the custom model's domain, but it can negatively affect performance on non-domain phrases. - See [Custom models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-input#custom-input). + See [Using customization + weight](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-languageUse#weight). - parameter inactivityTimeout: The time in seconds after which, if only silence (no speech) is detected in streaming audio, the connection is closed with a 400 error. 
The parameter is useful for stopping audio submission from a live microphone when a user simply walks away. Use `-1` for infinity. See [Inactivity @@ -877,52 +953,56 @@ public class SpeechToText { You can spot a maximum of 1000 keywords with a single request. A single keyword can have a maximum length of 1024 characters, though the maximum effective length for double-byte languages might be shorter. Keywords are case-insensitive. - See [Keyword spotting](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#keyword_spotting). + See [Keyword spotting](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-spotting#keyword-spotting). - parameter keywordsThreshold: A confidence value that is the lower bound for spotting a keyword. A word is considered to match a keyword if its confidence is greater than or equal to the threshold. Specify a probability between 0.0 and 1.0. If you specify a threshold, you must also specify one or more keywords. The service performs no keyword spotting if you omit either parameter. See [Keyword - spotting](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#keyword_spotting). + spotting](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-spotting#keyword-spotting). - parameter maxAlternatives: The maximum number of alternative transcripts that the service is to return. By default, the service returns a single transcript. If you specify a value of `0`, the service uses the default value, `1`. See [Maximum - alternatives](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#max_alternatives). + alternatives](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-metadata#max-alternatives). - parameter wordAlternativesThreshold: A confidence value that is the lower bound for identifying a hypothesis as a possible word alternative (also known as "Confusion Networks"). An alternative word is considered if its confidence is greater than or equal to the threshold. Specify a probability between 0.0 and 1.0. By default, the service computes no alternative words. See [Word - alternatives](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#word_alternatives). + alternatives](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-spotting#word-alternatives). - parameter wordConfidence: If `true`, the service returns a confidence measure in the range of 0.0 to 1.0 for each word. By default, the service returns no word confidence scores. See [Word - confidence](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#word_confidence). + confidence](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-metadata#word-confidence). - parameter timestamps: If `true`, the service returns time alignment for each word. By default, no timestamps are returned. See [Word - timestamps](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#word_timestamps). + timestamps](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-metadata#word-timestamps). - parameter profanityFilter: If `true`, the service filters profanity from all output except for keyword results by replacing inappropriate words with a series of asterisks. Set the parameter to `false` to return results with - no censoring. Applies to US English transcription only. See [Profanity - filtering](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#profanity_filter). + no censoring. Applies to US English and Japanese transcription only. 
See [Profanity + filtering](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-formatting#profanity-filtering). - parameter smartFormatting: If `true`, the service converts dates, times, series of digits and numbers, phone numbers, currency values, and internet addresses into more readable, conventional representations in the final transcript of a recognition request. For US English, the service also converts certain keyword strings to punctuation symbols. By default, the service performs no smart formatting. **Note:** Applies to US English, Japanese, and Spanish transcription only. - See [Smart formatting](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#smart_formatting). + See [Smart + formatting](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-formatting#smart-formatting). - parameter speakerLabels: If `true`, the response includes labels that identify which words were spoken by which participants in a multi-person exchange. By default, the service returns no speaker labels. Setting `speaker_labels` to `true` forces the `timestamps` parameter to be `true`, regardless of whether you specify `false` for the parameter. - **Note:** Applies to US English, Australian English, German, Japanese, Korean, and Spanish (both broadband and - narrowband models) and UK English (narrowband model) transcription only. - See [Speaker labels](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#speaker_labels). + * For previous-generation models, can be used for US English, Australian English, German, Japanese, Korean, and + Spanish (both broadband and narrowband models) and UK English (narrowband model) transcription only. + * For next-generation models, can be used for English (Australian, UK, and US), German, and Spanish transcription + only. + Restrictions and limitations apply to the use of speaker labels for both types of models. See [Speaker + labels](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-speaker-labels). - parameter customizationID: **Deprecated.** Use the `language_customization_id` parameter to specify the customization ID (GUID) of a custom language model that is to be used with the recognition request. Do not specify both parameters with a request. - parameter grammarName: The name of a grammar that is to be used with the recognition request. If you specify a grammar, you must also use the `language_customization_id` parameter to specify the name of the custom language model for which the grammar is defined. The service recognizes only strings that are recognized by the specified - grammar; it does not recognize other custom words from the model's words resource. See - [Grammars](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-input#grammars-input). + grammar; it does not recognize other custom words from the model's words resource. See [Using a grammar for + speech recognition](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-grammarUse). - parameter redaction: If `true`, the service redacts, or masks, numeric data from final transcripts. The feature redacts any number that has three or more consecutive digits by replacing each digit with an `X` character. It is intended to redact sensitive numeric data, such as credit card numbers. By default, the service performs no @@ -932,13 +1012,14 @@ public class SpeechToText { the `keywords` and `keywords_threshold` parameters) and returns only a single final transcript (forces the `max_alternatives` parameter to be `1`). 
**Note:** Applies to US English, Japanese, and Korean transcription only. - See [Numeric redaction](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#redaction). + See [Numeric + redaction](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-formatting#numeric-redaction). - parameter processingMetrics: If `true`, requests processing metrics about the service's transcription of the input audio. The service returns processing metrics at the interval specified by the `processing_metrics_interval` parameter. It also returns processing metrics for transcription events, for example, for final and interim results. By default, the service returns no processing metrics. See [Processing - metrics](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-metrics#processing_metrics). + metrics](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-metrics#processing-metrics). - parameter processingMetricsInterval: Specifies the interval in real wall-clock seconds at which the service is to return processing metrics. The parameter is ignored unless the `processing_metrics` parameter is set to `true`. @@ -948,11 +1029,11 @@ public class SpeechToText { events instead of at periodic intervals, set the value to a large number. If the value is larger than the duration of the audio, the service returns processing metrics only for transcription events. See [Processing - metrics](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-metrics#processing_metrics). + metrics](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-metrics#processing-metrics). - parameter audioMetrics: If `true`, requests detailed information about the signal characteristics of the input audio. The service returns audio metrics with the final transcription results. By default, the service returns no audio metrics. - See [Audio metrics](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-metrics#audio_metrics). + See [Audio metrics](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-metrics#audio-metrics). - parameter endOfPhraseSilenceTime: If `true`, specifies the duration of the pause interval at which the service splits a transcript into multiple final results. If the service detects pauses or extended silence before it reaches the end of the audio stream, its response can include multiple final results. Silence indicates a point @@ -963,14 +1044,14 @@ public class SpeechToText { parameter. The default pause interval for most languages is 0.8 seconds; the default for Chinese is 0.6 seconds. See [End of phrase silence - time](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#silence_time). + time](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-parsing#silence-time). - parameter splitTranscriptAtPhraseEnd: If `true`, directs the service to split the transcript into multiple final results based on semantic features of the input, for example, at the conclusion of meaningful phrases such as sentences. The service bases its understanding of semantic features on the base language model that you use with a request. Custom language models and grammars can also influence how and where the service splits a transcript. By default, the service splits transcripts based solely on the pause interval. See [Split transcript at phrase - end](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#split_transcript). + end](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-parsing#split-transcript). 
- parameter speechDetectorSensitivity: The sensitivity of speech activity detection that the service is to perform. Use the parameter to suppress word insertions from music, coughing, and other non-speech events. The service biases the audio it passes for speech recognition by evaluating the input audio against prior models of @@ -979,8 +1060,8 @@ public class SpeechToText { * 0.0 suppresses all audio (no speech is transcribed). * 0.5 (the default) provides a reasonable compromise for the level of sensitivity. * 1.0 suppresses no audio (speech detection sensitivity is disabled). - The values increase on a monotonic curve. See [Speech Activity - Detection](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-input#detection). + The values increase on a monotonic curve. See [Speech detector + sensitivity](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#detection-parameters-sensitivity). - parameter backgroundAudioSuppression: The level to which the service is to suppress background audio based on its volume to prevent it from being transcribed as speech. Use the parameter to suppress side conversations or background noise. @@ -988,8 +1069,20 @@ public class SpeechToText { * 0.0 (the default) provides no suppression (background audio suppression is disabled). * 0.5 provides a reasonable level of audio suppression for general usage. * 1.0 suppresses all audio (no audio is transcribed). - The values increase on a monotonic curve. See [Speech Activity - Detection](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-input#detection). + The values increase on a monotonic curve. See [Background audio + suppression](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#detection-parameters-suppression). + - parameter lowLatency: If `true` for next-generation `Multimedia` and `Telephony` models that support low + latency, directs the service to produce results even more quickly than it usually does. Next-generation models + produce transcription results faster than previous-generation models. The `low_latency` parameter causes the + models to produce results even more quickly, though the results might be less accurate when the parameter is + used. + **Note:** The parameter is beta functionality. It is not available for previous-generation `Broadband` and + `Narrowband` models. It is available only for some next-generation models. + * For a list of next-generation models that support low latency, see [Supported language + models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models-ng#models-ng-supported) for + next-generation models. + * For more information about the `low_latency` parameter, see [Low + latency](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-interim#low-latency). - parameter headers: A dictionary of request headers to be sent with this request. - parameter completionHandler: A function executed when the request completes with a successful result or error */ @@ -1025,6 +1118,7 @@ public class SpeechToText { splitTranscriptAtPhraseEnd: Bool? = nil, speechDetectorSensitivity: Double? = nil, backgroundAudioSuppression: Double? = nil, + lowLatency: Bool? = nil, headers: [String: String]? = nil, completionHandler: @escaping (WatsonResponse?, WatsonError?) 
-> Void) { @@ -1160,6 +1254,10 @@ public class SpeechToText { let queryParameter = URLQueryItem(name: "background_audio_suppression", value: "\(backgroundAudioSuppression)") queryParameters.append(queryParameter) } + if let lowLatency = lowLatency { + let queryParameter = URLQueryItem(name: "low_latency", value: "\(lowLatency)") + queryParameters.append(queryParameter) + } // construct REST request @@ -1654,6 +1752,8 @@ public class SpeechToText { of phrases from the custom model's domain, but it can negatively affect performance on non-domain phrases. The value that you assign is used for all recognition requests that use the model. You can override it for any recognition request by specifying a customization weight for that request. + See [Using customization + weight](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-languageUse#weight). - parameter headers: A dictionary of request headers to be sent with this request. - parameter completionHandler: A function executed when the request completes with a successful result or error */ @@ -1780,7 +1880,7 @@ public class SpeechToText { is complete, the model resumes the status that it had prior to upgrade. The service cannot accept subsequent requests for the model until the upgrade completes. **See also:** [Upgrading a custom language - model](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-customUpgrade#upgradeLanguage). + model](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-custom-upgrade#custom-upgrade-language). - parameter customizationID: The customization ID (GUID) of the custom language model that is to be used for the request. You must make the request with credentials for the instance of the service that owns the custom model. @@ -1954,7 +2054,7 @@ public class SpeechToText { { // construct body let multipartFormData = MultipartFormData() - multipartFormData.append(corpusFile, withName: "corpus_file", fileName: "filename") + multipartFormData.append(corpusFile, withName: "corpus_file", mimeType: "text/plain", fileName: "filename") guard let body = try? multipartFormData.toData() else { completionHandler(nil, RestError.serialization(values: "request multipart form data")) return @@ -3270,7 +3370,7 @@ public class SpeechToText { must be upgraded before the custom acoustic model can be upgraded. Omit the parameter if the custom acoustic model was not trained with a custom language model. **See also:** [Upgrading a custom acoustic - model](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-customUpgrade#upgradeAcoustic). + model](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-custom-upgrade#custom-upgrade-acoustic). - parameter customizationID: The customization ID (GUID) of the custom acoustic model that is to be used for the request. You must make the request with credentials for the instance of the service that owns the custom model. @@ -3282,7 +3382,7 @@ public class SpeechToText { modified since it was last trained. Use this parameter only to force the upgrade of a custom acoustic model that is trained with a custom language model, and only if you receive a 400 response code and the message `No input data modified since last training`. See [Upgrading a custom acoustic - model](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-customUpgrade#upgradeAcoustic). + model](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-custom-upgrade#custom-upgrade-acoustic). 
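The asynchronous path gains the same flag through `createJob`, shown earlier in this hunk. A hedged sketch follows, again using the assumed `speechToText` and `audioData` placeholders; the job result type and the polling call are outside this diff, so the handler only prints the raw result. The acoustic-model upgrade documentation continues below.

```swift
// Hedged sketch of the asynchronous interface with the new flag. Omitting
// `callbackURL` means the job is created for polling rather than callbacks.
speechToText.createJob(
    audio: audioData,
    contentType: "audio/wav",
    model: "en-US_Telephony",        // assumes a model that supports low latency
    processingMetrics: true,
    processingMetricsInterval: 5.0,
    lowLatency: true
) { response, error in
    if let error = error {
        print("createJob failed: \(error)")
        return
    }
    print(response?.result as Any)   // job record; poll the job until it completes
}
```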
- parameter headers: A dictionary of request headers to be sent with this request. - parameter completionHandler: A function executed when the request completes with a successful result or error */ @@ -3448,8 +3548,8 @@ public class SpeechToText { broadband models, at least 16 kHz; for narrowband models, at least 8 kHz. If the sampling rate of the audio is higher than the minimum required rate, the service down-samples the audio to the appropriate rate. If the sampling rate of the audio is lower than the minimum required rate, the service labels the audio file as `invalid`. - **See also:** [Audio - formats](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-audio-formats#audio-formats). + **See also:** [Supported audio + formats](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-audio-formats). ### Content types for archive-type resources You can add an archive file (**.zip** or **.tar.gz** file) that contains audio files in any format that the service supports for speech recognition. For an archive-type resource, use the `Content-Type` parameter to specify diff --git a/Sources/TextToSpeechV1/Models/CustomModel.swift b/Sources/TextToSpeechV1/Models/CustomModel.swift index f35e30e1f..9d33b2fa4 100644 --- a/Sources/TextToSpeechV1/Models/CustomModel.swift +++ b/Sources/TextToSpeechV1/Models/CustomModel.swift @@ -1,5 +1,5 @@ /** - * (C) Copyright IBM Corp. 2020. + * (C) Copyright IBM Corp. 2021. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -62,12 +62,18 @@ public struct CustomModel: Codable, Equatable { /** An array of `Word` objects that lists the words and their translations from the custom model. The words are listed - in alphabetical order, with uppercase letters listed before lowercase letters. The array is empty if the custom - model contains no words. This field is returned only by the **Get a voice** method and only when you specify the - customization ID of a custom model. + in alphabetical order, with uppercase letters listed before lowercase letters. The array is empty if no words are + defined for the custom model. This field is returned only by the **Get a custom model** method. */ public var words: [Word]? + /** + An array of `Prompt` objects that provides information about the prompts that are defined for the specified custom + model. The array is empty if no prompts are defined for the custom model. This field is returned only by the **Get + a custom model** method. + */ + public var prompts: [Prompt]? + // Map each property name to the key that shall be used for encoding/decoding. private enum CodingKeys: String, CodingKey { case customizationID = "customization_id" @@ -78,6 +84,7 @@ public struct CustomModel: Codable, Equatable { case lastModified = "last_modified" case description = "description" case words = "words" + case prompts = "prompts" } } diff --git a/Sources/TextToSpeechV1/Models/Prompt.swift b/Sources/TextToSpeechV1/Models/Prompt.swift new file mode 100644 index 000000000..1a491226a --- /dev/null +++ b/Sources/TextToSpeechV1/Models/Prompt.swift @@ -0,0 +1,65 @@ +/** + * (C) Copyright IBM Corp. 2021. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ + +import Foundation + +/** + Information about a custom prompt. + */ +public struct Prompt: Codable, Equatable { + + /** + The user-specified text of the prompt. + */ + public var prompt: String + + /** + The user-specified identifier (name) of the prompt. + */ + public var promptID: String + + /** + The status of the prompt: + * `processing`: The service received the request to add the prompt and is analyzing the validity of the prompt. + * `available`: The service successfully validated the prompt, which is now ready for use in a speech synthesis + request. + * `failed`: The service's validation of the prompt failed. The status of the prompt includes an `error` field that + describes the reason for the failure. + */ + public var status: String + + /** + If the status of the prompt is `failed`, an error message that describes the reason for the failure. The field is + omitted if no error occurred. + */ + public var error: String? + + /** + The speaker ID (GUID) of the speaker for which the prompt was defined. The field is omitted if no speaker ID was + specified. + */ + public var speakerID: String? + + // Map each property name to the key that shall be used for encoding/decoding. + private enum CodingKeys: String, CodingKey { + case prompt = "prompt" + case promptID = "prompt_id" + case status = "status" + case error = "error" + case speakerID = "speaker_id" + } + +} diff --git a/Sources/TextToSpeechV1/Models/PromptMetadata.swift b/Sources/TextToSpeechV1/Models/PromptMetadata.swift new file mode 100644 index 000000000..e2cf3e666 --- /dev/null +++ b/Sources/TextToSpeechV1/Models/PromptMetadata.swift @@ -0,0 +1,66 @@ +/** + * (C) Copyright IBM Corp. 2021. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ + +import Foundation + +/** + Information about the prompt that is to be added to a custom model. The following example of a `PromptMetadata` object + includes both the required prompt text and an optional speaker model ID: + `{ "prompt_text": "Thank you and good-bye!", "speaker_id": "823068b2-ed4e-11ea-b6e0-7b6456aa95cc" }`. + */ +public struct PromptMetadata: Codable, Equatable { + + /** + The required written text of the spoken prompt. The length of a prompt's text is limited to a few sentences. + Speaking one or two sentences of text is the recommended limit. A prompt cannot contain more than 1000 characters + of text. Escape any XML control characters (double quotes, single quotes, ampersands, angle brackets, and slashes) + that appear in the text of the prompt. 
+ */ + public var promptText: String + + /** + The optional speaker ID (GUID) of a previously defined speaker model that is to be associated with the prompt. + */ + public var speakerID: String? + + // Map each property name to the key that shall be used for encoding/decoding. + private enum CodingKeys: String, CodingKey { + case promptText = "prompt_text" + case speakerID = "speaker_id" + } + + /** + Initialize a `PromptMetadata` with member variables. + + - parameter promptText: The required written text of the spoken prompt. The length of a prompt's text is limited + to a few sentences. Speaking one or two sentences of text is the recommended limit. A prompt cannot contain more + than 1000 characters of text. Escape any XML control characters (double quotes, single quotes, ampersands, angle + brackets, and slashes) that appear in the text of the prompt. + - parameter speakerID: The optional speaker ID (GUID) of a previously defined speaker model that is to be + associated with the prompt. + + - returns: An initialized `PromptMetadata`. + */ + public init( + promptText: String, + speakerID: String? = nil + ) + { + self.promptText = promptText + self.speakerID = speakerID + } + +} diff --git a/Sources/TextToSpeechV1/Models/Prompts.swift b/Sources/TextToSpeechV1/Models/Prompts.swift new file mode 100644 index 000000000..239df4691 --- /dev/null +++ b/Sources/TextToSpeechV1/Models/Prompts.swift @@ -0,0 +1,35 @@ +/** + * (C) Copyright IBM Corp. 2021. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ + +import Foundation + +/** + Information about the custom prompts that are defined for a custom model. + */ +public struct Prompts: Codable, Equatable { + + /** + An array of `Prompt` objects that provides information about the prompts that are defined for the specified custom + model. The array is empty if no prompts are defined for the custom model. + */ + public var prompts: [Prompt] + + // Map each property name to the key that shall be used for encoding/decoding. + private enum CodingKeys: String, CodingKey { + case prompts = "prompts" + } + +} diff --git a/Sources/TextToSpeechV1/Models/Speaker.swift b/Sources/TextToSpeechV1/Models/Speaker.swift new file mode 100644 index 000000000..816cbff79 --- /dev/null +++ b/Sources/TextToSpeechV1/Models/Speaker.swift @@ -0,0 +1,40 @@ +/** + * (C) Copyright IBM Corp. 2021. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ + +import Foundation + +/** + Information about a speaker model. 
+ */
+public struct Speaker: Codable, Equatable {
+
+    /**
+     The speaker ID (GUID) of the speaker.
+     */
+    public var speakerID: String
+
+    /**
+     The user-defined name of the speaker.
+     */
+    public var name: String
+
+    // Map each property name to the key that shall be used for encoding/decoding.
+    private enum CodingKeys: String, CodingKey {
+        case speakerID = "speaker_id"
+        case name = "name"
+    }
+
+}
diff --git a/Sources/TextToSpeechV1/Models/SpeakerCustomModel.swift b/Sources/TextToSpeechV1/Models/SpeakerCustomModel.swift
new file mode 100644
index 000000000..5f30b7aec
--- /dev/null
+++ b/Sources/TextToSpeechV1/Models/SpeakerCustomModel.swift
@@ -0,0 +1,41 @@
+/**
+ * (C) Copyright IBM Corp. 2021.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ **/
+
+import Foundation
+
+/**
+ A custom model for which the speaker has defined prompts.
+ */
+public struct SpeakerCustomModel: Codable, Equatable {
+
+    /**
+     The customization ID (GUID) of a custom model for which the speaker has defined one or more prompts.
+     */
+    public var customizationID: String
+
+    /**
+     An array of `SpeakerPrompt` objects that provides information about each prompt that the user has defined for the
+     custom model.
+     */
+    public var prompts: [SpeakerPrompt]
+
+    // Map each property name to the key that shall be used for encoding/decoding.
+    private enum CodingKeys: String, CodingKey {
+        case customizationID = "customization_id"
+        case prompts = "prompts"
+    }
+
+}
diff --git a/Sources/TextToSpeechV1/Models/SpeakerCustomModels.swift b/Sources/TextToSpeechV1/Models/SpeakerCustomModels.swift
new file mode 100644
index 000000000..f22e9de29
--- /dev/null
+++ b/Sources/TextToSpeechV1/Models/SpeakerCustomModels.swift
@@ -0,0 +1,36 @@
+/**
+ * (C) Copyright IBM Corp. 2021.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ **/
+
+import Foundation
+
+/**
+ Custom models for which the speaker has defined prompts.
+ */
+public struct SpeakerCustomModels: Codable, Equatable {
+
+    /**
+     An array of `SpeakerCustomModel` objects. Each object provides information about the prompts that are defined for a
+     specified speaker in the custom models that are owned by a specified service instance. The array is empty if no
+     prompts are defined for the speaker.
+     */
+    public var customizations: [SpeakerCustomModel]
+
+    // Map each property name to the key that shall be used for encoding/decoding.
+ private enum CodingKeys: String, CodingKey { + case customizations = "customizations" + } + +} diff --git a/Sources/TextToSpeechV1/Models/SpeakerModel.swift b/Sources/TextToSpeechV1/Models/SpeakerModel.swift new file mode 100644 index 000000000..0ccfb6800 --- /dev/null +++ b/Sources/TextToSpeechV1/Models/SpeakerModel.swift @@ -0,0 +1,34 @@ +/** + * (C) Copyright IBM Corp. 2021. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ + +import Foundation + +/** + The speaker ID of the speaker model. + */ +public struct SpeakerModel: Codable, Equatable { + + /** + The speaker ID (GUID) of the speaker model. + */ + public var speakerID: String + + // Map each property name to the key that shall be used for encoding/decoding. + private enum CodingKeys: String, CodingKey { + case speakerID = "speaker_id" + } + +} diff --git a/Sources/TextToSpeechV1/Models/SpeakerPrompt.swift b/Sources/TextToSpeechV1/Models/SpeakerPrompt.swift new file mode 100644 index 000000000..29943a14d --- /dev/null +++ b/Sources/TextToSpeechV1/Models/SpeakerPrompt.swift @@ -0,0 +1,58 @@ +/** + * (C) Copyright IBM Corp. 2021. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ + +import Foundation + +/** + A prompt that a speaker has defined for a custom model. + */ +public struct SpeakerPrompt: Codable, Equatable { + + /** + The user-specified text of the prompt. + */ + public var prompt: String + + /** + The user-specified identifier (name) of the prompt. + */ + public var promptID: String + + /** + The status of the prompt: + * `processing`: The service received the request to add the prompt and is analyzing the validity of the prompt. + * `available`: The service successfully validated the prompt, which is now ready for use in a speech synthesis + request. + * `failed`: The service's validation of the prompt failed. The status of the prompt includes an `error` field that + describes the reason for the failure. + */ + public var status: String + + /** + If the status of the prompt is `failed`, an error message that describes the reason for the failure. The field is + omitted if no error occurred. + */ + public var error: String? + + // Map each property name to the key that shall be used for encoding/decoding. 
+    private enum CodingKeys: String, CodingKey {
+        case prompt = "prompt"
+        case promptID = "prompt_id"
+        case status = "status"
+        case error = "error"
+    }
+
+}
diff --git a/Sources/TextToSpeechV1/Models/Speakers.swift b/Sources/TextToSpeechV1/Models/Speakers.swift
new file mode 100644
index 000000000..6be879634
--- /dev/null
+++ b/Sources/TextToSpeechV1/Models/Speakers.swift
@@ -0,0 +1,35 @@
+/**
+ * (C) Copyright IBM Corp. 2021.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ **/
+
+import Foundation
+
+/**
+ Information about all speaker models for the service instance.
+ */
+public struct Speakers: Codable, Equatable {
+
+    /**
+     An array of `Speaker` objects that provides information about the speakers for the service instance. The array is
+     empty if the service instance has no speakers.
+     */
+    public var speakers: [Speaker]
+
+    // Map each property name to the key that shall be used for encoding/decoding.
+    private enum CodingKeys: String, CodingKey {
+        case speakers = "speakers"
+    }
+
+}
diff --git a/Sources/TextToSpeechV1/TextToSpeech.swift b/Sources/TextToSpeechV1/TextToSpeech.swift
index 0927e134c..17d2c682b 100644
--- a/Sources/TextToSpeechV1/TextToSpeech.swift
+++ b/Sources/TextToSpeechV1/TextToSpeech.swift
@@ -1,5 +1,5 @@
 /**
- * (C) Copyright IBM Corp. 2016, 2020.
+ * (C) Copyright IBM Corp. 2021.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -15,7 +15,7 @@
 **/
 /**
- * IBM OpenAPI SDK Code Generator Version: 99-SNAPSHOT-be3b4618-20201221-123327
+ * IBM OpenAPI SDK Code Generator Version: 99-SNAPSHOT-902c9336-20210507-162723
 **/
 // swiftlint:disable file_length
@@ -41,6 +41,9 @@ public typealias WatsonResponse = RestResponse
 translation is based on the SSML phoneme format for representing a word. You can specify a phonetic translation in
 standard International Phonetic Alphabet (IPA) representation or in the proprietary IBM Symbolic Phonetic
 Representation (SPR). The Arabic, Chinese, Dutch, Australian English, and Korean languages support only IPA.
+ The service also offers a Tune by Example feature that lets you define custom prompts. You can also define speaker
+ models to improve the quality of your custom prompts. The service supports custom prompts only for US English custom
+ models and voices.
 */
 public class TextToSpeech {
@@ -601,9 +604,9 @@ public class TextToSpeech {
 List custom models.
 Lists metadata such as the name and description for all custom models that are owned by an instance of the service.
- Specify a language to list the custom models for that language only. To see the words in addition to the metadata
- for a specific custom model, use the **List a custom model** method. You must use credentials for the instance of
- the service that owns a model to list information about it.
+ Specify a language to list the custom models for that language only.
To see the words and prompts in addition to + the metadata for a specific custom model, use the **Get a custom model** method. You must use credentials for the + instance of the service that owns a model to list information about it. **See also:** [Querying all custom models](https://cloud.ibm.com/docs/text-to-speech?topic=text-to-speech-customModels#cuModelsQueryAll). @@ -753,8 +756,9 @@ public class TextToSpeech { Get a custom model. Gets all information about a specified custom model. In addition to metadata such as the name and description of - the custom model, the output includes the words and their translations as defined in the model. To see just the - metadata for a model, use the **List custom models** method. + the custom model, the output includes the words and their translations that are defined for the model, as well as + any prompts that are defined for the model. To see just the metadata for a model, use the **List custom models** + method. **See also:** [Querying a custom model](https://cloud.ibm.com/docs/text-to-speech?topic=text-to-speech-customModels#cuModelsQuery). @@ -1204,6 +1208,578 @@ public class TextToSpeech { request.response(completionHandler: completionHandler) } + /** + List custom prompts. + + Lists information about all custom prompts that are defined for a custom model. The information includes the prompt + ID, prompt text, status, and optional speaker ID for each prompt of the custom model. You must use credentials for + the instance of the service that owns the custom model. The same information about all of the prompts for a custom + model is also provided by the **Get a custom model** method. That method provides complete details about a + specified custom model, including its language, owner, custom words, and more. + **Beta:** Custom prompts are beta functionality that is supported only for use with US English custom models and + voices. + **See also:** [Listing custom + prompts](https://cloud.ibm.com/docs/text-to-speech?topic=text-to-speech-tbe-custom-prompts#tbe-custom-prompts-list). + + - parameter customizationID: The customization ID (GUID) of the custom model. You must make the request with + credentials for the instance of the service that owns the custom model. + - parameter headers: A dictionary of request headers to be sent with this request. + - parameter completionHandler: A function executed when the request completes with a successful result or error + */ + public func listCustomPrompts( + customizationID: String, + headers: [String: String]? = nil, + completionHandler: @escaping (WatsonResponse?, WatsonError?) 
-> Void) + { + // construct header parameters + var headerParameters = defaultHeaders + let sdkHeaders = Shared.getSDKHeaders(serviceName: serviceName, serviceVersion: serviceVersion, methodName: "listCustomPrompts") + headerParameters.merge(sdkHeaders) { (_, new) in new } + headerParameters["Accept"] = "application/json" + if let headers = headers { + headerParameters.merge(headers) { (_, new) in new } + } + + // construct REST request + let path = "/v1/customizations/\(customizationID)/prompts" + guard let encodedPath = path.addingPercentEncoding(withAllowedCharacters: .urlPathAllowed) else { + completionHandler(nil, RestError.urlEncoding(path: path)) + return + } + + // ensure that serviceURL is set + guard let serviceEndpoint = serviceURL else { + completionHandler(nil, RestError.noEndpoint) + return + } + + let request = RestRequest( + session: session, + authenticator: authenticator, + errorResponseDecoder: errorResponseDecoder, + method: "GET", + url: serviceEndpoint + encodedPath, + headerParameters: headerParameters + ) + + // execute REST request + request.responseObject(completionHandler: completionHandler) + } + + /** + Add a custom prompt. + + Adds a custom prompt to a custom model. A prompt is defined by the text that is to be spoken, the audio for that + text, a unique user-specified ID for the prompt, and an optional speaker ID. The information is used to generate + prosodic data that is not visible to the user. This data is used by the service to produce the synthesized audio + upon request. You must use credentials for the instance of the service that owns a custom model to add a prompt to + it. You can add a maximum of 1000 custom prompts to a single custom model. + You are recommended to assign meaningful values for prompt IDs. For example, use `goodbye` to identify a prompt + that speaks a farewell message. Prompt IDs must be unique within a given custom model. You cannot define two + prompts with the same name for the same custom model. If you provide the ID of an existing prompt, the previously + uploaded prompt is replaced by the new information. The existing prompt is reprocessed by using the new text and + audio and, if provided, new speaker model, and the prosody data associated with the prompt is updated. + The quality of a prompt is undefined if the language of a prompt does not match the language of its custom model. + This is consistent with any text or SSML that is specified for a speech synthesis request. The service makes a + best-effort attempt to render the specified text for the prompt; it does not validate that the language of the text + matches the language of the model. + Adding a prompt is an asynchronous operation. Although it accepts less audio than speaker enrollment, the service + must align the audio with the provided text. The time that it takes to process a prompt depends on the prompt + itself. The processing time for a reasonably sized prompt generally matches the length of the audio (for example, + it takes 20 seconds to process a 20-second prompt). + For shorter prompts, you can wait for a reasonable amount of time and then check the status of the prompt with the + **Get a custom prompt** method. For longer prompts, consider using that method to poll the service every few + seconds to determine when the prompt becomes available. No prompt can be used for speech synthesis if it is in the + `processing` or `failed` state. Only prompts that are in the `available` state can be used for speech synthesis. 
+ When it processes a request, the service attempts to align the text and the audio that are provided for the prompt. + The text that is passed with a prompt must match the spoken audio as closely as possible. Optimally, the text and + audio match exactly. The service does its best to align the specified text with the audio, and it can often + compensate for mismatches between the two. But if the service cannot effectively align the text and the audio, + possibly because the magnitude of mismatches between the two is too great, processing of the prompt fails. + ### Evaluating a prompt + Always listen to and evaluate a prompt to determine its quality before using it in production. To evaluate a + prompt, include only the single prompt in a speech synthesis request by using the following SSML extension, in this + case for a prompt whose ID is `goodbye`: + `` + In some cases, you might need to rerecord and resubmit a prompt as many as five times to address the following + possible problems: + * The service might fail to detect a mismatch between the prompt’s text and audio. The longer the prompt, the + greater the chance for misalignment between its text and audio. Therefore, multiple shorter prompts are preferable + to a single long prompt. + * The text of a prompt might include a word that the service does not recognize. In this case, you can create a + custom word and pronunciation pair to tell the service how to pronounce the word. You must then re-create the + prompt. + * The quality of the input audio might be insufficient or the service’s processing of the audio might fail to + detect the intended prosody. Submitting new audio for the prompt can correct these issues. + If a prompt that is created without a speaker ID does not adequately reflect the intended prosody, enrolling the + speaker and providing a speaker ID for the prompt is one recommended means of potentially improving the quality of + the prompt. This is especially important for shorter prompts such as "good-bye" or "thank you," where less audio + data makes it more difficult to match the prosody of the speaker. + **Beta:** Custom prompts are beta functionality that is supported only for use with US English custom models and + voices. + **See also:** + * [Add a custom + prompt](https://cloud.ibm.com/docs/text-to-speech?topic=text-to-speech-tbe-create#tbe-create-add-prompt) + * [Evaluate a custom + prompt](https://cloud.ibm.com/docs/text-to-speech?topic=text-to-speech-tbe-create#tbe-create-evaluate-prompt) + * [Rules for creating custom + prompts](https://cloud.ibm.com/docs/text-to-speech?topic=text-to-speech-tbe-rules#tbe-rules-prompts). + + - parameter customizationID: The customization ID (GUID) of the custom model. You must make the request with + credentials for the instance of the service that owns the custom model. + - parameter promptID: The identifier of the prompt that is to be added to the custom model: + * Include a maximum of 49 characters in the ID. + * Include only alphanumeric characters and `_` (underscores) in the ID. + * Do not include XML sensitive characters (double quotes, single quotes, ampersands, angle brackets, and slashes) + in the ID. + * To add a new prompt, the ID must be unique for the specified custom model. Otherwise, the new information for + the prompt overwrites the existing prompt that has that ID. + - parameter metadata: Information about the prompt that is to be added to a custom model. 
The following example + of a `PromptMetadata` object includes both the required prompt text and an optional speaker model ID: + `{ "prompt_text": "Thank you and good-bye!", "speaker_id": "823068b2-ed4e-11ea-b6e0-7b6456aa95cc" }`. + - parameter file: An audio file that speaks the text of the prompt with intonation and prosody that matches how + you would like the prompt to be spoken. + * The prompt audio must be in WAV format and must have a minimum sampling rate of 16 kHz. The service accepts + audio with higher sampling rates. The service transcodes all audio to 16 kHz before processing it. + * The length of the prompt audio is limited to 30 seconds. + - parameter filename: The filename for file. + - parameter headers: A dictionary of request headers to be sent with this request. + - parameter completionHandler: A function executed when the request completes with a successful result or error + */ + public func addCustomPrompt( + customizationID: String, + promptID: String, + metadata: PromptMetadata, + file: Data, + filename: String, + headers: [String: String]? = nil, + completionHandler: @escaping (WatsonResponse?, WatsonError?) -> Void) + { + // construct body + let multipartFormData = MultipartFormData() + guard let metadataJSON = try? JSON.encoder.encode(metadata) else { + completionHandler(nil, RestError.serialization(values: "metadata")) + return + } + multipartFormData.append(metadataJSON, withName: "metadata", mimeType: "application/json") + + multipartFormData.append(file, withName: "file", mimeType: "audio/wav", fileName: filename) + guard let body = try? multipartFormData.toData() else { + completionHandler(nil, RestError.serialization(values: "request multipart form data")) + return + } + + // construct header parameters + var headerParameters = defaultHeaders + let sdkHeaders = Shared.getSDKHeaders(serviceName: serviceName, serviceVersion: serviceVersion, methodName: "addCustomPrompt") + headerParameters.merge(sdkHeaders) { (_, new) in new } + headerParameters["Accept"] = "application/json" + headerParameters["Content-Type"] = multipartFormData.contentType + if let headers = headers { + headerParameters.merge(headers) { (_, new) in new } + } + + // construct REST request + let path = "/v1/customizations/\(customizationID)/prompts/\(promptID)" + guard let encodedPath = path.addingPercentEncoding(withAllowedCharacters: .urlPathAllowed) else { + completionHandler(nil, RestError.urlEncoding(path: path)) + return + } + + // ensure that serviceURL is set + guard let serviceEndpoint = serviceURL else { + completionHandler(nil, RestError.noEndpoint) + return + } + + let request = RestRequest( + session: session, + authenticator: authenticator, + errorResponseDecoder: errorResponseDecoder, + method: "POST", + url: serviceEndpoint + encodedPath, + headerParameters: headerParameters, + messageBody: body + ) + + // execute REST request + request.responseObject(completionHandler: completionHandler) + } + + /** + Get a custom prompt. + + Gets information about a specified custom prompt for a specified custom model. The information includes the prompt + ID, prompt text, status, and optional speaker ID for each prompt of the custom model. You must use credentials for + the instance of the service that owns the custom model. + **Beta:** Custom prompts are beta functionality that is supported only for use with US English custom models and + voices. 
+ **See also:** [Listing custom + prompts](https://cloud.ibm.com/docs/text-to-speech?topic=text-to-speech-tbe-custom-prompts#tbe-custom-prompts-list). + + - parameter customizationID: The customization ID (GUID) of the custom model. You must make the request with + credentials for the instance of the service that owns the custom model. + - parameter promptID: The identifier (name) of the prompt. + - parameter headers: A dictionary of request headers to be sent with this request. + - parameter completionHandler: A function executed when the request completes with a successful result or error + */ + public func getCustomPrompt( + customizationID: String, + promptID: String, + headers: [String: String]? = nil, + completionHandler: @escaping (WatsonResponse?, WatsonError?) -> Void) + { + // construct header parameters + var headerParameters = defaultHeaders + let sdkHeaders = Shared.getSDKHeaders(serviceName: serviceName, serviceVersion: serviceVersion, methodName: "getCustomPrompt") + headerParameters.merge(sdkHeaders) { (_, new) in new } + headerParameters["Accept"] = "application/json" + if let headers = headers { + headerParameters.merge(headers) { (_, new) in new } + } + + // construct REST request + let path = "/v1/customizations/\(customizationID)/prompts/\(promptID)" + guard let encodedPath = path.addingPercentEncoding(withAllowedCharacters: .urlPathAllowed) else { + completionHandler(nil, RestError.urlEncoding(path: path)) + return + } + + // ensure that serviceURL is set + guard let serviceEndpoint = serviceURL else { + completionHandler(nil, RestError.noEndpoint) + return + } + + let request = RestRequest( + session: session, + authenticator: authenticator, + errorResponseDecoder: errorResponseDecoder, + method: "GET", + url: serviceEndpoint + encodedPath, + headerParameters: headerParameters + ) + + // execute REST request + request.responseObject(completionHandler: completionHandler) + } + + /** + Delete a custom prompt. + + Deletes an existing custom prompt from a custom model. The service deletes the prompt with the specified ID. You + must use credentials for the instance of the service that owns the custom model from which the prompt is to be + deleted. + **Caution:** Deleting a custom prompt elicits a 400 response code from synthesis requests that attempt to use the + prompt. Make sure that you do not attempt to use a deleted prompt in a production application. + **Beta:** Custom prompts are beta functionality that is supported only for use with US English custom models and + voices. + **See also:** [Deleting a custom + prompt](https://cloud.ibm.com/docs/text-to-speech?topic=text-to-speech-tbe-custom-prompts#tbe-custom-prompts-delete). + + - parameter customizationID: The customization ID (GUID) of the custom model. You must make the request with + credentials for the instance of the service that owns the custom model. + - parameter promptID: The identifier (name) of the prompt that is to be deleted. + - parameter headers: A dictionary of request headers to be sent with this request. + - parameter completionHandler: A function executed when the request completes with a successful result or error + */ + public func deleteCustomPrompt( + customizationID: String, + promptID: String, + headers: [String: String]? = nil, + completionHandler: @escaping (WatsonResponse?, WatsonError?) 
-> Void) + { + // construct header parameters + var headerParameters = defaultHeaders + let sdkHeaders = Shared.getSDKHeaders(serviceName: serviceName, serviceVersion: serviceVersion, methodName: "deleteCustomPrompt") + headerParameters.merge(sdkHeaders) { (_, new) in new } + if let headers = headers { + headerParameters.merge(headers) { (_, new) in new } + } + + // construct REST request + let path = "/v1/customizations/\(customizationID)/prompts/\(promptID)" + guard let encodedPath = path.addingPercentEncoding(withAllowedCharacters: .urlPathAllowed) else { + completionHandler(nil, RestError.urlEncoding(path: path)) + return + } + + // ensure that serviceURL is set + guard let serviceEndpoint = serviceURL else { + completionHandler(nil, RestError.noEndpoint) + return + } + + let request = RestRequest( + session: session, + authenticator: authenticator, + errorResponseDecoder: errorResponseDecoder, + method: "DELETE", + url: serviceEndpoint + encodedPath, + headerParameters: headerParameters + ) + + // execute REST request + request.response(completionHandler: completionHandler) + } + + /** + List speaker models. + + Lists information about all speaker models that are defined for a service instance. The information includes the + speaker ID and speaker name of each defined speaker. You must use credentials for the instance of a service to list + its speakers. + **Beta:** Speaker models and the custom prompts with which they are used are beta functionality that is supported + only for use with US English custom models and voices. + **See also:** [Listing speaker + models](https://cloud.ibm.com/docs/text-to-speech?topic=text-to-speech-tbe-speaker-models#tbe-speaker-models-list). + + - parameter headers: A dictionary of request headers to be sent with this request. + - parameter completionHandler: A function executed when the request completes with a successful result or error + */ + public func listSpeakerModels( + headers: [String: String]? = nil, + completionHandler: @escaping (WatsonResponse?, WatsonError?) -> Void) + { + // construct header parameters + var headerParameters = defaultHeaders + let sdkHeaders = Shared.getSDKHeaders(serviceName: serviceName, serviceVersion: serviceVersion, methodName: "listSpeakerModels") + headerParameters.merge(sdkHeaders) { (_, new) in new } + headerParameters["Accept"] = "application/json" + if let headers = headers { + headerParameters.merge(headers) { (_, new) in new } + } + + // construct REST request + + // ensure that serviceURL is set + guard let serviceEndpoint = serviceURL else { + completionHandler(nil, RestError.noEndpoint) + return + } + + let request = RestRequest( + session: session, + authenticator: authenticator, + errorResponseDecoder: errorResponseDecoder, + method: "GET", + url: serviceEndpoint + "/v1/speakers", + headerParameters: headerParameters + ) + + // execute REST request + request.responseObject(completionHandler: completionHandler) + } + + /** + Create a speaker model. + + Creates a new speaker model, which is an optional enrollment token for users who are to add prompts to custom + models. A speaker model contains information about a user's voice. The service extracts this information from a WAV + audio sample that you pass as the body of the request. Associating a speaker model with a prompt is optional, but + the information that is extracted from the speaker model helps the service learn about the speaker's voice. 
+     A speaker model can make an appreciable difference in the quality of prompts, especially short prompts with
+     relatively little audio, that are associated with that speaker. A speaker model can help the service produce a
+     prompt with more confidence; the lack of a speaker model can potentially compromise the quality of a prompt.
+     The gender of the speaker who creates a speaker model does not need to match the gender of a voice that is used
+     with prompts that are associated with that speaker model. For example, a speaker model that is created by a male
+     speaker can be associated with prompts that are spoken by female voices.
+     You create a speaker model for a given instance of the service. The new speaker model is owned by the service
+     instance whose credentials are used to create it. That same speaker can then be used to create prompts for all
+     custom models within that service instance. No language is associated with a speaker model, but each custom model
+     has a single specified language. You can add prompts only to US English models.
+     You specify a name for the speaker when you create it. The name must be unique among all speaker names for the
+     owning service instance. To re-create a speaker model for an existing speaker name, you must first delete the
+     existing speaker model that has that name.
+     Speaker enrollment is a synchronous operation. Although it accepts more audio data than a prompt, the process of
+     adding a speaker is very fast. The service simply extracts information about the speaker’s voice from the audio.
+     Unlike prompts, speaker models neither need nor accept a transcription of the audio. When the call returns, the
+     audio is fully processed and the speaker enrollment is complete.
+     The service returns a speaker ID in its response to the request. A speaker ID is a globally unique identifier
+     (GUID) that you use to identify the speaker in subsequent requests to the service.
+     **Beta:** Speaker models and the custom prompts with which they are used are beta functionality that is supported
+     only for use with US English custom models and voices.
+     **See also:**
+     * [Create a speaker
+     model](https://cloud.ibm.com/docs/text-to-speech?topic=text-to-speech-tbe-create#tbe-create-speaker-model)
+     * [Rules for creating speaker
+     models](https://cloud.ibm.com/docs/text-to-speech?topic=text-to-speech-tbe-rules#tbe-rules-speakers).
+
+     - parameter speakerName: The name of the speaker that is to be added to the service instance.
+       * Include a maximum of 49 characters in the name.
+       * Include only alphanumeric characters and `_` (underscores) in the name.
+       * Do not include XML sensitive characters (double quotes, single quotes, ampersands, angle brackets, and slashes)
+       in the name.
+       * Do not use the name of an existing speaker that is already defined for the service instance.
+     - parameter audio: An enrollment audio file that contains a sample of the speaker’s voice.
+       * The enrollment audio must be in WAV format and must have a minimum sampling rate of 16 kHz. The service accepts
+       audio with higher sampling rates. It transcodes all audio to 16 kHz before processing it.
+       * The length of the enrollment audio is limited to 1 minute. Speaking one or two paragraphs of text that include
+       five to ten sentences is recommended.
+     - parameter headers: A dictionary of request headers to be sent with this request.
+ - parameter completionHandler: A function executed when the request completes with a successful result or error + */ + public func createSpeakerModel( + speakerName: String, + audio: Data, + headers: [String: String]? = nil, + completionHandler: @escaping (WatsonResponse?, WatsonError?) -> Void) + { + let body = audio + + // construct header parameters + var headerParameters = defaultHeaders + let sdkHeaders = Shared.getSDKHeaders(serviceName: serviceName, serviceVersion: serviceVersion, methodName: "createSpeakerModel") + headerParameters.merge(sdkHeaders) { (_, new) in new } + headerParameters["Accept"] = "application/json" + headerParameters["Content-Type"] = "audio/wav" + if let headers = headers { + headerParameters.merge(headers) { (_, new) in new } + } + + // construct query parameters + var queryParameters = [URLQueryItem]() + queryParameters.append(URLQueryItem(name: "speaker_name", value: speakerName)) + + // construct REST request + + // ensure that serviceURL is set + guard let serviceEndpoint = serviceURL else { + completionHandler(nil, RestError.noEndpoint) + return + } + + let request = RestRequest( + session: session, + authenticator: authenticator, + errorResponseDecoder: errorResponseDecoder, + method: "POST", + url: serviceEndpoint + "/v1/speakers", + headerParameters: headerParameters, + queryItems: queryParameters, + messageBody: body + ) + + // execute REST request + request.responseObject(completionHandler: completionHandler) + } + + /** + Get a speaker model. + + Gets information about all prompts that are defined by a specified speaker for all custom models that are owned by + a service instance. The information is grouped by the customization IDs of the custom models. For each custom + model, the information lists information about each prompt that is defined for that custom model by the speaker. + You must use credentials for the instance of the service that owns a speaker model to list its prompts. + **Beta:** Speaker models and the custom prompts with which they are used are beta functionality that is supported + only for use with US English custom models and voices. + **See also:** [Listing the custom prompts for a speaker + model](https://cloud.ibm.com/docs/text-to-speech?topic=text-to-speech-tbe-speaker-models#tbe-speaker-models-list-prompts). + + - parameter speakerID: The speaker ID (GUID) of the speaker model. You must make the request with service + credentials for the instance of the service that owns the speaker model. + - parameter headers: A dictionary of request headers to be sent with this request. + - parameter completionHandler: A function executed when the request completes with a successful result or error + */ + public func getSpeakerModel( + speakerID: String, + headers: [String: String]? = nil, + completionHandler: @escaping (WatsonResponse?, WatsonError?) 
-> Void) + { + // construct header parameters + var headerParameters = defaultHeaders + let sdkHeaders = Shared.getSDKHeaders(serviceName: serviceName, serviceVersion: serviceVersion, methodName: "getSpeakerModel") + headerParameters.merge(sdkHeaders) { (_, new) in new } + headerParameters["Accept"] = "application/json" + if let headers = headers { + headerParameters.merge(headers) { (_, new) in new } + } + + // construct REST request + let path = "/v1/speakers/\(speakerID)" + guard let encodedPath = path.addingPercentEncoding(withAllowedCharacters: .urlPathAllowed) else { + completionHandler(nil, RestError.urlEncoding(path: path)) + return + } + + // ensure that serviceURL is set + guard let serviceEndpoint = serviceURL else { + completionHandler(nil, RestError.noEndpoint) + return + } + + let request = RestRequest( + session: session, + authenticator: authenticator, + errorResponseDecoder: errorResponseDecoder, + method: "GET", + url: serviceEndpoint + encodedPath, + headerParameters: headerParameters + ) + + // execute REST request + request.responseObject(completionHandler: completionHandler) + } + + /** + Delete a speaker model. + + Deletes an existing speaker model from the service instance. The service deletes the enrolled speaker with the + specified speaker ID. You must use credentials for the instance of the service that owns a speaker model to delete + the speaker. + Any prompts that are associated with the deleted speaker are not affected by the speaker's deletion. The prosodic + data that defines the quality of a prompt is established when the prompt is created. A prompt is static and remains + unaffected by deletion of its associated speaker. However, the prompt cannot be resubmitted or updated with its + original speaker once that speaker is deleted. + **Beta:** Speaker models and the custom prompts with which they are used are beta functionality that is supported + only for use with US English custom models and voices. + **See also:** [Deleting a speaker + model](https://cloud.ibm.com/docs/text-to-speech?topic=text-to-speech-tbe-speaker-models#tbe-speaker-models-delete). + + - parameter speakerID: The speaker ID (GUID) of the speaker model. You must make the request with service + credentials for the instance of the service that owns the speaker model. + - parameter headers: A dictionary of request headers to be sent with this request. + - parameter completionHandler: A function executed when the request completes with a successful result or error + */ + public func deleteSpeakerModel( + speakerID: String, + headers: [String: String]? = nil, + completionHandler: @escaping (WatsonResponse?, WatsonError?) 
-> Void) + { + // construct header parameters + var headerParameters = defaultHeaders + let sdkHeaders = Shared.getSDKHeaders(serviceName: serviceName, serviceVersion: serviceVersion, methodName: "deleteSpeakerModel") + headerParameters.merge(sdkHeaders) { (_, new) in new } + if let headers = headers { + headerParameters.merge(headers) { (_, new) in new } + } + + // construct REST request + let path = "/v1/speakers/\(speakerID)" + guard let encodedPath = path.addingPercentEncoding(withAllowedCharacters: .urlPathAllowed) else { + completionHandler(nil, RestError.urlEncoding(path: path)) + return + } + + // ensure that serviceURL is set + guard let serviceEndpoint = serviceURL else { + completionHandler(nil, RestError.noEndpoint) + return + } + + let request = RestRequest( + session: session, + authenticator: authenticator, + errorResponseDecoder: errorResponseDecoder, + method: "DELETE", + url: serviceEndpoint + encodedPath, + headerParameters: headerParameters + ) + + // execute REST request + request.response(completionHandler: completionHandler) + } + /** Delete labeled data. diff --git a/Tests/TextToSpeechV1Tests/TextToSpeechTests.swift b/Tests/TextToSpeechV1Tests/TextToSpeechTests.swift index 064f8c105..8fe9984d8 100644 --- a/Tests/TextToSpeechV1Tests/TextToSpeechTests.swift +++ b/Tests/TextToSpeechV1Tests/TextToSpeechTests.swift @@ -754,7 +754,7 @@ class TextToSpeechTests: XCTestCase { expectation4.fulfill() return } - XCTAssertNotNil(response?.result) + XCTAssertNotNil(response) expectation4.fulfill() } waitForExpectations()
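For reviewers who want to exercise the new Tune by Example endpoints end to end, here is a minimal usage sketch that chains the `createSpeakerModel` and `addCustomPrompt` methods added above. It is illustrative only: the API key, service URL, customization ID, speaker name, and WAV file paths are placeholders, and the `WatsonIAMAuthenticator` setup and the `WatsonResponse<SpeakerModel>` / `WatsonResponse<Prompt>` result types are assumed to follow the SDK's established pattern rather than being shown verbatim in this diff.

```swift
import Foundation
import TextToSpeechV1

// Placeholder credentials and endpoint; substitute real values for your service instance.
let authenticator = WatsonIAMAuthenticator(apiKey: "{apikey}")
let textToSpeech = TextToSpeech(authenticator: authenticator)
textToSpeech.serviceURL = "{url}"

let customizationID = "{customization_id}"  // a US English custom model (placeholder)

// Assumed local sample files: enrollment audio and prompt audio, both 16 kHz WAV.
guard
    let speakerAudio = FileManager.default.contents(atPath: "speaker.wav"),
    let promptAudio = FileManager.default.contents(atPath: "goodbye.wav")
else {
    fatalError("expected speaker.wav and goodbye.wav in the working directory")
}

// 1. Enroll the speaker. Enrollment is synchronous on the service side, so the
//    speaker ID is available as soon as the completion handler runs.
textToSpeech.createSpeakerModel(speakerName: "elizabeth", audio: speakerAudio) { response, error in
    guard let speaker = response?.result else {
        print(error?.localizedDescription ?? "unknown error")
        return
    }

    // 2. Add a prompt associated with the new speaker model. Prompt processing is
    //    asynchronous, so the returned status is typically `processing`.
    let metadata = PromptMetadata(promptText: "Thank you and good-bye!", speakerID: speaker.speakerID)
    textToSpeech.addCustomPrompt(
        customizationID: customizationID,
        promptID: "goodbye",
        metadata: metadata,
        file: promptAudio,
        filename: "goodbye.wav"
    ) { response, error in
        guard let prompt = response?.result else {
            print(error?.localizedDescription ?? "unknown error")
            return
        }
        print("prompt \(prompt.promptID) status: \(prompt.status)")
        // 3. Poll getCustomPrompt(customizationID:promptID:) until the status is
        //    `available` before using the prompt in a synthesis request.
    }
}

// In a command-line target, keep the process alive until the callbacks fire,
// for example with RunLoop.current.run() or a DispatchSemaphore.
```

The flow mirrors the guidance in the new doc comments: enroll the speaker first, pass its `speaker_id` with each prompt that should use it, and poll the prompt until its status is `available` before relying on it for synthesis.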