Add whisper (#29)

SwiftBeta · Nov 14, 2023 · c0c7b6b · c0c7b6b
1 parent b37189e
commit c0c7b6b
Show file tree

Hide file tree

Showing 7 changed files with 252 additions and 2 deletions.
diff --git a/README.md b/README.md
@@ -70,6 +70,56 @@ struct Config {
 var openAI = SwiftOpenAI(apiKey: Config.openAIKey)
 ```
 
+## [Audio Text To Speech](https://platform.openai.com/docs/api-reference/audio/createSpeech)
+Generates audio from the input text.
+
+```swift
+do {
+    let input = "Hello, I'm SwiftBeta, a developer who in his free time tries to teach through his blog swiftbeta.com and his YouTube channel. Now I'm adding the OpenAI API to transform this text into audio"
+    let data = try await openAI.createSpeech(model: .tts(.tts1), 
+                                             input: input,
+                                             voice: .alloy,
+                                             responseFormat: .mp3,
+                                             speed: 1.0)
+
+    if let filePath = FileManager.default.urls(for: .documentDirectory, in: .userDomainMask).first?.appendingPathComponent("speech.mp3"), let data {
+        do {
+            try data.write(to: filePath)
+            print("Audio file saved: \(filePath)")
+        } catch {
+            print("Error saving Audio file: \(error)")
+        }
+    }
+} catch {
+    print(error.localizedDescription)
+}
+```
+
+## [Audio Transcriptions](https://platform.openai.com/docs/api-reference/audio/createTranscription)
+Transcribes audio into the input language.
+
+```swift
+let fileData = // Data from your video, audio, etc
+let model: OpenAITranscriptionModelType = .whisper
+
+do {
+    for try await newMessage in try await openAI.createTranscription(model: model,
+                                                                    file: fileData,
+                                                                    language: "en",
+                                                                    prompt: "",
+                                                                    responseFormat: .mp3,
+                                                                    temperature: 1.0) {
+        print("Received Transcription \(newMessage)")
+        await MainActor.run {
+            isLoading = false
+            transcription = newMessage.text
+        }
+    }
+} catch {
+    print(error.localizedDescription)
+}
+```
+
 ## [Models](https://platform.openai.com/docs/api-reference/models)
 List and describe the various models available in the API. You can refer to the Models documentation to understand what models are available and the differences between them.
 

diff --git a/Sources/SwiftOpenAI/OpenAI/DataModels/Audio/CreateTranscriptionDataModel.swift b/Sources/SwiftOpenAI/OpenAI/DataModels/Audio/CreateTranscriptionDataModel.swift
@@ -0,0 +1,5 @@
+import Foundation
+
+/// Decoded body of a transcription response.
+public struct CreateTranscriptionDataModel: Decodable {
+    /// Value of the `text` key in the response JSON.
+    public let text: String
+}
diff --git a/Sources/SwiftOpenAI/OpenAI/DataModels/Audio/OpenAITranscriptionModelType.swift b/Sources/SwiftOpenAI/OpenAI/DataModels/Audio/OpenAITranscriptionModelType.swift
@@ -0,0 +1,5 @@
+import Foundation
+
+/// Models that can be selected for a transcription request.
+///
+/// The raw value is the model identifier placed in the request's `model` parameter.
+public enum OpenAITranscriptionModelType: String {
+    /// The `whisper-1` model.
+    case whisper = "whisper-1"
+}
diff --git a/Sources/SwiftOpenAI/OpenAI/OpenAIEndpoints/List/CreateTranscriptionEndpoint.swift b/Sources/SwiftOpenAI/OpenAI/OpenAIEndpoints/List/CreateTranscriptionEndpoint.swift
@@ -0,0 +1,38 @@
+import Foundation
+
+/// Endpoint description for a `POST` to `audio/transcriptions`.
+///
+/// Holds the options of one transcription call and exposes every option except
+/// the audio payload through `parameters`.
+struct CreateTranscriptionEndpoint: Endpoint {
+    private let file: Data
+    private let model: OpenAITranscriptionModelType
+    private let language: String
+    private let prompt: String
+    private let responseFormat: OpenAIAudioResponseType
+    private let temperature: Double
+
+    var path: String = "audio/transcriptions"
+
+    var method: HTTPMethod { .POST }
+
+    /// Creates the endpoint.
+    ///
+    /// - Parameters:
+    ///   - file: Audio payload to transcribe.
+    ///   - model: Transcription model; its raw value becomes the `model` parameter.
+    ///   - language: Value of the `language` parameter. Defaults to `"en"`.
+    ///   - prompt: Value of the `prompt` parameter. Defaults to an empty string.
+    ///   - responseFormat: Its raw value becomes the `response_format` parameter.
+    ///   - temperature: Value of the `temperature` parameter. Defaults to `0.0`.
+    init(file: Data,
+         model: OpenAITranscriptionModelType,
+         language: String = "en",
+         prompt: String = "",
+         responseFormat: OpenAIAudioResponseType,
+         temperature: Double = 0.0) {
+        self.file = file
+        self.model = model
+        self.language = language
+        self.prompt = prompt
+        self.responseFormat = responseFormat
+        self.temperature = temperature
+    }
+
+    /// Non-file request parameters keyed by their wire names.
+    var parameters: [String: Any]? {
+        var values: [String: Any] = [:]
+        values["model"] = model.rawValue
+        values["language"] = language
+        values["prompt"] = prompt
+        values["response_format"] = responseFormat.rawValue
+        values["temperature"] = temperature
+        return values
+    }
+}
diff --git a/Sources/SwiftOpenAI/OpenAI/OpenAIEndpoints/OpenAIEndpoints.swift b/Sources/SwiftOpenAI/OpenAI/OpenAIEndpoints/OpenAIEndpoints.swift
@@ -20,6 +20,8 @@ enum OpenAIEndpoints {
     case moderations(input: String)
 
     case createSpeech(model: OpenAITTSModelType, input: String, voice: OpenAIVoiceType, responseFormat: OpenAIAudioResponseType, speed: Double)
+
+    case createTranscription(file: Data, model: OpenAITranscriptionModelType, language: String, prompt: String, responseFormat: OpenAIAudioResponseType, temperature: Double)
 
     public var endpoint: Endpoint {
         switch self {
@@ -43,7 +45,18 @@ enum OpenAIEndpoints {
         case .moderations(input: let input):
             return ModerationEndpoint(input: input)
         case .createSpeech(model: let model, input: let input, voice: let voice, responseFormat: let responseFormat, speed: let speed):
-            return CreateSpeechEndpoint(model: model, input: input, voice: voice, responseFormat: responseFormat, speed: speed)
+            return CreateSpeechEndpoint(model: model, 
+                                        input: input,
+                                        voice: voice,
+                                        responseFormat: responseFormat,
+                                        speed: speed)
+        case .createTranscription(file: let file, model: let model, language: let language, prompt: let prompt, responseFormat: let responseFormat, temperature: let temperature):
+            return CreateTranscriptionEndpoint(file: file, 
+                                               model: model, 
+                                               language: language, 
+                                               prompt: prompt, 
+                                               responseFormat: responseFormat, 
+                                               temperature: temperature)
         }
     }
 }
diff --git a/Sources/SwiftOpenAI/OpenAI/Requests/Audio/CreateTranscriptionRequest.swift b/Sources/SwiftOpenAI/OpenAI/Requests/Audio/CreateTranscriptionRequest.swift
@@ -0,0 +1,97 @@
+import Foundation
+
+/// Abstraction over the request that uploads audio for transcription.
+protocol CreateTranscriptionRequestProtocol {
+    /// Starts a transcription request and returns a stream of decoded results.
+    ///
+    /// - Parameters:
+    ///   - api: Helper used to route the endpoint and build the `URLRequest`.
+    ///   - apiKey: Key sent as the bearer token.
+    ///   - file: Audio payload to transcribe.
+    ///   - model: Transcription model to use.
+    ///   - language: Language of the audio.
+    ///   - prompt: Optional text to guide the transcription.
+    ///   - responseFormat: Requested response format.
+    ///   - temperature: Sampling temperature.
+    /// - Returns: A stream of `CreateTranscriptionDataModel` values that can also finish with an error.
+    func execute(api: API,
+                 apiKey: String,
+                 file: Data,
+                 model: OpenAITranscriptionModelType,
+                 language: String,
+                 prompt: String,
+                 responseFormat: OpenAIAudioResponseType,
+                 temperature: Double) async throws -> AsyncThrowingStream<CreateTranscriptionDataModel, Error>
+}
+
+/// Uploads an audio file to the `audio/transcriptions` endpoint as a
+/// `multipart/form-data` request and publishes the decoded result through an
+/// `AsyncThrowingStream`.
+///
+/// The instance stores the continuation and the response buffer of the request
+/// in flight, so one instance should drive a single request at a time.
+final public class CreateTranscriptionRequest: NSObject, CreateTranscriptionRequestProtocol {
+    public typealias Init = (_ api: API,
+                             _ apiKey: String,
+                             _ file: Data,
+                             _ model: OpenAITranscriptionModelType,
+                             _ language: String,
+                             _ prompt: String,
+                             _ responseFormat: OpenAIAudioResponseType,
+                             _ temperature: Double) async throws -> AsyncThrowingStream<CreateTranscriptionDataModel, Error>
+
+    private var urlSession: URLSession?
+    private var dataTask: URLSessionDataTask?
+    private var continuation: AsyncThrowingStream<CreateTranscriptionDataModel, Error>.Continuation?
+    /// Response bytes collected so far; the body may be delivered in several delegate callbacks.
+    private var receivedData = Data()
+
+    public override init() {
+        super.init()
+    }
+
+    /// Sends the transcription request and returns a stream that yields the decoded response.
+    ///
+    /// - Parameters:
+    ///   - api: Helper used to route the endpoint and build the `URLRequest`.
+    ///   - apiKey: Key sent in the `Authorization: Bearer` header.
+    ///   - file: Audio payload placed in the `file` part of the multipart body.
+    ///   - model: Transcription model; its raw value is sent in the `model` part.
+    ///   - language: Sent in the `language` part when not empty.
+    ///   - prompt: Sent in the `prompt` part when not empty.
+    ///   - responseFormat: Forwarded to the endpoint description only (see note in the body).
+    ///   - temperature: Sent in the `temperature` part.
+    /// - Returns: A stream that yields one `CreateTranscriptionDataModel` once the whole response
+    ///   has been received, or finishes with the transport or decoding error.
+    public func execute(api: API,
+                        apiKey: String,
+                        file: Data,
+                        model: OpenAITranscriptionModelType,
+                        language: String,
+                        prompt: String,
+                        responseFormat: OpenAIAudioResponseType,
+                        temperature: Double) async throws -> AsyncThrowingStream<CreateTranscriptionDataModel, Error> {
+
+        return AsyncThrowingStream<CreateTranscriptionDataModel, Error> { continuation in
+            self.continuation = continuation
+            self.receivedData = Data()
+
+            var endpoint = OpenAIEndpoints.createTranscription(file: file, model: model, language: language, prompt: prompt, responseFormat: responseFormat, temperature: temperature).endpoint
+            api.routeEndpoint(&endpoint, environment: OpenAIEnvironmentV1())
+
+            let boundary = UUID().uuidString
+
+            var urlRequest = api.buildURLRequest(endpoint: endpoint)
+            api.addHeaders(urlRequest: &urlRequest,
+                           headers: ["Content-Type": "multipart/form-data; boundary=\(boundary)",
+                                     "Authorization": "Bearer \(apiKey)"])
+
+            // Text parts of the form. The model comes from the caller instead of a hard-coded
+            // "whisper-1", and the optional fields are only sent when they carry a value.
+            // NOTE(review): `response_format` is intentionally not sent. The response is decoded
+            // as JSON below, and the cases of `OpenAIAudioResponseType` are not visible here;
+            // confirm they are valid transcription formats before forwarding the value.
+            var fields: [(name: String, value: String)] = [("model", model.rawValue),
+                                                           ("temperature", String(temperature))]
+            if !language.isEmpty {
+                fields.append(("language", language))
+            }
+            if !prompt.isEmpty {
+                fields.append(("prompt", prompt))
+            }
+
+            urlRequest.httpBody = CreateTranscriptionRequest.makeMultipartBody(boundary: boundary,
+                                                                               fields: fields,
+                                                                               file: file)
+
+            // Delegate callbacks append to `receivedData`, so they must not run concurrently.
+            let delegateQueue = OperationQueue()
+            delegateQueue.maxConcurrentOperationCount = 1
+
+            self.urlSession = URLSession(configuration: .default,
+                                         delegate: self,
+                                         delegateQueue: delegateQueue)
+
+            dataTask = urlSession?.dataTask(with: urlRequest)
+            dataTask?.resume()
+        }
+    }
+
+    /// Builds a `multipart/form-data` body with the given text fields followed by the audio file part.
+    private static func makeMultipartBody(boundary: String,
+                                          fields: [(name: String, value: String)],
+                                          file: Data) -> Data {
+        var body = Data()
+
+        for field in fields {
+            body.append(Data("--\(boundary)\r\n".utf8))
+            body.append(Data("Content-Disposition: form-data; name=\"\(field.name)\"\r\n\r\n".utf8))
+            body.append(Data("\(field.value)\r\n".utf8))
+        }
+
+        // File name and content type are fixed placeholders; the real media type of `file` is unknown here.
+        body.append(Data("--\(boundary)\r\n".utf8))
+        body.append(Data("Content-Disposition: form-data; name=\"file\"; filename=\"steve.mp4\"\r\n".utf8))
+        body.append(Data("Content-Type: audio/mpeg\r\n\r\n".utf8))
+        body.append(file)
+        body.append(Data("\r\n".utf8))
+
+        body.append(Data("--\(boundary)--\r\n".utf8))
+
+        return body
+    }
+}
+
+extension CreateTranscriptionRequest: URLSessionDataDelegate {
+    /// Collects a chunk of the response body; decoding waits until the task completes.
+    public func urlSession(_ session: URLSession, dataTask: URLSessionDataTask, didReceive data: Data) {
+        receivedData.append(data)
+    }
+
+    /// Finishes the stream: with the transport error if there is one, otherwise with the decoded
+    /// response, or with the decoding error when the body is not the expected JSON.
+    public func urlSession(_ session: URLSession, task: URLSessionTask, didCompleteWithError error: Error?) {
+        defer {
+            // A session keeps a strong reference to its delegate until it is invalidated.
+            session.finishTasksAndInvalidate()
+            receivedData = Data()
+        }
+
+        if let error = error {
+            continuation?.finish(throwing: error)
+            return
+        }
+
+        do {
+            let createTranscriptionDataModel = try JSONDecoder().decode(CreateTranscriptionDataModel.self, from: receivedData)
+            continuation?.yield(createTranscriptionDataModel)
+            continuation?.finish()
+        } catch {
+            continuation?.finish(throwing: error)
+        }
+    }
+}
diff --git a/Sources/SwiftOpenAI/OpenAI/SwiftOpenAI.swift b/Sources/SwiftOpenAI/OpenAI/SwiftOpenAI.swift
@@ -22,6 +22,8 @@ protocol OpenAIProtocol {
     func moderations(input: String) async throws -> ModerationDataModel?
 
     func createSpeech(model: OpenAITTSModelType, input: String, voice: OpenAIVoiceType, responseFormat: OpenAIAudioResponseType, speed: Double) async throws -> Data?
+
+    func createTranscription(model: OpenAITranscriptionModelType, file: Data, language: String, prompt: String, responseFormat: OpenAIAudioResponseType, temperature: Double) async throws -> AsyncThrowingStream<CreateTranscriptionDataModel, Error>
 }
 
 // swiftlint:disable line_length
@@ -37,6 +39,7 @@ public class SwiftOpenAI: OpenAIProtocol {
     private let embeddingsRequest: EmbeddingsRequest.Init
     private let moderationsRequest: ModerationsRequest.Init
     private let createSpeechRequest: CreateSpeechRequest.Init
+    private let createTranscriptionRequest: CreateTranscriptionRequest.Init
 
     public init(api: API = API(),
                 apiKey: String,
@@ -47,7 +50,8 @@ public class SwiftOpenAI: OpenAIProtocol {
                 createImagesRequest: @escaping CreateImagesRequest.Init = CreateImagesRequest().execute,
                 embeddingsRequest: @escaping EmbeddingsRequest.Init = EmbeddingsRequest().execute,
                 moderationsRequest: @escaping ModerationsRequest.Init = ModerationsRequest().execute,
-                createSpeechRequest: @escaping CreateSpeechRequest.Init = CreateSpeechRequest().execute) {
+                createSpeechRequest: @escaping CreateSpeechRequest.Init = CreateSpeechRequest().execute,
+                createTranscriptionRequest: @escaping CreateTranscriptionRequest.Init = CreateTranscriptionRequest().execute) {
         self.api = api
         self.apiKey = apiKey
         self.listModelsRequest = listModelsRequest
@@ -58,6 +62,7 @@ public class SwiftOpenAI: OpenAIProtocol {
         self.embeddingsRequest = embeddingsRequest
         self.moderationsRequest = moderationsRequest
         self.createSpeechRequest = createSpeechRequest
+        self.createTranscriptionRequest = createTranscriptionRequest
     }
 
     /**
@@ -320,5 +325,42 @@ public class SwiftOpenAI: OpenAIProtocol {
     public func createSpeech(model: OpenAITTSModelType, input: String, voice: OpenAIVoiceType, responseFormat: OpenAIAudioResponseType, speed: Double) async throws -> Data? {
         try await createSpeechRequest(api, apiKey, model, input, voice, responseFormat, speed)
     }
+
+    /**
+      Transcribes audio into text using the OpenAI Transcription API.
+
+      This method forwards its arguments, together with the stored `api` and `apiKey`, to the injected transcription request and returns the stream that request produces.
+
+      - Parameters:
+        - model: An `OpenAITranscriptionModelType` representing the chosen model for transcription.
+        - file: A `Data` object containing the audio file to be transcribed.
+        - language: A `String` specifying the language of the audio content.
+        - prompt: A `String` used to provide any specific instructions or context for the transcription.
+        - responseFormat: An `OpenAIAudioResponseType` indicating the requested response format.
+        - temperature: A `Double` sampling temperature for the transcription.
+
+      - Throws: Any error thrown by the injected transcription request.
+
+      - Returns: An `AsyncThrowingStream` of `CreateTranscriptionDataModel`, providing a stream of transcription results or errors encountered during the process.
+
+      Example usage:
+
+          let audioFileData = // Your audio file data here
+
+          do {
+              let transcriptionStream = try await openAI.createTranscription(model: .whisper, file: audioFileData, language: "en", prompt: "General transcription", responseFormat: .mp3, temperature: 0.5)
+
+              for try await transcription in transcriptionStream {
+                  // Process each transcription result
+                  print(transcription.text)
+              }
+          } catch {
+              print("Error: \(error)")
+          }
+    */
+    public func createTranscription(model: OpenAITranscriptionModelType, file: Data, language: String, prompt: String, responseFormat: OpenAIAudioResponseType, temperature: Double) async throws -> AsyncThrowingStream<CreateTranscriptionDataModel, Error> {
+        try await createTranscriptionRequest(api, apiKey, file, model, language, prompt, responseFormat, temperature)
+    }
 }
 // swiftlint:enable line_length