Skip to content

Commit

Permalink
Add whisper (#29)
Browse files Browse the repository at this point in the history
  • Loading branch information
SwiftBeta authored Nov 14, 2023
1 parent b37189e commit c0c7b6b
Show file tree
Hide file tree
Showing 7 changed files with 252 additions and 2 deletions.
50 changes: 50 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,56 @@ struct Config {
var openAI = SwiftOpenAI(apiKey: Config.openAIKey)
```

## [Audio Text To Speech](https://platform.openai.com/docs/api-reference/audio/createSpeech)
Generates audio from the input text.

```swift
do {
let input = "Hello, I'm SwiftBeta, a developer who in his free time tries to teach through his blog swiftbeta.com and his YouTube channel. Now I'm adding the OpenAI API to transform this text into audio"
let data = try await openAI.createSpeech(model: .tts(.tts1),
input: input,
voice: .alloy,
responseFormat: .mp3,
speed: 1.0)

if let filePath = FileManager.default.urls(for: .documentDirectory, in: .userDomainMask).first?.appendingPathComponent("speech.mp3"), let data {
do {
try data.write(to: filePath)
print("Audio file saved: \(filePath)")
} catch {
print("Error saving Audio file: \(error)")
}
}
} catch {
print(error.localizedDescription)
}
```

## [Audio Transcriptions](https://platform.openai.com/docs/api-reference/audio/createTranscription)
Transcribes audio into the input language.

```swift
let fileData = // Data from your video, audio, etc.
let model: OpenAITranscriptionModelType = .whisper
do {
for try await newMessage in try await openAI.createTranscription(model: model,
file: fileData,
language: "en",
prompt: "",
responseFormat: .mp3,
temperature: 1.0) {
print("Received Transcription \(newMessage)")
await MainActor.run {
isLoading = false
transcription = newMessage.text
}
}
} catch {
print(error.localizedDescription)
}
```

## [Models](https://platform.openai.com/docs/api-reference/models)
List and describe the various models available in the API. You can refer to the Models documentation to understand what models are available and the differences between them.

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
import Foundation

/// Decoded payload of a transcription response.
public struct CreateTranscriptionDataModel: Decodable {
    /// Value read from the `text` key of the decoded body.
    public let text: String

    /// Spelled out explicitly; matches the key the synthesized conformance would use.
    private enum CodingKeys: String, CodingKey {
        case text
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
import Foundation

/// Model identifiers that can be selected for an audio transcription request.
///
/// The raw value of each case is the string form of the model name.
public enum OpenAITranscriptionModelType: String {
/// Raw value `"whisper-1"`.
case whisper = "whisper-1"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import Foundation

/// Endpoint description for the `audio/transcriptions` route.
///
/// Stores the values handed to the initializer and exposes the textual ones
/// through `parameters`. The audio `file` is stored but is not part of `parameters`.
struct CreateTranscriptionEndpoint: Endpoint {
    private let file: Data
    private let model: OpenAITranscriptionModelType
    private let language: String
    private let prompt: String
    private let responseFormat: OpenAIAudioResponseType
    private let temperature: Double

    /// The request is always issued with the POST method.
    var method: HTTPMethod {
        return .POST
    }

    /// Route of this endpoint; kept as a mutable stored property.
    var path: String = "audio/transcriptions"

    /// Creates the endpoint description.
    /// - Parameters:
    ///   - file: Audio data to transcribe.
    ///   - model: Transcription model to use.
    ///   - language: Language value reported under the `language` key. Defaults to `"en"`.
    ///   - prompt: Prompt value reported under the `prompt` key. Defaults to an empty string.
    ///   - responseFormat: Value whose raw value is reported under the `response_format` key.
    ///   - temperature: Value reported under the `temperature` key. Defaults to `0.0`.
    init(file: Data,
         model: OpenAITranscriptionModelType,
         language: String = "en",
         prompt: String = "",
         responseFormat: OpenAIAudioResponseType,
         temperature: Double = 0.0) {
        self.file = file
        self.model = model
        self.language = language
        self.prompt = prompt
        self.responseFormat = responseFormat
        self.temperature = temperature
    }

    /// Key/value pairs describing the request, keyed by the API's field names.
    var parameters: [String: Any]? {
        let fields: [String: Any] = [
            "model": model.rawValue,
            "language": language,
            "prompt": prompt,
            "response_format": responseFormat.rawValue,
            "temperature": temperature
        ]
        return fields
    }
}
15 changes: 14 additions & 1 deletion Sources/SwiftOpenAI/OpenAI/OpenAIEndpoints/OpenAIEndpoints.swift
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ enum OpenAIEndpoints {
case moderations(input: String)

case createSpeech(model: OpenAITTSModelType, input: String, voice: OpenAIVoiceType, responseFormat: OpenAIAudioResponseType, speed: Double)

case createTranscription(file: Data, model: OpenAITranscriptionModelType, language: String, prompt: String, responseFormat: OpenAIAudioResponseType, temperature: Double)

public var endpoint: Endpoint {
switch self {
Expand All @@ -43,7 +45,18 @@ enum OpenAIEndpoints {
case .moderations(input: let input):
return ModerationEndpoint(input: input)
case .createSpeech(model: let model, input: let input, voice: let voice, responseFormat: let responseFormat, speed: let speed):
return CreateSpeechEndpoint(model: model, input: input, voice: voice, responseFormat: responseFormat, speed: speed)
return CreateSpeechEndpoint(model: model,
input: input,
voice: voice,
responseFormat: responseFormat,
speed: speed)
case .createTranscription(file: let file, model: let model, language: let language, prompt: let prompt, responseFormat: let responseFormat, temperature: let temperature):
return CreateTranscriptionEndpoint(file: file,
model: model,
language: language,
prompt: prompt,
responseFormat: responseFormat,
temperature: temperature)
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
import Foundation

/// Abstraction over the request that performs an audio transcription.
///
/// `CreateTranscriptionRequest`, declared below, is the conforming implementation.
protocol CreateTranscriptionRequestProtocol {
/// Starts a transcription request and returns a stream of decoded results.
/// - Parameters:
///   - api: The `API` value used to perform the request.
///   - apiKey: Key used to authorize the request.
///   - file: Audio data to transcribe.
///   - model: Transcription model to use.
///   - language: Language of the audio content.
///   - prompt: Prompt text accompanying the request.
///   - responseFormat: Requested response format.
///   - temperature: Temperature value for the request.
/// - Returns: An `AsyncThrowingStream` that yields `CreateTranscriptionDataModel` values.
/// - Throws: The signature is `async throws`; which errors are thrown is up to the conforming type.
func execute(api: API,
apiKey: String,
file: Data,
model: OpenAITranscriptionModelType,
language: String,
prompt: String,
responseFormat: OpenAIAudioResponseType,
temperature: Double) async throws -> AsyncThrowingStream<CreateTranscriptionDataModel, Error>
}

/// Performs an audio transcription request against the OpenAI API.
///
/// The audio is uploaded as a `multipart/form-data` body. The response is collected by the
/// `URLSessionDataDelegate` callbacks below, decoded into a `CreateTranscriptionDataModel`
/// once the task completes, and published on an `AsyncThrowingStream`.
///
/// - Note: The continuation and the response buffer live on the instance, so one instance
///   should not run overlapping requests.
final public class CreateTranscriptionRequest: NSObject, CreateTranscriptionRequestProtocol {
    public typealias Init = (_ api: API,
                             _ apiKey: String,
                             _ file: Data,
                             _ model: OpenAITranscriptionModelType,
                             _ language: String,
                             _ prompt: String,
                             _ responseFormat: OpenAIAudioResponseType,
                             _ temperature: Double) async throws -> AsyncThrowingStream<CreateTranscriptionDataModel, Error>

    private var urlSession: URLSession?
    private var dataTask: URLSessionDataTask?
    private var continuation: AsyncThrowingStream<CreateTranscriptionDataModel, Error>.Continuation?
    /// Bytes received so far for the in-flight response; decoded when the task completes.
    private var receivedData = Data()

    public override init() {
        super.init()
    }

    /// Uploads `file` for transcription and returns a stream of decoded results.
    ///
    /// - Parameters:
    ///   - api: Used to route the endpoint, build the `URLRequest` and add headers.
    ///   - apiKey: Sent as a `Bearer` token in the `Authorization` header.
    ///   - file: Audio data placed in the `file` part of the multipart body.
    ///   - model: Its raw value is sent in the `model` part.
    ///   - language: Sent in the `language` part when not empty.
    ///   - prompt: Sent in the `prompt` part when not empty.
    ///   - responseFormat: Passed to the endpoint description but NOT sent in the body.
    ///     NOTE(review): `OpenAIAudioResponseType` is shared with speech synthesis (e.g. `.mp3`),
    ///     and the response is decoded as JSON, so forwarding it looks unsafe — confirm.
    ///   - temperature: Sent in the `temperature` part.
    /// - Returns: A stream that yields the decoded transcription, then finishes; it finishes
    ///   by throwing when the transfer fails or the body cannot be decoded.
    public func execute(api: API,
                        apiKey: String,
                        file: Data,
                        model: OpenAITranscriptionModelType,
                        language: String,
                        prompt: String,
                        responseFormat: OpenAIAudioResponseType,
                        temperature: Double) async throws -> AsyncThrowingStream<CreateTranscriptionDataModel, Error> {

        return AsyncThrowingStream<CreateTranscriptionDataModel, Error> { continuation in
            self.continuation = continuation
            // Start from an empty buffer in case this instance served an earlier request.
            self.receivedData = Data()

            var endpoint = OpenAIEndpoints.createTranscription(file: file, model: model, language: language, prompt: prompt, responseFormat: responseFormat, temperature: temperature).endpoint
            api.routeEndpoint(&endpoint, environment: OpenAIEnvironmentV1())

            let boundary = UUID().uuidString

            var urlRequest = api.buildURLRequest(endpoint: endpoint)
            api.addHeaders(urlRequest: &urlRequest,
                           headers: ["Content-Type": "multipart/form-data; boundary=\(boundary)",
                                     "Authorization": "Bearer \(apiKey)"])

            // Text parts of the form. The caller's values are forwarded instead of a
            // hard-coded model name; empty optional fields are left out.
            var textFields: [(name: String, value: String)] = [
                (name: "model", value: model.rawValue),
                (name: "temperature", value: String(temperature))
            ]
            if !language.isEmpty {
                textFields.append((name: "language", value: language))
            }
            if !prompt.isEmpty {
                textFields.append((name: "prompt", value: prompt))
            }

            var body = Data()

            for field in textFields {
                body.append(Data("--\(boundary)\r\n".utf8))
                body.append(Data("Content-Disposition: form-data; name=\"\(field.name)\"\r\n\r\n".utf8))
                body.append(Data("\(field.value)\r\n".utf8))
            }

            // File part. The filename and content type are fixed because the interface
            // carries no metadata about the audio data.
            body.append(Data("--\(boundary)\r\n".utf8))
            body.append(Data("Content-Disposition: form-data; name=\"file\"; filename=\"steve.mp4\"\r\n".utf8))
            body.append(Data("Content-Type: audio/mpeg\r\n\r\n".utf8))
            body.append(file)
            body.append(Data("\r\n".utf8))

            body.append(Data("--\(boundary)--\r\n".utf8))

            urlRequest.httpBody = body

            // A serial queue keeps delegate callbacks ordered, so buffer appends cannot interleave.
            let delegateQueue = OperationQueue()
            delegateQueue.maxConcurrentOperationCount = 1

            let session = URLSession(configuration: .default,
                                     delegate: self,
                                     delegateQueue: delegateQueue)
            self.urlSession = session

            let task = session.dataTask(with: urlRequest)
            self.dataTask = task
            task.resume()

            // The session holds its delegate strongly until invalidated. This lets the
            // running task finish and then releases the session and this object.
            session.finishTasksAndInvalidate()
        }
    }
}

// MARK: - URLSessionDataDelegate

extension CreateTranscriptionRequest: URLSessionDataDelegate {
    /// Buffers each chunk of the response body; decoding waits until the task completes
    /// because a body may be delivered across several callbacks.
    public func urlSession(_ session: URLSession, dataTask: URLSessionDataTask, didReceive data: Data) {
        receivedData.append(data)
    }

    /// Finishes the stream: throws the transfer error if there is one, otherwise decodes the
    /// buffered body, yields the result and finishes. A body that cannot be decoded finishes
    /// the stream by throwing the decoding error.
    public func urlSession(_ session: URLSession, task: URLSessionTask, didCompleteWithError error: Error?) {
        defer { receivedData = Data() }

        if let error = error {
            continuation?.finish(throwing: error)
            return
        }

        // Nothing was received: finish without yielding.
        guard !receivedData.isEmpty else {
            continuation?.finish()
            return
        }

        do {
            let createTranscriptionDataModel = try JSONDecoder().decode(CreateTranscriptionDataModel.self, from: receivedData)
            continuation?.yield(createTranscriptionDataModel)
            continuation?.finish()
        } catch {
            continuation?.finish(throwing: error)
        }
    }
}
44 changes: 43 additions & 1 deletion Sources/SwiftOpenAI/OpenAI/SwiftOpenAI.swift
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ protocol OpenAIProtocol {
func moderations(input: String) async throws -> ModerationDataModel?

func createSpeech(model: OpenAITTSModelType, input: String, voice: OpenAIVoiceType, responseFormat: OpenAIAudioResponseType, speed: Double) async throws -> Data?

func createTranscription(model: OpenAITranscriptionModelType, file: Data, language: String, prompt: String, responseFormat: OpenAIAudioResponseType, temperature: Double) async throws -> AsyncThrowingStream<CreateTranscriptionDataModel, Error>
}

// swiftlint:disable line_length
Expand All @@ -37,6 +39,7 @@ public class SwiftOpenAI: OpenAIProtocol {
private let embeddingsRequest: EmbeddingsRequest.Init
private let moderationsRequest: ModerationsRequest.Init
private let createSpeechRequest: CreateSpeechRequest.Init
private let createTranscriptionRequest: CreateTranscriptionRequest.Init

public init(api: API = API(),
apiKey: String,
Expand All @@ -47,7 +50,8 @@ public class SwiftOpenAI: OpenAIProtocol {
createImagesRequest: @escaping CreateImagesRequest.Init = CreateImagesRequest().execute,
embeddingsRequest: @escaping EmbeddingsRequest.Init = EmbeddingsRequest().execute,
moderationsRequest: @escaping ModerationsRequest.Init = ModerationsRequest().execute,
createSpeechRequest: @escaping CreateSpeechRequest.Init = CreateSpeechRequest().execute) {
createSpeechRequest: @escaping CreateSpeechRequest.Init = CreateSpeechRequest().execute,
createTranscriptionRequest: @escaping CreateTranscriptionRequest.Init = CreateTranscriptionRequest().execute) {
self.api = api
self.apiKey = apiKey
self.listModelsRequest = listModelsRequest
Expand All @@ -58,6 +62,7 @@ public class SwiftOpenAI: OpenAIProtocol {
self.embeddingsRequest = embeddingsRequest
self.moderationsRequest = moderationsRequest
self.createSpeechRequest = createSpeechRequest
self.createTranscriptionRequest = createTranscriptionRequest
}

/**
Expand Down Expand Up @@ -320,5 +325,42 @@ public class SwiftOpenAI: OpenAIProtocol {
public func createSpeech(model: OpenAITTSModelType, input: String, voice: OpenAIVoiceType, responseFormat: OpenAIAudioResponseType, speed: Double) async throws -> Data? {
    // Forward to the injected speech request, passing the stored API client and key first.
    let audio = try await createSpeechRequest(api,
                                              apiKey,
                                              model,
                                              input,
                                              voice,
                                              responseFormat,
                                              speed)
    return audio
}

/**
Transcribes audio files into text using the OpenAI Transcription API.
This method employs the OpenAI Transcription API to convert audio files into textual transcriptions. It allows you to specify the transcription model, language, and other parameters to tailor the transcription process to your needs. The method supports various file formats and provides flexibility in terms of language and response format.
The function is designed with Swift's concurrency features and supports async/await for seamless integration into modern Swift applications.
- Parameters:
- model: An `OpenAITranscriptionModelType` representing the chosen model for transcription.
- file: A `Data` object containing the audio file to be transcribed.
- language: A `String` specifying the language of the audio content.
- prompt: A `String` used to provide any specific instructions or context for the transcription.
- responseFormat: An `OpenAIAudioResponseType` indicating the format of the transcription response.
- temperature: A `Double` that adjusts the creativity or variability of the transcription.
- Throws: An error if the API request fails or if there are issues in processing the audio file.
- Returns: An `AsyncThrowingStream` of `CreateTranscriptionDataModel`, providing a stream of transcription results or errors encountered during the process.
Example usage:
let audioFileData = // Your audio file data here
do {
let transcriptionStream = try await createTranscription(model: .whisper, file: audioFileData, language: "en", prompt: "General transcription", responseFormat: .json, temperature: 0.5)
for try await transcription in transcriptionStream {
// Process each transcription result
}
} catch {
print("Error: \(error)")
}
*/
public func createTranscription(model: OpenAITranscriptionModelType, file: Data, language: String, prompt: String, responseFormat: OpenAIAudioResponseType, temperature: Double) async throws -> AsyncThrowingStream<CreateTranscriptionDataModel, Error> {
    // Forward to the injected transcription request. Note that the request closure
    // takes `file` before `model`, unlike this method's own parameter order.
    let stream = try await createTranscriptionRequest(api,
                                                      apiKey,
                                                      file,
                                                      model,
                                                      language,
                                                      prompt,
                                                      responseFormat,
                                                      temperature)
    return stream
}
}
// swiftlint:enable line_length

0 comments on commit c0c7b6b

Please sign in to comment.