Merge pull request #747 from 10up/feat/718

feat/718: add OpenAI Text to Speech as a Provider
10up · Apr 1, 2024 · 4d91802 · 4d91802
2 parents fe28ee7 + 76d83b7
commit 4d91802
Show file tree

Hide file tree

Showing 12 changed files with 671 additions and 45 deletions.
diff --git a/README.md b/README.md
@@ -23,6 +23,7 @@
 * [Set Up OpenAI Embeddings Language Processing](#set-up-classification-via-openai-embeddings)
 * [Set Up OpenAI Whisper Language Processing](#set-up-audio-transcripts-generation-via-openai-whisper)
 * [Set Up Azure AI Language Processing](#set-up-text-to-speech-via-microsoft-azure)
+* [Set Up OpenAI Text to Speech Processing](#set-up-text-to-speech-via-openai)
 * [Set Up AWS Language Processing](#set-up-text-to-speech-via-amazon-polly)
 * [Set Up Azure AI Vision Image Processing](#set-up-image-processing-features-via-microsoft-azure)
 * [Set Up OpenAI DALL·E Image Processing](#set-up-image-generation-via-openai)
@@ -46,7 +47,7 @@ Tap into leading cloud-based services like [OpenAI](https://openai.com/), [Micro
 * Generate new images on demand to use in-content or as a featured image using [OpenAI's DALL·E 3 API](https://platform.openai.com/docs/guides/images)
 * Generate transcripts of audio files using [OpenAI's Whisper API](https://platform.openai.com/docs/guides/speech-to-text)
 * Moderate incoming comments for sensitive content using [OpenAI's Moderation API](https://platform.openai.com/docs/guides/moderation)
-* Convert text content into audio and output a "read-to-me" feature on the front-end to play this audio using [Microsoft Azure's Text to Speech API](https://learn.microsoft.com/en-us/azure/cognitive-services/speech-service/text-to-speech) or [Amazon Polly](https://aws.amazon.com/polly/)
+* Convert text content into audio and output a "read-to-me" feature on the front-end to play this audio using [Microsoft Azure's Text to Speech API](https://learn.microsoft.com/en-us/azure/cognitive-services/speech-service/text-to-speech), [Amazon Polly](https://aws.amazon.com/polly/) or [OpenAI's Text to Speech API](https://platform.openai.com/docs/guides/text-to-speech)
 * Classify post content using [IBM Watson's Natural Language Understanding API](https://www.ibm.com/watson/services/natural-language-understanding/) and [OpenAI's Embedding API](https://platform.openai.com/docs/guides/embeddings)
 * BETA: Recommend content based on overall site traffic via [Microsoft Azure's AI Personalizer API](https://azure.microsoft.com/en-us/services/cognitive-services/personalizer/) *(note that this service has been [deprecated by Microsoft](https://learn.microsoft.com/en-us/azure/ai-services/personalizer/) and as such, will no longer work. We are looking to replace this with a new provider to maintain the same functionality (see [issue#392](https://github.com/10up/classifai/issues/392)))*
 * Generate image alt text, image tags, and smartly crop images using [Microsoft Azure's AI Vision API](https://azure.microsoft.com/en-us/services/cognitive-services/computer-vision/)
@@ -74,7 +75,7 @@ Tap into leading cloud-based services like [OpenAI](https://openai.com/), [Micro
 * PHP 7.4+
 * [WordPress](http://wordpress.org) 6.1+
 * To utilize the NLU Language Processing functionality, you will need an active [IBM Watson](https://cloud.ibm.com/registration) account.
-* To utilize the ChatGPT, Embeddings, or Whisper Language Processing functionality or DALL·E Image Processing functionality, you will need an active [OpenAI](https://platform.openai.com/signup) account.
+* To utilize the ChatGPT, Embeddings, Text to Speech or Whisper Language Processing functionality or DALL·E Image Processing functionality, you will need an active [OpenAI](https://platform.openai.com/signup) account.
 * To utilize the Azure AI Vision Image Processing functionality or Text to Speech Language Processing functionality, you will need an active [Microsoft Azure](https://signup.azure.com/signup) account.
 * To utilize the Azure OpenAI Language Processing functionality, you will need an active [Microsoft Azure](https://signup.azure.com/signup) account and you will need to [apply](https://aka.ms/oai/access) for OpenAI access.
 * To utilize the Google Gemini Language Processing functionality, you will need an active [Google Gemini](https://ai.google.dev/tutorials/setup) account.
@@ -86,7 +87,7 @@ Note that there is no cost to using ClassifAI itself. Both IBM Watson and Micros
 
 IBM Watson's Natural Language Understanding ("NLU"), which is one of the providers that powers the classification feature, has a ["lite" pricing tier](https://www.ibm.com/cloud/watson-natural-language-understanding/pricing) that offers 30,000 free NLU items per month.
 
-OpenAI, which is one of the providers that powers the classification, title generation, excerpt generation, content resizing, audio transcripts generation, moderation and image generation features, has a limited free trial and then requires a [pay per usage](https://openai.com/pricing) plan.
+OpenAI, which is one of the providers that powers the classification, title generation, excerpt generation, content resizing, audio transcripts generation, text to speech, moderation and image generation features, has a limited free trial and then requires a [pay per usage](https://openai.com/pricing) plan.
 
 Microsoft Azure AI Vision, which is one of the providers that powers the descriptive text generator, image tags generator, image cropping, image text extraction and PDF text extraction features, has a ["free" pricing tier](https://azure.microsoft.com/en-us/pricing/details/cognitive-services/computer-vision/) that offers 20 transactions per minute and 5,000 transactions per month.
 
@@ -349,6 +350,7 @@ IBM Watson's [Categories](https://cloud.ibm.com/docs/natural-language-understand
 ## Set Up Audio Transcripts Generation (via OpenAI Whisper)
 
 Note that [OpenAI](https://platform.openai.com/docs/guides/speech-to-text) can create a transcript for audio files that meet the following requirements:
+
 * The file must be presented in mp3, mp4, mpeg, mpga, m4a, wav, or webm format
 * The file size must be less than 25 megabytes (MB)
 
@@ -401,6 +403,27 @@ Note that [OpenAI](https://platform.openai.com/docs/guides/speech-to-text) can c
 * Click the button to preview the generated speech audio for the post.
 * View the post on the front-end and see that a read-to-me feature has been added.
 
+## Set Up Text to Speech (via OpenAI)
+
+### 1. Sign up for OpenAI
+
+* [Sign up for an OpenAI account](https://platform.openai.com/signup) or sign into your existing one.
+* If creating a new account, complete the verification process (requires confirming your email and phone number).
+* Log into your account and go to the [API key page](https://platform.openai.com/account/api-keys).
+* Click `Create new secret key` and copy the key that is shown.
+
+### 2. Configure OpenAI API Keys under Tools > ClassifAI > Language Processing > Text to Speech
+
+* Select **OpenAI Text to Speech** in the provider dropdown.
+* Enter your API Key copied from the above step into the `API Key` field.
+
+### 3. Using the Text to Speech service
+
+* Assuming the post type selected is "post", create a new post and publish it.
+* After a few seconds, a "Preview" button will appear under the ClassifAI settings panel.
+* Click the button to preview the generated speech audio for the post.
+* View the post on the front-end and see that a read-to-me feature has been added.
+
 ## Set Up Text to Speech (via Amazon Polly)
 
 ### 1. Sign up for AWS (Amazon Web Services)

diff --git a/includes/Classifai/Features/TextToSpeech.php b/includes/Classifai/Features/TextToSpeech.php
@@ -5,6 +5,8 @@
 use Classifai\Services\LanguageProcessing;
 use Classifai\Providers\Azure\Speech;
 use Classifai\Providers\AWS\AmazonPolly;
+use Classifai\Providers\OpenAI\TextToSpeech as OpenAITTS;
+use Classifai\Normalizer;
 use WP_REST_Server;
 use WP_REST_Request;
 use WP_Error;
@@ -44,6 +46,14 @@ class TextToSpeech extends Feature {
 	 */
 	const DISPLAY_GENERATED_AUDIO = '_classifai_display_generated_audio';
 
+	/**
+	 * Meta key to get/set the audio hash that helps to indicate if there is any need
+	 * for the audio file to be regenerated or not.
+	 *
+	 * @var string
+	 */
+	const AUDIO_HASH_KEY = '_classifai_post_audio_hash';
+
 	/**
 	 * Constructor.
 	 */
@@ -55,8 +65,9 @@ public function __construct() {
 
 		// Contains just the providers this feature supports.
 		$this->supported_providers = [
-			Speech::ID      => __( 'Microsoft Azure AI Speech', 'classifai' ),
 			AmazonPolly::ID => __( 'Amazon Polly', 'classifai' ),
+			Speech::ID      => __( 'Microsoft Azure AI Speech', 'classifai' ),
+			OpenAITTS::ID   => __( 'OpenAI Text to Speech', 'classifai' ),
 		];
 	}
 
@@ -840,6 +851,21 @@ public function get_audio_generation_subsequent_state( $post = null ): bool {
 		return apply_filters( 'classifai_audio_generation_subsequent_state', false, get_post( $post ) );
 	}
 
+	/**
+	 * Normalizes the post content for text to speech generation.
+	 *
+	 * Looks up the post, then passes its content and title through the
+	 * Normalizer.
+	 *
+	 * @param int $post_id The post ID.
+	 *
+	 * @return string The normalized post content, or an empty string if
+	 *                the post could not be found.
+	 */
+	public function normalize_post_content( int $post_id ): string {
+		$post = get_post( $post_id );
+
+		// get_post() returns null when the ID does not match a post; bail
+		// out instead of reading properties off of null.
+		if ( ! $post ) {
+			return '';
+		}
+
+		$normalizer = new Normalizer();
+
+		return $normalizer->normalize_content( $post->post_content, $post->post_title, $post_id );
+	}
+
 	/**
 	 * Generates feature setting data required for migration from
 	 * ClassifAI < 3.0.0 to 3.0.0

diff --git a/includes/Classifai/Providers/AWS/AmazonPolly.php b/includes/Classifai/Providers/AWS/AmazonPolly.php
@@ -9,7 +9,6 @@
 namespace Classifai\Providers\AWS;
 
 use Classifai\Providers\Provider;
-use Classifai\Normalizer;
 use Classifai\Features\TextToSpeech;
 use WP_Error;
 use Aws\Sdk;
@@ -18,14 +17,6 @@ class AmazonPolly extends Provider {
 
 	const ID = 'aws_polly';
 
-	/**
-	 * Meta key to get/set the audio hash that helps to indicate if there is any need
-	 * for the audio file to be regenerated or not.
-	 *
-	 * @var string
-	 */
-	const AUDIO_HASH_KEY = '_classifai_post_audio_hash';
-
 	/**
 	 * AmazonPolly Text to Speech constructor.
 	 *
@@ -374,12 +365,10 @@ public function synthesize_speech( int $post_id ) {
 			);
 		}
 
-		$normalizer          = new Normalizer();
 		$feature             = new TextToSpeech();
 		$settings            = $feature->get_settings();
-		$post                = get_post( $post_id );
-		$post_content        = $normalizer->normalize_content( $post->post_content, $post->post_title, $post_id );
-		$content_hash        = get_post_meta( $post_id, self::AUDIO_HASH_KEY, true );
+		$post_content        = $feature->normalize_post_content( $post_id );
+		$content_hash        = get_post_meta( $post_id, TextToSpeech::AUDIO_HASH_KEY, true );
 		$saved_attachment_id = (int) get_post_meta( $post_id, $feature::AUDIO_ID_KEY, true );
 
 		// Don't regenerate the audio file if it already exists and the content hasn't changed.
@@ -453,7 +442,7 @@ public function synthesize_speech( int $post_id ) {
 			$polly_client = $this->get_polly_client();
 			$result       = $polly_client->synthesizeSpeech( $synthesize_data );
 
-			update_post_meta( $post_id, self::AUDIO_HASH_KEY, md5( $post_content ) );
+			update_post_meta( $post_id, TextToSpeech::AUDIO_HASH_KEY, md5( $post_content ) );
 			$contents = $result['AudioStream']->getContents();
 			return $contents;
 		} catch ( \Exception $e ) {

diff --git a/includes/Classifai/Providers/Azure/Speech.php b/includes/Classifai/Providers/Azure/Speech.php
@@ -6,7 +6,6 @@
 namespace Classifai\Providers\Azure;
 
 use Classifai\Providers\Provider;
-use Classifai\Normalizer;
 use Classifai\Features\TextToSpeech;
 use stdClass;
 use WP_Http;
@@ -30,14 +29,6 @@ class Speech extends Provider {
 	 */
 	const API_PATH = 'cognitiveservices/v1';
 
-	/**
-	 * Meta key to get/set the audio hash that helps to indicate if there is any need
-	 * for the audio file to be regenerated or not.
-	 *
-	 * @var string
-	 */
-	const AUDIO_HASH_KEY = '_classifai_post_audio_hash';
-
 	/**
 	 * Azure Text to Speech constructor.
 	 *
@@ -337,12 +328,10 @@ public function synthesize_speech( int $post_id ) {
 			);
 		}
 
-		$normalizer          = new Normalizer();
 		$feature             = new TextToSpeech();
 		$settings            = $feature->get_settings();
-		$post                = get_post( $post_id );
-		$post_content        = $normalizer->normalize_content( $post->post_content, $post->post_title, $post_id );
-		$content_hash        = get_post_meta( $post_id, self::AUDIO_HASH_KEY, true );
+		$post_content        = $feature->normalize_post_content( $post_id );
+		$content_hash        = get_post_meta( $post_id, TextToSpeech::AUDIO_HASH_KEY, true );
 		$saved_attachment_id = (int) get_post_meta( $post_id, $feature::AUDIO_ID_KEY, true );
 
 		// Don't regenerate the audio file if it already exists and the content hasn't changed.
@@ -415,7 +404,7 @@ public function synthesize_speech( int $post_id ) {
 			);
 		}
 
-		update_post_meta( $post_id, self::AUDIO_HASH_KEY, md5( $post_content ) );
+		update_post_meta( $post_id, TextToSpeech::AUDIO_HASH_KEY, md5( $post_content ) );
 
 		return $response_body;
 	}

diff --git a/includes/Classifai/Providers/OpenAI/APIRequest.php b/includes/Classifai/Providers/OpenAI/APIRequest.php
@@ -270,19 +270,33 @@ public function post_form( string $url = '', array $body = [] ) {
 	 */
 	public function get_result( $response ) {
 		if ( ! is_wp_error( $response ) ) {
+			$headers      = wp_remote_retrieve_headers( $response );
+			$content_type = false;
+
+			if ( ! is_wp_error( $headers ) ) {
+				$content_type = isset( $headers['content-type'] ) ? $headers['content-type'] : false;
+			}
+
 			$body = wp_remote_retrieve_body( $response );
 			$code = wp_remote_retrieve_response_code( $response );
-			$json = json_decode( $body, true );
 
-			if ( json_last_error() === JSON_ERROR_NONE ) {
-				if ( empty( $json['error'] ) ) {
-					return $json;
+			if ( false === $content_type || false !== strpos( $content_type, 'application/json' ) ) {
+				$json = json_decode( $body, true );
+
+				if ( json_last_error() === JSON_ERROR_NONE ) {
+					if ( empty( $json['error'] ) ) {
+						return $json;
+					} else {
+						$message = $json['error']['message'] ?? esc_html__( 'An error occured', 'classifai' );
+						return new WP_Error( $code, $message );
+					}
 				} else {
-					$message = $json['error']['message'] ?? esc_html__( 'An error occured', 'classifai' );
-					return new WP_Error( $code, $message );
+					return new WP_Error( 'Invalid JSON: ' . json_last_error_msg(), $body );
 				}
+			} elseif ( $content_type && false !== strpos( $content_type, 'audio/mpeg' ) ) {
+				return $response;
 			} else {
-				return new WP_Error( 'Invalid JSON: ' . json_last_error_msg(), $body );
+				return new WP_Error( 'Invalid content type', $response );
 			}
 		} else {
 			return $response;