Merge pull request #72 from Sgvkamalakar/main

Text-to-Speech Application
yagyesh-bobde · Feb 5, 2024 · 5bdedec · 5bdedec
2 parents 5f2250d + d3cc57d
commit 5bdedec
Show file tree

Hide file tree

Showing 8 changed files with 343 additions and 0 deletions.
diff --git a/Text-To-Speech/Azure-Talking-Avatar/.env sample b/Text-To-Speech/Azure-Talking-Avatar/.env sample
@@ -0,0 +1,3 @@
+SUBSCRIPTION_KEY=<your_subscription_key>
+SERVICE_REGION=<your_service_region>
+SERVICE_HOST=<your_service_host>
diff --git a/Text-To-Speech/Azure-Talking-Avatar/.gitignore b/Text-To-Speech/Azure-Talking-Avatar/.gitignore
@@ -0,0 +1 @@
+.env
diff --git a/Text-To-Speech/Azure-Talking-Avatar/.streamlit/config.toml b/Text-To-Speech/Azure-Talking-Avatar/.streamlit/config.toml
@@ -0,0 +1,21 @@
+[theme]
+
+# The preset Streamlit theme that your custom theme inherits from.
+# One of "light" or "dark".
+base = "dark"
+
+# Used to style primary interface elements. It's the color displayed most
+# frequently across your app's widgets (slider, checkbox, buttons)
+primaryColor = "skyblue" 
+
+# Background color for the main container.
+backgroundColor = "black"
+
+# Used as the background for most widgets (sidebar, text input etc.)
+secondaryBackgroundColor =  ""
+
+# Font color for the page.
+textColor = "white"
+
+# Font family for the page. (One of "serif", "sans serif" or "mono")
+font = "serif"
diff --git a/Text-To-Speech/Azure-Talking-Avatar/LICENSE b/Text-To-Speech/Azure-Talking-Avatar/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 Kamalakar Satapathi
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/Text-To-Speech/Azure-Talking-Avatar/README.md b/Text-To-Speech/Azure-Talking-Avatar/README.md
@@ -0,0 +1,107 @@
+# Azure Text-to-Speech with Avatar
+[![Open in Streamlit](https://img.shields.io/badge/Open%20in-Streamlit-red?style=for-the-badge&logo=streamlit)](https://talking-avatar.streamlit.app/)
+
+<p align="center">
+  <img src="https://github.com/Sgvkamalakar/Azure-Talking-Avatar/assets/103712713/09fc79f9-cc68-4354-bae7-e75e24add235" width="400" height="400"/>
+</p>
+This Streamlit app allows you to submit and monitor batch synthesis jobs using Azure Text-to-Speech with Avatar. It leverages the Azure AI Services to create talking avatars based on the provided text.
+
+## Usage
+- Choose the language and avatar style using the sidebar dropdowns.
+- Type the text you want to synthesize in the selected language.
+- Click the "Submit Job" button to initiate the batch synthesis job.
+- Wait while the app polls the job status. Once the job succeeds, the talking avatar video is displayed.
+
+## Supported Languages and Avatars
+The application supports multiple languages, each associated with a specific neural voice. The avatar style is selected separately in the sidebar.
+<div align='center'>
+  <table>
+  <thead>
+    <tr>
+      <th>Language</th>
+      <th>Voice</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>Arabic</td>
+      <td>ZariyahNeural</td>
+    </tr>
+    <tr>
+      <td>Bahasa Indonesian</td>
+      <td>GadisNeural</td>
+    </tr>
+    <tr>
+      <td>Bengali</td>
+      <td>TanishaaNeural</td>
+    </tr>
+    <tr>
+      <td>Chinese Mandarin</td>
+      <td>XiaoxiaoNeural</td>
+    </tr>
+    <tr>
+      <td>Dutch</td>
+      <td>FennaNeural</td>
+    </tr>
+    <tr>
+      <td>English</td>
+      <td>AvaNeural</td>
+    </tr>
+    <tr>
+      <td>French</td>
+      <td>DeniseNeural</td>
+    </tr>
+    <tr>
+      <td>German</td>
+      <td>KatjaNeural</td>
+    </tr>
+    <tr>
+      <td>Hindi</td>
+      <td>SwaraNeural</td>
+    </tr>
+    <tr>
+      <td>Italian</td>
+      <td>ElsaNeural</td>
+    </tr>
+    <tr>
+      <td>Japanese</td>
+      <td>NanamiNeural</td>
+    </tr>
+    <tr>
+      <td>Korean</td>
+      <td>SunHiNeural</td>
+    </tr>
+    <tr>
+      <td>Russian</td>
+      <td>SvetlanaNeural</td>
+    </tr>
+    <tr>
+      <td>Spanish</td>
+      <td>ElviraNeural</td>
+    </tr>
+    <tr>
+      <td>Telugu</td>
+      <td>ShrutiNeural</td>
+    </tr>
+  </tbody>
+</table>
+</div>
+
+## Demo
+Check out the demo video:
+
+https://github.com/Sgvkamalakar/Azure-Talking-Avatar/assets/103712713/adf5c293-e1cc-4fb5-94e2-b87ca5f5501c
+
+
+## Setup
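+To run the application, you need to set up your Azure Text-to-Speech subscription key, service region, and service host. You can set these values in a `.env` file (see the provided `.env sample` for the expected keys) or directly in the script. The `.env` file is git-ignored, so prefer it to avoid committing your subscription key.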
+
+```dotenv
+SUBSCRIPTION_KEY=<your_subscription_key>
+SERVICE_REGION=<your_service_region>
+SERVICE_HOST=<your_service_host>
+```
+
+## References
+Learn more about Text-to-Speech Avatar on Microsoft Azure [here](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/text-to-speech-avatar/what-is-text-to-speech-avatar)
+
diff --git a/Text-To-Speech/Azure-Talking-Avatar/app.py b/Text-To-Speech/Azure-Talking-Avatar/app.py
@@ -0,0 +1,187 @@
+import streamlit as st
+import requests
+import json
+from urllib.parse import unquote
+import time
+import logging
+from dotenv import load_dotenv
+import os
+logging.basicConfig(level=logging.INFO, format="[%(asctime)s] %(message)s", datefmt="%m/%d/%Y %I:%M:%S %p %Z")
+logger = logging.getLogger(__name__)
+
+load_dotenv()
+SUBSCRIPTION_KEY = os.getenv('SUBSCRIPTION_KEY')
+SERVICE_REGION = os.getenv('SERVICE_REGION')
+SERVICE_HOST = os.getenv('SERVICE_HOST')
+
+st.set_page_config(page_title="Talking Avatar", page_icon="🗣️",initial_sidebar_state="auto",layout='centered')
+NAME = "Text-to-Speech"
+DESCRIPTION = "Using Azure AI Services"
+
+# Maps the language label offered in the sidebar to [locale code, voice name].
+# In the code visible here only the voice name (index 1) is read; the keys
+# populate the language dropdown.
+lang_voices = {
+    'Arabic': ['ar-SA', 'ar-SA-ZariyahNeural'],
+    'Bahasa Indonesian': ['id-ID', 'id-ID-GadisNeural'],
+    'Bengali': ['bn-IN', 'bn-IN-TanishaaNeural'],
+    'Chinese Mandarin': ['zh-CN', 'zh-CN-XiaoxiaoNeural'],
+    'Dutch': ['nl-NL', 'nl-NL-FennaNeural'],
+    'English': ['en-US', 'en-US-AvaNeural'],
+    'French': ['fr-FR', 'fr-FR-DeniseNeural'],
+    'German': ['de-DE', 'de-DE-KatjaNeural'],
+    'Hindi': ['hi-IN', 'hi-IN-SwaraNeural'],
+    'Italian': ['it-IT', 'it-IT-ElsaNeural'],
+    'Japanese': ['ja-JP', 'ja-JP-NanamiNeural'],
+    'Korean': ['ko-KR', 'ko-KR-SunHiNeural'],
+    'Russian': ['ru-RU', 'ru-RU-SvetlanaNeural'],
+    'Spanish': ['es-ES', 'es-ES-ElviraNeural'],
+    'Telugu': ['te-IN', 'te-IN-ShrutiNeural']
+}
+
+with st.sidebar:
+    lang=st.selectbox('Choose the language',list(lang_voices.keys()), index=5) 
+    style=st.selectbox('Avatar Style',["Casual-Sitting","Graceful-Sitting","Technical-Sitting","Graceful-Standing","Technical-Standing"],index=1)
+    style=style.lower()
+    voice=lang_voices[lang][1]
+    st.markdown("[Source Code](https://github.com/Sgvkamalakar/Azure-Talking-Avatar)")
+    st.markdown("[Explore my Codes](https://github.com/sgvkamalakar)")
+    st.markdown("[Connect with me on LinkedIn](https://www.linkedin.com/in/sgvkamlakar)")
+
+
+def submit_synthesis(text):
+    url = f'https://{SERVICE_REGION}.{SERVICE_HOST}/api/texttospeech/3.1-preview1/batchsynthesis/talkingavatar'
+    header = {
+        'Ocp-Apim-Subscription-Key': SUBSCRIPTION_KEY,
+        'Content-Type':'application/json'
+    }
+    payload = {
+        'displayName': NAME,
+        'description': DESCRIPTION,
+        "textType": "PlainText",
+        'synthesisConfig': {
+            "voice": voice,
+        },
+        'customVoices': {},
+        "inputs": [
+            {
+                "text": text,
+            },
+        ],
+        "properties": {
+            "customized": False,
+            "talkingAvatarCharacter": "lisa",
+            "talkingAvatarStyle": style,
+            "videoFormat": "webm",
+            "videoCodec": "vp9",
+            "subtitleType": "soft_embedded",
+            "backgroundColor": "transparent",
+        }
+    }
+
+    response = requests.post(url, json.dumps(payload), headers=header)
+    if response.status_code < 400:
+        logger.info('Batch avatar synthesis job submitted successfully')
+        logger.info(f'Job ID: {response.json()["id"]}')
+        return response.json()["id"]
+    else:
+        logger.error(f'Failed to submit batch avatar synthesis job: {response.text}')
+
+
+def get_synthesis(job_id):
+    url = f'https://{SERVICE_REGION}.{SERVICE_HOST}/api/texttospeech/3.1-preview1/batchsynthesis/talkingavatar/{job_id}'
+    header = {
+        'Ocp-Apim-Subscription-Key': SUBSCRIPTION_KEY
+    }
+    response = requests.get(url, headers=header)
+    if response.status_code < 400:
+        logger.debug('Get batch synthesis job successfully')
+        logger.debug(response.json())
+        if response.json()['status'] == 'Succeeded':
+            logger.info(f'Batch synthesis job succeeded, download URL: {response.json()["outputs"]["result"]}')
+            video_url = f'{response.json()["outputs"]["result"]}'
+            decoded_url = unquote(video_url)
+            st.video(decoded_url)
+
+
+        return response.json()['status']
+    else:
+        logger.error(f'Failed to get batch synthesis job: {response.text}')
+
+def list_synthesis_jobs(skip: int = 0, top: int = 100):
+    url = f'https://{SERVICE_REGION}.{SERVICE_HOST}/api/texttospeech/3.1-preview1/batchsynthesis/talkingavatar?skip={skip}&top={top}'
+    header = {
+        'Ocp-Apim-Subscription-Key': SUBSCRIPTION_KEY
+    }
+    response = requests.get(url, headers=header)
+    if response.status_code < 400:
+        logger.info(f'List batch synthesis jobs successfully, got {len(response.json()["values"])} jobs')
+        logger.info(response.json())
+    else:
+        logger.error(f'Failed to list batch synthesis jobs: {response.text}')
+
+
+def main():
+    st.title("Azure Text-to-Talking Avatar")
+    # st.info()
+    text_input = st.text_area(f'Type text in {lang}')
+    submit_button = st.button("Submit Job")
+    if submit_button:
+        with st.spinner("Processing..."):
+            job_id = submit_synthesis(text_input)
+            if job_id is not None:
+                while True:
+                    status = get_synthesis(job_id)
+                    if status == 'Succeeded':
+                        st.success('Batch avatar synthesis job succeeded ✅')
+                        break
+                    elif status == 'Failed':
+                        st.error('Batch avatar synthesis job failed ❌')
+                        break
+                    else:
+                        time.sleep(5)  
+
+# Page footer: an inline <style> sheet (link colours, a bar fixed to the bottom
+# of the page, a red heart) followed by the credit line markup.
+footer = """<style>
+a:link , a:visited{
+    color: #00aadd;
+    background-color: transparent;
+}
+
+a:hover, a:active {
+    color: blue;
+    background-color: transparent;
+    text-decoration: underline;
+}
+
+.footer {
+    position: fixed;
+    left: 0;
+    bottom: 0;
+    width: 100%;
+    background-color:#0e1117;
+    color: white;
+    text-align: center;
+    padding: 10px;  /* Added padding for better appearance */
+}
+
+.footer p {
+    margin-bottom: 5px;  /* Adjusted margin for better spacing */
+}
+
+.footer a {
+    text-decoration: none;
+}
+.red-heart {
+    color: red;  /* Set the color of the heart emoji to red */
+}
+.footer a:hover {
+    text-decoration: underline;
+}
+</style>
+<div class="footer">
+    <p>Developed with <span class="red-heart">❤</span> using <a href="https://speech.microsoft.com/" target="_blank">Azure Speech Services</a>  by <a href="https://www.linkedin.com/in/sgvkamalakar" target="_blank">Kamalakar</a></p>
+</div>
+"""
+
+# The footer is raw HTML/CSS, hence unsafe_allow_html=True. This call runs at
+# module level, before main() is invoked below.
+st.markdown(footer, unsafe_allow_html=True)
+
+# Run the page logic only when this file is executed as the main module.
+if __name__ == '__main__':
+    main()
+
diff --git a/Text-To-Speech/Azure-Talking-Avatar/demo/demo.webm b/Text-To-Speech/Azure-Talking-Avatar/demo/demo.webm
diff --git a/Text-To-Speech/Azure-Talking-Avatar/requirements.txt b/Text-To-Speech/Azure-Talking-Avatar/requirements.txt
@@ -0,0 +1,3 @@
+streamlit
+python-dotenv
+requests