feat(ui): add page to talk with voice, transcription, and tts (#2520)

* feat(ui): add page to talk with voice, transcription, and tts Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Enhance graphics and status reporting Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Better UX by blocking invalid actions Signed-off-by: Ettore Di Giacinto <mudler@localai.io> --------- Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
mudler · Jun 8, 2024 · e96d2d7 · e96d2d7
1 parent aae7ad9
commit e96d2d7
Show file tree

Hide file tree

Showing 4 changed files with 321 additions and 0 deletions.
diff --git a/core/http/routes/ui.go b/core/http/routes/ui.go
@@ -247,6 +247,26 @@ func RegisterUIRoutes(app *fiber.App,
 		// Render index
 		return c.Render("views/chat", summary)
 	})
+
+	// GET /talk/ renders the "views/talk" template. Like the /chat/ route it is
+	// registered with the auth handler in front of the page handler.
+	app.Get("/talk/", auth, func(c *fiber.Ctx) error {
+		backendConfigs := cl.GetAllBackendConfigs()
+
+		if len(backendConfigs) == 0 {
+			// If no model is available redirect to the index which suggests how to install models
+			return c.Redirect("/")
+		}
+
+		// Template data: the full config list feeds the model dropdowns; "Model"
+		// is the name of the first config (the list is non-empty at this point).
+		summary := fiber.Map{
+			"Title":        "LocalAI - Talk",
+			"ModelsConfig": backendConfigs,
+			"Model":        backendConfigs[0].Name,
+			"Version":      internal.PrintableVersion(),
+		}
+
+		// Render the talk page
+		return c.Render("views/talk", summary)
+	})
+
 	app.Get("/chat/", auth, func(c *fiber.Ctx) error {
 
 		backendConfigs := cl.GetAllBackendConfigs()

diff --git a/core/http/static/talk.js b/core/http/static/talk.js
@@ -0,0 +1,191 @@
+
+// DOM handles looked up once when the script loads.
+const recordButton = document.getElementById('recordButton');
+const audioPlayback = document.getElementById('audioPlayback');
+const resetButton = document.getElementById('resetButton');
+
+// MediaRecorder for the current capture; assigned in startRecording().
+let mediaRecorder;
+// Chunks collected from the recorder's dataavailable events.
+let audioChunks = [];
+// Set to true once a recording has started and back to false when the
+// stop handler has finished; read by toggleRecording().
+let isRecording = false;
+// Chat messages ({ role, content }) sent with each chat completion request.
+let conversationHistory = [];
+// Handle of the pending timeout that clears the conversation.
+let resetTimer;
+
+function getApiKey() {
+    return document.getElementById('apiKey').value;
+}
+
+function getModel() {
+    return document.getElementById('modelSelect').value;
+}
+
+function getWhisperModel() {
+    return document.getElementById('whisperModelSelect').value;
+}
+
+function getTTSModel() {
+    return document.getElementById('ttsModelSelect').value;
+}
+
+function resetConversation() {
+    conversationHistory = [];
+    console.log("Conversation has been reset.");
+    clearTimeout(resetTimer);
+}
+
+function setResetTimer() {
+    clearTimeout(resetTimer);
+    resetTimer = setTimeout(resetConversation, 300000); // Reset after 5 minutes
+}
+
+recordButton.addEventListener('click', toggleRecording);
+resetButton.addEventListener('click', resetConversation);
+
+function toggleRecording() {
+    if (!isRecording) {
+        startRecording();
+    } else {
+        stopRecording();
+    }
+}
+
+async function startRecording() {
+    document.getElementById("recording").style.display = "block";
+    document.getElementById("resetButton").style.display = "none";
+    if (!navigator.mediaDevices) {
+        alert('MediaDevices API not supported!');
+        return;
+    }
+    const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+    mediaRecorder = new MediaRecorder(stream);
+    audioChunks = [];
+    mediaRecorder.ondataavailable = (event) => {
+        audioChunks.push(event.data);
+    };
+    mediaRecorder.start();
+    recordButton.textContent = 'Stop Recording';
+    // add class bg-red-500 to recordButton
+    recordButton.classList.add("bg-gray-500");
+
+    isRecording = true;
+}
+
+function stopRecording() {
+    mediaRecorder.stop();
+    mediaRecorder.onstop = async () => {
+        document.getElementById("recording").style.display = "none";
+        document.getElementById("recordButton").style.display = "none";
+
+        document.getElementById("loader").style.display = "block";
+        const audioBlob = new Blob(audioChunks, { type: 'audio/webm' });
+        document.getElementById("statustext").textContent = "Processing audio...";
+        const transcript = await sendAudioToWhisper(audioBlob);
+        console.log("Transcript:", transcript);
+        document.getElementById("statustext").textContent = "Seems you said: " + transcript+ ". Generating response...";
+        const responseText = await sendTextToChatGPT(transcript);
+
+        console.log("Response:", responseText);
+        document.getElementById("statustext").textContent = "Response generated: '" + responseText + "'. Generating audio response...";
+
+        const ttsAudio = await getTextToSpeechAudio(responseText);
+        playAudioResponse(ttsAudio);
+
+        recordButton.textContent = 'Record';
+        // remove class bg-red-500 from recordButton
+        recordButton.classList.remove("bg-gray-500");
+        isRecording = false;
+        document.getElementById("loader").style.display = "none";
+        document.getElementById("recordButton").style.display = "block";
+        document.getElementById("resetButton").style.display = "block";
+        document.getElementById("statustext").textContent = "Press the record button to start recording.";
+    };
+}
+
+function submitKey(event) {
+    event.preventDefault();
+    localStorage.setItem("key", document.getElementById("apiKey").value);
+    document.getElementById("apiKey").blur();
+}
+
+document.getElementById("key").addEventListener("submit", submitKey);
+
+
+storeKey = localStorage.getItem("key");
+if (storeKey) {
+  document.getElementById("apiKey").value = storeKey;
+} else {
+  document.getElementById("apiKey").value = null;
+}
+
+
+async function sendAudioToWhisper(audioBlob) {
+    const formData = new FormData();
+    formData.append('file', audioBlob);
+    formData.append('model', getWhisperModel());
+    API_KEY = localStorage.getItem("key");
+
+    const response = await fetch('/v1/audio/transcriptions', {
+        method: 'POST',
+        headers: {
+            'Authorization': `Bearer ${API_KEY}`
+        },
+        body: formData
+    });
+
+    const result = await response.json();
+    console.log("Whisper result:", result)
+    return result.text;
+}
+
+async function sendTextToChatGPT(text) {
+    conversationHistory.push({ role: "user", content: text });
+    API_KEY = localStorage.getItem("key");
+
+    const response = await fetch('/v1/chat/completions', {
+        method: 'POST',
+        headers: {
+            'Authorization': `Bearer ${API_KEY}`,
+            'Content-Type': 'application/json'
+        },
+        body: JSON.stringify({
+            model: getModel(),
+            messages: conversationHistory
+        })
+    });
+
+    const result = await response.json();
+    const responseText = result.choices[0].message.content;
+    conversationHistory.push({ role: "assistant", content: responseText });
+
+    setResetTimer();
+
+    return responseText;
+}
+
+async function getTextToSpeechAudio(text) {
+    API_KEY = localStorage.getItem("key");
+
+    const response = await fetch('/v1/audio/speech', {
+
+        method: 'POST',
+        headers: {
+            'Authorization': `Bearer ${API_KEY}`,
+            'Content-Type': 'application/json'
+        },
+        body: JSON.stringify({ 
+          //  "backend": "string",
+            input: text,
+            model: getTTSModel(),
+           // "voice": "string"
+         })
+    });
+
+    const audioBlob = await response.blob();
+    return audioBlob;  // Return the blob directly
+}
+
+function playAudioResponse(audioBlob) {
+    const audioUrl = URL.createObjectURL(audioBlob);
+    audioPlayback.src = audioUrl;
+    audioPlayback.hidden = false;
+    audioPlayback.play();
+}
+
diff --git a/core/http/views/partials/navbar.html b/core/http/views/partials/navbar.html
@@ -20,6 +20,7 @@
                 <a href="/chat/" class="text-gray-400 hover:text-white px-3 py-2 rounded"><i class="fa-solid fa-comments pr-2"></i> Chat</a>
                 <a href="/text2image/" class="text-gray-400 hover:text-white px-3 py-2 rounded"><i class="fas fa-image pr-2"></i> Generate images</a>
                 <a href="/tts/" class="text-gray-400 hover:text-white px-3 py-2 rounded"><i class="fa-solid fa-music pr-2"></i> TTS </a>
+                <a href="/talk/" class="text-gray-400 hover:text-white px-3 py-2 rounded"><i class="fa-solid fa-phone pr-2"></i> Talk </a>
                 <a href="/swagger/" class="text-gray-400 hover:text-white px-3 py-2 rounded"><i class="fas fa-code pr-2"></i> API</a>
             </div>
         </div>
@@ -32,6 +33,7 @@
                 <a href="/chat/" class="block text-gray-400 hover:text-white px-3 py-2 rounded mt-1"><i class="fa-solid fa-comments pr-2"></i> Chat</a>
                 <a href="/text2image/" class="block text-gray-400 hover:text-white px-3 py-2 rounded mt-1"><i class="fas fa-image pr-2"></i> Generate images</a>
                 <a href="/tts/" class="block text-gray-400 hover:text-white px-3 py-2 rounded mt-1"><i class="fa-solid fa-music pr-2"></i> TTS </a>
+                <a href="/talk/" class="block text-gray-400 hover:text-white px-3 py-2 rounded mt-1"><i class="fa-solid fa-phone pr-2"></i> Talk </a>
                 <a href="/swagger/" class="block text-gray-400 hover:text-white px-3 py-2 rounded mt-1"><i class="fas fa-code pr-2"></i> API</a>
             </div>
         </div>

diff --git a/core/http/views/talk.html b/core/http/views/talk.html
@@ -0,0 +1,108 @@
+<!doctype html>
+<html lang="en">
+  <!--
+    Talk page: lets the user pick an LLM, a Whisper and a TTS model, record
+    audio and hear a spoken reply. The behaviour is implemented in
+    /static/talk.js, which looks elements up by the ids used below.
+  -->
+  {{template "views/partials/head" .}}
+  <script defer src="/static/talk.js"></script>
+  <style>
+    body {
+        overflow: hidden; 
+    }
+  </style>
+  <body class="bg-gray-900 text-gray-200" x-data="{ key: $store.chat.key }">
+    <div class="flex flex-col min-h-screen">
+
+    {{template "views/partials/navbar"}}
+    <div class="chat-container mt-2 mr-2 ml-2 mb-2 bg-gray-800 shadow-lg rounded-lg " >
+     <!-- Chat Header -->
+      <div class="border-b border-gray-700 p-4"  x-data="{ component: 'menu' }">
+
+        <div class="flex items-center justify-center">
+
+          <div x-show="component === 'menu'" id="menu">
+
+            <button @click="component = 'key'" title="Update API key"
+            class="m-2 float-right inline-block rounded bg-primary px-6 pb-2.5 mb-3 pt-2.5 text-xs font-medium uppercase leading-normal text-white shadow-primary-3 transition duration-150 ease-in-out hover:bg-primary-accent-300 hover:shadow-primary-2 focus:bg-primary-accent-300 focus:shadow-primary-2 focus:outline-none focus:ring-0 active:bg-primary-600 active:shadow-primary-2 dark:shadow-black/30 dark:hover:shadow-dark-strong dark:focus:shadow-dark-strong dark:active:shadow-dark-strong"
+            >Set API Key🔑</button>
+
+          </div>
+
+        <!-- API key form: shown instead of the menu while component === 'key' -->
+        <form x-show="component === 'key'" id="key">
+          <input
+            type="password"
+            id="apiKey"
+            name="apiKey"
+            class="bg-gray-800 text-white border border-gray-600 focus:border-blue-500 focus:ring focus:ring-blue-500 focus:ring-opacity-50 rounded-md shadow-sm p-2 appearance-none"
+            placeholder="API Key"
+            x-model.lazy="key"
+          />
+          <button @click="component = 'menu'" type="submit" title="Save API key">
+            <i class="fa-solid fa-arrow-right"></i>
+          </button>
+        </form>
+        </div>
+      </div>
+
+    <div class="flex items-center justify-center">
+    <div class="w-full p-4  max-w-md border-t border-gray-700 ">
+      <div class="bg-gray-700 shadow-md rounded px-8 pt-6 pb-8 mb-4">
+      <!-- Status widgets: hidden by default via inline display:none -->
+      <div id="recording" class="" style="display: none;">
+        <i class="fa-solid fa-microphone animate-pulse text-red-700"></i>
+        <span class="text-white-700 text-sm font-bold mb-2">Recording... press "Stop recording" to stop</span>
+      </div>
+      <div id="loader" class="my-2 loader" style="display: none;"></div>
+      <div id="statustext" class="my-2 p-2 block text-white-700 text-sm font-bold mb-2" ></div>
+      <!-- Model pickers: each dropdown lists every entry of .ModelsConfig -->
+      <div class="mb-4" >
+        <label for="modelSelect" class="block text-white-700 text-sm font-bold mb-2">LLM Model:</label>
+        <select id="modelSelect"
+        class="bg-gray-800 text-white border border-gray-600 focus:border-blue-500 focus:ring focus:ring-blue-500 focus:ring-opacity-50 rounded-md shadow-sm p-2 appearance-none"
+        >
+          <option value="" disabled class="text-gray-400" >Select a model</option>
+
+          {{ range .ModelsConfig }}
+          <option value="{{.Name}}"  class="bg-gray-700 text-white">{{.Name}}</option>
+          {{ end }}
+        </select>
+      </div>
+
+      <div class="mb-4" >
+        <label for="whisperModelSelect" class="block text-white-700 text-sm font-bold mb-2">Whisper Model:</label>
+          <select id="whisperModelSelect"
+          class="bg-gray-800 text-white border border-gray-600 focus:border-blue-500 focus:ring focus:ring-blue-500 focus:ring-opacity-50 rounded-md shadow-sm p-2 appearance-none"
+
+          >
+            <option value="" disabled class="text-gray-400" >Select a model</option>
+
+            {{ range .ModelsConfig }}
+            <option value="{{.Name}}"  class="bg-gray-700 text-white">{{.Name}}</option>
+            {{ end }}
+          </select>
+      </div>
+
+
+      <div class="mb-4" >
+        <label for="ttsModelSelect" class="block text-white-700 text-sm font-bold mb-2">TTS Model:</label>
+        <select id="ttsModelSelect"
+        class="bg-gray-800 text-white border border-gray-600 focus:border-blue-500 focus:ring focus:ring-blue-500 focus:ring-opacity-50 rounded-md shadow-sm p-2 appearance-none"
+        >
+          <option value="" disabled class="text-gray-400" >Select a model</option>
+          {{ range .ModelsConfig }}
+          <option value="{{.Name}}"  class="bg-gray-700 text-white">{{.Name}}</option>
+          {{ end }}
+        </select>
+      </div>
+
+
+      <button id="recordButton"
+        class="bg-red-500 hover:bg-red-700 text-white font-bold py-2 px-4 rounded focus:outline-none focus:shadow-outline"
+      ><i class="fa-solid fa-microphone pr-2"></i>Talk</button>
+      <a id="resetButton"
+      class="inline-block align-baseline font-bold text-sm text-blue-500 hover:text-blue-800"
+      href="#"
+      >Reset conversation</a>
+      <audio id="audioPlayback" controls hidden></audio>
+
+        </div>
+      </div>
+      </div>
+    </div>
+    <!-- Closes the outer "flex flex-col min-h-screen" wrapper, which was left open -->
+    </div>
+  </body>
+</html>