Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Speech recognition / dictation #114

Merged
merged 13 commits into from
Aug 18, 2024
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,11 @@
sling:resourceType="granite/ui/components/coral/foundation/actionbar">
<primary
jcr:primaryType="nt:unstructured">
<dictate
jcr:primaryType="nt:unstructured" icon="stage"
sling:resourceType="granite/ui/components/coral/foundation/button"
granite:title="Dictate your prompt to the AI: while pressed, audio is recorded and after releasing the button it's transcribed and inserted into the prompt."
text="Dictate" granite:class="composum-ai-dictate-button hide"/>
<generate
jcr:primaryType="nt:unstructured" icon="play"
sling:resourceType="granite/ui/components/coral/foundation/button"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,12 @@
sling:resourceType="granite/ui/components/coral/foundation/actionbar">
<primary
jcr:primaryType="nt:unstructured">
<dictate
jcr:primaryType="nt:unstructured"
icon="stage"
sling:resourceType="granite/ui/components/coral/foundation/button"
granite:title="Dictate your prompt to the AI: while pressed, audio is recorded and after releasing the button it's transcribed and inserted into the prompt."
granite:class="composum-ai-dictate-button hide"/>
<generate
jcr:primaryType="nt:unstructured" icon="play"
sling:resourceType="granite/ui/components/coral/foundation/button"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,12 @@
jcr:title="Describe image"
jcr:description="Please describe the following image in a way that a blind person can understand it."
/>
<teaser_22
jcr:primaryType="nt:unstructured"
sling:resourceType="wknd/components/teaser"
jcr:title="Dictation fixup"
jcr:description="Please fix any dictation errors in the following dictated text, and follow embedded editing instructions:&#xA;&#xA;"
/>
</container>
</root>
</jcr:content>
Expand Down
182 changes: 182 additions & 0 deletions aem/ui.frontend/src/main/webpack/site/AIDictate.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
/** Dictation service - a button records while pressed and then transcribes using the AIDictationServlet
* and then inserts the result into a text area. */

import {AIConfig} from './AIConfig.js';

const AIDICTATE_SERVLET = '/bin/cpm/ai/dictate';

class AIDictate {

    /**
     * Creates a new AIDictate instance.
     * @param {string} dictatebutton - Selector for the button that starts and stops recording.
     * @param {string} textarea - Selector for the textarea where the transcription is inserted.
     * @param {function} onChangeCallback - Callback function that is called after the transcription is inserted.
     * @param {function} onErrorCallback - Optional callback that is called with the error when recording / transcription fails.
     */
    constructor(dictatebutton, textarea, onChangeCallback, onErrorCallback) {
        this.dictatebutton = $(dictatebutton)[0];
        this.onChangeCallback = onChangeCallback;
        // BUGFIX: this assignment was missing, so onError() could never reach the caller's handler
        this.onErrorCallback = onErrorCallback;
        this.setTextarea(textarea);
        this.recorder = null;
        this.audioStream = null;
        this.timeoutCall = null;
        this.isRecording = false;
        this.isStoppingRecording = false;
        // servlet URL with the current content path as suffix; used for both the availability check (GET) and transcription (POST)
        this.dictateUrl = Granite.HTTP.externalize(AIDICTATE_SERVLET) + ".txt" + new AIConfig().getContentURL();
        console.log("AIDictate constructor", this.dictatebutton, this.textarea);
        this.enableCheck();
        this.attachEventListeners();
    }

    /**
     * Binds the dictation to another textarea / text input field; subsequent transcriptions are inserted there.
     * @param {string|jQuery} textarea - Selector (or jQuery object) for the new target field.
     * @throws {Error} if the selector does not resolve to a textarea or text input field.
     */
    setTextarea(textarea) {
        this.textarea = $(textarea)[0];
        // verify that this is a textarea or text input field
        if (!this.textarea || !this.textarea.tagName || !this.textarea.tagName.match(/textarea|input/i)) {
            throw new Error('AIDictate: textarea parameter must be a textarea or text input field');
        }
        // remember the cursor position so a transcription can be inserted even after the field loses focus
        this.lastPosition = this.textarea.selectionStart;
    }

    /** Performs a GET request to the servlet with current page path as suffix,
     * and if that answers with 200 enable() is called. */
    enableCheck() {
        fetch(this.dictateUrl)
            .then(response => {
                if (response.status === 200) {
                    this.importRecorderAndEnable();
                }
            })
            .catch(error => {
                // availability check failed - dictation simply stays disabled, but avoid an unhandled rejection
                console.error('AIDictate: availability check failed', error);
            });
    }

    /** Adds a script https://cdnjs.cloudflare.com/ajax/libs/recorderjs/0.1.0/recorder.js to the document
     * if Recorder isn't yet defined, and enables the dictate button once it is loaded. */
    importRecorderAndEnable() {
        if (typeof Recorder === 'undefined') {
            const script = document.createElement('script');
            script.src = 'https://cdnjs.cloudflare.com/ajax/libs/recorderjs/0.1.0/recorder.js';
            script.onload = () => {
                this.enable();
            };
            document.body.appendChild(script);
        } else {
            this.enable();
        }
    }

    /** Shows the dictate button; called only after the servlet availability check succeeded and Recorder.js is loaded. */
    enable() {
        this.dictatebutton.classList.remove('hide');
    }

    /** Starts recording from the microphone; a no-op while already recording or while a stop is in progress. */
    startRecording = async () => {
        if (!this.isRecording && !this.isStoppingRecording) {
            console.log('Recording...');
            this.audioStream = await navigator.mediaDevices.getUserMedia({audio: true});
            const audioContext = new AudioContext();
            const input = audioContext.createMediaStreamSource(this.audioStream);
            this.recorder = new Recorder(input, {numChannels: 1});
            this.recorder.record();
            this.timeoutCall = setTimeout(this.stopRecording, 120000); // Stop recording after 2 minutes
            this.isRecording = true;
        }
    };

    /** Stops recording, sends the recorded WAV to the dictation servlet and inserts the transcription
     * into the textarea. The button is disabled while the request is in flight. */
    stopRecording = async () => {
        if (!this.isRecording || this.isStoppingRecording) {
            return;
        }
        this.isStoppingRecording = true;
        console.log('Stopping recording');
        this.dictatebutton.disabled = true;
        this.recorder.stop();
        clearTimeout(this.timeoutCall);
        this.audioStream.getTracks().forEach(track => track.stop());
        this.recorder.exportWAV(async (blob) => {
            Granite.csrf.refreshToken().then(token => {
                // send the text before the cursor as context prompt to improve transcription quality
                const promptText = this.textarea.value && this.textarea.selectionStart &&
                    this.textarea.value.substring(0, this.textarea.selectionStart);
                console.log('Exported WAV');
                const formData = new FormData();
                formData.append('audioStream', blob);
                formData.append('contentType', 'audio/wav');
                if (promptText) {
                    formData.append('prompt', promptText);
                }

                console.log('Sending request');
                fetch(this.dictateUrl, {
                    method: 'POST',
                    body: formData,
                    headers: {'CSRF-Token': token}
                }).then(response => {
                    console.log('Received response', response.status);
                    if (response.ok) {
                        return response.text();
                    } else {
                        throw new Error(`Error: ${response.statusText}`);
                    }
                }).then(data => {
                    console.log('Received data', data);
                    this.insertResult(data.trim());
                }).catch(error => {
                    this.onError(error);
                }).finally(() => {
                    console.log('Finished, enabling button');
                    this.dictatebutton.disabled = false;
                    this.recorder = null;
                    this.isRecording = false;
                    this.isStoppingRecording = false;
                });
            });
        });
    };

    /**
     * Inserts the transcribed text at the current cursor position (or the last known position if the
     * textarea is not focused), padding with single spaces so the insertion doesn't fuse with adjacent words.
     * @param {string} data - The transcription to insert.
     */
    insertResult(data) {
        // Insert transcription at current cursor position
        let cursorPosition = document.activeElement === this.textarea && this.textarea.selectionStart ?
            this.textarea.selectionStart : this.lastPosition;
        let value = this.textarea.value || '';
        const textBefore = value.substring(0, cursorPosition);
        const textAfter = value.substring(cursorPosition);
        this.textarea.value = `${textBefore}${/\s$/.test(textBefore) ? '' : ' '}${data}${/^\s/.test(textAfter) ? '' : ' '}${textAfter}`;
        // place the cursor right after the inserted text, accounting for any padding spaces added
        this.textarea.selectionStart = cursorPosition + (/\s$/.test(textBefore) ? 0 : 1) + data.length + (/^\s/.test(textAfter) ? 0 : 1);
        this.textarea.selectionEnd = this.textarea.selectionStart;
        this.lastPosition = this.textarea.selectionStart;
        this.textarea.focus();
        if (this.textarea.scrollIntoViewIfNeeded) { // non-standard API, not available in all browsers
            this.textarea.scrollIntoViewIfNeeded();
        }
        if (this.onChangeCallback) {
            this.onChangeCallback();
        }
    }

    /** Wires up press-and-hold recording on the button, the Cmd-Ctrl-T keyboard shortcut,
     * and tracking of the cursor position when the textarea loses focus. */
    attachEventListeners() {
        this.dictatebutton.addEventListener('mousedown', this.startRecording);
        this.dictatebutton.addEventListener('mouseup', this.stopRecording);
        window.addEventListener('keydown', (e) => {
            if (e.metaKey && e.ctrlKey && e.key === 't') {
                this.startRecording();
                e.preventDefault();
            }
        });
        // any key release stops an ongoing recording; stopRecording() is a no-op when not recording
        window.addEventListener('keyup', () => {
            if (!this.isStoppingRecording) {
                this.stopRecording();
            }
        });
        this.textarea.addEventListener('blur', () => {
            this.lastPosition = this.textarea.selectionStart;
        });
    }

    /** Logs the error and forwards it to the onErrorCallback passed to the constructor, if any. */
    onError(error) {
        console.error(error);
        if (this.onErrorCallback) {
            this.onErrorCallback(error);
        }
    }

}

export {AIDictate};

console.log("AIDictate.js loaded", AIDictate);
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
/** Implementation for the actions of the Content Creation Dialog - button actions, drop down list actions etc. */

import {AICreate} from './AICreate.js';
import {AIDictate} from './AIDictate.js';
import {errorText, findSingleElement} from './common.js';
import {DialogHistory} from './DialogHistory.js';
import {HelpPage} from './HelpPage.js';
Expand Down Expand Up @@ -59,6 +60,9 @@ class ContentCreationDialog {
this.assignElements();
this.bindActions();
this.createServlet = new AICreate(this.streamingCallback.bind(this), this.doneCallback.bind(this), this.errorCallback.bind(this));
this.dictate = new AIDictate(this.$dialog.find('.composum-ai-dictate-button'),
this.$prompt, this.onPromptChanged.bind(this)
);
const historyPath = property ? componentPath + '/' + property : componentPath;
if (!historyMap[historyPath]) {
historyMap[historyPath] = [];
Expand Down
23 changes: 18 additions & 5 deletions aem/ui.frontend/src/main/webpack/site/SidePanelDialog.js
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
/** Implementation for the actions of the Content Creation Dialog - button actions, drop down list actions etc. */

import {AICreate} from './AICreate.js';
import {contentFragmentPath, errorText, findSingleElement, coralSelectValue} from './common.js';
import {AIDictate} from './AIDictate.js';
import {contentFragmentPath, coralSelectValue, errorText, findSingleElement} from './common.js';
import {DialogHistory} from './DialogHistory.js';
import {HelpPage} from './HelpPage.js';

/** Keeps dialog histories per path. */
const historyMap = {};

class SidePanelDialog {
Expand All @@ -23,6 +23,14 @@ class SidePanelDialog {
findSingleElement(this.$dialog, '.composum-ai-templates').hide(); // class hidden isn't present in content fragment editor
this.createServlet = new AICreate(this.streamingCallback.bind(this), this.doneCallback.bind(this), this.errorCallback.bind(this));

// Initialize AIDictate
this.dictate = new AIDictate(
this.$dialog.find('.composum-ai-dictate-button'),
this.$promptContainer.find('.composum-ai-prompt:first'),
this.onPromptAreaChanged.bind(this),
this.showError.bind(this)
);

const historyPath = this.getContentPath();
if (!historyMap[historyPath]) {
historyMap[historyPath] = [];
Expand All @@ -45,12 +53,11 @@ class SidePanelDialog {
bindActions() {
this.$predefinedPromptsSelector.on('change', this.onPredefinedPromptsChanged.bind(this));
this.$promptContainer.on('change input', '.composum-ai-prompt', this.onPromptAreaChanged.bind(this));
this.$promptContainer.on('focus', '.composum-ai-prompt', this.expandOnFocus);
this.$promptContainer.on('focus', '.composum-ai-prompt', this.onPromptFocus.bind(this)); // Update dictate textarea on focus
this.$promptContainer.on('blur', '.composum-ai-prompt', this.shrinkOnBlur);
this.$generateButton.on('click', this.onGenerateButtonClicked.bind(this));
this.$stopButton.on('click', this.onStopClicked.bind(this));
findSingleElement(this.$dialog, '.composum-ai-reset-button').on('click', this.resetForm.bind(this));
// bind enter key (without any modifiers) in .composum-ai-promptcontainer .composum-ai-prompt to submit
findSingleElement(this.$dialog, '.composum-ai-promptcontainer').on('keydown', '.composum-ai-prompt', (function (event) {
if (event.keyCode === 13 && !event.ctrlKey && !event.shiftKey && !event.altKey && !event.metaKey) {
event.preventDefault();
Expand Down Expand Up @@ -158,13 +165,19 @@ class SidePanelDialog {
}
}

onPromptAreaChanged(event) {
onPromptAreaChanged() {
if (this.verbose) console.log("onPromptAreaChanged", arguments); // on each key press
coralSelectValue(this.$predefinedPromptsSelector, '-');
coralSelectValue(this.$predefinedPromptsSelector, '');
this.setAutomaticGenerateButtonState();
}

onPromptFocus(event) {
const focusedElement = $(event.target);
this.dictate.setTextarea(focusedElement); // Update the textarea in the AIDictate instance
console.log("Updated dictate textarea to", focusedElement);
}

// TODO: possibly use resize on typing https://stackoverflow.com/questions/454202/creating-a-textarea-with-auto-resize/77155208
expandOnFocus(event) {
if (this.debug) console.log("expandOnFocus", arguments);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
package com.composum.ai.backend.base.service.chat;

import java.io.InputStream;

import javax.annotation.Nonnull;
import javax.annotation.Nullable;

/**
 * Services related to dictation: transcription of recorded audio to text.
 */
public interface GPTDictationService {

    /**
     * Whether the service is enabled and properly configured.
     *
     * @param configuration the configuration to use, or null for the default configuration
     * @return true if the service can be used
     */
    boolean isAvailable(@Nullable GPTConfiguration configuration);

    /**
     * Transcribes the input audio to text.
     *
     * @param audioStream the audio stream to transcribe, will be closed
     * @param contentType the content type of the audio, e.g. "audio/mpeg" for mp3, "audio/wav" for wav
     * @param language the language code to use, e.g. "en" for English, or null for automatic detection
     * @param configuration the configuration to use, or null for the default configuration
     * @param prompt an optional prompt to give the AI some context, e.g. previous sentences
     * @return the transcribed text
     * @throws IllegalStateException if the service is not available / configured
     */
    String transcribe(@Nonnull InputStream audioStream, @Nonnull String contentType, @Nullable String language,
                      @Nullable GPTConfiguration configuration, @Nullable String prompt)
            throws IllegalStateException;

}
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
import org.apache.hc.client5.http.async.methods.SimpleHttpRequest;
import org.apache.hc.client5.http.async.methods.SimpleHttpResponse;
import org.apache.hc.client5.http.async.methods.SimpleRequestProducer;
import org.apache.hc.client5.http.classic.methods.HttpPost;
import org.apache.hc.client5.http.config.ConnectionConfig;
import org.apache.hc.client5.http.config.RequestConfig;
import org.apache.hc.client5.http.impl.async.CloseableHttpAsyncClient;
Expand Down Expand Up @@ -94,9 +95,10 @@
* @see "https://platform.openai.com/docs/api-reference/chat/create"
* @see "https://platform.openai.com/docs/guides/chat"
*/
@Component(service = GPTChatCompletionService.class)
@Component(service = {GPTChatCompletionService.class, GPTInternalOpenAIHelper.class})
@Designate(ocd = GPTChatCompletionServiceImpl.GPTChatCompletionServiceConfig.class)
public class GPTChatCompletionServiceImpl implements GPTChatCompletionService {
public class GPTChatCompletionServiceImpl extends GPTInternalOpenAIHelper.GPTInternalOpenAIHelperInst
implements GPTChatCompletionService, GPTInternalOpenAIHelper {

protected static final Logger LOG = LoggerFactory.getLogger(GPTChatCompletionServiceImpl.class);

Expand Down Expand Up @@ -974,6 +976,21 @@ protected static GPTException buildException(Integer errorStatusCode, String res
+ ") : " + result);
}

/**
 * Returns this object as the internal OpenAI helper instance - this class implements
 * both the public service interface and the internal helper.
 */
@Override
public GPTInternalOpenAIHelperInst getInstance() {
    return this;
}

/**
 * Sets the OpenAI authentication headers on the request: the API key (as Bearer token) and,
 * if present, the organization id. Values from the given configuration take precedence over
 * the service's configured defaults; blank values are treated as absent.
 */
@Override
void initOpenAIRequest(@Nonnull HttpPost request, @Nullable GPTConfiguration gptConfiguration) {
    String effectiveApiKey = this.apiKey;
    if (gptConfiguration != null && gptConfiguration.getApiKey() != null
            && !gptConfiguration.getApiKey().trim().isEmpty()) {
        effectiveApiKey = gptConfiguration.getApiKey();
    }
    String effectiveOrganizationId = this.organizationId;
    if (gptConfiguration != null && gptConfiguration.getOrganizationId() != null
            && !gptConfiguration.getOrganizationId().trim().isEmpty()) {
        effectiveOrganizationId = gptConfiguration.getOrganizationId();
    }
    request.addHeader("Authorization", "Bearer " + effectiveApiKey);
    if (effectiveOrganizationId != null && !effectiveOrganizationId.trim().isEmpty()) {
        request.addHeader("OpenAI-Organization", effectiveOrganizationId);
    }
}

/**
* Makes doubly sure that result is somehow set after the call.
*/
Expand Down
Loading
Loading