alessandro trinca tornidor
feat: port whisper and faster-whisper support from https://github.com/Thiagohgl/ai-pronunciation-trainer
85b7206
// Audio context initialization
// Recording/playback state; these bindings start out undefined and are assigned by the functions below.
let mediaRecorder;
let audioChunks;
let audioBlob;
let stream;
let audioRecorded;
const ctx = new AudioContext();
let currentAudioForPlaying;
let lettersOfWordAreCorrect = [];

// UI-related variables
const page_title = "AI Pronunciation Trainer";
// Indexed by the numeric accuracy category of a word (see generateWordModal).
const accuracy_colors = ["green", "orange", "red"];
// Accuracy values below these thresholds select the "bad" / "okay" feedback sound.
let badScoreThreshold = 30;
let mediumScoreThreshold = 70;
let currentSample = 0;
let currentScore = 0;
let sample_difficult = 0;
let scoreMultiplier = 1;
let playAnswerSounds = true;
let isNativeSelectedForPlayback = true;
let isRecording = false;
let serverIsInitialized = false;
let serverWorking = true;
let languageFound = true;
let currentSoundRecorded = false;
let currentText;
let currentIpa;
let real_transcripts_ipa;
let matched_transcripts_ipa;
let wordCategories;
let startTime;
let endTime;
// jQuery clone of #single_word_ipa_pair, captured by initializeServer.
let SingleWordIpaPairBackup;
// API related variables
let AILanguage = "de"; // Standard is German

/**
 * Extract the API key from a `document.cookie`-style string.
 *
 * Only the first cookie is considered and everything after its first "=" is
 * returned, so values that themselves contain "=" are kept intact.
 * NOTE(review): assumes the key is stored in the first cookie — confirm against the python/flask code.
 *
 * @param {string} cookieString - raw cookie string ("name=value; other=value").
 * @returns {string} the cookie value, or "" when no value can be found.
 */
const readApiKeyFromCookie = (cookieString) => {
  const firstCookie = String(cookieString == null ? "" : cookieString).split(";")[0];
  const separatorIdx = firstCookie.indexOf("=");
  return separatorIdx === -1 ? "" : firstCookie.slice(separatorIdx + 1).trim();
};

// Read the Public Key from a cookie (it's managed within the python/flask code - STScoreAPIKey).
// If, for some reason, you would like a private one, send me a message and we can discuss some possibilities.
// The binding is declared at file scope (not inside the try block) so that every request below can read it.
let STScoreAPIKey = "";
try {
  STScoreAPIKey = readApiKeyFromCookie(document.cookie);
} catch (error) {
  console.log("STScoreAPIKey::error:", error, "#")
}
let apiMainPathSample = '';// 'http://127.0.0.1:3001';// 'https://a3hj0l2j2m.execute-api.eu-central-1.amazonaws.com/Prod';
let apiMainPathSTS = '';// 'https://wrg7ayuv7i.execute-api.eu-central-1.amazonaws.com/Prod';
// Placeholder and error messages written into the page by the UI helpers below.
const defaultOriginalScript = "Click on the bar on the right to generate a new sentence (please use chrome web browser).";
const defaultErrorScript = "Server error. Either the daily quota of the server is over or there was some internal error. You can try to generate a new sample in a few seconds. If the error persist, try comming back tomorrow or download the local version from Github :)";
const editErrorScript = "Please edit this text before generating the IPA for a custom sentence!";
const browserUnsupported = "Browser unsupported";
const recordingError = "Recording error, please try again or restart page.";

// Variables to playback accuracy sounds
let soundsPath = "../static"; // 'https://stscore-sounds-bucket.s3.eu-central-1.amazonaws.com';
// Decoded audio buffers; filled by cacheSoundFiles.
let soundFileGood = null;
let soundFileOkay = null;
let soundFileBad = null;

// Speech generation
var synth = window.speechSynthesis;
let voice_idx = 0;
// Voice used for speech synthesis; selected by changeLanguage.
let voice_synth = null;
//############################ UI general control functions ###################
/**
 * Re-enable the main controls after a blocking operation has finished.
 *
 * Removes the 'disabled' class from the record/play/next controls (including
 * "buttonNext", which blockUI disables), restores the "next sample" click
 * handler and colour, and re-enables the audio file uploader.
 * "playRecordedAudio" is only re-enabled when a recording is available.
 *
 * @param {boolean} [unlockIPACustomText=false] - when true, also enable "buttonCustomText".
 */
const unblockUI = (unlockIPACustomText = false) => {
  const byId = (id) => document.getElementById(id);
  const reEnabledIds = ["recordAudio", "playSampleAudio", "nextButtonDiv", "original_script", "buttonNext"];
  reEnabledIds.forEach((id) => byId(id).classList.remove('disabled'));

  const nextButton = byId("buttonNext");
  nextButton.onclick = () => getNextSample();
  nextButton.style["background-color"] = '#58636d';

  if (currentSoundRecorded) {
    byId("playRecordedAudio").classList.remove('disabled');
  }
  enableElementWithClass("input-uploader-audio-file");
  if (unlockIPACustomText) {
    enableElementWithClass("buttonCustomText");
  }
};
/**
 * Disable the main controls while an operation is in progress.
 *
 * Adds the 'disabled' class to the record/play/next controls and the text
 * area, drops the "next sample" click handler and disables the audio file uploader.
 */
const blockUI = () => {
  const markDisabled = (id) => document.getElementById(id).classList.add('disabled');
  markDisabled("recordAudio");
  markDisabled("playSampleAudio");
  document.getElementById("buttonNext").onclick = null;
  markDisabled("original_script");
  markDisabled("playRecordedAudio");
  markDisabled("buttonNext");
  disableElementWithClass("input-uploader-audio-file");
};
/**
 * Put the page into its error state.
 *
 * Blocks the UI, leaves only the "next sample" button usable, hides the
 * single-word IPA widgets (showing the error placeholder instead) and writes
 * the error message into the "original_script" element.
 *
 * @param {string} [errorMsg=defaultErrorScript] - message written (as HTML) into "original_script".
 */
const UIError = (errorMsg = defaultErrorScript) => {
  const byId = (id) => document.getElementById(id);
  blockUI();

  // If error, user can only try to get a new sample
  const nextButton = byId("buttonNext");
  nextButton.onclick = () => getNextSample();
  nextButton.style["background-color"] = '#58636d';

  byId("recorded_ipa_script").innerHTML = "";
  byId("single_word_ipa_pair_error").style["display"] = "inline";
  const hiddenIds = ["single_word_ipa_pair_separator", "single_word_ipa_reference_recorded", "single_word_ipa_current"];
  for (const hiddenId of hiddenIds) {
    byId(hiddenId).style["display"] = "none";
  }

  byId("ipa_script").innerText = "Error";
  byId("main_title").innerText = 'Server Error';
  byId("original_script").innerHTML = errorMsg;
};
/**
 * Disable the element with the given id and drop its 'darkgreen' class.
 *
 * @param {string} id - id of the element to disable.
 */
const disableElementWithClass = (id) => {
  const target = document.getElementById(id);
  target.disabled = true;
  target.classList.remove('darkgreen');
};
/**
 * Enable the element with the given id and give it the 'darkgreen' class.
 *
 * @param {string} id - id of the element to enable.
 */
const enableElementWithClass = (id) => {
  const target = document.getElementById(id);
  target.removeAttribute("disabled");
  target.classList.add('darkgreen');
};
/**
 * Unblock the UI and show the "browser unsupported" message in the page title.
 */
const UINotSupported = () => {
  unblockUI();
  const titleElement = document.getElementById("main_title");
  titleElement.innerText = browserUnsupported;
};
/**
 * Unblock the UI, show the recording error message in the page title and
 * set up the media device again via startMediaDevice.
 */
const UIRecordingError = () => {
  unblockUI();
  const titleElement = document.getElementById("main_title");
  titleElement.innerText = recordingError;
  startMediaDevice();
};
//################### Application state functions #######################
/**
 * Add a pronunciation score, weighted by the current score multiplier, to the
 * running total and round the total to an integer.
 *
 * @param {number} currentPronunciationScore - score to add; NaN is ignored.
 */
function updateScore(currentPronunciationScore) {
  if (!Number.isNaN(currentPronunciationScore)) {
    const weightedScore = currentPronunciationScore * scoreMultiplier;
    currentScore = Math.round(currentScore + weightedScore);
  }
}
/**
 * Download and decode the three feedback sounds (good, okay, bad), one after
 * the other, and store the decoded buffers in soundFileGood / soundFileOkay /
 * soundFileBad.
 *
 * @returns {Promise<void>} settles once all three sounds are decoded; rejects on the first failure.
 */
const cacheSoundFiles = async () => {
  // Fetch one file below soundsPath and decode it with the shared audio context.
  const loadSound = async (fileName) => {
    const response = await fetch(soundsPath + fileName);
    const rawAudio = await response.arrayBuffer();
    return ctx.decodeAudioData(rawAudio);
  };

  soundFileGood = await loadSound('/ASR_good.wav');
  soundFileOkay = await loadSound('/ASR_okay.wav');
  soundFileBad = await loadSound('/ASR_bad.wav');
};
/**
 * Tell whether the text in "original_script" cannot be used as a custom sentence.
 *
 * @returns {boolean} true when the trimmed text is one of the placeholder/error
 *   messages, or is empty once every non-word, non-space character is blanked out.
 */
const getCustomTextIsDisabled = () => {
  const checkText = document.getElementById("original_script").innerText.trim();
  const placeholderTexts = [defaultOriginalScript, defaultErrorScript, editErrorScript];
  if (placeholderTexts.includes(checkText)) {
    return true;
  }
  const cleanedText = checkText.toString().replace(/[^\w\s]/gi, ' ').trim();
  return cleanedText === "";
};
/**
 * Request the IPA transcription for the sentence currently typed into
 * "original_script".
 *
 * Blocks the UI, initializes the server on first use, starts caching the
 * feedback sounds when they are missing, refuses placeholder/empty text, adds
 * the previous accuracy to the score and posts the text to `/getSample`. The
 * response is handed to formatTranscriptData and the stored recording is cleared.
 * Any failure of the request switches the page to its error state.
 */
const getCustomText = async () => {
  blockUI();
  if (!serverIsInitialized) {
    await initializeServer();
  }
  if (!serverWorking) {
    UIError();
    return;
  }
  if (soundFileBad == null) {
    cacheSoundFiles();
  }
  if (getCustomTextIsDisabled()) {
    UIError(editErrorScript);
    return;
  }

  const previousAccuracy = document.getElementById("pronunciation_accuracy").innerHTML;
  updateScore(parseFloat(previousAccuracy));
  document.getElementById("main_title").innerText = "Get IPA transcription for custom text...";

  try {
    const customSentence = document.getElementById("original_script").innerText;
    const requestBody = JSON.stringify({
      "language": AILanguage,
      "transcript": customSentence
    });
    const response = await fetch(apiMainPathSample + '/getSample', {
      method: "post",
      body: requestBody,
      headers: { "X-Api-Key": STScoreAPIKey }
    });
    const data = await response.json();
    formatTranscriptData(data);
    // The previous recording no longer matches the new sentence.
    audioRecorded = undefined;
  } catch (err) {
    console.log("getCustomText::err:", err)
    UIError();
  }
};
/**
 * Request a new sample sentence for the selected difficulty.
 *
 * Blocks the UI, initializes the server on first use, starts caching the
 * feedback sounds when they are missing, adds the previous accuracy to the
 * score, reads the selected difficulty radio button and posts the category and
 * language to `/getSample`. The response is handed to formatTranscriptData;
 * any failure of the request switches the page to its error state.
 */
const getNextSample = async () => {
  blockUI();
  if (!serverIsInitialized) {
    await initializeServer();
  }
  if (!serverWorking) {
    UIError();
    return;
  }
  if (soundFileBad == null) {
    cacheSoundFiles();
  }

  const previousAccuracy = document.getElementById("pronunciation_accuracy").innerHTML;
  updateScore(parseFloat(previousAccuracy));
  document.getElementById("main_title").innerText = "Processing new sample...";

  // [radio button id, sample_difficult, scoreMultiplier]; the first checked entry wins.
  const difficultyLevels = [
    ['lengthCat1', 0, 1.3],
    ['lengthCat2', 1, 1],
    ['lengthCat3', 2, 1.3],
    ['lengthCat4', 3, 1.6],
  ];
  const selectedLevel = difficultyLevels.find(([radioId]) => document.getElementById(radioId).checked);
  if (selectedLevel !== undefined) {
    sample_difficult = selectedLevel[1];
    scoreMultiplier = selectedLevel[2];
  }

  try {
    const requestBody = JSON.stringify({
      "category": sample_difficult.toString(), "language": AILanguage
    });
    const response = await fetch(apiMainPathSample + '/getSample', {
      method: "post",
      body: requestBody,
      headers: { "X-Api-Key": STScoreAPIKey }
    });
    const data = await response.json();
    formatTranscriptData(data);
  } catch (err) {
    console.log("getNextSample::err:", err)
    UIError();
  }
};
/**
 * Show a freshly received sample on the page and reset the per-sample state.
 *
 * Writes the transcript, its IPA form and its translation into the page,
 * clears the recorded IPA and accuracy fields, restores #single_word_ipa_pair
 * from its backup clone, updates the score line, bumps the sample counter and
 * unblocks the UI (keeping "playRecordedAudio" disabled).
 *
 * @param {object} data - response of `/getSample`; `real_transcript`,
 *   `ipa_transcript` and `transcript_translation` are read from it.
 */
const formatTranscriptData = (data) => {
  const byId = (id) => document.getElementById(id);
  // Label an element with its own id and set one of its content properties.
  const labelAndFill = (id, property, value) => {
    const element = byId(id);
    element.ariaLabel = id;
    element[property] = value;
  };

  currentText = data.real_transcript;
  byId("original_script").innerText = currentText;
  currentIpa = data.ipa_transcript;
  labelAndFill("ipa_script", "innerText", `/ ${currentIpa} /`);
  labelAndFill("recorded_ipa_script", "innerText", "");
  labelAndFill("pronunciation_accuracy", "innerHTML", "");

  // restore a clean state for document.getElementById("single_word_ipa_pair") to avoid errors when playing the word audio
  $(document).ready(() => {
    $("#single_word_ipa_pair").replaceWith(SingleWordIpaPairBackup.clone());
  });

  // The score line shows the sample counter before it is incremented.
  byId("section_accuracy").innerText = `| Score: ${currentScore.toString()} - (${currentSample.toString()})`;
  currentSample += 1;
  byId("main_title").innerText = page_title;
  byId("translated_script").innerText = data.transcript_translation;
  currentSoundRecorded = false;
  unblockUI(true);
  byId("playRecordedAudio").classList.add('disabled');
};
/**
 * Toggle recording: stop when a recording is in progress, start one otherwise.
 *
 * @returns {Promise<*>} resolves with the result of stopRecording or recordSample.
 */
const updateRecordingState = async () => {
  if (isRecording) {
    return stopRecording();
  }
  return recordSample();
};
/**
 * Fill the single-word IPA widgets for one word: the reference IPA in black
 * and the matched (spoken) IPA in the colour of the word's accuracy category.
 *
 * @param {number|string} word_idx - index of the word in the transcript arrays.
 */
const generateWordModal = (word_idx) => {
  const referenceIpa = real_transcripts_ipa[word_idx];
  wrapWordForPlayingLink(referenceIpa, word_idx, false, "black");

  const spokenIpa = matched_transcripts_ipa[word_idx];
  const categoryColor = accuracy_colors[parseInt(wordCategories[word_idx])];
  wrapWordForPlayingLink(spokenIpa, word_idx, true, categoryColor);
};
/**
 * Start a new recording.
 *
 * Updates the title and record icon, blocks the UI except for the record
 * button (so the user can stop the recording), resets the collected audio
 * chunks and starts the media recorder.
 */
const recordSample = async () => {
  const byId = (id) => document.getElementById(id);
  byId("main_title").innerText = "Recording... click again when done speaking";
  byId("recordIcon").innerHTML = 'pause_presentation';

  blockUI();
  // blockUI disables the record button too; it must stay clickable to stop the recording.
  byId("recordAudio").classList.remove('disabled');

  audioChunks = [];
  isRecording = true;
  mediaRecorder.start();
};
/**
 * Switch the practice language and pick a matching speech-synthesis voice.
 *
 * Stores the language in AILanguage, updates the "languageBox" label for the
 * known languages ('de', 'en') and selects a voice: the preferred voice name
 * for the language if available, otherwise the first voice whose language code
 * starts with the same two letters. `languageFound` reports whether a voice
 * was selected; `voice_synth` is left untouched when none matches.
 *
 * @param {string} language - two-letter language code.
 * @param {boolean} [generateNewSample=false] - when true, request a new sample afterwards.
 */
const changeLanguage = (language, generateNewSample = false) => {
  // Kept local: the available voices are only needed for this lookup.
  const voices = synth.getVoices();
  AILanguage = language;
  languageFound = false;
  let languageIdentifier, languageName;
  switch (language) {
    case 'de':
      document.getElementById("languageBox").innerText = "German";
      languageIdentifier = 'de';
      languageName = 'Anna';
      break;
    case 'en':
      document.getElementById("languageBox").innerText = "English";
      languageIdentifier = 'en';
      languageName = 'Daniel';
      break;
  }

  const sameLanguageVoices = voices.filter((voice) => voice.lang.slice(0, 2) === languageIdentifier);
  // If the specific voice is not found, fall back to anything with the same language.
  const selectedVoice = sameLanguageVoices.find((voice) => voice.name === languageName) || sameLanguageVoices[0];
  if (selectedVoice !== undefined) {
    voice_synth = selectedVoice;
    languageFound = true;
  }

  if (generateNewSample) {
    getNextSample();
  }
};
//################### Speech-To-Score function ########################
// Constraints passed to getUserMedia in startMediaDevice: audio only,
// requesting a single channel at a 48000 Hz sample rate.
const mediaStreamConstraints = {
  audio: { channelCount: 1, sampleRate: 48000 },
};
/**
 * Build the clickable link for one word of the reference text, with one
 * coloured span per letter (green when the letter's flag is "1", red otherwise).
 * Clicking the link opens the single-word view through generateWordModal.
 *
 * @param {string} wordText - the word to render.
 * @param {number} wordIdx - position of the word in the sentence.
 * @param {string} letterFlags - per-letter correctness flags for this word ("1" = correct).
 * @returns {HTMLAnchorElement} the populated (not yet attached) link element.
 */
function buildColoredWordLink(wordText, wordIdx, letterFlags) {
  const wordLink = document.createElement("a");
  for (let letterIdx = 0; letterIdx < wordText.length; letterIdx++) {
    const letterSpan = document.createElement("span");
    letterSpan.style.color = letterFlags[letterIdx] === "1" ? 'green' : "red";
    letterSpan.innerText = wordText[letterIdx];
    wordLink.appendChild(letterSpan);
  }
  wordLink.ariaLabel = `word${wordIdx}${wordText}`.replace(/[^a-zA-Z0-9]/g, "");
  wordLink.style.whiteSpace = "nowrap";
  wordLink.style.textDecoration = "underline";
  wordLink.onclick = function () {
    generateWordModal(wordIdx.toString());
  };
  return wordLink;
}

/**
 * Send an audio sample to the scoring endpoint and render the result.
 *
 * Reads the reference text from "original_script" (the user may have edited
 * it), posts it together with the audio to `/GetAccuracyFromRecordedAudio`,
 * then shows the recorded IPA and accuracy, stores the per-word data used by
 * the single-word playback helpers and re-renders the reference text as
 * per-letter coloured, clickable words. Any failure is logged and switches
 * the page to its error state.
 *
 * @param {string} audioBase64 - the audio, sent as the "base64Audio" field of the request.
 * @returns {Promise<void>}
 */
async function sendAudioToGetAccuracyFromRecordedAudio(audioBase64) {
  try {
    // Get currentText from "original_script" div, in case user has change it
    let text = document.getElementById("original_script").innerHTML;
    // Remove html tags
    text = text.replace(/<[^>]*>?/gm, '');
    //Remove spaces on the beginning and end
    text = text.trim();
    // Remove double spaces
    text = text.replace(/\s\s+/g, ' ');
    currentText = [text];

    const useDTWValue = document.getElementById("checkbox-dtw").checked;
    console.log(`useDTWValue: '${typeof useDTWValue}', '${useDTWValue}'`);
    const response = await fetch(apiMainPathSTS + '/GetAccuracyFromRecordedAudio', {
      method: "post",
      body: JSON.stringify({
        "title": currentText[0], "base64Audio": audioBase64, "language": AILanguage, "useDTW": useDTWValue
      }),
      headers: { "X-Api-Key": STScoreAPIKey }
    });
    const data = await response.json();

    if (playAnswerSounds) {
      playSoundForAnswerAccuracy(parseFloat(data.pronunciation_accuracy));
    }
    document.getElementById("recorded_ipa_script").innerText = `/ ${data.ipa_transcript} /`;
    document.getElementById("recordAudio").classList.add('disabled');
    document.getElementById("main_title").innerText = page_title;
    document.getElementById("pronunciation_accuracy").innerText = `${data.pronunciation_accuracy}%`;
    document.getElementById("ipa_script").innerText = data.real_transcripts_ipa;

    // Per-word data, space separated in the response; kept for the single-word helpers.
    lettersOfWordAreCorrect = data.is_letter_correct_all_words.split(" ");
    startTime = data.start_time;
    endTime = data.end_time;
    real_transcripts_ipa = data.real_transcripts_ipa.split(" ");
    matched_transcripts_ipa = data.matched_transcripts_ipa.split(" ");
    wordCategories = data.pair_accuracy_category.split(" ");

    const originalWords = currentText[0].split(" ");
    const coloredWordsContainer = document.getElementById("original_script");
    coloredWordsContainer.textContent = "";
    originalWords.forEach((wordText, wordIdx) => {
      // An empty word (empty reference text) gets no link, only the spacer.
      if (wordText.length > 0) {
        coloredWordsContainer.appendChild(
          buildColoredWordLink(wordText, wordIdx, lettersOfWordAreCorrect[wordIdx])
        );
      }
      const spacer = document.createElement("span");
      spacer.textContent = " ";
      coloredWordsContainer.appendChild(spacer);
    });

    currentSoundRecorded = true;
    unblockUI();
    document.getElementById("playRecordedAudio").classList.remove('disabled');
  } catch (err) {
    console.log("sendAudioToGetAccuracyFromRecordedAudio::err:", err);
    UIError();
  }
}
/**
 * Ask for microphone access and wire up the shared MediaRecorder.
 *
 * On success the stream and recorder are stored in the `stream` and
 * `mediaRecorder` globals; recorded chunks are collected in `audioChunks` and,
 * when the recorder stops, the audio is converted to base64 and sent for scoring.
 * When the browser exposes no `navigator.mediaDevices.getUserMedia`, or the
 * request is rejected, the problem is logged instead of throwing, so the rest
 * of the script keeps working.
 *
 * @returns {Promise<void>|undefined} the pending setup, or undefined when getUserMedia is unavailable.
 */
const startMediaDevice = () => {
  const canCaptureAudio = typeof navigator !== "undefined"
    && Boolean(navigator.mediaDevices)
    && typeof navigator.mediaDevices.getUserMedia === "function";
  if (!canCaptureAudio) {
    // This function runs at load time: throwing here would abort the remaining script.
    console.log("startMediaDevice::error: navigator.mediaDevices.getUserMedia is not available");
    return undefined;
  }

  return navigator.mediaDevices.getUserMedia(mediaStreamConstraints).then(_stream => {
    stream = _stream;
    mediaRecorder = new MediaRecorder(stream);

    mediaRecorder.ondataavailable = event => {
      audioChunks.push(event.data);
    };

    mediaRecorder.onstop = async () => {
      document.getElementById("recordIcon").innerHTML = 'mic';
      blockUI();
      audioBlob = new Blob(audioChunks, { type: 'audio/ogg;' });
      const audioUrl = URL.createObjectURL(audioBlob);
      audioRecorded = new Audio(audioUrl);
      const audioBase64 = await convertBlobToBase64(audioBlob);
      const minimumAllowedLength = 6;
      if (audioBase64.length < minimumAllowedLength) {
        setTimeout(UIRecordingError, 50); // Make sure this function finished after get called again
        return;
      }
      await sendAudioToGetAccuracyFromRecordedAudio(audioBase64);
    };
  }).catch(err => {
    // e.g. the user denied the permission or no microphone is available
    console.log("startMediaDevice::error:", err);
    document.getElementById("main_title").innerText = recordingError;
  });
};
startMediaDevice();
// ################### Audio playback ##################
/**
 * Play the feedback sound matching an accuracy value: the "bad" sound below
 * badScoreThreshold, the "okay" sound below mediumScoreThreshold, the "good"
 * sound otherwise.
 *
 * @param {number} accuracy - pronunciation accuracy to give feedback for.
 */
const playSoundForAnswerAccuracy = async (accuracy) => {
  let selectedSound = soundFileGood;
  if (accuracy < mediumScoreThreshold) {
    selectedSound = accuracy < badScoreThreshold ? soundFileBad : soundFileOkay;
  }
  currentAudioForPlaying = selectedSound;
  playback();
};
/**
 * Speak the current sentence with the speech synthesizer, updating the page
 * title before and after the playback request.
 */
const playAudio = async () => {
  const titleElement = document.getElementById("main_title");
  titleElement.innerText = "Generating sound...";
  const sentence = currentText[0];
  playWithMozillaApi(sentence);
  document.getElementById("main_title").innerText = "Current Sound was played";
};
/**
 * Play the buffer stored in currentAudioForPlaying through the shared audio
 * context, starting at the context's current time.
 */
function playback() {
  const bufferSource = ctx.createBufferSource();
  bufferSource.buffer = currentAudioForPlaying;
  bufferSource.connect(ctx.destination);
  bufferSource.start(ctx.currentTime);
}
/**
 * Play back the user's recording, either completely or only a time segment.
 *
 * Without arguments the whole recording is played and the UI is unblocked when
 * playback ends. With `start` and `end` the recording is played from `start`
 * and paused again after `end - start` seconds. If the recording cannot be
 * played (no recording available, playback rejected) the error is logged and
 * the UI is unblocked with the "unsupported" message.
 *
 * @param {?number} [start=null] - segment start in seconds.
 * @param {?number} [end=null] - segment end in seconds.
 * @returns {Promise<void>}
 */
const playRecording = async (start = null, end = null) => {
  blockUI();
  try {
    // Keep a stable reference: the global may be replaced while playback is running.
    const recording = audioRecorded;
    if (start == null || end == null) {
      // `once` prevents listeners from piling up on repeated playback of the same recording.
      recording.addEventListener("ended", function () {
        recording.currentTime = 0;
        unblockUI();
        document.getElementById("main_title").innerText = "Recorded Sound was played";
      }, { once: true });
      await recording.play();
    }
    else {
      recording.currentTime = start;
      await recording.play();
      const segmentDurationInMs = Math.round((end - start) * 1000);
      setTimeout(function () {
        unblockUI();
        recording.pause();
        recording.currentTime = 0;
        document.getElementById("main_title").innerText = "Recorded Sound was played";
      }, segmentDurationInMs);
    }
  }
  catch (err) {
    console.log("playRecording::err:", err);
    UINotSupported();
  }
};
/**
 * Play one word, alternating between the synthesized ("native") version and
 * the user's recording on every call.
 *
 * @param {number|string} word_idx - index of the word to play.
 */
const playNativeAndRecordedWord = async (word_idx) => {
  const playWord = isNativeSelectedForPlayback ? playCurrentWord : playRecordedWord;
  playWord(word_idx);
  // The next call plays the other version.
  isNativeSelectedForPlayback = !isNativeSelectedForPlayback;
};
/**
 * Stop the running recording and tell the user that the audio is being processed.
 */
const stopRecording = () => {
  isRecording = false;
  mediaRecorder.stop();
  const titleElement = document.getElementById("main_title");
  titleElement.innerText = "Processing audio...";
};
/**
 * Speak a single word of the current sentence with the speech synthesizer,
 * updating the page title before and after the playback request.
 *
 * @param {number|string} word_idx - index of the word within the space-separated sentence.
 */
const playCurrentWord = async (word_idx) => {
  const titleElement = document.getElementById("main_title");
  titleElement.innerText = "Generating word...";
  const sentenceWords = currentText[0].split(' ');
  playWithMozillaApi(sentenceWords[word_idx]);
  document.getElementById("main_title").innerText = "Word was played";
};
// TODO: Check if fallback is correct
/**
 * Speak a text with the browser's speech synthesis.
 *
 * When no voice was found for the current language the "unsupported" state is
 * shown instead. Otherwise the UI is blocked until the utterance ends, a voice
 * is selected through changeLanguage if none is set yet, and the text is
 * spoken at a rate of 0.7.
 *
 * @param {string} text - text to speak.
 */
const playWithMozillaApi = (text) => {
  if (!languageFound) {
    UINotSupported();
    return;
  }

  blockUI();
  if (voice_synth == null) {
    changeLanguage(AILanguage);
  }

  const utterance = new SpeechSynthesisUtterance(text);
  utterance.voice = voice_synth;
  utterance.rate = 0.7;
  utterance.onend = () => {
    unblockUI();
  };
  synth.speak(utterance);
};
/**
 * Play the part of the user's recording that belongs to one word.
 *
 * The word's boundaries are read from the space-separated `startTime` and
 * `endTime` strings and handed to playRecording.
 *
 * @param {number|string} word_idx - index of the word whose segment is played.
 */
const playRecordedWord = (word_idx) => {
  // Declared locally so the boundaries do not leak as implicit globals.
  const wordStartTime = parseFloat(startTime.split(' ')[word_idx]);
  const wordEndTime = parseFloat(endTime.split(' ')[word_idx]);
  playRecording(wordStartTime, wordEndTime);
};
// ############# Utils #####################
/**
 * Convert a blob with blobToBase64.
 *
 * @param {Blob} blob - blob to convert.
 * @returns {Promise<*>} resolves with whatever blobToBase64 resolves with.
 */
const convertBlobToBase64 = async (blob) => {
  const encoded = await blobToBase64(blob);
  return encoded;
};
/**
 * Read a blob as a data URL with a FileReader.
 *
 * @param {Blob} blob - blob to read.
 * @returns {Promise<string>} resolves with the reader's result, rejects with the reader's error event.
 */
const blobToBase64 = (blob) => new Promise((resolve, reject) => {
  const fileReader = new FileReader();
  fileReader.onload = () => {
    resolve(fileReader.result);
  };
  fileReader.onerror = (error) => {
    reject(error);
  };
  fileReader.readAsDataURL(blob);
});
/**
 * Show one word in the single-word IPA view and make it playable.
 *
 * The spoken word goes into "single_word_ipa_current" and plays the recorded
 * segment; the reference word goes into "single_word_ipa_reference_recorded"
 * and plays the synthesized word. The element's href becomes a `javascript:`
 * URL calling the matching playback function with the word index.
 *
 * @param {string} word - text shown in the element.
 * @param {number|string} word_idx - index passed to the playback function.
 * @param {boolean} isSpokenWord - true for the user's spoken word, false for the reference word.
 * @param {string} word_color - CSS colour applied to the element.
 */
const wrapWordForPlayingLink = (word, word_idx, isSpokenWord, word_color) => {
  // for some reason here the function is swapped
  const [playbackFunctionName, elementId] = isSpokenWord
    ? ["playRecordedWord", "single_word_ipa_current"]
    : ["playCurrentWord", "single_word_ipa_reference_recorded"];
  const wordElement = document.getElementById(elementId);
  wordElement.innerText = word;
  wordElement.href = `javascript:${playbackFunctionName}(${word_idx.toString()})`;
  wordElement.removeAttribute("disabled");
  wordElement.style["color"] = word_color;
  wordElement.style["whiteSpace"] = "nowrap";
};
// ########## Function to initialize server ###############
// This is to try to avoid aws lambda cold start
/**
 * Fire a best-effort, empty scoring request to wake the backend up.
 *
 * The response is ignored. Failures — both synchronous ones while building the
 * request and a rejected fetch — are logged and never propagate, since the
 * warm-up is optional.
 *
 * @returns {Promise<void>} always resolves.
 */
const warmUpServer = async () => {
  try {
    await fetch(apiMainPathSTS + '/GetAccuracyFromRecordedAudio', {
      method: "post",
      body: JSON.stringify({ "title": '', "base64Audio": '', "language": AILanguage }),
      headers: { "X-Api-Key": STScoreAPIKey }
    });
  } catch (err) {
    console.log("warmUpServer::err:", err);
  }
};
warmUpServer();
/**
 * Read an uploaded audio file as a data URL.
 *
 * As a side effect the file is also stored in the global `audioRecorded`
 * (as an Audio element on an object URL) so it can be played back later.
 *
 * @param {File|Blob} audioFile - the uploaded audio file.
 * @returns {Promise<string>} resolves with the reader's result, rejects with the reader's error event.
 */
const audioToBase64 = async (audioFile) => {
  return new Promise((resolve, reject) => {
    const fileReader = new FileReader();
    fileReader.onerror = reject;
    fileReader.onload = (loadEvent) => resolve(loadEvent.target.result);
    // custom: set the global variable 'audioRecorded' to play later the uploaded audio
    audioRecorded = new Audio(URL.createObjectURL(audioFile));
    fileReader.readAsDataURL(audioFile);
  });
};
/**
 * Score an uploaded audio file: convert it to a data URL and send it to the
 * scoring endpoint, logging the progress of both steps.
 *
 * @param {File|Blob} audioFile - the uploaded audio file.
 */
const audioUpload = async (audioFile) => {
  console.log("starting uploading the file...");
  const encodedAudio = await audioToBase64(audioFile);

  console.log("file uploaded, starting making the request...");
  await sendAudioToGetAccuracyFromRecordedAudio(encodedAudio);

  console.log("request done!");
};
/**
 * Wake the backend up before its first real use.
 *
 * Captures a backup clone of #single_word_ipa_pair, then sends an empty
 * scoring request until one completes. After more than
 * `maximum_number_of_tries` failed attempts `serverWorking` is set to false;
 * on success `serverIsInitialized` is set to true. Any completed response
 * counts as success — its status and body are not inspected.
 *
 * @returns {Promise<void>}
 */
const initializeServer = async () => {
  document.getElementById("main_title").innerText = 'Initializing server, this may take up to 2 minutes...';
  $(document).ready(function () {
    // required to properly reset the #single_word_ipa_pair element
    SingleWordIpaPairBackup = $("#single_word_ipa_pair").clone();
  });

  const maximum_number_of_tries = 4;
  let number_of_tries = 0;
  let valid_response = false;
  while (!valid_response) {
    if (number_of_tries > maximum_number_of_tries) {
      serverWorking = false;
      break;
    }
    try {
      await fetch(apiMainPathSTS + '/GetAccuracyFromRecordedAudio', {
        method: "post",
        body: JSON.stringify({ "title": '', "base64Audio": '', "language": AILanguage }),
        headers: { "X-Api-Key": STScoreAPIKey }
      });
      // Only reached once the request has actually completed; a failed
      // attempt jumps to the catch block and is retried.
      valid_response = true;
      serverIsInitialized = true;
    }
    catch (e) {
      number_of_tries += 1;
      console.log(`initializeServer::error: ${e}, retry n=${number_of_tries}.`)
    }
  }
};