/* Mini Piper implementation in JavaScript. */
import EspeakModule from "./espeakng.worker.js";
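// Note: onnxruntime-web is assumed to be loaded separately (e.g., via a
// <script> tag) and to expose the global `ort`; it is not imported here.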
const AUDIO_OUTPUT_SYNCHRONOUS = 2;
const espeakCHARS_AUTO = 0;
const CLAUSE_INTONATION_FULL_STOP = 0x00000000;
const CLAUSE_INTONATION_COMMA = 0x00001000;
const CLAUSE_INTONATION_QUESTION = 0x00002000;
const CLAUSE_INTONATION_EXCLAMATION = 0x00003000;
const CLAUSE_TYPE_CLAUSE = 0x00040000;
const CLAUSE_TYPE_SENTENCE = 0x00080000;
const CLAUSE_PERIOD = 40 | CLAUSE_INTONATION_FULL_STOP | CLAUSE_TYPE_SENTENCE;
const CLAUSE_COMMA = 20 | CLAUSE_INTONATION_COMMA | CLAUSE_TYPE_CLAUSE;
const CLAUSE_QUESTION = 40 | CLAUSE_INTONATION_QUESTION | CLAUSE_TYPE_SENTENCE;
const CLAUSE_EXCLAMATION =
  45 | CLAUSE_INTONATION_EXCLAMATION | CLAUSE_TYPE_SENTENCE;
const CLAUSE_COLON = 30 | CLAUSE_INTONATION_FULL_STOP | CLAUSE_TYPE_CLAUSE;
const CLAUSE_SEMICOLON = 30 | CLAUSE_INTONATION_COMMA | CLAUSE_TYPE_CLAUSE;
const BOS = "^";
const EOS = "$";
const PAD = "_";
let espeakInstance = null;
let voiceModel = null;
let voiceConfig = null;
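/**
 * Loads a Piper voice: fetches the JSON voice configuration, initializes
 * espeak-ng for phonemization if the voice requires it, and creates the
 * ONNX Runtime inference session for the voice model. If no config URL is
 * given, it defaults to the model URL with ".json" appended.
 */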
async function setVoice(voiceModelUrl, voiceConfigUrl = undefined) {
  voiceConfigUrl = voiceConfigUrl ?? `${voiceModelUrl}.json`;
  const response = await fetch(voiceConfigUrl);
  if (!response.ok) {
    throw new Error(`Error loading voice configuration: ${voiceConfigUrl}`);
  }
  voiceConfig = await response.json();
  if (voiceConfig.phoneme_type === "espeak") {
    if (!espeakInstance) {
      espeakInstance = await EspeakModule();
      espeakInstance._espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS, 0, 0, 0);
    }
  }
  voiceModel = await ort.InferenceSession.create(voiceModelUrl);
}
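/**
 * Synthesizes text to a WAV audio Blob using the currently loaded voice.
 * The optional scale parameters fall back to the voice configuration's
 * inference defaults (see textToFloat32Audio).
 */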
async function textToWavAudio(
  text,
  speakerId = undefined,
  lengthScale = undefined,
  noiseScale = undefined,
  noiseWScale = undefined,
) {
  if (!voiceConfig) {
    throw new Error("Voice is not set");
  }
  const sampleRate = voiceConfig.audio.sample_rate;
  const float32Audio = await textToFloat32Audio(
    text,
    speakerId,
    lengthScale,
    noiseScale,
    noiseWScale,
  );
  return float32ToWavBlob(float32Audio, sampleRate);
}
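/**
 * Synthesizes text to raw Float32Array audio samples. Phonemizes the text,
 * maps phonemes to model input ids, and runs the Piper ONNX model with the
 * [noiseScale, lengthScale, noiseWScale] scales tensor. For multi-speaker
 * voices, speakerId selects the speaker (defaults to 0).
 */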
async function textToFloat32Audio(
  text,
  speakerId = undefined,
  lengthScale = undefined,
  noiseScale = undefined,
  noiseWScale = undefined,
) {
  if (!voiceConfig) {
    throw new Error("Voice is not set");
  }
  lengthScale = lengthScale ?? voiceConfig.inference.length_scale ?? 1.0;
  noiseScale = noiseScale ?? voiceConfig.inference.noise_scale ?? 0.667;
  noiseWScale = noiseWScale ?? voiceConfig.inference.noise_w ?? 0.8;
  if (voiceConfig.num_speakers > 1) {
    speakerId = speakerId ?? 0; // first speaker
  }
  const textPhonemes = textToPhonemes(text);
  const phonemeIds = phonemesToIds(voiceConfig.phoneme_id_map, textPhonemes);
  // Run onnx model
  const phonemeIdsTensor = new ort.Tensor(
    "int64",
    new BigInt64Array(phonemeIds.map((x) => BigInt(x))),
    [1, phonemeIds.length],
  );
  const phonemeLengthsTensor = new ort.Tensor(
    "int64",
    BigInt64Array.from([BigInt(phonemeIds.length)]),
    [1],
  );
  const scalesTensor = new ort.Tensor(
    "float32",
    Float32Array.from([noiseScale, lengthScale, noiseWScale]),
    [3],
  );
  const feeds = {
    input: phonemeIdsTensor,
    input_lengths: phonemeLengthsTensor,
    scales: scalesTensor,
  };
  if (voiceConfig.num_speakers > 1) {
    // Multi-speaker
    feeds["sid"] = new ort.Tensor(
      "int64",
      BigInt64Array.from([BigInt(speakerId)]),
      [1],
    );
  }
  const results = await voiceModel.run(feeds);
  return results.output.data;
}
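/**
 * Converts text to a list of sentences, where each sentence is a list of
 * single-character IPA phonemes. Uses espeak-ng for phonemization unless the
 * voice uses "text" phonemes, in which case the NFD-normalized characters of
 * the text are returned directly as one sentence.
 */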
function textToPhonemes(text) {
  if (!voiceConfig) {
    throw new Error("Voice is not set");
  }
  if (voiceConfig.phoneme_type === "text") {
    // Text phonemes
    return [Array.from(text.normalize("NFD"))];
  }
  if (!espeakInstance) {
    throw new Error("espeak-ng is not initialized");
  }
  const voice = voiceConfig.espeak.voice;
  // Set voice
  const voicePtr = espeakInstance._malloc(
    espeakInstance.lengthBytesUTF8(voice) + 1,
  );
  espeakInstance.stringToUTF8(
    voice,
    voicePtr,
    espeakInstance.lengthBytesUTF8(voice) + 1,
  );
  espeakInstance._espeak_SetVoiceByName(voicePtr);
  espeakInstance._free(voicePtr);
  // Prepare text
  const textPtr = espeakInstance._malloc(
    espeakInstance.lengthBytesUTF8(text) + 1,
  );
  espeakInstance.stringToUTF8(
    text,
    textPtr,
    espeakInstance.lengthBytesUTF8(text) + 1,
  );
  const textPtrPtr = espeakInstance._malloc(4);
  espeakInstance.setValue(textPtrPtr, textPtr, "*");
  // End of clause and sentences
  const terminatorPtr = espeakInstance._malloc(4);
  // Phoneme lists for each sentence
  const textPhonemes = [];
  // Phoneme list for current sentence
  let sentencePhonemes = [];
  while (true) {
    const phonemesPtr = espeakInstance._espeak_TextToPhonemesWithTerminator(
      textPtrPtr,
      espeakCHARS_AUTO,
      /* IPA */ 0x02,
      terminatorPtr,
    );
    const clausePhonemes = espeakInstance.UTF8ToString(phonemesPtr);
    sentencePhonemes.push(clausePhonemes);
    const terminator = espeakInstance.getValue(terminatorPtr, "i32");
    const punctuation = terminator & 0x000fffff;
    // Add punctuation phonemes
    if (punctuation === CLAUSE_PERIOD) {
      sentencePhonemes.push(".");
    } else if (punctuation === CLAUSE_QUESTION) {
      sentencePhonemes.push("?");
    } else if (punctuation === CLAUSE_EXCLAMATION) {
      sentencePhonemes.push("!");
    } else if (punctuation === CLAUSE_COMMA) {
      sentencePhonemes.push(", ");
    } else if (punctuation === CLAUSE_COLON) {
      sentencePhonemes.push(": ");
    } else if (punctuation === CLAUSE_SEMICOLON) {
      sentencePhonemes.push("; ");
    }
    if ((terminator & CLAUSE_TYPE_SENTENCE) === CLAUSE_TYPE_SENTENCE) {
      // End of sentence
      textPhonemes.push(sentencePhonemes);
      sentencePhonemes = [];
    }
    const nextTextPtr = espeakInstance.getValue(textPtrPtr, "*");
    if (nextTextPtr === 0) {
      break; // All text processed
    }
    // Advance text pointer
    espeakInstance.setValue(textPtrPtr, nextTextPtr, "*");
  }
  // Clean up
  espeakInstance._free(textPtr);
  espeakInstance._free(textPtrPtr);
  espeakInstance._free(terminatorPtr);
  // Add lingering phonemes
  if (sentencePhonemes.length > 0) {
    textPhonemes.push(sentencePhonemes);
  }
  // Prepare phonemes for Piper
  for (let i = 0; i < textPhonemes.length; i++) {
    textPhonemes[i] = Array.from(textPhonemes[i].join("").normalize("NFD"));
  }
  return textPhonemes;
}
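/**
 * Flattens sentence phonemes into a single list of model input ids using the
 * voice's phoneme_id_map. Each sentence is wrapped in BOS/EOS markers, ids
 * are interleaved with PAD, and phonemes missing from the map are skipped.
 */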
function phonemesToIds(idMap, textPhonemes) {
  const phonemeIds = [];
  for (const sentencePhonemes of textPhonemes) {
    phonemeIds.push(idMap[BOS]);
    phonemeIds.push(idMap[PAD]);
    for (const phoneme of sentencePhonemes) {
      if (!(phoneme in idMap)) {
        continue;
      }
      phonemeIds.push(idMap[phoneme]);
      phonemeIds.push(idMap[PAD]);
    }
    phonemeIds.push(idMap[EOS]);
  }
  return phonemeIds;
}
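/**
 * Wraps Float32Array samples in a WAV container: clamps samples to [-1, 1],
 * converts them to 16-bit PCM, and prepends a standard 44-byte mono
 * RIFF/WAVE header.
 */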
function float32ToWavBlob(floatArray, sampleRate) {
  const int16 = new Int16Array(floatArray.length);
  for (let i = 0; i < floatArray.length; i++) {
    int16[i] = Math.max(-1, Math.min(1, floatArray[i])) * 32767;
  }
  const buffer = new ArrayBuffer(44 + int16.length * 2);
  const view = new DataView(buffer);
  const writeStr = (offset, str) => {
    for (let i = 0; i < str.length; i++)
      view.setUint8(offset + i, str.charCodeAt(i));
  };
  writeStr(0, "RIFF");
  view.setUint32(4, 36 + int16.length * 2, true);
  writeStr(8, "WAVE");
  writeStr(12, "fmt ");
  view.setUint32(16, 16, true);
  view.setUint16(20, 1, true); // PCM
  view.setUint16(22, 1, true); // mono
  view.setUint32(24, sampleRate, true);
  view.setUint32(28, sampleRate * 2, true); // byte rate
  view.setUint16(32, 2, true); // block align
  view.setUint16(34, 16, true); // bits per sample
  writeStr(36, "data");
  view.setUint32(40, int16.length * 2, true);
  for (let i = 0; i < int16.length; i++) {
    view.setInt16(44 + i * 2, int16[i], true);
  }
  return new Blob([view], { type: "audio/wav" });
}
export { setVoice, textToWavAudio, textToFloat32Audio };
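/*
 * Example usage (a minimal sketch; the model URL is hypothetical and assumes
 * onnxruntime-web and this module are already loaded in the page):
 *
 *   await setVoice("voices/en_US-example-medium.onnx");
 *   const wavBlob = await textToWavAudio("Hello from Piper!");
 *   const audio = new Audio(URL.createObjectURL(wavBlob));
 *   await audio.play();
 */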