/* Mini Piper implementation in JavaScript. */
import EspeakModule from "./espeakng.worker.js";
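// NOTE: onnxruntime-web is not imported here; it is assumed to be loaded
// separately (e.g. via a <script> tag) and exposed as the global `ort`.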
const AUDIO_OUTPUT_SYNCHRONOUS = 2;
const espeakCHARS_AUTO = 0;
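// Clause terminator flags, mirroring espeak-ng's internal CLAUSE_* values:
// the low bits hold the clause's pause length, the 0x0000n000 nibble the
// intonation type, and 0x40000/0x80000 mark clause vs. sentence boundaries.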
const CLAUSE_INTONATION_FULL_STOP = 0x00000000;
const CLAUSE_INTONATION_COMMA = 0x00001000;
const CLAUSE_INTONATION_QUESTION = 0x00002000;
const CLAUSE_INTONATION_EXCLAMATION = 0x00003000;
const CLAUSE_TYPE_CLAUSE = 0x00040000;
const CLAUSE_TYPE_SENTENCE = 0x00080000;
const CLAUSE_PERIOD = 40 | CLAUSE_INTONATION_FULL_STOP | CLAUSE_TYPE_SENTENCE;
const CLAUSE_COMMA = 20 | CLAUSE_INTONATION_COMMA | CLAUSE_TYPE_CLAUSE;
const CLAUSE_QUESTION = 40 | CLAUSE_INTONATION_QUESTION | CLAUSE_TYPE_SENTENCE;
const CLAUSE_EXCLAMATION =
45 | CLAUSE_INTONATION_EXCLAMATION | CLAUSE_TYPE_SENTENCE;
const CLAUSE_COLON = 30 | CLAUSE_INTONATION_FULL_STOP | CLAUSE_TYPE_CLAUSE;
const CLAUSE_SEMICOLON = 30 | CLAUSE_INTONATION_COMMA | CLAUSE_TYPE_CLAUSE;
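// Special phonemes used when building the model's input ID sequence
// (see phonemesToIds below).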
const BOS = "^";
const EOS = "$";
const PAD = "_";
// Module state: the espeak-ng WASM instance and the active Piper voice.
let espeakInstance = null;
let voiceModel = null;
let voiceConfig = null;
async function setVoice(voiceModelUrl, voiceConfigUrl = undefined) {
voiceConfigUrl = voiceConfigUrl ?? `${voiceModelUrl}.json`;
const response = await fetch(voiceConfigUrl);
if (!response.ok) {
    throw new Error(`Error loading voice configuration: ${voiceConfigUrl}`);
}
voiceConfig = await response.json();
  if (voiceConfig.phoneme_type === "espeak") {
if (!espeakInstance) {
espeakInstance = await EspeakModule();
espeakInstance._espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS, 0, 0, 0);
}
}
voiceModel = await ort.InferenceSession.create(voiceModelUrl);
}
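// Synthesize `text` and return a playable WAV Blob. Optional parameters
// override the voice's configured inference settings.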
async function textToWavAudio(
text,
speakerId = undefined,
noiseScale = undefined,
lengthScale = undefined,
noiseWScale = undefined,
) {
if (!voiceConfig) {
throw new Error("Voice is not set");
}
const sampleRate = voiceConfig.audio.sample_rate;
const float32Audio = await textToFloat32Audio(
text,
speakerId,
noiseScale,
lengthScale,
noiseWScale,
);
return float32ToWavBlob(float32Audio, sampleRate);
}
async function textToFloat32Audio(
text,
speakerId = undefined,
  noiseScale = undefined,
  lengthScale = undefined,
noiseWScale = undefined,
) {
if (!voiceConfig) {
throw new Error("Voice is not set");
}
lengthScale = lengthScale ?? voiceConfig.inference.length_scale ?? 1.0;
noiseScale = noiseScale ?? voiceConfig.inference.noise_scale ?? 0.667;
noiseWScale = noiseWScale ?? voiceConfig.inference.noise_w ?? 0.8;
if (voiceConfig.num_speakers > 1) {
speakerId = speakerId ?? 0; // first speaker
}
const textPhonemes = textToPhonemes(text);
const phonemeIds = phonemesToIds(voiceConfig.phoneme_id_map, textPhonemes);
  // Run the ONNX model: Piper's VITS graph takes phoneme IDs, their length,
  // the three inference scales, and a speaker ID for multi-speaker voices.
const phonemeIdsTensor = new ort.Tensor(
"int64",
new BigInt64Array(phonemeIds.map((x) => BigInt(x))),
[1, phonemeIds.length],
);
const phonemeLengthsTensor = new ort.Tensor(
"int64",
BigInt64Array.from([BigInt(phonemeIds.length)]),
[1],
);
const scalesTensor = new ort.Tensor(
"float32",
Float32Array.from([noiseScale, lengthScale, noiseWScale]),
[3],
);
let feeds = {
input: phonemeIdsTensor,
input_lengths: phonemeLengthsTensor,
scales: scalesTensor,
};
if (voiceConfig.num_speakers > 1) {
// Multi-speaker
feeds["sid"] = new ort.Tensor(
"int64",
BigInt64Array.from([BigInt(speakerId)]),
);
}
const results = await voiceModel.run(feeds);
  const float32Audio = results.output.data; // Float32Array of samples
return float32Audio;
}
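// Phonemize text into a list of sentences, each a list of phoneme strings.
// Uses espeak-ng for "espeak" voices; "text" voices use raw characters.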
function textToPhonemes(text) {
if (!voiceConfig) {
throw new Error("Voice is not set");
}
  if (voiceConfig.phoneme_type === "text") {
// Text phonemes
return [Array.from(text.normalize("NFD"))];
}
if (!espeakInstance) {
throw new Error("espeak-ng is not initialized");
}
const voice = voiceConfig.espeak.voice;
// Set voice
const voicePtr = espeakInstance._malloc(
espeakInstance.lengthBytesUTF8(voice) + 1,
);
espeakInstance.stringToUTF8(
voice,
voicePtr,
espeakInstance.lengthBytesUTF8(voice) + 1,
);
espeakInstance._espeak_SetVoiceByName(voicePtr);
espeakInstance._free(voicePtr);
// Prepare text
const textPtr = espeakInstance._malloc(
espeakInstance.lengthBytesUTF8(text) + 1,
);
espeakInstance.stringToUTF8(
text,
textPtr,
espeakInstance.lengthBytesUTF8(text) + 1,
);
const textPtrPtr = espeakInstance._malloc(4);
espeakInstance.setValue(textPtrPtr, textPtr, "*");
// End of clause and sentences
const terminatorPtr = espeakInstance._malloc(4);
// Phoneme lists for each sentence
const textPhonemes = [];
// Phoneme list for current sentence
let sentencePhonemes = [];
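  // Each call to espeak_TextToPhonemesWithTerminator consumes one clause and
  // advances the text pointer past it; loop until all text is processed.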
while (true) {
const phonemesPtr = espeakInstance._espeak_TextToPhonemesWithTerminator(
textPtrPtr,
espeakCHARS_AUTO,
/* IPA */ 0x02,
terminatorPtr,
);
const clausePhonemes = espeakInstance.UTF8ToString(phonemesPtr);
sentencePhonemes.push(clausePhonemes);
const terminator = espeakInstance.getValue(terminatorPtr, "i32");
const punctuation = terminator & 0x000fffff;
// Add punctuation phonemes
if (punctuation === CLAUSE_PERIOD) {
sentencePhonemes.push(".");
} else if (punctuation === CLAUSE_QUESTION) {
sentencePhonemes.push("?");
} else if (punctuation === CLAUSE_EXCLAMATION) {
sentencePhonemes.push("!");
} else if (punctuation === CLAUSE_COMMA) {
sentencePhonemes.push(", ");
} else if (punctuation === CLAUSE_COLON) {
sentencePhonemes.push(": ");
} else if (punctuation === CLAUSE_SEMICOLON) {
sentencePhonemes.push("; ");
}
if ((terminator & CLAUSE_TYPE_SENTENCE) === CLAUSE_TYPE_SENTENCE) {
// End of sentence
textPhonemes.push(sentencePhonemes);
sentencePhonemes = [];
}
const nextTextPtr = espeakInstance.getValue(textPtrPtr, "*");
if (nextTextPtr === 0) {
break; // All text processed
}
// Advance text pointer
espeakInstance.setValue(textPtrPtr, nextTextPtr, "*");
}
// Clean up
espeakInstance._free(textPtr);
espeakInstance._free(textPtrPtr);
espeakInstance._free(terminatorPtr);
// Add lingering phonemes
if (sentencePhonemes.length > 0) {
textPhonemes.push(sentencePhonemes);
sentencePhonemes = [];
}
// Prepare phonemes for Piper
for (let i = 0; i < textPhonemes.length; i++) {
textPhonemes[i] = Array.from(textPhonemes[i].join("").normalize("NFD"));
}
return textPhonemes;
}
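// Map phoneme strings to model IDs: each sentence is wrapped in BOS/EOS and
// every phoneme is interleaved with PAD (e.g. "ab" becomes ^ _ a _ b _ $).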
function phonemesToIds(idMap, textPhonemes) {
let phonemeIds = [];
for (let sentencePhonemes of textPhonemes) {
phonemeIds.push(idMap[BOS]);
phonemeIds.push(idMap[PAD]);
for (let phoneme of sentencePhonemes) {
      if (!(phoneme in idMap)) {
        continue; // skip phonemes missing from the voice's id map
}
phonemeIds.push(idMap[phoneme]);
phonemeIds.push(idMap[PAD]);
}
phonemeIds.push(idMap[EOS]);
}
return phonemeIds;
}
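// Encode mono float32 samples as a 16-bit PCM WAV Blob (44-byte RIFF header).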
function float32ToWavBlob(floatArray, sampleRate) {
const int16 = new Int16Array(floatArray.length);
for (let i = 0; i < floatArray.length; i++) {
int16[i] = Math.max(-1, Math.min(1, floatArray[i])) * 32767;
}
const buffer = new ArrayBuffer(44 + int16.length * 2);
const view = new DataView(buffer);
const writeStr = (offset, str) => {
for (let i = 0; i < str.length; i++)
view.setUint8(offset + i, str.charCodeAt(i));
};
writeStr(0, "RIFF");
view.setUint32(4, 36 + int16.length * 2, true);
writeStr(8, "WAVE");
writeStr(12, "fmt ");
view.setUint32(16, 16, true);
view.setUint16(20, 1, true); // PCM
view.setUint16(22, 1, true); // mono
view.setUint32(24, sampleRate, true);
view.setUint32(28, sampleRate * 2, true); // byte rate
view.setUint16(32, 2, true); // block align
view.setUint16(34, 16, true); // bits per sample
writeStr(36, "data");
view.setUint32(40, int16.length * 2, true);
for (let i = 0; i < int16.length; i++) {
view.setInt16(44 + i * 2, int16[i], true);
}
return new Blob([view], { type: "audio/wav" });
}
export { setVoice, textToWavAudio, textToFloat32Audio };
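
/*
 * Example usage (a sketch: the voice URL below is hypothetical, and
 * onnxruntime-web must already be loaded as the global `ort`):
 *
 *   import { setVoice, textToWavAudio } from "./piper.js";
 *
 *   await setVoice("voices/en_US-example-medium.onnx");
 *   const wavBlob = await textToWavAudio("Hello from Piper!");
 *   await new Audio(URL.createObjectURL(wavBlob)).play();
 */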