Spaces:

rhasspy
/

piper-demo

Running

File size: 8,177 Bytes

0c6d0de

/* Mini Piper implementation in JavaScript. */

import EspeakModule from "./espeakng.worker.js";

// Arguments handed to the espeak-ng WASM entry points used in this file:
// the first is the output mode given to _espeak_Initialize, the second the
// text-encoding flag given to _espeak_TextToPhonemesWithTerminator.
const AUDIO_OUTPUT_SYNCHRONOUS = 2;
const espeakCHARS_AUTO = 0;

// Intonation field of a clause terminator (bits 12-13).
const CLAUSE_INTONATION_FULL_STOP = 0 << 12;
const CLAUSE_INTONATION_COMMA = 1 << 12;
const CLAUSE_INTONATION_QUESTION = 2 << 12;
const CLAUSE_INTONATION_EXCLAMATION = 3 << 12;

// Clause-type flags of a clause terminator.
const CLAUSE_TYPE_CLAUSE = 1 << 18;
const CLAUSE_TYPE_SENTENCE = 1 << 19;

// Complete terminator codes compared against the masked value reported by
// espeak-ng. NOTE(review): the small literal in each code (20/30/40/45) is
// presumably the pause field from espeak-ng's headers — confirm upstream.
const CLAUSE_PERIOD = CLAUSE_TYPE_SENTENCE | CLAUSE_INTONATION_FULL_STOP | 40;
const CLAUSE_COMMA = CLAUSE_TYPE_CLAUSE | CLAUSE_INTONATION_COMMA | 20;
const CLAUSE_QUESTION = CLAUSE_TYPE_SENTENCE | CLAUSE_INTONATION_QUESTION | 40;
const CLAUSE_EXCLAMATION =
  CLAUSE_TYPE_SENTENCE | CLAUSE_INTONATION_EXCLAMATION | 45;
const CLAUSE_COLON = CLAUSE_TYPE_CLAUSE | CLAUSE_INTONATION_FULL_STOP | 30;
const CLAUSE_SEMICOLON = CLAUSE_TYPE_CLAUSE | CLAUSE_INTONATION_COMMA | 30;

// Keys looked up in the voice's phoneme id map to frame each sentence.
const BOS = "^"; // beginning of sentence
const EOS = "$"; // end of sentence
const PAD = "_"; // padding inserted between phonemes

// Module-level state populated by setVoice().
let espeakInstance = null; // espeak-ng WASM module, created on first use
let espeakInitialized = false; // not read by the code visible in this file
let voiceModel = null; // ONNX inference session of the current voice
let voiceConfig = null; // parsed JSON configuration of the current voice

/**
 * Select the voice used by the synthesis functions of this module.
 *
 * Fetches the voice's JSON configuration, creates and initializes the
 * espeak-ng module the first time a voice with `phoneme_type === "espeak"`
 * is requested, and creates the ONNX inference session for the model.
 *
 * The module-level `voiceConfig` / `voiceModel` pair is only replaced after
 * every step has succeeded, so a failed call never leaves a new configuration
 * paired with the previous model.
 *
 * @param {string} voiceModelUrl - URL of the ONNX voice model.
 * @param {string} [voiceConfigUrl] - URL of the voice's JSON configuration.
 *   Defaults to `voiceModelUrl` with `.json` appended.
 * @returns {Promise<void>}
 * @throws {Error} If the configuration request does not return an OK status.
 *   Errors from `fetch`, JSON parsing, espeak-ng start-up and model loading
 *   propagate unchanged.
 */
async function setVoice(voiceModelUrl, voiceConfigUrl = undefined) {
  const configUrl = voiceConfigUrl ?? `${voiceModelUrl}.json`;

  const response = await fetch(configUrl);
  if (!response.ok) {
    throw new Error(`Error loading voice configuration: ${configUrl}`);
  }
  const loadedConfig = await response.json();

  // espeak-ng is only needed for espeak-phonemized voices and is shared by
  // all of them, so it is created at most once.
  if (loadedConfig.phoneme_type === "espeak" && !espeakInstance) {
    const instance = await EspeakModule();
    instance._espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS, 0, 0, 0);
    espeakInstance = instance;
  }

  const loadedModel = await ort.InferenceSession.create(voiceModelUrl);

  // Commit both halves of the voice together.
  voiceConfig = loadedConfig;
  voiceModel = loadedModel;
}

/**
 * Synthesize `text` with the current voice and wrap the audio in a WAV blob.
 *
 * @param {string} text - Text to speak.
 * @param {number} [speakerId] - Speaker id, forwarded to textToFloat32Audio.
 * @param {number} [noiseScale] - Noise scale, forwarded to textToFloat32Audio.
 * @param {number} [lengthScale] - Length scale, forwarded to
 *   textToFloat32Audio.
 * @param {number} [noiseWScale] - Noise-width scale, forwarded to
 *   textToFloat32Audio.
 * @returns {Promise<Blob>} Blob produced by float32ToWavBlob at the sample
 *   rate given by `voiceConfig.audio.sample_rate`.
 * @throws {Error} If no voice has been set.
 */
async function textToWavAudio(
  text,
  speakerId = undefined,
  noiseScale = undefined,
  lengthScale = undefined,
  noiseWScale = undefined,
) {
  if (!voiceConfig) {
    throw new Error("Voice is not set");
  }

  const sampleRate = voiceConfig.audio.sample_rate;

  // textToFloat32Audio takes lengthScale BEFORE noiseScale, the opposite of
  // this function's parameter order, so the two are passed by name here
  // rather than in declaration order.
  const float32Audio = await textToFloat32Audio(
    text,
    speakerId,
    lengthScale,
    noiseScale,
    noiseWScale,
  );

  return float32ToWavBlob(float32Audio, sampleRate);
}

/**
 * Synthesize `text` with the current voice and return the model's raw output.
 *
 * The text is phonemized with textToPhonemes, mapped to ids with
 * phonemesToIds, and fed to the ONNX session as `input`, `input_lengths` and
 * `scales` (plus `sid` for voices with more than one speaker).
 *
 * @param {string} text - Text to speak.
 * @param {number} [speakerId] - Speaker id; only used when
 *   `voiceConfig.num_speakers > 1`, where it defaults to 0 (first speaker).
 * @param {number} [lengthScale] - Defaults to
 *   `voiceConfig.inference.length_scale`, then 1.0.
 * @param {number} [noiseScale] - Defaults to
 *   `voiceConfig.inference.noise_scale`, then 0.667.
 * @param {number} [noiseWScale] - Defaults to
 *   `voiceConfig.inference.noise_w`, then 0.8.
 * @returns {Promise<Float32Array>} Data of the model's `output` tensor
 *   (expected to be float32 audio samples; the type is not checked here).
 * @throws {Error} If no voice (configuration and model) has been set.
 */
async function textToFloat32Audio(
  text,
  speakerId = undefined,
  lengthScale = undefined,
  noiseScale = undefined,
  noiseWScale = undefined,
) {
  if (!voiceConfig || !voiceModel) {
    throw new Error("Voice is not set");
  }

  // A configuration without an "inference" section falls back to the
  // built-in defaults instead of throwing.
  const inference = voiceConfig.inference ?? {};
  const effectiveLengthScale = lengthScale ?? inference.length_scale ?? 1.0;
  const effectiveNoiseScale = noiseScale ?? inference.noise_scale ?? 0.667;
  const effectiveNoiseWScale = noiseWScale ?? inference.noise_w ?? 0.8;

  const isMultiSpeaker = voiceConfig.num_speakers > 1;

  const textPhonemes = textToPhonemes(text);
  const phonemeIds = phonemesToIds(voiceConfig.phoneme_id_map, textPhonemes);

  // Model inputs. Note the order inside `scales`: noise, length, noise-width.
  const feeds = {
    input: new ort.Tensor(
      "int64",
      BigInt64Array.from(phonemeIds, (id) => BigInt(id)),
      [1, phonemeIds.length],
    ),
    input_lengths: new ort.Tensor(
      "int64",
      BigInt64Array.from([BigInt(phonemeIds.length)]),
      [1],
    ),
    scales: new ort.Tensor(
      "float32",
      Float32Array.from([
        effectiveNoiseScale,
        effectiveLengthScale,
        effectiveNoiseWScale,
      ]),
      [3],
    ),
  };

  if (isMultiSpeaker) {
    feeds.sid = new ort.Tensor(
      "int64",
      BigInt64Array.from([BigInt(speakerId ?? 0)]), // 0 = first speaker
    );
  }

  const results = await voiceModel.run(feeds);

  // `data` is the documented accessor for a tensor's contents.
  return results.output.data;
}

/**
 * Convert text into per-sentence phoneme lists for the current voice.
 *
 * For voices with `phoneme_type === "text"` the NFD-normalized characters of
 * the text form a single sentence. Otherwise espeak-ng phonemizes the text
 * clause by clause (IPA output); clauses are collected until a sentence
 * terminator is reported, a punctuation symbol is appended for the
 * terminators known to this module, and each finished sentence is
 * NFD-normalized and split into code points.
 *
 * All WASM heap memory allocated here is released even when phonemization
 * throws.
 *
 * @param {string} text - Text to phonemize.
 * @returns {string[][]} One array of phoneme code points per sentence.
 * @throws {Error} If no voice is set, if espeak-ng is required but not
 *   initialized, or if the WASM heap allocation fails.
 */
function textToPhonemes(text) {
  if (!voiceConfig) {
    throw new Error("Voice is not set");
  }

  if (voiceConfig.phoneme_type === "text") {
    // Text phonemes
    return [Array.from(text.normalize("NFD"))];
  }

  if (!espeakInstance) {
    throw new Error("espeak-ng is not initialized");
  }

  const espeak = espeakInstance;

  // Every pointer obtained from _malloc, so that the finally block can free
  // them all regardless of where an error occurs.
  const allocations = [];

  const allocate = (byteCount) => {
    const ptr = espeak._malloc(byteCount);
    if (ptr === 0) {
      throw new Error("espeak-ng: out of memory");
    }
    allocations.push(ptr);
    return ptr;
  };

  // Copy a JS string onto the WASM heap as NUL-terminated UTF-8.
  const allocateString = (value) => {
    const byteCount = espeak.lengthBytesUTF8(value) + 1;
    const ptr = allocate(byteCount);
    espeak.stringToUTF8(value, ptr, byteCount);
    return ptr;
  };

  // Symbol appended after a clause for each recognized terminator.
  const punctuationPhonemes = new Map([
    [CLAUSE_PERIOD, "."],
    [CLAUSE_QUESTION, "?"],
    [CLAUSE_EXCLAMATION, "!"],
    [CLAUSE_COMMA, ", "],
    [CLAUSE_COLON, ": "],
    [CLAUSE_SEMICOLON, "; "],
  ]);

  // Clause strings of each finished sentence
  const sentences = [];

  // Clause strings of the sentence currently being collected
  let sentencePhonemes = [];

  try {
    // Set voice
    espeak._espeak_SetVoiceByName(allocateString(voiceConfig.espeak.voice));

    // espeak-ng receives a pointer to the text pointer and overwrites it
    // after every clause (0 once all text is consumed). The original text
    // pointer therefore stays in `allocations` for freeing.
    const textPtr = allocateString(text);
    const textPtrPtr = allocate(4);
    espeak.setValue(textPtrPtr, textPtr, "*");

    // Receives the terminator code of each clause
    const terminatorPtr = allocate(4);

    while (true) {
      const phonemesPtr = espeak._espeak_TextToPhonemesWithTerminator(
        textPtrPtr,
        espeakCHARS_AUTO,
        /* IPA */ 0x02,
        terminatorPtr,
      );
      sentencePhonemes.push(espeak.UTF8ToString(phonemesPtr));

      const terminator = espeak.getValue(terminatorPtr, "i32");

      // Only the low 20 bits are compared against the CLAUSE_* codes.
      const symbol = punctuationPhonemes.get(terminator & 0x000fffff);
      if (symbol !== undefined) {
        sentencePhonemes.push(symbol);
      }

      if ((terminator & CLAUSE_TYPE_SENTENCE) === CLAUSE_TYPE_SENTENCE) {
        // End of sentence
        sentences.push(sentencePhonemes);
        sentencePhonemes = [];
      }

      if (espeak.getValue(textPtrPtr, "*") === 0) {
        break; // All text processed
      }
    }
  } finally {
    for (const ptr of allocations) {
      espeak._free(ptr);
    }
  }

  // Clauses after the last sentence terminator form a final sentence
  if (sentencePhonemes.length > 0) {
    sentences.push(sentencePhonemes);
  }

  // Prepare phonemes for Piper: one array of NFD code points per sentence
  return sentences.map((clauses) =>
    Array.from(clauses.join("").normalize("NFD")),
  );
}

/**
 * Map per-sentence phoneme lists to one flat sequence of phoneme ids.
 *
 * Each sentence is encoded as BOS, PAD, then every known phoneme followed by
 * PAD, then EOS; sentences are concatenated. Phonemes that are not keys of
 * `idMap` are skipped.
 *
 * A map value may be a single id or a list of ids; lists are flattened into
 * the output so that every element of the result is a plain id.
 *
 * @param {Object<string, number|number[]>} idMap - Phoneme-to-id(s) map; must
 *   contain the BOS, PAD and EOS symbols.
 * @param {string[][]} textPhonemes - Phoneme list of each sentence.
 * @returns {number[]} Flat id sequence.
 * @throws {Error} If `idMap` lacks the BOS, PAD or EOS symbol.
 */
function phonemesToIds(idMap, textPhonemes) {
  // Ids of `symbol` as an array, or undefined when the map has no entry.
  const lookup = (symbol) => {
    if (!Object.prototype.hasOwnProperty.call(idMap, symbol)) {
      return undefined;
    }
    const ids = idMap[symbol];
    return Array.isArray(ids) ? ids : [ids];
  };

  const required = (symbol) => {
    const ids = lookup(symbol);
    if (ids === undefined) {
      throw new Error(`Phoneme id map is missing required symbol: ${symbol}`);
    }
    return ids;
  };

  const bosIds = required(BOS);
  const padIds = required(PAD);
  const eosIds = required(EOS);

  const phonemeIds = [];

  for (const sentencePhonemes of textPhonemes) {
    phonemeIds.push(...bosIds, ...padIds);

    for (const phoneme of sentencePhonemes) {
      const ids = lookup(phoneme);
      if (ids === undefined) {
        continue; // unknown phoneme: skipped
      }
      phonemeIds.push(...ids, ...padIds);
    }

    phonemeIds.push(...eosIds);
  }

  return phonemeIds;
}

/**
 * Encode float samples as a 16-bit PCM, mono, little-endian WAV blob.
 *
 * Samples are clamped to [-1, 1] and scaled by 32767 before being stored as
 * 16-bit integers behind a 44-byte RIFF/WAVE header.
 *
 * @param {ArrayLike<number>} floatArray - Audio samples.
 * @param {number} sampleRate - Sample rate written into the header.
 * @returns {Blob} Blob of type "audio/wav".
 */
function float32ToWavBlob(floatArray, sampleRate) {
  const HEADER_BYTES = 44;
  const BYTES_PER_SAMPLE = 2;

  // Clamp, scale and convert to 16-bit integers in one pass.
  const pcm = Int16Array.from(
    floatArray,
    (sample) => Math.min(1, Math.max(-1, sample)) * 32767,
  );
  const dataBytes = pcm.length * BYTES_PER_SAMPLE;

  const wav = new DataView(new ArrayBuffer(HEADER_BYTES + dataBytes));

  // Write a four-character chunk tag one byte at a time.
  const putTag = (at, tag) => {
    [...tag].forEach((ch, k) => {
      wav.setUint8(at + k, ch.charCodeAt(0));
    });
  };
  const putU32 = (at, value) => wav.setUint32(at, value, true);
  const putU16 = (at, value) => wav.setUint16(at, value, true);

  // RIFF container
  putTag(0, "RIFF");
  putU32(4, 36 + dataBytes);
  putTag(8, "WAVE");

  // Format chunk
  putTag(12, "fmt ");
  putU32(16, 16);
  putU16(20, 1); // PCM
  putU16(22, 1); // mono
  putU32(24, sampleRate);
  putU32(28, sampleRate * 2); // byte rate
  putU16(32, 2); // block align
  putU16(34, 16); // bits per sample

  // Data chunk
  putTag(36, "data");
  putU32(40, dataBytes);
  pcm.forEach((sample, k) => {
    wav.setInt16(HEADER_BYTES + k * BYTES_PER_SAMPLE, sample, true);
  });

  return new Blob([wav], { type: "audio/wav" });
}

export { setVoice, textToWavAudio, textToFloat32Audio };