Michael Hansen committed on
Commit
0c6d0de
·
1 Parent(s): 329d161

First working version

Browse files
Files changed (44) hide show
  1. .gitattributes +2 -0
  2. espeakng.worker.data +3 -0
  3. img/logo.png +0 -0
  4. index.html +115 -24
  5. js/app.js +318 -0
  6. js/espeakng.worker.js +0 -0
  7. js/espeakng.worker.wasm +3 -0
  8. js/piper.js +303 -0
  9. txt/ar.txt +1 -0
  10. txt/ca.txt +1 -0
  11. txt/cs.txt +1 -0
  12. txt/cy.txt +1 -0
  13. txt/da.txt +1 -0
  14. txt/de.txt +1 -0
  15. txt/el.txt +1 -0
  16. txt/en.txt +1 -0
  17. txt/es.txt +1 -0
  18. txt/eu.txt +1 -0
  19. txt/fa.txt +1 -0
  20. txt/fi.txt +1 -0
  21. txt/fr.txt +1 -0
  22. txt/hu.txt +1 -0
  23. txt/is.txt +1 -0
  24. txt/it.txt +1 -0
  25. txt/ka.txt +1 -0
  26. txt/kk.txt +1 -0
  27. txt/lb.txt +1 -0
  28. txt/lv.txt +1 -0
  29. txt/ne.txt +1 -0
  30. txt/nl.txt +1 -0
  31. txt/no.txt +1 -0
  32. txt/pl.txt +1 -0
  33. txt/pt.txt +1 -0
  34. txt/ro.txt +1 -0
  35. txt/ru.txt +1 -0
  36. txt/sk.txt +1 -0
  37. txt/sl.txt +1 -0
  38. txt/sr.txt +1 -0
  39. txt/sv.txt +1 -0
  40. txt/sw.txt +1 -0
  41. txt/tr.txt +1 -0
  42. txt/uk.txt +1 -0
  43. txt/vi.txt +1 -0
  44. txt/zh.txt +1 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ espeakng.worker.data filter=lfs diff=lfs merge=lfs -text
37
+ js/espeakng.worker.wasm filter=lfs diff=lfs merge=lfs -text
espeakng.worker.data ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:07263c3a96e26dcea39e0d7e5001d2121f0f4fccbb8110c2ad9285e527d77e97
3
+ size 24183288
img/logo.png ADDED
index.html CHANGED
@@ -1,25 +1,116 @@
1
- <!doctype html>
2
- <html>
3
- <head>
4
- <meta charset="utf-8" />
5
- <meta name="viewport" content="width=device-width" />
6
- <title>My static Space</title>
7
- <link rel="stylesheet" href="style.css" />
8
- </head>
9
- <body>
10
- <div class="card">
11
- <h1>Welcome to your static Space!</h1>
12
- <p>You can modify this app directly by editing <i>index.html</i> in the Files and versions tab.</p>
13
- <p>
14
- Also don't forget to check the
15
- <a href="https://huggingface.co/docs/hub/spaces" target="_blank">Spaces documentation</a>.
16
- </p>
17
- </div>
18
- <script>
19
- document.addEventListener("DOMContentLoaded", async () => {
20
- let response = await fetch("https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/lessac/medium/en_US-lessac-medium.onnx.json?download=true");
21
- console.log(await response.json());
22
- });
23
- </script>
24
- </body>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  </html>
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <title>Piper Demo</title>
6
+ <style type="text/css">
7
+ body {
8
+ margin: auto;
9
+ max-width: 90%;
10
+ }
11
+
12
+ #textInput {
13
+ width: 100%;
14
+ }
15
+
16
+ #logo {
17
+ margin-left: -30px;
18
+ height: 6em;
19
+ }
20
+
21
+ #sponsored {
22
+ position: absolute;
23
+ right: 75px;
24
+ top: 20px;
25
+ }
26
+
27
+ #buttonSpeak {
28
+ font-size: 1.25em;
29
+ padding: 5px 8px 5px 8px;
30
+ }
31
+
32
+ #divSpeak {
33
+ margin-top: 10px;
34
+ margin-bottom: 20px;
35
+ }
36
+
37
+ #divSpeak > audio {
38
+ vertical-align: bottom;
39
+ margin-left: 10px;
40
+ }
41
+
42
+ #key {
43
+ font-weight: bold;
44
+ margin-left: 10px;
45
+ }
46
+
47
+ #languages {
48
+ margin-bottom: 15px;
49
+ }
50
+
51
+ #status {
52
+ margin-left: 10px;
53
+ }
54
+
55
+ .setting {
56
+ margin-top: 10px;
57
+ }
58
+ </style>
59
+ </head>
60
+ <body>
61
+ <a href="https://github.com/rhasspy/piper" title="Piper TTS">
62
+ <img id="logo" src="img/logo.png" alt="Piper logo">
63
+ </a>
64
+ <a href="https://www.openhomefoundation.org/" title="A library from the Open Home Foundation">
65
+ <img id="sponsored" src="https://www.openhomefoundation.org/badges/ohf-library.png" alt="A library from the Open Home Foundation">
66
+ </a>
67
+
68
+ <br />
69
+
70
+ <select id="languages" onchange="setLanguage()">
71
+ <option value="">Language</option>
72
+ </select>
73
+
74
+ <select id="voice" onchange="setVoiceName()">
75
+ <option value="">Voice</option>
76
+ </select>
77
+
78
+ <select id="quality" onchange="setQuality()">
79
+ <option value="">Quality</option>
80
+ </select>
81
+
82
+ <select id="speaker" onchange="setSpeaker()">
83
+ <option value="">Speaker</option>
84
+ </select>
85
+
86
+ <span id="key"></span>
87
+
88
+ <br />
89
+
90
+ <textarea id="textInput" rows=5 disabled></textarea>
91
+
92
+ <div id="divSpeak">
93
+ <button id="buttonSpeak" disabled>Speak</button>
94
+ <audio id="audioTTS" controls></audio>
95
+ <span id="status">Ready</span>
96
+ </div>
97
+
98
+ <div class="setting">
99
+ <label for="lengthScale">Length Scale:</label>
100
+ <input id="lengthScale" type="number" step="0.1" value="1.0" />
101
+ </div>
102
+
103
+ <div class="setting">
104
+ <label for="noiseScale">Noise Scale:</label>
105
+ <input id="noiseScale" type="number" step="0.1" value="0.667" />
106
+ </div>
107
+
108
+ <div class="setting">
109
+ <label for="noiseWScale">Noise W Scale:</label>
110
+ <input id="noiseWScale" type="number" step="0.1" value="0.8" />
111
+ </div>
112
+
113
+ <script src="https://cdn.jsdelivr.net/npm/onnxruntime-web/dist/ort.min.js"></script>
114
+ <script type="module" src="js/app.js"></script>
115
+ </body>
116
  </html>
js/app.js ADDED
@@ -0,0 +1,318 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { setVoice, textToWavAudio } from "./piper.js";
2
+
3
+ const voiceRoot = "https://huggingface.co/rhasspy/piper-voices/resolve/main";
4
+
5
+ let languageToSelect = null;
6
+ let voiceToSelect = null;
7
+ let qualityToSelect = null;
8
+
9
+ let voices = {};
10
+ const qualitySort = {
11
+ x_low: 0,
12
+ low: 1,
13
+ medium: 2,
14
+ high: 3,
15
+ };
16
+
17
+ let voiceId = "";
18
+ let voiceUrl = "";
19
+ let loadedVoiceId = "";
20
+
21
+ async function main() {
22
+ loadVoices();
23
+
24
+ const buttonSpeak = document.getElementById("buttonSpeak");
25
+ const audioTTS = document.getElementById("audioTTS");
26
+ const textInput = document.getElementById("textInput");
27
+ const status = document.getElementById("status");
28
+ const speakerSelect = document.getElementById("speaker");
29
+ const inputLengthScale = document.getElementById("lengthScale");
30
+ const inputNoiseScale = document.getElementById("noiseScale");
31
+ const inputNoiseWScale = document.getElementById("noiseWScale");
32
+
33
+ buttonSpeak.addEventListener("click", async () => {
34
+ const text = textInput.value;
35
+ if (!text) {
36
+ return;
37
+ }
38
+
39
+ if (!voiceId) {
40
+ console.log("ERROR: No voice id");
41
+ return;
42
+ }
43
+
44
+ if (voiceId != loadedVoiceId) {
45
+ if (!voiceUrl) {
46
+ console.log("ERROR: No voice URL");
47
+ return;
48
+ }
49
+
50
+ status.innerHTML = "Loading voice...";
51
+ await setVoice(
52
+ `${voiceUrl}/${voiceId}.onnx?download=true`,
53
+ `${voiceUrl}/${voiceId}.onnx.json?download=true`,
54
+ );
55
+ loadedVoiceId = voiceId;
56
+ }
57
+
58
+ let speakerId = null;
59
+ if (speakerSelect.selectedIndex > 0) {
60
+ speakerId = parseInt(speakerSelect.value);
61
+ }
62
+
63
+ let lengthScale = parseFloat(inputLengthScale.value);
64
+ if (isNaN(lengthScale)) {
65
+ lengthScale = null;
66
+ }
67
+
68
+ let noiseScale = parseFloat(inputNoiseScale.value);
69
+ if (isNaN(noiseScale)) {
70
+ noiseScale = null;
71
+ }
72
+
73
+ let noiseWScale = parseFloat(inputNoiseWScale.value);
74
+ if (isNaN(noiseWScale)) {
75
+ noiseWScale = null;
76
+ }
77
+
78
+ status.innerHTML = "Synthesizing audio...";
79
+ const wavAudio = await textToWavAudio(
80
+ text,
81
+ speakerId,
82
+ lengthScale,
83
+ noiseScale,
84
+ noiseWScale,
85
+ );
86
+ const audioURL = URL.createObjectURL(wavAudio);
87
+
88
+ audioTTS.src = audioURL;
89
+ audioTTS.play();
90
+
91
+ status.innerHTML = "Ready";
92
+ });
93
+
94
+ textInput.disabled = false;
95
+ buttonSpeak.disabled = false;
96
+
97
+ window.setLanguage = setLanguage;
98
+ window.setVoiceName = setVoiceName;
99
+ window.setQuality = setQuality;
100
+ window.setSpeaker = setSpeaker;
101
+ }
102
+
103
+ document.addEventListener("DOMContentLoaded", () => {
104
+ main();
105
+ });
106
+
107
+ // ----------------------------------------------------------------------------
108
+
109
+ export function setLanguage() {
110
+ var language = document.getElementById("languages").value;
111
+ if (language.length > 0) {
112
+ var voiceSelect = document.getElementById("voice");
113
+ while (voiceSelect.options.length > 1) {
114
+ voiceSelect.remove(voiceSelect.options.length - 1);
115
+ }
116
+
117
+ let names = [];
118
+ for (let key in voices) {
119
+ let voice = voices[key];
120
+ if (voice.language.code == language) {
121
+ names.push(voice.name);
122
+ }
123
+ }
124
+
125
+ names = Array.from(new Set(names)).sort();
126
+ for (let i in names) {
127
+ let name = names[i];
128
+ let option = document.createElement("option");
129
+ option.text = name;
130
+ option.value = name;
131
+ voiceSelect.add(option);
132
+ }
133
+
134
+ if (voiceToSelect) {
135
+ voiceSelect.value = voiceToSelect;
136
+ voiceToSelect = null;
137
+ setVoiceName();
138
+ } else if (voiceSelect.options.length > 1) {
139
+ // Select first voice
140
+ voiceSelect.selectedIndex = 1;
141
+ setVoiceName();
142
+ }
143
+ }
144
+ }
145
+
146
+ function setVoiceName() {
147
+ var language = document.getElementById("languages").value;
148
+ var voiceName = document.getElementById("voice").value;
149
+ if (voiceName.length > 0) {
150
+ var qualitySelect = document.getElementById("quality");
151
+ while (qualitySelect.options.length > 1) {
152
+ qualitySelect.remove(qualitySelect.options.length - 1);
153
+ }
154
+
155
+ let qualities = [];
156
+ for (let key in voices) {
157
+ let voice = voices[key];
158
+ if (voice.language.code == language && voice.name == voiceName) {
159
+ qualities.push(voice.quality);
160
+ }
161
+ }
162
+
163
+ qualities = Array.from(new Set(qualities)).sort(
164
+ (a, b) => qualitySort[a] - qualitySort[b],
165
+ );
166
+ for (let i in qualities) {
167
+ let quality = qualities[i];
168
+ let option = document.createElement("option");
169
+ option.text = quality;
170
+ option.value = quality;
171
+ qualitySelect.add(option);
172
+ }
173
+
174
+ if (qualityToSelect) {
175
+ qualitySelect.value = qualityToSelect;
176
+ qualityToSelect = null;
177
+ setQuality();
178
+ } else if (qualitySelect.options.length > 1) {
179
+ // Select highest quality
180
+ qualitySelect.selectedIndex = qualitySelect.options.length - 1;
181
+ setQuality();
182
+ }
183
+ }
184
+ }
185
+
186
+ function setQuality() {
187
+ var language = document.getElementById("languages").value;
188
+ var voiceName = document.getElementById("voice").value;
189
+ var quality = document.getElementById("quality").value;
190
+ if (quality.length > 0) {
191
+ var speakerSelect = document.getElementById("speaker");
192
+ while (speakerSelect.options.length > 1) {
193
+ speakerSelect.remove(speakerSelect.options.length - 1);
194
+ }
195
+
196
+ var numSpeakers = 1;
197
+ var speakerIdMap = {};
198
+ for (let key in voices) {
199
+ let voice = voices[key];
200
+ if (
201
+ voice.language.code == language &&
202
+ voice.name == voiceName &&
203
+ voice.quality == quality
204
+ ) {
205
+ numSpeakers = voice.num_speakers;
206
+ speakerIdMap = voice.speaker_id_map;
207
+ break;
208
+ }
209
+ }
210
+
211
+ if (numSpeakers <= 1) {
212
+ // Single speaker model
213
+ let option = document.createElement("option");
214
+ option.text = "default";
215
+ option.value = "0";
216
+ speakerSelect.add(option);
217
+ } else {
218
+ // Multi-speaker model
219
+ let sortedSpeakers = Object.keys(speakerIdMap).sort(
220
+ (a, b) => speakerIdMap[a] - speakerIdMap[b],
221
+ );
222
+ for (let i in sortedSpeakers) {
223
+ let speaker = sortedSpeakers[i];
224
+ let option = document.createElement("option");
225
+ option.text = speaker + " (" + i.toString() + ")";
226
+ option.value = i.toString();
227
+ speakerSelect.add(option);
228
+ }
229
+ }
230
+
231
+ if (speakerSelect.options.length > 1) {
232
+ // Select first speaker
233
+ speakerSelect.selectedIndex = 1;
234
+ setSpeaker();
235
+ }
236
+
237
+ voiceId = `${language}-${voiceName}-${quality}`;
238
+ window.location.hash = voiceId;
239
+ }
240
+ }
241
+
242
+ function setSpeaker() {
243
+ var language = document.getElementById("languages").value;
244
+ let languageFamily = language.split("_")[0];
245
+ var voiceName = document.getElementById("voice").value;
246
+ var quality = document.getElementById("quality").value;
247
+ var speaker = document.getElementById("speaker").value;
248
+ if (speaker.length > 0) {
249
+ for (let key in voices) {
250
+ let voice = voices[key];
251
+ if (
252
+ voice.language.code == language &&
253
+ voice.name == voiceName &&
254
+ voice.quality == quality
255
+ ) {
256
+ voiceUrl = `${voiceRoot}/${languageFamily}/${language}/${voiceName}/${quality}`;
257
+
258
+ let aKey = document.getElementById("key");
259
+ aKey.innerHTML = key;
260
+ aKey.href = voiceUrl;
261
+
262
+ fetch(`txt/${languageFamily}.txt`)
263
+ .then((response) => response.text())
264
+ .then((text) => {
265
+ document.getElementById("textInput").innerHTML = text;
266
+ });
267
+ }
268
+ }
269
+ }
270
+ }
271
+
272
+ function loadVoices() {
273
+ let hash = window.location.hash;
274
+ if (hash.length > 0) {
275
+ let voiceIdRegexp = RegExp("^#([^-]+)-([^-]+)-([^-]+)$");
276
+ let match = voiceIdRegexp.exec(hash);
277
+ if (match) {
278
+ languageToSelect = match[1];
279
+ voiceToSelect = match[2];
280
+ qualityToSelect = match[3];
281
+ }
282
+ }
283
+
284
+ fetch(`${voiceRoot}/voices.json?download=true`)
285
+ .then((response) => response.json())
286
+ .then((response_obj) => {
287
+ voices = response_obj;
288
+ let voiceLanguages = [];
289
+ let languageNames = {};
290
+ for (let key in voices) {
291
+ let voice = voices[key];
292
+ voiceLanguages.push(voice.language.code);
293
+ languageNames[voice.language.code] =
294
+ voice.language.name_native +
295
+ " (" +
296
+ voice.language.name_english +
297
+ ", " +
298
+ voice.language.country_english +
299
+ ")";
300
+ }
301
+
302
+ let sortedLanguages = Array.from(new Set(voiceLanguages)).sort();
303
+ let languagesSelect = document.getElementById("languages");
304
+ for (let i in sortedLanguages) {
305
+ let language = sortedLanguages[i];
306
+ let option = document.createElement("option");
307
+ option.text = languageNames[language];
308
+ option.value = language;
309
+ languagesSelect.add(option);
310
+ }
311
+
312
+ if (languageToSelect) {
313
+ languagesSelect.value = languageToSelect;
314
+ languageToSelect = null;
315
+ setLanguage();
316
+ }
317
+ });
318
+ }
js/espeakng.worker.js ADDED
The diff for this file is too large to render. See raw diff
 
js/espeakng.worker.wasm ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb0ea920003d62ed0a97f6d51a0903ff0cb6553d8d5e02eb5dce4c26e055f33f
3
+ size 361999
js/piper.js ADDED
@@ -0,0 +1,303 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Mini Piper implementation in Javascript. */
2
+
3
+ import EspeakModule from "./espeakng.worker.js";
4
+
5
+ const AUDIO_OUTPUT_SYNCHRONOUS = 2;
6
+ const espeakCHARS_AUTO = 0;
7
+
8
+ const CLAUSE_INTONATION_FULL_STOP = 0x00000000;
9
+ const CLAUSE_INTONATION_COMMA = 0x00001000;
10
+ const CLAUSE_INTONATION_QUESTION = 0x00002000;
11
+ const CLAUSE_INTONATION_EXCLAMATION = 0x00003000;
12
+
13
+ const CLAUSE_TYPE_CLAUSE = 0x00040000;
14
+ const CLAUSE_TYPE_SENTENCE = 0x00080000;
15
+
16
+ const CLAUSE_PERIOD = 40 | CLAUSE_INTONATION_FULL_STOP | CLAUSE_TYPE_SENTENCE;
17
+ const CLAUSE_COMMA = 20 | CLAUSE_INTONATION_COMMA | CLAUSE_TYPE_CLAUSE;
18
+ const CLAUSE_QUESTION = 40 | CLAUSE_INTONATION_QUESTION | CLAUSE_TYPE_SENTENCE;
19
+ const CLAUSE_EXCLAMATION =
20
+ 45 | CLAUSE_INTONATION_EXCLAMATION | CLAUSE_TYPE_SENTENCE;
21
+ const CLAUSE_COLON = 30 | CLAUSE_INTONATION_FULL_STOP | CLAUSE_TYPE_CLAUSE;
22
+ const CLAUSE_SEMICOLON = 30 | CLAUSE_INTONATION_COMMA | CLAUSE_TYPE_CLAUSE;
23
+
24
+ const BOS = "^";
25
+ const EOS = "$";
26
+ const PAD = "_";
27
+
28
+ let espeakInstance = null;
29
+ let espeakInitialized = false;
30
+ let voiceModel = null;
31
+ let voiceConfig = null;
32
+
33
+ async function setVoice(voiceModelUrl, voiceConfigUrl = undefined) {
34
+ voiceConfigUrl = voiceConfigUrl ?? `${voiceModelUrl}.json`;
35
+
36
+ const response = await fetch(voiceConfigUrl);
37
+ if (!response.ok) {
38
+ throw new Error(`Error loading voice configuration: ${voiceConfigUrl}`);
39
+ }
40
+ voiceConfig = await response.json();
41
+
42
+ if (voiceConfig.phoneme_type == "espeak") {
43
+ if (!espeakInstance) {
44
+ espeakInstance = await EspeakModule();
45
+ espeakInstance._espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS, 0, 0, 0);
46
+ }
47
+ }
48
+
49
+ voiceModel = await ort.InferenceSession.create(voiceModelUrl);
50
+ }
51
+
52
+ async function textToWavAudio(
53
+ text,
54
+ speakerId = undefined,
55
+ noiseScale = undefined,
56
+ lengthScale = undefined,
57
+ noiseWScale = undefined,
58
+ ) {
59
+ if (!voiceConfig) {
60
+ throw new Error("Voice is not set");
61
+ }
62
+
63
+ const sampleRate = voiceConfig.audio.sample_rate;
64
+ const float32Audio = await textToFloat32Audio(
65
+ text,
66
+ speakerId,
67
+ noiseScale,
68
+ lengthScale,
69
+ noiseWScale,
70
+ );
71
+
72
+ return float32ToWavBlob(float32Audio, sampleRate);
73
+ }
74
+
75
+ async function textToFloat32Audio(
76
+ text,
77
+ speakerId = undefined,
78
+ lengthScale = undefined,
79
+ noiseScale = undefined,
80
+ noiseWScale = undefined,
81
+ ) {
82
+ if (!voiceConfig) {
83
+ throw new Error("Voice is not set");
84
+ }
85
+
86
+ lengthScale = lengthScale ?? voiceConfig.inference.length_scale ?? 1.0;
87
+ noiseScale = noiseScale ?? voiceConfig.inference.noise_scale ?? 0.667;
88
+ noiseWScale = noiseWScale ?? voiceConfig.inference.noise_w ?? 0.8;
89
+
90
+ if (voiceConfig.num_speakers > 1) {
91
+ speakerId = speakerId ?? 0; // first speaker
92
+ }
93
+
94
+ const textPhonemes = textToPhonemes(text);
95
+ const phonemeIds = phonemesToIds(voiceConfig.phoneme_id_map, textPhonemes);
96
+
97
+ // Run onnx model
98
+ const phonemeIdsTensor = new ort.Tensor(
99
+ "int64",
100
+ new BigInt64Array(phonemeIds.map((x) => BigInt(x))),
101
+ [1, phonemeIds.length],
102
+ );
103
+ const phonemeLengthsTensor = new ort.Tensor(
104
+ "int64",
105
+ BigInt64Array.from([BigInt(phonemeIds.length)]),
106
+ [1],
107
+ );
108
+ const scalesTensor = new ort.Tensor(
109
+ "float32",
110
+ Float32Array.from([noiseScale, lengthScale, noiseWScale]),
111
+ [3],
112
+ );
113
+
114
+ let feeds = {
115
+ input: phonemeIdsTensor,
116
+ input_lengths: phonemeLengthsTensor,
117
+ scales: scalesTensor,
118
+ };
119
+
120
+ if (voiceConfig.num_speakers > 1) {
121
+ // Multi-speaker
122
+ feeds["sid"] = new ort.Tensor(
123
+ "int64",
124
+ BigInt64Array.from([BigInt(speakerId)]),
125
+ );
126
+ }
127
+
128
+ const results = await voiceModel.run(feeds);
129
+ const float32Audio = results.output.cpuData;
130
+
131
+ return float32Audio;
132
+ }
133
+
134
+ function textToPhonemes(text) {
135
+ if (!voiceConfig) {
136
+ throw new Error("Voice is not set");
137
+ }
138
+
139
+ if (voiceConfig.phoneme_type == "text") {
140
+ // Text phonemes
141
+ return [Array.from(text.normalize("NFD"))];
142
+ }
143
+
144
+ if (!espeakInstance) {
145
+ throw new Error("espeak-ng is not initialized");
146
+ }
147
+
148
+ const voice = voiceConfig.espeak.voice;
149
+
150
+ // Set voice
151
+ const voicePtr = espeakInstance._malloc(
152
+ espeakInstance.lengthBytesUTF8(voice) + 1,
153
+ );
154
+ espeakInstance.stringToUTF8(
155
+ voice,
156
+ voicePtr,
157
+ espeakInstance.lengthBytesUTF8(voice) + 1,
158
+ );
159
+ espeakInstance._espeak_SetVoiceByName(voicePtr);
160
+ espeakInstance._free(voicePtr);
161
+
162
+ // Prepare text
163
+ const textPtr = espeakInstance._malloc(
164
+ espeakInstance.lengthBytesUTF8(text) + 1,
165
+ );
166
+ espeakInstance.stringToUTF8(
167
+ text,
168
+ textPtr,
169
+ espeakInstance.lengthBytesUTF8(text) + 1,
170
+ );
171
+
172
+ const textPtrPtr = espeakInstance._malloc(4);
173
+ espeakInstance.setValue(textPtrPtr, textPtr, "*");
174
+
175
+ // End of clause and sentences
176
+ const terminatorPtr = espeakInstance._malloc(4);
177
+
178
+ // Phoneme lists for each sentence
179
+ const textPhonemes = [];
180
+
181
+ // Phoneme list for current sentence
182
+ let sentencePhonemes = [];
183
+
184
+ while (true) {
185
+ const phonemesPtr = espeakInstance._espeak_TextToPhonemesWithTerminator(
186
+ textPtrPtr,
187
+ espeakCHARS_AUTO,
188
+ /* IPA */ 0x02,
189
+ terminatorPtr,
190
+ );
191
+ const clausePhonemes = espeakInstance.UTF8ToString(phonemesPtr);
192
+ sentencePhonemes.push(clausePhonemes);
193
+
194
+ const terminator = espeakInstance.getValue(terminatorPtr, "i32");
195
+ const punctuation = terminator & 0x000fffff;
196
+
197
+ // Add punctuation phonemes
198
+ if (punctuation === CLAUSE_PERIOD) {
199
+ sentencePhonemes.push(".");
200
+ } else if (punctuation === CLAUSE_QUESTION) {
201
+ sentencePhonemes.push("?");
202
+ } else if (punctuation === CLAUSE_EXCLAMATION) {
203
+ sentencePhonemes.push("!");
204
+ } else if (punctuation === CLAUSE_COMMA) {
205
+ sentencePhonemes.push(", ");
206
+ } else if (punctuation === CLAUSE_COLON) {
207
+ sentencePhonemes.push(": ");
208
+ } else if (punctuation === CLAUSE_SEMICOLON) {
209
+ sentencePhonemes.push("; ");
210
+ }
211
+
212
+ if ((terminator & CLAUSE_TYPE_SENTENCE) === CLAUSE_TYPE_SENTENCE) {
213
+ // End of sentence
214
+ textPhonemes.push(sentencePhonemes);
215
+ sentencePhonemes = [];
216
+ }
217
+
218
+ const nextTextPtr = espeakInstance.getValue(textPtrPtr, "*");
219
+ if (nextTextPtr === 0) {
220
+ break; // All text processed
221
+ }
222
+
223
+ // Advance text pointer
224
+ espeakInstance.setValue(textPtrPtr, nextTextPtr, "*");
225
+ }
226
+
227
+ // Clean up
228
+ espeakInstance._free(textPtr);
229
+ espeakInstance._free(textPtrPtr);
230
+ espeakInstance._free(terminatorPtr);
231
+
232
+ // Add lingering phonemes
233
+ if (sentencePhonemes.length > 0) {
234
+ textPhonemes.push(sentencePhonemes);
235
+ sentencePhonemes = [];
236
+ }
237
+
238
+ // Prepare phonemes for Piper
239
+ for (let i = 0; i < textPhonemes.length; i++) {
240
+ textPhonemes[i] = Array.from(textPhonemes[i].join("").normalize("NFD"));
241
+ }
242
+
243
+ return textPhonemes;
244
+ }
245
+
246
+ function phonemesToIds(idMap, textPhonemes) {
247
+ let phonemeIds = [];
248
+
249
+ for (let sentencePhonemes of textPhonemes) {
250
+ phonemeIds.push(idMap[BOS]);
251
+ phonemeIds.push(idMap[PAD]);
252
+
253
+ for (let phoneme of sentencePhonemes) {
254
+ if (!(phoneme in idMap)) {
255
+ continue;
256
+ }
257
+
258
+ phonemeIds.push(idMap[phoneme]);
259
+ phonemeIds.push(idMap[PAD]);
260
+ }
261
+
262
+ phonemeIds.push(idMap[EOS]);
263
+ }
264
+
265
+ return phonemeIds;
266
+ }
267
+
268
+ function float32ToWavBlob(floatArray, sampleRate) {
269
+ const int16 = new Int16Array(floatArray.length);
270
+ for (let i = 0; i < floatArray.length; i++) {
271
+ int16[i] = Math.max(-1, Math.min(1, floatArray[i])) * 32767;
272
+ }
273
+
274
+ const buffer = new ArrayBuffer(44 + int16.length * 2);
275
+ const view = new DataView(buffer);
276
+
277
+ const writeStr = (offset, str) => {
278
+ for (let i = 0; i < str.length; i++)
279
+ view.setUint8(offset + i, str.charCodeAt(i));
280
+ };
281
+
282
+ writeStr(0, "RIFF");
283
+ view.setUint32(4, 36 + int16.length * 2, true);
284
+ writeStr(8, "WAVE");
285
+ writeStr(12, "fmt ");
286
+ view.setUint32(16, 16, true);
287
+ view.setUint16(20, 1, true); // PCM
288
+ view.setUint16(22, 1, true); // mono
289
+ view.setUint32(24, sampleRate, true);
290
+ view.setUint32(28, sampleRate * 2, true); // byte rate
291
+ view.setUint16(32, 2, true); // block align
292
+ view.setUint16(34, 16, true); // bits per sample
293
+ writeStr(36, "data");
294
+ view.setUint32(40, int16.length * 2, true);
295
+
296
+ for (let i = 0; i < int16.length; i++) {
297
+ view.setInt16(44 + i * 2, int16[i], true);
298
+ }
299
+
300
+ return new Blob([view], { type: "audio/wav" });
301
+ }
302
+
303
+ export { setVoice, textToWavAudio, textToFloat32Audio };
txt/ar.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ قَوْسُ قُزَحْ، يُسَمَّى كَذَلِكَ: قَوْسُ الْمَطَرِ أَوْ قَوْسُ الْأَلْوَانِ، وَهُوَ ظَاهِرَةٌ طَبِيعِيَّةٌ فِزْيَائِيَّةٌ نَاتِجَةٌ عَنِ انْكِسَارِ وَتَحَلُّلِ ضَوْءِ الشَّمْسِ خِلالَ قَطْرَةِ مَاءِ الْمَطَرِ.
txt/ca.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ L'arc de Sant Martí o arc del cel és un fenomen meteorològic òptic produït per la reflexió, refracció i dispersió de la llum causada per gotes d'aigua en suspensió a la troposfera que resulta en l'aparició al cel de l'espectre de la llum visible, interpretat per l'ull humà com els colors vermell, taronja, groc, verd, blau, indi i violat.
txt/cs.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Duha je fotometeor, projevující se jako skupina soustředných barevných oblouků, které vznikají lomem a vnitřním odrazem slunečního nebo měsíčního světla na vodních kapkách v atmosféře.
txt/cy.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Rhyfeddod neu ffenomenon optegol a meteorolegol yw enfys, pan fydd sbectrwm o olau yn ymddangos yn yr awyr pan fo'r haul yn disgleirio ar ddiferion o leithder yn atmosffer y ddaear.
txt/da.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ En regnbue er et optisk fænomen; en "lyseffekt", som skabes på himlen, når lys fra Solen rammer små vanddråber i luften, f.eks. faldende regn.
txt/de.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Der Regenbogen ist ein atmosphärisch-optisches Phänomen, das als kreisbogenförmiges farbiges Lichtband in einer von der Sonne beschienenen Regenwand oder -wolke wahrgenommen wird.
txt/el.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Οι επιστήμονες μελετούν ακόμη το ουράνιο τόξο.
txt/en.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ A rainbow is a meteorological phenomenon that is caused by reflection, refraction and dispersion of light in water droplets resulting in a spectrum of light appearing in the sky.
txt/es.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Un arcoíris​ o arco iris es un fenómeno óptico y meteorológico que consiste en la aparición en el cielo de un arco de luz multicolor, originado por la descomposición de la luz solar en el espectro visible, la cual se produce por refracción, cuando los rayos del sol atraviesan pequeñas gotas de agua contenidas en la atmósfera terrestre.
txt/eu.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Ostadarra, halaber Erromako zubia edo uztargia, gertaera optiko eta meteorologiko bat da, zeruan, jarraikako argi zerrenda bat eragiten duena, eguzkiaren izpiek Lurreko atmosferan aurkitzen diren hezetasun tanta txikiak zeharkatzen dituztenean.
txt/fa.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ رنگین‌کمان پدیده‌ای نوری و کمانی است که زمانی که خورشید به قطرات نم و رطوبت جو زمین می‌تابد باعث ایجاد طیفی از نور در آسمان می‌شود. این پدیده به شکل یک کمان
txt/fi.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Sateenkaari on spektrin väreissä esiintyvä ilmakehän optinen ilmiö.
txt/fr.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Un arc-en-ciel est un photométéore, un phénomène optique se produisant dans le ciel, visible dans la direction opposée au Soleil quand il brille pendant la pluie.
txt/hu.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ A szivárvány olyan optikai jelenség, melyet eső- vagy páracseppek okoznak, mikor a fény prizmaszerűen megtörik rajtuk és színeire bomlik, kialakul a színképe, más néven spektruma.
txt/is.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Regnbogi (einnig kallaður friðarbogi) er ljósfræðilegt og veðurfræðilegt fyrirbæri sem orsakast þegar litróf birtist á himninum á meðan sólin skín á vætu í andrúmslofti jarðar.
txt/it.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ In fisica dell'atmosfera e meteorologia l'arcobaleno è un fenomeno ottico atmosferico che produce uno spettro quasi continuo di luce nel cielo quando la luce del Sole attraversa le gocce d'acqua rimaste in sospensione dopo un temporale, o presso una cascata o una fontana.
txt/ka.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ ცისარტყელა — ატმოსფერული ოპტიკური და მეტეოროლოგიური მოვლენა, რომელიც ხშირად წვიმის შემდეგ ჩნდება.
txt/kk.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Кемпірқосақ – аспан күмбезінде түрлі түсті доға түрінде көрінетін атмосферадағы оптикалық құбылыс.
txt/lb.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Et freet mech, Iech kennen ze léieren.
txt/lv.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Varavīksne ir optiska parādība atmosfērā, kuru rada Saules staru laušana un atstarošana krītošos lietus pilienos.
txt/ne.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ इन्द्रेणी वा इन्द्रधनुष प्रकाश र रंगबाट उत्पन्न भएको यस्तो घटना हो जसमा रंगीन प्रकाशको एउटा अर्धवृत आकाशमा देखिन्छ। जब सूर्यको प्रकाश पृथ्वीको वायुमण्डलमा भएको पानीको थोपा माथि पर्छ, पानीको थोपाले प्रकाशलाई परावर्तन, आवर्तन र डिस्पर्सन गर्दछ। फलस्वरुप आकाशमा एउटा सप्तरङ्गी अर्धवृताकार प्रकाशीय आकृति उत्पन्न हुन्छ। यो आकृतिलाई नै इन्द्रेणी भनिन्छ। इन्द्रेणी देखिनुको कारण वायुमण्डलमा पानीका कणहरु हुनु नै हो। वर्षा, झरनाबाट उछिट्टिएको पानी, शीत, कुहिरो आदिको इन्द्रेणी देखिने प्रक्रियामा महत्त्वपूर्ण भूमिका हुन्छ। इन्द्रेणीमा सात रंगहरु रातो, सुन्तला, पहेंलो, हरियो, आकाशे निलो, गाढा निलो र बैजनी रंग क्रमैसँग देखिन्छ। यसमा सबैभन्दा माथिल्लो छेउमा रातो रंग र अर्को छेउमा बैजनी रंग देखिन्छ। इन्द्रेणी पूर्ण वृत्ताकार समेत हुन सक्ने भए पनि साधरण अवलोकनकर्ताले जमिन माथि बनेको आधा भाग मात्र देख्न सकिन्छ ।
txt/nl.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Een regenboog is een gekleurde cirkelboog die aan de hemel waargenomen kan worden als de, laagstaande, zon tegen een nevel van waterdruppeltjes aan schijnt en de zon zich achter de waarnemer bevindt.
txt/no.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Regnbuen eller regnbogen er et optisk fenomen som oppstår når solen skinner gjennom regndråper i atmosfæren og betrakteren står med solen i ryggen.
txt/pl.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Tęcza, zjawisko optyczne i meteorologiczne, występujące w postaci charakterystycznego wielobarwnego łuku powstającego w wyniku rozszczepienia światła widzialnego, zwykle promieniowania słonecznego, załamującego się i odbijającego wewnątrz licznych kropli wody mających kształt zbliżony do kulistego.
txt/pt.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Um arco-íris, também popularmente denominado arco-da-velha, é um fenômeno óptico e meteorológico que separa a luz do sol em seu espectro contínuo quando o sol brilha sobre gotículas de água suspensas no ar.
txt/ro.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Curcubeul este un fenomen optic și meteorologic atmosferic care se manifestă prin apariția pe cer a unui spectru de forma unui arc colorat atunci când lumina soarelui se refractă în picăturile de apă din atmosferă.
txt/ru.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Радуга, атмосферное, оптическое и метеорологическое явление, наблюдаемое при освещении ярким источником света множества водяных капель.
txt/sk.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Dúha je optický úkaz vznikajúci v atmosfére Zeme.
txt/sl.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Mavrica je svetlobni pojav v ozračju, ki ga vidimo v obliki loka spektralnih barv.
txt/sr.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Дуга је оптичка и метеоролошка појава који се појављује на небу, када се сунчеви зраци преламају кроз ситне водене капи, најчешће након кише.
txt/sv.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ En regnbåge är ett optiskt, meteorologiskt fenomen som uppträder som ett fullständigt ljusspektrum i form av en båge på himlen då solen lyser på nedfallande regn.
txt/sw.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Upinde wa mvua ni tao la rangi mbalimbali angani ambalo linaweza kuonekana wakati Jua huangaza kupitia matone ya mvua inayoanguka.
txt/tr.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Gökkuşağı, güneş ışınlarının yağmur damlalarında veya sis bulutlarında yansıması ve kırılmasıyla meydana gelen ve ışık tayfı renklerinin bir yay şeklinde göründüğü meteorolojik bir olaydır.
txt/uk.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Весе́лка, також ра́йдуга оптичне явище в атмосфері, що являє собою одну, дві чи декілька різнокольорових дуг, або кіл, якщо дивитися з повітря, що спостерігаються на тлі хмари, якщо вона розташована проти Сонця.
txt/vi.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Cầu vồng hay mống cũng như quang phổ là hiện tượng tán sắc của các ánh sáng từ Mặt Trời khi khúc xạ và phản xạ qua các giọt nước mưa.
txt/zh.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ 彩虹,又稱天弓、天虹、絳等,簡稱虹,是氣象中的一種光學現象,當太陽 光照射到半空中的水滴,光線被折射及反射,在天空上形成拱形的七彩光譜,由外 圈至内圈呈紅、橙、黃、綠、蓝、靛蓝、堇紫七种颜色(霓虹則相反)。