UnityGiles committed on
Commit
bb35590
·
1 Parent(s): 148e5e2

updated to inference engine

Browse files
README.md CHANGED
@@ -1,25 +1,28 @@
1
  ---
2
  license: cc-by-4.0
3
  library_name: unity-sentis
 
 
4
  ---
5
 
6
- # Jets Text-to-Speech Model validated for Sentis 2.1.2 in Unity 6
7
 
8
- This is a text to speech model called [Jets](https://huggingface.co/imdanboy/jets). It takes in a text string which you convert to phonemes using a dictionary and then outputs a wav to play the voice.
9
 
10
  ## How to Use
11
- * Create a new scene in Unity 6
12
- * Install `com.unity.sentis` version `2.1.2` package
13
- * Put the c# script on the Main Camera
14
- * Put the `jets-text-to-speech.sentis` file and the `phoneme_dict.txt` file in the `Assets/StreamingAssets` folder
15
- * Add an AudioSource component on the Main Camera
16
- * Set the `inputText` string for what you want it to say
17
- * Press play
18
-
19
- ## Information
20
- This version uses a phoneme dictionary to convert the text into a string of phonemes. There are other ways to do this, for example using another model, or heuristics.
21
-
22
- Since we are using a simple dictionary it has no way of distinguishing heteronyms (two words with the same spelling but different pronunciation).
 
23
 
24
  ## License
25
  Attribution for the original creators is required. See [Jets](https://huggingface.co/imdanboy/jets) for more details.
 
1
  ---
2
  license: cc-by-4.0
3
  library_name: unity-sentis
4
+ tags:
5
+ - unity-inference-engine
6
  ---
7
 
8
+ # Jets in Unity 6 using Inference Engine
9
 
10
+ This is the [Jets](https://huggingface.co/imdanboy/jets) model running in Unity 6 with Inference Engine. It is a text-to-speech model that takes phonemes as an input and outputs wav data of a voice speaking the text.
11
 
12
  ## How to Use
13
+
14
+ * Create a new scene in Unity 6;
15
+ * Install `com.unity.ai.inference` from the package manager;
16
+ * Add the `RunJets.cs` script to the Main Camera;
17
+ * Add an AudioSource component to the Main Camera;
18
+ * Drag the `jets-text-to-speech.onnx` file from the `models` folder into the `Model Asset` field;
19
+ * Drag the `phoneme_dict.txt` file from the `data` folder into the `Phoneme Asset` field;
20
+
21
+ ## Preview
22
+ Enter play mode. If working correctly you should hear the inferred audio of the voice.
23
+
24
+ ## Inference Engine
25
+ Inference Engine is a neural network inference library for Unity. Find out more [here](https://docs.unity3d.com/Packages/com.unity.ai.inference@latest).
26
 
27
  ## License
28
  Attribution for the original creators is required. See [Jets](https://huggingface.co/imdanboy/jets) for more details.
RunJets.cs CHANGED
@@ -1,20 +1,12 @@
 
1
  using System.Collections.Generic;
 
2
  using UnityEngine;
3
- using Unity.Sentis;
4
- using System.IO;
5
-
6
- // Jets Text-To-Speech Inference
7
- // =============================
8
- //
9
- // This file implements the Jets Text-to-speech model in Unity Sentis
10
- // The model uses phonemes instead of raw text so you have to convert it first.
11
- // Place this file on the Main Camera
12
- // Add an audio source
13
- // Change the inputText
14
- // When running you can press space bar to play it again
15
 
16
  public class RunJets : MonoBehaviour
17
  {
 
 
18
  public string inputText = "Once upon a time, there lived a girl called Alice. She lived in a house in the woods.";
19
  //string inputText = "The quick brown fox jumped over the lazy dog";
20
  //string inputText = "There are many uses of the things she uses!";
@@ -22,21 +14,23 @@ public class RunJets : MonoBehaviour
22
  //Set to true if we have put the phoneme_dict.txt in the Assets/StreamingAssets folder
23
  bool hasPhenomeDictionary = true;
24
 
25
- readonly string[] phonemes = new string[] {
26
- "<blank>", "<unk>", "AH0", "N", "T", "D", "S", "R", "L", "DH", "K", "Z", "IH1",
27
- "IH0", "M", "EH1", "W", "P", "AE1", "AH1", "V", "ER0", "F", ",", "AA1", "B",
28
- "HH", "IY1", "UW1", "IY0", "AO1", "EY1", "AY1", ".", "OW1", "SH", "NG", "G",
29
- "ER1", "CH", "JH", "Y", "AW1", "TH", "UH1", "EH2", "OW0", "EY2", "AO0", "IH2",
30
- "AE2", "AY2", "AA2", "UW0", "EH0", "OY1", "EY0", "AO2", "ZH", "OW2", "AE0", "UW2",
31
- "AH2", "AY0", "IY2", "AW2", "AA0", "\"", "ER2", "UH2", "?", "OY2", "!", "AW0",
32
- "UH0", "OY0", "..", "<sos/eos>" };
 
 
33
 
34
  readonly string[] alphabet = "AE1 B K D EH1 F G HH IH1 JH K L M N AA1 P K R S T AH1 V W K Y Z".Split(' ');
35
 
36
  //Can change pitch and speed with this for a slightly different voice:
37
  const int samplerate = 22050;
38
 
39
- Dictionary<string, string> dict = new ();
40
 
41
  Worker worker;
42
 
@@ -51,7 +45,7 @@ public class RunJets : MonoBehaviour
51
 
52
  void LoadModel()
53
  {
54
- var model = ModelLoader.Load(Path.Join(Application.streamingAssetsPath, "jets-text-to-speech.sentis"));
55
  worker = new Worker(model, BackendType.GPUCompute);
56
  }
57
 
@@ -76,10 +70,12 @@ public class RunJets : MonoBehaviour
76
  void ReadDictionary()
77
  {
78
  if (!hasPhenomeDictionary) return;
79
- string[] words = File.ReadAllLines(Path.Join(Application.streamingAssetsPath,"phoneme_dict.txt"));
80
  for (int i = 0; i < words.Length; i++)
81
  {
82
  string s = words[i];
 
 
83
  string[] parts = s.Split();
84
  if (parts[0] != ";;;") //ignore comments in file
85
  {
@@ -93,7 +89,7 @@ public class RunJets : MonoBehaviour
93
  dict.Add("!", "!");
94
  dict.Add("?", "?");
95
  dict.Add("\"", "\"");
96
- // You could add extra word pronounciations here e.g.
97
  //dict.Add("somenewword","[phonemes]");
98
  }
99
 
@@ -126,15 +122,15 @@ public class RunJets : MonoBehaviour
126
  }
127
 
128
  //Decode the word into phonemes by looking for the longest word in the dictionary that matches
129
- //the first part of the word and so on.
130
  //This works fairly well but could be improved. The original paper had a model that
131
  //dealt with guessing the phonemes of words
132
  public string DecodeWord(string word)
133
  {
134
  string output = "";
135
  int start = 0;
136
- for (int end = word.Length; end >= 0 && start < word.Length ; end--)
137
- {
138
  if (end <= start) //no matches
139
  {
140
  start++;
@@ -151,20 +147,20 @@ public class RunJets : MonoBehaviour
151
  }
152
  return output;
153
  }
154
-
155
  int[] GetTokens(string ptext)
156
  {
157
  string[] p = ptext.Split();
158
  var tokens = new int[p.Length];
159
  for (int i = 0; i < tokens.Length; i++)
160
  {
161
- tokens[i] = Mathf.Max(0, System.Array.IndexOf(phonemes, p[i]));
162
  }
163
  return tokens;
164
  }
165
 
166
  public void DoInference(string ptext)
167
- {
168
  int[] tokens = GetTokens(ptext);
169
 
170
  using var input = new Tensor<int>(new TensorShape(tokens.Length), tokens);
@@ -180,7 +176,8 @@ public class RunJets : MonoBehaviour
180
 
181
  Speak();
182
  }
183
- private void Speak()
 
184
  {
185
  AudioSource audioSource = GetComponent<AudioSource>();
186
  if (audioSource != null)
@@ -202,7 +199,7 @@ public class RunJets : MonoBehaviour
202
  }
203
  }
204
 
205
- private void OnDestroy()
206
  {
207
  worker?.Dispose();
208
  }
 
1
+ using System;
2
  using System.Collections.Generic;
3
+ using Unity.InferenceEngine;
4
  using UnityEngine;
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
  public class RunJets : MonoBehaviour
7
  {
8
+ public ModelAsset modelAsset;
9
+ public TextAsset phonemeAsset;
10
  public string inputText = "Once upon a time, there lived a girl called Alice. She lived in a house in the woods.";
11
  //string inputText = "The quick brown fox jumped over the lazy dog";
12
  //string inputText = "There are many uses of the things she uses!";
 
14
  //Set to true if the phoneme_dict.txt file has been assigned to the Phoneme Asset field
15
  bool hasPhenomeDictionary = true;
16
 
17
+ readonly string[] phonemes =
18
+ {
19
+ "<blank>", "<unk>", "AH0", "N", "T", "D", "S", "R", "L", "DH", "K", "Z", "IH1",
20
+ "IH0", "M", "EH1", "W", "P", "AE1", "AH1", "V", "ER0", "F", ",", "AA1", "B",
21
+ "HH", "IY1", "UW1", "IY0", "AO1", "EY1", "AY1", ".", "OW1", "SH", "NG", "G",
22
+ "ER1", "CH", "JH", "Y", "AW1", "TH", "UH1", "EH2", "OW0", "EY2", "AO0", "IH2",
23
+ "AE2", "AY2", "AA2", "UW0", "EH0", "OY1", "EY0", "AO2", "ZH", "OW2", "AE0", "UW2",
24
+ "AH2", "AY0", "IY2", "AW2", "AA0", "\"", "ER2", "UH2", "?", "OY2", "!", "AW0",
25
+ "UH0", "OY0", "..", "<sos/eos>"
26
+ };
27
 
28
  readonly string[] alphabet = "AE1 B K D EH1 F G HH IH1 JH K L M N AA1 P K R S T AH1 V W K Y Z".Split(' ');
29
 
30
  //Can change pitch and speed with this for a slightly different voice:
31
  const int samplerate = 22050;
32
 
33
+ Dictionary<string, string> dict = new();
34
 
35
  Worker worker;
36
 
 
45
 
46
  void LoadModel()
47
  {
48
+ var model = ModelLoader.Load(modelAsset);
49
  worker = new Worker(model, BackendType.GPUCompute);
50
  }
51
 
 
70
  void ReadDictionary()
71
  {
72
  if (!hasPhenomeDictionary) return;
73
+ string[] words = phonemeAsset.text.Split("\r\n");
74
  for (int i = 0; i < words.Length; i++)
75
  {
76
  string s = words[i];
77
+ if (string.IsNullOrWhiteSpace(s))
78
+ continue;
79
  string[] parts = s.Split();
80
  if (parts[0] != ";;;") //ignore comments in file
81
  {
 
89
  dict.Add("!", "!");
90
  dict.Add("?", "?");
91
  dict.Add("\"", "\"");
92
+ // You could add extra word pronunciations here e.g.
93
  //dict.Add("somenewword","[phonemes]");
94
  }
95
 
 
122
  }
123
 
124
  //Decode the word into phonemes by looking for the longest word in the dictionary that matches
125
+ //the first part of the word and so on.
126
  //This works fairly well but could be improved. The original paper had a model that
127
  //dealt with guessing the phonemes of words
128
  public string DecodeWord(string word)
129
  {
130
  string output = "";
131
  int start = 0;
132
+ for (int end = word.Length; end >= 0 && start < word.Length; end--)
133
+ {
134
  if (end <= start) //no matches
135
  {
136
  start++;
 
147
  }
148
  return output;
149
  }
150
+
151
  int[] GetTokens(string ptext)
152
  {
153
  string[] p = ptext.Split();
154
  var tokens = new int[p.Length];
155
  for (int i = 0; i < tokens.Length; i++)
156
  {
157
+ tokens[i] = Mathf.Max(0, Array.IndexOf(phonemes, p[i]));
158
  }
159
  return tokens;
160
  }
161
 
162
  public void DoInference(string ptext)
163
+ {
164
  int[] tokens = GetTokens(ptext);
165
 
166
  using var input = new Tensor<int>(new TensorShape(tokens.Length), tokens);
 
176
 
177
  Speak();
178
  }
179
+
180
+ void Speak()
181
  {
182
  AudioSource audioSource = GetComponent<AudioSource>();
183
  if (audioSource != null)
 
199
  }
200
  }
201
 
202
+ void OnDestroy()
203
  {
204
  worker?.Dispose();
205
  }
phoneme_dict.txt → data/phoneme_dict.txt RENAMED
File without changes
info.json CHANGED
@@ -3,13 +3,12 @@
3
  "RunJets.cs"
4
  ],
5
  "models": [
6
- "jets-text-to-speech.onnx",
7
- "jets-text-to-speech.sentis"
8
  ],
9
  "data": [
10
- "phoneme_dict.txt"
11
  ],
12
  "version": [
13
- "2.1.2"
14
  ]
15
  }
 
3
  "RunJets.cs"
4
  ],
5
  "models": [
6
+ "models/jets-text-to-speech.onnx"
 
7
  ],
8
  "data": [
9
+ "data/phoneme_dict.txt"
10
  ],
11
  "version": [
12
+ "2.2.0"
13
  ]
14
  }
jets-text-to-speech.sentis DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:405a4e8d24b07142ac6109b653750b5d97ec720bbea41c5e76838e5f30ec5c70
3
- size 138331240
 
 
 
 
jets-text-to-speech.onnx → models/jets-text-to-speech.onnx RENAMED
File without changes