Update app.py
Browse files
app.py
CHANGED
@@ -9,6 +9,7 @@ from streaming_stt_nemo import Model
|
|
9 |
import torch
|
10 |
import random
|
11 |
from openai import OpenAI
|
|
|
12 |
|
13 |
default_lang = "en"
|
14 |
|
@@ -84,47 +85,82 @@ def models(text, model="Llama 3B Service", seed=42):
|
|
async def respond(audio, model, seed):
    """Transcribe the user's speech, generate an LLM reply, and speak it.

    Args:
        audio: filepath of the recorded user audio (from gr.Audio).
        model: name of the LLM backend, forwarded to `models`.
        seed: sampling seed, forwarded to `models`.

    Yields:
        Path of a temporary audio file containing the synthesized reply.
    """
    user = transcribe(audio)
    reply = models(user, model, seed)
    # Deep male voice for the assistant persona.
    communicate = edge_tts.Communicate(reply, voice="en-US-ChristopherNeural")
    # edge-tts emits MP3 data, so suffix the temp file ".mp3" (the original
    # mislabeled it ".wav"). delete=False is deliberate: Gradio reads the
    # file after this generator yields, so it must outlive the handler.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name
    await communicate.save(tmp_path)
    yield tmp_path
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Banner shown at the top of the app.
DESCRIPTION = """ # <center><b>Hello, I am Optimus Prime your personal AI voice assistant</b></center>"""

with gr.Blocks(css="style.css") as demo:
    gr.Markdown(DESCRIPTION)

if __name__ == "__main__":
    # Queue requests (up to 200 pending) before launching the server.
    demo.queue(max_size=200).launch()
|
|
9 |
import torch
|
10 |
import random
|
11 |
from openai import OpenAI
|
12 |
+
from transformers import AutoProcessor, SeamlessM4TModel
|
13 |
|
14 |
default_lang = "en"
|
15 |
|
|
|
async def respond(audio, model, seed):
    """Transcribe the user's speech, generate an LLM reply, and speak it.

    Args:
        audio: filepath of the recorded user audio (from gr.Audio).
        model: name of the LLM backend, forwarded to `models`.
        seed: sampling seed, forwarded to `models`.

    Yields:
        Path of a temporary audio file containing the synthesized reply.
    """
    user = transcribe(audio)
    reply = models(user, model, seed)
    # Deep male voice for the assistant persona.
    communicate = edge_tts.Communicate(reply, voice="en-US-ChristopherNeural")
    # edge-tts emits MP3 data, so suffix the temp file ".mp3" (the original
    # mislabeled it ".wav"). delete=False is deliberate: Gradio reads the
    # file after this generator yields, so it must outlive the handler.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name
    await communicate.save(tmp_path)
    yield tmp_path
# Load the Seamless M4T processor and model once at import time so the
# Gradio handler can reuse them across requests.
_SEAMLESS_CHECKPOINT = "facebook/seamless-m4t-medium"
processor = AutoProcessor.from_pretrained(_SEAMLESS_CHECKPOINT)
seamless_model = SeamlessM4TModel.from_pretrained(_SEAMLESS_CHECKPOINT)
# SeamlessM4T expects ISO 639-3 language codes; the UI dropdown supplies
# two-letter codes, so map them here (unknown codes pass through unchanged).
_SEAMLESS_LANG_CODES = {
    "es": "spa", "fr": "fra", "de": "deu", "it": "ita",
    "ja": "jpn", "ko": "kor", "zh": "cmn",
}


def translate_speech(audio, target_lang):
    """Translate recorded speech into text in *target_lang*.

    Args:
        audio: (sample_rate, samples) tuple as produced by
            gr.Audio(type="numpy").
        target_lang: two-letter language code from the UI dropdown.

    Returns:
        str: the translated text.
    """
    # gr.Audio(type="numpy") yields (sample_rate, data); the original
    # unpacked these in the reverse order.
    sample_rate, audio_array = audio
    inputs = processor(audios=audio_array, return_tensors="pt", sampling_rate=sample_rate)

    # generate_speech=False makes generate() return text token sequences.
    # Without it, SeamlessM4TModel.generate returns a waveform, which
    # batch_decode cannot turn into text.
    generated_tokens = seamless_model.generate(
        **inputs,
        tgt_lang=_SEAMLESS_LANG_CODES.get(target_lang, target_lang),
        generate_speech=False,
    )

    # First element of the output holds the token sequences; decode batch
    # item 0 to plain text.
    translated_text = processor.batch_decode(generated_tokens[0], skip_special_tokens=True)[0]
    return translated_text
# Banner shown at the top of the app.
DESCRIPTION = """ # <center><b>Hello, I am Optimus Prime your personal AI voice assistant</b></center>"""

with gr.Blocks(css="style.css") as demo:
    gr.Markdown(DESCRIPTION)

    with gr.Tabs():
        # Tab 1: talk to the assistant and hear a spoken reply.
        with gr.TabItem("Voice Assistant"):
            with gr.Row():
                model_choice = gr.Dropdown(
                    [
                        'Llama 3B Service',
                        'Mixtral 8x7B',
                        'Llama 3 8B',
                        'Mistral 7B v0.3',
                        'Phi 3 mini',
                    ],
                    value="Llama 3B Service",
                    label="Model",
                )
                seed_slider = gr.Slider(
                    label="Seed",
                    minimum=0,
                    maximum=999999,
                    step=1,
                    value=0,
                    visible=False,
                )
                # NOTE(review): waveform_options usually takes a dict /
                # WaveformOptions; False is preserved from the original —
                # confirm against the installed Gradio version.
                mic_input = gr.Audio(label="User", sources="microphone", type="filepath", waveform_options=False)
                voice_output = gr.Audio(
                    label="AI",
                    type="filepath",
                    interactive=False,
                    autoplay=True,
                    elem_classes="audio",
                )
                gr.Interface(
                    batch=True,
                    max_batch_size=10,
                    fn=respond,
                    inputs=[mic_input, model_choice, seed_slider],
                    outputs=[voice_output],
                    live=True,
                )

        # Tab 2: live speech-to-text translation via Seamless M4T.
        with gr.TabItem("Speech Translation"):
            with gr.Row():
                speech_input = gr.Audio(label="Input Speech", sources="microphone", type="numpy", streaming=True)
                lang_choice = gr.Dropdown(
                    choices=["es", "fr", "de", "it", "ja", "ko", "zh"],
                    value="es",
                    label="Target Language",
                )
                translated_box = gr.Textbox(label="Translated Text")

                gr.Interface(
                    fn=translate_speech,
                    inputs=[speech_input, lang_choice],
                    outputs=[translated_box],
                    live=True,
                )

if __name__ == "__main__":
    # Queue requests (up to 200 pending) before launching the server.
    demo.queue(max_size=200).launch()