Jaward committed on
Commit
781ee39
·
verified ·
1 Parent(s): 05677d1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +67 -31
app.py CHANGED
@@ -9,6 +9,7 @@ from streaming_stt_nemo import Model
9
  import torch
10
  import random
11
  from openai import OpenAI
 
12
 
13
  default_lang = "en"
14
 
@@ -84,47 +85,82 @@ def models(text, model="Llama 3B Service", seed=42):
84
async def respond(audio, model, seed):
    """Answer a spoken question with synthesized speech.

    Transcribes the recorded audio, asks the selected LLM for a reply,
    renders that reply with edge-tts and yields the path of the WAV file.
    """
    transcript = transcribe(audio)
    answer = models(transcript, model, seed)
    tts = edge_tts.Communicate(answer, voice="en-US-ChristopherNeural")
    # delete=False keeps the file on disk so Gradio can serve it afterwards.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as handle:
        speech_path = handle.name
        await tts.save(speech_path)
        yield speech_path
93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
DESCRIPTION = """ # <center><b>Hello, I am Optimus Prime your personal AI voice assistant</b></center>"""

# Build the voice-assistant UI: model picker, (hidden) seed slider,
# a microphone input and an auto-playing audio output wired to respond().
with gr.Blocks(css="style.css") as demo:
    gr.Markdown(DESCRIPTION)
    with gr.Row():
        select = gr.Dropdown(
            [
                'Llama 3B Service',
                'Mixtral 8x7B',
                'Llama 3 8B',
                'Mistral 7B v0.3',
                'Phi 3 mini',
            ],
            value="Llama 3B Service",
            label="Model",
        )
        # Kept hidden; exists so respond() receives a seed argument.
        seed = gr.Slider(
            label="Seed",
            minimum=0,
            maximum=999999,
            step=1,
            value=0,
            visible=False,
        )
    input = gr.Audio(label="User", sources="microphone", type="filepath", waveform_options=False)
    output = gr.Audio(
        label="AI",
        type="filepath",
        interactive=False,
        autoplay=True,
        elem_classes="audio",
    )
    gr.Interface(
        batch=True,
        max_batch_size=10,
        fn=respond,
        inputs=[input, select, seed],
        outputs=[output],
        live=True,
    )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
 
129
  if __name__ == "__main__":
130
  demo.queue(max_size=200).launch()
 
9
  import torch
10
  import random
11
  from openai import OpenAI
12
+ from transformers import AutoProcessor, SeamlessM4TModel
13
 
14
  default_lang = "en"
15
 
 
85
async def respond(audio, model, seed):
    """Turn the user's spoken input into a spoken reply.

    Pipeline: speech-to-text via transcribe(), text reply from the chosen
    model, then text-to-speech with edge-tts; yields the WAV file path.
    """
    question = transcribe(audio)
    reply_text = models(question, model, seed)
    voice = edge_tts.Communicate(reply_text, voice="en-US-ChristopherNeural")
    # The file must survive the context manager so Gradio can stream it.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as wav_file:
        wav_path = wav_file.name
        await voice.save(wav_path)
        yield wav_path
93
 
94
# Load the Seamless M4T model and processor once at import time.
# NOTE: the Transformers checkpoints are published under the
# "facebook/hf-seamless-m4t-*" names, not "facebook/seamless-m4t-*".
processor = AutoProcessor.from_pretrained("facebook/hf-seamless-m4t-medium")
seamless_model = SeamlessM4TModel.from_pretrained("facebook/hf-seamless-m4t-medium")

# Seamless M4T expects ISO 639-3 style codes; accept the common 2-letter
# codes the UI may send and pass anything else through unchanged.
_SEAMLESS_LANG_CODES = {
    "es": "spa",
    "fr": "fra",
    "de": "deu",
    "it": "ita",
    "ja": "jpn",
    "ko": "kor",
    "zh": "cmn",
}


def translate_speech(audio, target_lang):
    """Translate recorded speech into text in ``target_lang``.

    Parameters
    ----------
    audio : tuple | None
        Gradio ``type="numpy"`` audio, i.e. ``(sample_rate, samples)``.
    target_lang : str
        Target language code; 2-letter codes are mapped to the 3-letter
        codes Seamless M4T requires.

    Returns
    -------
    str
        The translated text, or "" when no audio was provided.
    """
    if audio is None:
        return ""
    # Gradio numpy audio is (sample_rate, data); the original unpacking
    # had the two elements reversed.
    sample_rate, audio_array = audio
    inputs = processor(audios=audio_array, sampling_rate=sample_rate, return_tensors="pt")

    tgt_lang = _SEAMLESS_LANG_CODES.get(target_lang, target_lang)
    # generate_speech=False makes generate() return text token ids instead
    # of a waveform — decoding a waveform with the tokenizer is meaningless.
    output_tokens = seamless_model.generate(**inputs, tgt_lang=tgt_lang, generate_speech=False)

    translated_text = processor.decode(output_tokens[0].tolist()[0], skip_special_tokens=True)
    return translated_text
107
+
108
DESCRIPTION = """ # <center><b>Hello, I am Optimus Prime your personal AI voice assistant</b></center>"""

# Two-tab UI: the original voice assistant plus a Seamless M4T speech
# translation tab.
with gr.Blocks(css="style.css") as demo:
    gr.Markdown(DESCRIPTION)

    with gr.Tabs():
        with gr.TabItem("Voice Assistant"):
            with gr.Row():
                select = gr.Dropdown(
                    [
                        'Llama 3B Service',
                        'Mixtral 8x7B',
                        'Llama 3 8B',
                        'Mistral 7B v0.3',
                        'Phi 3 mini',
                    ],
                    value="Llama 3B Service",
                    label="Model",
                )
                # Hidden: kept only so respond() receives a seed argument.
                seed = gr.Slider(
                    label="Seed",
                    minimum=0,
                    maximum=999999,
                    step=1,
                    value=0,
                    visible=False,
                )
            # Renamed from `input` (shadowed the builtin); waveform_options
            # takes WaveformOptions/dict/None per the Gradio docs, not False.
            user_audio = gr.Audio(label="User", sources="microphone", type="filepath", waveform_options=None)
            output = gr.Audio(
                label="AI",
                type="filepath",
                interactive=False,
                autoplay=True,
                elem_classes="audio",
            )
            gr.Interface(
                batch=True,
                max_batch_size=10,
                fn=respond,
                inputs=[user_audio, select, seed],
                outputs=[output],
                live=True,
            )

        with gr.TabItem("Speech Translation"):
            with gr.Row():
                input_audio = gr.Audio(label="Input Speech", sources="microphone", type="numpy", streaming=True)
                # (label, value) pairs: show a readable name, submit the
                # ISO 639-3 code Seamless M4T actually accepts.
                target_lang = gr.Dropdown(
                    choices=[
                        ("Spanish", "spa"),
                        ("French", "fra"),
                        ("German", "deu"),
                        ("Italian", "ita"),
                        ("Japanese", "jpn"),
                        ("Korean", "kor"),
                        ("Mandarin Chinese", "cmn"),
                    ],
                    value="spa",
                    label="Target Language",
                )
            output_text = gr.Textbox(label="Translated Text")

            gr.Interface(
                fn=translate_speech,
                inputs=[input_audio, target_lang],
                outputs=[output_text],
                live=True,
            )
164
 
165
  if __name__ == "__main__":
166
  demo.queue(max_size=200).launch()