nareauow committed (verified)
Commit a642818 · 1 Parent(s): 15041ed

Update app.py

Files changed (1): app.py (+54, -55)
app.py CHANGED
@@ -177,10 +177,12 @@ def synthesize_speech(text):
         print(f"Speech synthesis error: {str(e)}")
         return None
 
-# Fonction prédiction
+# ... (keep all previous imports and class definitions)
+
+# Updated predict_speaker function to return consistent values
 def predict_speaker(audio, model, processor):
     if audio is None:
-        return "Aucun audio détecté.", None, None
+        return "Aucun audio détecté.", {}, "Aucun texte reconnu", "Inconnu"  # Now returns 4 values
 
     try:
         audio_data, sr = sf.read(audio)
@@ -191,7 +193,7 @@ def predict_speaker(audio, model, processor):
 
         with torch.no_grad():
             output = model(input_tensor)
-            print(output)
+            print(output)  # Debug output
             probabilities = F.softmax(output, dim=1)
             confidence, predicted_class = torch.max(probabilities, 1)
 
@@ -203,28 +205,45 @@ def predict_speaker(audio, model, processor):
         probs_dict = {speakers[i]: float(probs) for i, probs in enumerate(probabilities[0].cpu().numpy())}
 
         # Recognize speech
-        recognized_text = recognize_speech(audio)
+        recognized_text = recognize_speech(audio) if speech_recognizer else "Modèle de reconnaissance vocale non disponible"
 
-        return result, probs_dict, recognized_text, predicted_speaker
+        return result, probs_dict, recognized_text, predicted_speaker  # Now returns 4 values
 
     except Exception as e:
-        return f"Erreur : {str(e)}", None, None
+        return f"Erreur : {str(e)}", {}, "Erreur de reconnaissance", "Inconnu"
 
-# Charger modèle
-def load_model(model_id="nareauow/my_speech_recognition", model_filename="model_3.pth"):
-    try:
-        model_path = hf_hub_download(repo_id=model_id, filename=model_filename)
-        model = modele_CNN(num_classes=7, dropout=0.)
-        model.load_state_dict(torch.load(model_path, map_location=device))
-        model.to(device)
-        model.eval()
-        print("Modèle chargé avec succès !")
-        return model
-    except Exception as e:
-        print(f"Erreur de chargement: {e}")
-        return None
+# Updated recognize function
+def recognize(audio, selected_model):
+    model = load_model(model_filename=selected_model)
+    if model is None:
+        return "Erreur: Modèle non chargé", None, "Erreur", None
+
+    res, probs, text, speaker = predict_speaker(audio, model, processor)  # Now expects 4 values
+
+    # Generate plot
+    fig = None
+    if probs:
+        fig, ax = plt.subplots(figsize=(10, 6))
+        ax.bar(probs.keys(), probs.values(), color='skyblue')
+        ax.set_ylim([0, 1])
+        ax.set_ylabel("Confiance")
+        ax.set_xlabel("Locuteurs")
+        ax.set_title("Probabilités de reconnaissance")
+        plt.xticks(rotation=45)
+        plt.tight_layout()
+
+    # Generate speech synthesis if text was recognized
+    synth_audio = None
+    if synthesizer is not None and text and "erreur" not in text.lower():
+        try:
+            synth_text = f"Le locuteur {speaker} a dit : {text}" if speaker else f"Le locuteur a dit : {text}"
+            synth_audio = synthesize_speech(synth_text)
+        except Exception as e:
+            print(f"Erreur de synthèse vocale: {e}")
+
+    return res, fig, text, synth_audio if synth_audio else None
 
-# Gradio Interface
+# Updated interface creation
 def create_interface():
     processor = AudioProcessor()
 
@@ -239,49 +258,29 @@ def create_interface():
                 value="model_3.pth",
                 label="Choisissez le modèle"
             )
-            audio_input = gr.Audio(sources=["microphone"], type="filepath", label="🎙️ Parlez ici")
+
+            with gr.Tab("Microphone"):
+                mic_input = gr.Audio(sources=["microphone"], type="filepath", label="🎙️ Enregistrer depuis le microphone")
+
+            with gr.Tab("Upload Audio"):
+                file_input = gr.Audio(sources=["upload"], type="filepath", label="📁 Télécharger un fichier audio")
+
             record_btn = gr.Button("Reconnaître")
+
         with gr.Column():
             result_text = gr.Textbox(label="Résultat")
             plot_output = gr.Plot(label="Confiance par locuteur")
             recognized_text = gr.Textbox(label="Texte reconnu")
-            audio_output = gr.Audio(label="Synthèse vocale", type="numpy")
-
-        def recognize(audio, selected_model):
-            model = load_model(model_filename=selected_model)
-            res, probs, text, locuteur = predict_speaker(audio, model, processor)
-
-            # Generate plot
-            fig = None
-            if probs:
-                fig, ax = plt.subplots()
-                ax.bar(probs.keys(), probs.values(), color='skyblue')
-                ax.set_ylim([0, 1])
-                ax.set_ylabel("Confiance")
-                ax.set_xlabel("Locuteurs")
-                plt.xticks(rotation=45)
-
-            # Generate speech synthesis if text was recognized
-            synth_audio = None
-            if text and "error" not in text.lower():
-                synth_text = f"{locuteur} said : {text}"
-                synth_audio = synthesize_speech(synth_text)
-
-            return res, fig, text, synth_audio
-
-        record_btn.click(fn=recognize,
-                         inputs=[audio_input, model_selector],
-                         outputs=[result_text, plot_output, recognized_text, audio_output])
+            audio_output = gr.Audio(label="Synthèse vocale", visible=False)
 
-        gr.Markdown("""### Comment utiliser ?
-        - Choisissez le modèle.
-        - Cliquez sur 🎙️ pour enregistrer votre voix.
-        - Cliquez sur **Reconnaître** pour obtenir la prédiction.
-        """)
+        record_btn.click(
+            fn=recognize,
+            inputs=[gr.inputs.Union([mic_input, file_input]), model_selector],
+            outputs=[result_text, plot_output, recognized_text, audio_output]
+        )
 
     return interface
 
-# Lancer
 if __name__ == "__main__":
     app = create_interface()
-    app.launch()
+    app.launch(share=True)
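
Reviewer note on the event wiring: `gr.inputs.Union` is not part of the Gradio API (the legacy `gradio.inputs` module never exposed a `Union` class), so the `record_btn.click(...)` call added in this commit will raise an `AttributeError` when the interface is built. A minimal sketch of one way to wire both audio sources to the same button, assuming a recent Gradio release and the component names used in the diff (`mic_input`, `file_input`, `model_selector`, `result_text`, `plot_output`, `recognized_text`, `audio_output`):

# Sketch only: pass both audio components and let the handler pick whichever is set.
def recognize_either(mic_path, file_path, selected_model):
    audio = mic_path if mic_path is not None else file_path
    return recognize(audio, selected_model)

record_btn.click(
    fn=recognize_either,
    inputs=[mic_input, file_input, model_selector],
    outputs=[result_text, plot_output, recognized_text, audio_output],
)

Note also that `audio_output` is now created with `visible=False`, so the synthesized audio returned by the handler will not be shown unless the component's visibility is updated as well.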
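
Reviewer note on the removed helpers: hunk 3 deletes the module-level `load_model(...)` function, yet the new module-level `recognize(...)` still calls `load_model(model_filename=selected_model)` and reads a global `processor`, which previously existed only as a local inside `create_interface()`. Unless those names are defined elsewhere in the untouched parts of app.py (the `# ... (keep all previous imports and class definitions)` comment suggests the file was only partially pasted), the handler will fail with a `NameError` on the first click. A sketch of the module-level definitions that would need to survive, reusing the code this commit removes; `AudioProcessor`, `modele_CNN`, `device`, `torch`, and `hf_hub_download` are assumed to come from the unchanged part of the file:

# Sketch only: keep a module-level processor and the loader that recognize() depends on.
processor = AudioProcessor()

def load_model(model_id="nareauow/my_speech_recognition", model_filename="model_3.pth"):
    try:
        # Download the checkpoint from the Hub and load it into the CNN classifier
        model_path = hf_hub_download(repo_id=model_id, filename=model_filename)
        model = modele_CNN(num_classes=7, dropout=0.)
        model.load_state_dict(torch.load(model_path, map_location=device))
        model.to(device)
        model.eval()
        print("Modèle chargé avec succès !")
        return model
    except Exception as e:
        print(f"Erreur de chargement: {e}")
        return None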