Spaces:

camanalo1
/

MyAlexa

Sleeping

camanalo1 committed on May 1, 2024

Commit

3fdc3cc

verified ·

1 Parent(s): 5d7b200

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -19,12 +19,9 @@ model_tts = VitsModel.from_pretrained("facebook/mms-tts-eng")
 print("TTS Tokenizer:", tokenizer_tts)  # Print the tokenizer for the TTS model
 def transcribe_and_generate_audio(audio):
-    sr, y = audio
-    y = y.astype(np.float32)
-    y /= np.max(np.abs(y))
     # Transcribe audio
-    asr_output = transcriber({"sampling_rate": sr, "raw": y})["text"]
     # Generate text based on ASR output
     generated_text = generator(prompt, max_length=100, num_return_sequences=1)[0]['generated_text']
@@ -44,7 +41,7 @@ def transcribe_and_generate_audio(audio):
 audio_input = gr.Interface(
     transcribe_and_generate_audio,
     gr.Audio(sources=["microphone"], label="Speak Here"),
-    ["audio", "text", "text"],
     title="ASR -> LLM -> TTS",
     description="Speak into the microphone and hear the generated audio."
 )

 print("TTS Tokenizer:", tokenizer_tts)  # Print the tokenizer for the TTS model
 def transcribe_and_generate_audio(audio):
     # Transcribe audio
+    asr_output = transcriber(audio)["text"]
     # Generate text based on ASR output
     generated_text = generator(prompt, max_length=100, num_return_sequences=1)[0]['generated_text']
 audio_input = gr.Interface(
     transcribe_and_generate_audio,
     gr.Audio(sources=["microphone"], label="Speak Here"),
+    "audio",
     title="ASR -> LLM -> TTS",
     description="Speak into the microphone and hear the generated audio."
 )