Spaces:

camanalo1
/

MyAlexaExperiment

Sleeping

App Files Files Community

camanalo1 committed on May 1, 2024

Commit

11ccb7a

verified ·

1 Parent(s): 6d5e6cf

Create app.py

Browse files

Files changed (1) hide show

app.py +50 -0

app.py ADDED Viewed

	@@ -0,0 +1,50 @@

+import gradio as gr
+from transformers import pipeline, VitsTokenizer, VitsModel, set_seed
+import numpy as np
+import torch
+import io
+import soundfile as sf
+# Initialize ASR pipeline
+transcriber = pipeline("automatic-speech-recognition", model="facebook/s2t-small-librispeech-asr")
+# Initialize LLM pipeline
+generator = pipeline("text-generation", model="microsoft/Phi-3-mini-4k-instruct", trust_remote_code=True)
+# Initialize TTS tokenizer and model
+tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
+model = VitsModel.from_pretrained("facebook/mms-tts-eng")
+def transcribe_generate_and_speak(audio):
+    sr, y = audio
+    y = y.astype(np.float32)
+    y /= np.max(np.abs(y))
+    # Transcribe audio
+    asr_output = transcriber({"sampling_rate": sr, "raw": y})["text"]
+    # Generate text based on ASR output
+    generated_text = generator(asr_output, max_length=100, num_return_sequences=1)[0]['generated_text']
+    # Generate audio from text
+    inputs = tokenizer(text=generated_text, return_tensors="pt")
+    set_seed(555)
+    with torch.no_grad():
+        outputs = model(**inputs)
+    waveform = outputs.waveform[0]
+    waveform_path = "output.wav"
+    sf.write(waveform_path, waveform.numpy(), 16000, format='wav')
+    return waveform_path
+# Define Gradio interface
+audio_input = gr.Interface(
+    transcribe_generate_and_speak,
+    gr.Audio(sources=["microphone"], label="Speak Here"),
+    "audio",
+    title="ASR -> LLM -> TTS",
+    description="Speak into the microphone and hear the generated audio."
+)
+# Launch the interface
+audio_input.launch()