camanalo1 committed on
Commit
11ccb7a
·
verified ·
1 Parent(s): 6d5e6cf

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +50 -0
app.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import pipeline, VitsTokenizer, VitsModel, set_seed
3
+ import numpy as np
4
+ import torch
5
+ import io
6
+ import soundfile as sf
7
+
8
# Speech recognition stage: turns the recorded audio into text.
transcriber = pipeline(
    task="automatic-speech-recognition",
    model="facebook/s2t-small-librispeech-asr",
)

# Language-model stage: generates text from the transcript.
# NOTE(review): trust_remote_code=True presumably lets code from the model
# repository run locally -- confirm this is intended for this checkpoint.
generator = pipeline(
    task="text-generation",
    model="microsoft/Phi-3-mini-4k-instruct",
    trust_remote_code=True,
)

# Speech synthesis stage: tokenizer and model loaded from the same checkpoint.
tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
model = VitsModel.from_pretrained("facebook/mms-tts-eng")
17
+
18
def transcribe_generate_and_speak(audio):
    """Transcribe a recording, generate text from it, and synthesize speech.

    Args:
        audio: A ``(sample_rate, samples)`` tuple, where ``samples`` is a
            NumPy array, or ``None`` when no recording was supplied.

    Returns:
        Path to a WAV file holding the synthesized audio, or ``None`` when
        there is no audio to process or nothing was transcribed.
    """
    import tempfile  # function-scope import keeps the file's import block untouched

    # No recording supplied: nothing to do.
    if audio is None:
        return None

    sr, y = audio
    y = np.asarray(y, dtype=np.float32)
    if y.size == 0:
        return None

    # Down-mix multi-channel input to mono.
    # assumes a 2-D array is laid out as (samples, channels) -- TODO confirm
    if y.ndim > 1:
        y = y.mean(axis=1)

    # Peak-normalize, skipping all-zero (silent) input to avoid dividing by
    # zero and feeding NaNs to the recognizer.
    peak = np.max(np.abs(y))
    if peak > 0:
        y = y / peak

    # Transcribe audio
    asr_output = transcriber({"sampling_rate": sr, "raw": y})["text"]
    if not asr_output or not asr_output.strip():
        # Nothing recognized: do not prompt the generator with empty text.
        return None

    # Generate text based on ASR output. max_new_tokens bounds only the newly
    # generated part, so a long transcript cannot exhaust the budget the way
    # a total max_length would.
    generated_text = generator(
        asr_output,
        max_new_tokens=100,
        num_return_sequences=1,
    )[0]['generated_text']

    # Generate audio from text. The seed is fixed before the forward pass so
    # repeated calls with the same text are reproducible.
    inputs = tokenizer(text=generated_text, return_tensors="pt")
    set_seed(555)
    with torch.no_grad():
        outputs = model(**inputs)
    waveform = outputs.waveform[0]

    # Write to a unique temporary file so simultaneous requests do not
    # overwrite each other's output. The file is not deleted here because
    # the caller still needs to read it.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        waveform_path = tmp.name
    sf.write(
        waveform_path,
        waveform.numpy(),
        model.config.sampling_rate,  # use the model's own rate, not a literal
        format='wav',
    )

    return waveform_path
39
+
40
# Build the web UI: microphone recording in, audio out.
audio_input = gr.Interface(
    fn=transcribe_generate_and_speak,
    inputs=gr.Audio(sources=["microphone"], label="Speak Here"),
    outputs="audio",
    title="ASR -> LLM -> TTS",
    description="Speak into the microphone and hear the generated audio.",
)

# Start serving the interface.
audio_input.launch()