camanalo1 committed
Commit 4ae34f2 · verified · 1 Parent(s): 311f586

Update app.py

Files changed (1)
1. app.py +23 -30
app.py CHANGED
@@ -1,41 +1,34 @@
 import gradio as gr
 from transformers import pipeline, VitsTokenizer, VitsModel, set_seed
+import numpy as np
+import torch
+import io
 import soundfile as sf

 # Initialize ASR pipeline
-transcriber = pipeline("automatic-speech-recognition", model="facebook/s2t-small-librispeech-asr")
-
-# Initialize LLM pipeline
-generator = pipeline("text-generation", model="microsoft/Phi-3-mini-4k-instruct", trust_remote_code=True)
-
-# Initialize TTS tokenizer and model
-tokenizer_tts = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
-model_tts = VitsModel.from_pretrained("facebook/mms-tts-eng")
-
 print("TTS Tokenizer:", tokenizer_tts) # Print the tokenizer for the TTS model

 def transcribe_and_generate_audio(audio):
-    try:
-        # Transcribe audio
-        asr_output = transcriber(audio)["text"]
-        print("ASR Output:", asr_output)
-
-        # Generate text based on ASR output
-        generated_text = generator(asr_output)[0]['generated_text']
-        print("Generated Text:", generated_text)
-
-        # Generate audio from text using TTS model
-        inputs = tokenizer_tts(text=generated_text, return_tensors="pt")
-        set_seed(555)
-        with torch.no_grad():
-            outputs = model_tts(**inputs)
-        waveform = outputs.waveform[0]
-        waveform_path = "output.wav"
-        sf.write(waveform_path, waveform.numpy(), 16000, format='wav')
-
-        return waveform_path, asr_output, generated_text
-    except Exception as e:
-        return f"Error: {str(e)}"
+    sr, y = audio
+    y = y.astype(np.float32)
+    y /= np.max(np.abs(y))
+
+    # Transcribe audio
+    asr_output = transcriber({"sampling_rate": sr, "raw": y})["text"]
+
+    # Generate text based on ASR output
+    generated_text = generator(asr_output)[0]['generated_text']
+
+    # Generate audio from text using TTS model
+    inputs = tokenizer_tts(text=generated_text, return_tensors="pt")
+    set_seed(555)
+    with torch.no_grad():
+        outputs = model_tts(**inputs)
+    waveform = outputs.waveform[0]
+    waveform_path = "output.wav"
+    sf.write(waveform_path, waveform.numpy(), 16000, format='wav')
+
+    return waveform_path

 # Define Gradio interface
 audio_input = gr.Interface(
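
The updated transcribe_and_generate_audio still calls transcriber, generator, tokenizer_tts, and model_tts, while this hunk removes their module-level initialization without re-adding it; unless those objects are defined elsewhere in app.py, the function raises NameError at call time. A minimal sketch of the setup the function assumes, mirroring the checkpoints from the removed lines:

# Minimal sketch, not part of this commit: the module-level setup that
# transcribe_and_generate_audio assumes. It mirrors the initialization removed
# in this hunk; if nothing equivalent exists elsewhere in app.py, the function
# will fail with NameError when called.
from transformers import pipeline, VitsTokenizer, VitsModel

# ASR: speech -> text
transcriber = pipeline("automatic-speech-recognition", model="facebook/s2t-small-librispeech-asr")

# LLM: text -> text continuation of the transcript
generator = pipeline("text-generation", model="microsoft/Phi-3-mini-4k-instruct", trust_remote_code=True)

# TTS: text -> waveform
tokenizer_tts = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
model_tts = VitsModel.from_pretrained("facebook/mms-tts-eng")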
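
The gr.Interface(...) definition itself is cut off by this hunk, so the actual wiring is not shown. The new sr, y = audio unpacking matches what a Gradio Audio component with type="numpy" passes (a (sampling_rate, numpy_array) tuple), and the function returns a file path, so one hypothetical wiring, with every argument below an illustrative assumption rather than the committed code, could look like:

# Hypothetical wiring only; the real gr.Interface arguments are not visible in this hunk.
import gradio as gr

audio_input = gr.Interface(
    fn=transcribe_and_generate_audio,
    # type="numpy" delivers the recording as a (sampling_rate, numpy_array) tuple,
    # matching `sr, y = audio` above. `sources=` is the Gradio 4.x name; older
    # releases use source="microphone".
    inputs=gr.Audio(sources=["microphone"], type="numpy"),
    # The function returns the path to output.wav, which type="filepath" accepts.
    outputs=gr.Audio(type="filepath"),
)

audio_input.launch()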