preston-cell committed on
Commit
26dbd13
·
verified ·
1 Parent(s): 005e1c9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -17
app.py CHANGED
@@ -16,7 +16,7 @@ model.to(device)
16
  processor = AutoProcessor.from_pretrained(model_id)
17
 
18
  pipe = pipeline(
19
- "automatic-speech-recognition",
20
  model=model,
21
  tokenizer=processor.tokenizer,
22
  feature_extractor=processor.feature_extractor,
@@ -24,20 +24,21 @@ pipe = pipeline(
24
  device=device,
25
  )
26
 
27
def transcribe(audio_path):
    """Transcribe an audio file with the Whisper ASR pipeline.

    Parameters:
        audio_path: path to the uploaded audio file (Gradio filepath input).

    Returns:
        The recognized text, or the error message as a string if anything
        goes wrong (best-effort: errors are shown in the UI, not raised).
    """
    try:
        # Both the pipeline call and the "text" lookup are guarded so any
        # failure surfaces as a readable message in the output textbox.
        return pipe(audio_path)["text"]
    except Exception as err:
        return str(err)
34
-
35
# Wire the transcription function into a simple Gradio UI and launch it.
audio_input = gr.Audio(type="filepath")
text_output = gr.Textbox()

demo = gr.Interface(
    fn=transcribe,
    inputs=audio_input,
    outputs=text_output,
    title="Whisper Speech Recognition",
    description="Upload an audio file to transcribe using Whisper large-v3-turbo.",
)

demo.launch(share=True)
 
16
  processor = AutoProcessor.from_pretrained(model_id)
17
 
18
  pipe = pipeline(
19
+ "text-to-speech",
20
  model=model,
21
  tokenizer=processor.tokenizer,
22
  feature_extractor=processor.feature_extractor,
 
24
  device=device,
25
  )
26
 
27
def generate_caption_and_speech(input_image):
    """Caption an image, then synthesize speech reading the caption aloud.

    Parameters:
        input_image: PIL image from the Gradio image input.

    Returns:
        A pair ``((sample_rate, waveform), caption)`` — the audio tuple in
        the order Gradio's ``Audio(type="numpy")`` component expects, plus
        the generated caption text for the textbox output.
    """
    # Generate a caption from the image (captioner is a module-level
    # image-to-text pipeline defined earlier in the file).
    caption = captioner(input_image)[0]['generated_text']

    # Generate speech from the caption via the text-to-speech pipeline.
    audio_output = pipe(caption)

    # FIX: gr.Audio(type="numpy") expects (sample_rate, data); the original
    # returned (data, sample_rate), which Gradio cannot play.
    return (audio_output["sampling_rate"], np.array(audio_output["audio"])), caption
35
+
36
# Gradio interface: image upload in, (generated audio, caption text) out.
iface = gr.Interface(
    fn=generate_caption_and_speech,
    inputs=gr.Image(type='pil', label="Upload Image"),
    outputs=[
        gr.Audio(type="numpy", label="Generated Audio"),
        gr.Textbox(label="Generated Caption"),
    ],
    title="SeeSay",
    # FIX: removed the stray doubled quote after "speech." that made the
    # original line a SyntaxError (unterminated string literal).
    description="Upload an image to generate a caption and hear it described with speech.",
)

iface.launch(share=True)