preston-cell committed
Commit 629e04f · verified · 1 parent: 26dbd13

Update app.py

Files changed (1): app.py (+24 −33)
app.py CHANGED
@@ -1,44 +1,35 @@
 import gradio as gr
-import torch
-from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
-from datasets import load_dataset
-
-device = "cpu"
-torch_dtype = torch.float32
-
-# Load the Whisper model
-model_id = "openai/whisper-large-v3-turbo"
-model = AutoModelForSpeechSeq2Seq.from_pretrained(
-    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True
-)
-model.to(device)
-
-processor = AutoProcessor.from_pretrained(model_id)
-
-pipe = pipeline(
-    "text-to-speech",
-    model=model,
-    tokenizer=processor.tokenizer,
-    feature_extractor=processor.feature_extractor,
-    torch_dtype=torch_dtype,
-    device=device,
-)
-
-def generate_caption_and_speech(input_image):
-    # Generate caption
-    caption = captioner(input_image)[0]['generated_text']
-
-    # Generate speech from caption
-    audio_output = pipe(caption)
-
-    return (np.array(audio_output["audio"]), audio_output["sampling_rate"]), caption
-
-# Gradio Interface
+from transformers import pipeline
+
+# Load the Whisper model for generating speech
+speech_model = pipeline("text-to-speech", model="openai/whisper-large-v3-turbo")
+
+# Load the BLIP model for image captioning
+caption_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
+
+def generate_caption_and_speech(image):
+    try:
+        # Generate a caption from the image
+        caption = caption_model(image)[0]['generated_text']
+
+        # Generate speech using the caption
+        speech = speech_model(caption)
+
+        # Return both the caption and the audio
+        return speech["audio"], caption
+    except Exception as e:
+        return str(e), ""
+
+# Set up the Gradio interface
 iface = gr.Interface(
     fn=generate_caption_and_speech,
     inputs=gr.Image(type='pil', label="Upload Image"),
-    outputs=[gr.Audio(type="numpy", label="Generated Audio"), gr.Textbox(label="Generated Caption")],
-    title="SeeSay",
-    description="Upload an image to generate a caption and hear it described with speech."")
-
-iface.launch(share=True)
+    outputs=[
+        gr.Audio(type="filepath", label="Generated Audio"),
+        gr.Textbox(label="Generated Caption")
+    ],
+    title="SeeSay: Image to Speech",
+    description="Upload an image to generate a caption and hear it described with speech."
+)
+
+iface.launch(share=True)
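A note on the updated code: openai/whisper-large-v3-turbo is a speech-recognition (audio-to-text) checkpoint, so loading it under the text-to-speech pipeline task will most likely fail at load or inference time, and the function returns a raw NumPy waveform while the interface declares gr.Audio(type="filepath"). Below is a minimal sketch of how the app could be wired up instead, assuming a dedicated TTS checkpoint (facebook/mms-tts-eng here, as an illustrative substitute; any transformers text-to-speech model should work) and a numpy-typed audio output. This is a sketch under those assumptions, not the committed code.

import gradio as gr
import numpy as np
from transformers import pipeline

# Image captioning (same checkpoint the commit uses)
caption_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")

# Text-to-speech with an actual TTS checkpoint (assumption: facebook/mms-tts-eng)
speech_model = pipeline("text-to-speech", model="facebook/mms-tts-eng")

def generate_caption_and_speech(image):
    # Caption the uploaded PIL image
    caption = caption_model(image)[0]["generated_text"]
    # The TTS pipeline returns {"audio": ndarray, "sampling_rate": int}
    speech = speech_model(caption)
    # Squeeze a possible (1, n)-shaped waveform down to (n,) for Gradio
    audio = np.squeeze(speech["audio"])
    # gr.Audio(type="numpy") expects a (sampling_rate, waveform) tuple
    return (speech["sampling_rate"], audio), caption

iface = gr.Interface(
    fn=generate_caption_and_speech,
    inputs=gr.Image(type="pil", label="Upload Image"),
    outputs=[
        gr.Audio(type="numpy", label="Generated Audio"),
        gr.Textbox(label="Generated Caption"),
    ],
    title="SeeSay: Image to Speech",
    description="Upload an image to generate a caption and hear it described with speech.",
)

iface.launch()

Returning (sampling_rate, waveform) matches what gr.Audio(type="numpy") expects, which also avoids the committed version's mismatch where an error string from the except branch would be handed to a filepath-typed audio component.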