preston-cell committed on
Commit
602e80d
·
verified ·
1 Parent(s): 0bf1b01

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +38 -8
app.py CHANGED
@@ -1,23 +1,53 @@
1
  import gradio as gr
2
  from transformers import pipeline
 
 
 
# Load BLIP model for image captioning (created once at import time so the
# model is ready before any request is handled).
caption_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
7
def generate_caption(image):
    """Generate a text caption for *image* with the BLIP pipeline.

    Returns the caption string on success, or an "Error: ..." message
    if the captioning pipeline raises.
    """
    try:
        # The pipeline returns a list of dicts; the first entry holds the text.
        return caption_model(image)[0]['generated_text']
    except Exception as e:  # broad by design: surface any failure in the UI
        return f"Error: {str(e)}"
# Simple Gradio interface: one image in, one caption string out.
iface = gr.Interface(
    fn=generate_caption,
    inputs=gr.Image(type='pil', label="Upload an Image"),
    outputs=gr.Textbox(label="Generated Caption"),
    title="Image Captioning with BLIP",
    description="Quickly generate a caption from your uploaded image."
)

iface.launch()
 
 
1
  import gradio as gr
2
  from transformers import pipeline
3
+ from datasets import load_dataset
4
+ import torch
5
+ import numpy as np
6
 
# Load BLIP model for image captioning
caption_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")

# Load SpeechT5 model for text-to-speech
synthesiser = pipeline("text-to-speech", model="microsoft/speecht5_tts")

# Fetch the CMU ARCTIC x-vector dataset once and pin a fixed speaker
# embedding (index 7306) so every synthesis uses the same voice.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
xvector = embeddings_dataset[7306]["xvector"]
speaker_embedding = torch.tensor(xvector).unsqueeze(0)
18
def process_image(image):
    """Caption *image* with BLIP, then speak the caption with SpeechT5.

    Returns ((sampling_rate, audio_array), caption) — the tuple shapes
    expected by the Gradio Audio and Textbox outputs — or
    (None, "Error: ...") when either model call fails.
    """
    try:
        # BLIP returns a list of {"generated_text": ...} dicts.
        caption = caption_model(image)[0]['generated_text']

        # Synthesize speech for the caption using the preloaded x-vector.
        speech = synthesiser(
            caption,
            forward_params={"speaker_embeddings": speaker_embedding},
        )

        # Gradio's Audio component accepts a (rate, ndarray) pair.
        return (speech["sampling_rate"], np.array(speech["audio"])), caption
    except Exception as e:  # broad by design: show the failure in the UI
        return None, f"Error: {str(e)}"
38
 
39
+
# Gradio Interface: one image in, two outputs (spoken audio + caption text).
iface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type='pil', label="Upload an Image"),
    outputs=[
        gr.Audio(label="Generated Audio"),
        gr.Textbox(label="Generated Caption")
    ],
    title="SeeSay",
    description="Upload an image to generate a caption and hear it described with SpeechT5's speech synthesis."
)

iface.launch()