Taizun commited on
Commit
aa5fa68
·
verified ·
1 Parent(s): acd681c

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +39 -0
app.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import gradio as gr
3
+ from PIL import Image
4
+ import scipy.io.wavfile as wavfile
5
+ from transformers import pipeline
6
+
7
# Select the compute device once at startup; transformers pipelines accept
# a device string such as "cuda" or "cpu".
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load models once at module import so every request reuses them.
# BLIP-large generates an English caption for an input image.
caption_image = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large", device=device)
# VITS synthesizes speech from text. Fix: the original omitted `device=`,
# inconsistently leaving the TTS model on CPU even when CUDA is available.
narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs", device=device)
14
def generate_audio(text, output_path="output.wav"):
    """Synthesize speech for *text* and write it to a WAV file.

    Args:
        text: The text to narrate.
        output_path: Destination WAV file. Defaults to "output.wav",
            preserving the original hard-coded behavior.

    Returns:
        The path of the written WAV file (usable directly as a
        ``gr.Audio`` output value).
    """
    narrated = narrator(text)
    # The TTS pipeline returns {"audio": ndarray, "sampling_rate": int};
    # the audio array carries a leading batch axis, so row 0 is the waveform.
    # NOTE(review): assumes a batch of exactly one — confirm for this model.
    wavfile.write(output_path, rate=narrated["sampling_rate"], data=narrated["audio"][0])
    return output_path
19
+
20
def caption_my_image(pil_image):
    """Caption *pil_image* with the captioning model, then narrate it.

    Returns the path to the generated WAV narration file.
    """
    predictions = caption_image(images=pil_image)
    caption_text = predictions[0]["generated_text"]
    audio_path = generate_audio(caption_text)
    return audio_path
24
+
25
# Define the Gradio interface: one PIL image in, one audio file path out,
# wired to caption_my_image above.
demo = gr.Interface(
    fn=caption_my_image,
    inputs=[gr.Image(label="Upload Your Image", type="pil")],
    outputs=[gr.Audio(label="Generated Audio Caption")],
    title="Image Captioning and Narration",
    description=(
        "Upload an image to generate a descriptive caption and listen to its narration. "
        "This application is powered by AI tools and brought to you by **Taizun**."
    ),
    theme="compact"  # Use a minimalistic theme
    # NOTE(review): string theme names were deprecated in Gradio 4.x in favor
    # of theme objects (e.g. gr.themes.Base()) — confirm the installed
    # Gradio version still accepts "compact".
)

# Launch the application (blocking call; starts the local web server).
demo.launch()