preston-cell committed on
Commit
520c499
·
verified ·
1 Parent(s): eade8cd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -16
app.py CHANGED
@@ -6,26 +6,26 @@ pipe = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
6
 
7
  narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")
8
 
9
- def launch(input):
10
- # Step 1: Extract text from image
11
- caption = pipe(input_image)[0]['generated_text']
12
 
13
- # Step 2: Generate speech from the caption
14
  audio_output = narrator(caption)
 
 
15
 
16
- # Step 3: Save the audio to a temporary file
17
- audio_data = audio_output["audio"]
18
- sampling_rate = audio_output["sampling_rate"]
19
 
20
- # Gradio expects a tuple: (numpy_array, sampling_rate)
21
- return (np.array(audio_data), sampling_rate)
22
-
23
- iface = gr.Interface(launch,
24
  fn=launch,
25
- inputs=gr.Image(type='pil'),
26
- outputs=gr.Audio(type="numpy", label="Narrated Output"),
 
 
 
27
  title="SeeSay",
28
  description="Upload an image to hear its context narrated aloud."
29
- )
30
-
31
- iface.launch()
 
6
 
7
  narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")
8
 
9
def launch(input_image):
    """Caption an image and narrate the caption aloud.

    Parameters
    ----------
    input_image : PIL.Image.Image
        Image from the Gradio ``gr.Image(type='pil')`` input component.

    Returns
    -------
    tuple
        ``((sample_rate, audio_array), caption)``.  Gradio's ``gr.Audio``
        with ``type="numpy"`` expects the tuple ordered as
        ``(sample_rate, data)`` — the previous revision returned
        ``(data, sample_rate)``, which Gradio cannot play back correctly.
    """
    # Step 1: extract a caption from the image via the BLIP pipeline.
    caption = pipe(input_image)[0]["generated_text"]

    # Step 2: synthesize speech for the caption via the VITS pipeline.
    audio_output = narrator(caption)

    # VITS pipelines typically emit audio shaped (1, N); squeeze to a
    # 1-D array so Gradio receives mono samples it can render.
    audio_array = np.squeeze(np.array(audio_output["audio"]))
    sample_rate = audio_output["sampling_rate"]

    # Step 3: return audio in Gradio's (sample_rate, data) order, plus
    # the caption for the Textbox output.
    return (sample_rate, audio_array), caption
 
20
 
21
# Build the UI with keyword arguments only — an earlier revision passed
# `launch` both positionally and as `fn=`, which raises a TypeError.
iface = gr.Interface(
    fn=launch,
    inputs=gr.Image(type='pil', label="Upload Image"),
    outputs=[
        gr.Audio(type="numpy", label="Narrated Audio"),
        gr.Textbox(label="Extracted Caption"),
    ],
    title="SeeSay",
    description="Upload an image to hear its context narrated aloud.",
)

# NOTE(review): the previous revision started the app here; this call was
# dropped in the update, leaving the interface built but never served.
iface.launch()