preston-cell committed on
Commit
3ce024b
·
verified ·
1 Parent(s): 4c5db6e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -36
app.py CHANGED
@@ -1,50 +1,35 @@
1
  import gradio as gr
2
  from transformers import pipeline
3
- from transformers import AutoConfig
4
- import numpy as np
5
- from generator import load_csm_1b
6
- import torchaudio
7
-
8
- # Load the configuration manually
9
- config = AutoConfig.from_pretrained("sesame/csm-1b")
10
-
11
- # Load the model with config
12
- generator = load_csm_1b(device="cpu", config=config)
13
 
14
  # Load image-to-text model
15
  captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
16
 
17
  def process_image(input_image):
18
  try:
19
- # Generate caption
20
  caption = captioner(input_image)[0]['generated_text']
 
 
 
21
 
22
- # Generate speech using CSM
23
- audio = generator.generate(
24
- text=caption,
25
- speaker=0,
26
- context=[],
27
- max_audio_length_ms=10_000,
28
- )
29
-
30
- # Convert the audio tensor to NumPy for Gradio
31
- audio_np = audio.unsqueeze(0).cpu().numpy()
32
 
33
- return (audio_np, generator.sample_rate), caption
 
 
34
 
35
- except Exception as e:
36
- return str(e), "Error generating caption or speech."
 
 
37
 
38
- # Set up Gradio UI
39
- iface = gr.Interface(
40
- fn=process_image,
41
- inputs=gr.Image(type='pil', label="Upload Image"),
42
- outputs=[
43
- gr.Audio(type="numpy", label="Generated Speech"),
44
- gr.Textbox(label="Generated Caption")
45
- ],
46
- title="🎙️ SeeSay with CSM",
47
- description="Upload an image to generate a caption and hear it narrated using CSM."
48
- )
49
 
50
- iface.launch(share=True)
 
1
  import gradio as gr
2
  from transformers import pipeline
 
 
 
 
 
 
 
 
 
 
3
 
4
  # Load image-to-text model
5
  captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
6
 
7
  def process_image(input_image):
8
  try:
9
+ # Step 1: Generate caption
10
  caption = captioner(input_image)[0]['generated_text']
11
+ return caption
12
+ except Exception as e:
13
+ return str(e)
14
 
15
+ # Set up Gradio app
16
+ with gr.Blocks(fill_height=True) as demo:
17
+ with gr.Sidebar():
18
+ gr.Markdown("# SeeSay - Powered by Sesame CSM")
19
+ gr.Markdown("This Space extracts captions from images and generates expressive speech using CSM.")
20
+ gr.Markdown("Sign in with your Hugging Face account to access the model.")
21
+ button = gr.LoginButton("Sign in")
 
 
 
22
 
23
+ # Image Upload and Caption Generation
24
+ image_input = gr.Image(type="pil", label="Upload Image")
25
+ caption_output = gr.Textbox(label="Generated Caption")
26
 
27
+ # Speech Generation using CSM
28
+ with gr.Row():
29
+ gr.Markdown("### Speech Generation")
30
+ gr.load("models/sesame/csm-1b", accept_token=button, provider="hf-inference")
31
 
32
+ # Link input and output
33
+ image_input.change(fn=process_image, inputs=image_input, outputs=caption_output)
 
 
 
 
 
 
 
 
 
34
 
35
+ demo.launch()