preston-cell committed (verified)
Commit 005e1c9 · Parent(s): 5379d69

Update app.py

Files changed (1):
  1. app.py +35 -27
app.py CHANGED
@@ -1,35 +1,43 @@
 import gradio as gr
-from transformers import pipeline
-
-# Load image-to-text model
-captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
-
-def process_image(input_image):
-    try:
-        # Step 1: Generate caption
-        caption = captioner(input_image)[0]['generated_text']
-        return caption
-    except Exception as e:
-        return str(e)
-
-# Set up Gradio app
-with gr.Blocks(fill_height=True) as demo:
-    with gr.Sidebar():
-        gr.Markdown("# SeeSay - Powered by Sesame CSM")
-        gr.Markdown("This Space extracts captions from images and generates expressive speech using CSM.")
-        gr.Markdown("Sign in with your Hugging Face account to access the model.")
-        button = gr.LoginButton("Sign in")
-
-    # Image Upload and Caption Generation
-    image_input = gr.Image(type="pil", label="Upload Image")
-    caption_output = gr.Textbox(label="Generated Caption")
-
-    # Speech Generation using CSM
-    with gr.Row():
-        gr.Markdown("### Speech Generation")
-        gr.load("models/sesame/csm-1b", accept_token=button, provider="hf-inference")
-
-    # Link input and output
-    image_input.change(fn=process_image, inputs=image_input, outputs=caption_output)
-
-demo.launch()
+import torch
+from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
+from datasets import load_dataset
+
+device = "cpu"
+torch_dtype = torch.float32
+
+# Load the Whisper model
+model_id = "openai/whisper-large-v3-turbo"
+model = AutoModelForSpeechSeq2Seq.from_pretrained(
+    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True
+)
+model.to(device)
+
+processor = AutoProcessor.from_pretrained(model_id)
+
+pipe = pipeline(
+    "automatic-speech-recognition",
+    model=model,
+    tokenizer=processor.tokenizer,
+    feature_extractor=processor.feature_extractor,
+    torch_dtype=torch_dtype,
+    device=device,
+)
+
+def transcribe(audio_path):
+    try:
+        # Transcribe the audio using Whisper
+        result = pipe(audio_path)
+        return result["text"]
+    except Exception as e:
+        return str(e)
+
+demo = gr.Interface(
+    fn=transcribe,
+    inputs=gr.Audio(type="filepath"),
+    outputs=gr.Textbox(),
+    title="Whisper Speech Recognition",
+    description="Upload an audio file to transcribe using Whisper large-v3-turbo."
+)
+
+demo.launch(share=True)
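Note: the new file imports load_dataset but never calls it, and it pins device = "cpu" with torch.float32 (a reasonable choice for free Space hardware; on a GPU the usual pattern is "cuda:0" with float16). A minimal standalone sketch of how that import could drive a smoke test of the same pipeline outside the Gradio UI; the dataset name and split are assumptions taken from common Whisper usage examples, not from this commit:

# Hypothetical smoke test; self-contained, does not reuse objects from app.py.
import torch
from datasets import load_dataset
from transformers import pipeline

# Same model and CPU/float32 settings as the commit above.
asr = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large-v3-turbo",
    torch_dtype=torch.float32,
    device="cpu",
)

# Assumed sample dataset (not referenced anywhere in this commit).
dataset = load_dataset("distil-whisper/librispeech_long", "clean", split="validation")
sample = dataset[0]["audio"]  # dict with "array" and "sampling_rate"

print(asr(sample)["text"])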