import gradio as gr
from transformers import (
    pipeline,
    AutoProcessor,
    AutoModelForCausalLM,
    AutoTokenizer,
    set_seed
)
from datasets import load_dataset
import torch
import numpy as np
# Set seed
set_seed(42)
# Captioning model
caption_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
# GPT-2 model for context generation
gpt2_generator = pipeline("text-generation", model="gpt2")
# SpeechT5 for text-to-speech
synthesiser = pipeline("text-to-speech", model="microsoft/speecht5_tts")
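# Note: the SpeechT5 pipeline needs a speaker embedding at inference time;
# one is loaded below from the CMU Arctic x-vectors dataset.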
# Load Florence-2-base for OCR
ocr_device = "cuda" if torch.cuda.is_available() else "cpu"
ocr_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
ocr_model = AutoModelForCausalLM.from_pretrained("microsoft/Florence-2-base", torch_dtype=ocr_dtype, trust_remote_code=True).to(ocr_device)
ocr_processor = AutoProcessor.from_pretrained("microsoft/Florence-2-base", trust_remote_code=True)
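# Florence-2 is driven by task-prompt tokens (e.g. "<OCR>") rather than free-form
# prompts; trust_remote_code=True is required because the model ships its own
# modeling and processing code.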
# Load speaker embedding
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
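# Index 7306 picks a single fixed x-vector from the validation split, so every
# generation uses the same voice.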
def process_image(image):
    try:
        # Generate caption
        caption = caption_model(image)[0]['generated_text']

        # Extract OCR text with Florence-2
        inputs = ocr_processor(text="<OCR>", images=image, return_tensors="pt").to(ocr_device, ocr_dtype)
        generated_ids = ocr_model.generate(
            input_ids=inputs["input_ids"],
            pixel_values=inputs["pixel_values"],
            max_new_tokens=1024,
            num_beams=3,
            do_sample=False
        )
        extracted_text = ocr_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

        # Generate context with GPT-2 (max_new_tokens so the budget applies to the
        # continuation rather than to prompt + continuation, since the prompt with
        # caption and OCR text can easily exceed 100 tokens)
        prompt = f"Determine the context of this image based on the caption and extracted text. Caption: {caption}. Extracted text: {extracted_text}. Context:"
        context_output = gpt2_generator(prompt, max_new_tokens=100, num_return_sequences=1)
        context = context_output[0]['generated_text']

        # Text-to-speech
        speech = synthesiser(context, forward_params={"speaker_embeddings": speaker_embedding})
        audio = np.array(speech["audio"])
        rate = speech["sampling_rate"]

        return (rate, audio), caption, extracted_text, context
    except Exception as e:
        return None, f"Error: {str(e)}", "", ""
# Gradio UI
iface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type='pil', label="Upload an Image"),
    outputs=[
        gr.Audio(label="Generated Audio"),
        gr.Textbox(label="Generated Caption"),
        gr.Textbox(label="Extracted Text (OCR)"),
        gr.Textbox(label="Generated Context")
    ],
    title="SeeSay Contextualizer",
    description="Upload an image to generate a caption, extract text, create audio from context, and determine the context using GPT-2 and Florence-2-base."
)

iface.launch()
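# launch() blocks and serves the app; on Hugging Face Spaces no arguments are needed,
# while a local run could pass share=True for a temporary public link.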