import gradio as gr
from transformers import (
    pipeline,
    AutoProcessor,
    AutoModelForCausalLM,
    AutoTokenizer,
    GenerationConfig,
    set_seed
)
from datasets import load_dataset
import torch
import numpy as np
# Set seed for reproducibility
set_seed(42)
# Load BLIP model for image captioning
caption_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
# Load SpeechT5 model for text-to-speech
synthesiser = pipeline("text-to-speech", model="microsoft/speecht5_tts")
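# SpeechT5 requires a speaker x-vector embedding at inference time; one is loaded
# from the CMU Arctic x-vectors dataset below and passed via forward_params.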
# Load Florence-2 model for OCR
ocr_device = "cuda" if torch.cuda.is_available() else "cpu"
ocr_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
ocr_model = AutoModelForCausalLM.from_pretrained(
"microsoft/Florence-2-large",
torch_dtype=ocr_dtype,
trust_remote_code=True
).to(ocr_device)
ocr_processor = AutoProcessor.from_pretrained("microsoft/Florence-2-large", trust_remote_code=True)
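# Florence-2 selects its task via special prompt tokens; the "<OCR>" prompt used in
# process_image() below requests plain-text recognition.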
# Load Doge-320M-Instruct model for context generation
doge_tokenizer = AutoTokenizer.from_pretrained("SmallDoge/Doge-320M-Instruct")
doge_model = AutoModelForCausalLM.from_pretrained(
"SmallDoge/Doge-320M-Instruct",
torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
trust_remote_code=True
).to("cuda" if torch.cuda.is_available() else "cpu")
doge_generation_config = GenerationConfig(
    max_new_tokens=100,
    use_cache=True,
    do_sample=True,
    temperature=0.8,
    top_p=0.9,
    repetition_penalty=1.0
)
# Load a speaker embedding; SpeechT5 expects a 512-dimensional x-vector
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embedding = None
for i in range(len(embeddings_dataset)):
    vec = embeddings_dataset[i]["xvector"]
    if len(vec) >= 512:
        speaker_embedding = torch.tensor(vec[:512]).unsqueeze(0)
        break
if speaker_embedding is None:
    raise ValueError("No suitable 512-dimensional speaker embedding found.")
def process_image(image):
    try:
        # Step 1: Generate caption
        caption = caption_model(image)[0]['generated_text']
        # Step 2: OCR to extract text
        inputs = ocr_processor(text="<OCR>", images=image, return_tensors="pt").to(ocr_device, ocr_dtype)
        generated_ids = ocr_model.generate(
            input_ids=inputs["input_ids"],
            pixel_values=inputs["pixel_values"],
            max_new_tokens=4096,
            num_beams=3,
            do_sample=False
        )
        extracted_text = ocr_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        # Step 3: Generate context using Doge model
        prompt = f"Determine the context of this image based on the caption and extracted text.\nCaption: {caption}\nExtracted text: {extracted_text}\nContext:"
        conversation = [{"role": "user", "content": prompt}]
        doge_inputs = doge_tokenizer.apply_chat_template(
            conversation=conversation,
            tokenize=True,
            return_tensors="pt"
        ).to(doge_model.device)
        doge_outputs = doge_model.generate(
            doge_inputs,
            generation_config=doge_generation_config
        )
        # Decode only the newly generated tokens so the prompt itself is not read aloud
        context = doge_tokenizer.decode(doge_outputs[0][doge_inputs.shape[1]:], skip_special_tokens=True).strip()
        # Step 4: Convert context to speech
        speech = synthesiser(
            context,
            forward_params={"speaker_embeddings": speaker_embedding}
        )
        audio = np.array(speech["audio"])
        rate = speech["sampling_rate"]
        return (rate, audio), caption, extracted_text, context
    except Exception as e:
        return None, f"Error: {str(e)}", "", ""
# Gradio Interface
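# The tuple returned by process_image maps one-to-one onto the output components
# below: audio, caption, OCR text, context.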
iface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type='pil', label="Upload an Image"),
    outputs=[
        gr.Audio(label="Generated Audio"),
        gr.Textbox(label="Generated Caption"),
        gr.Textbox(label="Extracted Text (OCR)"),
        gr.Textbox(label="Generated Context")
    ],
    title="SeeSay",
    description="Upload an image to generate a caption, extract text (OCR), generate context using Doge, and turn it into speech using SpeechT5."
)
iface.launch()