import gradio as gr
from transformers import (
    pipeline,
    AutoProcessor,
    AutoModelForCausalLM,
    AutoTokenizer,
    GenerationConfig,
    set_seed
)
from datasets import load_dataset
import torch
import numpy as np

# Set seed for reproducibility
set_seed(42)

# Load BLIP model for image captioning
caption_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")

# Load SpeechT5 model for text-to-speech
synthesiser = pipeline("text-to-speech", model="microsoft/speecht5_tts")

# Load Florence-2 model for OCR
ocr_device = "cuda" if torch.cuda.is_available() else "cpu"
ocr_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
ocr_model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Florence-2-large",
    torch_dtype=ocr_dtype,
    trust_remote_code=True
).to(ocr_device)
ocr_processor = AutoProcessor.from_pretrained("microsoft/Florence-2-large", trust_remote_code=True)

# Load Doge-320M-Instruct model for context generation
doge_device = "cuda" if torch.cuda.is_available() else "cpu"
doge_tokenizer = AutoTokenizer.from_pretrained("SmallDoge/Doge-320M-Instruct")
doge_model = AutoModelForCausalLM.from_pretrained(
    "SmallDoge/Doge-320M-Instruct",
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    trust_remote_code=True
).to(doge_device)
doge_generation_config = GenerationConfig(
    max_new_tokens=100,
    use_cache=True,
    do_sample=True,
    temperature=0.8,
    top_p=0.9,
    repetition_penalty=1.0
)

# Load a speaker embedding for SpeechT5 (the model expects 512-dimensional x-vectors)
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embedding = None
for i in range(len(embeddings_dataset)):
    vec = embeddings_dataset[i]["xvector"]
    if len(vec) == 512:
        speaker_embedding = torch.tensor(vec).unsqueeze(0)
        break
if speaker_embedding is None:
    raise ValueError("No suitable 512-dimensional speaker embedding found.")


def process_image(image):
    try:
        # Step 1: Generate caption
        caption = caption_model(image)[0]["generated_text"]

        # Step 2: OCR to extract text (Florence-2 uses the "<OCR>" task prompt)
        task_prompt = "<OCR>"
        inputs = ocr_processor(text=task_prompt, images=image, return_tensors="pt").to(ocr_device, ocr_dtype)
        generated_ids = ocr_model.generate(
            input_ids=inputs["input_ids"],
            pixel_values=inputs["pixel_values"],
            max_new_tokens=4096,
            num_beams=3,
            do_sample=False
        )
        generated_text = ocr_processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
        parsed = ocr_processor.post_process_generation(
            generated_text, task=task_prompt, image_size=(image.width, image.height)
        )
        extracted_text = parsed[task_prompt]

        # Step 3: Generate context using Doge model
        prompt = (
            "Determine the context of this image based on the caption and extracted text.\n"
            f"Caption: {caption}\n"
            f"Extracted text: {extracted_text}\n"
            "Context:"
        )
        conversation = [{"role": "user", "content": prompt}]
        doge_inputs = doge_tokenizer.apply_chat_template(
            conversation=conversation,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt"
        ).to(doge_model.device)
        doge_outputs = doge_model.generate(
            doge_inputs,
            generation_config=doge_generation_config
        )
        # Decode only the newly generated tokens, not the echoed prompt
        context = doge_tokenizer.decode(
            doge_outputs[0][doge_inputs.shape[1]:], skip_special_tokens=True
        ).strip()

        # Step 4: Convert context to speech
        speech = synthesiser(
            context,
            forward_params={"speaker_embeddings": speaker_embedding}
        )
        audio = np.array(speech["audio"])
        rate = speech["sampling_rate"]

        return (rate, audio), caption, extracted_text, context
    except Exception as e:
        return None, f"Error: {str(e)}", "", ""


# Gradio Interface
iface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="pil", label="Upload an Image"),
    outputs=[
        gr.Audio(label="Generated Audio"),
        gr.Textbox(label="Generated Caption"),
        gr.Textbox(label="Extracted Text (OCR)"),
        gr.Textbox(label="Generated Context")
    ],
    title="SeeSay",
    description="Upload an image to generate a caption, extract text (OCR), "
                "generate context using Doge, and turn it into speech using SpeechT5."
)

iface.launch()