Spaces:

preston-cell
/

image-text-to-text

Sleeping

App Files Community

preston-cell committed on Apr 11

Commit

a483c36

verified ·

1 Parent(s): dbabbd4

Update app.py

Browse files

Files changed (1) hide show

app.py +36 -39

app.py CHANGED Viewed

@@ -5,39 +5,36 @@ from transformers import (
     AutoModelForCausalLM,
     AutoTokenizer,
     GenerationConfig,
     set_seed
 )
 from datasets import load_dataset
 import torch
 import numpy as np
-# Set seed for reproducibility
 set_seed(42)
-# Load BLIP model for image captioning
 caption_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
-# Load SpeechT5 model for text-to-speech
 synthesiser = pipeline("text-to-speech", model="microsoft/speecht5_tts")
-# Load Florence-2 model for OCR
-ocr_device = "cuda" if torch.cuda.is_available() else "cpu"
-ocr_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
 ocr_model = AutoModelForCausalLM.from_pretrained(
-    "microsoft/Florence-2-large",
-    torch_dtype=ocr_dtype,
-    trust_remote_code=True
-).to(ocr_device)
 ocr_processor = AutoProcessor.from_pretrained("microsoft/Florence-2-large", trust_remote_code=True)
-# Load Doge-320M-Instruct model for context generation
 doge_tokenizer = AutoTokenizer.from_pretrained("SmallDoge/Doge-320M-Instruct")
 doge_model = AutoModelForCausalLM.from_pretrained(
-    "SmallDoge/Doge-320M-Instruct",
-    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-    trust_remote_code=True
-).to("cuda" if torch.cuda.is_available() else "cpu")
 doge_generation_config = GenerationConfig(
     max_new_tokens=100,
     use_cache=True,
@@ -47,26 +44,26 @@ doge_generation_config = GenerationConfig(
     repetition_penalty=1.0
 )
-# Load and pad/truncate speaker embedding to exactly 600 dimensions
-embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
-raw_vec = embeddings_dataset[0]["xvector"]
-# Force embedding to 600 dimensions
-if len(raw_vec) > 600:
-    raw_vec = raw_vec[:600]
-elif len(raw_vec) < 600:
-    raw_vec = raw_vec + [0.0] * (600 - len(raw_vec))
-speaker_embedding = torch.tensor(raw_vec, dtype=torch.float32).unsqueeze(0)  # shape [1, 600]
-assert speaker_embedding.shape == (1, 600), f"Speaker embedding shape is {speaker_embedding.shape}, expected (1, 600)"
 def process_image(image):
     try:
-        # Step 1: Generate caption
         caption = caption_model(image)[0]['generated_text']
-        # Step 2: OCR to extract text
-        inputs = ocr_processor(text="<OCR>", images=image, return_tensors="pt").to(ocr_device, ocr_dtype)
         generated_ids = ocr_model.generate(
             input_ids=inputs["input_ids"],
             pixel_values=inputs["pixel_values"],
@@ -76,23 +73,23 @@ def process_image(image):
         )
         extracted_text = ocr_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-        # Step 3: Generate context using Doge model
         prompt = f"Determine the context of this image based on the caption and extracted text.\nCaption: {caption}\nExtracted text: {extracted_text}\nContext:"
         conversation = [{"role": "user", "content": prompt}]
         doge_inputs = doge_tokenizer.apply_chat_template(
             conversation=conversation,
             tokenize=True,
             return_tensors="pt"
-        ).to(doge_model.device)
-        doge_outputs = doge_model.generate(
-            doge_inputs,
             generation_config=doge_generation_config
         )
-        context = doge_tokenizer.decode(doge_outputs[0], skip_special_tokens=True).strip()
-        # Step 4: Convert context to speech
         speech = synthesiser(
             context,
             forward_params={"speaker_embeddings": speaker_embedding}
@@ -118,7 +115,7 @@ iface = gr.Interface(
         gr.Textbox(label="Generated Context")
     ],
     title="SeeSay Contextualizer with Doge-320M",
-    description="Upload an image to generate a caption, extract text (OCR), generate context using Doge, and turn it into speech using SpeechT5."
 )
-iface.launch()

     AutoModelForCausalLM,
     AutoTokenizer,
     GenerationConfig,
+    TextStreamer,
     set_seed
 )
 from datasets import load_dataset
 import torch
 import numpy as np
 set_seed(42)
+# Device and dtype
+device = "cuda" if torch.cuda.is_available() else "cpu"
+dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+# Load image captioning model
 caption_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
+# Load SpeechT5 text-to-speech model
 synthesiser = pipeline("text-to-speech", model="microsoft/speecht5_tts")
+# Load OCR model (Florence-2)
 ocr_model = AutoModelForCausalLM.from_pretrained(
+    "microsoft/Florence-2-large", torch_dtype=dtype, trust_remote_code=True
+).to(device)
 ocr_processor = AutoProcessor.from_pretrained("microsoft/Florence-2-large", trust_remote_code=True)
+# Load Doge-320M-Instruct for context generation
 doge_tokenizer = AutoTokenizer.from_pretrained("SmallDoge/Doge-320M-Instruct")
 doge_model = AutoModelForCausalLM.from_pretrained(
+    "SmallDoge/Doge-320M-Instruct", trust_remote_code=True
+).to(device)
 doge_generation_config = GenerationConfig(
     max_new_tokens=100,
     use_cache=True,
     repetition_penalty=1.0
 )
+# Load speaker embedding with exactly 600 values
+speaker_embedding = None
+embedding_data = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+for entry in embedding_data:
+    vec = entry["xvector"]
+    if len(vec) >= 600:
+        speaker_embedding = torch.tensor(vec[:600], dtype=torch.float32).unsqueeze(0)  # Shape: [1, 600]
+        break
+if speaker_embedding is None:
+    raise ValueError("No suitable speaker embedding of at least 600 dimensions found.")
+assert speaker_embedding.shape == (1, 600), f"Expected shape (1, 600), got {speaker_embedding.shape}"
 def process_image(image):
     try:
+        # 1. Caption the image
         caption = caption_model(image)[0]['generated_text']
+        # 2. OCR with Florence-2
+        inputs = ocr_processor(text="<OCR>", images=image, return_tensors="pt").to(device, dtype)
         generated_ids = ocr_model.generate(
             input_ids=inputs["input_ids"],
             pixel_values=inputs["pixel_values"],
         )
         extracted_text = ocr_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+        # 3. Prompt Doge model for context generation
         prompt = f"Determine the context of this image based on the caption and extracted text.\nCaption: {caption}\nExtracted text: {extracted_text}\nContext:"
+        prompt = prompt[:600]  # Ensure prompt isn't too long
         conversation = [{"role": "user", "content": prompt}]
         doge_inputs = doge_tokenizer.apply_chat_template(
             conversation=conversation,
             tokenize=True,
             return_tensors="pt"
+        ).to(device)
+        doge_output = doge_model.generate(
+            input_ids=doge_inputs,
             generation_config=doge_generation_config
         )
+        context = doge_tokenizer.decode(doge_output[0], skip_special_tokens=True).strip()
+        # 4. Convert context to speech
         speech = synthesiser(
             context,
             forward_params={"speaker_embeddings": speaker_embedding}
         gr.Textbox(label="Generated Context")
     ],
     title="SeeSay Contextualizer with Doge-320M",
+    description="Upload an image to caption it, extract text, generate context, and hear the result as speech."
 )
+iface.launch(share=True)