manu committed · Commit f0a3223 · verified · 1 Parent(s): 4988933

Update handler.py

Files changed (1):
  1. handler.py +35 -12
handler.py CHANGED
@@ -1,3 +1,6 @@
+import base64
+import io
+from PIL import Image
 from typing import Dict, List, Any
 from transformers.utils.import_utils import is_flash_attn_2_available
 from colpali_engine.models import ColQwen2, ColQwen2Processor
@@ -9,23 +12,43 @@ class EndpointHandler():
             path,
             torch_dtype=torch.bfloat16,
             device_map="cuda:0",  # or "mps" if on Apple Silicon
-            attn_implementation="flash_attention_2" if is_flash_attn_2_available() else None,  # should work on A100
-        ).eval()
+            attn_implementation="flash_attention_2" if is_flash_attn_2_available() else None,
+        ).eval()
         self.processor = ColQwen2Processor.from_pretrained(path)
 
-    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
+    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
         """
-        data args:
-            inputs (:obj: `str`)
-        Return:
-            A :obj:`list` | `dict`: will be serialized and returned
+        Expects data in the following format:
+        {
+            "images": [
+                "base64_encoded_image1",
+                "base64_encoded_image2",
+                ...
+            ]
+        }
+
+        Decodes each Base64 image into a PIL Image, processes them, and returns the embeddings.
         """
-        # process input
-        images = data.pop("inputs", data)
-        batch_images = self.processor.process_images([images]).to(self.model.device)
-        # Forward pass
+        # Retrieve the list of base64 encoded images
+        base64_images = data.get("images", [])
+        if not isinstance(base64_images, list):
+            base64_images = [base64_images]
+
+        # Decode each image from base64 and convert to a PIL Image
+        decoded_images = []
+        for img_str in base64_images:
+            try:
+                img_data = base64.b64decode(img_str)
+                image = Image.open(io.BytesIO(img_data)).convert("RGB")
+                decoded_images.append(image)
+            except Exception as e:
+                print(f"Error decoding an image: {e}")
+
+        # Process the images using the processor
+        batch_images = self.processor.process_images(decoded_images).to(self.model.device)
+
+        # Forward pass through the model
         with torch.no_grad():
             image_embeddings = self.model(**batch_images).tolist()
 
         return {"embeddings": image_embeddings}
-
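After this commit, the handler expects base64-encoded images under an "images" key rather than a raw "inputs" field, so clients must encode images before posting them. A minimal client-side sketch follows; the endpoint URL, token, and file name are placeholders, not values from this repository:

import base64
import requests

ENDPOINT_URL = "https://<your-endpoint>.endpoints.huggingface.cloud"  # placeholder
HF_TOKEN = "<your-token>"  # placeholder

# Encode a local image to base64, matching the {"images": [...]} payload
# that __call__ reads via data.get("images", [])
with open("page.png", "rb") as f:  # placeholder file name
    b64_image = base64.b64encode(f.read()).decode("utf-8")

response = requests.post(
    ENDPOINT_URL,
    headers={"Authorization": f"Bearer {HF_TOKEN}"},
    json={"images": [b64_image]},
)
response.raise_for_status()

# The handler returns {"embeddings": [...]}, one entry per decoded image
embeddings = response.json()["embeddings"]
print(f"{len(embeddings)} image embedding(s) returned")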
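Because decode failures are only printed and skipped, the response can contain fewer embeddings than images sent; a quick local smoke test before deploying catches malformed payloads. A sketch, assuming the standard Inference Endpoints convention that EndpointHandler is constructed with a model path (the model id below is a placeholder):

import base64
from handler import EndpointHandler

handler = EndpointHandler(path="vidore/<model-id>")  # placeholder model id

with open("page.png", "rb") as f:  # placeholder image
    payload = {"images": [base64.b64encode(f.read()).decode("utf-8")]}

out = handler(payload)
# Each entry is a multi-vector embedding: one vector per image patch token
print(len(out["embeddings"]), "embedding(s);",
      len(out["embeddings"][0]), "vectors in the first")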
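Downstream, these multi-vector embeddings are scored against multi-vector query embeddings with late interaction (MaxSim), the ColPali-family retrieval recipe, rather than a single dot product. A self-contained sketch under stated assumptions: the 128-dim projection and token counts are illustrative only, and the vectors are assumed L2-normalized as ColPali-style models emit them; for real use, prefer the batched scoring utilities shipped with colpali_engine:

import torch

def maxsim_score(query_emb: torch.Tensor, image_emb: torch.Tensor) -> torch.Tensor:
    # query_emb: (n_query_tokens, dim); image_emb: (n_image_tokens, dim)
    sim = query_emb @ image_emb.T       # token-to-token similarities
    return sim.max(dim=1).values.sum()  # best image token per query token, summed

# Stand-in tensors; with real data: image_emb = torch.tensor(embeddings[0])
image_emb = torch.nn.functional.normalize(torch.randn(700, 128), dim=-1)
query_emb = torch.nn.functional.normalize(torch.randn(20, 128), dim=-1)
print(maxsim_score(query_emb, image_emb).item())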