Tags: Visual Document Retrieval · ColPali · Safetensors · English · vidore-experimental · vidore
manuel committed · Commit c458479 · 1 Parent(s): f1b2913

multiroute

Files changed (1): handler.py +61 -21
handler.py CHANGED
@@ -20,40 +20,80 @@ class EndpointHandler():
 
     def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
         """
-        Expects data in the following format:
+        Expects data in one of the following formats:
         {
-            "inputs": [
+            "images": [
                 "base64_encoded_image1",
                 "base64_encoded_image2",
                 ...
             ]
         }
+        or
+        {
+            "processed_images": [
+                [...],  # preprocessed image tensors
+                [...]
+            ]
+        }
+        or
+        {
+            "text": [
+                "text1",
+                "text2",
+                ...
+            ]
+        }
 
-        Decodes each Base64 image into a PIL Image, processes them, and returns the embeddings.
+        Returns embeddings for the provided input type.
         """
-        # Retrieve the list of base64 encoded images
-        base64_images = data.get("inputs", [])
-        if not isinstance(base64_images, list):
-            base64_images = [base64_images]
-        else:
-            if len(base64_images) > 8:
+        # Input validation
+        data = data.get("inputs", [])
+        input_keys = [key for key in ["images", "processed_images", "text"] if key in data]
+        if len(input_keys) != 1:
+            return {"error": "Exactly one of 'images', 'processed_images', or 'text' must be provided"}
+
+        input_type = input_keys[0]
+        inputs = data[input_type]
+
+
+        if input_type == "images":
+            if not isinstance(inputs, list):
+                inputs = [inputs]
+
+            if len(inputs) > 8:
                 return {"message": "Send a maximum of 8 images at once. We recommend sending one by one to improve load balancing."}
 
-        # Decode each image from base64 and convert to a PIL Image
-        decoded_images = []
-        for img_str in base64_images:
+            # Decode each image from base64 and convert to a PIL Image
+            decoded_images = []
+            for img_str in inputs:
+                try:
+                    img_data = base64.b64decode(img_str)
+                    image = Image.open(io.BytesIO(img_data)).convert("RGB")
+                    decoded_images.append(image)
+                except Exception as e:
+                    return {"error": f"Error decoding image: {str(e)}"}
+
+            # Process the images using the processor
+            batch = self.processor.process_images(decoded_images).to(self.model.device)
+
+        elif input_type == "processed_images":
             try:
-                img_data = base64.b64decode(img_str)
-                image = Image.open(io.BytesIO(img_data)).convert("RGB")
-                decoded_images.append(image)
+                print(inputs)
+                batch = torch.load(io.BytesIO(inputs), map_location=self.model.device)
+                print(batch)
             except Exception as e:
-                print(f"Error decoding an image: {e}")
+                return {"error": f"Error processing preprocessed images: {str(e)}"}
 
-        # Process the images using the processor
-        batch_images = self.processor.process_images(decoded_images).to(self.model.device)
+        else:  # text
+            if not isinstance(inputs, list):
+                inputs = [inputs]
+            try:
+                batch = self.processor.process_text(inputs).to(self.model.device)
+            except Exception as e:
+                return {"error": f"Error processing text: {str(e)}"}
 
         # Forward pass through the model
-        with torch.no_grad():
-            image_embeddings = self.model(**batch_images).tolist()
+        with torch.inference_mode():
+            embeddings = self.model(**batch).tolist()
 
-        return {"embeddings": image_embeddings}
+        return {"embeddings": embeddings}