Uddipan Basu Bir committed
Commit 5b9baff
1 Parent(s): 124a92f

Add custom OCR reorder pipeline + Gradio UI

Files changed (3):
  1. app.py +41 -0
  2. inference.py +56 -0
  3. requirements.txt +4 -0
app.py ADDED
@@ -0,0 +1,41 @@
+ import json, base64
+ from io import BytesIO
+ from PIL import Image
+ import gradio as gr
+ import torch
+ from inference import OcrReorderPipeline
+ from transformers import (
+     AutoProcessor,
+     LayoutLMv3Model,
+     T5ForConditionalGeneration,
+     AutoTokenizer
+ )
+
+ # Load encoder, tokenizer, and processor from the model repo
+ repo = "Uddipan107/ocr-layoutlmv3-base-t5-small"
+ model = LayoutLMv3Model.from_pretrained(repo)
+ tokenizer = AutoTokenizer.from_pretrained(repo)
+ processor = AutoProcessor.from_pretrained(repo, apply_ocr=False)
+ # Assumption: the decoder starts from the stock t5-small checkpoint;
+ # adjust if the fine-tuned T5 weights live elsewhere in the repo
+ t5 = T5ForConditionalGeneration.from_pretrained("t5-small")
+ # Fall back to CPU when no GPU is available
+ device = 0 if torch.cuda.is_available() else -1
+ pipe = OcrReorderPipeline(model, t5, tokenizer, processor, device=device)
+
+ def infer(image, words_json, boxes_json):
+     words = json.loads(words_json)
+     boxes = json.loads(boxes_json)
+     buf = BytesIO(); image.save(buf, "PNG")
+     b64 = base64.b64encode(buf.getvalue()).decode()
+     # words/boxes must be keyword args so the pipeline routes them
+     # to preprocess(); the call returns a list of strings, take first
+     return pipe(b64, words=words, boxes=boxes)[0]
+
+ demo = gr.Interface(
+     fn=infer,
+     inputs=[
+         gr.Image(type="pil", label="Image"),
+         gr.Textbox(label="Words (JSON list)"),
+         gr.Textbox(label="Boxes (JSON list)")
+     ],
+     outputs="text",
+     title="OCR Reorder Pipeline"
+ )
+
+ if __name__ == "__main__":
+     demo.launch()
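For reference, the two textboxes expect parallel JSON arrays: one OCR word per entry and one [x0, y0, x1, y1] box per word, normalized to LayoutLMv3's 0-1000 coordinate scale (required because the processor is loaded with apply_ocr=False). A hypothetical input pair:

# Hypothetical values for the "Words" and "Boxes" textboxes;
# boxes are [x0, y0, x1, y1] on the 0-1000 normalized scale
words_json = '["Invoice", "No.", "42"]'
boxes_json = '[[70, 52, 189, 75], [195, 52, 243, 75], [250, 52, 290, 75]]'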
inference.py ADDED
@@ -0,0 +1,56 @@
+ import torch
+ from transformers import Pipeline
+ from PIL import Image
+ import base64
+ from io import BytesIO
+
+ class OcrReorderPipeline(Pipeline):
+     def __init__(self, model, t5, tokenizer, processor, device=0):
+         super().__init__(model=model, tokenizer=tokenizer,
+                          feature_extractor=processor, device=device)
+         # T5 decoder that generates the reordered text
+         self.t5 = t5.to(self.device)
+         # Projection head weights live under the "projection" key of the
+         # checkpoint, which is expected in the working directory
+         proj_state = torch.load("pytorch_model.bin", map_location="cpu")["projection"]
+         self.projection = torch.nn.Sequential(
+             torch.nn.Linear(768, t5.config.d_model),
+             torch.nn.LayerNorm(t5.config.d_model),
+             torch.nn.GELU()
+         )
+         self.projection.load_state_dict(proj_state)
+         self.projection.to(self.device)
+
+     def _sanitize_parameters(self, words=None, boxes=None, **kwargs):
+         # Route the words/boxes keyword arguments through to preprocess()
+         preprocess_kwargs = {}
+         if words is not None:
+             preprocess_kwargs["words"] = words
+         if boxes is not None:
+             preprocess_kwargs["boxes"] = boxes
+         return preprocess_kwargs, {}, {}
+
+     def preprocess(self, image, words=None, boxes=None):
+         # The image arrives as a base64-encoded PNG string
+         data = base64.b64decode(image)
+         img = Image.open(BytesIO(data)).convert("RGB")
+         return self.feature_extractor(
+             [img], [words], boxes=[boxes],
+             return_tensors="pt", padding=True, truncation=True
+         )
+
+     def _forward(self, model_inputs):
+         pv, ids, mask, bbox = (
+             model_inputs[k].to(self.device)
+             for k in ("pixel_values", "input_ids", "attention_mask", "bbox")
+         )
+         # Encode image, words, and layout with the LayoutLMv3 backbone
+         encoder_out = self.model(
+             pixel_values=pv,
+             input_ids=ids,
+             attention_mask=mask,
+             bbox=bbox
+         )
+         # LayoutLMv3 appends image patch tokens after the text tokens;
+         # keep only the text positions
+         seq_len = ids.size(1)
+         text_feats = encoder_out.last_hidden_state[:, :seq_len, :]
+         # Project 768-dim LayoutLMv3 features into the T5 embedding space
+         proj_feats = self.projection(text_feats)
+         gen_ids = self.t5.generate(
+             inputs_embeds=proj_feats,
+             attention_mask=mask,
+             max_length=512
+         )
+         return {"generated_ids": gen_ids}
+
+     def postprocess(self, model_outputs):
+         return self.tokenizer.batch_decode(
+             model_outputs["generated_ids"],
+             skip_special_tokens=True
+         )
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ torch
+ transformers
+ Pillow
+ gradio