Spaces:

Uddipan107
/

ocr-reorder-space

Running

App Files Community

Uddipan Basu Bir committed 10 days ago

Commit

a01cae7

1 Parent(s): 0d4b0fc

Download checkpoint from HF hub in OcrReorderPipeline

Browse files

Files changed (1) hide show

app.py +31 -77

app.py CHANGED Viewed

@@ -16,89 +16,43 @@ from transformers import (
 # ── 1) MODEL SETUP ─────────────────────────────────────────────────────
 repo = "Uddipan107/ocr-layoutlmv3-base-t5-small"
-# Processor for LayoutLMv3
 processor = AutoProcessor.from_pretrained(
     repo,
     subfolder="preprocessor",
     apply_ocr=False
 )
-# LayoutLMv3 encoder
-layout_model = LayoutLMv3Model.from_pretrained(repo)
-layout_model.eval()
-# T5 decoder & tokenizer
-t5_model = T5ForConditionalGeneration.from_pretrained(repo)
-t5_model.eval()
-tokenizer = AutoTokenizer.from_pretrained(
     repo, subfolder="preprocessor"
 )
-# Ensure decoder_start_token_id is set
 if t5_model.config.decoder_start_token_id is None:
-    # Fallback to bos_token_id if present
-    t5_model.config.decoder_start_token_id = tokenizer.bos_token_id
-# Projection head: load from checkpoint
-ckpt_file = hf_hub_download(repo_id=repo, filename="pytorch_model.bin")
-ckpt      = torch.load(ckpt_file, map_location="cpu")
-proj_state= ckpt["projection"]
-projection = torch.nn.Sequential(
-    torch.nn.Linear(768, t5_model.config.d_model),
-    torch.nn.LayerNorm(t5_model.config.d_model),
-    torch.nn.GELU()
-)
-projection.load_state_dict(proj_state)
-projection.eval()
-# Move models to CPU (Spaces are CPU-only)
-device = torch.device("cpu")
-layout_model.to(device)
-t5_model.to(device)
-projection.to(device)
-repo = "Uddipan107/ocr-layoutlmv3-base-t5-small"
-# Processor for LayoutLMv3
-processor = AutoProcessor.from_pretrained(
-    repo,
-    subfolder="preprocessor",
-    apply_ocr=False
-)
-# LayoutLMv3 encoder
-layout_model = LayoutLMv3Model.from_pretrained(repo)
-layout_model.eval()
-# T5 decoder & tokenizer
-t5_model = T5ForConditionalGeneration.from_pretrained(repo)
-t5_model.eval()
-tokenizer = AutoTokenizer.from_pretrained(
-    repo, subfolder="preprocessor"
-)
-# Projection head: load from checkpoint
-ckpt_file = hf_hub_download(repo_id=repo, filename="pytorch_model.bin")
-ckpt      = torch.load(ckpt_file, map_location="cpu")
-proj_state= ckpt["projection"]
-projection = torch.nn.Sequential(
     torch.nn.Linear(768, t5_model.config.d_model),
     torch.nn.LayerNorm(t5_model.config.d_model),
     torch.nn.GELU()
-)
 projection.load_state_dict(proj_state)
-projection.eval()
-# Move models to CPU (Spaces are CPU-only)
-device = torch.device("cpu")
-layout_model.to(device)
-t5_model.to(device)
-projection.to(device)
 # ── 2) INFERENCE FUNCTION ─────────────────────────────────────────────
 def infer(image_path, json_file):
     img_name = os.path.basename(image_path)
-    # 2.a) Load NDJSON file (one JSON object per line)
     data = []
     with open(json_file.name, "r", encoding="utf-8") as f:
         for line in f:
@@ -106,7 +60,6 @@ def infer(image_path, json_file):
                 continue
             data.append(json.loads(line))
-    # 2.b) Find entry matching uploaded image
     entry = next((e for e in data if e.get("img_name") == img_name), None)
     if entry is None:
         return f"❌ No JSON entry found for image '{img_name}'"
@@ -114,21 +67,21 @@ def infer(image_path, json_file):
     words = entry.get("src_word_list", [])
     boxes = entry.get("src_wordbox_list", [])
-    # 2.c) Open and preprocess the image + tokens + boxes
     img = Image.open(image_path).convert("RGB")
     encoding = processor(
         [img], [words], boxes=[boxes],
         return_tensors="pt", padding=True, truncation=True
     )
-    pixel_values   = encoding.pixel_values.to(device)
-    input_ids      = encoding.input_ids.to(device)
-    attention_mask = encoding.attention_mask.to(device)
-    bbox           = encoding.bbox.to(device)
-    # 2.d) Forward pass
     with torch.no_grad():
         # LayoutLMv3 encoding
-        lm_out = layout_model(
             pixel_values=pixel_values,
             input_ids=input_ids,
             attention_mask=attention_mask,
@@ -137,22 +90,23 @@ def infer(image_path, json_file):
         seq_len    = input_ids.size(1)
         text_feats = lm_out.last_hidden_state[:, :seq_len, :]
-        # Projection → T5 decoding
         proj_feats = projection(text_feats)
-        gen_ids = t5_model.generate(
             inputs_embeds=proj_feats,
             attention_mask=attention_mask,
             max_length=512,
-            decoder_start_token_id=t5_model.config.decoder_start_token_id
         )
-    # Decode to text
     result = tokenizer.batch_decode(
         gen_ids, skip_special_tokens=True
     )[0]
     return result
-# ── 3) GRADIO UI ───────────────────────────────────────────────────────
 demo = gr.Interface(
     fn=infer,
     inputs=[

 # ── 1) MODEL SETUP ─────────────────────────────────────────────────────
 repo = "Uddipan107/ocr-layoutlmv3-base-t5-small"
+# Processor
 processor = AutoProcessor.from_pretrained(
     repo,
     subfolder="preprocessor",
     apply_ocr=False
 )
+# Encoder & Decoder
+layout_model = LayoutLMv3Model.from_pretrained(repo).to("cpu").eval()
+t5_model     = T5ForConditionalGeneration.from_pretrained(repo).to("cpu").eval()
+tokenizer    = AutoTokenizer.from_pretrained(
     repo, subfolder="preprocessor"
 )
+# Ensure decoder_start_token_id and bos_token_id are set
 if t5_model.config.decoder_start_token_id is None:
+    fallback = tokenizer.bos_token_id or tokenizer.eos_token_id
+    t5_model.config.decoder_start_token_id = fallback
+if t5_model.config.bos_token_id is None:
+    t5_model.config.bos_token_id = t5_model.config.decoder_start_token_id
+# Projection head
+ckpt_file   = hf_hub_download(repo_id=repo, filename="pytorch_model.bin")
+ckpt        = torch.load(ckpt_file, map_location="cpu")
+proj_state  = ckpt["projection"]
+projection  = torch.nn.Sequential(
     torch.nn.Linear(768, t5_model.config.d_model),
     torch.nn.LayerNorm(t5_model.config.d_model),
     torch.nn.GELU()
+).to("cpu")
 projection.load_state_dict(proj_state)
 # ── 2) INFERENCE FUNCTION ─────────────────────────────────────────────
 def infer(image_path, json_file):
     img_name = os.path.basename(image_path)
+    # Load NDJSON
     data = []
     with open(json_file.name, "r", encoding="utf-8") as f:
         for line in f:
                 continue
             data.append(json.loads(line))
     entry = next((e for e in data if e.get("img_name") == img_name), None)
     if entry is None:
         return f"❌ No JSON entry found for image '{img_name}'"
     words = entry.get("src_word_list", [])
     boxes = entry.get("src_wordbox_list", [])
+    # Preprocess image + tokens
     img = Image.open(image_path).convert("RGB")
     encoding = processor(
         [img], [words], boxes=[boxes],
         return_tensors="pt", padding=True, truncation=True
     )
+    pixel_values   = encoding.pixel_values.to("cpu")
+    input_ids      = encoding.input_ids.to("cpu")
+    attention_mask = encoding.attention_mask.to("cpu")
+    bbox           = encoding.bbox.to("cpu")
+    # Forward pass
     with torch.no_grad():
         # LayoutLMv3 encoding
+        lm_out     = layout_model(
             pixel_values=pixel_values,
             input_ids=input_ids,
             attention_mask=attention_mask,
         seq_len    = input_ids.size(1)
         text_feats = lm_out.last_hidden_state[:, :seq_len, :]
+        # Projection + T5 decoding
         proj_feats = projection(text_feats)
+        gen_ids    = t5_model.generate(
             inputs_embeds=proj_feats,
             attention_mask=attention_mask,
             max_length=512,
+            decoder_start_token_id=t5_model.config.decoder_start_token_id,
+            bos_token_id=t5_model.config.bos_token_id
         )
+    # Decode and return
     result = tokenizer.batch_decode(
         gen_ids, skip_special_tokens=True
     )[0]
     return result
+# ── 3) GRADIO INTERFACE ────────────────────────────────────────────────
 demo = gr.Interface(
     fn=infer,
     inputs=[