"""
OLM-CLLM OCR – Gradio Space
Upload any PDF ➜ get clean, linearised text.

🚀  Model: allenai/olmOCR-7B-0225-preview
🔧  Prompts / render helpers come from the `olmocr` toolkit
"""

import json, base64, tempfile, os, gc, shutil
from io import BytesIO

import gradio as gr
import torch
from PIL import Image
from pypdf import PdfReader

from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
from olmocr.data.renderpdf import render_pdf_to_base64png          # page → base64 PNG
from olmocr.prompts.anchor import get_anchor_text                  # page → anchor text
from olmocr.prompts import build_finetuning_prompt                 # anchor → final prompt

# ---------- 1. Model & processor (load once, then stay in memory) ----------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = Qwen2VLForConditionalGeneration.from_pretrained(
    "allenai/olmOCR-7B-0225-preview",
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
).to(device).eval()
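# Note: bfloat16 roughly halves GPU memory use vs. float32; the CPU path keeps
# float32 since bfloat16 support on CPU is patchy and often slower.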

processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
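# olmOCR-7B-0225-preview is fine-tuned from Qwen2-VL-7B-Instruct, so it reuses
# the base model's processor (tokenizer + image preprocessor).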

# ---------- 2. Utility ------------------------------------------------------
def _decode_llm_json(raw_str: str) -> str:
    """
    olmOCR returns a JSON string like:
    {
        "primary_language": "...",
        ...
        "natural_text": "THE ACTUAL PAGE TEXT"
    }
    Pull out the `natural_text` field; fall back to raw string if parsing fails.
    """
    try:
        page_json = json.loads(raw_str.strip())
        return page_json.get("natural_text") or ""
    except Exception:
        return raw_str.strip()

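# Illustrative behaviour of the helper above:
#   _decode_llm_json('{"primary_language": "en", "natural_text": "Hi"}')  -> "Hi"
#   _decode_llm_json("not valid json")                                    -> "not valid json"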
# ---------- 3. Core pipeline ------------------------------------------------
def pdf_to_text(pdf_file):
    """
    • Save the uploaded file to a temp path (the toolkit needs a real path)
    • Iterate over the pages
    • For each page:
        – render the page image → base64
        – generate anchor text for the page
        – build the prompt (+ image) and run the model
        – collect `natural_text`
    • Return the merged text
    """

    if pdf_file is None:
        return "⬆️ Please upload a PDF first."

    with tempfile.TemporaryDirectory() as tmpdir:
        local_pdf_path = os.path.join(tmpdir, "input.pdf")
        # gr.File yields a file path (or an object exposing .name), not an open
        # byte stream, so copy the file instead of calling .read() on it
        src_path = pdf_file.name if hasattr(pdf_file, "name") else pdf_file
        shutil.copy(src_path, local_pdf_path)

        reader = PdfReader(local_pdf_path)
        n_pages = len(reader.pages)

        extracted_pages = []

        for page_idx in range(1, n_pages + 1):          # 1-indexed
            # a. Image
            img_b64 = render_pdf_to_base64png(
                local_pdf_path, page_idx, target_longest_image_dim=1024
            )
            page_image = Image.open(BytesIO(base64.b64decode(img_b64)))

            # b. Anchor text & prompt
            anchor = get_anchor_text(
                local_pdf_path,
                page_idx,
                pdf_engine="pdfreport",                 # uses pypdf / pdfium, no Poppler dependency
                target_length=4000,
            )
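            # build_finetuning_prompt wraps the anchor text in the instruction
            # template the model was fine-tuned on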
            prompt = build_finetuning_prompt(anchor)

            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "text",  "text": prompt},
                        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img_b64}"}},
                    ],
                }
            ]

            # c. Tokenise + generate
            text_in = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            inputs = processor(text=[text_in], images=[page_image], return_tensors="pt", padding=True)
            inputs = {k: v.to(device) for k, v in inputs.items()}

            with torch.no_grad():
                gen = model.generate(
                    **inputs,
                    max_new_tokens=512,
                    do_sample=False,  # greedy decoding; a temperature setting would be ignored here
                )

            prompt_len = inputs["input_ids"].shape[1]
            new_tokens = gen[:, prompt_len:]
            raw_out   = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)[0]

            extracted_pages.append(_decode_llm_json(raw_out))

            # optional per-page memory clean-up
            del inputs, gen
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

    return "\n\n".join(extracted_pages) or "πŸ€” Nothing returned."

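# Illustrative smoke test (assumes a sample.pdf next to this script):
#     print(pdf_to_text("sample.pdf"))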
# ---------- 4. Gradio UI ----------------------------------------------------
with gr.Blocks(title="olmOCR 7B PDF Extractor") as demo:
    gr.Markdown(
        """
        # 🧠 **OLM-CLLM OCR**  
        Upload a PDF → get high-quality, linearised text (tables → Markdown, equations → LaTeX).  
        Fine-tuned Vision-LLM: **allenai/olmOCR-7B-0225-preview**.
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            up = gr.File(label="πŸ“„ Upload PDF", file_types=[".pdf"])
            go = gr.Button("Extract Text", variant="primary", size="lg")
        with gr.Column(scale=2):
            out = gr.Textbox(
                label="πŸ“œ Extracted text",
                lines=25,
                interactive=False,
                show_copy_button=True,
            )

    go.click(pdf_to_text, inputs=up, outputs=out)

# ---------- 5. Launch locally (Space will ignore this) ----------------------
if __name__ == "__main__":
    demo.launch()