""" | |
OLM-CLLM OCR β Gradio Space | |
Upload any PDF β get clean, linearised text. | |
π Model: allenai/olmOCR-7B-0225-preview | |
π§ Prompts / render helpers come from the `olmocr` toolkit | |
""" | |
import json, base64, tempfile, os, gc
from io import BytesIO
import gradio as gr
import torch
from PIL import Image
from pypdf import PdfReader
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
from olmocr.data.renderpdf import render_pdf_to_base64png  # page → base64 PNG
from olmocr.prompts.anchor import get_anchor_text          # page → anchor text
from olmocr.prompts import build_finetuning_prompt         # anchor → final prompt
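# NOTE (assumption): the olmocr toolkit's page rendering relies on the system
# poppler-utils package (see the olmocr install instructions); on a Hugging Face
# Space that usually means listing `poppler-utils` in packages.txt.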
# ---------- 1. Model & processor (load once, then stay in memory) ----------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "allenai/olmOCR-7B-0225-preview",
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
).to(device).eval()
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
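# The processor (tokenizer + image preprocessor) above comes from the base
# Qwen2-VL-7B-Instruct checkpoint that olmOCR was fine-tuned from, matching the
# olmOCR model card example.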
# ---------- 2. Utility ------------------------------------------------------
def _decode_llm_json(raw_str: str) -> str:
    """
    olmOCR returns a JSON string like:
        {
          "primary_language": "...",
          ...
          "natural_text": "THE ACTUAL PAGE TEXT"
        }
    Pull out the `natural_text` field; fall back to the raw string if parsing fails.
    """
    try:
        page_json = json.loads(raw_str.strip())
        return page_json.get("natural_text") or ""
    except Exception:
        return raw_str.strip()
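# e.g. _decode_llm_json('{"natural_text": "Hello page"}')  -> "Hello page"
#      _decode_llm_json("not json at all")                 -> "not json at all"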
# ---------- 3. Core pipeline ------------------------------------------------
def pdf_to_text(pdf_file):
    """
    • Save the uploaded file to a temp path (the toolkit needs a real path)
    • Iterate over pages
    • For each page:
        - render the page image → base64
        - generate anchor text for the page
        - build the prompt (+ image) and run the model
        - collect `natural_text`
    • Return the merged text
    """
    if pdf_file is None:
        return "Please upload a PDF first."
    with tempfile.TemporaryDirectory() as tmpdir:
        local_pdf_path = os.path.join(tmpdir, "input.pdf")
        # gr.File may hand us a file path (newer Gradio) or a file-like object
        # with a `.name` attribute (older Gradio); handle both.
        src_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
        with open(src_path, "rb") as src, open(local_pdf_path, "wb") as dst:
            dst.write(src.read())
        reader = PdfReader(local_pdf_path)
        n_pages = len(reader.pages)
        extracted_pages = []
        for page_idx in range(1, n_pages + 1):  # 1-indexed
            # a. Image
            img_b64 = render_pdf_to_base64png(
                local_pdf_path, page_idx, target_longest_image_dim=1024
            )
            page_image = Image.open(BytesIO(base64.b64decode(img_b64)))
            # b. Anchor text & prompt
            anchor = get_anchor_text(
                local_pdf_path,
                page_idx,
                pdf_engine="pdfreport",  # uses pypdf / pdfium, no Poppler dependency
                target_length=4000,
            )
            prompt = build_finetuning_prompt(anchor)
            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img_b64}"}},
                    ],
                }
            ]
            # c. Tokenise + generate
            text_in = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            inputs = processor(text=[text_in], images=[page_image], return_tensors="pt", padding=True)
            inputs = {k: v.to(device) for k, v in inputs.items()}
            with torch.no_grad():
                gen = model.generate(
                    **inputs,
                    max_new_tokens=512,
                    do_sample=False,  # greedy decoding; a temperature setting would be ignored here
                )
            prompt_len = inputs["input_ids"].shape[1]
            new_tokens = gen[:, prompt_len:]
            raw_out = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)[0]
            extracted_pages.append(_decode_llm_json(raw_out))
            # optional memory clean-up per page
            del inputs, gen
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
    return "\n\n".join(extracted_pages) or "Nothing returned."
# ---------- 4. Gradio UI ----------------------------------------------------
with gr.Blocks(title="olmOCR 7B PDF Extractor") as demo:
    gr.Markdown(
        """
        # **OLM-CLLM OCR**
        Upload a PDF → get high-quality, linearised text (tables → Markdown, equations → LaTeX).
        Fine-tuned Vision-LLM: **allenai/olmOCR-7B-0225-preview**.
        """
    )
    with gr.Row():
        with gr.Column(scale=1):
            up = gr.File(label="Upload PDF", file_types=[".pdf"])
            go = gr.Button("Extract Text", variant="primary", size="lg")
        with gr.Column(scale=2):
            out = gr.Textbox(
                label="Extracted text",
                lines=25,
                interactive=False,
                show_copy_button=True,
            )
    go.click(pdf_to_text, inputs=up, outputs=out)
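# Tip (optional): calling demo.queue() before launch enables Gradio's request
# queue, which helps avoid HTTP timeouts when a large PDF takes a long time to
# process; it is left out here to keep the demo minimal.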
# ---------- 5. Launch locally (Space will ignore this) ----------------------
if __name__ == "__main__":
    demo.launch()
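
# To run locally (assuming this file is saved as app.py):
#   pip install torch transformers gradio pypdf pillow olmocr
#   python app.py   # Gradio serves the UI at http://127.0.0.1:7860 by default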