# PDF Text Extractor — Hugging Face Space.
# (The original first lines were web-page scrape residue — "Runtime error",
# file size, and git blame hashes — not Python; replaced so the file parses.)
import base64
import io
from html import escape

import gradio as gr
import spaces
from pdf2image import convert_from_path
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq, AutoModelForImageTextToText
# Load the OCR model and processor from Hugging Face.
# On any expected failure we fall back to None so the UI can surface a
# readable error message instead of crashing at import time.
try:
    processor = AutoProcessor.from_pretrained("allenai/olmOCR-7B-0225-preview")
    model = AutoModelForVision2Seq.from_pretrained("allenai/olmOCR-7B-0225-preview")
except ImportError as e:
    # Missing backend (e.g. PyTorch not installed).
    processor = None
    model = None
    print(f"Error loading model: {str(e)}. Please ensure PyTorch is installed.")
except ValueError as e:
    # Incompatible/unknown model configuration.
    processor = None
    model = None
    print(f"Error with model configuration: {str(e)}")
except OSError as e:
    # from_pretrained raises OSError (EnvironmentError) on download/cache
    # failures — no network, missing repo, or corrupted local files.
    processor = None
    model = None
    print(f"Error fetching model files: {str(e)}")
@spaces.GPU(duration=120)
def process_pdf(pdf_file):
    """
    Process the uploaded PDF one page at a time, yielding progressively
    larger HTML showing each page's image beside its extracted text.

    Args:
        pdf_file: Gradio upload — either an object with a ``.name`` filepath
            attribute (older Gradio) or a plain path string (newer Gradio).

    Yields:
        str: Cumulative HTML for everything processed so far; the final
        yield also includes the copy-to-clipboard <script> block.
    """
    if processor is None or model is None:
        yield "<p>Error: Model could not be loaded. Check environment setup (PyTorch may be missing) or model compatibility.</p>"
        return
    # Check if a PDF file was uploaded
    if pdf_file is None:
        yield "<p>Please upload a PDF file.</p>"
        return
    # Accept both the legacy file-object form and the plain-string form.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    # Convert PDF pages to PIL images (requires poppler).
    try:
        pages = convert_from_path(pdf_path)
    except Exception as e:
        # Escape so an error message containing markup cannot break the page.
        yield f"<p>Error converting PDF to images: {escape(str(e))}</p>"
        return
    # Initial HTML with "Copy All" button and container for pages
    html = '<div><button onclick="copyAll()" style="margin-bottom: 10px;">Copy All</button></div><div id="pages">'
    yield html  # Start with the header
    # Process each page incrementally
    for i, page in enumerate(pages):
        # Convert the page image to base64 for embedding in HTML
        buffered = io.BytesIO()
        page.save(buffered, format="PNG")
        img_str = base64.b64encode(buffered.getvalue()).decode()
        img_data = f"data:image/png;base64,{img_str}"
        # Extract text from the page using the OCR model
        try:
            inputs = processor(text="Extract the text from this image.", images=page, return_tensors="pt")
            # Without max_new_tokens, generate() defaults to ~20 new tokens
            # and silently truncates the OCR output.
            outputs = model.generate(**inputs, max_new_tokens=1024)
            text = processor.decode(outputs[0], skip_special_tokens=True)
        except Exception as e:
            text = f"Error extracting text: {str(e)}"
        # Escape OCR output: raw '<', '&', or even '</textarea>' in the text
        # would otherwise break out of the textarea markup (HTML injection).
        safe_text = escape(text)
        # Generate HTML for this page's section
        textarea_id = f"text{i+1}"
        page_html = f'''
        <div class="page" style="margin-bottom: 20px; border-bottom: 1px solid #ccc; padding-bottom: 20px;">
            <h3>Page {i+1}</h3>
            <div style="display: flex; align-items: flex-start;">
                <img src="{img_data}" alt="Page {i+1}" style="max-width: 300px; margin-right: 20px;">
                <div style="flex-grow: 1;">
                    <textarea id="{textarea_id}" rows="10" style="width: 100%;">{safe_text}</textarea>
                    <button onclick="copyText('{textarea_id}')" style="margin-top: 5px;">Copy</button>
                </div>
            </div>
        </div>
        '''
        # Append this page to the existing HTML and yield the updated content
        html += page_html
        yield html
    # After all pages are processed, close the div and add JavaScript
    html += '</div>'
    html += '''
    <script>
    function copyText(id) {
        var text = document.getElementById(id);
        text.select();
        document.execCommand("copy");
    }
    function copyAll() {
        var texts = document.querySelectorAll("#pages textarea");
        var allText = Array.from(texts).map(t => t.value).join("\\n\\n");
        navigator.clipboard.writeText(allText);
    }
    </script>
    '''
    yield html  # Final yield with complete content and scripts
# Define the Gradio interface.
# NOTE: in gr.Blocks the statement order inside the context managers IS the
# rendered layout — do not reorder these component constructions.
with gr.Blocks(title="PDF Text Extractor") as demo:
    gr.Markdown("# PDF Text Extractor")
    gr.Markdown("Upload a PDF file and click 'Extract Text' to see each page's image and extracted text incrementally.")
    # Upload control and trigger button side by side on one row.
    with gr.Row():
        pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
        submit_btn = gr.Button("Extract Text")
    # Output area; process_pdf is a generator, so Gradio streams each yielded
    # HTML snapshot into this component as pages finish.
    output_html = gr.HTML()
    submit_btn.click(fn=process_pdf, inputs=pdf_input, outputs=output_html)
# Launch the interface (the trailing " |" on the original line was a
# web-scrape artifact and a syntax error — removed).
demo.launch()