Spaces:

jkorstad
/

PDF-Parser

Running on Zero

File size: 3,982 Bytes

d4ff93a
5bcf31a
d4ff93a
 
 
b6034fe
d4ff93a
4def4a0
d4ff93a
5bcf31a
 
 
 
 
 
 
 
 
 
 
d4ff93a
b6034fe
d4ff93a
ddabc30
 
 
 
5bcf31a
 
 
ddabc30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d4ff93a
 
 
ddabc30
 
 
 
 
 
 
d4ff93a
 
ad1014c

import gradio as gr
from transformers import AutoProcessor, AutoModelForVision2Seq
from pdf2image import convert_from_path
import base64
import io
import spaces
from PIL import Image

# Load the OCR model and processor from Hugging Face once, at import time.
# On any expected load failure both globals are left as None so the request
# handler can report the problem in the UI instead of the app crashing on start.
MODEL_ID = "allenai/olmOCR-7B-0225-preview"

processor = None
model = None
try:
    processor = AutoProcessor.from_pretrained(MODEL_ID)
    model = AutoModelForVision2Seq.from_pretrained(MODEL_ID)
except ImportError as e:
    # Reset both: the processor may have loaded before the model failed.
    processor = None
    model = None
    print(f"Error loading model: {str(e)}. Please ensure PyTorch is installed.")
except ValueError as e:
    processor = None
    model = None
    print(f"Error with model configuration: {str(e)}")
except OSError as e:
    # from_pretrained reports download failures, missing files and unknown
    # model ids as OSError; treat them like the other load failures.
    processor = None
    model = None
    print(f"Error downloading or reading model files: {str(e)}")

def _page_to_data_uri(page):
    """Encode a page image as a base64 PNG ``data:`` URI for inline HTML."""
    buffer = io.BytesIO()
    page.save(buffer, format="PNG")
    encoded = base64.b64encode(buffer.getvalue()).decode()
    return f"data:image/png;base64,{encoded}"


def _extract_page_text(page):
    """Run the OCR model on one page image; return its text or an error message."""
    try:
        inputs = processor(text="Extract the text from this image.", images=page, return_tensors="pt")
        # Keep the input tensors on the same device as the model weights;
        # this is a no-op when both already live on the CPU.
        inputs = inputs.to(model.device)
        outputs = model.generate(**inputs)
        return processor.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        # Per-page boundary: one failing page must not abort the whole
        # document, so the error is shown in that page's text box instead.
        return f"Error extracting text: {str(e)}"


def _render_page_section(page_number, img_data, text):
    """Build the HTML section (image, textarea, copy button) for one page."""
    from html import escape  # local import keeps this block self-contained

    textarea_id = f"text{page_number}"
    # The OCR output is untrusted text: escape it so characters such as
    # "<", "&" or a literal "</textarea>" cannot break or inject markup.
    safe_text = escape(text)
    return f'''
        <div class="page" style="margin-bottom: 20px; border-bottom: 1px solid #ccc; padding-bottom: 20px;">
            <h3>Page {page_number}</h3>
            <div style="display: flex; align-items: flex-start;">
                <img src="{img_data}" alt="Page {page_number}" style="max-width: 300px; margin-right: 20px;">
                <div style="flex-grow: 1;">
                    <textarea id="{textarea_id}" rows="10" style="width: 100%;">{safe_text}</textarea>
                    <button onclick="copyText('{textarea_id}')" style="margin-top: 5px;">Copy</button>
                </div>
            </div>
        </div>
        '''


@spaces.GPU
def process_pdf(pdf_file):
    """
    Convert an uploaded PDF to page images, OCR each page, and return HTML.

    Args:
        pdf_file: The value Gradio passes for the upload. Either a filesystem
            path string or a file-like wrapper exposing the path as ``.name``;
            ``None`` when nothing was uploaded.

    Returns:
        An HTML string. On success it contains a "Copy All" button, one
        section per page (page image, editable textarea with the extracted
        text, and a per-page "Copy" button), and the JavaScript the buttons
        call. On failure it contains a single ``<p>`` describing the problem.
    """
    from html import escape  # local import keeps this block self-contained

    if processor is None or model is None:
        return "<p>Error: Model could not be loaded. Check environment setup (PyTorch may be missing) or model compatibility.</p>"

    # Check if a PDF file was uploaded
    if pdf_file is None:
        return "<p>Please upload a PDF file.</p>"

    # Accept both upload shapes: a plain path string, or an object whose
    # ``.name`` attribute holds the path of the temporary file.
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name

    # Convert PDF to images
    try:
        pages = convert_from_path(pdf_path)
    except Exception as e:
        return f"<p>Error converting PDF to images: {escape(str(e))}</p>"

    # Collect fragments and join once instead of growing a string with +=.
    parts = ['<div><button onclick="copyAll()" style="margin-bottom: 10px;">Copy All</button></div><div id="pages">']

    for page_number, page in enumerate(pages, start=1):
        img_data = _page_to_data_uri(page)
        text = _extract_page_text(page)
        parts.append(_render_page_section(page_number, img_data, text))

    # Close the pages div and add JavaScript for copy functionality.
    # NOTE(review): <script> tags injected through an HTML component may not
    # be executed by every Gradio version - confirm the copy buttons work.
    parts.append('</div>')
    parts.append('''
    <script>
    function copyText(id) {
        var text = document.getElementById(id);
        text.select();
        document.execCommand("copy");
    }
    function copyAll() {
        var texts = document.querySelectorAll("#pages textarea");
        var allText = Array.from(texts).map(t => t.value).join("\\n\\n");
        navigator.clipboard.writeText(allText);
    }
    </script>
    ''')
    return "".join(parts)

# Build the UI: a heading with instructions, a row holding the PDF upload and
# the submit button, and an HTML pane that shows whatever process_pdf returns.
with gr.Blocks(title="PDF Text Extractor") as demo:
    gr.Markdown("# PDF Text Extractor")
    gr.Markdown(
        "Upload a PDF file and click 'Extract Text' to see each page's image and extracted text."
    )

    with gr.Row():
        pdf_input = gr.File(file_types=[".pdf"], label="Upload PDF")
        submit_btn = gr.Button(value="Extract Text")

    output_html = gr.HTML()

    # Clicking the button sends the uploaded file through process_pdf and
    # renders the returned markup in the HTML pane.
    submit_btn.click(process_pdf, pdf_input, output_html)

# Start serving the app.
demo.launch()