File size: 4,328 Bytes
d4ff93a
bcb09ab
d4ff93a
 
 
b6034fe
d4ff93a
4def4a0
d4ff93a
5bcf31a
 
 
 
 
 
 
 
 
 
 
d4ff93a
53baa70
d4ff93a
ddabc30
73efe67
 
ddabc30
5bcf31a
73efe67
 
5bcf31a
ddabc30
 
73efe67
 
ddabc30
 
 
 
 
73efe67
 
ddabc30
73efe67
ddabc30
73efe67
ddabc30
73efe67
ddabc30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73efe67
ddabc30
 
 
 
 
 
 
 
 
 
 
73efe67
 
 
 
ddabc30
73efe67
ddabc30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73efe67
d4ff93a
 
 
ddabc30
73efe67
ddabc30
 
 
 
 
d4ff93a
 
ad1014c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import gradio as gr
from transformers import AutoProcessor, AutoModelForVision2Seq, AutoModelForImageTextToText
from pdf2image import convert_from_path
import base64
import io
import spaces
from PIL import Image

# Load the OCR processor and model from Hugging Face when this module is imported.
# If loading raises ImportError or ValueError, both globals end up as None so
# later code can detect that the model is unavailable.
try:
    processor = AutoProcessor.from_pretrained("allenai/olmOCR-7B-0225-preview")
    model = AutoModelForVision2Seq.from_pretrained("allenai/olmOCR-7B-0225-preview")
except (ImportError, ValueError) as e:
    # Reset both names together: a failure on the second load must not leave
    # a usable processor paired with a missing model.
    processor = model = None
    if isinstance(e, ImportError):
        print(f"Error loading model: {str(e)}. Please ensure PyTorch is installed.")
    else:
        print(f"Error with model configuration: {str(e)}")

@spaces.GPU(duration=120)
def process_pdf(pdf_file):
    """
    Stream an HTML report for an uploaded PDF, one page at a time.

    This is a generator. Each yielded value is the complete HTML accumulated
    so far (not a fragment), so the consumer can simply re-render its output
    with the latest value after every page.

    Args:
        pdf_file: The uploaded PDF, given either as a filesystem path string
            or as an object that exposes the path through a ``name``
            attribute. ``None`` means nothing was uploaded.

    Yields:
        str: HTML markup. On a setup or conversion problem a single ``<p>``
        error/notice is yielded and the generator stops. Otherwise the
        sequence is: the header with the "Copy All" button, then the header
        plus every page processed so far (once per page), and finally the
        full document with the closing tag and the copy helper script.
    """
    # Function-scope import: the file's import block does not provide it.
    from html import escape

    if processor is None or model is None:
        yield "<p>Error: Model could not be loaded. Check environment setup (PyTorch may be missing) or model compatibility.</p>"
        return

    # Check if a PDF file was uploaded
    if pdf_file is None:
        yield "<p>Please upload a PDF file.</p>"
        return

    # Accept both a plain path string and a file-like object carrying the
    # path in ``.name``; fall back to the value itself when there is no name.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    # Convert PDF to images
    try:
        pages = convert_from_path(pdf_path)
    except Exception as e:
        # The message may contain arbitrary characters (e.g. the file name),
        # so escape it before embedding it in markup.
        yield f"<p>Error converting PDF to images: {escape(str(e))}</p>"
        return

    # Initial HTML with "Copy All" button and container for pages
    markup = '<div><button onclick="copyAll()" style="margin-bottom: 10px;">Copy All</button></div><div id="pages">'
    yield markup  # Start with the header

    # Process each page incrementally
    for i, page in enumerate(pages):
        # Convert the page image to base64 for embedding in HTML
        buffered = io.BytesIO()
        page.save(buffered, format="PNG")
        img_str = base64.b64encode(buffered.getvalue()).decode()
        img_data = f"data:image/png;base64,{img_str}"

        # Extract text from the page using the OCR model.
        # NOTE(review): generate() runs with the library defaults here —
        # confirm the output-length limit and device placement are intended.
        try:
            inputs = processor(text="Extract the text from this image.", images=page, return_tensors="pt")
            outputs = model.generate(**inputs)
            text = processor.decode(outputs[0], skip_special_tokens=True)
        except Exception as e:
            # Best effort: a failing page shows its error instead of aborting
            # the whole document.
            text = f"Error extracting text: {str(e)}"

        # Extracted text comes from the document and may contain "<", "&" or
        # even "</textarea>"; escape it so it cannot break out of the textarea.
        safe_text = escape(text)

        # Generate HTML for this page's section
        textarea_id = f"text{i+1}"
        page_html = f'''
        <div class="page" style="margin-bottom: 20px; border-bottom: 1px solid #ccc; padding-bottom: 20px;">
            <h3>Page {i+1}</h3>
            <div style="display: flex; align-items: flex-start;">
                <img src="{img_data}" alt="Page {i+1}" style="max-width: 300px; margin-right: 20px;">
                <div style="flex-grow: 1;">
                    <textarea id="{textarea_id}" rows="10" style="width: 100%;">{safe_text}</textarea>
                    <button onclick="copyText('{textarea_id}')" style="margin-top: 5px;">Copy</button>
                </div>
            </div>
        </div>
        '''

        # Append this page to the existing HTML and yield the updated content
        markup += page_html
        yield markup

    # After all pages are processed, close the div and add JavaScript.
    # NOTE(review): whether a <script> injected through an HTML component is
    # executed depends on the front end — confirm the copy buttons work.
    markup += '</div>'
    markup += '''
    <script>
    function copyText(id) {
        var text = document.getElementById(id);
        text.select();
        document.execCommand("copy");
    }
    function copyAll() {
        var texts = document.querySelectorAll("#pages textarea");
        var allText = Array.from(texts).map(t => t.value).join("\\n\\n");
        navigator.clipboard.writeText(allText);
    }
    </script>
    '''
    yield markup  # Final yield with complete content and scripts

# Define the Gradio interface: a title, a one-line instruction, a row holding
# the PDF upload widget and the trigger button, and an HTML area for results.
with gr.Blocks(title="PDF Text Extractor") as demo:
    gr.Markdown("# PDF Text Extractor")
    gr.Markdown("Upload a PDF file and click 'Extract Text' to see each page's image and extracted text incrementally.")
    with gr.Row():
        # Upload widget restricted to files with a .pdf extension.
        pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
        submit_btn = gr.Button("Extract Text")
    # Receives the HTML produced by process_pdf.
    output_html = gr.HTML()
    # Clicking the button calls process_pdf with the uploaded file and routes
    # its output to the HTML area above.
    submit_btn.click(fn=process_pdf, inputs=pdf_input, outputs=output_html)

# Launch the interface. This call sits at module top level, so it runs
# whenever this file is executed or imported.
demo.launch()