jkorstad committed on
Commit
ddabc30
·
verified ·
1 Parent(s): ad1014c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +72 -72
app.py CHANGED
@@ -10,81 +10,81 @@ processor = AutoProcessor.from_pretrained("allenai/olmOCR-7B-0225-preview")
10
  model = AutoModelForCausalLM.from_pretrained("allenai/olmOCR-7B-0225-preview")
11
 
def _page_data_uri(page):
    """Return *page* encoded as a base64 PNG ``data:`` URI usable as an ``<img src>``."""
    buffered = io.BytesIO()
    page.save(buffered, format="PNG")
    encoded = base64.b64encode(buffered.getvalue()).decode()
    return f"data:image/png;base64,{encoded}"


def _extract_page_text(page):
    """Run the OCR model on *page* and return the decoded text, or an error message."""
    try:
        inputs = processor(text="Extract the text from this image.", images=page, return_tensors="pt")
        # NOTE(review): no generation-length argument is passed, so the output
        # length is whatever the model's generation config allows -- confirm
        # that long pages are not truncated.
        outputs = model.generate(**inputs)
        return processor.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        # Best effort: a failure on one page is reported in that page's text
        # box instead of aborting the whole document.
        return f"Error extracting text: {str(e)}"


def process_pdf(pdf_file):
    """
    Process the uploaded PDF file, extract text from each page, and generate HTML
    to display each page's image and text with copy buttons.

    Args:
        pdf_file: The uploaded file. Either an object whose ``name`` attribute
            is the path of the PDF on disk, a plain path, or ``None`` when
            nothing was uploaded.

    Returns:
        str: An HTML fragment. On success it holds a "Copy All" button followed
        by one section per page (page image, editable text box, "Copy" button).
        If no file was given or the PDF cannot be converted, a single ``<p>``
        element with a message is returned instead.
    """
    # Imported locally under a distinct name: the file's import block is not
    # part of this block, and `html` is easily shadowed by local variables.
    from html import escape

    # Check if a PDF file was uploaded
    if pdf_file is None:
        return "<p>Please upload a PDF file.</p>"

    # Convert PDF to images
    try:
        # Accept both upload objects exposing `.name` and bare path strings.
        pages = convert_from_path(getattr(pdf_file, "name", pdf_file))
    except Exception as e:
        # The message may contain file names or markup-like characters.
        return f"<p>Error converting PDF to images: {escape(str(e))}</p>"

    # The click handlers are written inline rather than in a <script> element:
    # browsers do not execute <script> elements that are inserted through
    # innerHTML, which appears to be how gr.HTML renders its value, so
    # functions defined that way would be undefined when a button is clicked.
    copy_all_js = (
        "navigator.clipboard.writeText("
        "Array.from(document.querySelectorAll('#pages textarea'))"
        ".map(function(t){return t.value;}).join('\\n\\n'))"
    )

    # Collect fragments and join once instead of growing a string in the loop.
    parts = [
        f'<div><button onclick="{copy_all_js}" style="margin-bottom: 10px;">Copy All</button></div>'
        '<div id="pages">'
    ]

    # Process each page
    for page_no, page in enumerate(pages, start=1):
        img_data = _page_data_uri(page)
        # Escape the OCR result so characters such as "<" or a literal
        # "</textarea>" in the page text cannot break or inject markup.
        text = escape(_extract_page_text(page))
        textarea_id = f"text{page_no}"
        copy_js = (
            f"var t=document.getElementById('{textarea_id}');"
            "t.select();document.execCommand('copy');"
        )
        parts.append(f'''
<div class="page" style="margin-bottom: 20px; border-bottom: 1px solid #ccc; padding-bottom: 20px;">
<h3>Page {page_no}</h3>
<div style="display: flex; align-items: flex-start;">
<img src="{img_data}" alt="Page {page_no}" style="max-width: 300px; margin-right: 20px;">
<div style="flex-grow: 1;">
<textarea id="{textarea_id}" rows="10" style="width: 100%;">{text}</textarea>
<button onclick="{copy_js}" style="margin-top: 5px;">Copy</button>
</div>
</div>
</div>
''')

    # Close the pages div
    parts.append('</div>')
    return "".join(parts)
78
 
# Build the Gradio UI: a title, a usage hint, the upload/trigger controls and
# an HTML pane that receives whatever process_pdf returns.
# NOTE(review): the original indentation was lost, so which components sit
# inside gr.Row() is reconstructed (upload + button) -- confirm against app.py.
with gr.Blocks(title="PDF Text Extractor") as demo:
    gr.Markdown("# PDF Text Extractor")
    gr.Markdown(
        "Upload a PDF file and click 'Extract Text' to see each page's image and extracted text."
    )

    with gr.Row():
        pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
        submit_btn = gr.Button("Extract Text")

    output_html = gr.HTML()

    # Clicking the button sends the uploaded file to process_pdf and shows the
    # returned HTML in the output pane.
    submit_btn.click(
        fn=process_pdf,
        inputs=pdf_input,
        outputs=output_html,
    )

# Start serving the interface
demo.launch()
 
10
  model = AutoModelForCausalLM.from_pretrained("allenai/olmOCR-7B-0225-preview")
11
 
def _page_data_uri(page):
    """Return *page* encoded as a base64 PNG ``data:`` URI usable as an ``<img src>``."""
    buffered = io.BytesIO()
    page.save(buffered, format="PNG")
    encoded = base64.b64encode(buffered.getvalue()).decode()
    return f"data:image/png;base64,{encoded}"


def _extract_page_text(page):
    """Run the OCR model on *page* and return the decoded text, or an error message."""
    try:
        inputs = processor(text="Extract the text from this image.", images=page, return_tensors="pt")
        # NOTE(review): no generation-length argument is passed, so the output
        # length is whatever the model's generation config allows -- confirm
        # that long pages are not truncated.
        outputs = model.generate(**inputs)
        return processor.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        # Best effort: a failure on one page is reported in that page's text
        # box instead of aborting the whole document.
        return f"Error extracting text: {str(e)}"


def process_pdf(pdf_file):
    """
    Process the uploaded PDF file, extract text from each page, and generate HTML
    to display each page's image and text with copy buttons.

    Args:
        pdf_file: The uploaded file. Either an object whose ``name`` attribute
            is the path of the PDF on disk, a plain path, or ``None`` when
            nothing was uploaded.

    Returns:
        str: An HTML fragment. On success it holds a "Copy All" button followed
        by one section per page (page image, editable text box, "Copy" button).
        If no file was given or the PDF cannot be converted, a single ``<p>``
        element with a message is returned instead.
    """
    # Imported locally under a distinct name: the file's import block is not
    # part of this block, and `html` is easily shadowed by local variables.
    from html import escape

    # Check if a PDF file was uploaded
    if pdf_file is None:
        return "<p>Please upload a PDF file.</p>"

    # Convert PDF to images
    try:
        # Accept both upload objects exposing `.name` and bare path strings.
        pages = convert_from_path(getattr(pdf_file, "name", pdf_file))
    except Exception as e:
        # The message may contain file names or markup-like characters.
        return f"<p>Error converting PDF to images: {escape(str(e))}</p>"

    # The click handlers are written inline rather than in a <script> element:
    # browsers do not execute <script> elements that are inserted through
    # innerHTML, which appears to be how gr.HTML renders its value, so
    # functions defined that way would be undefined when a button is clicked.
    copy_all_js = (
        "navigator.clipboard.writeText("
        "Array.from(document.querySelectorAll('#pages textarea'))"
        ".map(function(t){return t.value;}).join('\\n\\n'))"
    )

    # Collect fragments and join once instead of growing a string in the loop.
    parts = [
        f'<div><button onclick="{copy_all_js}" style="margin-bottom: 10px;">Copy All</button></div>'
        '<div id="pages">'
    ]

    # Process each page
    for page_no, page in enumerate(pages, start=1):
        img_data = _page_data_uri(page)
        # Escape the OCR result so characters such as "<" or a literal
        # "</textarea>" in the page text cannot break or inject markup.
        text = escape(_extract_page_text(page))
        textarea_id = f"text{page_no}"
        copy_js = (
            f"var t=document.getElementById('{textarea_id}');"
            "t.select();document.execCommand('copy');"
        )
        parts.append(f'''
<div class="page" style="margin-bottom: 20px; border-bottom: 1px solid #ccc; padding-bottom: 20px;">
<h3>Page {page_no}</h3>
<div style="display: flex; align-items: flex-start;">
<img src="{img_data}" alt="Page {page_no}" style="max-width: 300px; margin-right: 20px;">
<div style="flex-grow: 1;">
<textarea id="{textarea_id}" rows="10" style="width: 100%;">{text}</textarea>
<button onclick="{copy_js}" style="margin-top: 5px;">Copy</button>
</div>
</div>
</div>
''')

    # Close the pages div
    parts.append('</div>')
    return "".join(parts)
78
 
# Build the Gradio UI: a title, a usage hint, the upload/trigger controls and
# an HTML pane that receives whatever process_pdf returns.
# NOTE(review): the original indentation was lost, so which components sit
# inside gr.Row() is reconstructed (upload + button) -- confirm against app.py.
with gr.Blocks(title="PDF Text Extractor") as demo:
    gr.Markdown("# PDF Text Extractor")
    gr.Markdown(
        "Upload a PDF file and click 'Extract Text' to see each page's image and extracted text."
    )

    with gr.Row():
        pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
        submit_btn = gr.Button("Extract Text")

    output_html = gr.HTML()

    # Clicking the button sends the uploaded file to process_pdf and shows the
    # returned HTML in the output pane.
    submit_btn.click(
        fn=process_pdf,
        inputs=pdf_input,
        outputs=output_html,
    )

# Start serving the interface
demo.launch()