Deadmon committed on
Commit
36ada58
·
verified ·
1 Parent(s): d1e4811

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +85 -74
app.py CHANGED
@@ -2,9 +2,7 @@ import os
2
  from pathlib import Path
3
  import fitz # PyMuPDF for PDF handling
4
  from PIL import Image
5
- import pytesseract # For OCR
6
  from transformers import BlipProcessor, BlipForConditionalGeneration # For image captioning
7
- import io
8
  import torch
9
  import gradio as gr
10
 
@@ -12,56 +10,59 @@ import gradio as gr
12
  OUTPUT_DIR = Path("outputs")
13
  OUTPUT_DIR.mkdir(exist_ok=True)
14
 
15
- def pdf_to_images(pdf_path):
16
  """
17
- Convert PDF pages to appropriately sized images
18
  """
19
  try:
20
  # Open the PDF
21
  pdf_document = fitz.open(pdf_path)
22
- images = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
- for page_num in range(len(pdf_document)):
25
- page = pdf_document[page_num]
26
-
27
- # Get the page dimensions to determine appropriate resolution
28
- rect = page.rect
29
- width = rect.width
30
- height = rect.height
31
-
32
- # Calculate appropriate zoom factor to get good quality images
33
- # Aim for approximately 2000 pixels on the longest side
34
- zoom = 2000 / max(width, height)
35
-
36
- # Create a transformation matrix
37
- mat = fitz.Matrix(zoom, zoom)
38
-
39
- # Render page to an image
40
- pix = page.get_pixmap(matrix=mat)
41
-
42
- # Convert to PIL Image
43
- img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
44
-
45
- # Save image
46
- image_path = OUTPUT_DIR / f"page_{page_num + 1}.png"
47
- img.save(image_path, "PNG")
48
- images.append((image_path, img))
49
-
50
  pdf_document.close()
51
- return images
52
  except Exception as e:
53
- print(f"Error converting PDF to images: {str(e)}")
54
- return []
55
 
56
- def extract_text_from_image(image):
57
  """
58
- Extract text from an image using OCR
59
  """
60
  try:
61
- text = pytesseract.image_to_string(image)
 
 
 
 
 
 
 
62
  return text.strip()
63
  except Exception as e:
64
- print(f"Error during OCR: {str(e)}")
65
  return ""
66
 
67
  def analyze_image(image_path):
@@ -91,43 +92,53 @@ def process_pdf(pdf_path, output_txt_path):
91
  """
92
  Main function to process the PDF and generate output
93
  """
94
- # Convert PDF to images
95
- print("Converting PDF to images...")
96
- images = pdf_to_images(pdf_path)
97
-
98
- if not images:
99
- print("No images were generated from the PDF.")
100
- return
101
-
102
- # Prepare output file
103
- with open(output_txt_path, 'w', encoding='utf-8') as f:
104
- f.write(f"Analysis of {os.path.basename(pdf_path)}\n")
105
- f.write("=" * 50 + "\n\n")
106
 
107
- # Process each page
108
- for page_num, (image_path, image) in enumerate(images, 1):
109
- print(f"Processing page {page_num}...")
110
-
111
- # Write page header
112
- f.write(f"Page {page_num}\n")
113
- f.write("-" * 30 + "\n\n")
114
-
115
- # Extract and write text
116
- text = extract_text_from_image(image)
117
- if text:
118
- f.write("Extracted Text:\n")
119
- f.write(text)
120
- f.write("\n\n")
121
- else:
122
- f.write("No text could be extracted from this page.\n\n")
123
 
124
- # Analyze image and write description
125
- description = analyze_image(image_path)
126
- f.write("Image Description:\n")
127
- f.write(f"{description}\n")
128
- f.write("\n" + "=" * 50 + "\n\n")
129
-
130
- print(f"Processing complete. Results saved to {output_txt_path}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
 
132
  def process_uploaded_pdf(pdf_file):
133
  if pdf_file is None:
@@ -148,7 +159,7 @@ interface = gr.Interface(
148
  inputs=gr.File(label="Upload PDF"),
149
  outputs=gr.Textbox(label="Analysis Results"),
150
  title="PDF Analyzer",
151
- description="Upload a PDF file to extract text and analyze images."
152
  )
153
 
154
  interface.launch()
 
2
  from pathlib import Path
3
  import fitz # PyMuPDF for PDF handling
4
  from PIL import Image
 
5
  from transformers import BlipProcessor, BlipForConditionalGeneration # For image captioning
 
6
  import torch
7
  import gradio as gr
8
 
 
10
  OUTPUT_DIR = Path("outputs")
11
  OUTPUT_DIR.mkdir(exist_ok=True)
12
 
13
+ def generate_page_image(pdf_path, page_num):
14
  """
15
+ Generate an image from a specific PDF page for analysis
16
  """
17
  try:
18
  # Open the PDF
19
  pdf_document = fitz.open(pdf_path)
20
+ page = pdf_document[page_num]
21
+
22
+ # Get the page dimensions to determine appropriate resolution
23
+ rect = page.rect
24
+ width = rect.width
25
+ height = rect.height
26
+
27
+ # Calculate appropriate zoom factor to get good quality images
28
+ # Aim for approximately 2000 pixels on the longest side
29
+ zoom = 2000 / max(width, height)
30
+
31
+ # Create a transformation matrix
32
+ mat = fitz.Matrix(zoom, zoom)
33
+
34
+ # Render page to an image
35
+ pix = page.get_pixmap(matrix=mat)
36
+
37
+ # Convert to PIL Image
38
+ img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
39
+
40
+ # Save image
41
+ image_path = OUTPUT_DIR / f"page_{page_num + 1}.png"
42
+ img.save(image_path, "PNG")
43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  pdf_document.close()
45
+ return image_path
46
  except Exception as e:
47
+ print(f"Error generating image for page {page_num + 1}: {str(e)}")
48
+ return None
49
 
50
+ def extract_text_from_pdf(pdf_path, page_num):
51
  """
52
+ Extract text directly from a specific PDF page
53
  """
54
  try:
55
+ # Open the PDF
56
+ pdf_document = fitz.open(pdf_path)
57
+ page = pdf_document[page_num]
58
+
59
+ # Extract text
60
+ text = page.get_text("text")
61
+
62
+ pdf_document.close()
63
  return text.strip()
64
  except Exception as e:
65
+ print(f"Error extracting text from page {page_num + 1}: {str(e)}")
66
  return ""
67
 
68
  def analyze_image(image_path):
 
92
  """
93
  Main function to process the PDF and generate output
94
  """
95
+ try:
96
+ # Open the PDF to get page count
97
+ pdf_document = fitz.open(pdf_path)
98
+ num_pages = len(pdf_document)
99
+ pdf_document.close()
 
 
 
 
 
 
 
100
 
101
+ if num_pages == 0:
102
+ print("The PDF is empty.")
103
+ return
104
+
105
+ # Prepare output file
106
+ with open(output_txt_path, 'w', encoding='utf-8') as f:
107
+ f.write(f"Analysis of {os.path.basename(pdf_path)}\n")
108
+ f.write("=" * 50 + "\n\n")
 
 
 
 
 
 
 
 
109
 
110
+ # Process each page
111
+ for page_num in range(num_pages):
112
+ print(f"Processing page {page_num + 1}...")
113
+
114
+ # Write page header
115
+ f.write(f"Page {page_num + 1}\n")
116
+ f.write("-" * 30 + "\n\n")
117
+
118
+ # Extract and write text
119
+ text = extract_text_from_pdf(pdf_path, page_num)
120
+ if text:
121
+ f.write("Extracted Text:\n")
122
+ f.write(text)
123
+ f.write("\n\n")
124
+ else:
125
+ f.write("No text could be extracted from this page.\n\n")
126
+
127
+ # Generate image for analysis and write description
128
+ image_path = generate_page_image(pdf_path, page_num)
129
+ if image_path:
130
+ description = analyze_image(image_path)
131
+ f.write("Image Description:\n")
132
+ f.write(f"{description}\n")
133
+ f.write("\n" + "=" * 50 + "\n\n")
134
+ else:
135
+ f.write("Image Description:\n")
136
+ f.write("Could not generate image for analysis.\n")
137
+ f.write("\n" + "=" * 50 + "\n\n")
138
+
139
+ print(f"Processing complete. Results saved to {output_txt_path}")
140
+ except Exception as e:
141
+ print(f"Error processing PDF: {str(e)}")
142
 
143
  def process_uploaded_pdf(pdf_file):
144
  if pdf_file is None:
 
159
  inputs=gr.File(label="Upload PDF"),
160
  outputs=gr.Textbox(label="Analysis Results"),
161
  title="PDF Analyzer",
162
+ description="Upload a PDF file to extract text directly and analyze images."
163
  )
164
 
165
  interface.launch()