Prashasst committed on
Commit
42ba117
·
verified ·
1 Parent(s): 0eb6eb5

Update app.py

Browse files

fixed minor bugs

Files changed (1) hide show
  1. app.py +27 -12
app.py CHANGED
@@ -1,35 +1,50 @@
1
  import gradio as gr
2
  import pandas as pd
3
- import os
4
- import fitz
5
  import pytesseract
 
 
6
  import base64
7
  from google import genai
8
  from google.genai import types
9
 
10
- googel_api=os.getenv("google_api")
 
11
 
12
 
13
 
14
  def read_pdf(pdf_path):
15
  text = ""
16
  doc = fitz.open(pdf_path)
 
17
  for page_num in range(len(doc)):
18
  page = doc.load_page(page_num)
19
- page_text = page.get_text()
20
- if page_text.strip():
21
- text += page_text + "\n"
22
- else:
23
- # print(f"Image found in Page {page_num + 1} Performing OCR...")
24
- images = convert_from_path(pdf_path, first_page=page_num + 1, last_page=page_num + 1)
25
- for img in images:
26
- text += pytesseract.image_to_string(img) + "\n"
27
- # print(f"Extracted text preview:\n{text[:600]}...")
 
 
 
 
 
 
 
 
 
 
28
  return text.strip()
29
 
30
 
31
 
32
 
 
 
33
  def generate(extracted_text):
34
  client = genai.Client(
35
  api_key=google_api,
 
1
  import gradio as gr
2
  import pandas as pd
3
+ import fitz # PyMuPDF
 
4
  import pytesseract
5
+ from pdf2image import convert_from_path
6
+ import os
7
  import base64
8
  from google import genai
9
  from google.genai import types
10
 
11
+ google_api=os.getenv("google_api")
12
+
13
 
14
 
15
 
16
  def read_pdf(pdf_path):
17
  text = ""
18
  doc = fitz.open(pdf_path)
19
+
20
  for page_num in range(len(doc)):
21
  page = doc.load_page(page_num)
22
+ page_text = page.get_text("text").strip() # Extract text from page
23
+
24
+ # Extract Images for OCR
25
+ images = page.get_images(full=True) # Check if the page has images
26
+
27
+ ocr_text = ""
28
+ if images: # If images exist, process them
29
+ print(f"Page {page_num + 1} contains images, performing OCR...")
30
+ img_pages = convert_from_path(pdf_path, first_page=page_num + 1, last_page=page_num + 1)
31
+
32
+ for img in img_pages:
33
+ ocr_text += pytesseract.image_to_string(img).strip() + "\n"
34
+
35
+ # Combine both text extraction methods
36
+ combined_text = f"{page_text}\n{ocr_text}".strip()
37
+
38
+ if combined_text:
39
+ text += combined_text + "\n\n"
40
+
41
  return text.strip()
42
 
43
 
44
 
45
 
46
+
47
+
48
  def generate(extracted_text):
49
  client = genai.Client(
50
  api_key=google_api,