Update app.py
Browse files
fixed minor bugs
app.py
CHANGED
@@ -1,35 +1,50 @@
|
|
1 |
import gradio as gr
|
2 |
import pandas as pd
|
3 |
-
import
|
4 |
-
import fitz
|
5 |
import pytesseract
|
|
|
|
|
6 |
import base64
|
7 |
from google import genai
|
8 |
from google.genai import types
|
9 |
|
10 |
-
|
|
|
11 |
|
12 |
|
13 |
|
14 |
def read_pdf(pdf_path):
|
15 |
text = ""
|
16 |
doc = fitz.open(pdf_path)
|
|
|
17 |
for page_num in range(len(doc)):
|
18 |
page = doc.load_page(page_num)
|
19 |
-
page_text = page.get_text()
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
return text.strip()
|
29 |
|
30 |
|
31 |
|
32 |
|
|
|
|
|
33 |
def generate(extracted_text):
|
34 |
client = genai.Client(
|
35 |
api_key=google_api,
|
|
|
1 |
import gradio as gr
|
2 |
import pandas as pd
|
3 |
+
import fitz # PyMuPDF
|
|
|
4 |
import pytesseract
|
5 |
+
from pdf2image import convert_from_path
|
6 |
+
import os
|
7 |
import base64
|
8 |
from google import genai
|
9 |
from google.genai import types
|
10 |
|
11 |
+
# API key handed to genai.Client(api_key=...) in generate(); read from the
# "google_api" environment variable (os.getenv yields None when it is unset).
google_api=os.getenv("google_api")
|
12 |
+
|
13 |
|
14 |
|
15 |
|
16 |
def read_pdf(pdf_path):
    """Extract the text of a PDF, adding OCR output for pages that embed images.

    For every page the embedded text layer is read with PyMuPDF. If the page
    reports at least one image, the page is additionally rendered with
    pdf2image and run through Tesseract, and the OCR result is appended after
    the text-layer content of that page.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The non-empty page texts joined by a blank line, with no leading or
        trailing whitespace. An empty string when nothing could be extracted.
    """
    page_chunks = []

    # The context manager closes the document even if extraction or OCR
    # raises part-way through; the handle was previously never released.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            page_text = page.get_text("text").strip()

            ocr_text = ""
            if page.get_images(full=True):
                print(f"Page {page_num} contains images, performing OCR...")
                # pdf2image page numbers are 1-based, matching page_num.
                # NOTE(review): this rasterises the whole page, so text that
                # get_text() already returned may show up again in the OCR
                # output -- confirm the duplication is acceptable.
                img_pages = convert_from_path(
                    pdf_path, first_page=page_num, last_page=page_num
                )
                ocr_text = "\n".join(
                    pytesseract.image_to_string(img).strip() for img in img_pages
                )

            # Text layer first, OCR second; strip drops the separator when
            # either part is empty.
            combined_text = f"{page_text}\n{ocr_text}".strip()
            if combined_text:
                page_chunks.append(combined_text)

    return "\n\n".join(page_chunks)
|
42 |
|
43 |
|
44 |
|
45 |
|
46 |
+
|
47 |
+
|
48 |
def generate(extracted_text):
|
49 |
client = genai.Client(
|
50 |
api_key=google_api,
|