Spaces:

Ralqasimi
/

Chatbot

Sleeping

Ralqasimi committed on Feb 7

Commit

a377365

verified ·

1 Parent(s): 2e215e3

Update pdf_extractor.py

Files changed (1) hide show

pdf_extractor.py CHANGED Viewed

@@ -1,26 +1,13 @@
-import fitz  # PyMuPDF for normal PDFs
-import pytesseract
-from pdf2image import convert_from_path
-# Extract text from normal PDFs
 def extract_text_from_pdf(pdf_path):
     text = ""
-    doc = fitz.open(pdf_path)
-    for page in doc:
-        text += page.get_text("text") + "\n"
     return text.strip()
-# Extract text from scanned PDFs using OCR
-def extract_text_from_scanned_pdf(pdf_path):
-    images = convert_from_path(pdf_path)
-    text = ""
-    for img in images:
-        text += pytesseract.image_to_string(img, lang="ara") + "\n"
-    return text.strip()
-# Main function to extract text from both normal and scanned PDFs
 def get_pdf_text(pdf_path):
-    text = extract_text_from_pdf(pdf_path)
-    if not text.strip():  # If it's empty, use OCR
-        text = extract_text_from_scanned_pdf(pdf_path)
-    return text

+from PyPDF2 import PdfReader
+# Function to extract text from PDFs (normal PDFs only)
 def extract_text_from_pdf(pdf_path):
+    reader = PdfReader(pdf_path)
     text = ""
+    for page in reader.pages:
+        text += page.extract_text()
     return text.strip()
+# Main function to handle PDF text extraction
 def get_pdf_text(pdf_path):
+    return extract_text_from_pdf(pdf_path)