Chatbot / pdf_extractor.py
Ralqasimi's picture
Update pdf_extractor.py
4491ff6 verified
raw
history blame
820 Bytes
import fitz # PyMuPDF for normal PDFs
import pytesseract
from pdf2image import convert_from_path
# Extract text from normal PDFs
def extract_text_from_pdf(pdf_path) -> str:
    """Return the embedded text of every page in the PDF at *pdf_path*.

    The file is opened with PyMuPDF (``fitz``) and each page's plain-text
    layer is read with ``page.get_text("text")``.  Page texts are joined
    with a newline and the combined result is stripped of leading and
    trailing whitespace.

    Args:
        pdf_path: Location of the PDF file, passed straight to ``fitz.open``.

    Returns:
        The concatenated page text; an empty string when no page yields
        any non-whitespace text.
    """
    # Use the document as a context manager so the underlying file handle
    # is released even if reading a page raises.
    with fitz.open(pdf_path) as doc:
        page_texts = [page.get_text("text") for page in doc]
    return "\n".join(page_texts).strip()
# Extract text from scanned PDFs using OCR
def extract_text_from_scanned_pdf(pdf_path, *, lang="ara") -> str:
    """Return OCR text for every page of the PDF at *pdf_path*.

    The PDF is converted to a sequence of images with
    ``pdf2image.convert_from_path`` and each image is passed to
    ``pytesseract.image_to_string``.  Per-page results are joined with a
    newline and the combined result is stripped of leading and trailing
    whitespace.

    Args:
        pdf_path: Location of the PDF file, passed straight to
            ``convert_from_path``.
        lang: Language code handed to Tesseract.  Defaults to ``"ara"``,
            the value this function previously hard-coded, so existing
            callers are unaffected.

    Returns:
        The concatenated OCR text; an empty string when nothing but
        whitespace is recognised.
    """
    page_images = convert_from_path(pdf_path)
    recognised = [
        pytesseract.image_to_string(image, lang=lang) for image in page_images
    ]
    return "\n".join(recognised).strip()
# Main function to extract text from both normal and scanned PDFs
def get_pdf_text(pdf_path):
    """Return the text of the PDF at *pdf_path*, falling back to OCR.

    The embedded text layer is tried first via ``extract_text_from_pdf``.
    When that yields only whitespace, the result of
    ``extract_text_from_scanned_pdf`` is returned instead.

    Args:
        pdf_path: Location of the PDF file, forwarded unchanged to both
            extraction helpers.

    Returns:
        The text produced by whichever extraction path was used.
    """
    embedded = extract_text_from_pdf(pdf_path)
    if embedded.strip():
        return embedded
    # Nothing usable in the text layer, so run OCR on the file instead.
    return extract_text_from_scanned_pdf(pdf_path)