Spaces:

Ralqasimi
/

Chatbot

Sleeping

File size: 820 Bytes

4491ff6

import fitz  # PyMuPDF for normal PDFs
import pytesseract
from pdf2image import convert_from_path

# Extract text from normal PDFs
def extract_text_from_pdf(pdf_path):
    """Return the embedded text of every page in a PDF, joined by newlines.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        The text of all pages separated by newlines, with leading and
        trailing whitespace stripped. The result is an empty string when no
        page yields any text.
    """
    # The context manager closes the document even if a page fails to parse,
    # so the underlying file handle is never leaked.
    with fitz.open(pdf_path) as doc:
        pages = [page.get_text("text") for page in doc]
    return "\n".join(pages).strip()

# Extract text from scanned PDFs using OCR
def extract_text_from_scanned_pdf(pdf_path, lang="ara"):
    """Return the text of a PDF by rendering its pages and running OCR.

    Args:
        pdf_path: Path of the PDF file; it is rendered to images with
            ``pdf2image.convert_from_path``.
        lang: Language code passed to ``pytesseract.image_to_string``.
            Defaults to ``"ara"``, the value this function previously
            hard-coded, so existing callers are unaffected.

    Returns:
        The OCR output of all pages separated by newlines, with leading and
        trailing whitespace stripped.
    """
    images = convert_from_path(pdf_path)
    page_texts = [pytesseract.image_to_string(img, lang=lang) for img in images]
    return "\n".join(page_texts).strip()

# Main function to extract text from both normal and scanned PDFs
def get_pdf_text(pdf_path):
    """Return the text of a PDF, using OCR only when no embedded text exists.

    Direct extraction via ``extract_text_from_pdf`` is tried first. If that
    result is empty or whitespace-only, the result of
    ``extract_text_from_scanned_pdf`` is returned instead.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The extracted text as a string.
    """
    embedded_text = extract_text_from_pdf(pdf_path)
    if embedded_text.strip():
        return embedded_text
    # Nothing extractable directly, so fall back to the OCR path.
    return extract_text_from_scanned_pdf(pdf_path)