File size: 820 Bytes
4491ff6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
import fitz  # PyMuPDF for normal PDFs
import pytesseract
from pdf2image import convert_from_path

# Extract text from normal PDFs
def extract_text_from_pdf(pdf_path):
    """Return the embedded (non-OCR) text of every page in a PDF.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        The plain text of all pages joined by newlines, with leading and
        trailing whitespace stripped. The result is an empty string when
        the pages contain no non-whitespace text.
    """
    # The context manager closes the document (and its underlying file
    # handle) even if extraction fails part-way through the pages.
    with fitz.open(pdf_path) as doc:
        page_texts = [page.get_text("text") for page in doc]
    return "\n".join(page_texts).strip()

# Extract text from scanned PDFs using OCR
def extract_text_from_scanned_pdf(pdf_path, lang="ara"):
    """Return the text of a scanned PDF by running OCR on each page.

    Every page is rendered to an image with ``convert_from_path`` and then
    passed to ``pytesseract.image_to_string``.

    Args:
        pdf_path: Path of the PDF file to rasterize.
        lang: Tesseract language code forwarded to pytesseract. Defaults
            to ``"ara"`` (the value this function previously hard-coded).
            Tesseract accepts several codes joined with ``+``, such as
            ``"ara+eng"``; the matching language data must be installed.

    Returns:
        The recognized text of all pages joined by newlines, with leading
        and trailing whitespace stripped.
    """
    page_images = convert_from_path(pdf_path)
    recognized = (
        pytesseract.image_to_string(image, lang=lang) for image in page_images
    )
    return "\n".join(recognized).strip()

# Main function to extract text from both normal and scanned PDFs
def get_pdf_text(pdf_path):
    """Return the text of a PDF, using OCR only when direct extraction is blank.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The result of ``extract_text_from_pdf`` when it contains any
        non-whitespace characters; otherwise the result of
        ``extract_text_from_scanned_pdf`` for the same path.
    """
    embedded_text = extract_text_from_pdf(pdf_path)
    if embedded_text.strip():
        return embedded_text
    # Nothing usable in the text layer, so treat the file as a scan.
    return extract_text_from_scanned_pdf(pdf_path)