|
import fitz |
|
import pytesseract |
|
from pdf2image import convert_from_path |
|
|
|
|
|
def extract_text_from_pdf(pdf_path) -> str:
    """Return the embedded text layer of a PDF, one page after another.

    The file is opened with ``fitz.open`` and each page's plain text
    (``page.get_text("text")``) is collected, with a newline after every
    page. Leading and trailing whitespace is stripped from the result.

    Args:
        pdf_path: Location of the PDF, passed straight to ``fitz.open``.

    Returns:
        The concatenated page text; an empty string when no page yields
        any non-whitespace text.
    """
    # The context manager closes the document even if a page fails to
    # extract, so the underlying file handle is never leaked.
    with fitz.open(pdf_path) as doc:
        page_texts = [page.get_text("text") for page in doc]
    # A newline follows every page (including the last); strip() then
    # removes the outer whitespace.
    return "".join(f"{page_text}\n" for page_text in page_texts).strip()
|
|
|
|
|
def extract_text_from_scanned_pdf(pdf_path, *, lang="ara") -> str:
    """Run OCR over every page of a PDF and return the recognised text.

    The PDF is rendered to images with ``convert_from_path`` and each image
    is passed to ``pytesseract.image_to_string``. A newline is placed after
    each page's text, and the combined result is stripped of leading and
    trailing whitespace.

    Args:
        pdf_path: Location of the PDF, passed straight to
            ``convert_from_path``.
        lang: Language code forwarded to ``pytesseract.image_to_string``.
            Defaults to ``"ara"``, the value this function previously
            hard-coded, so existing callers are unaffected.

    Returns:
        The concatenated OCR output; an empty string when nothing but
        whitespace is recognised.
    """
    page_images = convert_from_path(pdf_path)
    recognised = (
        pytesseract.image_to_string(image, lang=lang) for image in page_images
    )
    # A newline follows every page (including the last); strip() then
    # removes the outer whitespace.
    return "".join(f"{page_text}\n" for page_text in recognised).strip()
|
|
|
|
|
def get_pdf_text(pdf_path):
    """Return the text of a PDF, using OCR only when direct extraction is blank.

    ``extract_text_from_pdf`` is tried first. If its result contains any
    non-whitespace characters it is returned as-is; otherwise the result of
    ``extract_text_from_scanned_pdf`` for the same path is returned.
    """
    embedded = extract_text_from_pdf(pdf_path)
    if embedded.strip():
        return embedded
    # Direct extraction produced nothing usable, so try the OCR route.
    return extract_text_from_scanned_pdf(pdf_path)