Spaces:

Ralqasimi
/

Chatbot

Sleeping

Chatbot / pdf_extractor.py

Update pdf_extractor.py

4491ff6 verified 3 months ago

820 Bytes

	import fitz # PyMuPDF for normal PDFs
	import pytesseract
	from pdf2image import convert_from_path

	# Extract text from normal PDFs
	def extract_text_from_pdf(pdf_path):
	    """Return the embedded text of every page of a PDF.

	    Args:
	        pdf_path: Path of the PDF file, passed to ``fitz.open``.

	    Returns:
	        The text of all pages joined with newlines, with leading and
	        trailing whitespace stripped. The result is an empty string when
	        no page yields any non-whitespace text.
	    """
	    # The context manager closes the document even if reading a page raises,
	    # so the underlying file handle is not leaked.
	    with fitz.open(pdf_path) as doc:
	        page_texts = [page.get_text("text") for page in doc]
	    return "\n".join(page_texts).strip()

	# Extract text from scanned PDFs using OCR
	def extract_text_from_scanned_pdf(pdf_path, *, lang="ara"):
	    """Return the text of a PDF by running OCR on each rendered page.

	    Args:
	        pdf_path: Path of the PDF file, passed to ``convert_from_path``.
	        lang: Language code forwarded to ``pytesseract.image_to_string``.
	            Defaults to ``"ara"``, the value that used to be hard-coded.
	            Per the pytesseract documentation several languages can be
	            combined with ``+`` (for example ``"ara+eng"``).

	    Returns:
	        The OCR output of all pages joined with newlines, with leading and
	        trailing whitespace stripped.
	    """
	    page_images = convert_from_path(pdf_path)
	    page_texts = [
	        pytesseract.image_to_string(image, lang=lang) for image in page_images
	    ]
	    return "\n".join(page_texts).strip()

	# Main function to extract text from both normal and scanned PDFs
	def get_pdf_text(pdf_path):
	    """Return the text of a PDF, using OCR only when direct extraction is empty.

	    Args:
	        pdf_path: Path of the PDF file, handed to both extraction helpers.

	    Returns:
	        The result of ``extract_text_from_pdf`` when it contains any
	        non-whitespace character; otherwise the result of
	        ``extract_text_from_scanned_pdf``.
	    """
	    embedded_text = extract_text_from_pdf(pdf_path)
	    if embedded_text.strip():
	        return embedded_text
	    # Nothing usable came from direct extraction, so fall back to OCR.
	    return extract_text_from_scanned_pdf(pdf_path)