Ralqasimi committed on
Commit
a377365
·
verified ·
1 Parent(s): 2e215e3

Update pdf_extractor.py

Browse files
Files changed (1) hide show
  1. pdf_extractor.py +7 -20
pdf_extractor.py CHANGED
@@ -1,26 +1,13 @@
1
- import fitz # PyMuPDF for normal PDFs
2
- import pytesseract
3
- from pdf2image import convert_from_path
4
 
5
- # Extract text from normal PDFs
6
  def extract_text_from_pdf(pdf_path):
 
7
  text = ""
8
- doc = fitz.open(pdf_path)
9
- for page in doc:
10
- text += page.get_text("text") + "\n"
11
  return text.strip()
12
 
13
- # Extract text from scanned PDFs using OCR
14
- def extract_text_from_scanned_pdf(pdf_path):
15
- images = convert_from_path(pdf_path)
16
- text = ""
17
- for img in images:
18
- text += pytesseract.image_to_string(img, lang="ara") + "\n"
19
- return text.strip()
20
-
21
- # Main function to extract text from both normal and scanned PDFs
22
  def get_pdf_text(pdf_path):
23
- text = extract_text_from_pdf(pdf_path)
24
- if not text.strip(): # If it's empty, use OCR
25
- text = extract_text_from_scanned_pdf(pdf_path)
26
- return text
 
1
+ from PyPDF2 import PdfReader
 
 
2
 
3
# Extract text from text-based PDFs. No OCR is performed here, so pages that
# are purely scanned images are expected to contribute little or no text.
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of a PDF.

    Args:
        pdf_path: Location of the PDF; passed straight to ``PdfReader``.

    Returns:
        The text of all pages joined with newlines, with leading and
        trailing whitespace stripped. An empty string is returned when no
        page yields any text.

    Raises:
        Whatever ``PdfReader`` or ``extract_text`` raise for a missing,
        unreadable, or malformed file; nothing is caught here.
    """
    reader = PdfReader(pdf_path)
    # Join pages with a newline so the last word of one page is not fused
    # with the first word of the next. ``or ""`` is a defensive guard in case
    # a page yields no text object at all.
    page_texts = [page.extract_text() or "" for page in reader.pages]
    return "\n".join(page_texts).strip()
10
 
11
# Public entry point for PDF text extraction.
def get_pdf_text(pdf_path):
    """Return the text extracted from the PDF at ``pdf_path``.

    This simply delegates to ``extract_text_from_pdf`` and hands back its
    result unchanged; no OCR fallback is attempted here.
    """
    extracted = extract_text_from_pdf(pdf_path)
    return extracted