File size: 3,556 Bytes
8a8a96a 6977359 6cd7c3b 8a8a96a 6cd7c3b 8a8a96a fb39985 6cd7c3b 8a8a96a 6977359 8a8a96a 6cd7c3b 8a8a96a 6977359 6cd7c3b 6977359 6cd7c3b 6977359 6cd7c3b 8a8a96a 6977359 6cd7c3b 6977359 6cd7c3b 6977359 6cd7c3b 8a8a96a 6cd7c3b 6977359 8a8a96a 6977359 8a8a96a 6977359 8a8a96a 6cd7c3b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 |
import os
import fitz # PyMuPDF
import pytesseract
import easyocr
import numpy as np
from pdf2image import convert_from_path
from PIL import Image
from abc import ABC, abstractmethod
from paddleocr import PaddleOCR
from utils import measure_time
# Tell pytesseract which Tesseract executable to invoke (hard-coded absolute path).
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
# Abstract base class (interface)
class FileProcessor(ABC):
    """Abstract interface for extracting text from a single file.

    Each concrete processor implements :meth:`extract_text` for one family
    of file formats.
    """

    @abstractmethod
    def extract_text(self, file_path, reader=None):
        """Extract and return the text contained in ``file_path``.

        Args:
            file_path: Path of the file to read.
            reader: OCR engine instance handed to the processor. It defaults
                to ``None`` so that callers and overrides written against
                the older one-argument form keep working.

        Returns:
            The extracted text as a string.
        """
# PDF processor (handles embedded text plus OCR for scanned PDFs)
class PDFProcessor(FileProcessor):
    """Extract text from a PDF by combining its text layer with OCR output."""

    def extract_text(self, pdf_path, reader):
        """Return the text of every page in ``pdf_path``.

        For each page the embedded text is read with PyMuPDF. If the page
        references any images, the page is also rasterized and run through
        ``reader`` so that scanned content is captured; the OCR text is
        appended after the embedded text of that page.

        Args:
            pdf_path: Path of the PDF file.
            reader: OCR engine exposing ``ocr(image, cls=True)``. The result
                is indexed as ``result[i][j][1][0]`` to obtain each text
                fragment (presumably PaddleOCR's layout; confirm against the
                installed version).

        Returns:
            The per-page texts separated by blank lines, or
            ``"No text found in PDF."`` when nothing could be extracted.
        """
        page_chunks = []
        # The context manager releases the document handle even if OCR raises.
        with fitz.open(pdf_path) as doc:
            for page_num, page in enumerate(doc):
                page_text = page.get_text("text").strip()
                ocr_text = ""
                # Only pay for rasterization + OCR when the page has images.
                if page.get_images(full=True):
                    ocr_text = self._ocr_page(pdf_path, page_num, reader)
                page_chunks.append(f"{page_text}\n{ocr_text}".strip())

        # Strip before testing emptiness: pages without any text still
        # contribute separators, which must not suppress the fallback.
        text = "\n\n".join(page_chunks).strip()
        return text or "No text found in PDF."

    @staticmethod
    def _ocr_page(pdf_path, page_num, reader):
        """Rasterize one page (0-based ``page_num``) and return its OCR text."""
        # pdf2image numbers pages from 1, PyMuPDF from 0.
        rendered_pages = convert_from_path(
            pdf_path, first_page=page_num + 1, last_page=page_num + 1
        )
        chunks = []
        for rendered in rendered_pages:
            # The OCR engine is given a numpy array rather than a PIL image.
            result = reader.ocr(np.array(rendered), cls=True)
            if not result:
                continue
            # A per-image entry may be None/empty when nothing was detected;
            # skip it instead of failing while iterating over it.
            fragments = [line[1][0] for res in result if res for line in res]
            chunks.append(" ".join(fragments) + "\n")
        return "".join(chunks)
# Image processor (OCR)
class ImageProcessor(FileProcessor):
    """Extract text from a single image file via OCR."""

    def extract_text(self, image_path, reader):
        """Run OCR on ``image_path`` and return the recognized text.

        Args:
            image_path: Path of the image file; passed to the engine as-is.
            reader: OCR engine exposing ``ocr(path, cls=True)``. The result
                is indexed as ``result[i][j][1][0]`` to obtain each text
                fragment (presumably PaddleOCR's layout; confirm against the
                installed version).

        Returns:
            The recognized fragments joined by single spaces, or
            ``"No text found in Image."`` when nothing was recognized.
        """
        print("Single Image")
        result = reader.ocr(image_path, cls=True)
        # A per-image entry may be None/empty when nothing was detected;
        # skip it instead of failing while iterating over it.
        fragments = [line[1][0] for res in (result or []) if res for line in res]
        if not fragments:
            return "No text found in Image."
        return " ".join(fragments)
# Factory that selects the right processor
class FileProcessorFactory:
    """Resolve a file path to the processor registered for its extension."""

    # Lower-case extension (including the leading dot) -> processor instance.
    _processors = {
        ".pdf": PDFProcessor(),
        ".png": ImageProcessor(),
        ".jpg": ImageProcessor(),
        ".jpeg": ImageProcessor(),
    }

    @classmethod
    def get_processor(cls, file_path):
        """Return the processor for ``file_path``, or ``None`` if unsupported.

        The match is made on the file extension, compared case-insensitively.
        """
        _, extension = os.path.splitext(file_path)
        key = extension.lower()
        if key in cls._processors:
            return cls._processors[key]
        return None
# Unified file-reading entry point
@measure_time
def read_file(file_path, reader):
    """Extract text from ``file_path`` with the processor for its extension.

    Args:
        file_path: Path of the file to read.
        reader: OCR engine instance forwarded to the processor.

    Returns:
        Whatever the selected processor's ``extract_text`` returns, or an
        ``"Unsupported file format: ..."`` message when no processor is
        registered for the file's extension.
    """
    handler = FileProcessorFactory.get_processor(file_path)
    if not handler:
        return f"Unsupported file format: {file_path}"
    return handler.extract_text(file_path, reader)
|