File size: 3,556 Bytes
8a8a96a
 
 
6977359
6cd7c3b
8a8a96a
 
 
 
6cd7c3b
 
8a8a96a
fb39985
 
 
 
6cd7c3b
 
 
 
8a8a96a
 
 
 
 
 
 
 
 
 
 
 
6977359
8a8a96a
 
 
 
 
 
6cd7c3b
8a8a96a
 
 
 
 
 
6977359
6cd7c3b
 
 
6977359
6cd7c3b
 
 
6977359
6cd7c3b
8a8a96a
 
 
 
 
 
 
 
 
 
6977359
 
6cd7c3b
6977359
6cd7c3b
 
 
6977359
6cd7c3b
 
 
8a8a96a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6cd7c3b
6977359
8a8a96a
6977359
8a8a96a
6977359
8a8a96a
 
6cd7c3b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import os
import fitz  # PyMuPDF
import pytesseract
import easyocr
import numpy as np
from pdf2image import convert_from_path
from PIL import Image
from abc import ABC, abstractmethod

from paddleocr import PaddleOCR
from utils import measure_time


# Point pytesseract at the Tesseract binary using a fixed absolute path.
# NOTE(review): every Tesseract call visible in this file is commented out in
# favour of PaddleOCR, so this setting looks unused here — confirm before
# removing, and confirm the path matches the deployment image.
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"






#Abstract Base Class (Interface)
class FileProcessor(ABC):
    """Abstract interface implemented by every file-type specific processor.

    A processor turns one file on disk into a plain-text string.
    """

    @abstractmethod
    def extract_text(self, file_path, reader=None):
        """Extract text from a file.

        Args:
            file_path: Path of the file to read.
            reader: OCR engine supplied by the caller. The concrete
                processors in this file call ``reader.ocr(image, cls=True)``
                on it. Defaults to ``None`` so callers that only pass
                ``file_path`` keep working.

        Returns:
            The extracted text as a string.
        """


#PDF Processor (Handles Text + OCR for Scanned PDFs)
class PDFProcessor(FileProcessor):
    """Extract text from PDFs by combining the embedded text layer with OCR.

    OCR is only attempted on pages that contain at least one image, which is
    how scanned pages are picked up.
    """

    def extract_text(self, pdf_path, reader):
        """Return the text of every page of ``pdf_path``.

        Args:
            pdf_path: Path of the PDF file.
            reader: OCR engine exposing ``ocr(image, cls=True)`` (PaddleOCR
                style result: one entry per image, each a list of
                ``[box, (text, confidence)]`` lines).

        Returns:
            The per-page texts separated by blank lines, or the string
            ``"No text found in PDF."`` when nothing could be extracted.
        """
        page_chunks = []

        # The context manager releases the document handle even when
        # rasterisation or OCR raises part-way through.
        with fitz.open(pdf_path) as doc:
            # pdf2image page numbers are 1-based, hence start=1.
            for page_number, page in enumerate(doc, start=1):
                page_text = page.get_text("text").strip()

                ocr_text = ""
                if page.get_images(full=True):
                    ocr_text = self._ocr_page(pdf_path, page_number, reader)

                # Embedded text first, OCR text after it.
                page_chunks.append(f"{page_text}\n{ocr_text}".strip())

        # Strip after joining so that pages without any text do not keep the
        # result "non-empty" and hide the fallback message.
        text = "\n\n".join(page_chunks).strip()
        return text if text else "No text found in PDF."

    @staticmethod
    def _ocr_page(pdf_path, page_number, reader):
        """Rasterise one page (1-based) and return its OCR text, one line per rendered image."""
        ocr_chunks = []
        rendered = convert_from_path(
            pdf_path, first_page=page_number, last_page=page_number
        )
        for img in rendered:
            # The OCR engine is fed a numpy array rather than a PIL image.
            result = reader.ocr(np.array(img), cls=True)
            if not result:
                continue
            # PaddleOCR can report an image without detected text as a None
            # entry; skip those instead of iterating over None.
            words = [line[1][0] for res in result if res for line in res]
            ocr_chunks.append(" ".join(words) + "\n")
        return "".join(ocr_chunks)


#Image Processor (OCR)
class ImageProcessor(FileProcessor):
    """Extract text from a single image file via OCR."""

    def extract_text(self, image_path, reader):
        """Run OCR on ``image_path`` and return the recognised text.

        Args:
            image_path: Path of the image file; passed to the OCR engine as is.
            reader: OCR engine exposing ``ocr(image, cls=True)`` (PaddleOCR
                style result: one entry per image, each a list of
                ``[box, (text, confidence)]`` lines).

        Returns:
            The recognised text fragments joined by single spaces, or the
            string ``"No text found in Image."`` when OCR yields nothing.
        """
        print("Single Image")

        text_ocr = reader.ocr(image_path, cls=True)  # PaddleOCR

        # PaddleOCR can report an image without detected text as a None
        # entry (or return nothing at all); skip those instead of crashing.
        words = [line[1][0] for res in (text_ocr or []) if res for line in res]
        text = " ".join(words)

        # Same convention as the PDF processor: a readable message rather
        # than an empty string when nothing was recognised.
        return text if text else "No text found in Image."


#Factory to Select the Right Processor
class FileProcessorFactory:
    """Look up the processor responsible for a file, keyed by its extension."""

    # Lower-case extension (with leading dot) -> processor instance.
    # Every image extension gets its own ImageProcessor instance.
    _processors = {
        ".pdf": PDFProcessor(),
        **{suffix: ImageProcessor() for suffix in (".png", ".jpg", ".jpeg")},
    }

    @classmethod
    def get_processor(cls, file_path):
        """Return the processor registered for ``file_path``'s extension.

        The extension is compared case-insensitively. ``None`` is returned
        when no processor is registered for it.
        """
        _, suffix = os.path.splitext(file_path)
        return cls._processors.get(suffix.lower())


#Unified File Reading Function
@measure_time
def read_file(file_path,reader):
    """Extract the text of ``file_path`` with the processor matching its extension.

    ``reader`` is handed through unchanged to the processor's
    ``extract_text``. When no processor is registered for the extension, an
    ``"Unsupported file format: ..."`` message is returned instead of text.
    """
    handler = FileProcessorFactory.get_processor(file_path)

    # Guard clause: unknown extensions are reported as a message, not raised.
    if not handler:
        return f"Unsupported file format: {file_path}"

    return handler.extract_text(file_path, reader)