File size: 2,356 Bytes
8a8a96a
 
 
 
 
 
 
 
fb39985
 
 
 
 
8a8a96a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import os
import fitz  # PyMuPDF
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
from abc import ABC, abstractmethod



# Tell pytesseract which Tesseract executable to invoke. The path is
# hard-coded, so this module-level assignment runs on import and every OCR
# call below uses the binary at /usr/bin/tesseract.
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"



# Abstract base class (interface) shared by every file processor
class FileProcessor(ABC):
    """Common interface for objects that pull text out of a file."""

    @abstractmethod
    def extract_text(self, file_path):
        """Return the text contained in the file at ``file_path``.

        Concrete subclasses must override this method; the base
        implementation does nothing and returns ``None``.
        """


# PDF Processor (handles embedded text + OCR for scanned PDFs)
class PDFProcessor(FileProcessor):
    """Extract text from PDFs, adding OCR output for pages that embed images."""

    def extract_text(self, pdf_path):
        """Return the text of every page in the PDF at ``pdf_path``.

        For each page the embedded text layer is read with PyMuPDF. When the
        page also references at least one image, that page is rendered with
        pdf2image and run through Tesseract, and the OCR result is placed
        after the embedded text. Pages are separated by a blank line.

        Args:
            pdf_path: Path of the PDF file to read.

        Returns:
            The combined text with surrounding whitespace stripped, or the
            string "No text found in PDF." when no page yields any text.
        """
        page_texts = []

        # The context manager closes the document even if rendering or OCR
        # raises part-way through.
        with fitz.open(pdf_path) as doc:
            # pdf2image page numbers are 1-based, hence start=1.
            for page_number, page in enumerate(doc, start=1):
                embedded_text = page.get_text("text").strip()

                # NOTE: the whole rendered page is OCR'd, not just the image
                # regions, so text already present in the text layer may
                # appear a second time in the OCR output.
                ocr_chunks = []
                if page.get_images(full=True):
                    rendered_pages = convert_from_path(
                        pdf_path,
                        first_page=page_number,
                        last_page=page_number,
                    )
                    ocr_chunks = [
                        pytesseract.image_to_string(img).strip()
                        for img in rendered_pages
                    ]

                ocr_text = "\n".join(ocr_chunks)
                page_texts.append(f"{embedded_text}\n{ocr_text}".strip())

        # Strip BEFORE testing for emptiness: the page separators alone would
        # otherwise make the result truthy and hide the fallback message.
        text = "\n\n".join(page_texts).strip()
        return text or "No text found in PDF."


# Image Processor (OCR)
class ImageProcessor(FileProcessor):
    """Extract text from an image file with Tesseract OCR."""

    def extract_text(self, image_path):
        """Return the OCR text of the image at ``image_path``.

        Args:
            image_path: Path of the image file to read.

        Returns:
            The recognised text with surrounding whitespace stripped, or the
            string "No text found in Image." when OCR produces nothing.
        """
        # Image.open leaves the underlying file open; the context manager
        # releases it as soon as OCR has finished.
        with Image.open(image_path) as img:
            text = pytesseract.image_to_string(img).strip()
        return text or "No text found in Image."


# Factory that selects the right processor for a file
class FileProcessorFactory:
    """Look up the processor that handles a file, keyed by its extension."""

    # Lower-case extension -> processor instance. Each image extension gets
    # its own ImageProcessor object; the instances are created once, when the
    # class body is executed.
    _processors = {
        ".pdf": PDFProcessor(),
        **{suffix: ImageProcessor() for suffix in (".png", ".jpg", ".jpeg")},
    }

    @classmethod
    def get_processor(cls, file_path):
        """Return the processor registered for ``file_path``'s extension.

        The extension is compared case-insensitively. ``None`` is returned
        when no processor is registered for it.
        """
        _, extension = os.path.splitext(file_path)
        return cls._processors.get(extension.lower())


# Unified entry point for reading any supported file
def read_file(file_path):
    """Extract text from ``file_path`` using the processor for its extension.

    Returns the processor's extracted text, or an
    "Unsupported file format: ..." message when the factory has no processor
    for the file's extension.
    """
    handler = FileProcessorFactory.get_processor(file_path)

    # Guard clause: nothing registered for this extension.
    if not handler:
        return f"Unsupported file format: {file_path}"

    return handler.extract_text(file_path)