Prashasst committed on
Commit
8a8a96a
·
verified ·
1 Parent(s): a6e539a

Create file_processor.py

Browse files

refactored the code to make it scalable for more file types

Files changed (1) hide show
  1. file_processor.py +77 -0
file_processor.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import fitz # PyMuPDF
3
+ import pytesseract
4
+ from pdf2image import convert_from_path
5
+ from PIL import Image
6
+ from abc import ABC, abstractmethod
7
+
8
+
9
+ #Abstract Base Class (Interface)
10
class FileProcessor(ABC):
    """Interface for objects that extract text from a single file.

    Concrete subclasses implement :meth:`extract_text` for one file format.
    The class cannot be instantiated directly because ``extract_text`` is
    abstract.
    """

    @abstractmethod
    def extract_text(self, file_path: str) -> str:
        """Return the text extracted from the file at *file_path*."""
17
+
18
+
19
+ #PDF Processor (Handles Text + OCR for Scanned PDFs)
20
class PDFProcessor(FileProcessor):
    """Extract text from a PDF, adding OCR output for pages that contain images."""

    def extract_text(self, pdf_path: str) -> str:
        """Return the text of every page in the PDF at *pdf_path*.

        For each page the embedded text layer is read first. If the page
        reports any images, the page is additionally rendered and run
        through OCR, and the OCR output is appended after the page text.
        Pages are separated by a blank line.

        Returns:
            The combined text, or ``"No text found in PDF."`` when nothing
            but whitespace was extracted.
        """
        page_chunks = []

        # The context manager closes the document even if a page fails.
        with fitz.open(pdf_path) as doc:
            for page_index, page in enumerate(doc):
                page_text = page.get_text("text").strip()

                ocr_text = ""
                if page.get_images(full=True):
                    # pdf2image page numbers are 1-based.
                    ocr_text = self._ocr_page(pdf_path, page_index + 1)

                page_chunks.append(f"{page_text}\n{ocr_text}".strip())

        # Strip BEFORE testing for emptiness: blank pages still contribute
        # separators, so the unstripped string is non-empty for any PDF
        # that has at least one page.
        text = "\n\n".join(page_chunks).strip()
        return text if text else "No text found in PDF."

    @staticmethod
    def _ocr_page(pdf_path: str, page_number: int) -> str:
        """Render one page (1-based *page_number*) and return its OCR text."""
        rendered = convert_from_path(
            pdf_path, first_page=page_number, last_page=page_number
        )
        return "".join(
            pytesseract.image_to_string(image).strip() + "\n" for image in rendered
        )
42
+
43
+
44
+ #Image Processor (OCR)
45
class ImageProcessor(FileProcessor):
    """Extract text from an image file with OCR."""

    def extract_text(self, image_path: str) -> str:
        """Return the OCR text of the image at *image_path*.

        Returns:
            The recognised text with surrounding whitespace removed, or
            ``"No text found in Image."`` when OCR yields nothing.
        """
        # Open via a context manager so the underlying file handle is
        # released as soon as OCR has finished.
        with Image.open(image_path) as img:
            text = pytesseract.image_to_string(img).strip()
        return text if text else "No text found in Image."
50
+
51
+
52
+ #Factory to Select the Right Processor
53
class FileProcessorFactory:
    """Select a file processor from the extension of a file path."""

    # Lowercase extension (including the leading dot) -> processor instance.
    # The instances are created once, when the class body is executed.
    _processors = {
        ".pdf": PDFProcessor(),
        ".png": ImageProcessor(),
        ".jpg": ImageProcessor(),
        ".jpeg": ImageProcessor(),
    }

    @classmethod
    def get_processor(cls, file_path):
        """Return the processor registered for *file_path*'s extension.

        The extension is compared case-insensitively. ``None`` is returned
        when no processor is registered for it.
        """
        _, extension = os.path.splitext(file_path)
        return cls._processors.get(extension.lower())
67
+
68
+
69
+ #Unified File Reading Function
70
def read_file(file_path):
    """Extract text from *file_path* using the processor for its extension.

    Returns the processor's extracted text, or an
    ``"Unsupported file format: ..."`` message when the factory has no
    processor for the file.
    """
    handler = FileProcessorFactory.get_processor(file_path)
    if not handler:
        return f"Unsupported file format: {file_path}"
    return handler.extract_text(file_path)
77
+