Prashasst committed on
Commit
6977359
·
verified ·
1 Parent(s): ffd74e7

Update file_processing.py

Browse files
Files changed (1) hide show
  1. file_processing.py +20 -12
file_processing.py CHANGED
@@ -1,6 +1,7 @@
1
  import os
2
  import fitz # PyMuPDF
3
  import pytesseract
 
4
  from pdf2image import convert_from_path
5
  from PIL import Image
6
  from abc import ABC, abstractmethod
@@ -10,7 +11,6 @@ from abc import ABC, abstractmethod
10
  pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
11
 
12
 
13
-
14
  #Abstract Base Class (Interface)
15
  class FileProcessor(ABC):
16
  """Abstract class for file processing."""
@@ -23,21 +23,26 @@ class FileProcessor(ABC):
23
 
24
  #PDF Processor (Handles Text + OCR for Scanned PDFs)
25
  class PDFProcessor(FileProcessor):
26
- def extract_text(self, pdf_path):
27
  text = ""
28
  doc = fitz.open(pdf_path)
29
 
30
  for page_num in range(len(doc)):
31
  page = doc.load_page(page_num)
32
  page_text = page.get_text("text").strip() # Extract text from page
33
-
34
  # Extract Images for OCR
35
  images = page.get_images(full=True)
36
  ocr_text = ""
37
  if images:
38
  img_pages = convert_from_path(pdf_path, first_page=page_num + 1, last_page=page_num + 1)
39
  for img in img_pages:
40
- ocr_text += pytesseract.image_to_string(img).strip() + "\n"
 
 
 
 
 
41
 
42
  # Combine both text extraction methods
43
  combined_text = f"{page_text}\n{ocr_text}".strip()
@@ -48,10 +53,14 @@ class PDFProcessor(FileProcessor):
48
 
49
  #Image Processor (OCR)
50
  class ImageProcessor(FileProcessor):
51
- def extract_text(self, image_path):
52
- img = Image.open(image_path)
53
- text = pytesseract.image_to_string(img).strip()
54
- return text if text else "No text found in Image."
 
 
 
 
55
 
56
 
57
  #Factory to Select the Right Processor
@@ -72,11 +81,10 @@ class FileProcessorFactory:
72
 
73
 
74
  #Unified File Reading Function
75
- def read_file(file_path):
76
  processor = FileProcessorFactory.get_processor(file_path)
77
-
78
  if processor:
79
- return processor.extract_text(file_path)
80
  else:
81
  return f"Unsupported file format: {file_path}"
82
-
 
1
  import os
2
  import fitz # PyMuPDF
3
  import pytesseract
4
+ import easyocr
5
  from pdf2image import convert_from_path
6
  from PIL import Image
7
  from abc import ABC, abstractmethod
 
11
  pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
12
 
13
 
 
14
  #Abstract Base Class (Interface)
15
  class FileProcessor(ABC):
16
  """Abstract class for file processing."""
 
23
 
24
  #PDF Processor (Handles Text + OCR for Scanned PDFs)
25
  class PDFProcessor(FileProcessor):
26
+ def extract_text(self, pdf_path,reader):
27
  text = ""
28
  doc = fitz.open(pdf_path)
29
 
30
  for page_num in range(len(doc)):
31
  page = doc.load_page(page_num)
32
  page_text = page.get_text("text").strip() # Extract text from page
33
+ print(f"page- {page_num} text : {page_text}") #DEBUG
34
  # Extract Images for OCR
35
  images = page.get_images(full=True)
36
  ocr_text = ""
37
  if images:
38
  img_pages = convert_from_path(pdf_path, first_page=page_num + 1, last_page=page_num + 1)
39
  for img in img_pages:
40
+ img = np.array(img) # EasyOCR expects a NumPy array, not a PIL image
41
+ text_ocr = reader.readtext(img, detail=0) # I have initialized reader globally
42
+ if text_ocr: # Ensure text_ocr is not empty
43
+ ocr_text += " ".join(text_ocr).strip() + "\n"
44
+ # ocr_text += pytesseract.image_to_string(img).strip() + "\n"
45
+ print(f"page- {page_num} orc : {ocr_text}") #DEBUG
46
 
47
  # Combine both text extraction methods
48
  combined_text = f"{page_text}\n{ocr_text}".strip()
 
53
 
54
  #Image Processor (OCR)
55
  class ImageProcessor(FileProcessor):
56
+ def extract_text(self, image_path,reader):
57
+ print("Single Image")
58
+
59
+ # text = pytesseract.image_to_string(img).strip()
60
+ text = reader.readtext(image_path, detail=0) # I have initialized reader globally already
61
+ # print(text)
62
+ return " ".join(text) if text else "No text found in Image."
63
+ # return text if text else "No text found in Image."
64
 
65
 
66
  #Factory to Select the Right Processor
 
81
 
82
 
83
  #Unified File Reading Function
84
+ def read_file(file_path,reader):
85
  processor = FileProcessorFactory.get_processor(file_path)
86
+
87
  if processor:
88
+ return processor.extract_text(file_path,reader)
89
  else:
90
  return f"Unsupported file format: {file_path}"