Prashasst committed on
Commit
6cd7c3b
·
verified ·
1 Parent(s): d258e41

PaddleOCR added

Browse files
Files changed (1) hide show
  1. file_processing.py +24 -8
file_processing.py CHANGED
@@ -2,15 +2,22 @@ import os
2
  import fitz # PyMuPDF
3
  import pytesseract
4
  import easyocr
 
5
  from pdf2image import convert_from_path
6
  from PIL import Image
7
  from abc import ABC, abstractmethod
8
 
 
 
9
 
10
 
11
  pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
12
 
13
 
 
 
 
 
14
  #Abstract Base Class (Interface)
15
  class FileProcessor(ABC):
16
  """Abstract class for file processing."""
@@ -30,7 +37,7 @@ class PDFProcessor(FileProcessor):
30
  for page_num in range(len(doc)):
31
  page = doc.load_page(page_num)
32
  page_text = page.get_text("text").strip() # Extract text from page
33
- print(f"page- {page_num} text : {page_text}") #DEBUG
34
  # Extract Images for OCR
35
  images = page.get_images(full=True)
36
  ocr_text = ""
@@ -38,11 +45,15 @@ class PDFProcessor(FileProcessor):
38
  img_pages = convert_from_path(pdf_path, first_page=page_num + 1, last_page=page_num + 1)
39
  for img in img_pages:
40
  img = np.array(img) #easy ocr expects np image
41
- text_ocr = reader.readtext(img, detail=0) # I have initialized reader globally
 
 
42
  if text_ocr: # Ensure text_ocr is not empty
43
- ocr_text += " ".join(text_ocr).strip() + "\n"
 
 
44
  # ocr_text += pytesseract.image_to_string(img).strip() + "\n"
45
- print(f"page- {page_num} orc : {ocr_text}") #DEBUG
46
 
47
  # Combine both text extraction methods
48
  combined_text = f"{page_text}\n{ocr_text}".strip()
@@ -55,12 +66,15 @@ class PDFProcessor(FileProcessor):
55
  class ImageProcessor(FileProcessor):
56
  def extract_text(self, image_path,reader):
57
  print("Single Image")
 
58
 
59
- # text = pytesseract.image_to_string(img).strip()
60
- text = reader.readtext(image_path, detail=0) #I have initilized reader globally already
 
61
  # print(text)
62
- return " ".join(text) if text else "No text found in Image."
63
- # return text if text else "No text found in Image."
 
64
 
65
 
66
  #Factory to Select the Right Processor
@@ -81,6 +95,7 @@ class FileProcessorFactory:
81
 
82
 
83
  #Unified File Reading Function
 
84
  def read_file(file_path,reader):
85
  processor = FileProcessorFactory.get_processor(file_path)
86
 
@@ -88,3 +103,4 @@ def read_file(file_path,reader):
88
  return processor.extract_text(file_path,reader)
89
  else:
90
  return f"Unsupported file format: {file_path}"
 
 
2
  import fitz # PyMuPDF
3
  import pytesseract
4
  import easyocr
5
+ import numpy as np
6
  from pdf2image import convert_from_path
7
  from PIL import Image
8
  from abc import ABC, abstractmethod
9
 
10
+ from paddleocr import PaddleOCR
11
+ from utils import measure_time
12
 
13
 
14
  pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
15
 
16
 
17
+
18
+
19
+
20
+
21
  #Abstract Base Class (Interface)
22
  class FileProcessor(ABC):
23
  """Abstract class for file processing."""
 
37
  for page_num in range(len(doc)):
38
  page = doc.load_page(page_num)
39
  page_text = page.get_text("text").strip() # Extract text from page
40
+ # print(f"page- {page_num} text : {page_text}") #DEBUG
41
  # Extract Images for OCR
42
  images = page.get_images(full=True)
43
  ocr_text = ""
 
45
  img_pages = convert_from_path(pdf_path, first_page=page_num + 1, last_page=page_num + 1)
46
  for img in img_pages:
47
  img = np.array(img) #easy ocr expects np image
48
+ # text_ocr = reader.readtext(img, detail=0) # EasyOCR
49
+ text_ocr = reader.ocr(img, cls=True) # PaddleOCR
50
+
51
  if text_ocr: # Ensure text_ocr is not empty
52
+ ocr_text += " ".join([line[1][0] for res in text_ocr for line in res]) + "\n" # PaddleOCR
53
+ # ocr_text += " ".join(text_ocr).strip() + "\n" # EasyOCR
54
+
55
  # ocr_text += pytesseract.image_to_string(img).strip() + "\n"
56
+ # print(f"page- {page_num} orc : {ocr_text}") #DEBUG
57
 
58
  # Combine both text extraction methods
59
  combined_text = f"{page_text}\n{ocr_text}".strip()
 
66
  class ImageProcessor(FileProcessor):
67
  def extract_text(self, image_path,reader):
68
  print("Single Image")
69
+ # img = Image.open(image_path)
70
 
71
+ # text = pytesseract.image_to_string(img).strip() #Tesseract
72
+ # text = reader.readtext(image_path, detail=0) #EasyOCR
73
+ text_ocr = reader.ocr(image_path, cls=True) #PaddleOCR
74
  # print(text)
75
+ return " ".join([line[1][0] for res in text_ocr for line in res]) #PaddelOCR
76
+ # return " ".join(text) if text else "No text found in Image." #EasyOCR
77
+ # return text if text else "No text found in Image." #Tesseract
78
 
79
 
80
  #Factory to Select the Right Processor
 
95
 
96
 
97
  #Unified File Reading Function
98
+ @measure_time
99
  def read_file(file_path,reader):
100
  processor = FileProcessorFactory.get_processor(file_path)
101
 
 
103
  return processor.extract_text(file_path,reader)
104
  else:
105
  return f"Unsupported file format: {file_path}"
106
+