Spaces:

Prashasst
/

Medical_Lab_Test_Extraction_Pipeline

Sleeping

App Files Files Community

Prashasst committed on Mar 18

Commit

6cd7c3b

verified ·

1 Parent(s): d258e41

PaddleOCR added

Browse files

Files changed (1) hide show

file_processing.py +24 -8

file_processing.py CHANGED Viewed

@@ -2,15 +2,22 @@ import os
 import fitz  # PyMuPDF
 import pytesseract
 import easyocr
 from pdf2image import convert_from_path
 from PIL import Image
 from abc import ABC, abstractmethod
 pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
 #Abstract Base Class (Interface)
 class FileProcessor(ABC):
     """Abstract class for file processing."""
@@ -30,7 +37,7 @@ class PDFProcessor(FileProcessor):
         for page_num in range(len(doc)):
             page = doc.load_page(page_num)
             page_text = page.get_text("text").strip()  # Extract text from page
-            print(f"page- {page_num} text : {page_text}") #DEBUG
             # Extract Images for OCR
             images = page.get_images(full=True)
             ocr_text = ""
@@ -38,11 +45,15 @@ class PDFProcessor(FileProcessor):
                 img_pages = convert_from_path(pdf_path, first_page=page_num + 1, last_page=page_num + 1)
                 for img in img_pages:
                     img = np.array(img)  #easy ocr expects np image
-                    text_ocr = reader.readtext(img, detail=0)  # I have initialized reader globally
                     if text_ocr:  # Ensure text_ocr is not empty
-                        ocr_text += " ".join(text_ocr).strip() + "\n"
                     # ocr_text += pytesseract.image_to_string(img).strip() + "\n"
-            print(f"page- {page_num} orc : {ocr_text}") #DEBUG
             # Combine both text extraction methods
             combined_text = f"{page_text}\n{ocr_text}".strip()
@@ -55,12 +66,15 @@ class PDFProcessor(FileProcessor):
 class ImageProcessor(FileProcessor):
     def extract_text(self, image_path,reader):
         print("Single Image")
-        # text = pytesseract.image_to_string(img).strip()
-        text = reader.readtext(image_path, detail=0)   #I have initilized reader globally already
         # print(text)
-        return " ".join(text) if text else "No text found in Image."
-        # return text if text else "No text found in Image."
 #Factory to Select the Right Processor
@@ -81,6 +95,7 @@ class FileProcessorFactory:
 #Unified File Reading Function
 def read_file(file_path,reader):
     processor = FileProcessorFactory.get_processor(file_path)
@@ -88,3 +103,4 @@ def read_file(file_path,reader):
         return processor.extract_text(file_path,reader)
     else:
         return f"Unsupported file format: {file_path}"

 import fitz  # PyMuPDF
 import pytesseract
 import easyocr
+import numpy as np
 from pdf2image import convert_from_path
 from PIL import Image
 from abc import ABC, abstractmethod
+from paddleocr import PaddleOCR
+from utils import measure_time
 pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
 #Abstract Base Class (Interface)
 class FileProcessor(ABC):
     """Abstract class for file processing."""
         for page_num in range(len(doc)):
             page = doc.load_page(page_num)
             page_text = page.get_text("text").strip()  # Extract text from page
+            # print(f"page- {page_num} text : {page_text}") #DEBUG
             # Extract Images for OCR
             images = page.get_images(full=True)
             ocr_text = ""
                 img_pages = convert_from_path(pdf_path, first_page=page_num + 1, last_page=page_num + 1)
                 for img in img_pages:
                     img = np.array(img)  #easy ocr expects np image
+                    # text_ocr = reader.readtext(img, detail=0)  # EasyOCR
+                    text_ocr = reader.ocr(img, cls=True)         # PaddleOCR
                     if text_ocr:  # Ensure text_ocr is not empty
+                        ocr_text +=  " ".join([line[1][0] for res in text_ocr for line in res]) + "\n"    # PaddleOCR
+                        # ocr_text += " ".join(text_ocr).strip() + "\n"                                   # EasyOCR
                     # ocr_text += pytesseract.image_to_string(img).strip() + "\n"
+            # print(f"page- {page_num} orc : {ocr_text}") #DEBUG
             # Combine both text extraction methods
             combined_text = f"{page_text}\n{ocr_text}".strip()
 class ImageProcessor(FileProcessor):
     def extract_text(self, image_path,reader):
         print("Single Image")
+        # img = Image.open(image_path)
+        # text = pytesseract.image_to_string(img).strip()    #Tesseract
+        # text = reader.readtext(image_path, detail=0)       #EasyOCR
+        text_ocr = reader.ocr(image_path, cls=True)          #PaddleOCR
         # print(text)
+        return " ".join([line[1][0] for res in text_ocr for line in res]) #PaddelOCR
+        # return " ".join(text) if text else "No text found in Image."    #EasyOCR
+        # return text if text else "No text found in Image."              #Tesseract
 #Factory to Select the Right Processor
 #Unified File Reading Function
+@measure_time
 def read_file(file_path,reader):
     processor = FileProcessorFactory.get_processor(file_path)
         return processor.extract_text(file_path,reader)
     else:
         return f"Unsupported file format: {file_path}"