Spaces:

Prashasst
/

Medical_Lab_Test_Extraction_Pipeline

Sleeping

File size: 3,556 Bytes

import os
import fitz  # PyMuPDF
import pytesseract
import easyocr
import numpy as np
from pdf2image import convert_from_path
from PIL import Image
from abc import ABC, abstractmethod

from paddleocr import PaddleOCR
from utils import measure_time


# Tell pytesseract where the Tesseract executable lives. The path is
# hard-coded to a typical Linux install location, so it must be adjusted on
# hosts where the binary is elsewhere.
# NOTE(review): every pytesseract call in this section is commented out; this
# setting only matters if the Tesseract code path is re-enabled.
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"






#Abstract Base Class (Interface)
class FileProcessor(ABC):
    """Abstract interface implemented by every file-type specific processor."""

    @abstractmethod
    def extract_text(self, file_path, reader=None):
        """Extract and return the text content of ``file_path``.

        Args:
            file_path: Path of the file to read.
            reader: OCR engine instance used by implementations that need
                OCR. It defaults to ``None`` so that the interface matches
                the two-argument call made by implementers and callers while
                remaining compatible with one-argument usage.

        Returns:
            The extracted text as a string (implementations decide what to
            return when nothing could be extracted).
        """


#PDF Processor (Handles Text + OCR for Scanned PDFs)
class PDFProcessor(FileProcessor):
    """Extract text from PDFs by combining the text layer with OCR output."""

    def extract_text(self, pdf_path, reader):
        """Return the text of every page of ``pdf_path``.

        For each page the embedded text layer is read with PyMuPDF. If the
        page contains at least one image, the page is additionally rendered
        and passed through OCR, and both results are concatenated.

        Args:
            pdf_path: Path to the PDF file.
            reader: OCR engine exposing ``ocr(image, cls=True)`` and returning
                PaddleOCR-style nested results (``line[1][0]`` is the text).

        Returns:
            The page texts separated by blank lines, or
            ``"No text found in PDF."`` when nothing could be extracted.
        """
        page_chunks = []

        # The context manager releases the document handle even if OCR raises.
        with fitz.open(pdf_path) as doc:
            for page_num, page in enumerate(doc):
                page_text = page.get_text("text").strip()

                # Only pay the rendering + OCR cost for pages that embed images.
                ocr_text = ""
                if page.get_images(full=True):
                    ocr_text = self._ocr_page(pdf_path, page_num, reader)

                page_chunks.append(f"{page_text}\n{ocr_text}".strip())

        # Strip before testing for emptiness: page separators alone must not
        # count as extracted text, otherwise the fallback is never returned.
        text = "\n\n".join(page_chunks).strip()
        return text if text else "No text found in PDF."

    @staticmethod
    def _ocr_page(pdf_path, page_num, reader):
        """Render the 0-based page ``page_num`` and return its OCR text."""
        ocr_chunks = []

        # pdf2image uses 1-based page numbers.
        rendered = convert_from_path(
            pdf_path, first_page=page_num + 1, last_page=page_num + 1
        )
        for img in rendered:
            # The OCR engine is given a numpy array rather than a PIL image.
            text_ocr = reader.ocr(np.array(img), cls=True)
            if not text_ocr:
                continue
            # A per-image result may be None when nothing was detected;
            # skip those entries instead of iterating over None.
            words = [line[1][0] for res in text_ocr if res for line in res]
            ocr_chunks.append(" ".join(words) + "\n")

        return "".join(ocr_chunks)


#Image Processor (OCR)
class ImageProcessor(FileProcessor):
    """Extract text from a single image file via OCR."""

    def extract_text(self, image_path, reader):
        """Run OCR on ``image_path`` and return the recognized text.

        Args:
            image_path: Path to the image file.
            reader: OCR engine exposing ``ocr(image, cls=True)`` and returning
                PaddleOCR-style nested results (``line[1][0]`` is the text).

        Returns:
            The recognized text fragments joined by single spaces, or
            ``"No text found in Image."`` when nothing was recognized.
        """
        print("Single Image")
        text_ocr = reader.ocr(image_path, cls=True)

        # The overall result or a per-image entry may be None/empty when no
        # text is detected; skip those instead of iterating over None.
        words = [line[1][0] for res in (text_ocr or []) if res for line in res]
        return " ".join(words) if words else "No text found in Image."


#Factory to Select the Right Processor
class FileProcessorFactory:
    """Look up the processor responsible for a file, keyed by its extension."""

    # Lower-cased extension (with leading dot) -> processor instance.
    # Each image extension gets its own ImageProcessor instance.
    _processors = {
        ".pdf": PDFProcessor(),
        **{image_ext: ImageProcessor() for image_ext in (".png", ".jpg", ".jpeg")},
    }

    @classmethod
    def get_processor(cls, file_path):
        """Return the processor for ``file_path``, or ``None`` if unsupported.

        The extension is matched case-insensitively.
        """
        _, suffix = os.path.splitext(file_path)
        return cls._processors.get(suffix.lower())


#Unified File Reading Function
@measure_time
def read_file(file_path, reader):
    """Extract text from ``file_path`` using the processor for its extension.

    Args:
        file_path: Path of the file to read.
        reader: OCR engine forwarded to the selected processor.

    Returns:
        The text produced by the processor, or an
        ``"Unsupported file format: ..."`` message when no processor is
        registered for the file's extension.
    """
    handler = FileProcessorFactory.get_processor(file_path)

    # Guard clause: bail out early when the extension has no processor.
    if not handler:
        return f"Unsupported file format: {file_path}"

    return handler.extract_text(file_path, reader)