Spaces:

ignaciaginting
/

extract_from_doc

Build error

File size: 3,598 Bytes

230c9a6

import os
from pdf_extract_kit.utils.data_preprocess import load_pdf


class BaseTask:
    """Base class for tasks that wrap a model and load image or PDF inputs."""

    # File-name suffixes (matched case-insensitively) accepted by each loader.
    _IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg')
    _PDF_EXTENSIONS = ('.pdf',)

    def __init__(self, model):
        """Store the model object this task operates with.

        Args:
            model: Model instance kept on ``self.model``; it is not used by
                the loading helpers defined in this class.
        """
        self.model = model

    @staticmethod
    def _collect_files(input_data, extensions):
        """Resolve ``input_data`` to a sorted list of matching file paths.

        Args:
            input_data (str | os.PathLike): Path to a single file or to a flat
                directory (no sub-directories allowed).
            extensions (tuple[str, ...]): Lower-case suffixes to accept.

        Returns:
            list[str]: Matching paths. For a directory these are the matching
            top-level files sorted by path; for a single file, a one-element
            list.

        Raises:
            ValueError: If the directory contains nested directories, or if a
                non-directory path does not end with one of ``extensions``.
        """
        # Normalise PathLike objects so the string methods below work for both
        # the directory and the single-file case.
        path = os.fspath(input_data)

        if os.path.isdir(path):
            # Only the first os.walk() entry (the top-level directory) is
            # needed; it is missing when the directory cannot be listed.
            top_level = next(os.walk(path), None)
            if top_level is None:
                return []
            root, dirs, files = top_level
            if dirs:
                raise ValueError("Input directory should not contain nested directories: {}".format(input_data))
            # os.walk() yields names in arbitrary order, so sort to make the
            # result deterministic.
            return sorted(
                os.path.join(root, name)
                for name in files
                if name.lower().endswith(extensions)
            )

        if path.lower().endswith(extensions):
            return [path]
        raise ValueError("Unsupported input data format: {}".format(input_data))

    def load_images(self, input_data):
        """Collect image paths from a single image file or a flat directory.

        Args:
            input_data (str | os.PathLike): Path to a single image file or to
                a directory containing image files (``.png``, ``.jpg``,
                ``.jpeg``; case-insensitive).

        Returns:
            list: Sorted list of paths to all images to be predicted.

        Raises:
            ValueError: If the directory contains nested directories or the
                single path is not a supported image file.
        """
        return self._collect_files(input_data, self._IMAGE_EXTENSIONS)

    def load_pdf_images(self, input_data):
        """Load page images from a single PDF file or a flat directory of PDFs.

        Args:
            input_data (str | os.PathLike): Path to a single PDF file or to a
                directory containing PDF files.

        Returns:
            dict: Maps image IDs of the form ``<pdf stem>_page_<NNNN>``
            (1-based, zero-padded page number) to the page objects returned by
            ``load_pdf`` (documented upstream as PIL.Image objects). Entries
            are inserted in sorted PDF-path order, then page order.

            Note: Loading multiple PDFs at once is not recommended due to high
            memory consumption. Consider processing one PDF at a time
            externally using loops or multithreading.

        Raises:
            ValueError: If the directory contains nested directories or the
                single path is not a PDF file.
        """
        pdf_images = {}
        for pdf_path in self._collect_files(input_data, self._PDF_EXTENSIONS):
            # IDs are built from the file stem only, so PDFs whose names differ
            # just by extension case would share IDs.
            stem = os.path.splitext(os.path.basename(pdf_path))[0]
            for page_number, page in enumerate(load_pdf(pdf_path), start=1):
                pdf_images[f"{stem}_page_{page_number:04d}"] = page
        return pdf_images