File size: 3,433 Bytes
48e7216
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97


import os
import json

from pydantic import BaseModel, Field, model_validator
from typing import List
import pandas as pd
from utils.logger import setup_logger

logger  = setup_logger(__name__)


def restructure_documents(original_data: dict):
    """Reshape {pdf_path: {image_path: data}} into {category: [{doc_type: entry}]}.

    Each page's ``data`` dict is flattened into an ``entry`` that records the
    basenames of the source PDF and extracted image; the entry is wrapped under
    its ``document_type`` and appended to the list for its ``document_category``.
    Note: keys already present in ``data`` override the two injected keys.
    """
    grouped = {}

    for pdf_path, pages in original_data.items():
        pdf_name = os.path.basename(pdf_path)

        for image_path, page_data in pages.items():
            entry = {
                "uploaded_file_path": pdf_name,
                "uploaded_file_extracted_images": [os.path.basename(image_path)],
            }
            # Merge in every original field (document_type / document_category
            # included); original values win on key collisions.
            entry.update(page_data)

            category = page_data.get("document_category")
            doc_type = page_data.get("document_type")

            # File the {doc_type: entry} wrapper under its category bucket.
            grouped.setdefault(category, []).append({doc_type: entry})

    return grouped

def extract_document_types_from_transformed(transformed_data):
    """Map each document category to the sorted, de-duplicated list of
    document types found in its entries.

    ``transformed_data`` has the shape produced by ``restructure_documents``:
    {category: [{doc_type: {...}}, ...]}. Each item's keys are the doc types.
    """
    return {
        category: sorted({doc_type for item in docs for doc_type in item})
        for category, docs in transformed_data.items()
    }



class DocumentTypeByCategory(BaseModel):
    """Document types uploaded per category, with per-category validity flags.

    The three list fields hold the document types found for each category.
    The boolean flags are derived after validation (non-empty list => valid)
    and are excluded from serialization.
    """

    bank_statement: List[str] = Field(default_factory=list)
    income_document: List[str] = Field(default_factory=list)
    identity_verification_document: List[str] = Field(default_factory=list)

    # Computed flags (derived below; excluded from model dumps)
    is_bank_statement_valid: bool = Field(default=False, exclude=True)
    is_income_document_valid: bool = Field(default=False, exclude=True)
    is_identity_verification_document_valid: bool = Field(default=False, exclude=True)

    @model_validator(mode="after")
    def compute_valid_flags(self):
        """Set each validity flag to True iff its category list is non-empty."""
        self.is_bank_statement_valid = bool(self.bank_statement)
        self.is_income_document_valid = bool(self.income_document)
        self.is_identity_verification_document_valid = bool(self.identity_verification_document)
        return self

    def to_dataframe(self) -> pd.DataFrame:
        """Return a 1-indexed DataFrame summarising upload status per category.

        Columns: ``document_category``, ``Uploaded`` (βœ…/❌), and
        ``document_types`` (comma-joined types, or "Missing" when empty).
        """
        # One (category, types, flag) triple per row keeps the three rows DRY.
        rows = [
            ("bank_statement", self.bank_statement, self.is_bank_statement_valid),
            ("income_document", self.income_document, self.is_income_document_valid),
            ("identity_verification_document",
             self.identity_verification_document,
             self.is_identity_verification_document_valid),
        ]
        data = [
            {
                "document_category": category,
                "Uploaded": uploaded,
                "document_types": ", ".join(types) if types else "Missing",
            }
            for category, types, uploaded in rows
        ]

        data_df = pd.DataFrame(data)
        data_df.index += 1  # 1-based row numbers for display
        # Lazy %-formatting: the DataFrame is only rendered if INFO is enabled.
        logger.info("df: %s", data_df)
        data_df['Uploaded'] = data_df['Uploaded'].apply(lambda x: 'βœ…' if x else '❌')

        return data_df