"""Helpers for regrouping extracted document data by category and type."""

import json
import os
from typing import List

import pandas as pd
from pydantic import BaseModel, Field, model_validator

from utils.logger import setup_logger

logger = setup_logger(__name__)

# Category names that DocumentTypeByCategory exposes as list fields, in the
# row order used by DocumentTypeByCategory.to_dataframe().
_DOCUMENT_CATEGORIES = (
    "bank_statement",
    "income_document",
    "identity_verification_document",
)


def restructure_documents(original_data: dict) -> dict:
    """Regroup per-page extraction results by document category.

    Args:
        original_data: Mapping of PDF path -> mapping of image path -> dict
            of extracted fields. The ``document_category`` and
            ``document_type`` fields are read with ``.get()``, so a missing
            field yields ``None`` as the grouping key.

    Returns:
        Mapping of document category -> list of single-key dicts of the form
        ``{document_type: entry}``. One entry is produced per image; each
        entry holds the PDF's base name, a one-element list with the image's
        base name, and every field of the original page dict.
    """
    result = {}
    for pdf_path, pages in original_data.items():
        file_name = os.path.basename(pdf_path)
        for image_path, data in pages.items():
            entry = {
                "uploaded_file_path": file_name,
                "uploaded_file_extracted_images": [os.path.basename(image_path)],
                # Unpacked last: the page's own fields (including
                # document_type / document_category) win on key collisions.
                **data,
            }
            category = data.get("document_category")
            doc_type = data.get("document_type")
            result.setdefault(category, []).append({doc_type: entry})
    return result


def extract_document_types_from_transformed(transformed_data: dict) -> dict:
    """Collect the distinct document types present in each category.

    Args:
        transformed_data: Mapping of category -> list of dicts keyed by
            document type (each item looks like ``{'payslip': {...}}``).

    Returns:
        Mapping of category -> sorted list of the unique document types
        found among that category's items.
    """
    return {
        category: sorted({doc_type for item in docs for doc_type in item})
        for category, docs in transformed_data.items()
    }


class DocumentTypeByCategory(BaseModel):
    """Document types grouped by category, with per-category presence flags.

    Each category field lists the document types found for that category.
    The ``is_*_valid`` flags are set by the after-validator to whether the
    matching list is non-empty; they are declared with ``exclude=True``.
    """

    bank_statement: List[str] = Field(default_factory=list)
    income_document: List[str] = Field(default_factory=list)
    identity_verification_document: List[str] = Field(default_factory=list)

    # Computed flags
    is_bank_statement_valid: bool = Field(default=False, exclude=True)
    is_income_document_valid: bool = Field(default=False, exclude=True)
    is_identity_verification_document_valid: bool = Field(default=False, exclude=True)

    @model_validator(mode="after")
    def compute_valid_flags(self):
        """Set each ``is_*_valid`` flag to whether its category list is non-empty."""
        self.is_bank_statement_valid = bool(self.bank_statement)
        self.is_income_document_valid = bool(self.income_document)
        self.is_identity_verification_document_valid = bool(
            self.identity_verification_document
        )
        return self

    def to_dataframe(self) -> pd.DataFrame:
        """Summarise the categories as a three-row DataFrame.

        Returns:
            DataFrame indexed from 1 with columns ``document_category``,
            ``Uploaded`` ('✅' when the category has at least one document
            type, otherwise '❌') and ``document_types`` (comma-separated
            types, or "Missing" when the category is empty).
        """
        rows = []
        for category in _DOCUMENT_CATEGORIES:
            doc_types = getattr(self, category)
            rows.append(
                {
                    "document_category": category,
                    # Derived from the list itself rather than the stored
                    # is_*_valid flag: the flags are only refreshed by the
                    # validator, so reading the list keeps "Uploaded" and
                    # "document_types" consistent even if the list was
                    # changed after the model was built.
                    "Uploaded": bool(doc_types),
                    "document_types": ", ".join(doc_types) if doc_types else "Missing",
                }
            )

        data_df = pd.DataFrame(rows)
        data_df.index += 1
        # Logged while "Uploaded" still holds booleans, before the emoji mapping.
        logger.info(f"df: {data_df}")
        data_df['Uploaded'] = data_df['Uploaded'].apply(lambda x: '✅' if x else '❌')
        return data_df