# Spaces: Build error  (status banner text captured alongside the code; not part of the program)
import os | |
import json | |
from pydantic import BaseModel, Field, model_validator | |
from typing import List | |
import pandas as pd | |
from utils.logger import setup_logger | |
logger = setup_logger(__name__) | |
def restructure_documents(original_data: dict):
    """Regroup per-page extraction results by document category.

    ``original_data`` maps a PDF path to a mapping of page-image path ->
    extracted-field dict.  The result maps each ``document_category`` value
    to a list of single-key dicts ``{document_type: record}``, one per page.
    Each record starts with the PDF's base name and a one-element list holding
    the page image's base name, followed by every field of the page dict
    (page fields win on a key clash).  A page without ``document_category`` or
    ``document_type`` is filed under the key ``None``.
    """
    grouped = {}
    for source_path, page_map in original_data.items():
        source_name = os.path.basename(source_path)
        for page_path, page_fields in page_map.items():
            page_name = os.path.basename(page_path)
            category = page_fields.get("document_category")
            doc_type = page_fields.get("document_type")

            # Upload metadata goes first; the page's own fields are layered
            # on top so they override on any key collision.
            record = dict(
                uploaded_file_path=source_name,
                uploaded_file_extracted_images=[page_name],
            )
            record.update(page_fields)

            # File the record, keyed by its document type, under its category.
            if category not in grouped:
                grouped[category] = []
            grouped[category].append({doc_type: record})
    return grouped
def extract_document_types_from_transformed(transformed_data):
    """Return the sorted, de-duplicated document types found in each category.

    ``transformed_data`` maps a category to a list of dicts whose keys are
    document types (each item looks like ``{'payslip': {...}}``).  The result
    maps every category to a sorted list of the distinct keys seen across its
    items; a category with no items maps to an empty list.
    """
    return {
        category: sorted({doc_type for entry in entries for doc_type in entry.keys()})
        for category, entries in transformed_data.items()
    }
class DocumentTypeByCategory(BaseModel):
    """Document types detected for each supported document category.

    Each list field holds the document-type names found for that category.
    The ``is_*_valid`` flags record whether the matching list is non-empty;
    they are derived values and are excluded from serialization.
    """

    bank_statement: List[str] = Field(default_factory=list)
    income_document: List[str] = Field(default_factory=list)
    identity_verification_document: List[str] = Field(default_factory=list)

    # Computed flags
    is_bank_statement_valid: bool = Field(default=False, exclude=True)
    is_income_document_valid: bool = Field(default=False, exclude=True)
    is_identity_verification_document_valid: bool = Field(default=False, exclude=True)

    @model_validator(mode="after")
    def compute_valid_flags(self):
        """Set each ``is_*_valid`` flag to whether its category list is non-empty.

        Registered as an ``after`` validator so the flags are populated as soon
        as the model is validated; it can still be called directly to refresh
        them.  Returns ``self``.
        """
        self.is_bank_statement_valid = bool(self.bank_statement)
        self.is_income_document_valid = bool(self.income_document)
        self.is_identity_verification_document_valid = bool(self.identity_verification_document)
        return self

    def to_dataframe(self) -> pd.DataFrame:
        """Build a three-row summary table, one row per document category.

        Columns: ``document_category``; ``Uploaded`` (a check-mark or cross
        glyph); ``document_types`` (the category's types joined with ", ", or
        "Missing" when there are none).  Row labels start at 1.
        """
        # Refresh the flags first: they would be stale if a list was changed
        # after construction or if validation was bypassed.
        self.compute_valid_flags()

        categories = (
            ("bank_statement", self.bank_statement, self.is_bank_statement_valid),
            ("income_document", self.income_document, self.is_income_document_valid),
            (
                "identity_verification_document",
                self.identity_verification_document,
                self.is_identity_verification_document_valid,
            ),
        )
        rows = [
            {
                "document_category": category,
                "Uploaded": uploaded,
                "document_types": ", ".join(doc_types) if doc_types else "Missing",
            }
            for category, doc_types, uploaded in categories
        ]

        data_df = pd.DataFrame(rows)
        data_df.index += 1  # 1-based row labels
        logger.info(f"df: {data_df}")
        # U+2705 (check mark) / U+274C (cross mark), written as escapes so the
        # glyphs survive a mis-decoded copy of this file.
        data_df["Uploaded"] = data_df["Uploaded"].apply(
            lambda uploaded: "\u2705" if uploaded else "\u274c"
        )
        return data_df