Spaces:
Build error
Build error
File size: 3,433 Bytes
48e7216 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 |
import os
import json
from pydantic import BaseModel, Field, model_validator
from typing import List
import pandas as pd
from utils.logger import setup_logger
# Module-level logger, obtained from the project's setup_logger helper with
# this module's __name__; DocumentTypeByCategory.to_dataframe logs through it.
logger = setup_logger(__name__)
def restructure_documents(original_data: dict):
    """Regroup per-page extraction results by document category.

    ``original_data`` maps a PDF path to a mapping of page-image path ->
    page fields.  Each page yields one ``{document_type: entry}`` dict that
    is appended to the list stored under the page's ``document_category``.

    ``entry`` starts with the base names of the PDF
    (``uploaded_file_path``) and of the page image
    (``uploaded_file_extracted_images``, a one-element list), followed by
    every field of the page itself; page fields win on a key clash.

    Pages without a ``document_category`` / ``document_type`` field are
    filed under the key ``None``.
    """
    grouped = {}
    for source_pdf, page_map in original_data.items():
        pdf_name = os.path.basename(source_pdf)
        for page_image, fields in page_map.items():
            category = fields.get("document_category")
            doc_type = fields.get("document_type")

            record = dict(
                uploaded_file_path=pdf_name,
                uploaded_file_extracted_images=[os.path.basename(page_image)],
            )
            # Merged last so the page's own fields (including document_type
            # and document_category) are all carried over and take priority.
            record.update(fields)

            if category not in grouped:
                grouped[category] = []
            grouped[category].append({doc_type: record})
    return grouped
def extract_document_types_from_transformed(transformed_data):
    """Return the distinct document types present in each category.

    ``transformed_data`` maps a category name to a list of single-purpose
    dicts shaped like ``{"payslip": {...}}``.  The result maps every
    category to the sorted list of the keys found across its dicts, with
    duplicates removed; a category with no items maps to ``[]``.
    """
    return {
        # Iterating a dict yields its keys, so the union over all items is
        # exactly the set of document types seen in this category.
        category: sorted(set().union(*docs))
        for category, docs in transformed_data.items()
    }
class DocumentTypeByCategory(BaseModel):
    """Document types received for each of the three document categories.

    Each category field holds the list of document-type names seen for that
    category (empty by default).  A matching ``is_<category>_valid`` flag is
    derived from it after validation, and ``to_dataframe`` renders the
    whole thing as a three-row summary table.
    """

    bank_statement: List[str] = Field(default_factory=list)
    income_document: List[str] = Field(default_factory=list)
    identity_verification_document: List[str] = Field(default_factory=list)

    # Computed flags: overwritten by compute_valid_flags on every validation
    # and declared with exclude=True.
    is_bank_statement_valid: bool = Field(default=False, exclude=True)
    is_income_document_valid: bool = Field(default=False, exclude=True)
    is_identity_verification_document_valid: bool = Field(default=False, exclude=True)

    @model_validator(mode="after")
    def compute_valid_flags(self):
        """Set every ``is_*_valid`` flag to whether its category list is non-empty."""
        self.is_bank_statement_valid = bool(self.bank_statement)
        self.is_income_document_valid = bool(self.income_document)
        self.is_identity_verification_document_valid = bool(
            self.identity_verification_document
        )
        return self

    def to_dataframe(self) -> pd.DataFrame:
        """Build a one-row-per-category summary table.

        Columns:
            document_category: the category field name.
            Uploaded: a check-mark glyph when the category's validity flag
                is true, a cross-mark glyph otherwise.
            document_types: the category's document types joined with
                ``", "``, or ``"Missing"`` when the list is empty.

        The index is 1-based.  The frame is logged before the boolean
        ``Uploaded`` values are replaced by glyphs.
        """
        categories = (
            "bank_statement",
            "income_document",
            "identity_verification_document",
        )
        rows = []
        for category in categories:
            doc_types = getattr(self, category)
            rows.append(
                {
                    "document_category": category,
                    "Uploaded": getattr(self, f"is_{category}_valid"),
                    "document_types": ", ".join(doc_types) if doc_types else "Missing",
                }
            )

        data_df = pd.DataFrame(rows)
        data_df.index += 1  # number the rows from 1 instead of 0
        logger.info(f"df: {data_df}")

        # The glyph literals had been corrupted by a wrong-codepage decode,
        # which also split the string across two lines.  They are written
        # as escapes so the source stays ASCII-only.
        # NOTE(review): glyphs reconstructed from the mojibake -- confirm
        # the cross mark is the one originally intended.
        uploaded_glyph = "\u2705"  # WHITE HEAVY CHECK MARK
        missing_glyph = "\u274c"  # CROSS MARK
        data_df['Uploaded'] = data_df['Uploaded'].apply(
            lambda flag: uploaded_glyph if flag else missing_glyph
        )
        return data_df
|