# vamsidharmuthireddy's picture
# Upload 90 files
# 48e7216 verified
import os
import json
from pydantic import BaseModel, Field, model_validator
from typing import List
import pandas as pd
from utils.logger import setup_logger
# Module-level logger, built by the project's setup_logger helper and keyed on this module's name.
logger = setup_logger(__name__)
def restructure_documents(original_data: dict):
    """Regroup per-page extraction results by document category.

    ``original_data`` maps an uploaded file path to a mapping of
    ``{page image path: extracted fields}``. Every page becomes one record
    holding the uploaded file's base name, a one-element list with the page
    image's base name, and all of the page's extracted fields.

    Each record is wrapped as ``{document_type: record}`` and appended to the
    list stored under its ``document_category``. Pages that lack either key
    are grouped under ``None``.

    Returns:
        A dict mapping each document category to a list of single-key dicts,
        in the order the pages were encountered.
    """
    grouped = {}
    for source_path, page_map in original_data.items():
        source_name = os.path.basename(source_path)
        for page_image_path, page_fields in page_map.items():
            page_image_name = os.path.basename(page_image_path)
            category = page_fields.get("document_category")
            doc_type = page_fields.get("document_type")

            record = {
                "uploaded_file_path": source_name,
                "uploaded_file_extracted_images": [page_image_name],
            }
            # The extracted fields are merged last, so on a key collision they
            # take precedence over the two path entries above.
            record.update(page_fields)

            if category not in grouped:
                grouped[category] = []
            grouped[category].append({doc_type: record})
    return grouped
def extract_document_types_from_transformed(transformed_data):
    """Collect the distinct document types present in each category.

    ``transformed_data`` maps a category to a list of dicts whose keys are
    document types (for example ``{'payslip': {...}}``). The result maps each
    category to the sorted list of unique keys seen across its dicts; a
    category with no entries maps to an empty list.
    """
    return {
        category: sorted(
            {doc_type for wrapped in entries for doc_type in wrapped.keys()}
        )
        for category, entries in transformed_data.items()
    }
class DocumentTypeByCategory(BaseModel):
    """Document types detected per document category, with upload flags.

    Each category field holds the list of document-type names found for that
    category. After validation, one boolean flag per category records whether
    that list is non-empty. The flags are declared with ``exclude=True`` so
    they are left out of the model's serialized output.
    """

    bank_statement: List[str] = Field(default_factory=list)
    income_document: List[str] = Field(default_factory=list)
    identity_verification_document: List[str] = Field(default_factory=list)

    # Computed flags: always overwritten by compute_valid_flags, so any value
    # supplied by the caller is ignored.
    is_bank_statement_valid: bool = Field(default=False, exclude=True)
    is_income_document_valid: bool = Field(default=False, exclude=True)
    is_identity_verification_document_valid: bool = Field(default=False, exclude=True)

    @model_validator(mode="after")
    def compute_valid_flags(self):
        """Set each ``is_<category>_valid`` flag to whether its list is non-empty."""
        self.is_bank_statement_valid = bool(self.bank_statement)
        self.is_income_document_valid = bool(self.income_document)
        self.is_identity_verification_document_valid = bool(
            self.identity_verification_document
        )
        return self

    def to_dataframe(self) -> pd.DataFrame:
        """Summarise the categories as a three-row, 1-indexed DataFrame.

        Columns:
            document_category: the category field name.
            Uploaded: a check-mark glyph when the category's flag is true,
                a cross-mark glyph otherwise.
            document_types: the category's document types joined with
                ``", "``, or ``"Missing"`` when the list is empty.

        The frame is logged while ``Uploaded`` still holds booleans, before
        the glyphs are substituted.
        """
        categories = (
            "bank_statement",
            "income_document",
            "identity_verification_document",
        )
        rows = []
        for category in categories:
            doc_types = getattr(self, category)
            rows.append(
                {
                    "document_category": category,
                    "Uploaded": getattr(self, f"is_{category}_valid"),
                    "document_types": ", ".join(doc_types) if doc_types else "Missing",
                }
            )

        data_df = pd.DataFrame(rows)
        data_df.index += 1  # number rows from 1 instead of 0
        logger.info(f"df: {data_df}")

        # Escape sequences keep the glyphs intact regardless of how this file
        # is re-encoded: U+2705 is the white heavy check mark, U+274C the
        # cross mark.
        data_df['Uploaded'] = data_df['Uploaded'].apply(
            lambda uploaded: "\u2705" if uploaded else "\u274c"
        )
        return data_df