Spaces:

vamsidharmuthireddy
/

underwriting-workflow

Build error

App Files Files Community

underwriting-workflow / document_reader /utils /json_utils.py

vamsidharmuthireddy

Upload 90 files

48e7216 verified 18 days ago

raw

history blame contribute delete

3.43 kB



	import os
	import json

	from pydantic import BaseModel, Field, model_validator
	from typing import List
	import pandas as pd
	from utils.logger import setup_logger

	logger = setup_logger(__name__)


	def restructure_documents(original_data:dict):
	    """Regroup per-page extraction results by document category.

	    ``original_data`` is a two-level mapping: outer key -> inner key -> dict
	    of extracted fields (presumably PDF path -> page-image path -> fields;
	    confirm against the caller). Both keys are reduced to their base name
	    with ``os.path.basename``.

	    Returns a dict keyed by each page's ``document_category``. Every value
	    is a list with one ``{document_type: record}`` item per page, where
	    ``record`` holds ``uploaded_file_path``, ``uploaded_file_extracted_images``
	    (a one-element list) and then every original field of the page. A page
	    lacking ``document_category`` or ``document_type`` is filed under ``None``.
	    """
	    grouped = {}

	    for source_path, page_map in original_data.items():
	        source_name = os.path.basename(source_path)

	        for page_path, page_fields in page_map.items():
	            page_name = os.path.basename(page_path)
	            category = page_fields.get("document_category")
	            kind = page_fields.get("document_type")

	            record = {
	                "uploaded_file_path": source_name,
	                "uploaded_file_extracted_images": [page_name],
	            }
	            # Original fields are merged last, so a page that already carries
	            # one of the two keys above overrides the computed value.
	            record.update(page_fields)

	            # Fetch (or create) the list for this category, then file the
	            # record under its document type.
	            bucket = grouped.setdefault(category, [])
	            bucket.append({kind: record})

	    return grouped

	def extract_document_types_from_transformed(transformed_data):
	    """Map each category to the sorted, de-duplicated document types under it.

	    ``transformed_data`` maps a category to a list of single-entry style
	    dicts such as ``{'payslip': {...}}``; the keys of those dicts are the
	    document types that get collected. A category with an empty list maps
	    to an empty list.
	    """
	    return {
	        category: sorted(
	            {doc_type for wrapped in entries for doc_type in wrapped.keys()}
	        )
	        for category, entries in transformed_data.items()
	    }



	class DocumentTypeByCategory(BaseModel):
	    # Holds, per supported document category, the list of document type
	    # names found for it, plus a derived "was anything found" flag per
	    # category. Documented with comments rather than a class docstring so
	    # the model's generated schema description stays as it was.

	    # One list of document type names per category; empty by default.
	    bank_statement: List[str] = Field(default_factory=list)
	    income_document: List[str] = Field(default_factory=list)
	    identity_verification_document: List[str] = Field(default_factory=list)

	    # Presence flags, filled in by compute_valid_flags after validation and
	    # excluded from serialized output.
	    is_bank_statement_valid: bool = Field(default=False, exclude=True)
	    is_income_document_valid: bool = Field(default=False, exclude=True)
	    is_identity_verification_document_valid: bool = Field(default=False, exclude=True)

	    @model_validator(mode="after")
	    def compute_valid_flags(self):
	        """Set each ``is_<category>_valid`` flag to whether its list is non-empty."""
	        for category in (
	            "bank_statement",
	            "income_document",
	            "identity_verification_document",
	        ):
	            setattr(self, f"is_{category}_valid", bool(getattr(self, category)))
	        return self

	    def to_dataframe(self) -> pd.DataFrame:
	        """Return a three-row summary table, one row per document category.

	        Columns are ``document_category``, ``Uploaded`` and ``document_types``.
	        ``document_types`` is the comma-joined list, or ``"Missing"`` when the
	        list is empty. The index starts at 1. The frame is logged while
	        ``Uploaded`` still holds booleans, and only afterwards are those
	        booleans replaced by check / cross marks.
	        """
	        rows = []
	        for category in (
	            "bank_statement",
	            "income_document",
	            "identity_verification_document",
	        ):
	            type_names = getattr(self, category)
	            if type_names:
	                joined_types = ", ".join(type_names)
	            else:
	                joined_types = "Missing"
	            rows.append(
	                {
	                    "document_category": category,
	                    "Uploaded": getattr(self, f"is_{category}_valid"),
	                    "document_types": joined_types,
	                }
	            )

	        summary = pd.DataFrame(rows)
	        summary.index += 1  # 1-based row labels
	        logger.info(f"df: {summary}")

	        # Swap the boolean flags for display symbols after logging.
	        summary['Uploaded'] = summary['Uploaded'].apply(
	            lambda flag: '✅' if flag else '❌'
	        )

	        return summary