from typing import Any, Dict, List, Optional, Tuple

import schemas
from utils.logger import setup_logger

logger = setup_logger(__name__)

# Document types recognised by the grouping step; these are also the keys of
# the dict returned by group_documents_by_type().
_DOCUMENT_TYPES = ("payslip", "bank_statement", "passport", "driving_license")


def group_documents_by_type(
    obj: Any, result: Optional[Dict[str, List[Dict[str, Any]]]] = None
) -> Dict[str, List[Dict[str, Any]]]:
    """Recursively collect documents from an arbitrarily nested structure.

    Walks dicts and lists depth-first; any dict whose ``document_type`` value
    is one of the known types is appended to the matching bucket.  The same
    ``result`` dict is threaded through the recursion; the ``None`` default
    (instead of a shared mutable default) gives each top-level call a fresh
    accumulator.

    Args:
        obj: Arbitrarily nested combination of dicts/lists holding documents.
        result: Internal recursion accumulator; callers should leave it None.

    Returns:
        Mapping of document type -> list of matching document dicts.
    """
    if result is None:
        result = {doc_type: [] for doc_type in _DOCUMENT_TYPES}

    if isinstance(obj, dict):
        if obj.get("document_type") in result:
            result[obj["document_type"]].append(obj)
        # Keep descending even after a match: nested values may hold
        # further documents.
        for value in obj.values():
            group_documents_by_type(value, result)
    elif isinstance(obj, list):
        for item in obj:
            group_documents_by_type(item, result)
    return result


# ---------------------------------------------------------------------------
# Transformation functions
#
# Each one maps raw extracted fields onto the corresponding validation
# schema, runs pydantic validation with the application form supplied as
# validation context, and returns the validated payload as a plain dict
# (via ``model_dump()``).
# ---------------------------------------------------------------------------


def transform_validate_payslip(
    data: Dict[str, Any], application_form_dict: Dict[str, str]
) -> Dict[str, Any]:
    """Validate one extracted payslip against ``schemas.UKPayslipSchema``.

    Args:
        data: Raw extracted payslip fields.
        application_form_dict: Application-form values, passed as pydantic
            validation context so validators can cross-check against them.

    Returns:
        The validated payslip as a plain dict (``model_dump()``).

    Raises:
        pydantic.ValidationError: If the payload fails schema validation.
    """
    payslip_payload = {
        "pay_period_start_date": data.get("pay_period_start"),
        "pay_period_end_date": data.get("pay_period_end"),
        "pay_date": data.get("payslip_date"),
        "full_name": data.get("employee_name"),
        "employer_name": data.get("employer_name"),
        "is_basic_pay_net_pay_other_salary_components_present": bool(
            data.get("basic_pay") and data.get("net_pay")
        ),
        # NOTE: "deducation" (sic) is the field name declared by the schema;
        # do not "fix" the spelling here without changing the schema too.
        "is_tax_deducation_present": bool(data.get("tax_deduction")),
        "is_ni_deduction_present": bool(data.get("ni_contribution")),
        "complete_employee_address": data.get("employee_address"),
    }
    return schemas.UKPayslipSchema.model_validate(
        payslip_payload,
        context=application_form_dict,
    ).model_dump()


def transform_validate_passport(
    data: Dict[str, Any], application_form_dict: Dict[str, str]
) -> Dict[str, Any]:
    """Validate one extracted passport against ``schemas.UKPassportSchema``.

    Args:
        data: Raw extracted passport fields.
        application_form_dict: Application-form values used as validation
            context.

    Returns:
        The validated passport as a plain dict (``model_dump()``).

    Raises:
        pydantic.ValidationError: If the payload fails schema validation.
    """
    # Prefer an explicit full name; otherwise assemble it from the given
    # names + surname so it is comparable with the full names produced by
    # the other document transforms (the cross-document name check relies
    # on this).  Using given names alone would make the check fail.
    full_name = (
        data.get("full_name")
        or f"{data.get('given_names', '')} {data.get('surname', '')}".strip()
    )
    passport_payload = {
        "full_name": full_name,
        "expiry_date": data.get("date_of_expiry"),
    }
    return schemas.UKPassportSchema.model_validate(
        passport_payload,
        context=application_form_dict,
    ).model_dump()


def transform_validate_driving_license(
    data: Dict[str, Any], application_form_dict: Dict[str, str]
) -> Dict[str, Any]:
    """Validate one extracted driving licence against ``schemas.UKDrivingLicense``.

    Args:
        data: Raw extracted driving-licence fields.
        application_form_dict: Application-form values used as validation
            context.

    Returns:
        The validated driving licence as a plain dict (``model_dump()``).

    Raises:
        pydantic.ValidationError: If the payload fails schema validation.
    """
    # Fall back to first name + surname when no single full-name field
    # was extracted.
    full_name = (
        data.get("full_name")
        or f"{data.get('first_name', '')} {data.get('surname', '')}".strip()
    )
    driving_license_payload = {"full_name": full_name}
    return schemas.UKDrivingLicense.model_validate(
        driving_license_payload,
        context=application_form_dict,
    ).model_dump()


def transform_validate_bank_statement(
    data: Dict[str, Any], application_form_dict: Dict[str, str]
) -> Dict[str, Any]:
    """Validate one extracted bank statement against ``schemas.UKBankAccountStatement``.

    Args:
        data: Raw extracted bank-statement fields.
        application_form_dict: Application-form values used as validation
            context.

    Returns:
        The validated bank statement as a plain dict (``model_dump()``).

    Raises:
        pydantic.ValidationError: If the payload fails schema validation.
    """
    # Date of the first salary deposit, taken from the first entry of
    # "salary_credits" when present and well-formed.  Best effort: any
    # malformed entry (missing key, wrong shape/type) leaves it as None.
    salary_credits = data.get("salary_credits", [])
    first_salary_date = None
    if salary_credits:
        try:
            first_salary_date = salary_credits[0]["date"]
        except (IndexError, ValueError, KeyError, TypeError):
            pass

    bank_statement_payload = {
        "statement_start_date": data.get("statement_start_date"),
        "statement_end_date": data.get("statement_end_date"),
        "first_salary_deposit_date_present": first_salary_date,
        "bank_name": data.get("bank_name"),
        "full_name": data.get("account_holder_name"),
        "account_number": data.get("account_number"),
        "sort_code": data.get("sort_code"),
    }
    return schemas.UKBankAccountStatement.model_validate(
        bank_statement_payload,
        context=application_form_dict,
    ).model_dump()


def process_extracted_data(
    extracted_data: Dict[str, Any],
    application_form: Dict[str, Any],
    full_data_transformed,  # retained for interface compatibility; unused here
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """Group, transform and validate extracted documents, then cross-check names.

    Args:
        extracted_data: Nested structure of raw extracted documents.
        application_form: Application-form values forwarded to every
            transform as pydantic validation context.
        full_data_transformed: Unused; kept so existing callers keep working.

    Returns:
        A 2-tuple of:
            * dict of validated documents keyed by plural document type
              ("payslips", "bank_statements", "passports",
              "driving_licenses");
            * the cross-document name-consistency check result dict with
              keys "Policy", "Value", "Status" and "Message".
    """
    grouped_docs = group_documents_by_type(extracted_data)

    transformed_validated_data = {
        "payslips": [
            transform_validate_payslip(doc, application_form)
            for doc in grouped_docs["payslip"]
        ],
        "bank_statements": [
            transform_validate_bank_statement(doc, application_form)
            for doc in grouped_docs["bank_statement"]
        ],
        "passports": [
            transform_validate_passport(doc, application_form)
            for doc in grouped_docs["passport"]
        ],
        "driving_licenses": [
            transform_validate_driving_license(doc, application_form)
            for doc in grouped_docs["driving_license"]
        ],
    }
    # Lazy %-formatting avoids building the string when INFO is disabled.
    logger.info("transformed_validated_data: %s", transformed_validated_data)

    # `names_across_docs` holds normalised (lowercased, whitespace-stripped)
    # full names from every validated document.  If more than one distinct
    # normalised name remains, the uploaded documents disagree about the
    # applicant's identity.
    names_across_docs = set()
    names_all = []
    for docs in transformed_validated_data.values():
        for doc in docs:
            full_name = doc.get("full_name")
            if full_name is not None:
                names_across_docs.add(full_name.lower().replace(" ", ""))
                names_all.append(full_name)

    names_across_docs_match = len(names_across_docs) <= 1
    if names_across_docs_match:
        cross_docs_name_eq_check = {
            "Policy": "Document Consistency",
            # Guard the no-documents case: names_all may be empty even when
            # the (vacuous) match succeeds.
            "Value": names_all[-1] if names_all else None,
            "Status": names_across_docs_match,
            "Message": "Applicant's name matches across the uploaded documents",
        }
    else:
        cross_docs_name_eq_check = {
            "Policy": "Document Consistency",
            "Value": names_all,
            "Status": names_across_docs_match,
            "Message": "Applicant's name does not match across the uploaded documents",
        }
    return transformed_validated_data, cross_docs_name_eq_check