diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..6541efbcf70af43bc9c0614e320753737e3af8ac --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +**__pycache__** +logs_directory/ \ No newline at end of file diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/__pycache__/__init__.cpython-313.pyc b/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ae119b619128376a9ee192878adf4fc19d2c7b86 Binary files /dev/null and b/__pycache__/__init__.cpython-313.pyc differ diff --git a/api/__init__.py b/api/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..f2dba2c16f401563a1a20ee9e87b146a7f0e3446 --- /dev/null +++ b/app.py @@ -0,0 +1,440 @@ +import streamlit as st +import pandas as pd + +from schemas.custom_app_form import CustomAppFormUpload +from utils.prep_validators_payload import process_extracted_data + + +upload_docs_tab, demo_validations_considered_tab, upload_docs_validation_results_tab = st.tabs( + ["Upload Documents", "Demo Validations", "Validation Results"] +) + +with upload_docs_tab: + st.header("Upload Documents") + # st.markdown("## Upload Custom Application Form") + uploaded_custom_application_form_file = st.file_uploader( + label="Upload Custom Application Form", + accept_multiple_files=False, + type=["csv"], + ) + + uploaded_files = st.file_uploader( + label="Upload files to be validated", + accept_multiple_files=True, + type=["png", "jpg", "jpeg", "pdf", "zip"] + ) + + if uploaded_custom_application_form_file: + uploaded_custom_form_df = pd.read_csv( + uploaded_custom_application_form_file, header=None) + uploaded_custom_form_dict = dict( + zip(uploaded_custom_form_df[0], 
uploaded_custom_form_df[1])) + st.write("Raw Dictionary:") + st.json(uploaded_custom_form_dict) + custom_app_form = CustomAppFormUpload.model_validate( + uploaded_custom_form_dict).model_dump() + st.write("Parsed Dictionary:") + st.json(custom_app_form) + # print(custom_app_form) + if isinstance(custom_app_form, dict) and not custom_app_form.get("is_incomplete"): + st.session_state["custom_app_form"] = custom_app_form + st.write("Session State:") + st.write(st.session_state) + + +with demo_validations_considered_tab: + st.header("Demo Validations") + # demo_validations = [ + # # { + # # "Document Type": "Passport", + # # "Validation": "Full name must be present", + # # "Raises Red Flag": True, + # # # "Error Message": "Applicant's full name not present", + # # }, + # # { + # # "Document Type": "Passport", + # # "Validation": "Full name must have length between 2 & 61", + # # "Raises Red Flag": True, + # # # "Error Message": "Full name must have a length of at least 2 & at most 61", + # # }, + # # { + # # "Document Type": "Passport", + # # "Validation": "Full name must have at least two words", + # # "Raises Red Flag": True, + # # # "Error Message": "Full name must consist of at least 2 words (first name + last name)", + # # }, + # { + # "Document Type": "Passport", + # "Validation": ( + # "Full name must be present. " + # "Full name must have length between 2 & 61. " + # "Full name must have at least two words." + # ), + # "Raises Red Flag": True, + # # "Error Message": "Applicant's full name not present", + # }, + # { + # "Document Type": "Passport", + # "Validation": "Expiry date must be present & after a year from current date", + # "Raises Red Flag": True, + # # "Error Message": "Provided passport expires within 1 year", + # }, + # { + # "Document Type": "Payslip", + # "Validation": ( + # "Full name must be present. " + # "Full name must have length between 2 & 61. " + # "Full name must have at least two words." 
+ # ), + # "Raises Red Flag": True, + # # "Error Message": "Applicant's full name not present", + # }, + # { + # "Document Type": "Payslip", + # "Validation": "Employer name must be present", + # "Raises Red Flag": True, + # # "Error Message": "Employer name not present", + # }, + # { + # "Document Type": "Payslip", + # "Validation": "Employer name must have at least alphabet", + # "Raises Red Flag": True, + # # "Error Message": "Employer name must contain at least one letter", + # }, + # { + # "Document Type": "Payslip", + # "Validation": "Employer name cannot be only whitespace", + # "Raises Red Flag": True, + # # "Error Message": "Employer name cannot be only whitespace", + # }, + # { + # "Document Type": "Payslip", + # "Validation": "Employer name must match the provided value", + # "Raises Red Flag": True, + # # "Error Message": "Employer name mismatch with provided value", + # }, + # { + # "Document Type": "Payslip", + # "Validation": ( + # "Pay period start & dates must be present.\n" + # "Pay period start date cannot be on or after the end date.\n" + # "Pay period's end date must be within the last 35 days & not in the future.\n" + # "Pay period's date(s) must not be older than those of the last calendar month.\n" + # "Pay period's start date & end date must have a gap of at least 28 days." + # ), + # "Raises Red Flag": True, + # # "Error Message": "Employer name mismatch with provided value", + # }, + # { + # "Document Type": "Payslip", + # "Validation": ( + # "Basic salary, Net Salary and/or other requisite salary components must be present. " + # "Tax Deduction line item must be present. " + # "NI/National Insurance line item must be present." + # ), + # "Raises Red Flag": True, + # }, + # { + # "Document Type": "Payslip", + # "Validation": ( + # "Applicant's address must be present. " + # "Applicant's complete address must have a length of at least 10 & at most 300. " + # "Complete address must match with provided value. 
" + # ), + # "Raises Red Flag": True, + # }, + # { + # "Document Type": "Payslip", + # "Validation": "Employee number must be greater than 25", + # "Raises Red Flag": True, + # }, + # { + # "Document Type": "Digital Bank Account Statement", + # "Validation": ( + # "Full name must be present. " + # "Full name must have length between 2 & 61. " + # "Full name must have at least two words." + # ), + # "Raises Red Flag": True, + # # "Error Message": "Applicant's full name not present", + # }, + # { + # "Document Type": "Digital Bank Account Statement", + # "Validation": ( + # "Bank name must be present. " + # "Bank name must have length between 4 & 50. " + # "Bank Name must match provided value." + # ), + # "Raises Red Flag": True, + # # "Error Message": "Applicant's full name not present", + # }, + # { + # "Document Type": "Digital Bank Account Statement", + # "Validation": ( + # "Bank account number must be present. " + # "Bank account number must be of 8 digits only. " + # ), + # "Raises Red Flag": True, + # # "Error Message": "Applicant's full name not present", + # }, + # { + # "Document Type": "Digital Bank Account Statement", + # "Validation": ( + # "Sort number must be present. " + # "It must be of the format xx-xx-xx wherein x are digits. " + # ), + # "Raises Red Flag": True, + # # "Error Message": "Applicant's full name not present", + # }, + # { + # "Document Type": "Digital Bank Account Statement", + # "Validation": ( + # "Both statement start date & statement end date must be present. " + # "Account statement period's start date & end date must have a gap of at least 28 days. " + # "At least one salary credit must be present. " + # "Statement period's end date must be after the start date. 
" + # ), + # "Raises Red Flag": True, + # # "Error Message": "Applicant's full name not present", + # }, + # ] + demo_validations = [ + # { + # "Topic / Document Type": "General Guidance", + # "Policy / Rule / Condition": "Income/Employment Docs Risk", + # "Action / Guidance / Requirement": "Be aware of higher risk of manipulation (Payslips, bank statements, Customer name).", + # "Red Flag / Caution": "Higher risk category.", + # "Notes / Details": "", + # }, + { + "Topic / Document Type": "General Guidance", + "Policy / Rule / Condition": "Document Consistency", + # "Action / Guidance / Requirement": "Compare information across all documents (e.g., payslips vs bank statements) to ensure consistency.", + "Action / Guidance / Requirement": "Compare applicant's full name across all documents to ensure consistency.", + # "Red Flag / Caution": "Inconsistencies require investigation.", + # "Notes / Details": "", + }, + # { + # "Topic / Document Type": "General Guidance", + # "Policy / Rule / Condition": "Payslip YTD Check", + # "Action / Guidance / Requirement": "Do Year-to-Date figures (gross income, tax) make sense?", + # "Red Flag / Caution": "If figures don’t make sense, investigate.", + # "Notes / Details": "", + # }, + # { + # "Topic / Document Type": "General Guidance", + # "Policy / Rule / Condition": "Payslip Details Check", + # "Action / Guidance / Requirement": "Check for low employee numbers, rounded figures, differences in payment methods (e.g., payslip says BACS, statement shows Faster Payment).", + # "Red Flag / Caution": "These can be red flags requiring investigation.", + # "Notes / Details": "", + # }, + { + "Topic / Document Type": "General Guidance", + "Policy / Rule / Condition": "Overall Validation", + "Action / Guidance / Requirement": "Ensure document is genuine, not fraudulent, belongs to the customer, and is from the expected source.", + # "Red Flag / Caution": "Any doubt may indicate fraud.", + # "Notes / Details": "Applies to all documents.", 
+ }, + { + "Topic / Document Type": "Passport", + "Policy / Rule / Condition": "Full Name", + "Action / Guidance / Requirement": ( + "Full name must be present. " + "Full name must have length between 2 & 61. " + "Full name must have at least two words." + ), + # "Raises Red Flag": True, + # "Error Message": "Applicant's full name not present", + }, + { + "Topic / Document Type": "Passport", + "Policy / Rule / Condition": "Expiry Date", + "Action / Guidance / Requirement": "Expiry date must be present & after a year from current date", + # "Raises Red Flag": True, + # "Error Message": "Provided passport expires within 1 year", + }, + { + "Topic / Document Type": "Payslips", + "Policy / Rule / Condition": "Employer & Customer Names", + "Action / Guidance / Requirement": "Must include correct Employer’s and Customer’s names.", + # "Red Flag / Caution": "Missing or incorrect names.", + # "Notes / Details": "Cross-reference with BMM/HOME.", + }, + { + "Topic / Document Type": "Payslips", + "Policy / Rule / Condition": "Submission Requirement (Monthly Pay)", + "Action / Guidance / Requirement": "Minimum one month's most recent payslip required.", + # "Red Flag / Caution": "", + # "Notes / Details": "", + }, + { + "Topic / Document Type": "Payslips", + "Policy / Rule / Condition": "Pay Date Requirement", + # "Action / Guidance / Requirement": "Pay date must be within 35 days of FCD (Final Completion Date).", + "Action / Guidance / Requirement": "Pay date must be within 35 days of document upload date.", + # "Red Flag / Caution": "Pay date older than 35 days from FCD.", + # "Notes / Details": "", + }, + { + "Topic / Document Type": "Payslips", + "Policy / Rule / Condition": "Pay Period End Date (DD/MM/YYYY, if no pay date)", + "Action / Guidance / Requirement": "Period end date must be within 35 days of FCD.", + # "Red Flag / Caution": "Period end date older than 35 days from FCD.", + # "Notes / Details": "", + }, + { + "Topic / Document Type": "Payslips", + # "Policy / 
Rule / Condition": "Pay Period Month (MM/YYYY, if no pay date)", + "Policy / Rule / Condition": "Pay Period Month (MM/YYYY, if no pay date) basis pay period duration", + "Action / Guidance / Requirement": "Payslips dated in the current or previous calendar month are acceptable (must be the most recent).", + # "Red Flag / Caution": "Older than previous calendar month.", + # "Notes / Details": "", + }, + { + "Topic / Document Type": "Payslips", + "Policy / Rule / Condition": "Undated Payslips", + "Action / Guidance / Requirement": "Unacceptable.", + # "Red Flag / Caution": "Undated payslip received.", + # "Notes / Details": "Request a dated version.", + }, + { + "Topic / Document Type": "Payslips", + "Policy / Rule / Condition": "Tax & NI Contributions", + "Action / Guidance / Requirement": "Must be visible. Perform a sense check.", + # "Red Flag / Caution": "Missing or nonsensical Tax/NI figures.", + # "Notes / Details": "", + }, + # custom + { + "Topic / Document Type": "Payslips", + "Policy / Rule / Condition": "Applicant Address", + "Action / Guidance / Requirement": ( + "Applicant's address must be present. " + "Applicant's complete address must have a length of at least 10 & at most 300. " + "Complete address must match with provided value. 
" + ), + # "Red Flag / Caution": "Missing or nonsensical Tax/NI figures.", + # "Notes / Details": "", + }, + # { + # "Topic / Document Type": "Payslips", + # "Policy / Rule / Condition": "YTD Figures Match", + # "Action / Guidance / Requirement": "Verify YTD figures match declared income.", + # "Red Flag / Caution": "YTD figures do not match declared income.", + # "Notes / Details": "Add to YMI/FDM memo if they do not match.", + # }, + # { + # "Topic / Document Type": "Payslips", + # "Policy / Rule / Condition": "Pension Income (on Payslip)", + # "Action / Guidance / Requirement": "Must show within the last 35 days / be the most recent.", + # "Red Flag / Caution": "Pension income shown is dated >35 days ago.", + # "Notes / Details": "Alternatively use pension annual statement/latest P60. Cross-reference with bank statement if possible.", + # }, + # { + # "Topic / Document Type": "Payslips", + # "Policy / Rule / Condition": "Joint Applicants", + # "Action / Guidance / Requirement": "Required if applicable.", + # "Red Flag / Caution": "Missing payslip for a joint applicant.", + # "Notes / Details": "", + # }, + # { + # "Topic / Document Type": "Payslips", + # "Policy / Rule / Condition": "Payslip Red Flags", + # "Action / Guidance / Requirement": "", + # "Red Flag / Caution": "Rounded figures. Low employee/payroll number. Presence of these flags.", + # "Notes / Details": "Investigate further.", + # }, + # { + # "Topic / Document Type": "Payslips", + # "Policy / Rule / Condition": "Payslip Verification (HOME)", + # "Action / Guidance / Requirement": "Check information in HOME against payslip details (employer name, customer name, etc.).", + # "Red Flag / Caution": "Mismatches found (e.g., misspellings, missing words).", + # "Notes / Details": "Correct HOME after consulting customer. 
If correction not possible (e.g., space), add YMI/FDM memo explaining.", + # }, + # { + # "Topic / Document Type": "Payslips", + # "Policy / Rule / Condition": "Payslip Near 35-Day Limit", + # "Action / Guidance / Requirement": "If payslip is close to the 35-day limit and no decision is obtained.", + # "Red Flag / Caution": "Decision pending, payslip nearing expiry.", + # "Notes / Details": "Another, more recent payslip may be required.", + # }, + # { + # "Topic / Document Type": "Digital Bank Stmts", + # "Policy / Rule / Condition": "Purpose", + # "Action / Guidance / Requirement": "Used to confirm income/expenditure.", + # "Red Flag / Caution": "", + # "Notes / Details": "Cannot be used for ID & VA confirmation.", + # }, + { + "Topic / Document Type": "Digital Bank Stmts", + "Policy / Rule / Condition": "Coverage", + # "Action / Guidance / Requirement": "Must cover a full calendar month (vs 28 days for original).", + "Action / Guidance / Requirement": "Account statement period's start date & end date must have a gap of at least 28 days.", + # "Red Flag / Caution": "", + # "Notes / Details": "", + }, + { + "Topic / Document Type": "Digital Bank Stmts", + "Policy / Rule / Condition": "Data Match", + "Action / Guidance / Requirement": "Customer data on statement must match profile.", + # "Red Flag / Caution": "Data mismatch.", + # "Notes / Details": "", + }, + # { + # "Topic / Document Type": "Digital Bank Stmts", + # "Policy / Rule / Condition": "Pay Info Match", + # "Action / Guidance / Requirement": "Verify pay information matches the payslip.", + # "Red Flag / Caution": "Pay info mismatch vs payslip.", + # "Notes / Details": "", + # }, + { + "Topic / Document Type": "Digital Bank Stmts", + "Policy / Rule / Condition": "Authenticity Doubt", + "Action / Guidance / Requirement": "If any doubt regarding authenticity.", + # "Red Flag / Caution": "Suspected non-genuine digital statement.", + # "Notes / Details": "Cases may be referred to Fraud.", + }, + { + "Topic / 
Document Type": "Digital Bank Stmts", + "Policy / Rule / Condition": "Bank name", + "Action / Guidance / Requirement": ( + "Bank name must be present. " + "Bank name must have length between 4 & 50. " + "Bank Name must match provided value." + ), + }, + { + "Topic / Document Type": "Digital Bank Stmts", + "Policy / Rule / Condition": "Bank account number", + "Action / Guidance / Requirement": ( + "Bank account number must be present. " + "Bank account number must be of 8 digits only. " + ), + }, + { + "Topic / Document Type": "Digital Bank Stmts", + "Policy / Rule / Condition": "Sort code", + "Action / Guidance / Requirement": ( + "Sort number must be present. " + "It must be of the format xx-xx-xx wherein x are digits. " + ), + }, + { + "Topic / Document Type": "Digital Bank Stmts", + "Policy / Rule / Condition": "Date checks", + "Action / Guidance / Requirement": ( + "Both statement start date & statement end date must be present. " + "At least one salary credit must be present. " + "Statement period's end date must be after the start date. 
" + ), + }, + ] + + demo_validations_df = pd.DataFrame(demo_validations) + st.table(demo_validations_df) + + +with upload_docs_validation_results_tab: + st.header("Validation Results") + if st.session_state: + st.session_state diff --git a/app_fastapi.py b/app_fastapi.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/app_streamlit.py b/app_streamlit.py new file mode 100644 index 0000000000000000000000000000000000000000..ee3d215772bf70ad90bff6a7eccec9cd6cfb0ec0 --- /dev/null +++ b/app_streamlit.py @@ -0,0 +1,48 @@ +import streamlit as st +from utils.process_files import process_uploaded_files +from utils.document_display import display_based_on_card +import os +import pandas as pd +import json +from llm.document_analyzer import analyze_files + +from PIL import Image +from utils import setup_logger +from utils.session_state import reset_state +from datetime import datetime +import uuid +from utils.tabs.document_upload_tab import upload_documents +from utils.tabs.memo import display_memo +from utils.tabs.demo_validations import display_demo_validations +from utils.tabs.document_validation_tab import validate_documents + +logger = setup_logger(__name__) + +st.set_page_config(layout="wide") + + +# Initialize session state structures +if 'uploads' not in st.session_state: + st.session_state['uploads'] = {} +if 'current_upload' not in st.session_state: + st.session_state['current_upload'] = None + + +st.title("🪪 Underwriting Workflow") + + +upload_docs_tab, memo_tab, upload_docs_validation_results_tab, demo_validations_considered_tab = st.tabs( + ["Upload Documents", "Memo", "Validation Results", "Policies"] +) + +with upload_docs_tab: + upload_documents() + +with memo_tab: + display_memo() + +with demo_validations_considered_tab: + display_demo_validations() + +with upload_docs_validation_results_tab: + validate_documents(current=st.session_state['current_upload']) diff --git 
a/app_streamlit_bak.py b/app_streamlit_bak.py new file mode 100644 index 0000000000000000000000000000000000000000..6a2b67bf5d8a0337df3f9f1ca20589541d56a3dc --- /dev/null +++ b/app_streamlit_bak.py @@ -0,0 +1,79 @@ +import streamlit as st +from utils.process_files import process_uploaded_files +from utils.document_display import display_based_on_card +import os +import pandas as pd +import json +from llm.document_analyzer import analyze_files + +from PIL import Image +from utils import setup_logger + +logger = setup_logger(__name__) + +st.set_page_config(layout="wide") +if len(st.session_state) == 0: + if 'tab_ocr' not in st.session_state: + # if st.session_state['tab_ocr']['file_groups'] is None: + st.session_state = { + 'tab_ocr': { + 'file_groups': None, + 'values_raw': None, + 'values_display': None + + } + } + +logger.info(f"st.session_state: {st.session_state}") +st.title("ID Analyser") + +uploaded_files = st.file_uploader("Upload Images, PDFs", accept_multiple_files=True, type=[ + "png", "jpg", "jpeg", "pdf", "zip"]) + + +if uploaded_files: + st.session_state = { + 'tab_ocr': { + 'file_groups': None, + 'values_raw': None, + 'values_display': None + + } + } + file_paths, file_groups, temp_dir = process_uploaded_files( + uploaded_files) # Remove file paths later + if st.session_state['tab_ocr']['file_groups'] is None: + st.session_state['tab_ocr']['file_groups'] = file_groups + + analyze_clicked = st.button("Analyze") + + if analyze_clicked: + st.session_state['tab_ocr']['values_raw'] = None + st.session_state['tab_ocr']['values_display'] = None + + if analyze_clicked or st.session_state['tab_ocr']['values_display']: + # if st.button("Analyze") or st.session_state['tab_ocr']['values_display'] is not None: + if st.session_state['tab_ocr']['values_raw'] is None: + analysis_results_groups, json_output_path = analyze_files( + file_groups=st.session_state['tab_ocr']['file_groups'], + temp_dir=temp_dir) + + st.session_state['tab_ocr']['values_raw'] = 
analysis_results_groups + + if st.session_state['tab_ocr']['values_display'] is None: + st.session_state['tab_ocr']['values_display'] = {} + + for original_file, extracted_files in st.session_state['tab_ocr']['file_groups'].items(): + analysis_results_for_id = display_based_on_card( + original_file=original_file, + analysis_results_for_original_file=st.session_state[ + 'tab_ocr']['values_raw'][original_file], + extracted_files=extracted_files) + + st.download_button( + label="Download Analysis JSON", + data=json.dumps( + st.session_state['tab_ocr']['values_raw'], indent=4), + file_name="analysis_results.json", + mime="application/json" + ) diff --git a/checks copy.ipynb b/checks copy.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..97027fd14328d1c71be5a7178d2c7f4c286e27ee --- /dev/null +++ b/checks copy.ipynb @@ -0,0 +1,235 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ID: Passport" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Check against sample extracted JSON" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "\n", + "from utils.prep_validators_payload import process_extracted_data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "custom_app_form = {\n", + " \"application_summary_full_name\": \"Jodie Pippa\",\n", + " \"application_summary_bank_name\": \"HSBC\",\n", + " \"application_summary_employer_name\": \"ABC Ltd\",\n", + " \"application_summary_complete_address\": \"123 Maple Street, London, UK, SW1A 1AA\",\n", + " \"full_name_err_msgs\": None,\n", + " \"bank_name_err_msgs\": None,\n", + " \"employer_name_err_msgs\": None,\n", + " \"complete_employee_address_err_msgs\": None,\n", + " \"is_incomplete\": False,\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + 
"name": "stdout", + "output_type": "stream", + "text": [ + "{'/tmp/tmp6w8qn6h6/sample_documents/sample_documents/3.pdf': {'/tmp/tmp6w8qn6h6/sample_documents/sample_documents/3.pdf_page_0.png': {'document_category': 'bank_statement', 'document_type': 'bank_statement', 'account_holder_name': 'Jodie Pippa', 'account_holder_address': '', 'bank_name': 'HSBC', 'account_number': '12345678', 'sort_code': '20-00-00', 'statement_start_date': '2025-01-01', 'statement_end_date': '2025-02-28', 'salary_credits': [{'date': '2025-01-06', 'amount': '2213.83', 'from': 'ABC Ltd', 'description': 'Salary - ABC Ltd (Pro-rated Jan)'}, {'date': '2025-02-06', 'amount': '2566.66', 'from': 'ABC Ltd', 'description': 'Salary - ABC Ltd (Full Feb Salary)'}]}}, '/tmp/tmp6w8qn6h6/sample_documents/sample_documents/5.pdf': {'/tmp/tmp6w8qn6h6/sample_documents/sample_documents/5.pdf_page_0.png': {'document_category': 'income_document', 'document_type': 'payslip', 'employee_name': 'Jodie Pippa', 'employer_name': 'ABC Ltd', 'employee_id': 'JP12345', 'employee_address': '123 Maple Street, London, UK, SW1A 1AA', 'employer_address': '456 Business Street, London, UK, SW1A 2BB', 'tax_code': '1257L', 'payslip_date': '2025-01-31', 'pay_period_start': '2025-01-06', 'pay_period_end': '2025-01-31', 'payment_frequency': 'monthly', 'basic_pay': '3333.33', 'net_pay': '2566.66', 'gross_pay': '3333.33', 'salary_components': [], 'ni_contribution': '266.67', 'tax_deduction': '333.33', 'other_deductions': [{'name': 'Pension Contribution', 'amount': '166.67'}]}}, '/tmp/tmp6w8qn6h6/sample_documents/sample_documents/2.pdf': {'/tmp/tmp6w8qn6h6/sample_documents/sample_documents/2.pdf_page_0.png': {'document_category': 'income_document', 'document_type': 'p60', 'employee_details': {'surname': 'Pippa', 'forenames_or_initials': 'Jodie', 'national_insurance_number': 'AB123456C', 'works_payroll_number': '5342'}, 'pay_and_income_tax_details': {'previous_employments': {'pay': 0.0, 'tax_deducted': 0.0}, 'current_employment': {'pay': 
9545.45, 'tax_deducted': 0.0}, 'total_for_year': {'pay': 9545.45, 'tax_deducted': 0.0}, 'final_tax_code': '1257'}, 'national_insurance_contributions': [{'nic_letter': 'A', 'earnings': {'at_or_above_lel': 6396.0, 'above_lel_up_to_pt': 0.0, 'above_pt_up_to_uel': 3149.45}, 'employee_contributions_above_pt': 377.93}], 'statutory_payments': {'maternity_pay': 0.0, 'paternity_pay': 0.0, 'adoption_pay': 0.0, 'shared_parental_pay': 0.0}, 'other_details': {'student_loan_deductions': 0.0}, 'employer_details': {'employer_name_and_address': None, 'paye_reference': '123/AB456'}}}, '/tmp/tmp6w8qn6h6/sample_documents/sample_documents/1.pdf': {'/tmp/tmp6w8qn6h6/sample_documents/sample_documents/1.pdf_page_0.png': {'document_category': 'identity_verification_document', 'document_type': 'passport', 'full_name': 'UNITED-KINGDOM-FIVE JODIE PIPPA', 'surname': 'UNITED-KINGDOM-FIVE', 'given_names': 'JODIE PIPPA', 'passport_number': '107185703', 'nationality': 'BRITISH CITIZEN', 'date_of_birth': '1985-01-17', 'place_of_birth': 'LONDON', 'sex': 'F', 'date_of_issue': '2006-01-31', 'date_of_expiry': '2016-01-31', 'issuing_authority': 'UKPA', 'passport_type': 'P', 'country_code': 'GBR', 'mrz_line_1': 'P\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PolicyValueStatusMessage
0Applicant's full name should be presentJodie PippaTrueApplicant's full name is present
1Full name must have a length of at least 2 & a...11TrueFull name has a length of at least 2 & at most 61
2Full name must consist of at least 2 words (fi...2TrueFull name consists of at least 2 words (first ...
3Name should match with provided valueJodie PippaTrueName matches with provided value
4Employer name must be presentABC LtdTrueEmployer name is present
5Employer name must match with provided valueABC LtdTrueEmployer name matches with provided value
\n", + "" + ], + "text/plain": [ + " Policy Value Status \\\n", + "0 Applicant's full name should be present Jodie Pippa True \n", + "1 Full name must have a length of at least 2 & a... 11 True \n", + "2 Full name must consist of at least 2 words (fi... 2 True \n", + "3 Name should match with provided value Jodie Pippa True \n", + "4 Employer name must be present ABC Ltd True \n", + "5 Employer name must match with provided value ABC Ltd True \n", + "\n", + " Message \n", + "0 Applicant's full name is present \n", + "1 Full name has a length of at least 2 & at most 61 \n", + "2 Full name consists of at least 2 words (first ... \n", + "3 Name matches with provided value \n", + "4 Employer name is present \n", + "5 Employer name matches with provided value " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "a = process_extracted_data(full_data, custom_app_form)\n", + "a['payslips'][0]['validation_policy_status_df']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "hsbc_uk_demo_venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/config.toml b/config.toml new file mode 100644 index 0000000000000000000000000000000000000000..af6a35f5d39d7c870ed60300d7a906a6fb66001c --- /dev/null +++ b/config.toml @@ -0,0 +1,3 @@ +[server] + +maxUploadSize = 10 \ No newline at end of file diff --git a/llm/__init__.py b/llm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git 
a/llm/__pycache__/__init__.cpython-313.pyc b/llm/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bc42eb6c899bfad45484d45acbd68e61cef8fa71 Binary files /dev/null and b/llm/__pycache__/__init__.cpython-313.pyc differ diff --git a/llm/__pycache__/document_analyzer.cpython-313.pyc b/llm/__pycache__/document_analyzer.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2899e8467ba67dd67fdd350ad23e9d7af137abc6 Binary files /dev/null and b/llm/__pycache__/document_analyzer.cpython-313.pyc differ diff --git a/llm/__pycache__/llm.cpython-313.pyc b/llm/__pycache__/llm.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..97807326920a5ac56662cedc6ec2cc8534cd8876 Binary files /dev/null and b/llm/__pycache__/llm.cpython-313.pyc differ diff --git a/llm/document_analyzer.py b/llm/document_analyzer.py new file mode 100644 index 0000000000000000000000000000000000000000..3242ebc9a8bed94b3872635eed93a51a0fff4fd6 --- /dev/null +++ b/llm/document_analyzer.py @@ -0,0 +1,102 @@ + +import streamlit as st +from PIL import Image +import json +import os +from utils import im_2_b64, load_pdf_as_image, generate_metadata +from .llm import DocumentLLM +from prompts import (document_type_prompt, passport_prompt, + payslip_prompt, bank_statement_prompt, + p60_prompt, driving_license_prompt, + genric_ocr_prompt) +from utils.json_utils import restructure_documents + +from utils import setup_logger + +logger = setup_logger(__name__) + + +def analyze_files(file_groups: dict, temp_dir, current_upload): + document_llm = DocumentLLM() + results_group = {} + for original_file, extracted_files in file_groups.items(): + results = {} + for file_name in extracted_files: + results[file_name] = {"status": "processed", + "type": "image", "dummy_data": 12345} + + logger.info(f"file_name : {file_name}") + extension = file_name.lower().split('.')[-1] + + results[file_name] = 
generate_metadata(file_name) + + try: + logger.info(f"Starting analysis for {file_name}") + + if extension in ['jpg', 'jpeg', 'png', 'gif']: + image = Image.open(file_name) + image_buffer = im_2_b64(image) + elif extension == 'pdf': + img = load_pdf_as_image(file_name) + image_buffer = im_2_b64(img) + st.image(img, use_container_width=True) + + else: + st.write( + f"Unsupported file format: {extension}") + + if image_buffer is not None: + results[file_name] = document_llm.call_llm_api( + prompt=document_type_prompt, + image_path=file_name) + + logger.info( + f"File name: {file_name}, Results: {results[file_name]}") + document_type = results[file_name].get( + 'document_type', None) + + if document_type is not None: + + prompt = None + + if document_type == 'passport': + prompt = passport_prompt + elif document_type == 'driving_license': + prompt = driving_license_prompt + elif document_type == 'bank_statement': + prompt = bank_statement_prompt + elif document_type == 'payslip': + prompt = payslip_prompt + elif document_type == 'p60': + prompt = p60_prompt + else: + prompt = genric_ocr_prompt + + if prompt is not None: + data = document_llm.call_llm_api( + prompt=prompt, + image_path=file_name) + + results[file_name].update(data) + + logger.info(f"{file_name}: {data}") + + except Exception as e: + st.error(f"Error processing {file_name}: {str(e)}") + + image_buffer = None + + results_group[original_file] = results + + + results_transformed = restructure_documents(results_group) + st.session_state['uploads'][current_upload]['results_transformed'] = results_transformed + + + # Save analysis results to a JSON file + json_output_path = os.path.join( + temp_dir, "analysis_results.json") + with open(json_output_path, "w") as json_file: + json.dump(results_group, json_file, indent=4) + + return results_group, json_output_path diff --git a/llm/llm.py b/llm/llm.py new file mode 100644 index 0000000000000000000000000000000000000000..e387e629a32ad98a1ed9426ebb046003dd456042 --- 
from pydantic import BaseModel
from google import genai
from google.genai.types import HttpOptions
from dotenv import load_dotenv
import os
import json
import re
from vertexai.generative_models import Part, Image, GenerativeModel
from utils import setup_logger
import vertexai

logger = setup_logger(__name__)

load_dotenv()

project = os.getenv("GOOGLE_CLOUD_PROJECT")
location = os.getenv("GOOGLE_CLOUD_LOCATION")
vertexai.init(project=project, location=location)


class DocumentLLM(BaseModel):
    """Thin wrapper around a Vertex AI Gemini model for document OCR/classification.

    The model is always prompted with an image plus a text prompt and is
    expected to reply with JSON (optionally wrapped in a markdown code fence).
    """

    def call_llm_api(self, prompt: str, image_path: str) -> dict:
        """Send ``prompt`` plus the image at ``image_path`` to Gemini and return the parsed JSON reply.

        Args:
            prompt: Instruction text asking the model for structured output.
            image_path: Path to a local image file to attach to the request.

        Returns:
            The model reply parsed with ``json.loads`` (typically a dict).

        Raises:
            json.JSONDecodeError: if the reply is not valid JSON even after
                stripping a surrounding markdown code fence.
        """
        model = GenerativeModel(model_name="gemini-2.0-flash-001")

        text_part = Part.from_text(prompt)
        image_part = Part.from_image(Image.load_from_file(image_path))

        response = model.generate_content([image_part, text_part])
        content = response.text

        try:
            return json.loads(content)
        except json.JSONDecodeError:
            # The model frequently wraps its JSON in a ```json ... ``` (or
            # plain ```) fence; strip the fence and parse again. Catching only
            # JSONDecodeError (the original caught bare Exception) keeps
            # unrelated failures visible to the caller.
            logger.info("Json is being formatted")
            content = re.sub(r"^```(?:json)?\s*|\s*```$", "",
                             content, flags=re.MULTILINE)
            return json.loads(content)
INFO - [document_display.py:167] - document_type for /tmp/tmpq7o86ysz/1.pdf: None +2025-04-14 22:14:01 - utils.document_display - INFO - [document_display.py:245] - Exception for processing analysis results of {'File Name': '1.pdf_page_0.png', 'Directory': 'tmpq7o86ysz', 'File Size': '943.88 KB', 'Last Modified': '2025-04-14 22:13:56', 'Created': '2025-04-14 22:13:56', 'File Extension': '.png', 'Full Path': '/tmp/tmpq7o86ysz/1.pdf_page_0.png', 'Image Size': '1700x2200', 'Image Mode': 'RGB', 'Image Format': 'PNG'}: 'NoneType' object has no attribute 'lower' +2025-04-14 22:14:01 - utils.document_display - INFO - [document_display.py:249] - analysis_results_for_id_updated : {'File Name': '1.pdf_page_0.png', 'Directory': 'tmpq7o86ysz', 'File Size': '943.88 KB', 'Last Modified': '2025-04-14 22:13:56', 'Created': '2025-04-14 22:13:56', 'File Extension': '.png', 'Full Path': '/tmp/tmpq7o86ysz/1.pdf_page_0.png', 'Image Size': '1700x2200', 'Image Mode': 'RGB', 'Image Format': 'PNG'} +2025-04-14 22:14:01 - __main__ - INFO - [app_streamlit.py:40] - file_path while displaying: ['/tmp/tmpq7o86ysz/1.pdf_page_0.png'] +2025-04-14 22:18:51 - llm.document_analyzer - INFO - [document_analyzer.py:26] - file_name : /tmp/tmpqmaed05g/1.pdf_page_0.png +2025-04-14 22:18:51 - llm.document_analyzer - INFO - [document_analyzer.py:32] - Starting analysis for /tmp/tmpqmaed05g/1.pdf_page_0.png +2025-04-14 22:18:55 - utils.document_display - INFO - [document_display.py:148] - analysis_results_for_id : {'File Name': '1.pdf_page_0.png', 'Directory': 'tmpqmaed05g', 'File Size': '943.88 KB', 'Last Modified': '2025-04-14 22:18:51', 'Created': '2025-04-14 22:18:51', 'File Extension': '.png', 'Full Path': '/tmp/tmpqmaed05g/1.pdf_page_0.png', 'Image Size': '1700x2200', 'Image Mode': 'RGB', 'Image Format': 'PNG'} +2025-04-14 22:18:55 - utils.document_display - INFO - [document_display.py:167] - document_type for /tmp/tmpqmaed05g/1.pdf: None +2025-04-14 22:18:55 - utils.document_display - INFO - 
[document_display.py:249] - analysis_results_for_id_updated : {'document_type': None} +2025-04-14 22:18:55 - __main__ - INFO - [app_streamlit.py:40] - file_path while displaying: ['/tmp/tmpqmaed05g/1.pdf_page_0.png'] +2025-04-14 22:37:29 - llm.document_analyzer - INFO - [document_analyzer.py:26] - file_name : /tmp/tmp5dfq1etu/1.pdf_page_0.png +2025-04-14 22:37:29 - llm.document_analyzer - INFO - [document_analyzer.py:32] - Starting analysis for /tmp/tmp5dfq1etu/1.pdf_page_0.png +2025-04-14 22:37:35 - llm.document_analyzer - INFO - [document_analyzer.py:51] - File name: /tmp/tmp5dfq1etu/1.pdf_page_0.png, Results: {'document_category': 'identity_verification_document', 'document_type': 'passport'} +2025-04-14 22:37:40 - llm.llm - INFO - [llm.py:40] - Json is being formatted +2025-04-14 22:37:40 - llm.document_analyzer - INFO - [document_analyzer.py:76] - /tmp/tmp5dfq1etu/1.pdf_page_0.png: {'full_name': 'UNITED-KINGDOM-FIVE JODIE PIPPA', 'surname': 'UNITED-KINGDOM-FIVE', 'given_names': 'JODIE PIPPA', 'passport_number': '107185703', 'nationality': 'BRITISH CITIZEN', 'date_of_birth': '1985-01-17', 'place_of_birth': 'LONDON', 'sex': 'F', 'date_of_issue': '2006-01-31', 'date_of_expiry': '2016-01-31', 'issuing_authority': 'UKPA', 'passport_type': 'P', 'country_code': 'GBR', 'mrz_line_1': 'P, identity_verification_document +2025-04-15 14:03:00 - utils.document_display.passport - INFO - [passport.py:55] - isinstance(value, dict) : False +2025-04-15 14:03:00 - utils.document_display.passport - INFO - [passport.py:56] - , passport +2025-04-15 14:03:00 - utils.document_display.passport - INFO - [passport.py:55] - isinstance(value, dict) : False +2025-04-15 14:03:00 - utils.document_display.passport - INFO - [passport.py:56] - , UNITED-KINGDOM-FIVE JODIE PIPPA +2025-04-15 14:03:00 - utils.document_display.passport - INFO - [passport.py:55] - isinstance(value, dict) : False +2025-04-15 14:03:00 - utils.document_display.passport - INFO - [passport.py:56] - , UNITED-KINGDOM-FIVE 
+2025-04-15 14:03:00 - utils.document_display.passport - INFO - [passport.py:55] - isinstance(value, dict) : False +2025-04-15 14:03:00 - utils.document_display.passport - INFO - [passport.py:56] - , JODIE PIPPA +2025-04-15 14:03:00 - utils.document_display.passport - INFO - [passport.py:55] - isinstance(value, dict) : False +2025-04-15 14:03:00 - utils.document_display.passport - INFO - [passport.py:56] - , 107185703 +2025-04-15 14:03:00 - utils.document_display.passport - INFO - [passport.py:55] - isinstance(value, dict) : False +2025-04-15 14:03:00 - utils.document_display.passport - INFO - [passport.py:56] - , BRITISH CITIZEN +2025-04-15 14:03:00 - utils.document_display.passport - INFO - [passport.py:55] - isinstance(value, dict) : False +2025-04-15 14:03:00 - utils.document_display.passport - INFO - [passport.py:56] - , 1985-01-17 +2025-04-15 14:03:00 - utils.document_display.passport - INFO - [passport.py:55] - isinstance(value, dict) : False +2025-04-15 14:03:00 - utils.document_display.passport - INFO - [passport.py:56] - , LONDON +2025-04-15 14:03:00 - utils.document_display.passport - INFO - [passport.py:55] - isinstance(value, dict) : False +2025-04-15 14:03:00 - utils.document_display.passport - INFO - [passport.py:56] - , F +2025-04-15 14:03:00 - utils.document_display.passport - INFO - [passport.py:55] - isinstance(value, dict) : False +2025-04-15 14:03:00 - utils.document_display.passport - INFO - [passport.py:56] - , 2006-01-31 +2025-04-15 14:03:00 - utils.document_display.passport - INFO - [passport.py:55] - isinstance(value, dict) : False +2025-04-15 14:03:00 - utils.document_display.passport - INFO - [passport.py:56] - , 2016-01-31 +2025-04-15 14:03:00 - utils.document_display.passport - INFO - [passport.py:55] - isinstance(value, dict) : False +2025-04-15 14:03:00 - utils.document_display.passport - INFO - [passport.py:56] - , UKPA +2025-04-15 14:03:00 - utils.document_display.passport - INFO - [passport.py:55] - isinstance(value, dict) : False 
+2025-04-15 14:03:00 - utils.document_display.passport - INFO - [passport.py:56] - , P +2025-04-15 14:03:00 - utils.document_display.passport - INFO - [passport.py:55] - isinstance(value, dict) : False +2025-04-15 14:03:00 - utils.document_display.passport - INFO - [passport.py:56] - , GBR +2025-04-15 14:03:00 - utils.document_display.passport - INFO - [passport.py:55] - isinstance(value, dict) : False +2025-04-15 14:03:00 - utils.document_display.passport - INFO - [passport.py:56] - , P, 1071857032GBR8501178F1601312<<<<<<<<<<<<<<02 +2025-04-15 14:03:53 - __main__ - INFO - [app_streamlit.py:27] - st.session_state: {'tab_ocr': {'file_groups': {'/tmp/tmpnhehsfkg/1.pdf': ['/tmp/tmpnhehsfkg/1.pdf_page_0.png']}, 'values_raw': {'/tmp/tmpnhehsfkg/1.pdf': {'/tmp/tmpnhehsfkg/1.pdf_page_0.png': {'document_category': 'identity_verification_document', 'document_type': 'passport', 'full_name': 'UNITED-KINGDOM-FIVE JODIE PIPPA', 'surname': 'UNITED-KINGDOM-FIVE', 'given_names': 'JODIE PIPPA', 'passport_number': '107185703', 'nationality': 'BRITISH CITIZEN', 'date_of_birth': '1985-01-17', 'place_of_birth': 'LONDON', 'sex': 'F', 'date_of_issue': '2006-01-31', 'date_of_expiry': '2016-01-31', 'issuing_authority': 'UKPA', 'passport_type': 'P', 'country_code': 'GBR', 'mrz_line_1': 'P, identity_verification_document +2025-04-15 14:03:54 - utils.document_display.passport - INFO - [passport.py:55] - isinstance(value, dict) : False +2025-04-15 14:03:54 - utils.document_display.passport - INFO - [passport.py:56] - , passport +2025-04-15 14:03:54 - utils.document_display.passport - INFO - [passport.py:55] - isinstance(value, dict) : False +2025-04-15 14:03:54 - utils.document_display.passport - INFO - [passport.py:56] - , UNITED-KINGDOM-FIVE JODIE PIPPA +2025-04-15 14:03:54 - utils.document_display.passport - INFO - [passport.py:55] - isinstance(value, dict) : False +2025-04-15 14:03:54 - utils.document_display.passport - INFO - [passport.py:56] - , UNITED-KINGDOM-FIVE +2025-04-15 14:03:54 - 
utils.document_display.passport - INFO - [passport.py:55] - isinstance(value, dict) : False +2025-04-15 14:03:54 - utils.document_display.passport - INFO - [passport.py:56] - , JODIE PIPPA +2025-04-15 14:03:54 - utils.document_display.passport - INFO - [passport.py:55] - isinstance(value, dict) : False +2025-04-15 14:03:54 - utils.document_display.passport - INFO - [passport.py:56] - , 107185703 +2025-04-15 14:03:54 - utils.document_display.passport - INFO - [passport.py:55] - isinstance(value, dict) : False +2025-04-15 14:03:54 - utils.document_display.passport - INFO - [passport.py:56] - , BRITISH CITIZEN +2025-04-15 14:03:54 - utils.document_display.passport - INFO - [passport.py:55] - isinstance(value, dict) : False +2025-04-15 14:03:54 - utils.document_display.passport - INFO - [passport.py:56] - , 1985-01-17 +2025-04-15 14:03:54 - utils.document_display.passport - INFO - [passport.py:55] - isinstance(value, dict) : False +2025-04-15 14:03:54 - utils.document_display.passport - INFO - [passport.py:56] - , LONDON +2025-04-15 14:03:54 - utils.document_display.passport - INFO - [passport.py:55] - isinstance(value, dict) : False +2025-04-15 14:03:54 - utils.document_display.passport - INFO - [passport.py:56] - , F +2025-04-15 14:03:54 - utils.document_display.passport - INFO - [passport.py:55] - isinstance(value, dict) : False +2025-04-15 14:03:54 - utils.document_display.passport - INFO - [passport.py:56] - , 2006-01-31 +2025-04-15 14:03:54 - utils.document_display.passport - INFO - [passport.py:55] - isinstance(value, dict) : False +2025-04-15 14:03:54 - utils.document_display.passport - INFO - [passport.py:56] - , 2016-01-31 +2025-04-15 14:03:54 - utils.document_display.passport - INFO - [passport.py:55] - isinstance(value, dict) : False +2025-04-15 14:03:54 - utils.document_display.passport - INFO - [passport.py:56] - , UKPA +2025-04-15 14:03:54 - utils.document_display.passport - INFO - [passport.py:55] - isinstance(value, dict) : False +2025-04-15 14:03:54 - 
utils.document_display.passport - INFO - [passport.py:56] - , P +2025-04-15 14:03:54 - utils.document_display.passport - INFO - [passport.py:55] - isinstance(value, dict) : False +2025-04-15 14:03:54 - utils.document_display.passport - INFO - [passport.py:56] - , GBR +2025-04-15 14:03:54 - utils.document_display.passport - INFO - [passport.py:55] - isinstance(value, dict) : False +2025-04-15 14:03:54 - utils.document_display.passport - INFO - [passport.py:56] - , P, 1071857032GBR8501178F1601312<<<<<<<<<<<<<<02 +2025-04-15 14:03:56 - __main__ - INFO - [app_streamlit.py:27] - st.session_state: {'tab_ocr': {'file_groups': {'/tmp/tmpnhehsfkg/1.pdf': ['/tmp/tmpnhehsfkg/1.pdf_page_0.png']}, 'values_raw': {'/tmp/tmpnhehsfkg/1.pdf': {'/tmp/tmpnhehsfkg/1.pdf_page_0.png': {'document_category': 'identity_verification_document', 'document_type': 'passport', 'full_name': 'UNITED-KINGDOM-FIVE JODIE PIPPA', 'surname': 'UNITED-KINGDOM-FIVE', 'given_names': 'JODIE PIPPA', 'passport_number': '107185703', 'nationality': 'BRITISH CITIZEN', 'date_of_birth': '1985-01-17', 'place_of_birth': 'LONDON', 'sex': 'F', 'date_of_issue': '2006-01-31', 'date_of_expiry': '2016-01-31', 'issuing_authority': 'UKPA', 'passport_type': 'P', 'country_code': 'GBR', 'mrz_line_1': 'P, identity_verification_document +2025-04-15 14:04:08 - utils.document_display.passport - INFO - [passport.py:55] - isinstance(value, dict) : False +2025-04-15 14:04:08 - utils.document_display.passport - INFO - [passport.py:56] - , passport +2025-04-15 14:04:08 - utils.document_display.passport - INFO - [passport.py:55] - isinstance(value, dict) : False +2025-04-15 14:04:08 - utils.document_display.passport - INFO - [passport.py:56] - , 107185703 +2025-04-15 14:04:08 - utils.document_display.passport - INFO - [passport.py:55] - isinstance(value, dict) : False +2025-04-15 14:04:08 - utils.document_display.passport - INFO - [passport.py:56] - , UNITED-KINGDOM-FIVE JODIE PIPPA +2025-04-15 14:04:08 - utils.document_display.passport 
- INFO - [passport.py:55] - isinstance(value, dict) : False +2025-04-15 14:04:08 - utils.document_display.passport - INFO - [passport.py:56] - , 1985-01-17 +2025-04-15 14:04:08 - utils.document_display.passport - INFO - [passport.py:55] - isinstance(value, dict) : False +2025-04-15 14:04:08 - utils.document_display.passport - INFO - [passport.py:56] - , BRITISH CITIZEN +2025-04-15 14:04:08 - utils.document_display.passport - INFO - [passport.py:55] - isinstance(value, dict) : False +2025-04-15 14:04:08 - utils.document_display.passport - INFO - [passport.py:56] - , 2006-01-31 +2025-04-15 14:04:08 - utils.document_display.passport - INFO - [passport.py:55] - isinstance(value, dict) : False +2025-04-15 14:04:08 - utils.document_display.passport - INFO - [passport.py:56] - , 2016-01-31 +2025-04-15 14:04:08 - utils.document_display.passport - INFO - [passport.py:55] - isinstance(value, dict) : False +2025-04-15 14:04:08 - utils.document_display.passport - INFO - [passport.py:56] - , F +2025-04-15 14:04:08 - utils.document_display.passport - INFO - [passport.py:55] - isinstance(value, dict) : False +2025-04-15 14:04:08 - utils.document_display.passport - INFO - [passport.py:56] - , None +2025-04-15 14:05:21 - __main__ - INFO - [app_streamlit.py:27] - st.session_state: {'tab_ocr': {'file_groups': {'/tmp/tmpnhehsfkg/1.pdf': ['/tmp/tmpnhehsfkg/1.pdf_page_0.png']}, 'values_raw': {'/tmp/tmpnhehsfkg/1.pdf': {'/tmp/tmpnhehsfkg/1.pdf_page_0.png': {'document_category': 'identity_verification_document', 'document_type': 'passport', 'full_name': 'UNITED-KINGDOM-FIVE JODIE PIPPA', 'surname': 'UNITED-KINGDOM-FIVE', 'given_names': 'JODIE PIPPA', 'passport_number': '107185703', 'nationality': 'BRITISH CITIZEN', 'date_of_birth': '1985-01-17', 'place_of_birth': 'LONDON', 'sex': 'F', 'date_of_issue': '2006-01-31', 'date_of_expiry': '2016-01-31', 'issuing_authority': 'UKPA', 'passport_type': 'P', 'country_code': 'GBR', 'mrz_line_1': 'P, identity_verification_document +2025-04-15 14:05:22 
- utils.document_display.passport - INFO - [passport.py:55] - isinstance(value, dict) : False +2025-04-15 14:05:22 - utils.document_display.passport - INFO - [passport.py:56] - , passport +2025-04-15 14:05:22 - utils.document_display.passport - INFO - [passport.py:55] - isinstance(value, dict) : False +2025-04-15 14:05:22 - utils.document_display.passport - INFO - [passport.py:56] - , 107185703 +2025-04-15 14:05:22 - utils.document_display.passport - INFO - [passport.py:55] - isinstance(value, dict) : False +2025-04-15 14:05:22 - utils.document_display.passport - INFO - [passport.py:56] - , UNITED-KINGDOM-FIVE JODIE PIPPA +2025-04-15 14:05:22 - utils.document_display.passport - INFO - [passport.py:55] - isinstance(value, dict) : False +2025-04-15 14:05:22 - utils.document_display.passport - INFO - [passport.py:56] - , 1985-01-17 +2025-04-15 14:05:22 - utils.document_display.passport - INFO - [passport.py:55] - isinstance(value, dict) : False +2025-04-15 14:05:22 - utils.document_display.passport - INFO - [passport.py:56] - , BRITISH CITIZEN +2025-04-15 14:05:22 - utils.document_display.passport - INFO - [passport.py:55] - isinstance(value, dict) : False +2025-04-15 14:05:22 - utils.document_display.passport - INFO - [passport.py:56] - , 2006-01-31 +2025-04-15 14:05:22 - utils.document_display.passport - INFO - [passport.py:55] - isinstance(value, dict) : False +2025-04-15 14:05:22 - utils.document_display.passport - INFO - [passport.py:56] - , 2016-01-31 +2025-04-15 14:05:22 - utils.document_display.passport - INFO - [passport.py:55] - isinstance(value, dict) : False +2025-04-15 14:05:22 - utils.document_display.passport - INFO - [passport.py:56] - , F +2025-04-15 14:05:22 - utils.document_display.passport - INFO - [passport.py:55] - isinstance(value, dict) : False +2025-04-15 14:05:22 - utils.document_display.passport - INFO - [passport.py:56] - , None +2025-04-15 14:06:15 - __main__ - INFO - [app_streamlit.py:27] - st.session_state: {'tab_ocr': {'file_groups': 
{'/tmp/tmpnhehsfkg/1.pdf': ['/tmp/tmpnhehsfkg/1.pdf_page_0.png']}, 'values_raw': {'/tmp/tmpnhehsfkg/1.pdf': {'/tmp/tmpnhehsfkg/1.pdf_page_0.png': {'document_category': 'identity_verification_document', 'document_type': 'passport', 'full_name': 'UNITED-KINGDOM-FIVE JODIE PIPPA', 'surname': 'UNITED-KINGDOM-FIVE', 'given_names': 'JODIE PIPPA', 'passport_number': '107185703', 'nationality': 'BRITISH CITIZEN', 'date_of_birth': '1985-01-17', 'place_of_birth': 'LONDON', 'sex': 'F', 'date_of_issue': '2006-01-31', 'date_of_expiry': '2016-01-31', 'issuing_authority': 'UKPA', 'passport_type': 'P', 'country_code': 'GBR', 'mrz_line_1': 'P, identity_verification_document +2025-04-15 14:07:12 - utils.document_display.passport - INFO - [passport.py:56] - isinstance(value, dict) : False +2025-04-15 14:07:12 - utils.document_display.passport - INFO - [passport.py:57] - , passport +2025-04-15 14:07:12 - utils.document_display.passport - INFO - [passport.py:56] - isinstance(value, dict) : False +2025-04-15 14:07:12 - utils.document_display.passport - INFO - [passport.py:57] - , 107185703 +2025-04-15 14:07:12 - utils.document_display.passport - INFO - [passport.py:56] - isinstance(value, dict) : False +2025-04-15 14:07:12 - utils.document_display.passport - INFO - [passport.py:57] - , UNITED-KINGDOM-FIVE JODIE PIPPA +2025-04-15 14:07:12 - utils.document_display.passport - INFO - [passport.py:56] - isinstance(value, dict) : False +2025-04-15 14:07:12 - utils.document_display.passport - INFO - [passport.py:57] - , 1985-01-17 +2025-04-15 14:07:12 - utils.document_display.passport - INFO - [passport.py:56] - isinstance(value, dict) : False +2025-04-15 14:07:12 - utils.document_display.passport - INFO - [passport.py:57] - , BRITISH CITIZEN +2025-04-15 14:07:12 - utils.document_display.passport - INFO - [passport.py:56] - isinstance(value, dict) : False +2025-04-15 14:07:12 - utils.document_display.passport - INFO - [passport.py:57] - , 2006-01-31 +2025-04-15 14:07:12 - 
utils.document_display.passport - INFO - [passport.py:56] - isinstance(value, dict) : False +2025-04-15 14:07:12 - utils.document_display.passport - INFO - [passport.py:57] - , 2016-01-31 +2025-04-15 14:07:12 - utils.document_display.passport - INFO - [passport.py:56] - isinstance(value, dict) : False +2025-04-15 14:07:12 - utils.document_display.passport - INFO - [passport.py:57] - , F +2025-04-15 14:07:12 - utils.document_display.passport - INFO - [passport.py:56] - isinstance(value, dict) : False +2025-04-15 14:07:12 - utils.document_display.passport - INFO - [passport.py:57] - , None +2025-04-15 14:07:12 - __main__ - INFO - [app_streamlit.py:70] - file_path while displaying: ['/tmp/tmpnhehsfkg/1.pdf_page_0.png'] +2025-04-15 14:07:24 - __main__ - INFO - [app_streamlit.py:27] - st.session_state: {'tab_ocr': {'file_groups': {'/tmp/tmpnhehsfkg/1.pdf': ['/tmp/tmpnhehsfkg/1.pdf_page_0.png']}, 'values_raw': {'/tmp/tmpnhehsfkg/1.pdf': {'/tmp/tmpnhehsfkg/1.pdf_page_0.png': {'document_category': 'identity_verification_document', 'document_type': 'passport', 'full_name': 'UNITED-KINGDOM-FIVE JODIE PIPPA', 'surname': 'UNITED-KINGDOM-FIVE', 'given_names': 'JODIE PIPPA', 'passport_number': '107185703', 'nationality': 'BRITISH CITIZEN', 'date_of_birth': '1985-01-17', 'place_of_birth': 'LONDON', 'sex': 'F', 'date_of_issue': '2006-01-31', 'date_of_expiry': '2016-01-31', 'issuing_authority': 'UKPA', 'passport_type': 'P', 'country_code': 'GBR', 'mrz_line_1': 'P, identity_verification_document +2025-04-15 14:07:24 - utils.document_display.passport - INFO - [passport.py:56] - isinstance(value, dict) : False +2025-04-15 14:07:24 - utils.document_display.passport - INFO - [passport.py:57] - , passport +2025-04-15 14:07:24 - utils.document_display.passport - INFO - [passport.py:56] - isinstance(value, dict) : False +2025-04-15 14:07:24 - utils.document_display.passport - INFO - [passport.py:57] - , 107185703 +2025-04-15 14:07:24 - utils.document_display.passport - INFO - 
[passport.py:56] - isinstance(value, dict) : False +2025-04-15 14:07:24 - utils.document_display.passport - INFO - [passport.py:57] - , UNITED-KINGDOM-FIVE JODIE PIPPA +2025-04-15 14:07:24 - utils.document_display.passport - INFO - [passport.py:56] - isinstance(value, dict) : False +2025-04-15 14:07:24 - utils.document_display.passport - INFO - [passport.py:57] - , 1985-01-17 +2025-04-15 14:07:24 - utils.document_display.passport - INFO - [passport.py:56] - isinstance(value, dict) : False +2025-04-15 14:07:24 - utils.document_display.passport - INFO - [passport.py:57] - , BRITISH CITIZEN +2025-04-15 14:07:24 - utils.document_display.passport - INFO - [passport.py:56] - isinstance(value, dict) : False +2025-04-15 14:07:24 - utils.document_display.passport - INFO - [passport.py:57] - , 2006-01-31 +2025-04-15 14:07:24 - utils.document_display.passport - INFO - [passport.py:56] - isinstance(value, dict) : False +2025-04-15 14:07:24 - utils.document_display.passport - INFO - [passport.py:57] - , 2016-01-31 +2025-04-15 14:07:24 - utils.document_display.passport - INFO - [passport.py:56] - isinstance(value, dict) : False +2025-04-15 14:07:24 - utils.document_display.passport - INFO - [passport.py:57] - , F +2025-04-15 14:07:24 - utils.document_display.passport - INFO - [passport.py:56] - isinstance(value, dict) : False +2025-04-15 14:07:24 - utils.document_display.passport - INFO - [passport.py:57] - , None +2025-04-15 14:07:26 - __main__ - INFO - [app_streamlit.py:27] - st.session_state: {'tab_ocr': {'file_groups': {'/tmp/tmpnhehsfkg/1.pdf': ['/tmp/tmpnhehsfkg/1.pdf_page_0.png']}, 'values_raw': {'/tmp/tmpnhehsfkg/1.pdf': {'/tmp/tmpnhehsfkg/1.pdf_page_0.png': {'document_category': 'identity_verification_document', 'document_type': 'passport', 'full_name': 'UNITED-KINGDOM-FIVE JODIE PIPPA', 'surname': 'UNITED-KINGDOM-FIVE', 'given_names': 'JODIE PIPPA', 'passport_number': '107185703', 'nationality': 'BRITISH CITIZEN', 'date_of_birth': '1985-01-17', 'place_of_birth': 
'LONDON', 'sex': 'F', 'date_of_issue': '2006-01-31', 'date_of_expiry': '2016-01-31', 'issuing_authority': 'UKPA', 'passport_type': 'P', 'country_code': 'GBR', 'mrz_line_1': 'P, identity_verification_document +2025-04-15 14:07:38 - utils.document_display.passport - INFO - [passport.py:56] - isinstance(value, dict) : False +2025-04-15 14:07:38 - utils.document_display.passport - INFO - [passport.py:57] - , passport +2025-04-15 14:07:38 - utils.document_display.passport - INFO - [passport.py:56] - isinstance(value, dict) : False +2025-04-15 14:07:38 - utils.document_display.passport - INFO - [passport.py:57] - , 107185703 +2025-04-15 14:07:38 - utils.document_display.passport - INFO - [passport.py:56] - isinstance(value, dict) : False +2025-04-15 14:07:38 - utils.document_display.passport - INFO - [passport.py:57] - , UNITED-KINGDOM-FIVE JODIE PIPPA +2025-04-15 14:07:38 - utils.document_display.passport - INFO - [passport.py:56] - isinstance(value, dict) : False +2025-04-15 14:07:38 - utils.document_display.passport - INFO - [passport.py:57] - , 1985-01-17 +2025-04-15 14:07:38 - utils.document_display.passport - INFO - [passport.py:56] - isinstance(value, dict) : False +2025-04-15 14:07:38 - utils.document_display.passport - INFO - [passport.py:57] - , BRITISH CITIZEN +2025-04-15 14:07:38 - utils.document_display.passport - INFO - [passport.py:56] - isinstance(value, dict) : False +2025-04-15 14:07:38 - utils.document_display.passport - INFO - [passport.py:57] - , 2006-01-31 +2025-04-15 14:07:38 - utils.document_display.passport - INFO - [passport.py:56] - isinstance(value, dict) : False +2025-04-15 14:07:38 - utils.document_display.passport - INFO - [passport.py:57] - , 2016-01-31 +2025-04-15 14:07:38 - utils.document_display.passport - INFO - [passport.py:56] - isinstance(value, dict) : False +2025-04-15 14:07:38 - utils.document_display.passport - INFO - [passport.py:57] - , F +2025-04-15 14:07:38 - utils.document_display.passport - INFO - [passport.py:56] - 
isinstance(value, dict) : False +2025-04-15 14:07:38 - utils.document_display.passport - INFO - [passport.py:57] - , None +2025-04-15 14:10:03 - __main__ - INFO - [app_streamlit.py:27] - st.session_state: {'tab_ocr': {'file_groups': {'/tmp/tmpnhehsfkg/1.pdf': ['/tmp/tmpnhehsfkg/1.pdf_page_0.png']}, 'values_raw': {'/tmp/tmpnhehsfkg/1.pdf': {'/tmp/tmpnhehsfkg/1.pdf_page_0.png': {'document_category': 'identity_verification_document', 'document_type': 'passport', 'full_name': 'UNITED-KINGDOM-FIVE JODIE PIPPA', 'surname': 'UNITED-KINGDOM-FIVE', 'given_names': 'JODIE PIPPA', 'passport_number': '107185703', 'nationality': 'BRITISH CITIZEN', 'date_of_birth': '1985-01-17', 'place_of_birth': 'LONDON', 'sex': 'F', 'date_of_issue': '2006-01-31', 'date_of_expiry': '2016-01-31', 'issuing_authority': 'UKPA', 'passport_type': 'P', 'country_code': 'GBR', 'mrz_line_1': 'P= 2 and full_name_val_len <= 61 + ): + err_msgs.append( + "Full name must have a length of at least 2 & at most 61" + ) + values.validation_policy_status_df.loc[ + len(values.validation_policy_status_df) + ] = [ + "Full Name", + full_name_val_len, + False, + "Full name does not have a length of at least 2 & at most 61", + ] + else: + values.validation_policy_status_df.loc[ + len(values.validation_policy_status_df) + ] = [ + "Full Name", + full_name_val_len, + True, + "Full name has a length of at least 2 & at most 61", + ] + + + if ( + not expected + or not full_name_val + or full_name_val.lower() != expected.lower() + ): + err_msgs.append("Name mismatch with provided value") + values.validation_policy_status_df.loc[ + len(values.validation_policy_status_df) + ] = [ + "Data Match", + f"{full_name_val}, {expected}", + False, + "Name does not match with provided value", + ] + else: + values.validation_policy_status_df.loc[ + len(values.validation_policy_status_df) + ] = [ + "Data Match", + f"{full_name_val}, {expected}", + True, + "Name matches with provided value", + ] + + + if not full_name_val or 
len(full_name_val.strip().split(" ")) < 2: + err_msgs.append( + "Full name must consist of at least 2 words (first name + last name)" + ) + values.validation_policy_status_df.loc[ + len(values.validation_policy_status_df) + ] = [ + "Full Name", + full_name_val, + False, + "Full name does not consist of at least 2 words (first name + last name)", + ] + else: + values.validation_policy_status_df.loc[ + len(values.validation_policy_status_df) + ] = [ + "Full Name", + full_name_val, + True, + "Full name consists of at least 2 words (first name + last name)", + ] + + + if err_msgs: + values.full_name_err_msgs = ", ".join(err_msgs) + else: + values.full_name_err_msgs = None + + return values + except Exception as e: + # logger.exception(e, exc_info=True) + # return None + raise + + @model_validator(mode="after") + def validate_bank_name(cls, values, info: ValidationInfo): + """Match bank name against provided name (case-insensitive)""" + try: + err_msgs = [] + expected = ( + info.context.get("application_summary_bank_name") + if info.context + else None + ) + bank_name_val = values.bank_name + if not bank_name_val: + err_msgs.append("Bank name not present") + values.validation_policy_status_df.loc[ + len(values.validation_policy_status_df) + ] = [ + "Bank name", + bank_name_val, + False, + "Bank name is not present", + ] + else: + values.validation_policy_status_df.loc[ + len(values.validation_policy_status_df) + ] = [ + "Bank name", + bank_name_val, + True, + "Bank name is present", + ] + + + bank_name_val_len = 0 + if bank_name_val: + bank_name_val_len = len(bank_name_val) + if not bank_name_val and not ( + bank_name_val_len >= 4 and bank_name_val_len <= 50 + ): + err_msgs.append( + "Bank name must have a length of at least 4 & at most 50" + ) + values.validation_policy_status_df.loc[ + len(values.validation_policy_status_df) + ] = [ + "Bank name", + bank_name_val_len, + False, + "Bank name does not have a length of at least 4 & at most 50", + ] + else: + 
values.validation_policy_status_df.loc[ + len(values.validation_policy_status_df) + ] = [ + "Bank name", + bank_name_val_len, + True, + "Bank name has a length of at least 4 & at most 50", + ] + + + if ( + not expected + or not bank_name_val + or bank_name_val.lower() != expected.lower() + ): + err_msgs.append("Bank name mismatch with provided value") + values.validation_policy_status_df.loc[ + len(values.validation_policy_status_df) + ] = [ + "Data Match", + f"{bank_name_val}, {expected}", + False, + "Bank name does not match with provided value", + ] + else: + values.validation_policy_status_df.loc[ + len(values.validation_policy_status_df) + ] = [ + "Data Match", + f"{bank_name_val}, {expected}", + True, + "Bank name matches with provided value", + ] + + + + if err_msgs: + values.bank_name_err_msgs = ", ".join(err_msgs) + else: + values.bank_name_err_msgs = None + + return values + except Exception as e: + # logger.exception(e, exc_info=True) + # return None + raise + + @model_validator(mode="after") + def validate_account_number(cls, values): + """Validate detected bank account number""" + try: + err_msgs = list() + + if not values.account_number: + err_msgs.append( + "Bank account number not present. Bank account number must be present." + ) + values.validation_policy_status_df.loc[ + len(values.validation_policy_status_df) + ] = [ + "Bank account number", + values.account_number, + False, + "Bank account number is not present.", + ] + else: + values.validation_policy_status_df.loc[ + len(values.validation_policy_status_df) + ] = [ + "Bank account number", + values.account_number, + True, + "Bank name matches is present", + ] + + + + if not values.account_number or not re.fullmatch( + r"^\d{8}$", values.account_number + ): + err_msgs.append( + "Provided account number is invalid. 
It must be of 8 digits length only" + ) + values.validation_policy_status_df.loc[ + len(values.validation_policy_status_df) + ] = [ + "Bank account number", + values.account_number, + False, + "Provided account number is invalid", + ] + else: + values.validation_policy_status_df.loc[ + len(values.validation_policy_status_df) + ] = [ + "Bank account number", + values.account_number, + True, + "Provided account number is valid", + ] + + + + if err_msgs: + values.account_number_err_msgs = ", ".join(err_msgs) + else: + values.account_number_err_msgs = None + + return values + except Exception as e: + # logger.exception(e, exc_info=True) + # return None + raise + + @model_validator(mode="after") + def validate_sort_code(cls, values): + """Validate extracted Bank Account Sort Code""" + try: + err_msgs = list() + + if not values.sort_code: + err_msgs.append( + "Sort code not present. Sort number must be present.") + values.validation_policy_status_df.loc[ + len(values.validation_policy_status_df) + ] = [ + "Sort code", + values.sort_code, + False, + "Sort code is not present.", + ] + else: + values.validation_policy_status_df.loc[ + len(values.validation_policy_status_df) + ] = [ + "Sort code", + values.sort_code, + True, + "Sort code is present.", + ] + + + + # if not values.sort_code or not re.fullmatch(r"^\d{2}-?\d{2}-?\d{2}$", values.sort_code): + if not values.sort_code or not re.fullmatch( + r"^\d{2}-\d{2}-\d{2}$", values.sort_code + ): + err_msgs.append( + "Provided sort code's format is invalid. It must be of the format xx-xx-xx wherein x are digits." 
+ ) + values.validation_policy_status_df.loc[ + len(values.validation_policy_status_df) + ] = [ + "Sort code", + values.sort_code, + False, + "Sort code's format is invalid.", + ] + else: + values.validation_policy_status_df.loc[ + len(values.validation_policy_status_df) + ] = [ + "Sort code", + values.sort_code, + True, + "Sort code's format is valid.", + ] + + + + if err_msgs: + values.sort_code_err_msgs = ", ".join(err_msgs) + else: + values.sort_code_err_msgs = None + + return values + except Exception as e: + # logger.exception(e, exc_info=True) + # return None + raise + + @model_validator(mode="after") + def validate_bank_account_statement_dates(cls, values): + try: + err_msgs = list() + statement_start_date_val = values.statement_start_date + statement_end_date_val = values.statement_end_date + + if not statement_start_date_val or not statement_end_date_val: + err_msgs.append( + "Both statement start date & statement end date must be present" + ) + values.validation_policy_status_df.loc[ + len(values.validation_policy_status_df) + ] = [ + "Date checks", + f"{statement_start_date_val}, {statement_end_date_val}", + False, + "Both statement start date & statement end date are not present", + ] + else: + values.validation_policy_status_df.loc[ + len(values.validation_policy_status_df) + ] = [ + "Date checks", + f"{statement_start_date_val}, {statement_end_date_val}", + True, + "Both statement start date & statement end date are present", + ] + + + + if statement_start_date_val and statement_end_date_val: + if (statement_end_date_val - statement_start_date_val).days < 28: + err_msgs.append( + "Account statement period's start date & end date must have a gap of at least 28 days" + ) + values.validation_policy_status_df.loc[ + len(values.validation_policy_status_df) + ] = [ + "Coverage", + f"{statement_start_date_val}, {statement_end_date_val}", + False, + "Account statement period's start date & end date donot have a gap of at least 28 days", + ] + else: + 
values.validation_policy_status_df.loc[ + len(values.validation_policy_status_df) + ] = [ + "Coverage", + f"{statement_start_date_val}, {statement_end_date_val}", + True, + "Account statement period's start date & end date have a gap of at least 28 days", + ] + + + if err_msgs: + values.account_statement_date_err_msgs = ", ".join(err_msgs) + else: + values.account_statement_date_err_msgs = None + + return values + + except Exception as e: + # logger.exception(e, exc_info=True) + # return None + raise + + @model_validator(mode="after") + def validate_salary_credit_checks(cls, values): + try: + err_msgs = list() + + statement_start_date_val = values.statement_start_date + statement_end_date_val = values.statement_end_date + first_salary_deposit_date_present_val = ( + values.first_salary_deposit_date_present + ) + # # is_salary_credit_present_val = values.is_salary_credit_present + # is_salary_credit_consistent_across_months_val = ( + # values.is_salary_credit_consistent_across_months + # ) + + # if not statement_start_date_val or not statement_end_date_val: + # err_msgs.append( + # "Both statement start date & statement end date must be present" + # ) + # values.validation_policy_status_df.loc[len( + # values.validation_policy_status_df)] = ["Both statement start date & statement end date must be present", f"{statement_start_date_val}, {statement_end_date_val}", False, "Both statement start date & statement end date are not present"] + # else: + # values.validation_policy_status_df.loc[len( + # values.validation_policy_status_df)] = ["Both statement start date & statement end date must be present", f"{statement_start_date_val}, {statement_end_date_val}", True, "Both statement start date & statement end date are present"] + + + if not first_salary_deposit_date_present_val: + err_msgs.append("At least one salary credit must be present") + values.validation_policy_status_df.loc[ + len(values.validation_policy_status_df) + ] = [ + "Salary deposit", + 
first_salary_deposit_date_present_val, + False, + "At least one salary credit is not present", + ] + else: + values.validation_policy_status_df.loc[ + len(values.validation_policy_status_df) + ] = [ + "Salary deposit", + first_salary_deposit_date_present_val, + True, + "At least one salary credit is present", + ] + + + if ( + not statement_start_date_val + or not statement_end_date_val + or (statement_end_date_val < statement_start_date_val) + ): + err_msgs.append( + "Statement period's end date must be after the start date" + ) + values.validation_policy_status_df.loc[ + len(values.validation_policy_status_df) + ] = [ + "Date checks", + f"{statement_start_date_val}, {statement_end_date_val}", + False, + "Statement period's end date is not after the start date", + ] + else: + values.validation_policy_status_df.loc[ + len(values.validation_policy_status_df) + ] = [ + "Date checks", + f"{statement_start_date_val}, {statement_end_date_val}", + True, + "Statement period's end date is after the start date", + ] + + + # # if start and end and (start.month != end.month or start.year != end.year): + # if ( + # statement_start_date_val + # and statement_end_date_val + # and first_salary_deposit_date_present_val + # and ( + # statement_start_date_val.month < statement_end_date_val.month + # or statement_start_date_val.year < statement_end_date_val.year + # ) + # and ( + # statement_end_date_val.day >= first_salary_deposit_date_present_val + # ) + # ): + # if not is_salary_credit_consistent_across_months_val: + # err_msgs.append( + # "Salary credit amount across months must be consistent" + # ) + + if err_msgs: + values.salary_deposit_err_msgs = ", ".join(err_msgs) + else: + values.salary_deposit_err_msgs = None + + return values + + except Exception as e: + # logger.exception(e, exc_info=True) + # return None + raise + + @computed_field + @property + def is_red_flagged(self) -> bool: + if ( + self.account_statement_date_err_msgs + or self.full_name_err_msgs + or 
class CustomAppFormUpload(BaseModel):
    """Parsed custom application form upload.

    Validates the four applicant-supplied summary fields (full name, bank
    name, employer name, complete address) and exposes ``is_incomplete`` when
    any of them failed validation. Validation errors are collected into the
    ``*_err_msgs`` fields rather than raised, so a partially bad form still
    parses.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    application_summary_full_name: str = Field(alias="full_name")
    application_summary_bank_name: str = Field(alias="bank_name")
    application_summary_employer_name: str = Field(alias="employer_name")
    application_summary_complete_address: str = Field(alias="complete_address")

    full_name_err_msgs: str | None = None
    bank_name_err_msgs: str | None = None
    employer_name_err_msgs: str | None = None
    complete_employee_address_err_msgs: str | None = None
    # BUG FIX: use default_factory so each instance gets its own DataFrame
    # instead of a single shared mutable default.
    validation_policy_status_df: pd.DataFrame = Field(
        default_factory=lambda: pd.DataFrame(
            columns=["Policy", "Value", "Status", "Message"]
        )
    )

    @model_validator(mode="after")
    def validate_full_name(self, info: ValidationInfo):
        """Validate provided applicant's full name (presence, length, words)."""
        err_msgs = []
        full_name_val = self.application_summary_full_name

        if not full_name_val:
            err_msgs.append("Applicant's full name not present")

        # BUG FIX: the original used "not full_name_val and not (2 <= len <= 61)",
        # which only ever fired for a missing name — an out-of-range present
        # name was never flagged. Check the range unconditionally (length 0
        # when missing, matching the payslip validator).
        full_name_len = len(full_name_val) if full_name_val else 0
        if not (2 <= full_name_len <= 61):
            err_msgs.append(
                "Full name must have a length of at least 2 & at most 61"
            )

        if not full_name_val or len(full_name_val.strip().split(" ")) < 2:
            err_msgs.append(
                "Full name must consist of at least 2 words (first name + last name)"
            )

        self.full_name_err_msgs = ", ".join(err_msgs) if err_msgs else None
        return self

    @model_validator(mode="after")
    def validate_bank_name(self, info: ValidationInfo):
        """Validate provided bank name (presence, length 4..50)."""
        err_msgs = []
        bank_name_val = self.application_summary_bank_name

        if not bank_name_val:
            err_msgs.append("Bank name not present")

        # BUG FIX: same inverted guard as validate_full_name — the length
        # range must be enforced for present names too.
        bank_name_len = len(bank_name_val) if bank_name_val else 0
        if not (4 <= bank_name_len <= 50):
            err_msgs.append(
                "Bank name must have a length of at least 4 & at most 50"
            )

        self.bank_name_err_msgs = ", ".join(err_msgs) if err_msgs else None
        return self

    @model_validator(mode="after")
    def validate_employer_name(self, info: ValidationInfo):
        """Validate provided employer name (presence, at least one letter)."""
        err_msgs = []
        employer_name_val = self.application_summary_employer_name

        if not employer_name_val:
            err_msgs.append("Employer name not present")
        if not re.search(r"[A-Za-z]", employer_name_val):
            err_msgs.append("Employer name must contain at least one letter")
        if employer_name_val.strip() == "":
            err_msgs.append("Employer name cannot be only whitespace")

        self.employer_name_err_msgs = ", ".join(err_msgs) if err_msgs else None
        return self

    @model_validator(mode="after")
    def validate_complete_address(self, info: ValidationInfo):
        """Validate the applicant's complete address (presence, length 10..300)."""
        err_msgs = []
        val = self.application_summary_complete_address

        if not val:
            err_msgs.append("Applicant's address not present")

        length = len(val) if val else 0
        if not (10 <= length <= 300):
            err_msgs.append(
                "Applicant's complete address must have a length of at least 10 & at most 300"
            )

        self.complete_employee_address_err_msgs = (
            ", ".join(err_msgs) if err_msgs else None
        )
        return self

    @computed_field
    @property
    def is_incomplete(self) -> bool:
        """True when any field-level validation recorded an error message."""
        return any([
            self.full_name_err_msgs,
            self.bank_name_err_msgs,
            self.employer_name_err_msgs,
            self.complete_employee_address_err_msgs,
        ])
Must consist of at least 2 words, have length gte 2 & lte 61", + examples=["Jodie Pippa"], + ) # , min_length=2, max_length=61) + expiry_date: datetime.date | None = Field( + default=None, + description="The passport's expiry date in YYYY-MM-DD format", + examples=["2028-06-01"], + ) + + full_name_err_msgs: str | None = None + expiry_date_err_msgs: str | None = None + validation_policy_status_df: pd.DataFrame = pd.DataFrame( + columns=["Policy", "Value", "Status", "Message"]) + + @model_validator(mode="after") + def validate_expiry_date(cls, values): + try: + err_msgs = list() + expiry_date_val = values.expiry_date + if not expiry_date_val: + err_msgs.append("Expiry date must be present") + values.validation_policy_status_df.loc[len( + values.validation_policy_status_df)] = ["Expiry date must be present", values.expiry_date, False, "Expiry date is not present"] + else: + values.validation_policy_status_df.loc[len( + values.validation_policy_status_df)] = ["Expiry date must be present", values.expiry_date, True, "Expiry date is present"] + if expiry_date_val < datetime.date.today() + relativedelta(years=1): + # raise ValueError("Provided passport expires within 1 year") + err_msgs.append("Provided passport expires within 1 year") + values.validation_policy_status_df.loc[ + len(values.validation_policy_status_df) + ] = [ + "Provided passport expiry should be more than 1 year", + values.expiry_date, + False, + "Provided passport expires within 1 year &/or is expired", + ] + else: + values.validation_policy_status_df.loc[ + len(values.validation_policy_status_df) + ] = [ + "Provided passport expiry should be more than 1 year", + values.expiry_date, + True, + "Provided passport does not expire within 1 year", + ] + + values.expiry_date_err_msgs = ", ".join( + err_msgs) if err_msgs else None + return values + except Exception as e: + raise + # if not values.expiry_date_err_msgs: + # values.expiry_date_err_msgs = "Provided passport expires within 1 year" + # else: + # 
values.expiry_date_err_msgs = f"{values.expiry_date_err_msgs}, Provided passport expires within 1 year" + # if not values.expiry_date_err_msgs: + # values.expiry_date_err_msgs = None + # return values + + @model_validator(mode="after") + def validate_full_name(cls, values, info: ValidationInfo): + """Match applicant's full name against provided name (case-insensitive)""" + try: + err_msgs = [] + expected = ( + info.context.get("application_summary_full_name") + if info.context + else None + ) + full_name_val = values.full_name + if not full_name_val: + err_msgs.append("Applicant's full name not present") + values.validation_policy_status_df.loc[len( + values.validation_policy_status_df)] = ["Applicant's full name should be present", full_name_val, False, "Applicant's full name not present"] + else: + values.validation_policy_status_df.loc[len( + values.validation_policy_status_df)] = ["Applicant's full name should be present", full_name_val, True, "Applicant's full name is present"] + + full_name_val_len = 0 + if full_name_val: + full_name_val_len = len(full_name_val) + if not full_name_val and not ( + full_name_val_len >= 2 and full_name_val_len <= 61 + ): + err_msgs.append( + "Full name must have a length of at least 2 & at most 61" + ) + values.validation_policy_status_df.loc[len( + values.validation_policy_status_df)] = [ "Full name must have a length of at least 2 & at most 61", full_name_val_len, False, "Full name does not have a length of at least 2 & at most 61"] + else: + values.validation_policy_status_df.loc[len( + values.validation_policy_status_df)] = [ "Full name must have a length of at least 2 & at most 61", full_name_val_len, True, "Full name has a length of at least 2 & at most 61"] + + + if ( + not expected + or not full_name_val + or full_name_val.lower() != expected.lower() + ): + err_msgs.append("Name mismatch with provided value") + values.validation_policy_status_df.loc[len( + values.validation_policy_status_df)] = ["Name should match with 
provided value", full_name_val, False, "Name does not match with provided value"] + else: + values.validation_policy_status_df.loc[len( + values.validation_policy_status_df)] = ["Name should match with provided value", full_name_val, True, "Name matches with provided value"] + + + if not full_name_val or len(full_name_val.strip().split(" ")) < 2: + err_msgs.append( + "Full name must consist of at least 2 words (first name + last name)" + ) + values.validation_policy_status_df.loc[len( + values.validation_policy_status_df)] = ["Full name must consist of at least 2 words (first name + last name)", len(full_name_val.strip().split(" ")), False, "Full name does not consist of at least 2 words (first name + last name)"] + else: + values.validation_policy_status_df.loc[len( + values.validation_policy_status_df)] = ["Full name must consist of at least 2 words (first name + last name)", len(full_name_val.strip().split(" ")), True, "Full name does consist of at least 2 words (first name + last name)"] + + + if err_msgs: + values.full_name_err_msgs = ", ".join(err_msgs) + else: + values.full_name_err_msgs = None + + return values + except Exception as e: + # logger.exception(e, exc_info=True) + # return None + raise + + @computed_field + @property + def is_red_flagged(self) -> bool: + if self.full_name_err_msgs or self.expiry_date_err_msgs: + return True + return False + + +class UKDrivingLicense(BaseModel): + model_config = ConfigDict(arbitrary_types_allowed=True) + full_name: str | None = Field( + default=None, + description="Applicant's full name. 
Must consist of at least 2 words, have length gte 2 & lte 61", + examples=["Jodie Pippa"], + ) # , min_length=2, max_length=61) + + full_name_err_msgs: str | None = None + expiry_date_err_msgs: str | None = None + validation_policy_status_df: pd.DataFrame = pd.DataFrame( + columns=["Policy", "Value", "Status", "Message"]) + + @model_validator(mode="after") + def validate_full_name(cls, values, info: ValidationInfo): + """Match applicant's full name against provided name (case-insensitive)""" + try: + err_msgs = [] + expected = ( + info.context.get("application_summary_full_name") + if info.context + else None + ) + full_name_val = values.full_name + if not full_name_val: + err_msgs.append("Applicant's full name not present") + values.validation_policy_status_df.loc[ + len(values.validation_policy_status_df) + ] = [ + "Applicant's full name should be present", + full_name_val, + False, + "Applicant's full name not present", + ] + else: + values.validation_policy_status_df.loc[ + len(values.validation_policy_status_df) + ] = [ + "Applicant's full name should be present", + full_name_val, + True, + "Applicant's full name is present", + ] + + full_name_val_len = 0 + if full_name_val: + full_name_val_len = len(full_name_val) + if not full_name_val and not ( + full_name_val_len >= 2 and full_name_val_len <= 61 + ): + err_msgs.append( + "Full name must have a length of at least 2 & at most 61" + ) + values.validation_policy_status_df.loc[ + len(values.validation_policy_status_df) + ] = [ + "Full name must have a length of at least 2 & at most 61", + full_name_val_len, + False, + "Full name does not have a length of at least 2 & at most 61", + ] + else: + values.validation_policy_status_df.loc[ + len(values.validation_policy_status_df) + ] = [ + "Full name must have a length of at least 2 & at most 61", + full_name_val_len, + True, + "Full name has a length of at least 2 & at most 61", + ] + + if ( + not expected + or not full_name_val + or full_name_val.lower() != 
expected.lower() + ): + err_msgs.append("Name mismatch with provided value") + values.validation_policy_status_df.loc[ + len(values.validation_policy_status_df) + ] = [ + "Name should match with provided value", + full_name_val, + False, + "Name does not match with provided value", + ] + else: + values.validation_policy_status_df.loc[ + len(values.validation_policy_status_df) + ] = [ + "Name should match with provided value", + full_name_val, + True, + "Name matches with provided value", + ] + + if not full_name_val or len(full_name_val.strip().split(" ")) < 2: + err_msgs.append( + "Full name must consist of at least 2 words (first name + last name)" + ) + values.validation_policy_status_df.loc[ + len(values.validation_policy_status_df) + ] = [ + "Full name must consist of at least 2 words (first name + last name)", + len(full_name_val.strip().split(" ")), + False, + "Full name does not consist of at least 2 words (first name + last name)", + ] + else: + values.validation_policy_status_df.loc[ + len(values.validation_policy_status_df) + ] = [ + "Full name must consist of at least 2 words (first name + last name)", + len(full_name_val.strip().split(" ")), + True, + "Full name does consist of at least 2 words (first name + last name)", + ] + + if err_msgs: + values.full_name_err_msgs = ", ".join(err_msgs) + else: + values.full_name_err_msgs = None + + return values + except Exception as e: + # logger.exception(e, exc_info=True) + # return None + raise + + @computed_field + @property + def is_red_flagged(self) -> bool: + if self.full_name_err_msgs or self.expiry_date_err_msgs: + return True + return False + diff --git a/schemas/payslip.py b/schemas/payslip.py new file mode 100644 index 0000000000000000000000000000000000000000..11b227c58f379f64d4662733ab4c2d222d7e6fd9 --- /dev/null +++ b/schemas/payslip.py @@ -0,0 +1,551 @@ +import datetime +import re + +from pydantic import ( + BaseModel, + computed_field, + Field, + ValidationInfo, + model_validator, + ConfigDict +) 
+import pandas as pd + + +class UKPayslipSchema(BaseModel): + model_config = ConfigDict(arbitrary_types_allowed=True) + pay_period_start_date: datetime.date | None = Field( + default=None, + description="Pay period's start date in YYYY-MM-DD format", + examples=["2025-02-01"], + ) + pay_period_end_date: datetime.date | None = Field( + default=None, + description="Pay period's end date in YYYY-MM-DD format", + examples=["2025-02-28"], + ) + pay_period_days: int | None = Field( + default=None, + description="pay_period_end_date - pay_period_start_date in days", + examples=[28], + ) + pay_date: datetime.date | None = Field(None) + full_name: str | None = Field( + default=None, + description="Applicant's full name. Must consist of at least 2 words, have length gte 2 & lte 61", + examples=["Jodie Pippa"], + ) + employer_name: str | None = Field( + default=None, + description="Employer name extracted", + examples=["ABC Ltd"], + ) + is_basic_pay_net_pay_other_salary_components_present: bool = Field( + default=False, + description="Boolean indicating whether Basic Pay, Net Pay, other requisite salary components/line items are present in the payslip", + examples=[True, False], + ) + is_tax_deducation_present: bool = Field( + default=False, + description="Boolean flag indicating whether Tax Deduction line item is present in the payslip", + examples=[True, False], + ) + is_ni_deduction_present: bool = Field( + default=False, + description="Boolean flag indicating whether NI/National Insurance deduction line item is present in the payslip", + examples=[True, False], + ) + complete_employee_address: str | None = Field( + default=None, + description="Employee's complete address as a string", + examples=["123 Maple Street, London, UK, SW1A 1AA"], + ) + # employee_number: int | None = Field( + # default=None, + # description="Employee number", + # examples=[3558, 1234], + # ) + + pay_dates_err_msgs: str | None = None + full_name_err_msgs: str | None = None + 
employer_name_err_msgs: str | None = None + payslip_line_item_presence_err_msgs: str | None = None + complete_employee_address_err_msgs: str | None = None + validation_policy_status_df: pd.DataFrame = pd.DataFrame( + columns=["Policy", "Value", "Status", "Message"]) + # employee_number_err_msgs: str | None = None + # is_red_flagged: bool = False + + @model_validator(mode="after") + def validate_full_name(self, info: ValidationInfo): + """Match applicant's full name against provided name (case-insensitive)""" + try: + err_msgs = [] + expected = ( + info.context.get("application_summary_full_name") + if info.context + else None + ) + + if not self.full_name: + err_msgs.append("Applicant's full name not present") + self.validation_policy_status_df.loc[len( + self.validation_policy_status_df)] = ["Employer & Customer Names", self.full_name, False, "Applicant's full name is not present"] + else: + self.validation_policy_status_df.loc[len( + self.validation_policy_status_df)] = ["Employer & Customer Names", self.full_name, True, "Applicant's full name is present"] + + full_name_val_len = len(self.full_name) if self.full_name else 0 + if not (2 <= full_name_val_len <= 61): + err_msgs.append( + "Full name must have a length of at least 2 & at most 61" + ) + self.validation_policy_status_df.loc[len( + self.validation_policy_status_df)] = ["Employer & Customer Names", full_name_val_len, False, "Full name has a length of at least 2 & at most 61"] + else: + self.validation_policy_status_df.loc[len( + self.validation_policy_status_df)] = ["Employer & Customer Names", full_name_val_len, True, "Full name has a length of at least 2 & at most 61"] + + if not self.full_name or len(self.full_name.strip().split(" ")) < 2: + err_msgs.append( + "Full name must consist of at least 2 words (first name + last name)" + ) + self.validation_policy_status_df.loc[len( + self.validation_policy_status_df)] = ["Employer & Customer Names", self.full_name , False, "Full name does not consist of at 
least 2 words (first name + last name)"] + else: + self.validation_policy_status_df.loc[len( + self.validation_policy_status_df)] = ["Employer & Customer Names", len(self.full_name.strip().split(" ")), True, "Full name consists of at least 2 words (first name + last name)"] + + if ( + not expected + or not self.full_name + or self.full_name.lower() != expected.lower() + ): + err_msgs.append("Name mismatch with provided value") + self.validation_policy_status_df.loc[len( + self.validation_policy_status_df)] = ["Employer & Customer Names", f"{self.full_name}, {expected}", False, "Name does not match with provided value"] + else: + self.validation_policy_status_df.loc[len( + self.validation_policy_status_df)] = ["Employer & Customer Names", f"{self.full_name}, {expected}", True, "Name matches with provided value"] + + self.full_name_err_msgs = ", ".join(err_msgs) if err_msgs else None + return self + except Exception as e: + # logger.exception(e, exc_info=True) + # return None + raise + + @model_validator(mode="after") + def validate_employer_name(self, info: ValidationInfo): + """Match employer against provided employer name (case-insensitive)""" + try: + err_msgs = [] + expected = ( + info.context.get("application_summary_employer_name") + if info.context + else None + ) + employer_name_val = self.employer_name + + if not employer_name_val: + err_msgs.append("Employer name not present") + self.validation_policy_status_df.loc[len( + self.validation_policy_status_df)] = ["Employer & Customer Names", employer_name_val, False, "Employer name is not present"] + else: + + self.validation_policy_status_df.loc[len( + self.validation_policy_status_df)] = ["Employer & Customer Names", employer_name_val, True, "Employer name is present"] + + is_employer_name_match = ( + expected + and employer_name_val + and employer_name_val.lower() == expected.lower() + ) + + if not is_employer_name_match: + err_msgs.append("Employer name mismatch with provided value") + 
self.validation_policy_status_df.loc[len( + self.validation_policy_status_df)] = ["Employer & Customer Names", f"{employer_name_val}, {expected}", False, "Employer name does not match with provided value"] + else: + self.validation_policy_status_df.loc[len( + self.validation_policy_status_df)] = ["Employer & Customer Names", f"{employer_name_val}, {expected}", True, "Employer name matches with provided value"] + # # Allowed: letters, numbers, spaces, and common name punctuation + # pattern = r"^[A-Za-z0-9&\-,.()'/@ ]{2,100}$" + + # if not re.match(pattern, employer_name_val): + # err_msgs.append("Employer name contains invalid characters") + if not re.search(r"[A-Za-z]", employer_name_val): + err_msgs.append( + "Employer name must contain at least one letter") + if employer_name_val.strip() == "": + err_msgs.append("Employer name cannot be only whitespace") + + self.employer_name_err_msgs = ", ".join( + err_msgs) if err_msgs else None + return self + except Exception as e: + # logger.exception(e, exc_info=True) + # return None + raise + + @model_validator(mode="after") + def validate_payslip_dates(self): + try: + err_msgs = [] + today = datetime.date.today() + threshold_date = today - datetime.timedelta(days=35) + + if not self.pay_period_start_date or not self.pay_period_end_date: + err_msgs.append( + "Undated Payslips" + ) + self.validation_policy_status_df.loc[ + len(self.validation_policy_status_df) + ] = [ + "Undated Payslips", + f"{self.pay_period_start_date}, {self.pay_period_end_date}", + False, + "Undated payslip", + ] + else: + self.validation_policy_status_df.loc[ + len(self.validation_policy_status_df) + ] = [ + "Undated Payslips", + f"{self.pay_period_start_date}, {self.pay_period_end_date}", + True, + "Dated payslip", + ] + # self.is_red_flagged = True + + + if self.pay_date: + if not (threshold_date <= self.pay_date <= today): + err_msgs.append( + "Pay date must be within the last 35 days & not in the future" + ) + 
self.validation_policy_status_df.loc[ + len(self.validation_policy_status_df) + ] = [ + "Pay Date Requirement", + self.pay_date, + False, + "Pay date is not within the last 35 days & not in the future", + ] + else: + self.validation_policy_status_df.loc[ + len(self.validation_policy_status_df) + ] = [ + "Pay Date Requirement", + self.pay_date, + True, + "Pay date is within the last 35 days & not in the future", + ] + + # elif self.pay_period_end_date: + else: + if not (threshold_date <= self.pay_period_end_date <= today): + err_msgs.append( + "Pay period's end date must be within the last 35 days & not in the future" + ) + self.validation_policy_status_df.loc[ + len(self.validation_policy_status_df) + ] = [ + "Pay Period End Date (DD/MM/YYYY, if no pay date)", + self.pay_date, + False, + "Pay date is not within the last 35 days &/or in the future", + ] + else: + self.validation_policy_status_df.loc[ + len(self.validation_policy_status_df) + ] = [ + "Pay Period End Date (DD/MM/YYYY, if no pay date)", + self.pay_date, + True, + "Pay date is within the last 35 days & not in the future", + ] + + prev_month_end = datetime.date.today().replace(day=1) - \ + datetime.timedelta(days=1) + prev_month_start = prev_month_end.replace(day=1) + if not ( + prev_month_start <= self.pay_period_start_date + and self.pay_period_start_date < self.pay_period_end_date <= today + ): + err_msgs.append( + "Payslip date(s) must not be older than those of the last calendar month" + ) + + self.validation_policy_status_df.loc[ + len(self.validation_policy_status_df) + ] = [ + "Pay Period Month (MM/YYYY, if no pay date) basis pay period duration", + self.pay_date, + False, + "Payslip date(s) is older than those of the last calendar month", + ] + else: + self.validation_policy_status_df.loc[ + len(self.validation_policy_status_df) + ] = [ + "Pay Period Month (MM/YYYY, if no pay date) basis pay period duration", + self.pay_date, + True, + "Payslip date(s) is not older than those of the last 
calendar month", + ] + + if self.pay_period_start_date and self.pay_period_end_date: + + if self.pay_period_start_date >= self.pay_period_end_date: + err_msgs.append( + "Pay period's start date must be before the end date") + self.validation_policy_status_df.loc[ + len(self.validation_policy_status_df) + ] = [ + "Pay Period Start & End Dates", + f"{self.pay_period_start_date}, {self.pay_period_end_date}", + False, + "Pay period's start date is not before the end date", + ] + else: + self.validation_policy_status_df.loc[ + len(self.validation_policy_status_df) + ] = [ + "Pay Period Start & End Dates", + f"{self.pay_period_start_date}, {self.pay_period_end_date}", + True, + "Pay period's start date is before the end date", + ] + + + + if (self.pay_period_end_date - self.pay_period_start_date).days < 28: + err_msgs.append( + "Pay period's start date & end date must have a gap of at least 28 days" + ) + self.validation_policy_status_df.loc[ + len(self.validation_policy_status_df) + ] = [ + "Submission Requirement (Monthly Pay)", + (self.pay_period_end_date - self.pay_period_start_date).days, + False, + "Pay period's start date & end date donot have a gap of at least 28 days", + ] + else: + self.validation_policy_status_df.loc[ + len(self.validation_policy_status_df) + ] = [ + "Submission Requirement (Monthly Pay)", + (self.pay_period_end_date - self.pay_period_start_date).days, + True, + "Pay period's start date & end date have a gap of at least 28 days", + ] + else: + self.validation_policy_status_df.loc[ + len(self.validation_policy_status_df) + ] = [ + "Pay Period Start & End Dates", + f"{self.pay_period_start_date}, {self.pay_period_end_date}", + False, + "Pay period's start date is not before the end date", + ] + self.validation_policy_status_df.loc[ + len(self.validation_policy_status_df) + ] = [ + "Submission Requirement (Monthly Pay)", + f"{self.pay_period_start_date}, {self.pay_period_end_date}", + False, + "Pay period's start date & end date donot have a gap 
of at least 28 days", + ] + + + self.pay_dates_err_msgs = ", ".join(err_msgs) if err_msgs else None + return self + except Exception as e: + # logger.exception(e, exc_info=True) + # return None + raise + + @model_validator(mode="after") + def validate_payslip_components_checks(self): + try: + err_msgs = [] + + if not self.is_basic_pay_net_pay_other_salary_components_present: + err_msgs.append( + "Basic salary, Net Salary and/or other requisite salary components not present" + ) + + self.validation_policy_status_df.loc[ + len(self.validation_policy_status_df) + ] = [ + "Requisite salary line items", + self.is_basic_pay_net_pay_other_salary_components_present, + False, + "Basic salary, Net Salary and/or other requisite salary components not present", + ] + else: + self.validation_policy_status_df.loc[ + len(self.validation_policy_status_df) + ] = [ + "Requisite salary line items", + self.is_basic_pay_net_pay_other_salary_components_present, + True, + "Basic salary, Net Salary and/or other requisite salary components are present", + ] + + if not self.is_tax_deducation_present: + err_msgs.append("Tax Deduction line item must be present") + self.validation_policy_status_df.loc[ + len(self.validation_policy_status_df) + ] = [ + "Tax & NI Contributions", + self.is_tax_deducation_present, + False, + "Tax Deduction line item is not present", + ] + else: + self.validation_policy_status_df.loc[ + len(self.validation_policy_status_df) + ] = [ + "Tax & NI Contributions", + self.is_tax_deducation_present, + True, + "Tax Deduction line item is present", + ] + + if not self.is_ni_deduction_present: + err_msgs.append("NI/National Insurance line item must be present") + self.validation_policy_status_df.loc[ + len(self.validation_policy_status_df) + ] = [ + "Tax & NI Contributions", + self.is_ni_deduction_present, + False, + "NI/National Insurance line item is not present", + ] + else: + self.validation_policy_status_df.loc[ + len(self.validation_policy_status_df) + ] = [ + "Tax & NI 
Contributions", + self.is_ni_deduction_present, + True, + "NI/National Insurance line item is present", + ] + + self.payslip_line_item_presence_err_msgs = ( + ", ".join(err_msgs) if err_msgs else None + ) + return self + except Exception as e: + # logger.exception(e, exc_info=True) + # return None + raise + + @model_validator(mode="after") + def validate_complete_address(self, info: ValidationInfo): + try: + err_msgs = [] + expected = ( + info.context.get("application_summary_complete_address") + if info.context + else None + ) + val = self.complete_employee_address + + if not val: + err_msgs.append("Applicant's address not present") + self.validation_policy_status_df.loc[ + len(self.validation_policy_status_df) + ] = [ + "Applicant Address", + val, + False, + "Applicant's address not present", + ] + else: + self.validation_policy_status_df.loc[ + len(self.validation_policy_status_df) + ] = [ + "Applicant Address", + val, + True, + "Applicant's address is present", + ] + + length = len(val) if val else 0 + if not (10 <= length <= 300): + err_msgs.append( + "Applicant's complete address must have a length of at least 10 & at most 300" + ) + self.validation_policy_status_df.loc[ + len(self.validation_policy_status_df) + ] = [ + "Applicant Address", + length, + False, + "Applicant's complete address does not have a length of at least 10 & at most 300", + ] + else: + self.validation_policy_status_df.loc[ + len(self.validation_policy_status_df) + ] = [ + "Applicant Address", + length, + True, + "Applicant's complete address has a length of at least 10 & at most 300", + ] + + if not expected or not val or val.lower() != expected.lower(): + err_msgs.append("Complete address mismatch with provided value") + self.validation_policy_status_df.loc[ + len(self.validation_policy_status_df) + ] = [ + "Applicant Address", + f"{val}, {expected}", + False, + "Complete address mismatch with provided value", + ] + else: + self.validation_policy_status_df.loc[ + 
len(self.validation_policy_status_df) + ] = [ + "Applicant Address", + f"{val}, {expected}", + True, + "Complete address matches with provided value", + ] + + self.complete_employee_address_err_msgs = ( + ", ".join(err_msgs) if err_msgs else None + ) + return self + except Exception as e: + # logger.exception(e, exc_info=True) + # return None + raise + + # @model_validator(mode="after") + # def validate_employee_number(self): + # try: + # if self.employee_number and self.employee_number <= 25: + # self.complete_employee_address_err_msgs = "Employee number low" + # return self + # except Exception as e: + # raise + + @computed_field + @property + def is_red_flagged(self) -> bool: + if any([ + self.pay_dates_err_msgs, + self.full_name_err_msgs, + self.employer_name_err_msgs, + self.payslip_line_item_presence_err_msgs, + self.complete_employee_address_err_msgs, + # self.employee_number_err_msgs, + ]): + return True + return False diff --git a/schemas/uk_address.py b/schemas/uk_address.py new file mode 100644 index 0000000000000000000000000000000000000000..c40bb058cbd4f6f48ee63060daff1e7d32f7b4fe --- /dev/null +++ b/schemas/uk_address.py @@ -0,0 +1,39 @@ +from pydantic import BaseModel, Field, field_validator +import re + +UK_POSTCODE_REGEX = re.compile(r"^(GIR ?0AA|[A-Z]{1,2}\d{1,2}[A-Z]?\s?\d[A-Z]{2})$", re.IGNORECASE) + +class UKAddress(BaseModel): + street_address: str = Field(..., min_length=5, max_length=100) + city: str = Field(..., min_length=2, max_length=50) + postcode: str + country: str = "United Kingdom" + + @field_validator("street_address") + @classmethod + def validate_street_address(cls, v: str) -> str: + if not re.match(r"^[a-zA-Z0-9\s,.'\-/#()]{5,100}$", v): + raise ValueError("Invalid characters in street address") + return v.strip() + + @field_validator("city") + @classmethod + def validate_city(cls, v: str) -> str: + if not re.match(r"^[a-zA-Z\s\-']+$", v): + raise ValueError("City must only contain alphabetic characters, spaces, hyphens, or 
class UKAddress(BaseModel):
    """A validated United Kingdom postal address.

    Postcodes are canonicalised to the ``OUTWARD INWARD`` form (e.g.
    ``"sw1a1aa"`` -> ``"SW1A 1AA"``) so the same postcode always has a
    single representation downstream.
    """

    street_address: str = Field(..., min_length=5, max_length=100)
    city: str = Field(..., min_length=2, max_length=50)
    postcode: str
    country: str = "United Kingdom"

    @field_validator("street_address")
    @classmethod
    def validate_street_address(cls, v: str) -> str:
        """Allow alphanumerics plus common address punctuation only."""
        if not re.match(r"^[a-zA-Z0-9\s,.'\-/#()]{5,100}$", v):
            raise ValueError("Invalid characters in street address")
        return v.strip()

    @field_validator("city")
    @classmethod
    def validate_city(cls, v: str) -> str:
        """Reject city names with digits or unexpected punctuation."""
        if not re.match(r"^[a-zA-Z\s\-']+$", v):
            raise ValueError("City must only contain alphabetic characters, spaces, hyphens, or apostrophes")
        return v.strip()

    @field_validator("postcode")
    @classmethod
    def validate_postcode(cls, v: str) -> str:
        """Validate and canonicalise a UK postcode.

        BUG FIX: the raw input was previously returned merely upper-cased,
        so the same postcode could surface as both ``SW1A1AA`` and
        ``SW1A 1AA``. The inward code of a valid UK postcode is always the
        final three characters, so the single separating space is
        re-inserted here to yield one canonical form.
        """
        cleaned = v.replace(" ", "").upper()
        if not UK_POSTCODE_REGEX.match(cleaned):
            raise ValueError("Invalid UK postcode format")
        return f"{cleaned[:-3]} {cleaned[-3:]}"

    @field_validator("country")
    @classmethod
    def validate_country(cls, v: str) -> str:
        """Accept only UK spellings; normalise to the full country name."""
        if v.strip().lower() not in ["united kingdom", "uk"]:
            raise ValueError("Country must be United Kingdom or UK")
        return "United Kingdom"
0000000000000000000000000000000000000000..dcc39133c575fa12a12dee9bfd0e0e56a1f8b8c4 Binary files /dev/null and b/utils/__pycache__/json_utils.cpython-313.pyc differ diff --git a/utils/__pycache__/logger.cpython-313.pyc b/utils/__pycache__/logger.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..afb77591c9344439a21b5a386a545794bea658c6 Binary files /dev/null and b/utils/__pycache__/logger.cpython-313.pyc differ diff --git a/utils/__pycache__/prep_validators_payload.cpython-313.pyc b/utils/__pycache__/prep_validators_payload.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c2f3e7013e2e39f120b528e0cf081b28c26b53e1 Binary files /dev/null and b/utils/__pycache__/prep_validators_payload.cpython-313.pyc differ diff --git a/utils/__pycache__/process_files.cpython-313.pyc b/utils/__pycache__/process_files.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b22d3066be5a72fba14b667743a9f4d7b2e7f020 Binary files /dev/null and b/utils/__pycache__/process_files.cpython-313.pyc differ diff --git a/utils/__pycache__/session_state.cpython-313.pyc b/utils/__pycache__/session_state.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6c2a4e8353eb9c6ad1fd3dfd024e88de145e6d05 Binary files /dev/null and b/utils/__pycache__/session_state.cpython-313.pyc differ diff --git a/utils/document_display/__init__.py b/utils/document_display/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d60c6961204e0d1d9085ee4596a729d678222b10 --- /dev/null +++ b/utils/document_display/__init__.py @@ -0,0 +1 @@ +from .document_display import display_based_on_card diff --git a/utils/document_display/__pycache__/__init__.cpython-313.pyc b/utils/document_display/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..543b96b7f961b97f3a03cb726cd26f48a9082052 Binary files /dev/null and 
b/utils/document_display/__pycache__/__init__.cpython-313.pyc differ diff --git a/utils/document_display/__pycache__/bank_statement.cpython-313.pyc b/utils/document_display/__pycache__/bank_statement.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4c7bddaa6d8c225a656482216a3c64011cebc94d Binary files /dev/null and b/utils/document_display/__pycache__/bank_statement.cpython-313.pyc differ diff --git a/utils/document_display/__pycache__/document_display.cpython-313.pyc b/utils/document_display/__pycache__/document_display.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..08aa2be198d395bfe6c419090e2d21101806d95f Binary files /dev/null and b/utils/document_display/__pycache__/document_display.cpython-313.pyc differ diff --git a/utils/document_display/__pycache__/driving_license.cpython-313.pyc b/utils/document_display/__pycache__/driving_license.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c22c30a4a2c93ebe1a9494990241bc0de76bae42 Binary files /dev/null and b/utils/document_display/__pycache__/driving_license.cpython-313.pyc differ diff --git a/utils/document_display/__pycache__/others.cpython-313.pyc b/utils/document_display/__pycache__/others.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1344ca39da7c5b02cf3213a8d6ebfa90fb54ce91 Binary files /dev/null and b/utils/document_display/__pycache__/others.cpython-313.pyc differ diff --git a/utils/document_display/__pycache__/p60.cpython-313.pyc b/utils/document_display/__pycache__/p60.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f09df30ece8988dbea44c94f279142bbf0b631f3 Binary files /dev/null and b/utils/document_display/__pycache__/p60.cpython-313.pyc differ diff --git a/utils/document_display/__pycache__/passport.cpython-313.pyc b/utils/document_display/__pycache__/passport.cpython-313.pyc new file mode 100644 index 
def prune_bank_statement_for_display(analysis_results_for_id):
    """Select the bank-statement fields relevant for display.

    Returns a dict with fixed category/type markers plus the extracted
    fields; any field absent from the input maps to None.
    """
    data_to_display = {
        "document_category": "bank_statement",
        "document_type": "bank_statement",
    }
    for field in (
        "account_holder_name",
        "account_holder_address",
        "bank_name",
        "account_number",
        "sort_code",
        "statement_start_date",
        "statement_end_date",
        "salary_credits",
    ):
        data_to_display[field] = analysis_results_for_id.get(field)
    return data_to_display
def display_bank_statement(extracted_files, analysis_results_pruned):
    """Render bank-statement images alongside tables of the pruned fields.

    Args:
        extracted_files: list of image file paths to preview.
        analysis_results_pruned: dict of display fields; its
            ``salary_credits`` entry may be None or a list of dicts
            (see prune_bank_statement_for_display).
    """
    col1, col2 = st.columns([2, 3])

    logger.info(f"file_path while displaying: {extracted_files}")
    st.markdown("---")

    with col1:
        if len(extracted_files) > 1:
            st.image(extracted_files, caption=[os.path.basename(
                img) for img in extracted_files], use_container_width=True)
        else:
            image = Image.open(extracted_files[0])
            st.image(image, caption=os.path.basename(extracted_files[0]))

        logger.info(
            f"analysis_results_pruned : {analysis_results_pruned}")

    with col2:
        # Everything except salary_credits goes into one key/value table.
        dict_str = {
            key: value
            for key, value in analysis_results_pruned.items()
            if key != 'salary_credits'
        }

        simple_df = pd.DataFrame.from_dict(
            dict_str,
            orient='index', columns=['Value']).reset_index()
        simple_df.columns = ['Key', 'Value']
        simple_df = simple_df.fillna(value="Missing")
        simple_df.index += 1
        st.dataframe(simple_df, use_container_width=True)

        st.markdown("Salary Credits")

        # BUG FIX: salary_credits defaults to None in the pruned dict;
        # iterating None raised TypeError and broke the whole render.
        salary_dict = analysis_results_pruned.get('salary_credits') or []
        logger.info(f"salary_dict : {salary_dict}")
        for salary_details in salary_dict:
            simple_df = pd.DataFrame.from_dict(
                salary_details,
                orient='index', columns=['Value']).reset_index()
            simple_df.columns = ['Key', 'Value']
            simple_df = simple_df.fillna(value="Missing")
            simple_df.index += 1
            st.dataframe(simple_df, use_container_width=True)

        logger.info(f"simple_df: {simple_df}")
def load_pdf_as_image(file_path):
    """Render the first page of a PDF as a PIL image.

    BUG FIX: the document handle was never closed (file-descriptor leak);
    it is now released via a context manager before returning.
    """
    with pymupdf.Document(file_path) as doc:
        page = doc[0]  # first page only
        pix = page.get_pixmap()
        # Copy pixel data into a PIL image so it outlives the closed doc.
        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    return img


def generate_metadata(file_path):
    """Generate a metadata dictionary from file path and properties.

    Always includes filesystem facts (name, size, timestamps); adds
    image- or PDF-specific fields when the extension matches. Read
    failures are reported via st.error and the base metadata returned.
    """
    file_stat = os.stat(file_path)
    file_name = os.path.basename(file_path)
    parent_dir = os.path.basename(os.path.dirname(file_path))

    metadata = {
        "File Name": file_name,
        "Directory": parent_dir,
        "File Size": f"{file_stat.st_size / 1024:.2f} KB",
        "Last Modified": datetime.fromtimestamp(file_stat.st_mtime).strftime('%Y-%m-%d %H:%M:%S'),
        "Created": datetime.fromtimestamp(file_stat.st_ctime).strftime('%Y-%m-%d %H:%M:%S'),
        "File Extension": os.path.splitext(file_name)[1],
        "Full Path": file_path,
    }

    # Add image-specific metadata if it's an image
    if file_name.lower().endswith(('.png', '.jpg', '.jpeg', '.gif')):
        try:
            with Image.open(file_path) as img:
                metadata.update({
                    "Image Size": f"{img.size[0]}x{img.size[1]}",
                    "Image Mode": img.mode,
                    "Image Format": img.format,
                })
        except Exception as e:
            st.error(f"Error reading image metadata: {str(e)}")

    # Add PDF-specific metadata if it's a PDF
    elif file_name.lower().endswith('.pdf'):
        try:
            # BUG FIX: close the document handle when done (was leaked).
            with pymupdf.Document(file_path) as doc:
                metadata.update({
                    "Page Count": len(doc),
                    "PDF Version": doc.pdf_version,
                    "Document Info": doc.metadata if doc.metadata else "No PDF metadata available",
                })
        except Exception as e:
            st.error(f"Error reading PDF metadata: {str(e)}")

    return metadata
def merge_dict_values(dict1, dict2):
    """Merge two dictionaries.

    Rules: keys unique to one side are kept as-is; two dict values are
    merged recursively; a None value is filled by the other side's value;
    two distinct non-None values are collected into a two-element list;
    identical (or both-None) values are kept once.
    """
    merged = {}

    for key in set(dict1) | set(dict2):
        if key not in dict2:
            merged[key] = dict1[key]
            continue
        if key not in dict1:
            merged[key] = dict2[key]
            continue

        v1, v2 = dict1[key], dict2[key]
        if isinstance(v1, dict) and isinstance(v2, dict):
            # Both sides nested: recurse.
            merged[key] = merge_dict_values(v1, v2)
        elif v1 is None:
            merged[key] = v2
        elif v2 is None:
            merged[key] = v1
        elif v1 != v2:
            # Two genuine, conflicting values: keep both.
            merged[key] = [v1, v2]
        else:
            merged[key] = v1

    return merged
def merge_json_file(json_data):
    """Collapse per-page extraction results into one dict per file.

    Args:
        json_data: mapping of file path -> {page_id: page_result_dict}.

    Returns:
        Mapping of file path -> a single merged dict (pages combined
        pairwise via merge_dict_values). A file with no pages maps to {}.
    """
    result = {}

    for file_path, pages in json_data.items():
        page_dicts = list(pages.values())

        # BUG FIX: a file with zero extracted pages used to raise
        # IndexError on page_dicts[0]; treat it as an empty result.
        if not page_dicts:
            result[file_path] = {}
            continue

        # Start with the first page, then fold in each subsequent page.
        merged_dict = page_dicts[0]
        for page_dict in page_dicts[1:]:
            merged_dict = merge_dict_values(merged_dict, page_dict)

        result[file_path] = merged_dict

    return result
def display_based_on_card(original_file, analysis_results_for_original_file, extracted_files, current_upload):
    """Merge per-page analysis results for one uploaded file and render
    them with the display routine matching the detected document type.

    Unrecognised types (and any rendering error) fall back to
    display_others; the merged results are cached in session state under
    the current upload.
    """
    try:
        analysis_results_for_id = merge_json_file(
            {original_file: analysis_results_for_original_file})
        analysis_results_for_id = analysis_results_for_id[original_file]
        logger.info(f"analysis_results_for_id : {analysis_results_for_id}")
    except Exception as e:
        logger.info(
            f"Exception while trying to merge results of {original_file}")
        logger.info(
            f"analysis_results_for_original_file : {analysis_results_for_original_file}")
        logger.info(f"error : {e}")
        return analysis_results_for_original_file

    analysis_results_for_id_updated = analysis_results_for_id

    # Normalised document type -> (prune, display) routine pair.
    handlers = {
        "passport": (prune_passport_for_display, display_passport),
        "drivinglicense": (prune_driving_license_for_display, display_driving_license),
        "bankstatement": (prune_bank_statement_for_display, display_bank_statement),
        "payslip": (prune_payslip_for_display, display_payslip),
        "p60": (prune_p60_for_display, display_p60),
    }

    try:
        document_type = analysis_results_for_id.get("document_type", "None")
        logger.info(f"document_type for {original_file}: {document_type}")

        # Normalise: lowercase, then strip every non-alphanumeric run.
        document_type = re.sub('[^A-Za-z0-9]+', '', document_type.lower())
        print(f"document_type : {document_type}")

        if document_type in handlers:
            prune, display = handlers[document_type]
            analysis_results_pruned = prune(analysis_results_for_id)
            display(extracted_files, analysis_results_pruned)
        else:
            analysis_results_for_id_updated["document_type"] = analysis_results_for_id.get(
                "document_type", None)
            display_others(extracted_files, analysis_results_for_id_updated)
    except Exception as e:
        logger.info(
            f"Exception for processing analysis results of {analysis_results_for_id}: {e}")
        analysis_results_for_id_updated = analysis_results_for_id
        display_others(extracted_files, analysis_results_for_id_updated)

    if original_file not in st.session_state['uploads'][current_upload]['values_display']:
        st.session_state['uploads'][current_upload]['values_display'][original_file] = analysis_results_for_id_updated
def prune_driving_license_for_display(analysis_results_for_id):
    """Select the driving-licence fields relevant for display.

    Returns a dict with fixed category/type markers plus the extracted
    fields; any field absent from the input maps to None.
    """
    data_to_display = {
        "document_category": "identity_verification_document",
        "document_type": "driving_license",
    }
    for field in (
        "surname",
        "first_name",
        "date_of_birth",
        "place_of_birth",
        "date_of_issue",
        "date_of_expiry",
        "issuing_authority",
        "driver_number",
        "address",
        "entitlements",
    ):
        data_to_display[field] = analysis_results_for_id.get(field)
    return data_to_display
def display_driving_license(extracted_files, analysis_results_pruned):
    """Show licence images on the left, a key/value table on the right.

    Args:
        extracted_files: list of image file paths to preview.
        analysis_results_pruned: flat dict of fields to tabulate;
            missing values are shown as "Missing".
    """
    col1, col2 = st.columns([2, 3])

    logger.info(f"file_path while displaying: {extracted_files}")
    st.markdown("---")

    with col1:
        if len(extracted_files) > 1:
            captions = [os.path.basename(img) for img in extracted_files]
            st.image(extracted_files, caption=captions, use_container_width=True)
        else:
            first_file = extracted_files[0]
            st.image(Image.open(first_file), caption=os.path.basename(first_file))

        logger.info(f"analysis_results_pruned : {analysis_results_pruned}")

    with col2:
        simple_df = pd.DataFrame.from_dict(
            analysis_results_pruned, orient='index', columns=['Value']
        ).reset_index()
        simple_df.columns = ['Key', 'Value']
        simple_df = simple_df.fillna(value="Missing")
        simple_df.index += 1
        st.dataframe(simple_df, use_container_width=True)
def display_others(extracted_files, analysis_results_pruned):
    """Fallback renderer for unrecognised document types.

    Shows the source images in the left column and, in the right column,
    one table per top-level key: nested dicts become multi-row tables,
    scalar values become single-row tables.
    """
    col1, col2 = st.columns([2, 3])

    logger.info(f"file_path while displaying: {extracted_files}")
    st.markdown("---")

    with col1:
        if len(extracted_files) > 1:
            st.image(extracted_files, caption=[os.path.basename(
                img) for img in extracted_files], use_container_width=True)
        else:
            image = Image.open(extracted_files[0])
            st.image(image, caption=os.path.basename(extracted_files[0]))

        logger.info(f"analysis_results_pruned: {analysis_results_pruned}")

    with col2:
        simple_df = None

        def safe_to_str(x):
            """Stringify a cell, blanking out scalar NaN/None values."""
            try:
                if pd.isna(x):
                    return ""
            except (TypeError, ValueError):
                # BUG FIX: was a bare `except:` (swallowed everything,
                # incl. KeyboardInterrupt). pd.isna raises TypeError /
                # gives an ambiguous array result for list-like cells —
                # those simply fall through to str().
                pass
            return str(x)

        for key, value in analysis_results_pruned.items():
            if isinstance(value, dict):
                st.write(f"**{key}:**")
                sub_df = pd.DataFrame({
                    "Key": list(value.keys()),
                    "Value": list(value.values()),
                })
                sub_df.index += 1
                st.dataframe(sub_df, use_container_width=True)
            else:
                simple_df = pd.DataFrame({"Key": [key], "Value": [value]})
                simple_df.index += 1
                logger.info(f"simple_df['Value'] : {simple_df['Value']}")
                simple_df["Value"] = simple_df["Value"].apply(safe_to_str)
                st.dataframe(simple_df, use_container_width=True)

        # BUG FIX: simple_df only exists if at least one non-dict value
        # was rendered; the unconditional log raised NameError otherwise.
        if simple_df is not None:
            logger.info(f"simple_df: {simple_df}")
def prune_p60_for_display(analysis_results_for_id):
    """Select the P60 fields relevant for display.

    BUG FIX: the nested "employee_details", "pay_and_income_tax_details"
    and "employer_details" sections may be absent or None in the
    extraction output; the original code then crashed with
    AttributeError calling `.get` on None. Missing sections now simply
    yield None for each of their fields.
    """
    data_to_display = {
        "document_category": "income_document",
        "document_type": "p60",
    }

    employee_details = analysis_results_for_id.get("employee_details") or {}
    data_to_display["surname"] = employee_details.get("surname", None)
    data_to_display["forenames_or_initials"] = employee_details.get(
        "forenames_or_initials", None)
    data_to_display["national_insurance_number"] = employee_details.get(
        "national_insurance_number", None)
    data_to_display["works_payroll_number"] = employee_details.get(
        "works_payroll_number", None)

    pay_and_income_tax_details = analysis_results_for_id.get(
        "pay_and_income_tax_details") or {}
    data_to_display["previous_employments"] = pay_and_income_tax_details.get(
        "previous_employments", None)
    data_to_display["current_employment"] = pay_and_income_tax_details.get(
        "current_employment", None)
    data_to_display["total_for_year"] = pay_and_income_tax_details.get(
        "total_for_year", None)
    data_to_display["final_tax_code"] = pay_and_income_tax_details.get(
        "final_tax_code", None)

    data_to_display["national_insurance_contributions"] = analysis_results_for_id.get(
        "national_insurance_contributions", None)

    employer_details = analysis_results_for_id.get("employer_details") or {}
    data_to_display["employer_name_and_address"] = employer_details.get(
        "employer_name_and_address", None)
    data_to_display["paye_reference"] = employer_details.get(
        "paye_reference", None)

    return data_to_display
def display_p60(extracted_files, analysis_results_pruned):
    """Show P60 images on the left, a key/value table on the right.

    Args:
        extracted_files: list of image file paths to preview.
        analysis_results_pruned: flat dict of fields to tabulate;
            missing values are shown as "Missing".
    """
    col1, col2 = st.columns([2, 3])

    logger.info(f"file_path while displaying: {extracted_files}")
    st.markdown("---")

    with col1:
        if len(extracted_files) > 1:
            captions = [os.path.basename(img) for img in extracted_files]
            st.image(extracted_files, caption=captions, use_container_width=True)
        else:
            first_file = extracted_files[0]
            st.image(Image.open(first_file), caption=os.path.basename(first_file))

        logger.info(f"analysis_results_pruned : {analysis_results_pruned}")

    with col2:
        # Copy all fields into the table (no keys are filtered for P60).
        dict_str = dict(analysis_results_pruned.items())

        simple_df = pd.DataFrame.from_dict(
            dict_str, orient='index', columns=['Value']
        ).reset_index()
        simple_df.columns = ['Key', 'Value']
        simple_df = simple_df.fillna(value="Missing")
        simple_df.index += 1
        st.dataframe(simple_df, use_container_width=True)

        logger.info(f"simple_df: {simple_df}")
columns=['Value']).reset_index() + # simple_df.columns = ['Key', 'Value'] + # simple_df = simple_df.fillna(value="Missing") + # st.dataframe(simple_df, use_container_width=True) + + logger.info(f"simple_df: {simple_df}") diff --git a/utils/document_display/passport.py b/utils/document_display/passport.py new file mode 100644 index 0000000000000000000000000000000000000000..fc95710d2c90031a69c32c000a4df8e47aa3c886 --- /dev/null +++ b/utils/document_display/passport.py @@ -0,0 +1,65 @@ +import streamlit as st +from utils.logger import setup_logger +import pandas as pd +from PIL import Image +import os + +logger = setup_logger(__name__) + + +def prune_passport_for_display(analysis_results_for_id): + data_to_display = {} + data_to_display["document_category"] = "identity_verification_document" + data_to_display["document_type"] = "passport" + + data_to_display["passport_number"] = analysis_results_for_id.get( + "passport_number", None) + data_to_display["full_name"] = analysis_results_for_id.get( + "full_name", None) + data_to_display["date_of_birth"] = analysis_results_for_id.get( + "date_of_birth", None) + data_to_display["nationality"] = analysis_results_for_id.get( + "nationality", None) + data_to_display["date_of_issue"] = analysis_results_for_id.get( + "date_of_issue", None) + data_to_display["date_of_expiry"] = analysis_results_for_id.get( + "date_of_expiry", None) + data_to_display["sex"] = analysis_results_for_id.get( + "sex", None) + data_to_display["address"] = analysis_results_for_id.get( + "address", None) + + return data_to_display + + +def display_passport(extracted_files, analysis_results_pruned): + + col1, col2 = st.columns([2, 3]) + + logger.info(f"file_path while displaying: {extracted_files}") + st.markdown("---") + + with col1: + if len(extracted_files) > 1: + st.image(extracted_files, caption=[os.path.basename( + img) for img in extracted_files], use_container_width=True) + else: + image = Image.open(extracted_files[0]) + st.image(image, 
def prune_payslip_for_display(analysis_results_for_id):
    """Build the flat key/value dict shown in the UI for a payslip.

    Copies the displayable payslip fields out of the raw analysis
    results, defaulting each absent field to None.
    """
    # Fields surfaced in the UI, in display order.
    fields = (
        "employee_name",
        "employer_name",
        "employee_id",
        "employee_address",
        "employer_address",
        "tax_code",
        "payslip_date",
        "pay_period_start",
        "pay_period_end",
        "payment_frequency",
        "basic_pay",
        "net_pay",
        "gross_pay",
        "salary_components",
        "ni_contribution",
        "tax_deduction",
        "other_deductions",
    )

    data_to_display = {
        "document_category": "income_document",
        "document_type": "payslip",
    }
    for field in fields:
        data_to_display[field] = analysis_results_for_id.get(field, None)

    return data_to_display
def generate_metadata(file_path):
    """Collect filesystem metadata for a file, plus image/PDF details.

    Builds a dict of human-readable metadata entries keyed by label.
    Image-specific fields are added for png/jpg/jpeg/gif files and
    PDF-specific fields for pdf files; read failures are reported via
    ``st.error`` rather than raised.
    """
    stats = os.stat(file_path)
    file_name = os.path.basename(file_path)

    metadata = {
        "File Name": file_name,
        "Directory": os.path.basename(os.path.dirname(file_path)),
        "File Size": f"{stats.st_size / 1024:.2f} KB",
        "Last Modified": datetime.fromtimestamp(stats.st_mtime).strftime('%Y-%m-%d %H:%M:%S'),
        "Created": datetime.fromtimestamp(stats.st_ctime).strftime('%Y-%m-%d %H:%M:%S'),
        "File Extension": os.path.splitext(file_name)[1],
        "Full Path": file_path,
    }

    lowered = file_name.lower()
    if lowered.endswith(('.png', '.jpg', '.jpeg', '.gif')):
        # Image uploads: record dimensions, color mode and container format.
        try:
            with Image.open(file_path) as img:
                metadata.update({
                    "Image Size": f"{img.size[0]}x{img.size[1]}",
                    "Image Mode": img.mode,
                    "Image Format": img.format,
                })
        except Exception as e:
            st.error(f"Error reading image metadata: {str(e)}")
    elif lowered.endswith('.pdf'):
        # PDF uploads: record page count, version and embedded metadata.
        try:
            doc = pymupdf.Document(file_path)
            metadata.update({
                "Page Count": len(doc),
                "PDF Version": doc.pdf_version,
                "Document Info": doc.metadata if doc.metadata else "No PDF metadata available",
            })
        except Exception as e:
            st.error(f"Error reading PDF metadata: {str(e)}")

    return metadata
fields (including document_type and document_category) + } + + # Wrap it under document_type + wrapped_entry = {document_type: entry} + + # Append to appropriate document_category list + result.setdefault(document_category, []).append(wrapped_entry) + + return result + +def extract_document_types_from_transformed(transformed_data): + category_map = {} + + for category, docs in transformed_data.items(): + doc_types = set() + for item in docs: + for doc_type in item.keys(): # because each item is like {'payslip': {...}} + doc_types.add(doc_type) + category_map[category] = sorted(list(doc_types)) + + return category_map + + + +class DocumentTypeByCategory(BaseModel): + bank_statement: List[str] = Field(default_factory=list) + income_document: List[str] = Field(default_factory=list) + identity_verification_document: List[str] = Field(default_factory=list) + + # Computed flags + is_bank_statement_valid: bool = Field(default=False, exclude=True) + is_income_document_valid: bool = Field(default=False, exclude=True) + is_identity_verification_document_valid: bool = Field(default=False, exclude=True) + + @model_validator(mode="after") + def compute_valid_flags(self): + self.is_bank_statement_valid = bool(self.bank_statement) + self.is_income_document_valid = bool(self.income_document) + self.is_identity_verification_document_valid = bool(self.identity_verification_document) + return self + + def to_dataframe(self) -> pd.DataFrame: + data = [ + { + "document_category": "bank_statement", + "Uploaded": self.is_bank_statement_valid, + "document_types": ", ".join(self.bank_statement) if self.bank_statement else "Missing" + }, + { + "document_category": "income_document", + "Uploaded": self.is_income_document_valid, + "document_types": ", ".join(self.income_document) if self.income_document else "Missing" + }, + { + "document_category": "identity_verification_document", + "Uploaded": self.is_identity_verification_document_valid, + "document_types": ", 
".join(self.identity_verification_document) if self.identity_verification_document else "Missing" + } + ] + + data_df = pd.DataFrame(data) + data_df.index += 1 + logger.info(f"df: {data_df}") + data_df['Uploaded'] = data_df['Uploaded'].apply(lambda x: '✅' if x else '❌') + + return data_df + diff --git a/utils/logger.py b/utils/logger.py new file mode 100644 index 0000000000000000000000000000000000000000..4335960274714e6c53c5a9e574516d1fc388cc2b --- /dev/null +++ b/utils/logger.py @@ -0,0 +1,43 @@ +import logging +import os +from datetime import datetime +# from config import LOGS_DIR +# Create logs directory if it doesn't exist + +LOGS_DIR = "logs_directory" +os.makedirs(LOGS_DIR, exist_ok=True) +# Generate filename with timestamp +log_filename = os.path.join( + LOGS_DIR, f"app_{datetime.now().strftime('%Y%m%d')}.log") + + +def setup_logger(name): + """ + Create a logger with the specified name that writes to both file and console + """ + logger = logging.getLogger(name) + # Only configure if it hasn't been configured yet + if not logger.handlers: + logger.setLevel(logging.DEBUG) + # Create file handler + file_handler = logging.FileHandler(log_filename) + file_handler.setLevel(logging.DEBUG) + file_formatter = logging.Formatter( + '%(asctime)s - %(name)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s', + datefmt='%Y-%m-%d %H:%M:%S' + ) + file_handler.setFormatter(file_formatter) + # Create console handler + console_handler = logging.StreamHandler() + console_handler.setLevel(logging.INFO) # Less verbose for console + console_formatter = logging.Formatter( + '%(levelname)s - %(name)s - %(message)s' + ) + console_handler.setFormatter(console_formatter) + # Add handlers to logger + logger.addHandler(file_handler) + logger.addHandler(console_handler) + return logger + + +logger = setup_logger(__name__) diff --git a/utils/prep_validators_payload.py b/utils/prep_validators_payload.py new file mode 100644 index 
0000000000000000000000000000000000000000..78755e480a0c84accdb2b9dcfba3fa0aa13996f0 --- /dev/null +++ b/utils/prep_validators_payload.py @@ -0,0 +1,207 @@ +from typing import Any, Dict + +import schemas +from utils.logger import setup_logger + +logger = setup_logger(__name__) + + +def group_documents_by_type(obj, result=None): + if result is None: + result = { + "payslip": [], + "bank_statement": [], + "passport": [], + "driving_license": [], + } + if isinstance(obj, dict): + doc_type = obj.get("document_type") + if doc_type in result: + result[doc_type].append(obj) + for value in obj.values(): + group_documents_by_type(value, result) + elif isinstance(obj, list): + for item in obj: + group_documents_by_type(item, result) + return result + + +# Transformation Functions + +def transform_validate_payslip( + data: Dict[str, Any], application_form_dict: Dict[str, str] +) -> schemas.UKPayslipSchema: + # return schemas.UKPayslipSchema( + # pay_period_start_date=data.get("pay_period_start"), + # pay_period_end_date=data.get("pay_period_end"), + # pay_date=data.get("payslip_date"), + # full_name=data.get("employee_name"), + # employer_name=data.get("employer_name"), + # is_basic_pay_net_pay_other_salary_components_present=bool( + # data.get("basic_pay") and data.get("net_pay") + # ), + # is_tax_deducation_present=bool(data.get("tax_deduction")), + # is_ni_deduction_present=bool(data.get("ni_contribution")), + # complete_employee_address=None, + # employee_number=None, + # ) + payslip_payload = { + "pay_period_start_date": data.get("pay_period_start"), + "pay_period_end_date": data.get("pay_period_end"), + "pay_date": data.get("payslip_date"), + "full_name": data.get("employee_name"), + "employer_name": data.get("employer_name"), + "is_basic_pay_net_pay_other_salary_components_present": bool( + data.get("basic_pay") and data.get("net_pay") + ), + "is_tax_deducation_present": bool(data.get("tax_deduction")), + "is_ni_deduction_present": bool(data.get("ni_contribution")), + 
"complete_employee_address": data.get("employee_address"), + # "employee_number": data.get("employee_id"), + } + # return payslip_payload + return schemas.UKPayslipSchema.model_validate( + payslip_payload, + context=application_form_dict, + ).model_dump() + + +def transform_validate_passport( + data: Dict[str, Any], application_form_dict: Dict[str, str] +) -> schemas.UKPassportSchema: + # name = data.get("full_name") or f"{data.get('given_names', '')} {data.get('surname', '')}".strip() + passport_payload = { + "full_name": data.get("given_names"), + "expiry_date": data.get("date_of_expiry"), + } + # return schemas.UKPassportSchema( + # full_name=name, + # expiry_date=data.get("date_of_expiry"), + # ) + # return passport_payload + return schemas.UKPassportSchema.model_validate( + passport_payload, + context=application_form_dict, + ).model_dump() + + +def transform_validate_driving_license( + data: Dict[str, Any], application_form_dict: Dict[str, str] +) -> schemas.UKDrivingLicense: + name = data.get("full_name") or f"{data.get('first_name', '')} {data.get('surname', '')}".strip() + driving_license_payload = {"full_name": name,} + # return schemas.UKPassportSchema( + # full_name=name, + # expiry_date=data.get("date_of_expiry"), + # ) + # return passport_payload + return schemas.UKDrivingLicense.model_validate( + driving_license_payload, + context=application_form_dict, + ).model_dump() + + +def transform_validate_bank_statement( + data: Dict[str, Any], application_form_dict: Dict[str, str] +) -> schemas.UKBankAccountStatement: + # First salary deposit date from 'salary_credits' if available + salary_credits = data.get("salary_credits", []) + first_salary_date = None + if salary_credits: + try: + # first_salary_date = int(salary_credits[0]["date"].split("-")[2]) + first_salary_date = salary_credits[0]["date"] + except (IndexError, ValueError, KeyError): + pass + + # return schemas.UKBankAccountStatement( + # statement_start_date=data.get("statement_start_date"), + # 
def process_extracted_data(
    extracted_data: Dict[str, Any],
    application_form: Dict[str, Any],
    full_data_transformed,
):
    """Validate every extracted document and cross-check applicant names.

    Args:
        extracted_data: Nested raw extraction results; scanned recursively
            for dicts tagged with a supported ``document_type``.
        application_form: Application-form values, passed as validation
            context to each document schema.
        full_data_transformed: Unused; retained for interface compatibility.

    Returns:
        Tuple of (dict grouping the validated payloads per document kind,
        policy-check dict describing whether the applicant's name is
        consistent across the uploaded documents).
    """
    grouped_docs = group_documents_by_type(extracted_data)

    transformed_validated_data = {
        "payslips": [
            transform_validate_payslip(doc, application_form)
            for doc in grouped_docs["payslip"]
        ],
        "bank_statements": [
            transform_validate_bank_statement(doc, application_form)
            for doc in grouped_docs["bank_statement"]
        ],
        "passports": [
            transform_validate_passport(doc, application_form)
            for doc in grouped_docs["passport"]
        ],
        "driving_licenses": [
            transform_validate_driving_license(doc, application_form)
            for doc in grouped_docs["driving_license"]
        ],
    }

    logger.info(f"transformed_validated_data: {transformed_validated_data}")

    # Collect normalized (lowercased, whitespace-stripped) names so the same
    # person written with different spacing/casing counts as one name.
    names_across_docs = set()
    names_all = []
    for docs in transformed_validated_data.values():
        for doc in docs:
            full_name = doc.get("full_name")
            if full_name is not None:
                names_across_docs.add(full_name.lower().replace(" ", ""))
                names_all.append(full_name)

    # <= 1 unique normalized name means consistent (or no names found at all).
    names_across_docs_match = len(names_across_docs) <= 1
    if names_across_docs_match:
        cross_docs_name_eq_check = {
            "Policy": "Document Consistency",
            # Bug fix: names_all can be empty (no document carried a name);
            # unconditional names_all[-1] used to raise IndexError here.
            "Value": names_all[-1] if names_all else None,
            "Status": names_across_docs_match,
            "Message": "Applicant's name matches across the uploaded documents",
        }
    else:
        cross_docs_name_eq_check = {
            "Policy": "Document Consistency",
            "Value": names_all,
            "Status": names_across_docs_match,
            "Message": "Applicant's name does not match across the uploaded documents"
        }

    return transformed_validated_data, cross_docs_name_eq_check
def process_uploaded_files(uploaded_files):
    """Persist Streamlit uploads to a temp dir and index them for processing.

    Each upload is written into a fresh temporary directory, then dispatched
    by MIME type: PDFs are rasterized to per-page PNGs via ``process_pdf``,
    images are used as-is, and ZIP archives are expanded with every contained
    PDF/image/CSV/XLSX handled in turn.

    Returns:
        file_paths: flat list of every image path produced.
        file_groups: dict mapping each source file to its image page paths.
        temp_dir: the temporary working directory holding all outputs.
        application_form: DataFrame read from a CSV found inside a ZIP, with
            its field names prefixed (or None if no CSV was present).
        memo: dict of DataFrames sliced from an XLSX found inside a ZIP
            (or None if no XLSX was present).
    """
    file_paths = []
    file_groups = {}
    application_form = None
    memo = None
    temp_dir = tempfile.mkdtemp()
    print("temp_dir", temp_dir)

    for uploaded_file in uploaded_files:
        # Persist the in-memory upload so downstream libraries can read
        # from a real path on disk.
        file_path = os.path.join(temp_dir, uploaded_file.name)
        with open(file_path, "wb") as f:
            f.write(uploaded_file.getbuffer())

        logger.info(
            f"file_path: {file_path}, uploaded_file.type : {uploaded_file.type}")

        if uploaded_file.type == "application/pdf":
            file_groups, file_paths = process_pdf(
                file_path=file_path, file_groups=file_groups, file_paths=file_paths)

        elif uploaded_file.type.startswith("image"):
            # A single image forms its own one-element group.
            file_paths.append(file_path)
            file_groups[file_path] = [file_path]
        elif uploaded_file.type == "application/zip":
            with zipfile.ZipFile(file_path, 'r') as zip_ref:
                extract_dir = os.path.join(
                    temp_dir, uploaded_file.name.replace(".zip", ""))
                print(f"extract_dir : {extract_dir}")
                zip_ref.extractall(extract_dir)
                for root, _, files in os.walk(extract_dir):
                    for file in files:
                        if file.lower().endswith((".pdf")):
                            extracted_path = os.path.join(root, file)
                            file_groups, file_paths = process_pdf(
                                file_path=extracted_path,
                                file_groups=file_groups,
                                file_paths=file_paths)

                        elif file.lower().endswith((".png", ".jpg", ".jpeg")):
                            extracted_path = os.path.join(root, file)
                            file_paths.append(extracted_path)
                            file_groups[extracted_path] = [extracted_path]
                        elif file.lower().endswith((".csv")):
                            # Headerless two-column CSV: column 0 holds field
                            # names, column 1 holds values.
                            extracted_path = os.path.join(root, file)
                            application_form = pd.read_csv(
                                extracted_path, header=None)
                            logger.info(
                                f"application_form: {application_form}")
                            # Prefix field names so they can be merged with
                            # other sources without key collisions.
                            application_form[0] = 'application_summary_' + \
                                application_form[0].str.strip()
                        elif file.lower().endswith((".xlsx")):
                            extracted_path = os.path.join(root, file)
                            # sheet_name=None loads every sheet into a dict
                            # of DataFrames keyed by sheet name.
                            df_dict = pd.read_excel(
                                extracted_path, sheet_name=None, header=None)

                            # logger.info(f"df_dict: {df_dict}")

                            # NOTE(review): the hard-coded sheet names and
                            # row slices below assume one fixed workbook
                            # template — confirm the template before reuse.
                            yellow_df = pd.DataFrame()

                            yellow_df = pd.concat(
                                [yellow_df, df_dict['Sheet1'].iloc[31:32]], axis=0, ignore_index=True)
                            yellow_df = pd.concat(
                                [yellow_df, df_dict['Sheet1'].iloc[33:34]], axis=0, ignore_index=True)
                            yellow_df = pd.concat(
                                [yellow_df, df_dict['Sheet1'].iloc[50:51]], axis=0, ignore_index=True)
                            yellow_df = yellow_df[[0, 1]]

                            blue_df = pd.DataFrame()
                            # Deposit details
                            blue_df = pd.concat(
                                [blue_df, df_dict['Sheet6'].iloc[44:47]], axis=0, ignore_index=True)
                            # memo = pd.concat([memo, df['Sheet6'].iloc[50:51]], axis=0, ignore_index=True)
                            blue_df = blue_df[[0, 1]]

                            green_df = pd.DataFrame()
                            # Monthly costs for both applicants
                            green_df = pd.concat(
                                [green_df, df_dict['Sheet7'].iloc[5:23]], axis=0, ignore_index=True)
                            green_df = green_df[[0, 1]]

                            memo = {
                                "Mortgage Details": yellow_df,
                                "Deposit details": blue_df,
                                "Monthly costs for both applicants": green_df,
                            }

                            logger.info(f"memo : {memo}")

                        else:
                            # Unknown extension: pass the file through untouched.
                            extracted_path = os.path.join(root, file)
                            file_paths.append(extracted_path)
                            file_groups[extracted_path] = [extracted_path]

    print(f"file_groups : {file_groups}")

    return file_paths, file_groups, temp_dir, application_form, memo
+ } + st.session_state['current_upload'] = upload_id + else: + current = st.session_state.get('current_upload') + if current: + st.session_state['uploads'][current]['values_raw'] = None + st.session_state['uploads'][current]['values_display'] = None + st.session_state['uploads'][current]['results_transformed'] = None + # st.session_state['uploads'][current]['application_form'] = None + # st.session_state['uploads'][current]['temp_dir'] = None diff --git a/utils/tabs/__init__.py b/utils/tabs/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/utils/tabs/__pycache__/__init__.cpython-313.pyc b/utils/tabs/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6b21e9f81b42c7498959d819637ddf66f32068a9 Binary files /dev/null and b/utils/tabs/__pycache__/__init__.cpython-313.pyc differ diff --git a/utils/tabs/__pycache__/demo_validations.cpython-313.pyc b/utils/tabs/__pycache__/demo_validations.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f6f92cfe68dd22489694c1fdc0cae2ca331f6c08 Binary files /dev/null and b/utils/tabs/__pycache__/demo_validations.cpython-313.pyc differ diff --git a/utils/tabs/__pycache__/document_upload_tab.cpython-313.pyc b/utils/tabs/__pycache__/document_upload_tab.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cf185c9f414f422cec70389c8054b1db43408202 Binary files /dev/null and b/utils/tabs/__pycache__/document_upload_tab.cpython-313.pyc differ diff --git a/utils/tabs/__pycache__/document_validation_tab.cpython-313.pyc b/utils/tabs/__pycache__/document_validation_tab.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a0db1feffa1038edf6ffb60bf99a8f95ca3a00d6 Binary files /dev/null and b/utils/tabs/__pycache__/document_validation_tab.cpython-313.pyc differ diff --git a/utils/tabs/__pycache__/memo.cpython-313.pyc 
b/utils/tabs/__pycache__/memo.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6642b31206504c5d74741898d8b97ee0b626189e Binary files /dev/null and b/utils/tabs/__pycache__/memo.cpython-313.pyc differ diff --git a/utils/tabs/demo_validations.py b/utils/tabs/demo_validations.py new file mode 100644 index 0000000000000000000000000000000000000000..989032a0146a9eda24dc2f38f9ec89ed1e14f3a8 --- /dev/null +++ b/utils/tabs/demo_validations.py @@ -0,0 +1,431 @@ +import streamlit as st +import pandas as pd + + +def display_demo_validations(): + st.header("Policies") + # demo_validations = [ + # # { + # # "Document Type": "Passport", + # # "Validation": "Full name must be present", + # # "Raises Red Flag": True, + # # # "Error Message": "Applicant's full name not present", + # # }, + # # { + # # "Document Type": "Passport", + # # "Validation": "Full name must have length between 2 & 61", + # # "Raises Red Flag": True, + # # # "Error Message": "Full name must have a length of at least 2 & at most 61", + # # }, + # # { + # # "Document Type": "Passport", + # # "Validation": "Full name must have at least two words", + # # "Raises Red Flag": True, + # # # "Error Message": "Full name must consist of at least 2 words (first name + last name)", + # # }, + # { + # "Document Type": "Passport", + # "Validation": ( + # "Full name must be present. " + # "Full name must have length between 2 & 61. " + # "Full name must have at least two words." + # ), + # # "Raises Red Flag": True, + # # "Error Message": "Applicant's full name not present", + # }, + # { + # "Document Type": "Passport", + # "Validation": "Expiry date must be present & after a year from current date", + # # "Raises Red Flag": True, + # # "Error Message": "Provided passport expires within 1 year", + # }, + # { + # "Document Type": "Payslip", + # "Validation": ( + # "Full name must be present. " + # "Full name must have length between 2 & 61. " + # "Full name must have at least two words." 
+ # ), + # # "Raises Red Flag": True, + # # "Error Message": "Applicant's full name not present", + # }, + # { + # "Document Type": "Payslip", + # "Validation": "Employer name must be present", + # # "Raises Red Flag": True, + # # "Error Message": "Employer name not present", + # }, + # { + # "Document Type": "Payslip", + # "Validation": "Employer name must have at least alphabet", + # # "Raises Red Flag": True, + # # "Error Message": "Employer name must contain at least one letter", + # }, + # { + # "Document Type": "Payslip", + # "Validation": "Employer name cannot be only whitespace", + # # "Raises Red Flag": True, + # # "Error Message": "Employer name cannot be only whitespace", + # }, + # { + # "Document Type": "Payslip", + # "Validation": "Employer name must match the provided value", + # # "Raises Red Flag": True, + # # "Error Message": "Employer name mismatch with provided value", + # }, + # { + # "Document Type": "Payslip", + # "Validation": ( + # "Pay period start & dates must be present.\n" + # "Pay period start date cannot be on or after the end date.\n" + # "Pay period's end date must be within the last 35 days & not in the future.\n" + # "Pay period's date(s) must not be older than those of the last calendar month.\n" + # "Pay period's start date & end date must have a gap of at least 28 days." + # ), + # # "Raises Red Flag": True, + # # "Error Message": "Employer name mismatch with provided value", + # }, + # { + # "Document Type": "Payslip", + # "Validation": ( + # "Basic salary, Net Salary and/or other requisite salary components must be present. " + # "Tax Deduction line item must be present. " + # "NI/National Insurance line item must be present." + # ), + # # "Raises Red Flag": True, + # }, + # { + # "Document Type": "Payslip", + # "Validation": ( + # "Applicant's address must be present. " + # "Applicant's complete address must have a length of at least 10 & at most 300. " + # "Complete address must match with provided value. 
" + # ), + # # "Raises Red Flag": True, + # }, + # # { + # # "Document Type": "Payslip", + # # "Validation": "Employee number must be greater than 25", + # # "Raises Red Flag": True, + # # }, + # { + # "Document Type": "Digital Bank Account Statement", + # "Validation": ( + # "Full name must be present. " + # "Full name must have length between 2 & 61. " + # "Full name must have at least two words." + # ), + # # "Raises Red Flag": True, + # # "Error Message": "Applicant's full name not present", + # }, + # { + # "Document Type": "Digital Bank Account Statement", + # "Validation": ( + # "Bank name must be present. " + # "Bank name must have length between 4 & 50. " + # "Bank Name must match provided value." + # ), + # # "Raises Red Flag": True, + # # "Error Message": "Applicant's full name not present", + # }, + # { + # "Document Type": "Digital Bank Account Statement", + # "Validation": ( + # "Bank account number must be present. " + # "Bank account number must be of 8 digits only. " + # ), + # # "Raises Red Flag": True, + # # "Error Message": "Applicant's full name not present", + # }, + # { + # "Document Type": "Digital Bank Account Statement", + # "Validation": ( + # "Sort number must be present. " + # "It must be of the format xx-xx-xx wherein x are digits. " + # ), + # # "Raises Red Flag": True, + # # "Error Message": "Applicant's full name not present", + # }, + # { + # "Document Type": "Digital Bank Account Statement", + # "Validation": ( + # "Both statement start date & statement end date must be present. " + # "Account statement period's start date & end date must have a gap of at least 28 days. " + # "At least one salary credit must be present. " + # "Statement period's end date must be after the start date. 
" + # ), + # # "Raises Red Flag": True, + # # "Error Message": "Applicant's full name not present", + # }, + # ] + + demo_validations = [ + # { + # "Topic / Document Type": "General Guidance", + # "Policy / Rule / Condition": "Income/Employment Docs Risk", + # "Action / Guidance / Requirement": "Be aware of higher risk of manipulation (Payslips, bank statements, Customer name).", + # "Red Flag / Caution": "Higher risk category.", + # "Notes / Details": "", + # }, + { + "Topic / Document Type": "General Guidance", + "Policy / Rule / Condition": "Document Consistency", + # "Action / Guidance / Requirement": "Compare information across all documents (e.g., payslips vs bank statements) to ensure consistency.", + "Action / Guidance / Requirement": "Compare applicant's full name across all documents to ensure consistency.", + # "Red Flag / Caution": "Inconsistencies require investigation.", + # "Notes / Details": "", + }, + # { + # "Topic / Document Type": "General Guidance", + # "Policy / Rule / Condition": "Payslip YTD Check", + # "Action / Guidance / Requirement": "Do Year-to-Date figures (gross income, tax) make sense?", + # "Red Flag / Caution": "If figures don’t make sense, investigate.", + # "Notes / Details": "", + # }, + # { + # "Topic / Document Type": "General Guidance", + # "Policy / Rule / Condition": "Payslip Details Check", + # "Action / Guidance / Requirement": "Check for low employee numbers, rounded figures, differences in payment methods (e.g., payslip says BACS, statement shows Faster Payment).", + # "Red Flag / Caution": "These can be red flags requiring investigation.", + # "Notes / Details": "", + # }, + { + "Topic / Document Type": "General Guidance", + "Policy / Rule / Condition": "Overall Validation", + "Action / Guidance / Requirement": "Ensure document is genuine, not fraudulent, belongs to the customer, and is from the expected source.", + # "Red Flag / Caution": "Any doubt may indicate fraud.", + # "Notes / Details": "Applies to all 
documents.", + }, + { + "Topic / Document Type": "Passport", + "Policy / Rule / Condition": "Full Name", + "Action / Guidance / Requirement": ( + "Full name must be present. " + "Full name must have length between 2 & 61. " + "Full name must have at least two words." + ), + # "Raises Red Flag": True, + # "Error Message": "Applicant's full name not present", + }, + { + "Topic / Document Type": "Passport", + "Policy / Rule / Condition": "Expiry Date", + "Action / Guidance / Requirement": "Expiry date must be present & after a year from current date", + # "Raises Red Flag": True, + # "Error Message": "Provided passport expires within 1 year", + }, + { + "Topic / Document Type": "Payslips", + "Policy / Rule / Condition": "Employer & Customer Names", + "Action / Guidance / Requirement": "Must include correct Employer’s and Customer’s names.", + # "Red Flag / Caution": "Missing or incorrect names.", + # "Notes / Details": "Cross-reference with BMM/HOME.", + }, + { + "Topic / Document Type": "Payslips", + "Policy / Rule / Condition": "Submission Requirement (Monthly Pay)", + "Action / Guidance / Requirement": "Minimum one month's most recent payslip required.", + # "Red Flag / Caution": "", + # "Notes / Details": "", + }, + { + "Topic / Document Type": "Payslips", + "Policy / Rule / Condition": "Pay Date Requirement", + # "Action / Guidance / Requirement": "Pay date must be within 35 days of FCD (Final Completion Date).", + "Action / Guidance / Requirement": "Pay date must be within 35 days of document upload date.", + # "Red Flag / Caution": "Pay date older than 35 days from FCD.", + # "Notes / Details": "", + }, + { + "Topic / Document Type": "Payslips", + "Policy / Rule / Condition": "Pay Period End Date (DD/MM/YYYY, if no pay date)", + # "Action / Guidance / Requirement": "Period end date must be within 35 days of FCD.", + "Action / Guidance / Requirement": "Period end date must be within 35 days of document upload date.", + # "Red Flag / Caution": "Period end date 
older than 35 days from FCD.", + # "Notes / Details": "", + }, + { + "Topic / Document Type": "Payslips", + # "Policy / Rule / Condition": "Pay Period Month (MM/YYYY, if no pay date)", + "Policy / Rule / Condition": "Pay Period Month (MM/YYYY, if no pay date) basis pay period duration", + "Action / Guidance / Requirement": "Payslips dated in the current or previous calendar month are acceptable (must be the most recent).", + # "Red Flag / Caution": "Older than previous calendar month.", + # "Notes / Details": "", + }, + { + "Topic / Document Type": "Payslips", + "Policy / Rule / Condition": "Undated Payslips", + "Action / Guidance / Requirement": "Unacceptable.", + # "Red Flag / Caution": "Undated payslip received.", + # "Notes / Details": "Request a dated version.", + }, + { # custom + "Topic / Document Type": "Payslips", + "Policy / Rule / Condition": "Pay Period Start & End Dates", + # "Action / Guidance / Requirement": "Pay date must be within 35 days of FCD (Final Completion Date).", + "Action / Guidance / Requirement": "Pay date must be within 35 days of document upload date.", + # "Red Flag / Caution": "Pay date older than 35 days from FCD.", + # "Notes / Details": "", + }, + { # custom + "Topic / Document Type": "Payslips", + "Policy / Rule / Condition": "Requisite salary line items", + "Action / Guidance / Requirement": "Basic salary, Net Salary and/or other requisite salary components must be present", + # "Red Flag / Caution": "Missing or nonsensical Tax/NI figures.", + # "Notes / Details": "", + }, + { + "Topic / Document Type": "Payslips", + "Policy / Rule / Condition": "Tax & NI Contributions", + "Action / Guidance / Requirement": "Must be visible. 
Perform a sense check.", + # "Red Flag / Caution": "Missing or nonsensical Tax/NI figures.", + # "Notes / Details": "", + }, + # custom + { + "Topic / Document Type": "Payslips", + "Policy / Rule / Condition": "Applicant Address", + "Action / Guidance / Requirement": ( + "Applicant's address must be present. " + "Applicant's complete address must have a length of at least 10 & at most 300. " + "Complete address must match with provided value. " + ), + # "Red Flag / Caution": "Missing or nonsensical Tax/NI figures.", + # "Notes / Details": "", + }, + # { + # "Topic / Document Type": "Payslips", + # "Policy / Rule / Condition": "YTD Figures Match", + # "Action / Guidance / Requirement": "Verify YTD figures match declared income.", + # "Red Flag / Caution": "YTD figures do not match declared income.", + # "Notes / Details": "Add to YMI/FDM memo if they do not match.", + # }, + # { + # "Topic / Document Type": "Payslips", + # "Policy / Rule / Condition": "Pension Income (on Payslip)", + # "Action / Guidance / Requirement": "Must show within the last 35 days / be the most recent.", + # "Red Flag / Caution": "Pension income shown is dated >35 days ago.", + # "Notes / Details": "Alternatively use pension annual statement/latest P60. Cross-reference with bank statement if possible.", + # }, + # { + # "Topic / Document Type": "Payslips", + # "Policy / Rule / Condition": "Joint Applicants", + # "Action / Guidance / Requirement": "Required if applicable.", + # "Red Flag / Caution": "Missing payslip for a joint applicant.", + # "Notes / Details": "", + # }, + # { + # "Topic / Document Type": "Payslips", + # "Policy / Rule / Condition": "Payslip Red Flags", + # "Action / Guidance / Requirement": "", + # "Red Flag / Caution": "Rounded figures. Low employee/payroll number. 
Presence of these flags.", + # "Notes / Details": "Investigate further.", + # }, + # { + # "Topic / Document Type": "Payslips", + # "Policy / Rule / Condition": "Payslip Verification (HOME)", + # "Action / Guidance / Requirement": "Check information in HOME against payslip details (employer name, customer name, etc.).", + # "Red Flag / Caution": "Mismatches found (e.g., misspellings, missing words).", + # "Notes / Details": "Correct HOME after consulting customer. If correction not possible (e.g., space), add YMI/FDM memo explaining.", + # }, + # { + # "Topic / Document Type": "Payslips", + # "Policy / Rule / Condition": "Payslip Near 35-Day Limit", + # "Action / Guidance / Requirement": "If payslip is close to the 35-day limit and no decision is obtained.", + # "Red Flag / Caution": "Decision pending, payslip nearing expiry.", + # "Notes / Details": "Another, more recent payslip may be required.", + # }, + # { + # "Topic / Document Type": "Digital Bank Stmts", + # "Policy / Rule / Condition": "Purpose", + # "Action / Guidance / Requirement": "Used to confirm income/expenditure.", + # "Red Flag / Caution": "", + # "Notes / Details": "Cannot be used for ID & VA confirmation.", + # }, + { + "Topic / Document Type": "Digital Bank Stmts", + "Policy / Rule / Condition": "Coverage", + # "Action / Guidance / Requirement": "Must cover a full calendar month (vs 28 days for original).", + "Action / Guidance / Requirement": "Account statement period's start date & end date must have a gap of at least 28 days.", + # "Red Flag / Caution": "", + # "Notes / Details": "", + }, + { # Custom + "Topic / Document Type": "Digital Bank Stmts", + "Policy / Rule / Condition": "Full Name", + "Action / Guidance / Requirement": ( + "Full name must be present. " + "Full name must have length between 2 & 61. " + "Full name must have at least two words." 
+ ), + # "Raises Red Flag": True, + # "Error Message": "Applicant's full name not present", + }, + { + "Topic / Document Type": "Digital Bank Stmts", + "Policy / Rule / Condition": "Data Match", + "Action / Guidance / Requirement": "Customer data on statement must match profile.", + # "Red Flag / Caution": "Data mismatch.", + # "Notes / Details": "", + }, + # { + # "Topic / Document Type": "Digital Bank Stmts", + # "Policy / Rule / Condition": "Pay Info Match", + # "Action / Guidance / Requirement": "Verify pay information matches the payslip.", + # "Red Flag / Caution": "Pay info mismatch vs payslip.", + # "Notes / Details": "", + # }, + { + "Topic / Document Type": "Digital Bank Stmts", + "Policy / Rule / Condition": "Authenticity Doubt", + "Action / Guidance / Requirement": "If any doubt regarding authenticity.", + # "Red Flag / Caution": "Suspected non-genuine digital statement.", + # "Notes / Details": "Cases may be referred to Fraud.", + }, + { # Custom + "Topic / Document Type": "Digital Bank Stmts", + "Policy / Rule / Condition": "Salary deposit", + "Action / Guidance / Requirement": "At least one salary credit must be present", + # "Red Flag / Caution": "Data mismatch.", + # "Notes / Details": "", + }, + { + "Topic / Document Type": "Digital Bank Stmts", + "Policy / Rule / Condition": "Bank name", + "Action / Guidance / Requirement": ( + "Bank name must be present. " + "Bank name must have length between 4 & 50. " + # "Bank Name must match provided value." + ), + }, + { + "Topic / Document Type": "Digital Bank Stmts", + "Policy / Rule / Condition": "Bank account number", + "Action / Guidance / Requirement": ( + "Bank account number must be present. " + "Bank account number must be of 8 digits only. " + ), + }, + { + "Topic / Document Type": "Digital Bank Stmts", + "Policy / Rule / Condition": "Sort code", + "Action / Guidance / Requirement": ( + "Sort number must be present. " + "It must be of the format xx-xx-xx wherein x are digits. 
" + ), + }, + { + "Topic / Document Type": "Digital Bank Stmts", + "Policy / Rule / Condition": "Date checks", + "Action / Guidance / Requirement": ( + "Both statement start date & statement end date must be present. " + "At least one salary credit must be present. " + "Statement period's end date must be after the start date. " + ), + }, + ] + + demo_validations_df = pd.DataFrame(demo_validations) + demo_validations_df.index += 1 + # st.dataframe(demo_validations_df) + st.table(demo_validations_df) diff --git a/utils/tabs/document_upload_tab.py b/utils/tabs/document_upload_tab.py new file mode 100644 index 0000000000000000000000000000000000000000..f0c967481699d9ad43a54c727481697620d37075 --- /dev/null +++ b/utils/tabs/document_upload_tab.py @@ -0,0 +1,84 @@ +import streamlit as st +import uuid +from datetime import datetime +from utils.session_state import reset_state +from utils.process_files import process_uploaded_files +from utils.document_display import display_based_on_card +from llm.document_analyzer import analyze_files +import json +from utils.logger import setup_logger + +logger = setup_logger(__name__) + + +def upload_documents(): + uploaded_files = st.file_uploader( + "Upload Images, PDFs, or ZIP files", + type=["png", "jpg", "jpeg", "pdf", "zip"], + accept_multiple_files=True, + key="file_uploader" + ) + + if uploaded_files and st.session_state.get('current_upload') is None: + # if uploaded_files: + new_upload_id = str(uuid.uuid4())[:8] # Shorter UUID for readability + new_upload_id = "_".join([str(datetime.now()), new_upload_id]) + reset_state(upload_id=new_upload_id) + st.session_state['analyze_clicked'] = False + + # Process files + file_paths, file_groups, temp_dir, application_form, memo = process_uploaded_files( + uploaded_files) + st.session_state['uploads'][new_upload_id]['file_groups'] = file_groups + st.session_state['uploads'][new_upload_id]['temp_dir'] = temp_dir + st.session_state['uploads'][new_upload_id]['application_form'] = 
application_form + st.session_state['uploads'][new_upload_id]['memo'] = memo + + # Get current upload + current_upload = st.session_state.get('current_upload') + + if current_upload: + current_data = st.session_state['uploads'][current_upload] + file_groups = current_data['file_groups'] + temp_dir = current_data['temp_dir'] + + if file_groups: + + analyze_clicked = st.button("🔍 Analyze") + + if analyze_clicked: + st.session_state['analyze_clicked'] = True + reset_state() # Reset raw & display results only + + # Analysis step: only when user has clicked Analyze + if st.session_state.get('analyze_clicked', False): + + if current_data['values_raw'] is None: + with st.spinner("Analyzing documents..."): + analysis_results_groups, json_output_path = analyze_files( + file_groups=file_groups, + temp_dir=temp_dir, + current_upload=current_upload + ) + current_data['values_raw'] = analysis_results_groups + + if current_data['values_display'] is None: + current_data['values_display'] = {} + + for original_file, extracted_files in file_groups.items(): + display_based_on_card( + original_file=original_file, + analysis_results_for_original_file=current_data['values_raw'][original_file], + extracted_files=extracted_files, + current_upload=current_upload + ) + + st.download_button( + label="Download Analysis JSON", + data=json.dumps( + # current_data['values_raw'], + current_data['results_transformed'], + indent=4), + file_name="analysis_results.json", + mime="application/json" + ) diff --git a/utils/tabs/document_validation_tab.py b/utils/tabs/document_validation_tab.py new file mode 100644 index 0000000000000000000000000000000000000000..eb33ede4eaf6c8fec836ddf03ed90eb027c3cdd9 --- /dev/null +++ b/utils/tabs/document_validation_tab.py @@ -0,0 +1,107 @@ +import streamlit as st +from utils.prep_validators_payload import process_extracted_data +import pandas as pd +from utils.logger import setup_logger +from utils.json_utils import DocumentTypeByCategory, 
def validate_documents(current):
    """Render the "Validation Results" tab for the upload id ``current``.

    Shows three sections:
      1. document-type upload validations (from the transformed results),
      2. per-document validation tables with a ✅/❌ ``Status`` column,
      3. the cross-document full-name consistency ("General Guidance") check.

    Args:
        current: upload id keying into ``st.session_state['uploads']``;
            expected to equal ``st.session_state['current_upload']``.
    """
    st.header("Validation Results")

    # NOTE(review): the original mixed the `current` parameter with the
    # session's 'current_upload' key when indexing st.session_state['uploads']
    # (form via one, results via the other). They are assumed to hold the same
    # id, so the parameter is used consistently here — confirm with callers.
    if current:
        current_data = st.session_state['uploads'][current]

        if current_data['application_form'] is None:
            # Fallback demo application form when none was uploaded.
            custom_app_form = {
                "application_summary_full_name": "Jodie Pippa",
                "application_summary_bank_name": "HSBC",
                "application_summary_employer_name": "ABC Ltd",
                "application_summary_complete_address": "123 Maple Street, London, UK, SW1A 1AA",
                "full_name_err_msgs": None,
                "bank_name_err_msgs": None,
                "employer_name_err_msgs": None,
                "complete_employee_address_err_msgs": None,
                "is_incomplete": False,
            }
        else:
            # Uploaded form arrives as a two-column frame: column 0 = key,
            # column 1 = value.
            custom_app_form = current_data['application_form'].set_index(0)[1].to_dict()

        full_data = current_data['values_raw']
        full_data_transformed = current_data['results_transformed']
        logger.info(f"full_data : {full_data}")

        if full_data_transformed is not None:
            logger.info(f"full_data_transformed : {full_data_transformed}")
            full_data_transformed_categories = extract_document_types_from_transformed(
                full_data_transformed)
            global_validations = DocumentTypeByCategory.model_validate(
                full_data_transformed_categories).to_dataframe()
            st.markdown("## Document Type Upload validations")
            st.table(global_validations)
            st.markdown("---")

        if full_data is not None:
            validations, cross_docs_name_eq_check = process_extracted_data(
                full_data, custom_app_form, full_data_transformed)

            st.markdown("## Document validations")
            for k, v in validations.items():
                st.markdown(f"Validations: {k}")
                if len(v) > 0:
                    df = v[0]['validation_policy_status_df']
                    df.index += 1
                    logger.info(f"df: {df}")
                    df['Status'] = df['Status'].apply(lambda x: '✅' if x else '❌')

                    # st.table renders mixed-type object columns unreliably;
                    # stringify them before display.
                    for col in df.select_dtypes(include='object'):
                        df[col] = df[col].astype(str)

                    st.table(df)
                    st.markdown("---")

            st.markdown("---")
            st.markdown("## General Guidance")
            cross_docs_name_eq_check_df = pd.DataFrame([cross_docs_name_eq_check])
            cross_docs_name_eq_check_df['Status'] = cross_docs_name_eq_check_df['Status'].apply(
                lambda x: '✅' if x else '❌')
            cross_docs_name_eq_check_df.index += 1
            st.table(cross_docs_name_eq_check_df)
def display_memo():
    """Render the "Memo" tab: each memo section as an editable table.

    Reads the memo dict for the session's current upload, renders every
    ``title -> content`` section with ``st.data_editor`` so the user can edit
    values in place, and writes the (possibly edited) memo back into
    ``st.session_state``.
    """
    st.header("Memo")
    current = st.session_state.get('current_upload')
    if current is not None:
        memo = st.session_state['uploads'][current]['memo']
        logger.info(f"Memo: {memo}")

        if memo is not None:
            updated_memo = {}

            for title, content in memo.items():
                st.subheader(f"{title}")
                # Explicit key: without it, Streamlit raises a duplicate
                # element-ID error whenever two sections hold identical
                # content (auto-keys are derived from the widget arguments).
                edited_content = st.data_editor(content, key=f"memo_editor_{title}")
                st.markdown("---")

                updated_memo[title] = edited_content

            st.session_state['uploads'][current]['memo'] = updated_memo