Spaces:
Build error
Build error
import datetime | |
import re | |
from pydantic import ( | |
BaseModel, | |
computed_field, | |
Field, | |
ValidationInfo, | |
model_validator, | |
ConfigDict | |
) | |
import pandas as pd | |
# Schema for the values extracted from a UK payslip. Besides the extracted
# values it carries the outcome of the validate_* methods: one error-message
# string per check group and a DataFrame with one row per policy check.
class UKPayslipSchema(BaseModel):
    # pd.DataFrame is not a pydantic-native type, so arbitrary types must be
    # allowed for validation_policy_status_df declared further down.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    # --- Values extracted from the payslip (every one defaults to None/False) ---
    pay_period_start_date: datetime.date | None = Field(
        default=None,
        description="Pay period's start date in YYYY-MM-DD format",
        examples=["2025-02-01"],
    )
    pay_period_end_date: datetime.date | None = Field(
        default=None,
        description="Pay period's end date in YYYY-MM-DD format",
        examples=["2025-02-28"],
    )
    # Declared for extraction only; none of the validate_* methods reads it.
    pay_period_days: int | None = Field(
        default=None,
        description="pay_period_end_date - pay_period_start_date in days",
        examples=[28],
    )
    pay_date: datetime.date | None = Field(None)
    full_name: str | None = Field(
        default=None,
        description="Applicant's full name. Must consist of at least 2 words, have length gte 2 & lte 61",
        examples=["Jodie Pippa"],
    )
    employer_name: str | None = Field(
        default=None,
        description="Employer name extracted",
        examples=["ABC Ltd"],
    )
    is_basic_pay_net_pay_other_salary_components_present: bool = Field(
        default=False,
        description="Boolean indicating whether Basic Pay, Net Pay, other requisite salary components/line items are present in the payslip",
        examples=[True, False],
    )
    # NOTE: the field name is spelled "deducation"; it is part of the schema,
    # so renaming it would change the model's external interface.
    is_tax_deducation_present: bool = Field(
        default=False,
        description="Boolean flag indicating whether Tax Deduction line item is present in the payslip",
        examples=[True, False],
    )
    is_ni_deduction_present: bool = Field(
        default=False,
        description="Boolean flag indicating whether NI/National Insurance deduction line item is present in the payslip",
        examples=[True, False],
    )
    complete_employee_address: str | None = Field(
        default=None,
        description="Employee's complete address as a string",
        examples=["123 Maple Street, London, UK, SW1A 1AA"],
    )
    # employee_number: int | None = Field(
    #     default=None,
    #     description="Employee number",
    #     examples=[3558, 1234],
    # )

    # --- Validation results ---
    # Each *_err_msgs field is written by the matching validate_* method:
    # a ", "-joined string of failure messages, or None when nothing failed.
    pay_dates_err_msgs: str | None = None
    full_name_err_msgs: str | None = None
    employer_name_err_msgs: str | None = None
    payslip_line_item_presence_err_msgs: str | None = None
    complete_employee_address_err_msgs: str | None = None
    # The validate_* methods append one row per policy check to this frame
    # (policy label, inspected value, pass/fail flag, human-readable message).
    validation_policy_status_df: pd.DataFrame = pd.DataFrame(
        columns=["Policy", "Value", "Status", "Message"])
    # employee_number_err_msgs: str | None = None
    # is_red_flagged: bool = False
def validate_full_name(self, info: ValidationInfo):
    """Validate the applicant's full name and match it against the expected one.

    Four checks run in order, each appending one row to
    ``self.validation_policy_status_df`` under the policy
    "Employer & Customer Names": presence, length (2 to 61 characters),
    word count (at least 2) and a case-insensitive match against the name
    stored in ``info.context`` under ``"application_summary_full_name"``.

    Args:
        info: Object whose ``context`` attribute is either falsy or a mapping
            that may hold the expected name.

    Returns:
        ``self``. As a side effect ``self.full_name_err_msgs`` is set to the
        ", "-joined failure messages, or ``None`` when every check passes.
    """
    policy = "Employer & Customer Names"

    def record(value, passed, message):
        # Append one result row at the next integer label.
        status_df = self.validation_policy_status_df
        status_df.loc[len(status_df)] = [policy, value, passed, message]

    expected = (
        info.context.get("application_summary_full_name")
        if info.context
        else None
    )
    full_name = self.full_name
    err_msgs = []

    # 1. Presence.
    if not full_name:
        err_msgs.append("Applicant's full name not present")
        record(full_name, False, "Applicant's full name is not present")
    else:
        record(full_name, True, "Applicant's full name is present")

    # 2. Length; a missing name counts as length 0.
    name_len = len(full_name) if full_name else 0
    if not (2 <= name_len <= 61):
        err_msgs.append(
            "Full name must have a length of at least 2 & at most 61"
        )
        # The failing row states the negative; it previously repeated the
        # passing message even though Status was False.
        record(
            name_len,
            False,
            "Full name does not have a length of at least 2 & at most 61",
        )
    else:
        record(
            name_len,
            True,
            "Full name has a length of at least 2 & at most 61",
        )

    # 3. Word count. split() without an argument splits on any whitespace run,
    # so tabs, newlines and repeated spaces do not distort the count.
    words = full_name.split() if full_name else []
    if len(words) < 2:
        err_msgs.append(
            "Full name must consist of at least 2 words (first name + last name)"
        )
        record(
            full_name,
            False,
            "Full name does not consist of at least 2 words (first name + last name)",
        )
    else:
        record(
            len(words),
            True,
            "Full name consists of at least 2 words (first name + last name)",
        )

    # 4. Case-insensitive match; a missing expected or extracted name is a mismatch.
    if (
        not expected
        or not full_name
        or full_name.lower() != expected.lower()
    ):
        err_msgs.append("Name mismatch with provided value")
        record(
            f"{full_name}, {expected}",
            False,
            "Name does not match with provided value",
        )
    else:
        record(
            f"{full_name}, {expected}",
            True,
            "Name matches with provided value",
        )

    self.full_name_err_msgs = ", ".join(err_msgs) if err_msgs else None
    return self
def validate_employer_name(self, info: ValidationInfo):
    """Validate the employer name and match it against the expected one.

    Appends two rows to ``self.validation_policy_status_df`` under the policy
    "Employer & Customer Names" (presence, then a case-insensitive match
    against ``info.context["application_summary_employer_name"]``). It also
    runs two content checks that only add error messages: the name must
    contain a letter and must not be whitespace only.

    Args:
        info: Object whose ``context`` attribute is either falsy or a mapping
            that may hold the expected employer name.

    Returns:
        ``self``. As a side effect ``self.employer_name_err_msgs`` is set to
        the ", "-joined failure messages, or ``None`` when nothing failed.
    """
    policy = "Employer & Customer Names"

    def record(value, passed, message):
        # Append one result row at the next integer label.
        status_df = self.validation_policy_status_df
        status_df.loc[len(status_df)] = [policy, value, passed, message]

    expected = (
        info.context.get("application_summary_employer_name")
        if info.context
        else None
    )
    employer_name = self.employer_name
    err_msgs = []

    # 1. Presence.
    if not employer_name:
        err_msgs.append("Employer name not present")
        record(employer_name, False, "Employer name is not present")
    else:
        record(employer_name, True, "Employer name is present")

    # 2. Case-insensitive match; a missing value on either side is a mismatch.
    names_match = bool(
        expected
        and employer_name
        and employer_name.lower() == expected.lower()
    )
    if not names_match:
        err_msgs.append("Employer name mismatch with provided value")
        record(
            f"{employer_name}, {expected}",
            False,
            "Employer name does not match with provided value",
        )
    else:
        record(
            f"{employer_name}, {expected}",
            True,
            "Employer name matches with provided value",
        )

    # 3. Content checks. employer_name defaults to None, and re.search / .strip
    # on None would raise; a missing name was already reported above.
    # NOTE(review): SOURCE indentation was lost; these checks are assumed to sit
    # at the top level of the method rather than inside the match branch - confirm.
    if employer_name is not None:
        if not re.search(r"[A-Za-z]", employer_name):
            err_msgs.append("Employer name must contain at least one letter")
        if employer_name.strip() == "":
            err_msgs.append("Employer name cannot be only whitespace")

    self.employer_name_err_msgs = ", ".join(err_msgs) if err_msgs else None
    return self
def validate_payslip_dates(self):
    """Validate the pay date and the pay-period dates of the payslip.

    Each check appends one row to ``self.validation_policy_status_df``:

    1. "Undated Payslips": both period dates must be present.
    2. Recency. With a pay date: it must lie within the last 35 days and not
       in the future. Without one: the period end date must satisfy the same
       window, and the period must start no earlier than the first day of
       the previous calendar month, start before it ends, and end no later
       than today.
    3. "Pay Period Start & End Dates": start must be before end.
    4. "Submission Requirement (Monthly Pay)": the period must cover at least
       28 days, counted inclusively so that 1 Feb to 28 Feb is 28 days.

    Missing dates make the affected check fail instead of raising a
    ``TypeError`` from a ``date``/``None`` comparison.

    Returns:
        ``self``. As a side effect ``self.pay_dates_err_msgs`` is set to the
        ", "-joined failure messages, or ``None`` when nothing failed.
    """

    def record(policy, value, passed, message):
        # Append one result row at the next integer label.
        status_df = self.validation_policy_status_df
        status_df.loc[len(status_df)] = [policy, value, passed, message]

    err_msgs = []
    today = datetime.date.today()
    threshold_date = today - datetime.timedelta(days=35)
    start = self.pay_period_start_date
    end = self.pay_period_end_date
    period_text = f"{start}, {end}"

    # 1. Both period dates must be present.
    if not start or not end:
        err_msgs.append("Undated Payslips")
        record("Undated Payslips", period_text, False, "Undated payslip")
    else:
        record("Undated Payslips", period_text, True, "Dated payslip")

    # 2. Recency, based on the pay date when there is one.
    if self.pay_date:
        if not (threshold_date <= self.pay_date <= today):
            err_msgs.append(
                "Pay date must be within the last 35 days & not in the future"
            )
            record(
                "Pay Date Requirement",
                self.pay_date,
                False,
                "Pay date is not within the last 35 days & not in the future",
            )
        else:
            record(
                "Pay Date Requirement",
                self.pay_date,
                True,
                "Pay date is within the last 35 days & not in the future",
            )
    else:
        # No pay date: fall back to the period dates. The Value column carries
        # the date(s) actually checked; self.pay_date is always empty here.
        end_policy = "Pay Period End Date (DD/MM/YYYY, if no pay date)"
        if end is None or not (threshold_date <= end <= today):
            err_msgs.append(
                "Pay period's end date must be within the last 35 days & not in the future"
            )
            record(
                end_policy,
                end,
                False,
                "Pay date is not within the last 35 days &/or in the future",
            )
        else:
            record(
                end_policy,
                end,
                True,
                "Pay date is within the last 35 days & not in the future",
            )

        # NOTE(review): SOURCE indentation was lost; this check is assumed to
        # belong to the no-pay-date branch, as its policy label says - confirm.
        month_policy = (
            "Pay Period Month (MM/YYYY, if no pay date) basis pay period duration"
        )
        prev_month_end = today.replace(day=1) - datetime.timedelta(days=1)
        prev_month_start = prev_month_end.replace(day=1)
        if (
            start is None
            or end is None
            or not (prev_month_start <= start and start < end <= today)
        ):
            err_msgs.append(
                "Payslip date(s) must not be older than those of the last calendar month"
            )
            record(
                month_policy,
                period_text,
                False,
                "Payslip date(s) is older than those of the last calendar month",
            )
        else:
            record(
                month_policy,
                period_text,
                True,
                "Payslip date(s) is not older than those of the last calendar month",
            )

    # 3 & 4. Ordering and length of the pay period.
    if start and end:
        if start >= end:
            err_msgs.append(
                "Pay period's start date must be before the end date")
            record(
                "Pay Period Start & End Dates",
                period_text,
                False,
                "Pay period's start date is not before the end date",
            )
        else:
            record(
                "Pay Period Start & End Dates",
                period_text,
                True,
                "Pay period's start date is before the end date",
            )

        # Inclusive day count: both the start and the end day belong to the
        # period, so a full February (1st to 28th) counts as 28 days.
        period_days = (end - start).days + 1
        if period_days < 28:
            err_msgs.append(
                "Pay period's start date & end date must have a gap of at least 28 days"
            )
            record(
                "Submission Requirement (Monthly Pay)",
                period_days,
                False,
                "Pay period's start date & end date donot have a gap of at least 28 days",
            )
        else:
            record(
                "Submission Requirement (Monthly Pay)",
                period_days,
                True,
                "Pay period's start date & end date have a gap of at least 28 days",
            )
    else:
        # Nothing to compare; the missing dates were already reported by the
        # "Undated Payslips" check, so only the status rows are added here.
        record(
            "Pay Period Start & End Dates",
            period_text,
            False,
            "Pay period's start date is not before the end date",
        )
        record(
            "Submission Requirement (Monthly Pay)",
            period_text,
            False,
            "Pay period's start date & end date donot have a gap of at least 28 days",
        )

    self.pay_dates_err_msgs = ", ".join(err_msgs) if err_msgs else None
    return self
def validate_payslip_components_checks(self):
    """Check that the required salary, tax and NI line items were found.

    For each of the three presence flags one row is appended to
    ``self.validation_policy_status_df`` (the flag itself is logged as the
    value), and a failure message is collected when the flag is falsy.

    Returns:
        ``self``. As a side effect
        ``self.payslip_line_item_presence_err_msgs`` is set to the ", "-joined
        failure messages, or ``None`` when all three flags are set.
    """
    # (flag attribute, policy label, error message, failing row message, passing row message)
    checks = (
        (
            "is_basic_pay_net_pay_other_salary_components_present",
            "Requisite salary line items",
            "Basic salary, Net Salary and/or other requisite salary components not present",
            "Basic salary, Net Salary and/or other requisite salary components not present",
            "Basic salary, Net Salary and/or other requisite salary components are present",
        ),
        (
            "is_tax_deducation_present",
            "Tax & NI Contributions",
            "Tax Deduction line item must be present",
            "Tax Deduction line item is not present",
            "Tax Deduction line item is present",
        ),
        (
            "is_ni_deduction_present",
            "Tax & NI Contributions",
            "NI/National Insurance line item must be present",
            "NI/National Insurance line item is not present",
            "NI/National Insurance line item is present",
        ),
    )

    failures = []
    for attr_name, policy, error_text, failed_text, passed_text in checks:
        flag = getattr(self, attr_name)
        if flag:
            row = [policy, flag, True, passed_text]
        else:
            failures.append(error_text)
            row = [policy, flag, False, failed_text]
        status_df = self.validation_policy_status_df
        status_df.loc[len(status_df)] = row

    self.payslip_line_item_presence_err_msgs = (
        ", ".join(failures) if failures else None
    )
    return self
def validate_complete_address(self, info: ValidationInfo):
    """Validate the employee address and match it against the expected one.

    Three checks run in order, each appending one row to
    ``self.validation_policy_status_df`` under the policy "Applicant Address":
    presence, length (10 to 300 characters) and a case-insensitive match
    against ``info.context["application_summary_complete_address"]``.

    Args:
        info: Object whose ``context`` attribute is either falsy or a mapping
            that may hold the expected address.

    Returns:
        ``self``. As a side effect ``self.complete_employee_address_err_msgs``
        is set to the ", "-joined failure messages, or ``None`` when every
        check passes.
    """
    policy = "Applicant Address"

    def add_row(value, passed, message):
        # Append one result row at the next integer label.
        status_df = self.validation_policy_status_df
        status_df.loc[len(status_df)] = [policy, value, passed, message]

    context = info.context
    expected = (
        context.get("application_summary_complete_address") if context else None
    )
    address = self.complete_employee_address
    problems = []

    # Presence.
    if address:
        add_row(address, True, "Applicant's address is present")
    else:
        problems.append("Applicant's address not present")
        add_row(address, False, "Applicant's address not present")

    # Length; a missing address counts as length 0.
    address_len = len(address) if address else 0
    if 10 <= address_len <= 300:
        add_row(
            address_len,
            True,
            "Applicant's complete address has a length of at least 10 & at most 300",
        )
    else:
        problems.append(
            "Applicant's complete address must have a length of at least 10 & at most 300"
        )
        add_row(
            address_len,
            False,
            "Applicant's complete address does not have a length of at least 10 & at most 300",
        )

    # Case-insensitive match; a missing value on either side is a mismatch.
    pair_text = f"{address}, {expected}"
    if expected and address and address.lower() == expected.lower():
        add_row(pair_text, True, "Complete address matches with provided value")
    else:
        problems.append("Complete address mismatch with provided value")
        add_row(pair_text, False, "Complete address mismatch with provided value")

    self.complete_employee_address_err_msgs = (
        ", ".join(problems) if problems else None
    )
    return self
# @model_validator(mode="after") | |
# def validate_employee_number(self): | |
# try: | |
# if self.employee_number and self.employee_number <= 25: | |
# self.complete_employee_address_err_msgs = "Employee number low" | |
# return self | |
# except Exception as e: | |
# raise | |
def is_red_flagged(self) -> bool:
    """Return True when at least one validation step left an error message.

    Looks at the five ``*_err_msgs`` attributes; ``None`` and empty strings
    count as "no error".
    """
    collected_errors = (
        self.pay_dates_err_msgs,
        self.full_name_err_msgs,
        self.employer_name_err_msgs,
        self.payslip_line_item_presence_err_msgs,
        self.complete_employee_address_err_msgs,
        # self.employee_number_err_msgs,
    )
    return any(collected_errors)