import datetime
import re

import pandas as pd
from pydantic import (
    BaseModel,
    ConfigDict,
    Field,
    ValidationInfo,
    computed_field,
    model_validator,
)

# Column layout of the per-model policy audit table.
_POLICY_COLUMNS = ["Policy", "Value", "Status", "Message"]

# Policy labels that are shared by several checks.
_NAMES_POLICY = "Employer & Customer Names"
_ADDRESS_POLICY = "Applicant Address"
_TAX_NI_POLICY = "Tax & NI Contributions"
_LINE_ITEMS_POLICY = "Requisite salary line items"
_UNDATED_POLICY = "Undated Payslips"
_PAY_DATE_POLICY = "Pay Date Requirement"
_PERIOD_END_POLICY = "Pay Period End Date (DD/MM/YYYY, if no pay date)"
_PERIOD_MONTH_POLICY = (
    "Pay Period Month (MM/YYYY, if no pay date) basis pay period duration"
)
_PERIOD_ORDER_POLICY = "Pay Period Start & End Dates"
_MONTHLY_POLICY = "Submission Requirement (Monthly Pay)"

_HAS_LETTER = re.compile(r"[A-Za-z]")


def _join_errors(err_msgs: list[str]) -> str | None:
    """Return the messages joined with ", ", or None when there are none."""
    return ", ".join(err_msgs) if err_msgs else None


class UKPayslipSchema(BaseModel):
    """Fields extracted from a UK payslip plus the policy checks run on them.

    Every ``@model_validator`` below runs after field validation. Policy
    failures are *recorded*, not raised: each check appends one row to
    ``validation_policy_status_df`` and failing checks also contribute to
    the matching ``*_err_msgs`` field. ``is_red_flagged`` is true when any
    ``*_err_msgs`` field is set.

    Expected values are read from the validation context under the keys
    ``application_summary_full_name``, ``application_summary_employer_name``
    and ``application_summary_complete_address``; a missing context or key
    makes the corresponding match check fail.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    pay_period_start_date: datetime.date | None = Field(
        default=None,
        description="Pay period's start date in YYYY-MM-DD format",
        examples=["2025-02-01"],
    )
    pay_period_end_date: datetime.date | None = Field(
        default=None,
        description="Pay period's end date in YYYY-MM-DD format",
        examples=["2025-02-28"],
    )
    pay_period_days: int | None = Field(
        default=None,
        description="pay_period_end_date - pay_period_start_date in days",
        examples=[28],
    )
    pay_date: datetime.date | None = Field(None)
    full_name: str | None = Field(
        default=None,
        description="Applicant's full name. Must consist of at least 2 words, have length gte 2 & lte 61",
        examples=["Jodie Pippa"],
    )
    employer_name: str | None = Field(
        default=None,
        description="Employer name extracted",
        examples=["ABC Ltd"],
    )
    is_basic_pay_net_pay_other_salary_components_present: bool = Field(
        default=False,
        description="Boolean indicating whether Basic Pay, Net Pay, other requisite salary components/line items are present in the payslip",
        examples=[True, False],
    )
    is_tax_deducation_present: bool = Field(
        default=False,
        description="Boolean flag indicating whether Tax Deduction line item is present in the payslip",
        examples=[True, False],
    )
    is_ni_deduction_present: bool = Field(
        default=False,
        description="Boolean flag indicating whether NI/National Insurance deduction line item is present in the payslip",
        examples=[True, False],
    )
    complete_employee_address: str | None = Field(
        default=None,
        description="Employee's complete address as a string",
        examples=["123 Maple Street, London, UK, SW1A 1AA"],
    )
    # employee_number: int | None = Field(
    #     default=None,
    #     description="Employee number",
    #     examples=[3558, 1234],
    # )

    # Outputs of the validators: comma-joined failure messages per area,
    # or None when every check in that area passed.
    pay_dates_err_msgs: str | None = None
    full_name_err_msgs: str | None = None
    employer_name_err_msgs: str | None = None
    payslip_line_item_presence_err_msgs: str | None = None
    complete_employee_address_err_msgs: str | None = None

    # One row per executed check. Built by a factory so each model instance
    # starts from its own empty table.
    validation_policy_status_df: pd.DataFrame = Field(
        default_factory=lambda: pd.DataFrame(columns=_POLICY_COLUMNS)
    )
    # employee_number_err_msgs: str | None = None
    # is_red_flagged: bool = False

    def _record_policy(
        self, policy: str, value: object, passed: bool, message: str
    ) -> None:
        """Append one ``[policy, value, passed, message]`` row to the audit table."""
        table = self.validation_policy_status_df
        table.loc[len(table)] = [policy, value, passed, message]

    @model_validator(mode="after")
    def validate_full_name(self, info: ValidationInfo) -> "UKPayslipSchema":
        """Check presence, length, word count and context match of ``full_name``.

        The match against ``application_summary_full_name`` from the
        validation context is case-insensitive. Results are written to
        ``full_name_err_msgs`` and the audit table.
        """
        err_msgs: list[str] = []
        expected = (
            info.context.get("application_summary_full_name")
            if info.context
            else None
        )
        full_name = self.full_name

        if not full_name:
            err_msgs.append("Applicant's full name not present")
            self._record_policy(
                _NAMES_POLICY, full_name, False,
                "Applicant's full name is not present",
            )
        else:
            self._record_policy(
                _NAMES_POLICY, full_name, True,
                "Applicant's full name is present",
            )

        name_length = len(full_name) if full_name else 0
        if not (2 <= name_length <= 61):
            err_msgs.append(
                "Full name must have a length of at least 2 & at most 61"
            )
            # The failing row previously reused the passing message.
            self._record_policy(
                _NAMES_POLICY, name_length, False,
                "Full name does not have a length of at least 2 & at most 61",
            )
        else:
            self._record_policy(
                _NAMES_POLICY, name_length, True,
                "Full name has a length of at least 2 & at most 61",
            )

        # split() with no argument collapses runs of whitespace, so
        # "John  Doe" counts as two words and "John\tDoe" is accepted.
        word_count = len(full_name.split()) if full_name else 0
        if word_count < 2:
            err_msgs.append(
                "Full name must consist of at least 2 words (first name + last name)"
            )
            self._record_policy(
                _NAMES_POLICY, full_name, False,
                "Full name does not consist of at least 2 words (first name + last name)",
            )
        else:
            self._record_policy(
                _NAMES_POLICY, word_count, True,
                "Full name consists of at least 2 words (first name + last name)",
            )

        compared = f"{full_name}, {expected}"
        if not expected or not full_name or full_name.lower() != expected.lower():
            err_msgs.append("Name mismatch with provided value")
            self._record_policy(
                _NAMES_POLICY, compared, False,
                "Name does not match with provided value",
            )
        else:
            self._record_policy(
                _NAMES_POLICY, compared, True,
                "Name matches with provided value",
            )

        self.full_name_err_msgs = _join_errors(err_msgs)
        return self

    @model_validator(mode="after")
    def validate_employer_name(self, info: ValidationInfo) -> "UKPayslipSchema":
        """Check presence, context match and basic content of ``employer_name``.

        The match against ``application_summary_employer_name`` from the
        validation context is case-insensitive. Results are written to
        ``employer_name_err_msgs`` and the audit table.
        """
        err_msgs: list[str] = []
        expected = (
            info.context.get("application_summary_employer_name")
            if info.context
            else None
        )
        employer_name_val = self.employer_name

        if not employer_name_val:
            err_msgs.append("Employer name not present")
            self._record_policy(
                _NAMES_POLICY, employer_name_val, False,
                "Employer name is not present",
            )
        else:
            self._record_policy(
                _NAMES_POLICY, employer_name_val, True,
                "Employer name is present",
            )

        is_employer_name_match = bool(
            expected
            and employer_name_val
            and employer_name_val.lower() == expected.lower()
        )
        compared = f"{employer_name_val}, {expected}"
        if not is_employer_name_match:
            err_msgs.append("Employer name mismatch with provided value")
            self._record_policy(
                _NAMES_POLICY, compared, False,
                "Employer name does not match with provided value",
            )
        else:
            self._record_policy(
                _NAMES_POLICY, compared, True,
                "Employer name matches with provided value",
            )

        # # Allowed: letters, numbers, spaces, and common name punctuation
        # pattern = r"^[A-Za-z0-9&\-,.()'/@ ]{2,100}$"
        # if not re.match(pattern, employer_name_val):
        #     err_msgs.append("Employer name contains invalid characters")

        # Content checks need a string; a missing (None) name is already
        # reported above and previously crashed here with a TypeError.
        if employer_name_val is not None:
            if not _HAS_LETTER.search(employer_name_val):
                err_msgs.append("Employer name must contain at least one letter")
            if employer_name_val.strip() == "":
                err_msgs.append("Employer name cannot be only whitespace")

        self.employer_name_err_msgs = _join_errors(err_msgs)
        return self

    @model_validator(mode="after")
    def validate_payslip_dates(self) -> "UKPayslipSchema":
        """Check the payslip's pay date / pay period against the date policies.

        When ``pay_date`` is set it must fall within the last 35 days and not
        be in the future. Otherwise the pay period is used instead: its end
        date must satisfy the same window, and the period must start no
        earlier than the first day of the previous calendar month. Missing
        period dates are recorded as failures instead of raising. Results are
        written to ``pay_dates_err_msgs`` and the audit table.
        """
        err_msgs: list[str] = []
        today = datetime.date.today()
        threshold_date = today - datetime.timedelta(days=35)

        start = self.pay_period_start_date
        end = self.pay_period_end_date
        has_both_dates = start is not None and end is not None
        period_repr = f"{start}, {end}"

        if not has_both_dates:
            err_msgs.append("Undated Payslips")
            self._record_policy(
                _UNDATED_POLICY, period_repr, False, "Undated payslip"
            )
        else:
            self._record_policy(
                _UNDATED_POLICY, period_repr, True, "Dated payslip"
            )

        if self.pay_date:
            if not (threshold_date <= self.pay_date <= today):
                err_msgs.append(
                    "Pay date must be within the last 35 days & not in the future"
                )
                self._record_policy(
                    _PAY_DATE_POLICY, self.pay_date, False,
                    "Pay date is not within the last 35 days & not in the future",
                )
            else:
                self._record_policy(
                    _PAY_DATE_POLICY, self.pay_date, True,
                    "Pay date is within the last 35 days & not in the future",
                )
        else:
            # No pay date: fall back to the pay period. A missing date counts
            # as a failed check (comparing None with a date would raise).
            end_is_recent = end is not None and threshold_date <= end <= today
            if not end_is_recent:
                err_msgs.append(
                    "Pay period's end date must be within the last 35 days & not in the future"
                )
                self._record_policy(
                    _PERIOD_END_POLICY, end, False,
                    "Pay date is not within the last 35 days &/or in the future",
                )
            else:
                self._record_policy(
                    _PERIOD_END_POLICY, end, True,
                    "Pay date is within the last 35 days & not in the future",
                )

            prev_month_end = today.replace(day=1) - datetime.timedelta(days=1)
            prev_month_start = prev_month_end.replace(day=1)
            period_is_current = (
                has_both_dates
                and prev_month_start <= start
                and start < end <= today
            )
            if not period_is_current:
                err_msgs.append(
                    "Payslip date(s) must not be older than those of the last calendar month"
                )
                self._record_policy(
                    _PERIOD_MONTH_POLICY, period_repr, False,
                    "Payslip date(s) is older than those of the last calendar month",
                )
            else:
                self._record_policy(
                    _PERIOD_MONTH_POLICY, period_repr, True,
                    "Payslip date(s) is not older than those of the last calendar month",
                )

        if has_both_dates:
            if start >= end:
                err_msgs.append(
                    "Pay period's start date must be before the end date"
                )
                self._record_policy(
                    _PERIOD_ORDER_POLICY, period_repr, False,
                    "Pay period's start date is not before the end date",
                )
            else:
                self._record_policy(
                    _PERIOD_ORDER_POLICY, period_repr, True,
                    "Pay period's start date is before the end date",
                )

            # NOTE(review): this is an exclusive difference, so a period of
            # 2025-02-01..2025-02-28 yields 27 and fails the rule — confirm
            # whether an inclusive day count is intended.
            gap_days = (end - start).days
            if gap_days < 28:
                err_msgs.append(
                    "Pay period's start date & end date must have a gap of at least 28 days"
                )
                self._record_policy(
                    _MONTHLY_POLICY, gap_days, False,
                    "Pay period's start date & end date donot have a gap of at least 28 days",
                )
            else:
                self._record_policy(
                    _MONTHLY_POLICY, gap_days, True,
                    "Pay period's start date & end date have a gap of at least 28 days",
                )
        else:
            # Already reported as "Undated Payslips"; only log the rows.
            self._record_policy(
                _PERIOD_ORDER_POLICY, period_repr, False,
                "Pay period's start date is not before the end date",
            )
            self._record_policy(
                _MONTHLY_POLICY, period_repr, False,
                "Pay period's start date & end date donot have a gap of at least 28 days",
            )

        self.pay_dates_err_msgs = _join_errors(err_msgs)
        return self

    @model_validator(mode="after")
    def validate_payslip_components_checks(self) -> "UKPayslipSchema":
        """Check that the salary, tax and NI line-item flags are all set.

        Results are written to ``payslip_line_item_presence_err_msgs`` and
        the audit table.
        """
        err_msgs: list[str] = []

        has_salary_items = self.is_basic_pay_net_pay_other_salary_components_present
        if not has_salary_items:
            err_msgs.append(
                "Basic salary, Net Salary and/or other requisite salary components not present"
            )
            self._record_policy(
                _LINE_ITEMS_POLICY, has_salary_items, False,
                "Basic salary, Net Salary and/or other requisite salary components not present",
            )
        else:
            self._record_policy(
                _LINE_ITEMS_POLICY, has_salary_items, True,
                "Basic salary, Net Salary and/or other requisite salary components are present",
            )

        if not self.is_tax_deducation_present:
            err_msgs.append("Tax Deduction line item must be present")
            self._record_policy(
                _TAX_NI_POLICY, self.is_tax_deducation_present, False,
                "Tax Deduction line item is not present",
            )
        else:
            self._record_policy(
                _TAX_NI_POLICY, self.is_tax_deducation_present, True,
                "Tax Deduction line item is present",
            )

        if not self.is_ni_deduction_present:
            err_msgs.append("NI/National Insurance line item must be present")
            self._record_policy(
                _TAX_NI_POLICY, self.is_ni_deduction_present, False,
                "NI/National Insurance line item is not present",
            )
        else:
            self._record_policy(
                _TAX_NI_POLICY, self.is_ni_deduction_present, True,
                "NI/National Insurance line item is present",
            )

        self.payslip_line_item_presence_err_msgs = _join_errors(err_msgs)
        return self

    @model_validator(mode="after")
    def validate_complete_address(self, info: ValidationInfo) -> "UKPayslipSchema":
        """Check presence, length and context match of the employee address.

        The match against ``application_summary_complete_address`` from the
        validation context is case-insensitive. Results are written to
        ``complete_employee_address_err_msgs`` and the audit table.
        """
        err_msgs: list[str] = []
        expected = (
            info.context.get("application_summary_complete_address")
            if info.context
            else None
        )
        val = self.complete_employee_address

        if not val:
            err_msgs.append("Applicant's address not present")
            self._record_policy(
                _ADDRESS_POLICY, val, False, "Applicant's address not present"
            )
        else:
            self._record_policy(
                _ADDRESS_POLICY, val, True, "Applicant's address is present"
            )

        length = len(val) if val else 0
        if not (10 <= length <= 300):
            err_msgs.append(
                "Applicant's complete address must have a length of at least 10 & at most 300"
            )
            self._record_policy(
                _ADDRESS_POLICY, length, False,
                "Applicant's complete address does not have a length of at least 10 & at most 300",
            )
        else:
            self._record_policy(
                _ADDRESS_POLICY, length, True,
                "Applicant's complete address has a length of at least 10 & at most 300",
            )

        compared = f"{val}, {expected}"
        if not expected or not val or val.lower() != expected.lower():
            err_msgs.append("Complete address mismatch with provided value")
            self._record_policy(
                _ADDRESS_POLICY, compared, False,
                "Complete address mismatch with provided value",
            )
        else:
            self._record_policy(
                _ADDRESS_POLICY, compared, True,
                "Complete address matches with provided value",
            )

        self.complete_employee_address_err_msgs = _join_errors(err_msgs)
        return self

    # @model_validator(mode="after")
    # def validate_employee_number(self):
    #     try:
    #         if self.employee_number and self.employee_number <= 25:
    #             self.complete_employee_address_err_msgs = "Employee number low"
    #         return self
    #     except Exception as e:
    #         raise

    @computed_field
    @property
    def is_red_flagged(self) -> bool:
        """Return True when any validator recorded at least one failure message."""
        return any(
            (
                self.pay_dates_err_msgs,
                self.full_name_err_msgs,
                self.employer_name_err_msgs,
                self.payslip_line_item_presence_err_msgs,
                self.complete_employee_address_err_msgs,
                # self.employee_number_err_msgs,
            )
        )