vamsidharmuthireddy committed on
Commit
52c1998
·
verified ·
1 Parent(s): 48e7216

Upload 90 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitignore +2 -0
  2. __init__.py +0 -0
  3. __pycache__/__init__.cpython-313.pyc +0 -0
  4. api/__init__.py +0 -0
  5. app.py +440 -0
  6. app_fastapi.py +0 -0
  7. app_streamlit.py +48 -0
  8. app_streamlit_bak.py +79 -0
  9. checks copy.ipynb +235 -0
  10. config.toml +3 -0
  11. llm/__init__.py +0 -0
  12. llm/__pycache__/__init__.cpython-313.pyc +0 -0
  13. llm/__pycache__/document_analyzer.cpython-313.pyc +0 -0
  14. llm/__pycache__/llm.cpython-313.pyc +0 -0
  15. llm/document_analyzer.py +102 -0
  16. llm/llm.py +47 -0
  17. logs_directory/app_20250414.log +78 -0
  18. logs_directory/app_20250415.log +0 -0
  19. logs_directory/app_20250416.log +0 -0
  20. logs_directory/app_20250417.log +0 -0
  21. logs_directory/app_20250420.log +0 -0
  22. prompts/__init__.py +7 -0
  23. prompts/__pycache__/__init__.cpython-313.pyc +0 -0
  24. prompts/__pycache__/document_type.cpython-313.pyc +0 -0
  25. prompts/__pycache__/genric_ocr.cpython-313.pyc +0 -0
  26. prompts/bank_statement/__pycache__/bank_statement.cpython-313.pyc +0 -0
  27. prompts/bank_statement/bank_statement.py +67 -0
  28. prompts/document_type.py +103 -0
  29. prompts/genric_ocr.py +9 -0
  30. prompts/identity_documents/__pycache__/driving_license.cpython-313.pyc +0 -0
  31. prompts/identity_documents/__pycache__/passport.cpython-313.pyc +0 -0
  32. prompts/identity_documents/driving_license.py +39 -0
  33. prompts/identity_documents/passport.py +58 -0
  34. prompts/income_document/__pycache__/p60.cpython-313.pyc +0 -0
  35. prompts/income_document/__pycache__/payslip.cpython-313.pyc +0 -0
  36. prompts/income_document/p60.py +61 -0
  37. prompts/income_document/payslip.py +84 -0
  38. schemas/__init__.py +5 -0
  39. schemas/__pycache__/__init__.cpython-313.pyc +0 -0
  40. schemas/__pycache__/account_statement.cpython-313.pyc +0 -0
  41. schemas/__pycache__/custom_app_form.cpython-313.pyc +0 -0
  42. schemas/__pycache__/id.cpython-313.pyc +0 -0
  43. schemas/__pycache__/payslip.cpython-313.pyc +0 -0
  44. schemas/__pycache__/uk_address.cpython-313.pyc +0 -0
  45. schemas/account_statement.py +609 -0
  46. schemas/custom_app_form.py +163 -0
  47. schemas/id.py +291 -0
  48. schemas/payslip.py +551 -0
  49. schemas/uk_address.py +39 -0
  50. utils/__init__.py +4 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ **__pycache__**
2
+ logs_directory/
__init__.py ADDED
File without changes
__pycache__/__init__.cpython-313.pyc ADDED
Binary file (165 Bytes). View file
 
api/__init__.py ADDED
File without changes
app.py ADDED
@@ -0,0 +1,440 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+
4
+ from schemas.custom_app_form import CustomAppFormUpload
5
+ from utils.prep_validators_payload import process_extracted_data
6
+
7
+
8
# Three top-level tabs; the tab objects are reused by the `with` blocks below.
(
    upload_docs_tab,
    demo_validations_considered_tab,
    upload_docs_validation_results_tab,
) = st.tabs(["Upload Documents", "Demo Validations", "Validation Results"])

# NOTE(review): indentation was lost in the rendering this block was recovered
# from; the nesting below is inferred from syntax and blank-line spacing.
with upload_docs_tab:
    st.header("Upload Documents")

    # One CSV file carrying the custom application form.
    uploaded_custom_application_form_file = st.file_uploader(
        "Upload Custom Application Form",
        type=["csv"],
        accept_multiple_files=False,
    )

    # Supporting documents. The selection is collected but not consumed
    # anywhere in this block.
    uploaded_files = st.file_uploader(
        "Upload files to be validated",
        type=["png", "jpg", "jpeg", "pdf", "zip"],
        accept_multiple_files=True,
    )

    if uploaded_custom_application_form_file:
        # The CSV is read without a header row: column 0 supplies the keys,
        # column 1 the values.
        uploaded_custom_form_df = pd.read_csv(
            uploaded_custom_application_form_file,
            header=None,
        )
        form_keys = uploaded_custom_form_df[0]
        form_values = uploaded_custom_form_df[1]
        uploaded_custom_form_dict = dict(zip(form_keys, form_values))

        st.write("Raw Dictionary:")
        st.json(uploaded_custom_form_dict)

        validated_form = CustomAppFormUpload.model_validate(
            uploaded_custom_form_dict
        )
        custom_app_form = validated_form.model_dump()

        st.write("Parsed Dictionary:")
        st.json(custom_app_form)

        # Only forms not flagged as incomplete are kept in session state.
        if isinstance(custom_app_form, dict):
            if not custom_app_form.get("is_incomplete"):
                st.session_state["custom_app_form"] = custom_app_form

        # NOTE(review): assumed to sit at this level (shown for every uploaded
        # form, not only complete ones) — confirm against the real file.
        st.write("Session State:")
        st.write(st.session_state)
43
+
44
+
45
with demo_validations_considered_tab:
    st.header("Demo Validations")

    # Column headers of the policy table, in display order.
    demo_validation_columns = (
        "Topic / Document Type",
        "Policy / Rule / Condition",
        "Action / Guidance / Requirement",
    )

    # One tuple per displayed policy: (topic, policy, action).
    # Rows and the "Red Flag / Caution" / "Notes / Details" columns that were
    # commented out in the previous revision are intentionally not reproduced.
    demo_validation_rows = [
        (
            "General Guidance",
            "Document Consistency",
            "Compare applicant's full name across all documents to ensure consistency.",
        ),
        (
            "General Guidance",
            "Overall Validation",
            "Ensure document is genuine, not fraudulent, belongs to the customer, and is from the expected source.",
        ),
        (
            "Passport",
            "Full Name",
            (
                "Full name must be present. "
                "Full name must have length between 2 & 61. "
                "Full name must have at least two words."
            ),
        ),
        (
            "Passport",
            "Expiry Date",
            "Expiry date must be present & after a year from current date",
        ),
        (
            "Payslips",
            "Employer & Customer Names",
            "Must include correct Employer’s and Customer’s names.",
        ),
        (
            "Payslips",
            "Submission Requirement (Monthly Pay)",
            "Minimum one month's most recent payslip required.",
        ),
        (
            "Payslips",
            "Pay Date Requirement",
            "Pay date must be within 35 days of document upload date.",
        ),
        (
            "Payslips",
            "Pay Period End Date (DD/MM/YYYY, if no pay date)",
            "Period end date must be within 35 days of FCD.",
        ),
        (
            "Payslips",
            "Pay Period Month (MM/YYYY, if no pay date) basis pay period duration",
            "Payslips dated in the current or previous calendar month are acceptable (must be the most recent).",
        ),
        (
            "Payslips",
            "Undated Payslips",
            "Unacceptable.",
        ),
        (
            "Payslips",
            "Tax & NI Contributions",
            "Must be visible. Perform a sense check.",
        ),
        # custom
        (
            "Payslips",
            "Applicant Address",
            (
                "Applicant's address must be present. "
                "Applicant's complete address must have a length of at least 10 & at most 300. "
                "Complete address must match with provided value. "
            ),
        ),
        (
            "Digital Bank Stmts",
            "Coverage",
            "Account statement period's start date & end date must have a gap of at least 28 days.",
        ),
        (
            "Digital Bank Stmts",
            "Data Match",
            "Customer data on statement must match profile.",
        ),
        (
            "Digital Bank Stmts",
            "Authenticity Doubt",
            "If any doubt regarding authenticity.",
        ),
        (
            "Digital Bank Stmts",
            "Bank name",
            (
                "Bank name must be present. "
                "Bank name must have length between 4 & 50. "
                "Bank Name must match provided value."
            ),
        ),
        (
            "Digital Bank Stmts",
            "Bank account number",
            (
                "Bank account number must be present. "
                "Bank account number must be of 8 digits only. "
            ),
        ),
        (
            "Digital Bank Stmts",
            "Sort code",
            (
                "Sort number must be present. "
                "It must be of the format xx-xx-xx wherein x are digits. "
            ),
        ),
        (
            "Digital Bank Stmts",
            "Date checks",
            (
                "Both statement start date & statement end date must be present. "
                "At least one salary credit must be present. "
                "Statement period's end date must be after the start date. "
            ),
        ),
    ]

    # Same list-of-dicts shape as before: one dict per row, keyed by column header.
    demo_validations = [
        dict(zip(demo_validation_columns, row)) for row in demo_validation_rows
    ]

    demo_validations_df = pd.DataFrame(demo_validations)
    st.table(demo_validations_df)
435
+
436
+
437
# Results tab: for now it only shows whatever has been stored in session state.
# NOTE(review): nesting inferred; the source rendering had no indentation.
with upload_docs_validation_results_tab:
    st.header("Validation Results")
    if st.session_state:
        # Render explicitly. The previous bare `st.session_state` expression
        # only displayed anything through Streamlit's "magic" rewriting, which
        # can be disabled in configuration and reads like a no-op statement.
        st.write(st.session_state)
app_fastapi.py ADDED
File without changes
app_streamlit.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from utils.process_files import process_uploaded_files
3
+ from utils.document_display import display_based_on_card
4
+ import os
5
+ import pandas as pd
6
+ import json
7
+ from llm.document_analyzer import analyze_files
8
+
9
+ from PIL import Image
10
+ from utils import setup_logger
11
+ from utils.session_state import reset_state
12
+ from datetime import datetime
13
+ import uuid
14
+ from utils.tabs.document_upload_tab import upload_documents
15
+ from utils.tabs.memo import display_memo
16
+ from utils.tabs.demo_validations import display_demo_validations
17
+ from utils.tabs.document_validation_tab import validate_documents
18
+
19
logger = setup_logger(__name__)

st.set_page_config(layout="wide")


# Seed the session-level slots on first run only; existing values are kept
# untouched on later reruns.
for state_key, initial_value in (("uploads", {}), ("current_upload", None)):
    if state_key not in st.session_state:
        st.session_state[state_key] = initial_value


st.title("🪪 Underwriting Workflow")


# Tab order on screen: upload, memo, validation results, policies.
tab_labels = ["Upload Documents", "Memo", "Validation Results", "Policies"]
(
    upload_docs_tab,
    memo_tab,
    upload_docs_validation_results_tab,
    demo_validations_considered_tab,
) = st.tabs(tab_labels)

# Each tab delegates its rendering to a helper imported from utils.tabs.
with upload_docs_tab:
    upload_documents()

with memo_tab:
    display_memo()

with demo_validations_considered_tab:
    display_demo_validations()

with upload_docs_validation_results_tab:
    # The validation tab receives the value stored under 'current_upload'.
    validate_documents(current=st.session_state['current_upload'])
app_streamlit_bak.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from utils.process_files import process_uploaded_files
3
+ from utils.document_display import display_based_on_card
4
+ import os
5
+ import pandas as pd
6
+ import json
7
+ from llm.document_analyzer import analyze_files
8
+
9
+ from PIL import Image
10
+ from utils import setup_logger
11
+
12
logger = setup_logger(__name__)

st.set_page_config(layout="wide")


def _empty_tab_ocr_state():
    """Return a fresh, empty state dict for the OCR tab."""
    return {
        'file_groups': None,
        'values_raw': None,
        'values_display': None,
    }


# NOTE(review): indentation was lost in the rendering this block was recovered
# from; the nesting below is inferred from syntax, data flow and blank lines.

# Seed the OCR state once per session.
# The previous code did `st.session_state = {...}`, which rebinds the attribute
# on the `streamlit` module to a plain dict. That dict is shared by every
# session in the process and bypasses Streamlit's per-session state. Assigning
# a key keeps the real session-state object in place.
if 'tab_ocr' not in st.session_state:
    st.session_state['tab_ocr'] = _empty_tab_ocr_state()

logger.info(f"st.session_state: {st.session_state}")
st.title("ID Analyser")

uploaded_files = st.file_uploader(
    "Upload Images, PDFs",
    accept_multiple_files=True,
    type=["png", "jpg", "jpeg", "pdf", "zip"],
)


if uploaded_files:
    # Same flow as before: the OCR state is reset on every rerun while files
    # are present, now via key assignment rather than rebinding.
    # NOTE(review): this reset discards cached results on each rerun (for
    # example after the download button is clicked) — confirm that is intended.
    st.session_state['tab_ocr'] = _empty_tab_ocr_state()
    file_paths, file_groups, temp_dir = process_uploaded_files(
        uploaded_files)  # Remove file paths later
    if st.session_state['tab_ocr']['file_groups'] is None:
        st.session_state['tab_ocr']['file_groups'] = file_groups

    analyze_clicked = st.button("Analyze")

    if analyze_clicked:
        # A fresh click invalidates any previously stored results.
        st.session_state['tab_ocr']['values_raw'] = None
        st.session_state['tab_ocr']['values_display'] = None

    if analyze_clicked or st.session_state['tab_ocr']['values_display']:
        if st.session_state['tab_ocr']['values_raw'] is None:
            analysis_results_groups, json_output_path = analyze_files(
                file_groups=st.session_state['tab_ocr']['file_groups'],
                temp_dir=temp_dir)

            st.session_state['tab_ocr']['values_raw'] = analysis_results_groups

        if st.session_state['tab_ocr']['values_display'] is None:
            st.session_state['tab_ocr']['values_display'] = {}

        # Render every original file together with the files extracted from it.
        for original_file, extracted_files in st.session_state['tab_ocr']['file_groups'].items():
            analysis_results_for_id = display_based_on_card(
                original_file=original_file,
                analysis_results_for_original_file=st.session_state[
                    'tab_ocr']['values_raw'][original_file],
                extracted_files=extracted_files)

        st.download_button(
            label="Download Analysis JSON",
            data=json.dumps(
                st.session_state['tab_ocr']['values_raw'], indent=4),
            file_name="analysis_results.json",
            mime="application/json"
        )
checks copy.ipynb ADDED
@@ -0,0 +1,235 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "## ID: Passport"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "markdown",
12
+ "metadata": {},
13
+ "source": [
14
+ "## Check against sample extracted JSON"
15
+ ]
16
+ },
17
+ {
18
+ "cell_type": "code",
19
+ "execution_count": 1,
20
+ "metadata": {},
21
+ "outputs": [],
22
+ "source": [
23
+ "import json\n",
24
+ "\n",
25
+ "from utils.prep_validators_payload import process_extracted_data"
26
+ ]
27
+ },
28
+ {
29
+ "cell_type": "code",
30
+ "execution_count": 2,
31
+ "metadata": {},
32
+ "outputs": [],
33
+ "source": [
34
+ "custom_app_form = {\n",
35
+ " \"application_summary_full_name\": \"Jodie Pippa\",\n",
36
+ " \"application_summary_bank_name\": \"HSBC\",\n",
37
+ " \"application_summary_employer_name\": \"ABC Ltd\",\n",
38
+ " \"application_summary_complete_address\": \"123 Maple Street, London, UK, SW1A 1AA\",\n",
39
+ " \"full_name_err_msgs\": None,\n",
40
+ " \"bank_name_err_msgs\": None,\n",
41
+ " \"employer_name_err_msgs\": None,\n",
42
+ " \"complete_employee_address_err_msgs\": None,\n",
43
+ " \"is_incomplete\": False,\n",
44
+ "}\n"
45
+ ]
46
+ },
47
+ {
48
+ "cell_type": "code",
49
+ "execution_count": 3,
50
+ "metadata": {},
51
+ "outputs": [
52
+ {
53
+ "name": "stdout",
54
+ "output_type": "stream",
55
+ "text": [
56
+ "{'/tmp/tmp6w8qn6h6/sample_documents/sample_documents/3.pdf': {'/tmp/tmp6w8qn6h6/sample_documents/sample_documents/3.pdf_page_0.png': {'document_category': 'bank_statement', 'document_type': 'bank_statement', 'account_holder_name': 'Jodie Pippa', 'account_holder_address': '', 'bank_name': 'HSBC', 'account_number': '12345678', 'sort_code': '20-00-00', 'statement_start_date': '2025-01-01', 'statement_end_date': '2025-02-28', 'salary_credits': [{'date': '2025-01-06', 'amount': '2213.83', 'from': 'ABC Ltd', 'description': 'Salary - ABC Ltd (Pro-rated Jan)'}, {'date': '2025-02-06', 'amount': '2566.66', 'from': 'ABC Ltd', 'description': 'Salary - ABC Ltd (Full Feb Salary)'}]}}, '/tmp/tmp6w8qn6h6/sample_documents/sample_documents/5.pdf': {'/tmp/tmp6w8qn6h6/sample_documents/sample_documents/5.pdf_page_0.png': {'document_category': 'income_document', 'document_type': 'payslip', 'employee_name': 'Jodie Pippa', 'employer_name': 'ABC Ltd', 'employee_id': 'JP12345', 'employee_address': '123 Maple Street, London, UK, SW1A 1AA', 'employer_address': '456 Business Street, London, UK, SW1A 2BB', 'tax_code': '1257L', 'payslip_date': '2025-01-31', 'pay_period_start': '2025-01-06', 'pay_period_end': '2025-01-31', 'payment_frequency': 'monthly', 'basic_pay': '3333.33', 'net_pay': '2566.66', 'gross_pay': '3333.33', 'salary_components': [], 'ni_contribution': '266.67', 'tax_deduction': '333.33', 'other_deductions': [{'name': 'Pension Contribution', 'amount': '166.67'}]}}, '/tmp/tmp6w8qn6h6/sample_documents/sample_documents/2.pdf': {'/tmp/tmp6w8qn6h6/sample_documents/sample_documents/2.pdf_page_0.png': {'document_category': 'income_document', 'document_type': 'p60', 'employee_details': {'surname': 'Pippa', 'forenames_or_initials': 'Jodie', 'national_insurance_number': 'AB123456C', 'works_payroll_number': '5342'}, 'pay_and_income_tax_details': {'previous_employments': {'pay': 0.0, 'tax_deducted': 0.0}, 'current_employment': {'pay': 9545.45, 'tax_deducted': 0.0}, 'total_for_year': {'pay': 
9545.45, 'tax_deducted': 0.0}, 'final_tax_code': '1257'}, 'national_insurance_contributions': [{'nic_letter': 'A', 'earnings': {'at_or_above_lel': 6396.0, 'above_lel_up_to_pt': 0.0, 'above_pt_up_to_uel': 3149.45}, 'employee_contributions_above_pt': 377.93}], 'statutory_payments': {'maternity_pay': 0.0, 'paternity_pay': 0.0, 'adoption_pay': 0.0, 'shared_parental_pay': 0.0}, 'other_details': {'student_loan_deductions': 0.0}, 'employer_details': {'employer_name_and_address': None, 'paye_reference': '123/AB456'}}}, '/tmp/tmp6w8qn6h6/sample_documents/sample_documents/1.pdf': {'/tmp/tmp6w8qn6h6/sample_documents/sample_documents/1.pdf_page_0.png': {'document_category': 'identity_verification_document', 'document_type': 'passport', 'full_name': 'UNITED-KINGDOM-FIVE JODIE PIPPA', 'surname': 'UNITED-KINGDOM-FIVE', 'given_names': 'JODIE PIPPA', 'passport_number': '107185703', 'nationality': 'BRITISH CITIZEN', 'date_of_birth': '1985-01-17', 'place_of_birth': 'LONDON', 'sex': 'F', 'date_of_issue': '2006-01-31', 'date_of_expiry': '2016-01-31', 'issuing_authority': 'UKPA', 'passport_type': 'P', 'country_code': 'GBR', 'mrz_line_1': 'P<GBRUNITED<KINGDOM<FIVE<<JODIE<PIPPA<<<<<<<', 'mrz_line_2': '1071857032GBR8501178F1601312<<<<<<<<<<<<<<02'}}}\n"
57
+ ]
58
+ }
59
+ ],
60
+ "source": [
61
+ "with open(\"../analysis_results.json\", \"r\") as f:\n",
62
+ " full_data = json.load(f)\n",
63
+ "\n",
64
+ "print(full_data)\n",
65
+ "# print(process_extracted_data(full_data, custom_app_form))"
66
+ ]
67
+ },
68
+ {
69
+ "cell_type": "code",
70
+ "execution_count": 4,
71
+ "metadata": {},
72
+ "outputs": [
73
+ {
74
+ "name": "stdout",
75
+ "output_type": "stream",
76
+ "text": [
77
+ "{'payslips': [{'pay_period_start_date': datetime.date(2025, 1, 6), 'pay_period_end_date': datetime.date(2025, 1, 31), 'pay_period_days': None, 'pay_date': datetime.date(2025, 1, 31), 'full_name': 'Jodie Pippa', 'employer_name': 'ABC Ltd', 'is_basic_pay_net_pay_other_salary_components_present': True, 'is_tax_deducation_present': True, 'is_ni_deduction_present': True, 'complete_employee_address': '123 Maple Street, London, UK, SW1A 1AA', 'pay_dates_err_msgs': \"Pay date must be within the last 35 days & not in the future, Pay period's start date & end date must have a gap of at least 28 days\", 'full_name_err_msgs': None, 'employer_name_err_msgs': None, 'payslip_line_item_presence_err_msgs': None, 'complete_employee_address_err_msgs': None, 'validation_policy_status_df': Policy Value Status \\\n",
78
+ "0 Applicant's full name should be present Jodie Pippa True \n",
79
+ "1 Full name must have a length of at least 2 & a... 11 True \n",
80
+ "2 Full name must consist of at least 2 words (fi... 2 True \n",
81
+ "3 Name should match with provided value Jodie Pippa True \n",
82
+ "4 Employer name must be present ABC Ltd True \n",
83
+ "5 Employer name must match with provided value ABC Ltd True \n",
84
+ "\n",
85
+ " Message \n",
86
+ "0 Applicant's full name is present \n",
87
+ "1 Full name has a length of at least 2 & at most 61 \n",
88
+ "2 Full name consists of at least 2 words (first ... \n",
89
+ "3 Name matches with provided value \n",
90
+ "4 Employer name is present \n",
91
+ "5 Employer name matches with provided value , 'is_red_flagged': True}], 'bank_statements': [{'statement_start_date': datetime.date(2025, 1, 1), 'statement_end_date': datetime.date(2025, 2, 28), 'first_salary_deposit_date_present': 6, 'bank_name': 'HSBC', 'full_name': 'Jodie Pippa', 'account_number': '12345678', 'sort_code': '20-00-00', 'account_statement_date_err_msgs': None, 'full_name_err_msgs': None, 'bank_name_err_msgs': None, 'account_number_err_msgs': None, 'sort_code_err_msgs': None, 'salary_deposit_err_msgs': None, 'is_red_flagged': False}], 'passports': [{'full_name': 'JODIE PIPPA', 'expiry_date': datetime.date(2016, 1, 31), 'full_name_err_msgs': None, 'expiry_date_err_msgs': 'Provided passport expires within 1 year', 'is_red_flagged': True}]}\n"
92
+ ]
93
+ }
94
+ ],
95
+ "source": [
96
+ "print(process_extracted_data(full_data, custom_app_form))"
97
+ ]
98
+ },
99
+ {
100
+ "cell_type": "code",
101
+ "execution_count": 5,
102
+ "metadata": {},
103
+ "outputs": [
104
+ {
105
+ "data": {
106
+ "text/html": [
107
+ "<div>\n",
108
+ "<style scoped>\n",
109
+ " .dataframe tbody tr th:only-of-type {\n",
110
+ " vertical-align: middle;\n",
111
+ " }\n",
112
+ "\n",
113
+ " .dataframe tbody tr th {\n",
114
+ " vertical-align: top;\n",
115
+ " }\n",
116
+ "\n",
117
+ " .dataframe thead th {\n",
118
+ " text-align: right;\n",
119
+ " }\n",
120
+ "</style>\n",
121
+ "<table border=\"1\" class=\"dataframe\">\n",
122
+ " <thead>\n",
123
+ " <tr style=\"text-align: right;\">\n",
124
+ " <th></th>\n",
125
+ " <th>Policy</th>\n",
126
+ " <th>Value</th>\n",
127
+ " <th>Status</th>\n",
128
+ " <th>Message</th>\n",
129
+ " </tr>\n",
130
+ " </thead>\n",
131
+ " <tbody>\n",
132
+ " <tr>\n",
133
+ " <th>0</th>\n",
134
+ " <td>Applicant's full name should be present</td>\n",
135
+ " <td>Jodie Pippa</td>\n",
136
+ " <td>True</td>\n",
137
+ " <td>Applicant's full name is present</td>\n",
138
+ " </tr>\n",
139
+ " <tr>\n",
140
+ " <th>1</th>\n",
141
+ " <td>Full name must have a length of at least 2 &amp; a...</td>\n",
142
+ " <td>11</td>\n",
143
+ " <td>True</td>\n",
144
+ " <td>Full name has a length of at least 2 &amp; at most 61</td>\n",
145
+ " </tr>\n",
146
+ " <tr>\n",
147
+ " <th>2</th>\n",
148
+ " <td>Full name must consist of at least 2 words (fi...</td>\n",
149
+ " <td>2</td>\n",
150
+ " <td>True</td>\n",
151
+ " <td>Full name consists of at least 2 words (first ...</td>\n",
152
+ " </tr>\n",
153
+ " <tr>\n",
154
+ " <th>3</th>\n",
155
+ " <td>Name should match with provided value</td>\n",
156
+ " <td>Jodie Pippa</td>\n",
157
+ " <td>True</td>\n",
158
+ " <td>Name matches with provided value</td>\n",
159
+ " </tr>\n",
160
+ " <tr>\n",
161
+ " <th>4</th>\n",
162
+ " <td>Employer name must be present</td>\n",
163
+ " <td>ABC Ltd</td>\n",
164
+ " <td>True</td>\n",
165
+ " <td>Employer name is present</td>\n",
166
+ " </tr>\n",
167
+ " <tr>\n",
168
+ " <th>5</th>\n",
169
+ " <td>Employer name must match with provided value</td>\n",
170
+ " <td>ABC Ltd</td>\n",
171
+ " <td>True</td>\n",
172
+ " <td>Employer name matches with provided value</td>\n",
173
+ " </tr>\n",
174
+ " </tbody>\n",
175
+ "</table>\n",
176
+ "</div>"
177
+ ],
178
+ "text/plain": [
179
+ " Policy Value Status \\\n",
180
+ "0 Applicant's full name should be present Jodie Pippa True \n",
181
+ "1 Full name must have a length of at least 2 & a... 11 True \n",
182
+ "2 Full name must consist of at least 2 words (fi... 2 True \n",
183
+ "3 Name should match with provided value Jodie Pippa True \n",
184
+ "4 Employer name must be present ABC Ltd True \n",
185
+ "5 Employer name must match with provided value ABC Ltd True \n",
186
+ "\n",
187
+ " Message \n",
188
+ "0 Applicant's full name is present \n",
189
+ "1 Full name has a length of at least 2 & at most 61 \n",
190
+ "2 Full name consists of at least 2 words (first ... \n",
191
+ "3 Name matches with provided value \n",
192
+ "4 Employer name is present \n",
193
+ "5 Employer name matches with provided value "
194
+ ]
195
+ },
196
+ "execution_count": 5,
197
+ "metadata": {},
198
+ "output_type": "execute_result"
199
+ }
200
+ ],
201
+ "source": [
202
+ "a = process_extracted_data(full_data, custom_app_form)\n",
203
+ "a['payslips'][0]['validation_policy_status_df']"
204
+ ]
205
+ },
206
+ {
207
+ "cell_type": "code",
208
+ "execution_count": null,
209
+ "metadata": {},
210
+ "outputs": [],
211
+ "source": []
212
+ }
213
+ ],
214
+ "metadata": {
215
+ "kernelspec": {
216
+ "display_name": "hsbc_uk_demo_venv",
217
+ "language": "python",
218
+ "name": "python3"
219
+ },
220
+ "language_info": {
221
+ "codemirror_mode": {
222
+ "name": "ipython",
223
+ "version": 3
224
+ },
225
+ "file_extension": ".py",
226
+ "mimetype": "text/x-python",
227
+ "name": "python",
228
+ "nbconvert_exporter": "python",
229
+ "pygments_lexer": "ipython3",
230
+ "version": "3.13.1"
231
+ }
232
+ },
233
+ "nbformat": 4,
234
+ "nbformat_minor": 2
235
+ }
config.toml ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ [server]
2
+
3
+ maxUploadSize = 10
llm/__init__.py ADDED
File without changes
llm/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (169 Bytes). View file
 
llm/__pycache__/document_analyzer.cpython-313.pyc ADDED
Binary file (3.88 kB). View file
 
llm/__pycache__/llm.cpython-313.pyc ADDED
Binary file (2.21 kB). View file
 
llm/document_analyzer.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import streamlit as st
3
+ from PIL import Image
4
+ import json
5
+ import os
6
+ from utils import im_2_b64, load_pdf_as_image, generate_metadata
7
+ from .llm import DocumentLLM
8
+ from prompts import (document_type_prompt, passport_prompt,
9
+ payslip_prompt, bank_statement_prompt,
10
+ p60_prompt, driving_license_prompt,
11
+ genric_ocr_prompt)
12
+ from utils.json_utils import restructure_documents
13
+
14
+ from utils import setup_logger
15
+
16
+ logger = setup_logger(__name__)
17
+
18
+
19
def analyze_files(file_groups: dict, temp_dir, current_upload):
    """Classify every extracted file with the LLM and extract its fields.

    For each file the document type is detected first with
    ``document_type_prompt``; a type-specific prompt (or the generic OCR
    prompt for unrecognised types) is then used for a second LLM call and
    the extracted fields are merged into the classification result.

    Args:
        file_groups: Mapping of an original upload path to the iterable of
            file paths extracted from it (e.g. one image per PDF page).
        temp_dir: Directory in which ``analysis_results.json`` is written.
        current_upload: Key into ``st.session_state['uploads']`` under which
            the restructured results are stored.

    Returns:
        A ``(results_group, json_output_path)`` tuple where ``results_group``
        maps each original file to ``{extracted_file: result_dict}`` and
        ``json_output_path`` is the path of the JSON dump of that mapping.

    Note:
        Per-file failures are reported via ``st.error`` and logged; they do
        not abort the remaining files. A file that fails keeps whatever was
        stored for it before the failure (its metadata, or the
        classification result if the second call failed).
    """
    image_extensions = ('jpg', 'jpeg', 'png', 'gif')

    # Document type -> extraction prompt. Anything not listed falls back to
    # the generic OCR prompt.
    prompt_by_type = {
        'passport': passport_prompt,
        'driving_license': driving_license_prompt,
        'bank_statement': bank_statement_prompt,
        'payslip': payslip_prompt,
        'p60': p60_prompt,
    }

    document_llm = DocumentLLM()
    results_group = {}

    for original_file, extracted_files in file_groups.items():
        results = {}

        for file_name in extracted_files:
            logger.info(f"file_name : {file_name}")
            extension = file_name.lower().split('.')[-1]

            # Metadata is the fallback result if analysis fails below.
            results[file_name] = generate_metadata(file_name)

            # Reset per file so an unsupported format can never reuse (or,
            # on the first file, fail on) a value from another iteration.
            image_buffer = None

            try:
                logger.info(f"Starting analysis for {file_name}")

                if extension in image_extensions:
                    # Context manager closes the underlying file handle.
                    with Image.open(file_name) as image:
                        image_buffer = im_2_b64(image)
                elif extension == 'pdf':
                    img = load_pdf_as_image(file_name)
                    image_buffer = im_2_b64(img)
                    st.image(img, use_container_width=True)
                else:
                    st.write(
                        f"Unsupported file format: {extension}")

                # image_buffer only gates the LLM calls; the LLM itself is
                # given the file path.
                if image_buffer is not None:
                    results[file_name] = document_llm.call_llm_api(
                        prompt=document_type_prompt,
                        image_path=file_name)

                    logger.info(
                        f"File name: {file_name}, Results: {results[file_name]}")
                    document_type = results[file_name].get(
                        'document_type', None)

                    if document_type is not None:
                        # Non-string types (unexpected LLM output) cannot be
                        # looked up safely, so they use the generic prompt.
                        if isinstance(document_type, str):
                            prompt = prompt_by_type.get(
                                document_type, genric_ocr_prompt)
                        else:
                            prompt = genric_ocr_prompt

                        if prompt is not None:
                            data = document_llm.call_llm_api(
                                prompt=prompt,
                                image_path=file_name)

                            results[file_name].update(data)

                            logger.info(f"{file_name}: {data}")

            except Exception as e:
                # Best-effort per file: surface the error in the UI and keep
                # a traceback in the log, then continue with the next file.
                logger.exception(f"Error processing {file_name}")
                st.error(f"Error processing {file_name}: {str(e)}")

        results_group[original_file] = results

    results_transformed = restructure_documents(results_group)
    st.session_state['uploads'][current_upload]['results_transformed'] = results_transformed

    # Save analysis results to a JSON file
    json_output_path = os.path.join(
        temp_dir, "analysis_results.json")
    with open(json_output_path, "w", encoding="utf-8") as json_file:
        json.dump(results_group, json_file, indent=4)

    return results_group, json_output_path
llm/llm.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import BaseModel
2
+ from google import genai
3
+ from google.genai.types import HttpOptions
4
+ from dotenv import load_dotenv
5
+ import os
6
+ import json
7
+ import re
8
+ from vertexai.generative_models import Part, Image, GenerativeModel
9
+ from utils import setup_logger
10
+ import vertexai
11
+
12
+ logger = setup_logger(__name__)
13
+
14
+ load_dotenv()
15
+
16
+ project = os.getenv("GOOGLE_CLOUD_PROJECT")
17
+ location = os.getenv("GOOGLE_CLOUD_LOCATION")
18
+ vertexai.init(project=project, location=location)
19
+
20
+
21
def _parse_llm_json(raw_text):
    """Parse JSON from an LLM reply, tolerating a Markdown code fence.

    The reply is first parsed as-is. If that fails, surrounding whitespace
    and one leading/trailing triple-backtick fence (with or without a
    ``json`` language tag, any case) are removed and parsing is retried.

    Raises:
        json.JSONDecodeError: If the text is not valid JSON even after the
            fence has been stripped.
    """
    try:
        return json.loads(raw_text)
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass.
        logger.info("Json is being formatted")

    unfenced = raw_text.strip()
    unfenced = re.sub(r"^```(?:json)?\s*", "", unfenced, flags=re.IGNORECASE)
    unfenced = re.sub(r"\s*```$", "", unfenced)

    # Parse JSON
    return json.loads(unfenced)


class DocumentLLM(BaseModel):
    """Thin wrapper around a Vertex AI Gemini model for document images."""

    def call_llm_api(self, prompt, image_path):
        """Send an image plus a text prompt to the model and parse the reply.

        Args:
            prompt: Text instruction sent alongside the image.
            image_path: Path of the image file loaded and sent to the model.

        Returns:
            The model's reply decoded from JSON (whatever JSON value the
            model produced).

        Raises:
            json.JSONDecodeError: If the reply is not valid JSON, even after
                stripping a surrounding Markdown code fence.
        """
        model: GenerativeModel = GenerativeModel(
            model_name="gemini-2.0-flash-001")

        text_part = Part.from_text(prompt)
        image_part = Part.from_image(Image.load_from_file(image_path))

        # Image first, then the instruction text.
        response = model.generate_content([
            image_part,
            text_part])

        return _parse_llm_json(response.text)
logs_directory/app_20250414.log ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-04-14 22:13:56 - llm.document_analyzer - INFO - [document_analyzer.py:26] - file_name : /tmp/tmpq7o86ysz/1.pdf_page_0.png
2
+ 2025-04-14 22:14:01 - utils.document_display - INFO - [document_display.py:148] - analysis_results_for_id : {'File Name': '1.pdf_page_0.png', 'Directory': 'tmpq7o86ysz', 'File Size': '943.88 KB', 'Last Modified': '2025-04-14 22:13:56', 'Created': '2025-04-14 22:13:56', 'File Extension': '.png', 'Full Path': '/tmp/tmpq7o86ysz/1.pdf_page_0.png', 'Image Size': '1700x2200', 'Image Mode': 'RGB', 'Image Format': 'PNG'}
3
+ 2025-04-14 22:14:01 - utils.document_display - INFO - [document_display.py:167] - document_type for /tmp/tmpq7o86ysz/1.pdf: None
4
+ 2025-04-14 22:14:01 - utils.document_display - INFO - [document_display.py:245] - Exception for processing analysis results of {'File Name': '1.pdf_page_0.png', 'Directory': 'tmpq7o86ysz', 'File Size': '943.88 KB', 'Last Modified': '2025-04-14 22:13:56', 'Created': '2025-04-14 22:13:56', 'File Extension': '.png', 'Full Path': '/tmp/tmpq7o86ysz/1.pdf_page_0.png', 'Image Size': '1700x2200', 'Image Mode': 'RGB', 'Image Format': 'PNG'}: 'NoneType' object has no attribute 'lower'
5
+ 2025-04-14 22:14:01 - utils.document_display - INFO - [document_display.py:249] - analysis_results_for_id_updated : {'File Name': '1.pdf_page_0.png', 'Directory': 'tmpq7o86ysz', 'File Size': '943.88 KB', 'Last Modified': '2025-04-14 22:13:56', 'Created': '2025-04-14 22:13:56', 'File Extension': '.png', 'Full Path': '/tmp/tmpq7o86ysz/1.pdf_page_0.png', 'Image Size': '1700x2200', 'Image Mode': 'RGB', 'Image Format': 'PNG'}
6
+ 2025-04-14 22:14:01 - __main__ - INFO - [app_streamlit.py:40] - file_path while displaying: ['/tmp/tmpq7o86ysz/1.pdf_page_0.png']
7
+ 2025-04-14 22:18:51 - llm.document_analyzer - INFO - [document_analyzer.py:26] - file_name : /tmp/tmpqmaed05g/1.pdf_page_0.png
8
+ 2025-04-14 22:18:51 - llm.document_analyzer - INFO - [document_analyzer.py:32] - Starting analysis for /tmp/tmpqmaed05g/1.pdf_page_0.png
9
+ 2025-04-14 22:18:55 - utils.document_display - INFO - [document_display.py:148] - analysis_results_for_id : {'File Name': '1.pdf_page_0.png', 'Directory': 'tmpqmaed05g', 'File Size': '943.88 KB', 'Last Modified': '2025-04-14 22:18:51', 'Created': '2025-04-14 22:18:51', 'File Extension': '.png', 'Full Path': '/tmp/tmpqmaed05g/1.pdf_page_0.png', 'Image Size': '1700x2200', 'Image Mode': 'RGB', 'Image Format': 'PNG'}
10
+ 2025-04-14 22:18:55 - utils.document_display - INFO - [document_display.py:167] - document_type for /tmp/tmpqmaed05g/1.pdf: None
11
+ 2025-04-14 22:18:55 - utils.document_display - INFO - [document_display.py:249] - analysis_results_for_id_updated : {'document_type': None}
12
+ 2025-04-14 22:18:55 - __main__ - INFO - [app_streamlit.py:40] - file_path while displaying: ['/tmp/tmpqmaed05g/1.pdf_page_0.png']
13
+ 2025-04-14 22:37:29 - llm.document_analyzer - INFO - [document_analyzer.py:26] - file_name : /tmp/tmp5dfq1etu/1.pdf_page_0.png
14
+ 2025-04-14 22:37:29 - llm.document_analyzer - INFO - [document_analyzer.py:32] - Starting analysis for /tmp/tmp5dfq1etu/1.pdf_page_0.png
15
+ 2025-04-14 22:37:35 - llm.document_analyzer - INFO - [document_analyzer.py:51] - File name: /tmp/tmp5dfq1etu/1.pdf_page_0.png, Results: {'document_category': 'identity_verification_document', 'document_type': 'passport'}
16
+ 2025-04-14 22:37:40 - llm.llm - INFO - [llm.py:40] - Json is being formatted
17
+ 2025-04-14 22:37:40 - llm.document_analyzer - INFO - [document_analyzer.py:76] - /tmp/tmp5dfq1etu/1.pdf_page_0.png: {'full_name': 'UNITED-KINGDOM-FIVE JODIE PIPPA', 'surname': 'UNITED-KINGDOM-FIVE', 'given_names': 'JODIE PIPPA', 'passport_number': '107185703', 'nationality': 'BRITISH CITIZEN', 'date_of_birth': '1985-01-17', 'place_of_birth': 'LONDON', 'sex': 'F', 'date_of_issue': '2006-01-31', 'date_of_expiry': '2016-01-31', 'issuing_authority': 'UKPA', 'passport_type': 'P', 'country_code': 'GBR', 'mrz_line_1': 'P<GBRUNITED<KINGDOM<FIVE<<JODIE<PIPPA<<<<<<<', 'mrz_line_2': '1071857032GBR8501178F1601312<<<<<<<<<<<<<<02'}
18
+ 2025-04-14 22:37:40 - utils.document_display - INFO - [document_display.py:148] - analysis_results_for_id : {'document_category': 'identity_verification_document', 'document_type': 'passport', 'full_name': 'UNITED-KINGDOM-FIVE JODIE PIPPA', 'surname': 'UNITED-KINGDOM-FIVE', 'given_names': 'JODIE PIPPA', 'passport_number': '107185703', 'nationality': 'BRITISH CITIZEN', 'date_of_birth': '1985-01-17', 'place_of_birth': 'LONDON', 'sex': 'F', 'date_of_issue': '2006-01-31', 'date_of_expiry': '2016-01-31', 'issuing_authority': 'UKPA', 'passport_type': 'P', 'country_code': 'GBR', 'mrz_line_1': 'P<GBRUNITED<KINGDOM<FIVE<<JODIE<PIPPA<<<<<<<', 'mrz_line_2': '1071857032GBR8501178F1601312<<<<<<<<<<<<<<02'}
19
+ 2025-04-14 22:37:40 - utils.document_display - INFO - [document_display.py:167] - document_type for /tmp/tmp5dfq1etu/1.pdf: passport
20
+ 2025-04-14 22:37:40 - utils.document_display - INFO - [document_display.py:249] - analysis_results_for_id_updated : {'document_category': 'identity_verification_document', 'document_type': 'passport', 'passport_number': '107185703', 'full_name': 'UNITED-KINGDOM-FIVE JODIE PIPPA', 'date_of_birth': '1985-01-17', 'nationality': 'BRITISH CITIZEN', 'date_of_issue': '2006-01-31', 'gender': None, 'address': None}
21
+ 2025-04-14 22:37:40 - __main__ - INFO - [app_streamlit.py:40] - file_path while displaying: ['/tmp/tmp5dfq1etu/1.pdf_page_0.png']
22
+ 2025-04-14 22:37:40 - __main__ - INFO - [app_streamlit.py:40] - file_path while displaying: ['/tmp/tmp5dfq1etu/1.pdf_page_0.png']
23
+ 2025-04-14 22:48:54 - llm.document_analyzer - INFO - [document_analyzer.py:26] - file_name : /tmp/tmpfmxw59lw/sample_documents/sample_documents/3.pdf_page_0.png
24
+ 2025-04-14 22:48:54 - llm.document_analyzer - INFO - [document_analyzer.py:32] - Starting analysis for /tmp/tmpfmxw59lw/sample_documents/sample_documents/3.pdf_page_0.png
25
+ 2025-04-14 22:48:59 - llm.llm - INFO - [llm.py:40] - Json is being formatted
26
+ 2025-04-14 22:48:59 - llm.document_analyzer - INFO - [document_analyzer.py:51] - File name: /tmp/tmpfmxw59lw/sample_documents/sample_documents/3.pdf_page_0.png, Results: {'document_category': 'bank_statement', 'document_type': 'bank_statement'}
27
+ 2025-04-14 22:49:03 - llm.llm - INFO - [llm.py:40] - Json is being formatted
28
+ 2025-04-14 22:49:03 - llm.document_analyzer - INFO - [document_analyzer.py:76] - /tmp/tmpfmxw59lw/sample_documents/sample_documents/3.pdf_page_0.png: {'account_holder_name': 'Jodie Pippa', 'account_holder_address': '', 'statement_start_date': '2025-01-01', 'statement_end_date': '2025-02-28', 'salary_credits': [{'date': '2025-01-06', 'amount': '2213.83', 'from': 'ABC Ltd', 'description': 'Salary - ABC Ltd (Pro-rated Jan)'}, {'date': '2025-02-06', 'amount': '2566.66', 'from': 'ABC Ltd', 'description': 'Salary - ABC Ltd (Full Feb Salary)'}]}
29
+ 2025-04-14 22:49:03 - llm.document_analyzer - INFO - [document_analyzer.py:26] - file_name : /tmp/tmpfmxw59lw/sample_documents/sample_documents/5.pdf_page_0.png
30
+ 2025-04-14 22:49:03 - llm.document_analyzer - INFO - [document_analyzer.py:32] - Starting analysis for /tmp/tmpfmxw59lw/sample_documents/sample_documents/5.pdf_page_0.png
31
+ 2025-04-14 22:49:08 - llm.llm - INFO - [llm.py:40] - Json is being formatted
32
+ 2025-04-14 22:49:08 - llm.document_analyzer - INFO - [document_analyzer.py:51] - File name: /tmp/tmpfmxw59lw/sample_documents/sample_documents/5.pdf_page_0.png, Results: {'document_category': 'income_document', 'document_type': 'payslip'}
33
+ 2025-04-14 22:49:12 - llm.llm - INFO - [llm.py:40] - Json is being formatted
34
+ 2025-04-14 22:49:12 - llm.document_analyzer - INFO - [document_analyzer.py:76] - /tmp/tmpfmxw59lw/sample_documents/sample_documents/5.pdf_page_0.png: {'employee_name': 'Jodie Pippa', 'employer_name': 'ABC Ltd', 'payslip_date': '2025-01-31', 'pay_period_start': '2025-01-06', 'pay_period_end': '2025-01-31', 'payment_frequency': 'monthly', 'basic_pay': '3333.33', 'net_pay': '2566.66', 'gross_pay': '3333.33', 'salary_components': [], 'ni_contribution': '266.67', 'tax_deduction': '333.33', 'other_deductions': [{'name': 'Pension Contribution', 'amount': '166.67'}]}
35
+ 2025-04-14 22:49:12 - llm.document_analyzer - INFO - [document_analyzer.py:26] - file_name : /tmp/tmpfmxw59lw/sample_documents/sample_documents/2.pdf_page_0.png
36
+ 2025-04-14 22:49:12 - llm.document_analyzer - INFO - [document_analyzer.py:32] - Starting analysis for /tmp/tmpfmxw59lw/sample_documents/sample_documents/2.pdf_page_0.png
37
+ 2025-04-14 22:49:17 - llm.llm - INFO - [llm.py:40] - Json is being formatted
38
+ 2025-04-14 22:49:17 - llm.document_analyzer - INFO - [document_analyzer.py:51] - File name: /tmp/tmpfmxw59lw/sample_documents/sample_documents/2.pdf_page_0.png, Results: {'document_category': 'income_document', 'document_type': 'payslip'}
39
+ 2025-04-14 22:49:22 - llm.llm - INFO - [llm.py:40] - Json is being formatted
40
+ 2025-04-14 22:49:22 - llm.document_analyzer - INFO - [document_analyzer.py:76] - /tmp/tmpfmxw59lw/sample_documents/sample_documents/2.pdf_page_0.png: {'employee_name': 'Jodie Pippa', 'employer_name': '', 'payslip_date': '', 'pay_period_start': '', 'pay_period_end': '', 'payment_frequency': '', 'basic_pay': '', 'net_pay': '', 'gross_pay': '9545.45', 'salary_components': [], 'ni_contribution': '377.93', 'tax_deduction': '0', 'other_deductions': [{'name': 'Student Loan deductions', 'amount': '0'}]}
41
+ 2025-04-14 22:49:22 - llm.document_analyzer - INFO - [document_analyzer.py:26] - file_name : /tmp/tmpfmxw59lw/sample_documents/sample_documents/4.pdf_page_0.png
42
+ 2025-04-14 22:49:22 - llm.document_analyzer - INFO - [document_analyzer.py:32] - Starting analysis for /tmp/tmpfmxw59lw/sample_documents/sample_documents/4.pdf_page_0.png
43
+ 2025-04-14 22:49:26 - llm.llm - INFO - [llm.py:40] - Json is being formatted
44
+ 2025-04-14 22:49:26 - llm.document_analyzer - INFO - [document_analyzer.py:51] - File name: /tmp/tmpfmxw59lw/sample_documents/sample_documents/4.pdf_page_0.png, Results: {'document_category': 'unknown', 'document_type': 'unknown'}
45
+ 2025-04-14 22:49:32 - llm.llm - INFO - [llm.py:40] - Json is being formatted
46
+ 2025-04-14 22:49:32 - llm.document_analyzer - INFO - [document_analyzer.py:76] - /tmp/tmpfmxw59lw/sample_documents/sample_documents/4.pdf_page_0.png: {'Contract of Employment': {'Employee Name': 'Jodie Pippa', 'Job Title': 'Sales Manager', 'Start Date': '6th January 2025', 'Employer': 'ABC Ltd', 'Address': '456 Business Street, London, UK, SW1A 2BB', 'Employee Address': '123 Maple Street, London, UK, SW1A 1AA', 'Job Title and Duties': {'Job Title': 'Sales Manager', 'Reporting To': 'Managing Director', 'Main Duties and Responsibilities': ['Manage the sales team to achieve monthly and annual sales targets.', 'Develop and implement sales strategies to grow the business.', 'Build and maintain relationships with key clients.', 'Prepare sales reports and forecasts for senior management.']}, 'Place of Work': "The normal place of work is the company's office at 456 Business Street, London, UK, SW1A 2BB. However, the employee may be required to work at other locations as necessary.", 'Hours of Work': 'The normal working hours are 40 hours per week, Monday to Friday, 9:00 AM to 5:30 PM, with a one-hour unpaid lunch break.', 'Salary and Benefits': {'Basic Salary': '£40,000 per annum, payable monthly in arrears on the last working day of each month.', 'Bonus Scheme': 'Eligible for a performance-based bonus of up to 10% of annual salary.', 'Pension': 'Auto-enrolment into the company pension scheme in line with UK legislation.', 'Holiday Entitlement': '25 days per annum plus UK public holidays.'}, 'Probationary Period': "The first 3 months of employment will be a probationary period. 
During this time, the employee's suitability for the role will be assessed.", 'Termination of Employment': {'Notice Periods': ["During probationary period: 1 week's notice by either party.", "After probationary period: 1 month's notice by the employee, 2 months' notice by the employer.Summary Dismissal: The employer reserves the right to terminate employment without notice in cases of gross misconduct."]}}}
47
+ 2025-04-14 22:49:32 - llm.document_analyzer - INFO - [document_analyzer.py:26] - file_name : /tmp/tmpfmxw59lw/sample_documents/sample_documents/4.pdf_page_1.png
48
+ 2025-04-14 22:49:32 - llm.document_analyzer - INFO - [document_analyzer.py:32] - Starting analysis for /tmp/tmpfmxw59lw/sample_documents/sample_documents/4.pdf_page_1.png
49
+ 2025-04-14 22:49:36 - llm.llm - INFO - [llm.py:40] - Json is being formatted
50
+ 2025-04-14 22:49:36 - llm.document_analyzer - INFO - [document_analyzer.py:51] - File name: /tmp/tmpfmxw59lw/sample_documents/sample_documents/4.pdf_page_1.png, Results: {'document_category': 'unknown', 'document_type': 'unknown'}
51
+ 2025-04-14 22:49:42 - llm.llm - INFO - [llm.py:40] - Json is being formatted
52
+ 2025-04-14 22:49:42 - llm.document_analyzer - INFO - [document_analyzer.py:76] - /tmp/tmpfmxw59lw/sample_documents/sample_documents/4.pdf_page_1.png: {'clauses': [{'clause_number': '7', 'title': 'Confidentiality and Data Protection', 'content': 'The employee agrees to maintain the confidentiality of all company information and comply with the UK Data Protection Act 2018 and GDPR.'}, {'clause_number': '8', 'title': 'Intellectual Property', 'content': 'Any intellectual property created by the employee during the course of employment shall belong to the company.'}, {'clause_number': '9', 'title': 'Grievance and Disciplinary Procedures', 'content': "The company's grievance and disciplinary procedures will apply, as outlined in the employee handbook."}, {'clause_number': '10', 'title': 'Health and Safety', 'content': "The employee agrees to comply with the company's health and safety policies and procedures."}, {'clause_number': '11', 'title': 'Mobility Clause', 'content': 'The employee may be required to work at other locations within the UK or travel as necessary for business purposes.'}, {'clause_number': '12', 'title': 'Entire Agreement', 'content': 'This contract constitutes the entire agreement between the parties and supersedes any previous agreements or understandings.'}], 'signatures': {'employer': {'name': None, 'position': None, 'date': None}, 'employee': {'name': 'Jodie Pippa', 'date': None}}}
53
+ 2025-04-14 22:49:42 - llm.document_analyzer - INFO - [document_analyzer.py:26] - file_name : /tmp/tmpfmxw59lw/sample_documents/sample_documents/1.pdf_page_0.png
54
+ 2025-04-14 22:49:42 - llm.document_analyzer - INFO - [document_analyzer.py:32] - Starting analysis for /tmp/tmpfmxw59lw/sample_documents/sample_documents/1.pdf_page_0.png
55
+ 2025-04-14 22:49:46 - llm.llm - INFO - [llm.py:40] - Json is being formatted
56
+ 2025-04-14 22:49:46 - llm.document_analyzer - INFO - [document_analyzer.py:51] - File name: /tmp/tmpfmxw59lw/sample_documents/sample_documents/1.pdf_page_0.png, Results: {'document_category': 'identity_verification_document', 'document_type': 'passport'}
57
+ 2025-04-14 22:49:52 - llm.llm - INFO - [llm.py:40] - Json is being formatted
58
+ 2025-04-14 22:49:52 - llm.document_analyzer - INFO - [document_analyzer.py:76] - /tmp/tmpfmxw59lw/sample_documents/sample_documents/1.pdf_page_0.png: {'full_name': 'UNITED-KINGDOM-FIVE JODIE PIPPA', 'surname': 'UNITED-KINGDOM-FIVE', 'given_names': 'JODIE PIPPA', 'passport_number': '107185703', 'nationality': 'BRITISH CITIZEN', 'date_of_birth': '1985-01-17', 'place_of_birth': 'LONDON', 'sex': 'F', 'date_of_issue': '2006-01-31', 'date_of_expiry': '2016-01-31', 'issuing_authority': 'UKPA', 'passport_type': 'P', 'country_code': 'GBR', 'mrz_line_1': 'P<GBRUNITED<KINGDOM<FIVE<<JODIE<PIPPA<<<<<<<', 'mrz_line_2': '1071857032GBR8501178F1601312<<<<<<<<<<<<<<02'}
59
+ 2025-04-14 22:49:52 - utils.document_display - INFO - [document_display.py:148] - analysis_results_for_id : {'document_category': 'bank_statement', 'document_type': 'bank_statement', 'account_holder_name': 'Jodie Pippa', 'account_holder_address': '', 'statement_start_date': '2025-01-01', 'statement_end_date': '2025-02-28', 'salary_credits': [{'date': '2025-01-06', 'amount': '2213.83', 'from': 'ABC Ltd', 'description': 'Salary - ABC Ltd (Pro-rated Jan)'}, {'date': '2025-02-06', 'amount': '2566.66', 'from': 'ABC Ltd', 'description': 'Salary - ABC Ltd (Full Feb Salary)'}]}
60
+ 2025-04-14 22:49:52 - utils.document_display - INFO - [document_display.py:167] - document_type for /tmp/tmpfmxw59lw/sample_documents/sample_documents/3.pdf: bank_statement
61
+ 2025-04-14 22:49:52 - utils.document_display - INFO - [document_display.py:251] - analysis_results_for_id_updated : {'document_type': 'bank_statement'}
62
+ 2025-04-14 22:49:52 - __main__ - INFO - [app_streamlit.py:40] - file_path while displaying: ['/tmp/tmpfmxw59lw/sample_documents/sample_documents/3.pdf_page_0.png']
63
+ 2025-04-14 22:49:52 - utils.document_display - INFO - [document_display.py:148] - analysis_results_for_id : {'document_category': 'income_document', 'document_type': 'payslip', 'employee_name': 'Jodie Pippa', 'employer_name': 'ABC Ltd', 'payslip_date': '2025-01-31', 'pay_period_start': '2025-01-06', 'pay_period_end': '2025-01-31', 'payment_frequency': 'monthly', 'basic_pay': '3333.33', 'net_pay': '2566.66', 'gross_pay': '3333.33', 'salary_components': [], 'ni_contribution': '266.67', 'tax_deduction': '333.33', 'other_deductions': [{'name': 'Pension Contribution', 'amount': '166.67'}]}
64
+ 2025-04-14 22:49:52 - utils.document_display - INFO - [document_display.py:167] - document_type for /tmp/tmpfmxw59lw/sample_documents/sample_documents/5.pdf: payslip
65
+ 2025-04-14 22:49:52 - utils.document_display - INFO - [document_display.py:251] - analysis_results_for_id_updated : {'document_category': 'income_document', 'document_type': 'payslip', 'employee_name': 'Jodie Pippa', 'employer_name': 'ABC Ltd', 'payslip_date': '2025-01-31', 'pay_period_start': '2025-01-06', 'pay_period_end': '2025-01-31', 'payment_frequency': 'monthly', 'basic_pay': '3333.33', 'net_pay': '2566.66', 'gross_pay': '3333.33', 'salary_components': [], 'ni_contribution': '266.67', 'tax_deduction': '333.33', 'other_deductions': [{'name': 'Pension Contribution', 'amount': '166.67'}]}
66
+ 2025-04-14 22:49:52 - __main__ - INFO - [app_streamlit.py:40] - file_path while displaying: ['/tmp/tmpfmxw59lw/sample_documents/sample_documents/5.pdf_page_0.png']
67
+ 2025-04-14 22:49:52 - utils.document_display - INFO - [document_display.py:148] - analysis_results_for_id : {'document_category': 'income_document', 'document_type': 'payslip', 'employee_name': 'Jodie Pippa', 'employer_name': '', 'payslip_date': '', 'pay_period_start': '', 'pay_period_end': '', 'payment_frequency': '', 'basic_pay': '', 'net_pay': '', 'gross_pay': '9545.45', 'salary_components': [], 'ni_contribution': '377.93', 'tax_deduction': '0', 'other_deductions': [{'name': 'Student Loan deductions', 'amount': '0'}]}
68
+ 2025-04-14 22:49:52 - utils.document_display - INFO - [document_display.py:167] - document_type for /tmp/tmpfmxw59lw/sample_documents/sample_documents/2.pdf: payslip
69
+ 2025-04-14 22:49:52 - utils.document_display - INFO - [document_display.py:251] - analysis_results_for_id_updated : {'document_category': 'income_document', 'document_type': 'payslip', 'employee_name': 'Jodie Pippa', 'employer_name': '', 'payslip_date': '', 'pay_period_start': '', 'pay_period_end': '', 'payment_frequency': '', 'basic_pay': '', 'net_pay': '', 'gross_pay': '9545.45', 'salary_components': [], 'ni_contribution': '377.93', 'tax_deduction': '0', 'other_deductions': [{'name': 'Student Loan deductions', 'amount': '0'}]}
70
+ 2025-04-14 22:49:52 - __main__ - INFO - [app_streamlit.py:40] - file_path while displaying: ['/tmp/tmpfmxw59lw/sample_documents/sample_documents/2.pdf_page_0.png']
71
+ 2025-04-14 22:49:53 - utils.document_display - INFO - [document_display.py:148] - analysis_results_for_id : {'document_category': 'unknown', 'Contract of Employment': {'Employee Name': 'Jodie Pippa', 'Job Title': 'Sales Manager', 'Start Date': '6th January 2025', 'Employer': 'ABC Ltd', 'Address': '456 Business Street, London, UK, SW1A 2BB', 'Employee Address': '123 Maple Street, London, UK, SW1A 1AA', 'Job Title and Duties': {'Job Title': 'Sales Manager', 'Reporting To': 'Managing Director', 'Main Duties and Responsibilities': ['Manage the sales team to achieve monthly and annual sales targets.', 'Develop and implement sales strategies to grow the business.', 'Build and maintain relationships with key clients.', 'Prepare sales reports and forecasts for senior management.']}, 'Place of Work': "The normal place of work is the company's office at 456 Business Street, London, UK, SW1A 2BB. However, the employee may be required to work at other locations as necessary.", 'Hours of Work': 'The normal working hours are 40 hours per week, Monday to Friday, 9:00 AM to 5:30 PM, with a one-hour unpaid lunch break.', 'Salary and Benefits': {'Basic Salary': '£40,000 per annum, payable monthly in arrears on the last working day of each month.', 'Bonus Scheme': 'Eligible for a performance-based bonus of up to 10% of annual salary.', 'Pension': 'Auto-enrolment into the company pension scheme in line with UK legislation.', 'Holiday Entitlement': '25 days per annum plus UK public holidays.'}, 'Probationary Period': "The first 3 months of employment will be a probationary period. 
During this time, the employee's suitability for the role will be assessed.", 'Termination of Employment': {'Notice Periods': ["During probationary period: 1 week's notice by either party.", "After probationary period: 1 month's notice by the employee, 2 months' notice by the employer.Summary Dismissal: The employer reserves the right to terminate employment without notice in cases of gross misconduct."]}}, 'document_type': 'unknown', 'signatures': {'employer': {'name': None, 'position': None, 'date': None}, 'employee': {'name': 'Jodie Pippa', 'date': None}}, 'clauses': [{'clause_number': '7', 'title': 'Confidentiality and Data Protection', 'content': 'The employee agrees to maintain the confidentiality of all company information and comply with the UK Data Protection Act 2018 and GDPR.'}, {'clause_number': '8', 'title': 'Intellectual Property', 'content': 'Any intellectual property created by the employee during the course of employment shall belong to the company.'}, {'clause_number': '9', 'title': 'Grievance and Disciplinary Procedures', 'content': "The company's grievance and disciplinary procedures will apply, as outlined in the employee handbook."}, {'clause_number': '10', 'title': 'Health and Safety', 'content': "The employee agrees to comply with the company's health and safety policies and procedures."}, {'clause_number': '11', 'title': 'Mobility Clause', 'content': 'The employee may be required to work at other locations within the UK or travel as necessary for business purposes.'}, {'clause_number': '12', 'title': 'Entire Agreement', 'content': 'This contract constitutes the entire agreement between the parties and supersedes any previous agreements or understandings.'}]}
72
+ 2025-04-14 22:49:53 - utils.document_display - INFO - [document_display.py:167] - document_type for /tmp/tmpfmxw59lw/sample_documents/sample_documents/4.pdf: unknown
73
+ 2025-04-14 22:49:53 - utils.document_display - INFO - [document_display.py:251] - analysis_results_for_id_updated : {'document_type': 'unknown'}
74
+ 2025-04-14 22:49:53 - __main__ - INFO - [app_streamlit.py:40] - file_path while displaying: ['/tmp/tmpfmxw59lw/sample_documents/sample_documents/4.pdf_page_0.png', '/tmp/tmpfmxw59lw/sample_documents/sample_documents/4.pdf_page_1.png']
75
+ 2025-04-14 22:49:53 - utils.document_display - INFO - [document_display.py:148] - analysis_results_for_id : {'document_category': 'identity_verification_document', 'document_type': 'passport', 'full_name': 'UNITED-KINGDOM-FIVE JODIE PIPPA', 'surname': 'UNITED-KINGDOM-FIVE', 'given_names': 'JODIE PIPPA', 'passport_number': '107185703', 'nationality': 'BRITISH CITIZEN', 'date_of_birth': '1985-01-17', 'place_of_birth': 'LONDON', 'sex': 'F', 'date_of_issue': '2006-01-31', 'date_of_expiry': '2016-01-31', 'issuing_authority': 'UKPA', 'passport_type': 'P', 'country_code': 'GBR', 'mrz_line_1': 'P<GBRUNITED<KINGDOM<FIVE<<JODIE<PIPPA<<<<<<<', 'mrz_line_2': '1071857032GBR8501178F1601312<<<<<<<<<<<<<<02'}
76
+ 2025-04-14 22:49:53 - utils.document_display - INFO - [document_display.py:167] - document_type for /tmp/tmpfmxw59lw/sample_documents/sample_documents/1.pdf: passport
77
+ 2025-04-14 22:49:53 - utils.document_display - INFO - [document_display.py:251] - analysis_results_for_id_updated : {'document_category': 'identity_verification_document', 'document_type': 'passport', 'passport_number': '107185703', 'full_name': 'UNITED-KINGDOM-FIVE JODIE PIPPA', 'date_of_birth': '1985-01-17', 'nationality': 'BRITISH CITIZEN', 'date_of_issue': '2006-01-31', 'date_of_expiry': '2016-01-31', 'sex': 'F', 'address': None}
78
+ 2025-04-14 22:49:53 - __main__ - INFO - [app_streamlit.py:40] - file_path while displaying: ['/tmp/tmpfmxw59lw/sample_documents/sample_documents/1.pdf_page_0.png']
logs_directory/app_20250415.log ADDED
The diff for this file is too large to render. See raw diff
 
logs_directory/app_20250416.log ADDED
The diff for this file is too large to render. See raw diff
 
logs_directory/app_20250417.log ADDED
The diff for this file is too large to render. See raw diff
 
logs_directory/app_20250420.log ADDED
The diff for this file is too large to render. See raw diff
 
prompts/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ from .document_type import document_type_prompt
2
+ from .bank_statement.bank_statement import bank_statement_prompt
3
+ from .identity_documents.passport import passport_prompt
4
+ from .identity_documents.driving_license import driving_license_prompt
5
+ from .income_document.p60 import p60_prompt
6
+ from .income_document.payslip import payslip_prompt
7
+ from .genric_ocr import genric_ocr_prompt
prompts/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (631 Bytes). View file
 
prompts/__pycache__/document_type.cpython-313.pyc ADDED
Binary file (2.37 kB). View file
 
prompts/__pycache__/genric_ocr.cpython-313.pyc ADDED
Binary file (522 Bytes). View file
 
prompts/bank_statement/__pycache__/bank_statement.cpython-313.pyc ADDED
Binary file (1.92 kB). View file
 
prompts/bank_statement/bank_statement.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ bank_statement_prompt = """
2
+ 🏦 Bank Statement Information Extraction Prompt
3
+ You are a document information extraction assistant.
4
+
5
+ You will be given an image of a bank statement. Your task is to extract structured data that can be used to verify a customer's identity and income information.
6
+
7
+ 📝 Extract the following fields:
8
+ 1. Identity Information
9
+ - account_holder_name
10
+ - account_holder_address
11
+ - bank_name
12
+ - account_number
13
+ - sort_code
14
+
15
+ 2. Statement Period
16
+ - statement_start_date (format: YYYY-MM-DD)
17
+ - statement_end_date (format: YYYY-MM-DD)
18
+
19
+ 3. Income Information
20
+ - salary_credits — an array of objects, where each object contains:
21
+ - date (of credit) (format: YYYY-MM-DD)
22
+ - amount
23
+ - from — From account details
24
+ - description
25
+
26
+ 📦 Output Format
27
+
28
+ {
29
+ "account_holder_name": "",
30
+ "account_holder_address": "",
31
+ "bank_name": "",
32
+ "account_number": "",
33
+ "sort_code": "",
34
+ "statement_start_date": "",
35
+ "statement_end_date": "",
36
+ "salary_credits": [
37
+ {
38
+ "date": "", Dates must be in YYYY-MM-DD format.
39
+ "amount": "",
40
+ "from" : "",
41
+ "description": ""
42
+ }
43
+ ]
44
+ }
45
+
46
+ 📌 Instructions
47
+ Identify salary credits based on transaction descriptions (e.g. containing "Salary", "SAL", "Payroll", "Company Name", etc.).
48
+
49
+ Dates must be in YYYY-MM-DD format.
50
+
51
+ If no data is available for a field, give null.
52
+
53
+ Only return the structured JSON — no explanation or extra content.
54
+
55
+ When extracting the user's full name, make sure to give the first name followed by the last name.
56
+
57
+
58
+ ✅ With this output, we will validate:
59
+ Name
60
+ Address
61
+ Presence of salary credit
62
+ Salary credits across different months
63
+ Salary consistency (regularity & similar amount)
64
+ That the statement period covers at least 28 days
65
+
66
+
67
+ """
prompts/document_type.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ document_type_prompt = """
2
+
3
+ Document Type Identification Agent Prompt
4
+ You are a document classification assistant.
5
+
6
+ You will be given an image of a document. Your task is to analyze its content and identify the most appropriate document_category and document_type.
7
+
8
+ Here are the valid document categories and their corresponding types:
9
+
10
+
11
+ {
12
+ "identity_verification_document": [
13
+ "passport",
14
+ "driving_license",
15
+ "national_identity_card",
16
+ "other"
17
+ ],
18
+ "bank_statement": [
19
+ "bank_statement",
20
+ "other"
21
+ ],
22
+ "income_document": [
23
+ "payslip",
24
+ "p60",
25
+ "contract_of_employment",
26
+ "other"
27
+ ]
28
+ }
29
+
30
+
31
+
32
+ 🧪 Few-shot examples
33
+
34
+ Example 1:
35
+ Image shows a government-issued ID with a photo, name, nationality, and passport number.
36
+
37
+ {
38
+ "document_category": "identity_verification_document",
39
+ "document_type": "passport"
40
+ }
41
+
42
+ Example 2:
43
+ Image shows a payslip with details like gross salary, deductions, employer name, and pay period.
44
+
45
+ {
46
+ "document_category": "income_document",
47
+ "document_type": "payslip"
48
+ }
49
+
50
+
51
+ Example 3:
52
+ Image shows a monthly bank statement with account number, transaction list, balances, and bank branding.
53
+
54
+ {
55
+ "document_category": "bank_statement",
56
+ "document_type": "bank_statement"
57
+ }
58
+
59
+ Example 4:
60
+ Image shows a plastic card with license number, issue/expiry dates, categories of vehicles, and a photo.
61
+
62
+ {
63
+ "document_category": "identity_verification_document",
64
+ "document_type": "driving_license"
65
+ }
66
+
67
+ Example 5:
68
+
69
+ Image contains unclear or unrelated content (e.g., a receipt or handwritten note).
70
+ {
71
+ "document_category": "unknown",
72
+ "document_type": "unknown"
73
+ }
74
+
75
+
76
+ Instructions:
77
+ Extract the textual and visual information from the document image.
78
+
79
+ Match it to the most likely document_category and document_type from the list above.
80
+
81
+ Return your answer in strict JSON format, as shown below:
82
+
83
+
84
+ {
85
+ "document_category": "string",
86
+ "document_type": "string"
87
+ }
88
+
89
+ If you are unable to confidently classify the document, return:
90
+
91
+ {
92
+ "document_category": "unknown",
93
+ "document_type": "unknown"
94
+ }
95
+
96
+ Constraints:
97
+ Your response must only include the JSON object.
98
+
99
+ Do not include any explanation, notes, or additional content outside the JSON.
100
+
101
+
102
+
103
+ """
prompts/genric_ocr.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ genric_ocr_prompt = """
2
+ You are an expert document analyzer specializing in converting images of documents into structured data.
3
+
4
+
5
+ You will be given images of documents. Extract the data from them as structured JSON.
6
+
7
+ The output should only be in JSON format. Do not output anything other than valid JSON that can be loaded in Python.
8
+
9
+ """
prompts/identity_documents/__pycache__/driving_license.cpython-313.pyc ADDED
Binary file (1.21 kB). View file
 
prompts/identity_documents/__pycache__/passport.cpython-313.pyc ADDED
Binary file (1.6 kB). View file
 
prompts/identity_documents/driving_license.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ driving_license_prompt = """
2
+
3
+ You are an intelligent document parser. Extract all relevant information from the provided image of a UK driving licence and return it in structured JSON format.
4
+
5
+ The fields you must extract are:
6
+ - surname
7
+ - first_name (Might span across two lines)
8
+ - date_of_birth (in YYYY-MM-DD format)
9
+ - place_of_birth
10
+ - date_of_issue (in YYYY-MM-DD format)
11
+ - date_of_expiry (in YYYY-MM-DD format)
12
+ - issuing_authority
13
+ - driver_number
14
+ - signature
15
+ - address (with line_1, city, and postcode)
16
+ - entitlements (as a list of licence categories)
17
+
18
+ Return the output strictly in the following JSON format:
19
+
20
+ {
21
+ "surname": "",
22
+ "first_name": "",
23
+ "date_of_birth": "",
24
+ "place_of_birth": "",
25
+ "date_of_issue": "",
26
+ "date_of_expiry": "",
27
+ "issuing_authority": "",
28
+ "driver_number": "",
29
+ "signature": "",
30
+ "address": {
31
+ "line_1": "",
32
+ "city": "",
33
+ "postcode": ""
34
+ },
35
+ "entitlements": []
36
+ }
37
+
38
+ Do not include any additional explanation or text—only return the filled JSON object.
39
+ """
prompts/identity_documents/passport.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ passport_prompt = """
2
+ 🛂 UK Passport Information Extraction Prompt
3
+ You are an intelligent document parser.
4
+
5
+ You will be given an image of a United Kingdom (UK) passport. Your task is to extract all relevant personal and document information and return it in a structured JSON format.
6
+
7
+ 📝 Extract the following fields (if available):
8
+
9
+ - full_name (concatenated from surname and given_names)
10
+ - surname
11
+ - given_names
12
+ - passport_number
13
+ - nationality
14
+ - date_of_birth (in YYYY-MM-DD format)
15
+ - place_of_birth
16
+ - sex (M or F)
17
+ - date_of_issue (in YYYY-MM-DD format)
18
+ - date_of_expiry (in YYYY-MM-DD format)
19
+ - issuing_authority
20
+ - passport_type (usually P)
21
+ - country_code
22
+ - mrz_line_1
23
+ - mrz_line_2
24
+
25
+
26
+ 📦 Output Format
27
+ Return your result using the following JSON structure:
28
+
29
+ {
30
+ "full_name": "",
31
+ "surname": "",
32
+ "given_names": "",
33
+ "passport_number": "",
34
+ "nationality": "",
35
+ "date_of_birth": "",
36
+ "place_of_birth": "",
37
+ "sex": "",
38
+ "date_of_issue": "",
39
+ "date_of_expiry": "",
40
+ "issuing_authority": "",
41
+ "passport_type": "",
42
+ "country_code": "",
43
+ "mrz_line_1": "",
44
+ "mrz_line_2": ""
45
+ }
46
+
47
+ 📌 Instructions
48
+ If a field is not present or not readable, return it as an empty string "".
49
+
50
+ Dates must be in YYYY-MM-DD format.
51
+
52
+ The MRZ (Machine Readable Zone) consists of two lines usually at the bottom of the passport data page.
53
+
54
+ Respond only with the JSON object — no extra text or explanation.
55
+
56
+
57
+
58
+ """
prompts/income_document/__pycache__/p60.cpython-313.pyc ADDED
Binary file (3.09 kB). View file
 
prompts/income_document/__pycache__/payslip.cpython-313.pyc ADDED
Binary file (2.4 kB). View file
 
prompts/income_document/p60.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ p60_prompt = """
2
+ You are an expert document parser. Extract the structured information from a UK P60 End of Year Certificate and return it as a JSON object.
3
+
4
+ Use the following format and structure. Ensure numerical values are parsed correctly, and fields with missing data should use null where appropriate. The "national_insurance_contributions" field should be an array to accommodate multiple NIC letters.
5
+
6
+ {
7
+ "employee_details": {
8
+ "surname": "", // Employee's last name
9
+ "forenames_or_initials": "", // Employee's first name or initials
10
+ "national_insurance_number": "", // NI number (e.g. AB123456C)
11
+ "works_payroll_number": "" // Internal payroll identifier
12
+ },
13
+ "pay_and_income_tax_details": {
14
+ "previous_employments": {
15
+ "pay": 0.00, // Pay from previous jobs in the tax year
16
+ "tax_deducted": 0.00 // Tax deducted from previous jobs
17
+ },
18
+ "current_employment": {
19
+ "pay": 0.00, // Pay from this employment
20
+ "tax_deducted": 0.00 // Tax deducted from this employment
21
+ },
22
+ "total_for_year": {
23
+ "pay": 0.00, // Total pay for the year
24
+ "tax_deducted": 0.00 // Total tax deducted for the year
25
+ },
26
+ "final_tax_code": "" // Final PAYE tax code (e.g. 1257L)
27
+ },
28
+ "national_insurance_contributions": [
29
+ {
30
+ "nic_letter": "", // NIC table letter (e.g. A, B, C, J)
31
+ "earnings": {
32
+ "at_or_above_lel": 0.00, // Earnings at or above the Lower Earnings Limit (LEL)
33
+ "above_lel_up_to_pt": 0.00, // Earnings above LEL up to PT
34
+ "above_pt_up_to_uel": 0.00 // Earnings above PT up to UEL
35
+ },
36
+ "employee_contributions_above_pt": 0.00 // Contributions on earnings above PT
37
+ }
38
+ // ... Add more entries if needed
39
+ ],
40
+ "statutory_payments": {
41
+ "maternity_pay": 0.00, // Statutory Maternity Pay included
42
+ "paternity_pay": 0.00, // Statutory Paternity Pay included
43
+ "adoption_pay": 0.00, // Statutory Adoption Pay included
44
+ "shared_parental_pay": 0.00 // Statutory Shared Parental Pay included
45
+ },
46
+ "other_details": {
47
+ "student_loan_deductions": 0.00 // Student loan deductions (whole £ only)
48
+ },
49
+ "employer_details": {
50
+ "employer_name_and_address": "", // Employer's name and full address
51
+ "paye_reference": "" // Employer's PAYE reference (e.g. 123/AB456)
52
+ }
53
+ }
54
+
55
+ ✅ Additional Instructions (Optional):
56
+ - Extract values as they appear on the document (e.g., keep leading zeroes, currency format).
57
+ - If an amount field is empty or not present, use null.
58
+ - Preserve all textual details like addresses or names exactly as shown.
59
+ - When extracting the user's full name, make sure to give the first name followed by the last name.
60
+
61
+ """
prompts/income_document/payslip.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ payslip_prompt = """
2
+ 💼 Payslip Information Extraction Prompt
3
+ You are a document information extraction assistant.
4
+
5
+ You will be given an image of a UK payslip. Your task is to extract all relevant details required for salary verification and compliance checks.
6
+
7
+
8
+ 📝 Extract the following fields:
9
+ 1. Identity & Date
10
+ - employee_name
11
+ - employer_name
12
+ - employee_id
13
+ - employee_address
14
+ - employer_address
15
+ - tax_code
16
+ - payslip_date (format: YYYY-MM-DD)
17
+ - pay_period_start (format: YYYY-MM-DD)
18
+ - pay_period_end (format: YYYY-MM-DD)
19
+ - payment_frequency (monthly or weekly)
20
+
21
+ 2. Salary Details
22
+ - basic_pay
23
+ - net_pay
24
+ - gross_pay
25
+ - salary_components: list of { name, amount } (e.g., bonus, overtime, allowance)
26
+
27
+ 3. Deductions
28
+ - ni_contribution (National Insurance)
29
+ - tax_deduction
30
+ - other_deductions: list of { name, amount }
31
+
32
+
33
+ 📦 Output Format
34
+
35
+ {
36
+ "employee_name": "",
37
+ "employer_name": "",
38
+ "employee_id": "",
39
+ "employee_address": "",
40
+ "employer_address" : "",
41
+ "tax_code": "",
42
+ "payslip_date": "",
43
+ "pay_period_start": "",
44
+ "pay_period_end": "",
45
+ "payment_frequency": "",
46
+ "basic_pay": "",
47
+ "net_pay": "",
48
+ "gross_pay": "",
49
+ "salary_components": [
50
+ {
51
+ "name": "",
52
+ "amount": ""
53
+ }
54
+ ],
55
+ "ni_contribution": "",
56
+ "tax_deduction": "",
57
+ "other_deductions": [
58
+ {
59
+ "name": "",
60
+ "amount": ""
61
+ }
62
+ ]
63
+ }
64
+
65
+
66
+ 📌 Instructions
67
+ - All monetary values should be extracted as numeric strings (e.g., "1550.00").
68
+ - Dates must be returned in YYYY-MM-DD format.
69
+ - If a field is missing or unreadable, use null.
70
+ - Only return the structured JSON — no explanation or extra content.
71
+ - Always extract only relevant information.
72
+ - When extracting the user's full name, make sure to give the first name followed by the last name.
73
+ - other_deductions should only be list of dictionaries
74
+
75
+ ✅ This output supports the following checks:
76
+ - Payslip includes Basic Pay, Net Pay, and detailed Salary Components
77
+ - National Insurance (NI) is present
78
+ - Tax deduction is clearly shown
79
+ - Determine if payslip is most recent (monthly) or 4 consecutive weeks (weekly)
80
+ - If only a date range is shown (e.g. pay_period_end), ensure it's within 35 days from today
81
+ - If the payslip has no date, it's invalid
82
+
83
+
84
+ """
schemas/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ from .account_statement import UKBankAccountStatement
2
+ from .custom_app_form import CustomAppFormUpload
3
+ from .id import UKPassportSchema, UKDrivingLicense
4
+ from .payslip import UKPayslipSchema
5
+ from .uk_address import UKAddress
schemas/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (464 Bytes). View file
 
schemas/__pycache__/account_statement.cpython-313.pyc ADDED
Binary file (16.9 kB). View file
 
schemas/__pycache__/custom_app_form.cpython-313.pyc ADDED
Binary file (6.07 kB). View file
 
schemas/__pycache__/id.cpython-313.pyc ADDED
Binary file (10.8 kB). View file
 
schemas/__pycache__/payslip.cpython-313.pyc ADDED
Binary file (20.3 kB). View file
 
schemas/__pycache__/uk_address.cpython-313.pyc ADDED
Binary file (2.91 kB). View file
 
schemas/account_statement.py ADDED
@@ -0,0 +1,609 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import datetime
2
+ import re
3
+
4
+ from pydantic import (
5
+ BaseModel,
6
+ Field,
7
+ ValidationInfo,
8
+ computed_field,
9
+ model_validator,
10
+ ConfigDict
11
+ )
12
+ import pandas as pd
13
+
14
+
15
+ class UKBankAccountStatement(BaseModel):
16
+ model_config = ConfigDict(arbitrary_types_allowed=True)
17
+ statement_start_date: datetime.date | None = Field(
18
+ default=None,
19
+ description="Digital Bank account statement period's start date in YYYY-MM-DD format",
20
+ examples=["2025-01-01"],
21
+ )
22
+ statement_end_date: datetime.date | None = Field(
23
+ default=None,
24
+ description="Digital Bank account statement period's end date in YYYY-MM-DD format",
25
+ examples=["2025-01-31"],
26
+ )
27
+ first_salary_deposit_date_present: int | datetime.date | None = Field(
28
+ default=None,
29
+ description=(
30
+ "The day/date of the very first salary deposit line item present in"
31
+ " the bank account statement. Value must be gte 1 & lte 31"
32
+ ),
33
+ examples=[
34
+ "If first present salary deposit date is 2025-01-06, then 6 must be passed"
35
+ ],
36
+ )
37
+ bank_name: str | None = Field(
38
+ default=None,
39
+ description="Extracted bank name value, stripped of whitespaces at beginning & end",
40
+ examples=["HSBC"],
41
+ ) # , min_length=4, max_length=50)
42
+ full_name: str | None = Field(
43
+ default=None,
44
+ description="Applicant's full name. Must consist of at least 2 words, have length gte 2 & lte 61",
45
+ examples=["Jodie Pippa"],
46
+ ) # , min_length=2, max_length=61)
47
+ account_number: str | None = Field(
48
+ default=None,
49
+ description="UK Bank Account Statement's account number. Must be of 8 characters length only",
50
+ examples=["12345678"],
51
+ ) # , min_length=8, max_length=8) # 12345678
52
+ sort_code: str | None = Field(
53
+ default=None,
54
+ description="UK Bank Account Sort Code. Must be of length 8 characters only. Format: xx-xx-xx",
55
+ examples="20-00-00",
56
+ ) # , min_length=8, max_length=8) # 20-00-00
57
+ # is_salary_credit_consistent_across_months: bool = Field(
58
+ # default=False,
59
+ # description=(
60
+ # "If the bank account statement spans several months, sense check "
61
+ # "whether salary deposit amounts across months are consistent"
62
+ # ),
63
+ # examples=[True, False, None],
64
+ # )
65
+
66
+ account_statement_date_err_msgs: str | None = None
67
+ full_name_err_msgs: str | None = None
68
+ bank_name_err_msgs: str | None = None
69
+ account_number_err_msgs: str | None = None
70
+ sort_code_err_msgs: str | None = None
71
+ salary_deposit_err_msgs: str | None = None
72
+ validation_policy_status_df: pd.DataFrame = pd.DataFrame(
73
+ columns=["Policy", "Value", "Status", "Message"])
74
+
75
@model_validator(mode="after")
def validate_full_name(cls, values, info: ValidationInfo):
    """Validate the extracted full name and record one status row per policy.

    Four checks are run against ``values.full_name``, in this order:

    1. the name is present;
    2. its length is between 2 and 61 characters inclusive;
    3. it equals (case-insensitively) the value stored under
       ``"application_summary_full_name"`` in the validation context;
    4. it consists of at least two space-separated words.

    Every check appends a ``[Policy, Value, Status, Message]`` row to
    ``values.validation_policy_status_df``. The error texts of the failed
    checks are joined with ``", "`` into ``values.full_name_err_msgs``
    (``None`` when every check passes).

    Args:
        values: Object carrying ``full_name``, ``full_name_err_msgs`` and
            ``validation_policy_status_df``.
        info: Pydantic validation info; ``info.context`` may be ``None``.

    Returns:
        The same ``values`` object, updated in place.
    """
    context = info.context
    expected = context.get("application_summary_full_name") if context else None
    full_name = values.full_name
    name_length = len(full_name) if full_name else 0
    status_df = values.validation_policy_status_df
    err_msgs = []

    def record(policy, value, passed, pass_msg, fail_msg, err_msg):
        # Append one row to the policy table; keep the error text on failure.
        status_df.loc[len(status_df)] = [
            policy,
            value,
            passed,
            pass_msg if passed else fail_msg,
        ]
        if not passed:
            err_msgs.append(err_msg)

    record(
        "Full Name",
        full_name,
        bool(full_name),
        "Applicant's full name is present",
        "Applicant's full name not present",
        "Applicant's full name not present",
    )

    # Fix: the original condition was `not name and not (in range)`, which
    # could never fail for a non-empty name. A missing name has length 0 and
    # therefore also fails the range test below.
    record(
        "Full Name",
        name_length,
        2 <= name_length <= 61,
        "Full name has a length of at least 2 & at most 61",
        "Full name does not have a length of at least 2 & at most 61",
        "Full name must have a length of at least 2 & at most 61",
    )

    # A missing expected value or a missing name counts as a mismatch.
    names_match = (
        bool(expected)
        and bool(full_name)
        and full_name.lower() == expected.lower()
    )
    record(
        "Data Match",
        f"{full_name}, {expected}",
        names_match,
        "Name matches with provided value",
        "Name does not match with provided value",
        "Name mismatch with provided value",
    )

    has_two_words = bool(full_name) and len(full_name.strip().split(" ")) >= 2
    record(
        "Full Name",
        full_name,
        has_two_words,
        "Full name consists of at least 2 words (first name + last name)",
        "Full name does not consist of at least 2 words (first name + last name)",
        "Full name must consist of at least 2 words (first name + last name)",
    )

    values.full_name_err_msgs = ", ".join(err_msgs) if err_msgs else None
    return values
194
+
195
@model_validator(mode="after")
def validate_bank_name(cls, values, info: ValidationInfo):
    """Validate the extracted bank name and record one status row per policy.

    Three checks are run against ``values.bank_name``, in this order:

    1. the bank name is present;
    2. its length is between 4 and 50 characters inclusive;
    3. it equals (case-insensitively) the value stored under
       ``"application_summary_bank_name"`` in the validation context.

    Every check appends a ``[Policy, Value, Status, Message]`` row to
    ``values.validation_policy_status_df``. The error texts of the failed
    checks are joined with ``", "`` into ``values.bank_name_err_msgs``
    (``None`` when every check passes).

    Args:
        values: Object carrying ``bank_name``, ``bank_name_err_msgs`` and
            ``validation_policy_status_df``.
        info: Pydantic validation info; ``info.context`` may be ``None``.

    Returns:
        The same ``values`` object, updated in place.
    """
    context = info.context
    expected = context.get("application_summary_bank_name") if context else None
    bank_name = values.bank_name
    name_length = len(bank_name) if bank_name else 0
    status_df = values.validation_policy_status_df
    err_msgs = []

    def record(policy, value, passed, pass_msg, fail_msg, err_msg):
        # Append one row to the policy table; keep the error text on failure.
        status_df.loc[len(status_df)] = [
            policy,
            value,
            passed,
            pass_msg if passed else fail_msg,
        ]
        if not passed:
            err_msgs.append(err_msg)

    record(
        "Bank name",
        bank_name,
        bool(bank_name),
        "Bank name is present",
        "Bank name is not present",
        "Bank name not present",
    )

    # Fix: the original condition was `not name and not (in range)`, which
    # could never fail for a non-empty name. A missing name has length 0 and
    # therefore also fails the range test below.
    record(
        "Bank name",
        name_length,
        4 <= name_length <= 50,
        "Bank name has a length of at least 4 & at most 50",
        "Bank name does not have a length of at least 4 & at most 50",
        "Bank name must have a length of at least 4 & at most 50",
    )

    # A missing expected value or a missing bank name counts as a mismatch.
    names_match = (
        bool(expected)
        and bool(bank_name)
        and bank_name.lower() == expected.lower()
    )
    record(
        "Data Match",
        f"{bank_name}, {expected}",
        names_match,
        "Bank name matches with provided value",
        "Bank name does not match with provided value",
        "Bank name mismatch with provided value",
    )

    values.bank_name_err_msgs = ", ".join(err_msgs) if err_msgs else None
    return values
291
+
292
@model_validator(mode="after")
def validate_account_number(cls, values):
    """Validate the extracted bank account number.

    Two checks are run against ``values.account_number``, in this order:

    1. the account number is present;
    2. it consists of exactly 8 digits.

    Every check appends a ``[Policy, Value, Status, Message]`` row to
    ``values.validation_policy_status_df``. The error texts of the failed
    checks are joined with ``", "`` into ``values.account_number_err_msgs``
    (``None`` when every check passes).

    Args:
        values: Object carrying ``account_number``,
            ``account_number_err_msgs`` and ``validation_policy_status_df``.

    Returns:
        The same ``values`` object, updated in place.
    """
    account_number = values.account_number
    status_df = values.validation_policy_status_df
    err_msgs = []

    def record(passed, pass_msg, fail_msg, err_msg):
        # Append one row to the policy table; keep the error text on failure.
        status_df.loc[len(status_df)] = [
            "Bank account number",
            account_number,
            passed,
            pass_msg if passed else fail_msg,
        ]
        if not passed:
            err_msgs.append(err_msg)

    is_present = bool(account_number)
    record(
        is_present,
        # Fix: the original success message was the copy-pasted
        # "Bank name matches is present", which names the wrong field.
        "Bank account number is present.",
        "Bank account number is not present.",
        "Bank account number not present. Bank account number must be present.",
    )

    # A missing account number is also reported as an invalid one.
    is_valid = is_present and re.fullmatch(r"^\d{8}$", account_number) is not None
    record(
        is_valid,
        "Provided account number is valid",
        "Provided account number is invalid",
        "Provided account number is invalid. It must be of 8 digits length only",
    )

    values.account_number_err_msgs = ", ".join(err_msgs) if err_msgs else None
    return values
358
+
359
@model_validator(mode="after")
def validate_sort_code(cls, values):
    """Validate the extracted bank account sort code.

    Two policy checks are recorded, in order, as rows of
    ``values.validation_policy_status_df`` (Policy, Value, Status, Message):

    1. the sort code is present;
    2. the sort code has the form ``dd-dd-dd`` (hyphens are mandatory).

    A missing sort code fails both checks. Failure texts are joined with
    ", " into ``values.sort_code_err_msgs``; the attribute is set to
    ``None`` when both checks pass.

    Args:
        values: the model instance being validated.

    Returns:
        The same ``values`` object.
    """
    policy = "Sort code"
    sort_code = values.sort_code
    status_df = values.validation_policy_status_df
    err_msgs = []

    def record(passed, message):
        # Append one result row at the next integer label.
        status_df.loc[len(status_df)] = [policy, sort_code, passed, message]

    # Check 1: presence.
    if not sort_code:
        # Message previously said "Sort number", which is not the field name.
        err_msgs.append("Sort code not present. Sort code must be present.")
        record(False, "Sort code is not present.")
    else:
        record(True, "Sort code is present.")

    # Check 2: strict xx-xx-xx format. A missing value also fails here.
    if not sort_code or not re.fullmatch(r"^\d{2}-\d{2}-\d{2}$", sort_code):
        err_msgs.append(
            "Provided sort code's format is invalid. It must be of the format xx-xx-xx wherein x are digits."
        )
        record(False, "Sort code's format is invalid.")
    else:
        record(True, "Sort code's format is valid.")

    values.sort_code_err_msgs = ", ".join(err_msgs) if err_msgs else None
    return values
425
+
426
@model_validator(mode="after")
def validate_bank_account_statement_dates(cls, values):
    """Check the statement period's dates and its coverage.

    Records a "Date checks" row stating whether both the start and end
    dates are present. When both are present, a "Coverage" row is also
    recorded stating whether the period spans at least 28 days. Failure
    texts are joined with ", " into
    ``values.account_statement_date_err_msgs`` (``None`` when nothing
    failed).

    Args:
        values: the model instance being validated.

    Returns:
        The same ``values`` object.
    """
    period_start = values.statement_start_date
    period_end = values.statement_end_date
    period_text = f"{period_start}, {period_end}"
    table = values.validation_policy_status_df
    failures = []

    def add_row(policy, ok, note):
        # Rows are appended at the next integer label.
        table.loc[len(table)] = [policy, period_text, ok, note]

    if period_start and period_end:
        add_row(
            "Date checks",
            True,
            "Both statement start date & statement end date are present",
        )
        # Coverage is only assessed when both dates are available.
        if (period_end - period_start).days < 28:
            failures.append(
                "Account statement period's start date & end date must have a gap of at least 28 days"
            )
            add_row(
                "Coverage",
                False,
                "Account statement period's start date & end date donot have a gap of at least 28 days",
            )
        else:
            add_row(
                "Coverage",
                True,
                "Account statement period's start date & end date have a gap of at least 28 days",
            )
    else:
        failures.append(
            "Both statement start date & statement end date must be present"
        )
        add_row(
            "Date checks",
            False,
            "Both statement start date & statement end date are not present",
        )

    values.account_statement_date_err_msgs = (
        ", ".join(failures) if failures else None
    )
    return values
492
+
493
@model_validator(mode="after")
def validate_salary_credit_checks(cls, values):
    """Check for a salary credit and for a correctly ordered statement period.

    Two rows are appended to ``values.validation_policy_status_df``:

    * "Salary deposit": whether ``first_salary_deposit_date_present`` is
      truthy;
    * "Date checks": whether both statement dates are present and the end
      date is not earlier than the start date.

    Failure texts are joined with ", " into
    ``values.salary_deposit_err_msgs`` (``None`` when nothing failed).

    Args:
        values: the model instance being validated.

    Returns:
        The same ``values`` object.
    """
    period_start = values.statement_start_date
    period_end = values.statement_end_date
    salary_marker = values.first_salary_deposit_date_present
    table = values.validation_policy_status_df
    failures = []

    def add_row(policy, shown_value, ok, note):
        # Rows are appended at the next integer label.
        table.loc[len(table)] = [policy, shown_value, ok, note]

    # Salary credit presence.
    if salary_marker:
        add_row(
            "Salary deposit",
            salary_marker,
            True,
            "At least one salary credit is present",
        )
    else:
        failures.append("At least one salary credit must be present")
        add_row(
            "Salary deposit",
            salary_marker,
            False,
            "At least one salary credit is not present",
        )

    # Period ordering: passes only when both dates exist and end >= start.
    period_text = f"{period_start}, {period_end}"
    if period_start and period_end and not (period_end < period_start):
        add_row(
            "Date checks",
            period_text,
            True,
            "Statement period's end date is after the start date",
        )
    else:
        failures.append(
            "Statement period's end date must be after the start date"
        )
        add_row(
            "Date checks",
            period_text,
            False,
            "Statement period's end date is not after the start date",
        )

    values.salary_deposit_err_msgs = ", ".join(failures) if failures else None
    return values
596
+
597
@computed_field
@property
def is_red_flagged(self) -> bool:
    """Return True when any of the per-check error-message fields is set."""
    collected = (
        self.account_statement_date_err_msgs,
        self.full_name_err_msgs,
        self.bank_name_err_msgs,
        self.account_number_err_msgs,
        self.sort_code_err_msgs,
        self.salary_deposit_err_msgs,
    )
    # A non-empty message string on any field flags the document.
    return any(collected)
schemas/custom_app_form.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import datetime
2
+ import re
3
+
4
+ from dateutil.relativedelta import relativedelta
5
+ from pydantic import (
6
+ BaseModel,
7
+ computed_field,
8
+ Field,
9
+ ValidationInfo,
10
+ model_validator,
11
+ ConfigDict
12
+ )
13
+ import pandas as pd
14
+
15
+
16
class CustomAppFormUpload(BaseModel):
    """Application-form details supplied by the applicant, with validation results.

    The four ``application_summary_*`` fields are required strings populated
    through their aliases (``full_name``, ``bank_name``, ``employer_name``,
    ``complete_address``). Each "after" validator inspects one field and
    stores a ", "-joined description of every failed rule in the matching
    ``*_err_msgs`` attribute (``None`` when the field passes).
    ``is_incomplete`` summarises those attributes.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    application_summary_full_name: str = Field(alias="full_name")
    application_summary_bank_name: str = Field(alias="bank_name")
    application_summary_employer_name: str = Field(alias="employer_name")
    application_summary_complete_address: str = Field(alias="complete_address")

    # Populated by the validators below.
    full_name_err_msgs: str | None = None
    bank_name_err_msgs: str | None = None
    employer_name_err_msgs: str | None = None
    complete_employee_address_err_msgs: str | None = None
    # Not written by any validator in this class.
    validation_policy_status_df: pd.DataFrame = pd.DataFrame(
        columns=["Policy", "Value", "Status", "Message"])

    @model_validator(mode="after")
    def validate_full_name(self, info: ValidationInfo):
        """Validate the applicant's full name (presence, length 2-61, >= 2 words)."""
        err_msgs = []
        full_name_val = self.application_summary_full_name

        if not full_name_val:
            err_msgs.append("Applicant's full name not present")

        # Length rule. The original condition ("not value and not in-range")
        # could only fire for an empty value, so over-long names slipped through.
        full_name_val_len = len(full_name_val) if full_name_val else 0
        if not (2 <= full_name_val_len <= 61):
            err_msgs.append(
                "Full name must have a length of at least 2 & at most 61"
            )

        if not full_name_val or len(full_name_val.strip().split(" ")) < 2:
            err_msgs.append(
                "Full name must consist of at least 2 words (first name + last name)"
            )

        self.full_name_err_msgs = ", ".join(err_msgs) if err_msgs else None
        return self

    @model_validator(mode="after")
    def validate_bank_name(self, info: ValidationInfo):
        """Validate the bank name (presence, length 4-50)."""
        err_msgs = []
        bank_name_val = self.application_summary_bank_name

        if not bank_name_val:
            err_msgs.append("Bank name not present")

        # Same correction as for the full name: check the range on its own.
        bank_name_val_len = len(bank_name_val) if bank_name_val else 0
        if not (4 <= bank_name_val_len <= 50):
            err_msgs.append(
                "Bank name must have a length of at least 4 & at most 50"
            )

        self.bank_name_err_msgs = ", ".join(err_msgs) if err_msgs else None
        return self

    @model_validator(mode="after")
    def validate_employer_name(self, info: ValidationInfo):
        """Validate the employer name (presence, contains a letter, not blank)."""
        err_msgs = []
        employer_name_val = self.application_summary_employer_name

        if not employer_name_val:
            err_msgs.append("Employer name not present")

        if not re.search(r"[A-Za-z]", employer_name_val):
            err_msgs.append("Employer name must contain at least one letter")
        if employer_name_val.strip() == "":
            err_msgs.append("Employer name cannot be only whitespace")

        self.employer_name_err_msgs = ", ".join(err_msgs) if err_msgs else None
        return self

    @model_validator(mode="after")
    def validate_complete_address(self, info: ValidationInfo):
        """Validate the applicant's address (presence, length 10-300)."""
        err_msgs = []
        val = self.application_summary_complete_address

        if not val:
            err_msgs.append("Applicant's address not present")

        length = len(val) if val else 0
        if not (10 <= length <= 300):
            err_msgs.append(
                "Applicant's complete address must have a length of at least 10 & at most 300"
            )

        self.complete_employee_address_err_msgs = (
            ", ".join(err_msgs) if err_msgs else None
        )
        return self

    @computed_field
    @property
    def is_incomplete(self) -> bool:
        """Return True when any field validator recorded an error message."""
        return any([
            self.full_name_err_msgs,
            self.bank_name_err_msgs,
            self.employer_name_err_msgs,
            self.complete_employee_address_err_msgs,
        ])
schemas/id.py ADDED
@@ -0,0 +1,291 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import datetime
2
+
3
+ from dateutil.relativedelta import relativedelta
4
+ from pydantic import (
5
+ BaseModel,
6
+ computed_field,
7
+ Field,
8
+ ValidationInfo,
9
+ model_validator,
10
+ ConfigDict
11
+ )
12
+ import pandas as pd
13
+
14
+
15
class UKPassportSchema(BaseModel):
    """Fields extracted from a UK passport, with policy-check results.

    Each "after" validator appends one row per policy check to
    ``validation_policy_status_df`` (Policy, Value, Status, Message) and
    stores a ", "-joined description of the failed checks in the matching
    ``*_err_msgs`` attribute (``None`` when every check passes).
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)
    full_name: str | None = Field(
        default=None,
        description="Applicant's full name. Must consist of at least 2 words, have length gte 2 & lte 61",
        examples=["Jodie Pippa"],
    )
    expiry_date: datetime.date | None = Field(
        default=None,
        description="The passport's expiry date in YYYY-MM-DD format",
        examples=["2028-06-01"],
    )

    # Populated by the validators below.
    full_name_err_msgs: str | None = None
    expiry_date_err_msgs: str | None = None
    validation_policy_status_df: pd.DataFrame = pd.DataFrame(
        columns=["Policy", "Value", "Status", "Message"])

    @model_validator(mode="after")
    def validate_expiry_date(cls, values):
        """Check that the expiry date is present and at least 1 year away.

        Args:
            values: the model instance being validated.

        Returns:
            The same ``values`` object.
        """
        err_msgs = []
        expiry_date_val = values.expiry_date
        status_df = values.validation_policy_status_df

        def record(policy, passed, message):
            # Append one result row at the next integer label.
            status_df.loc[len(status_df)] = [
                policy, expiry_date_val, passed, message]

        if not expiry_date_val:
            err_msgs.append("Expiry date must be present")
            record("Expiry date must be present", False,
                   "Expiry date is not present")
        else:
            record("Expiry date must be present", True,
                   "Expiry date is present")
            # Only compare when a date exists: None cannot be ordered
            # against a date.
            if expiry_date_val < datetime.date.today() + relativedelta(years=1):
                err_msgs.append("Provided passport expires within 1 year")
                record(
                    "Provided passport expiry should be more than 1 year",
                    False,
                    "Provided passport expires within 1 year &/or is expired",
                )
            else:
                record(
                    "Provided passport expiry should be more than 1 year",
                    True,
                    "Provided passport does not expire within 1 year",
                )

        values.expiry_date_err_msgs = ", ".join(err_msgs) if err_msgs else None
        return values

    @model_validator(mode="after")
    def validate_full_name(cls, values, info: ValidationInfo):
        """Match applicant's full name against provided name (case-insensitive).

        Checks, in order: presence, length 2-61, equality with the
        ``application_summary_full_name`` entry of the validation context
        (a missing context or entry counts as a mismatch), and at least
        two space-separated words.

        Args:
            values: the model instance being validated.
            info: validation info; only ``info.context`` is read.

        Returns:
            The same ``values`` object.
        """
        err_msgs = []
        expected = (
            info.context.get("application_summary_full_name")
            if info.context
            else None
        )
        full_name_val = values.full_name
        status_df = values.validation_policy_status_df

        def record(policy, value, passed, message):
            # Append one result row at the next integer label.
            status_df.loc[len(status_df)] = [policy, value, passed, message]

        # Presence.
        if not full_name_val:
            err_msgs.append("Applicant's full name not present")
            record("Applicant's full name should be present", full_name_val,
                   False, "Applicant's full name not present")
        else:
            record("Applicant's full name should be present", full_name_val,
                   True, "Applicant's full name is present")

        # Length. The original "not value and not in-range" test could only
        # fail for an empty name, so out-of-range names were reported as valid.
        full_name_val_len = len(full_name_val) if full_name_val else 0
        if not (2 <= full_name_val_len <= 61):
            err_msgs.append(
                "Full name must have a length of at least 2 & at most 61"
            )
            record("Full name must have a length of at least 2 & at most 61",
                   full_name_val_len, False,
                   "Full name does not have a length of at least 2 & at most 61")
        else:
            record("Full name must have a length of at least 2 & at most 61",
                   full_name_val_len, True,
                   "Full name has a length of at least 2 & at most 61")

        # Case-insensitive match with the applicant-provided name.
        if (
            not expected
            or not full_name_val
            or full_name_val.lower() != expected.lower()
        ):
            err_msgs.append("Name mismatch with provided value")
            record("Name should match with provided value", full_name_val,
                   False, "Name does not match with provided value")
        else:
            record("Name should match with provided value", full_name_val,
                   True, "Name matches with provided value")

        # Word count. Computed once and None-safe: the original called
        # .strip() on a missing name while building the failure row.
        word_count = (
            len(full_name_val.strip().split(" ")) if full_name_val else 0
        )
        if word_count < 2:
            err_msgs.append(
                "Full name must consist of at least 2 words (first name + last name)"
            )
            record(
                "Full name must consist of at least 2 words (first name + last name)",
                word_count, False,
                "Full name does not consist of at least 2 words (first name + last name)",
            )
        else:
            record(
                "Full name must consist of at least 2 words (first name + last name)",
                word_count, True,
                "Full name does consist of at least 2 words (first name + last name)",
            )

        values.full_name_err_msgs = ", ".join(err_msgs) if err_msgs else None
        return values

    @computed_field
    @property
    def is_red_flagged(self) -> bool:
        """Return True when either validator recorded an error message."""
        return bool(self.full_name_err_msgs or self.expiry_date_err_msgs)
155
+
156
+
157
class UKDrivingLicense(BaseModel):
    """Fields extracted from a UK driving licence, with policy-check results.

    The "after" validator appends one row per policy check to
    ``validation_policy_status_df`` (Policy, Value, Status, Message) and
    stores a ", "-joined description of the failed checks in
    ``full_name_err_msgs`` (``None`` when every check passes).
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)
    full_name: str | None = Field(
        default=None,
        description="Applicant's full name. Must consist of at least 2 words, have length gte 2 & lte 61",
        examples=["Jodie Pippa"],
    )

    full_name_err_msgs: str | None = None
    # No validator in this class writes this attribute; it is still read by
    # is_red_flagged.
    expiry_date_err_msgs: str | None = None
    validation_policy_status_df: pd.DataFrame = pd.DataFrame(
        columns=["Policy", "Value", "Status", "Message"])

    @model_validator(mode="after")
    def validate_full_name(cls, values, info: ValidationInfo):
        """Match applicant's full name against provided name (case-insensitive).

        Checks, in order: presence, length 2-61, equality with the
        ``application_summary_full_name`` entry of the validation context
        (a missing context or entry counts as a mismatch), and at least
        two space-separated words.

        Args:
            values: the model instance being validated.
            info: validation info; only ``info.context`` is read.

        Returns:
            The same ``values`` object.
        """
        err_msgs = []
        expected = (
            info.context.get("application_summary_full_name")
            if info.context
            else None
        )
        full_name_val = values.full_name
        status_df = values.validation_policy_status_df

        def record(policy, value, passed, message):
            # Append one result row at the next integer label.
            status_df.loc[len(status_df)] = [policy, value, passed, message]

        # Presence.
        if not full_name_val:
            err_msgs.append("Applicant's full name not present")
            record("Applicant's full name should be present", full_name_val,
                   False, "Applicant's full name not present")
        else:
            record("Applicant's full name should be present", full_name_val,
                   True, "Applicant's full name is present")

        # Length. The original "not value and not in-range" test could only
        # fail for an empty name, so out-of-range names were reported as valid.
        full_name_val_len = len(full_name_val) if full_name_val else 0
        if not (2 <= full_name_val_len <= 61):
            err_msgs.append(
                "Full name must have a length of at least 2 & at most 61"
            )
            record("Full name must have a length of at least 2 & at most 61",
                   full_name_val_len, False,
                   "Full name does not have a length of at least 2 & at most 61")
        else:
            record("Full name must have a length of at least 2 & at most 61",
                   full_name_val_len, True,
                   "Full name has a length of at least 2 & at most 61")

        # Case-insensitive match with the applicant-provided name.
        if (
            not expected
            or not full_name_val
            or full_name_val.lower() != expected.lower()
        ):
            err_msgs.append("Name mismatch with provided value")
            record("Name should match with provided value", full_name_val,
                   False, "Name does not match with provided value")
        else:
            record("Name should match with provided value", full_name_val,
                   True, "Name matches with provided value")

        # Word count. Computed once and None-safe: the original called
        # .strip() on a missing name while building the failure row.
        word_count = (
            len(full_name_val.strip().split(" ")) if full_name_val else 0
        )
        if word_count < 2:
            err_msgs.append(
                "Full name must consist of at least 2 words (first name + last name)"
            )
            record(
                "Full name must consist of at least 2 words (first name + last name)",
                word_count, False,
                "Full name does not consist of at least 2 words (first name + last name)",
            )
        else:
            record(
                "Full name must consist of at least 2 words (first name + last name)",
                word_count, True,
                "Full name does consist of at least 2 words (first name + last name)",
            )

        values.full_name_err_msgs = ", ".join(err_msgs) if err_msgs else None
        return values

    @computed_field
    @property
    def is_red_flagged(self) -> bool:
        """Return True when an error message is set on either err-msgs field."""
        return bool(self.full_name_err_msgs or self.expiry_date_err_msgs)
291
+
schemas/payslip.py ADDED
@@ -0,0 +1,551 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import datetime
2
+ import re
3
+
4
+ from pydantic import (
5
+ BaseModel,
6
+ computed_field,
7
+ Field,
8
+ ValidationInfo,
9
+ model_validator,
10
+ ConfigDict
11
+ )
12
+ import pandas as pd
13
+
14
+
15
+ class UKPayslipSchema(BaseModel):
16
+ model_config = ConfigDict(arbitrary_types_allowed=True)
17
+ pay_period_start_date: datetime.date | None = Field(
18
+ default=None,
19
+ description="Pay period's start date in YYYY-MM-DD format",
20
+ examples=["2025-02-01"],
21
+ )
22
+ pay_period_end_date: datetime.date | None = Field(
23
+ default=None,
24
+ description="Pay period's end date in YYYY-MM-DD format",
25
+ examples=["2025-02-28"],
26
+ )
27
+ pay_period_days: int | None = Field(
28
+ default=None,
29
+ description="pay_period_end_date - pay_period_start_date in days",
30
+ examples=[28],
31
+ )
32
+ pay_date: datetime.date | None = Field(None)
33
+ full_name: str | None = Field(
34
+ default=None,
35
+ description="Applicant's full name. Must consist of at least 2 words, have length gte 2 & lte 61",
36
+ examples=["Jodie Pippa"],
37
+ )
38
+ employer_name: str | None = Field(
39
+ default=None,
40
+ description="Employer name extracted",
41
+ examples=["ABC Ltd"],
42
+ )
43
+ is_basic_pay_net_pay_other_salary_components_present: bool = Field(
44
+ default=False,
45
+ description="Boolean indicating whether Basic Pay, Net Pay, other requisite salary components/line items are present in the payslip",
46
+ examples=[True, False],
47
+ )
48
+ is_tax_deducation_present: bool = Field(
49
+ default=False,
50
+ description="Boolean flag indicating whether Tax Deduction line item is present in the payslip",
51
+ examples=[True, False],
52
+ )
53
+ is_ni_deduction_present: bool = Field(
54
+ default=False,
55
+ description="Boolean flag indicating whether NI/National Insurance deduction line item is present in the payslip",
56
+ examples=[True, False],
57
+ )
58
+ complete_employee_address: str | None = Field(
59
+ default=None,
60
+ description="Employee's complete address as a string",
61
+ examples=["123 Maple Street, London, UK, SW1A 1AA"],
62
+ )
63
+ # employee_number: int | None = Field(
64
+ # default=None,
65
+ # description="Employee number",
66
+ # examples=[3558, 1234],
67
+ # )
68
+
69
+ pay_dates_err_msgs: str | None = None
70
+ full_name_err_msgs: str | None = None
71
+ employer_name_err_msgs: str | None = None
72
+ payslip_line_item_presence_err_msgs: str | None = None
73
+ complete_employee_address_err_msgs: str | None = None
74
+ validation_policy_status_df: pd.DataFrame = pd.DataFrame(
75
+ columns=["Policy", "Value", "Status", "Message"])
76
+ # employee_number_err_msgs: str | None = None
77
+ # is_red_flagged: bool = False
78
+
79
@model_validator(mode="after")
def validate_full_name(self, info: ValidationInfo):
    """Match applicant's full name against provided name (case-insensitive).

    Checks, in order: presence, length 2-61, at least two space-separated
    words, and equality with the ``application_summary_full_name`` entry
    of the validation context (a missing context or entry counts as a
    mismatch). Each check appends a row under the
    "Employer & Customer Names" policy to
    ``self.validation_policy_status_df``; failure texts are joined with
    ", " into ``self.full_name_err_msgs`` (``None`` when all pass).

    Args:
        info: validation info; only ``info.context`` is read.

    Returns:
        ``self``.
    """
    policy = "Employer & Customer Names"
    status_df = self.validation_policy_status_df
    err_msgs = []
    expected = (
        info.context.get("application_summary_full_name")
        if info.context
        else None
    )

    def record(value, passed, message):
        # Append one result row at the next integer label.
        status_df.loc[len(status_df)] = [policy, value, passed, message]

    # Presence.
    if not self.full_name:
        err_msgs.append("Applicant's full name not present")
        record(self.full_name, False, "Applicant's full name is not present")
    else:
        record(self.full_name, True, "Applicant's full name is present")

    # Length.
    full_name_val_len = len(self.full_name) if self.full_name else 0
    if not (2 <= full_name_val_len <= 61):
        err_msgs.append(
            "Full name must have a length of at least 2 & at most 61"
        )
        # The failing row previously carried the success message.
        record(full_name_val_len, False,
               "Full name does not have a length of at least 2 & at most 61")
    else:
        record(full_name_val_len, True,
               "Full name has a length of at least 2 & at most 61")

    # Word count.
    if not self.full_name or len(self.full_name.strip().split(" ")) < 2:
        err_msgs.append(
            "Full name must consist of at least 2 words (first name + last name)"
        )
        record(self.full_name, False,
               "Full name does not consist of at least 2 words (first name + last name)")
    else:
        record(len(self.full_name.strip().split(" ")), True,
               "Full name consists of at least 2 words (first name + last name)")

    # Case-insensitive match with the applicant-provided name.
    if (
        not expected
        or not self.full_name
        or self.full_name.lower() != expected.lower()
    ):
        err_msgs.append("Name mismatch with provided value")
        record(f"{self.full_name}, {expected}", False,
               "Name does not match with provided value")
    else:
        record(f"{self.full_name}, {expected}", True,
               "Name matches with provided value")

    self.full_name_err_msgs = ", ".join(err_msgs) if err_msgs else None
    return self
137
+
138
@model_validator(mode="after")
def validate_employer_name(self, info: ValidationInfo):
    """Match employer against provided employer name (case-insensitive).

    Records a presence row and a match row under the
    "Employer & Customer Names" policy in
    ``self.validation_policy_status_df``. The expected name is the
    ``application_summary_employer_name`` entry of the validation context;
    a missing context or entry counts as a mismatch. Two further rules
    (contains a letter, not only whitespace) add error text without a
    row. Failure texts are joined with ", " into
    ``self.employer_name_err_msgs`` (``None`` when all pass).

    Args:
        info: validation info; only ``info.context`` is read.

    Returns:
        ``self``.
    """
    policy = "Employer & Customer Names"
    status_df = self.validation_policy_status_df
    err_msgs = []
    expected = (
        info.context.get("application_summary_employer_name")
        if info.context
        else None
    )
    employer_name_val = self.employer_name

    def record(value, passed, message):
        # Append one result row at the next integer label.
        status_df.loc[len(status_df)] = [policy, value, passed, message]

    # Presence.
    if not employer_name_val:
        err_msgs.append("Employer name not present")
        record(employer_name_val, False, "Employer name is not present")
    else:
        record(employer_name_val, True, "Employer name is present")

    # Case-insensitive match with the applicant-provided employer.
    is_employer_name_match = (
        expected
        and employer_name_val
        and employer_name_val.lower() == expected.lower()
    )
    if not is_employer_name_match:
        err_msgs.append("Employer name mismatch with provided value")
        record(f"{employer_name_val}, {expected}", False,
               "Employer name does not match with provided value")
    else:
        record(f"{employer_name_val}, {expected}", True,
               "Employer name matches with provided value")

    # Content rules need a string: employer_name defaults to None, and
    # re.search / .strip() on None would raise instead of reporting the
    # name as missing.
    if employer_name_val is not None:
        if not re.search(r"[A-Za-z]", employer_name_val):
            err_msgs.append("Employer name must contain at least one letter")
        if employer_name_val.strip() == "":
            err_msgs.append("Employer name cannot be only whitespace")

    self.employer_name_err_msgs = ", ".join(err_msgs) if err_msgs else None
    return self
190
+
191
@model_validator(mode="after")
def validate_payslip_dates(self):
    """Validate the payslip's pay date / pay period and record outcomes.

    Checks, in order:

    1. both pay-period dates are present (otherwise "Undated Payslips");
    2. if ``pay_date`` is set, it lies within the last 35 days and is not
       in the future;
    3. otherwise the pay period is used instead: the end date must lie in
       that same 35-day window, and the period must start no earlier than
       the first day of the previous calendar month, start before it
       ends, and end no later than today;
    4. the start date precedes the end date, and the period spans at
       least 28 days.

    Each outcome is appended as a row to
    ``self.validation_policy_status_df``; failure messages are joined with
    ``", "`` into ``self.pay_dates_err_msgs`` (``None`` when empty).

    Missing period dates are reported as failed checks instead of being
    compared against ``date`` objects, which would raise ``TypeError``.

    NOTE(review): the comparisons assume ``pay_date`` and the pay-period
    fields hold ``datetime.date`` values - confirm against the field
    declarations.

    Returns:
        The model instance itself, as required for an "after" validator.
    """
    err_msgs = []
    today = datetime.date.today()
    threshold_date = today - datetime.timedelta(days=35)

    period_start = self.pay_period_start_date
    period_end = self.pay_period_end_date
    period_label = f"{period_start}, {period_end}"

    def record(policy, value, passed, detail):
        # Append one row at the next integer label of the status table.
        status_df = self.validation_policy_status_df
        status_df.loc[len(status_df)] = [policy, value, passed, detail]

    # 1. Both period dates must be present.
    if not period_start or not period_end:
        err_msgs.append("Undated Payslips")
        record("Undated Payslips", period_label, False, "Undated payslip")
    else:
        record("Undated Payslips", period_label, True, "Dated payslip")

    if self.pay_date:
        # 2. Recency of the explicit pay date.
        if not (threshold_date <= self.pay_date <= today):
            err_msgs.append(
                "Pay date must be within the last 35 days & not in the future"
            )
            record(
                "Pay Date Requirement",
                self.pay_date,
                False,
                "Pay date is not within the last 35 days & not in the future",
            )
        else:
            record(
                "Pay Date Requirement",
                self.pay_date,
                True,
                "Pay date is within the last 35 days & not in the future",
            )
    else:
        # 3a. No pay date: fall back to the period end date. A missing end
        # date fails the check rather than being compared with a date.
        end_is_recent = bool(period_end) and (
            threshold_date <= period_end <= today
        )
        if not end_is_recent:
            err_msgs.append(
                "Pay period's end date must be within the last 35 days & not in the future"
            )
            record(
                "Pay Period End Date (DD/MM/YYYY, if no pay date)",
                period_end,  # the value actually checked (pay_date is empty here)
                False,
                "Pay date is not within the last 35 days &/or in the future",
            )
        else:
            record(
                "Pay Period End Date (DD/MM/YYYY, if no pay date)",
                period_end,
                True,
                "Pay date is within the last 35 days & not in the future",
            )

        # 3b. The period must not predate the previous calendar month.
        prev_month_end = today.replace(day=1) - datetime.timedelta(days=1)
        prev_month_start = prev_month_end.replace(day=1)
        period_is_current = (
            bool(period_start)
            and bool(period_end)
            and prev_month_start <= period_start
            and period_start < period_end <= today
        )
        if not period_is_current:
            err_msgs.append(
                "Payslip date(s) must not be older than those of the last calendar month"
            )
            record(
                "Pay Period Month (MM/YYYY, if no pay date) basis pay period duration",
                period_label,
                False,
                "Payslip date(s) is older than those of the last calendar month",
            )
        else:
            record(
                "Pay Period Month (MM/YYYY, if no pay date) basis pay period duration",
                period_label,
                True,
                "Payslip date(s) is not older than those of the last calendar month",
            )

    # 4. Ordering and minimum length of the pay period.
    if period_start and period_end:
        if period_start >= period_end:
            err_msgs.append(
                "Pay period's start date must be before the end date"
            )
            record(
                "Pay Period Start & End Dates",
                period_label,
                False,
                "Pay period's start date is not before the end date",
            )
        else:
            record(
                "Pay Period Start & End Dates",
                period_label,
                True,
                "Pay period's start date is before the end date",
            )

        period_days = (period_end - period_start).days
        if period_days < 28:
            err_msgs.append(
                "Pay period's start date & end date must have a gap of at least 28 days"
            )
            record(
                "Submission Requirement (Monthly Pay)",
                period_days,
                False,
                "Pay period's start date & end date donot have a gap of at least 28 days",
            )
        else:
            record(
                "Submission Requirement (Monthly Pay)",
                period_days,
                True,
                "Pay period's start date & end date have a gap of at least 28 days",
            )
    else:
        # Without both dates neither requirement can be met; the
        # "Undated Payslips" error above already covers the message list.
        record(
            "Pay Period Start & End Dates",
            period_label,
            False,
            "Pay period's start date is not before the end date",
        )
        record(
            "Submission Requirement (Monthly Pay)",
            period_label,
            False,
            "Pay period's start date & end date donot have a gap of at least 28 days",
        )

    self.pay_dates_err_msgs = ", ".join(err_msgs) if err_msgs else None
    return self
369
+
370
@model_validator(mode="after")
def validate_payslip_components_checks(self):
    """Record whether the required payslip line items were detected.

    Three boolean-like fields are inspected in order: salary components,
    tax deduction and NI deduction. For each one a row
    ``[policy, flag value, passed, detail]`` is appended to
    ``self.validation_policy_status_df``; a falsy flag also contributes an
    error message. The messages are joined with ``", "`` into
    ``self.payslip_line_item_presence_err_msgs`` (``None`` when empty).

    Returns:
        The model instance itself, as required for an "after" validator.
    """
    # (field name, policy, error message, failure detail, success detail)
    line_item_checks = (
        (
            "is_basic_pay_net_pay_other_salary_components_present",
            "Requisite salary line items",
            "Basic salary, Net Salary and/or other requisite salary components not present",
            "Basic salary, Net Salary and/or other requisite salary components not present",
            "Basic salary, Net Salary and/or other requisite salary components are present",
        ),
        (
            "is_tax_deducation_present",
            "Tax & NI Contributions",
            "Tax Deduction line item must be present",
            "Tax Deduction line item is not present",
            "Tax Deduction line item is present",
        ),
        (
            "is_ni_deduction_present",
            "Tax & NI Contributions",
            "NI/National Insurance line item must be present",
            "NI/National Insurance line item is not present",
            "NI/National Insurance line item is present",
        ),
    )

    problems = []
    for field_name, policy, error_text, failed_detail, passed_detail in line_item_checks:
        # Read the flag only when its turn comes, one check at a time.
        flag = getattr(self, field_name)
        passed = bool(flag)
        if not passed:
            problems.append(error_text)
        table = self.validation_policy_status_df
        table.loc[len(table)] = [
            policy,
            flag,
            passed,
            passed_detail if passed else failed_detail,
        ]

    if problems:
        self.payslip_line_item_presence_err_msgs = ", ".join(problems)
    else:
        self.payslip_line_item_presence_err_msgs = None
    return self
446
+
447
@model_validator(mode="after")
def validate_complete_address(self, info: ValidationInfo):
    """Check the employee address and record each policy outcome.

    Three checks run in order: the address is present, its length is
    between 10 and 300 characters (a missing address counts as length 0),
    and it equals ``application_summary_complete_address`` from the
    validation context (case-insensitive). Each outcome is appended as an
    "Applicant Address" row to ``self.validation_policy_status_df``;
    failure messages are joined with ``", "`` into
    ``self.complete_employee_address_err_msgs`` (``None`` when empty).

    Args:
        info: Pydantic validation info; an empty ``info.context`` means
            no expected address is available.

    Returns:
        The model instance itself, as required for an "after" validator.
    """
    context = info.context
    expected = None
    if context:
        expected = context.get("application_summary_complete_address")
    address = self.complete_employee_address
    problems = []

    def log_row(value, passed, detail):
        # Append one row at the next integer label of the status table.
        table = self.validation_policy_status_df
        table.loc[len(table)] = ["Applicant Address", value, passed, detail]

    # Presence.
    if address:
        log_row(address, True, "Applicant's address is present")
    else:
        problems.append("Applicant's address not present")
        log_row(address, False, "Applicant's address not present")

    # Length window.
    length = len(address) if address else 0
    if 10 <= length <= 300:
        log_row(
            length,
            True,
            "Applicant's complete address has a length of at least 10 & at most 300",
        )
    else:
        problems.append(
            "Applicant's complete address must have a length of at least 10 & at most 300"
        )
        log_row(
            length,
            False,
            "Applicant's complete address does not have a length of at least 10 & at most 300",
        )

    # Case-insensitive equality with the caller-supplied address.
    pair = f"{address}, {expected}"
    if expected and address and address.lower() == expected.lower():
        log_row(pair, True, "Complete address matches with provided value")
    else:
        problems.append("Complete address mismatch with provided value")
        log_row(pair, False, "Complete address mismatch with provided value")

    if problems:
        self.complete_employee_address_err_msgs = ", ".join(problems)
    else:
        self.complete_employee_address_err_msgs = None
    return self
529
+
530
+ # @model_validator(mode="after")
531
+ # def validate_employee_number(self):
532
+ # try:
533
+ # if self.employee_number and self.employee_number <= 25:
534
+ # self.complete_employee_address_err_msgs = "Employee number low"
535
+ # return self
536
+ # except Exception as e:
537
+ # raise
538
+
539
@computed_field
@property
def is_red_flagged(self) -> bool:
    """Return True when any validator stored a non-empty error summary."""
    error_summaries = (
        self.pay_dates_err_msgs,
        self.full_name_err_msgs,
        self.employer_name_err_msgs,
        self.payslip_line_item_presence_err_msgs,
        self.complete_employee_address_err_msgs,
        # self.employee_number_err_msgs is not part of the flag.
    )
    return any(error_summaries)
schemas/uk_address.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import BaseModel, Field, field_validator
2
+ import re
3
+
4
# Matched against a postcode with all whitespace removed and upper-cased.
UK_POSTCODE_REGEX = re.compile(r"^(GIR ?0AA|[A-Z]{1,2}\d{1,2}[A-Z]?\s?\d[A-Z]{2})$", re.IGNORECASE)


class UKAddress(BaseModel):
    """A UK postal address whose fields are normalised during validation.

    Every validator cleans the value first and validates the cleaned
    form, so the stored value is always the one that was checked.
    """

    street_address: str = Field(..., min_length=5, max_length=100)
    city: str = Field(..., min_length=2, max_length=50)
    postcode: str
    country: str = "United Kingdom"

    @field_validator("street_address")
    @classmethod
    def validate_street_address(cls, v: str) -> str:
        """Strip the street address and check its length and characters.

        Raises:
            ValueError: if the stripped value is shorter than 5 characters
                or contains characters outside the allowed set.
        """
        street = v.strip()
        # Surrounding whitespace must not count towards the minimum length.
        if len(street) < 5:
            raise ValueError("Street address must be at least 5 characters long")
        if not re.match(r"^[a-zA-Z0-9\s,.'\-/#()]{5,100}$", street):
            raise ValueError("Invalid characters in street address")
        return street

    @field_validator("city")
    @classmethod
    def validate_city(cls, v: str) -> str:
        """Strip the city name and check its length and characters.

        Raises:
            ValueError: if the stripped value is shorter than 2 characters
                or contains anything other than letters, whitespace,
                hyphens or apostrophes.
        """
        city = v.strip()
        # A whitespace-only value would otherwise be stored as "".
        if len(city) < 2:
            raise ValueError("City must be at least 2 characters long")
        if not re.match(r"^[a-zA-Z\s\-']+$", city):
            raise ValueError("City must only contain alphabetic characters, spaces, hyphens, or apostrophes")
        return city

    @field_validator("postcode")
    @classmethod
    def validate_postcode(cls, v: str) -> str:
        """Validate a UK postcode and return it as ``OUTWARD INWARD``.

        All whitespace is removed and the value is upper-cased before
        matching. The returned value is rebuilt from that cleaned form
        with a single space before the three-character inward code, e.g.
        ``" sw1a1aa "`` becomes ``"SW1A 1AA"``.

        Raises:
            ValueError: if the cleaned value is not a UK postcode.
        """
        compact = "".join(v.split()).upper()
        if not UK_POSTCODE_REGEX.match(compact):
            raise ValueError("Invalid UK postcode format")
        # Every accepted postcode ends with a 3-character inward code.
        return f"{compact[:-3]} {compact[-3:]}"

    @field_validator("country")
    @classmethod
    def validate_country(cls, v: str) -> str:
        """Accept "United Kingdom" or "UK" (any case) and store the full name.

        Raises:
            ValueError: for any other country value.
        """
        if v.strip().lower() not in ["united kingdom", "uk"]:
            raise ValueError("Country must be United Kingdom or UK")
        return "United Kingdom"
utils/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+
2
+ from .image_utils import im_2_b64, encode_image, load_pdf_as_image, generate_metadata
3
+ from .process_files import process_uploaded_files
4
+ from .logger import setup_logger