pratham0011 committed on
Commit
b4edeae
·
verified ·
1 Parent(s): 0a83c23

Upload 9 files

Browse files
.dockerignore ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Git
2
+ .git
3
+ .gitignore
4
+
5
+ # Python
6
+ __pycache__/
7
+ *.py[cod]
8
+ *$py.class
9
+ *.so
10
+ .Python
11
+ env/
12
+ build/
13
+ develop-eggs/
14
+ dist/
15
+ downloads/
16
+ eggs/
17
+ .eggs/
18
+ lib/
19
+ lib64/
20
+ parts/
21
+ sdist/
22
+ var/
23
+ *.egg-info/
24
+ .installed.cfg
25
+ *.egg
26
+
27
+ # Virtual Environment
28
+ venv/
29
+ ENV/
30
+
31
+ # IDE
32
+ .idea/
33
+ .vscode/
34
+ *.swp
35
+ *.swo
36
+
37
+ # OS specific
38
+ .DS_Store
39
+ Thumbs.db
40
+
41
+ # Docker
42
+ .dockerignore
43
+ docker-compose.yml
44
+
45
+ # Large files and directories
46
+ *.model
47
+ *.bin
48
+ *.h5
49
+ *.ckpt
50
+ *.pt
51
+ *.pth
52
+ models/
Dockerfile ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.10-slim

WORKDIR /app

# Create a non-root user to run the application
RUN groupadd -r appuser && useradd -r -g appuser appuser

# Install system dependencies
RUN apt-get update && apt-get install -y \
    build-essential \
    curl \
    git \
    && rm -rf /var/lib/apt/lists/*

# Create the model cache directory; it is world-writable so the non-root
# user can download models into it at runtime
RUN mkdir -p /app/model_cache && \
    chmod 777 /app/model_cache

# Copy requirements first so the dependency layer is cached across code changes
COPY requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY . .
# COPY sources must live inside the build context; a "../" prefix cannot
# reach the parent of the context root, so these paths are written
# relative to the context instead.
COPY utils /app/utils
COPY recommender.py /app/recommender.py

# Hand ownership of the application tree to the non-root user
RUN chown -R appuser:appuser /app

# Expose port for FastAPI
EXPOSE 7860

# Set environment variables
ENV PYTHONUNBUFFERED=1
ENV TRANSFORMERS_CACHE=/app/model_cache
ENV HF_HOME=/app/model_cache
ENV SENTENCE_TRANSFORMERS_HOME=/app/model_cache

# Switch to non-root user
USER appuser

# Command to run the application
CMD ["python", "main.py"]
app.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, HTTPException
2
+ from pydantic import BaseModel
3
+ from typing import List
4
+ import uvicorn
5
+ import os
6
+ import sys
7
+
8
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
9
+
10
+ from recommender import SHLRecommender
11
+ from utils.validators import url as is_valid_url
12
+
13
app = FastAPI(
    title="SHL Test Recommender API",
    description="API for recommending SHL tests based on job descriptions or queries",
    version="1.0.0",
    docs_url="/docs",
    redoc_url="/redoc"
)

# Add CORS middleware to allow requests from any origin.
# Credentials are disabled: wildcard origins/methods/headers must not be
# combined with allow_credentials=True, and none of the endpoints in this
# module read cookies or authorization headers.
from fastapi.middleware.cors import CORSMiddleware
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Allow all origins
    allow_credentials=False,
    allow_methods=["*"],  # Allow all methods
    allow_headers=["*"],  # Allow all headers
)

# Single shared recommender instance used by every endpoint below.
recommender = SHLRecommender()
32
+
33
+ # Define request and response models
34
class RecommendRequest(BaseModel):
    """Request body accepted by the POST /recommend endpoint."""

    # Free-text query; the handler also checks whether it is a URL.
    query: str
    # Upper bound on the number of assessments to return (defaults to 10).
    max_recommendations: int = 10
37
+
38
class Assessment(BaseModel):
    """A single recommended assessment in the API response."""

    url: str
    adaptive_support: str
    description: str
    duration: int
    remote_support: str
    test_type: List[str]
45
+
46
class RecommendationResponse(BaseModel):
    """Response envelope for /recommend: the list of recommended assessments."""

    recommended_assessments: List[Assessment]
48
+
49
+ # API endpoints
50
@app.get("/health")
async def health_check():
    """Report whether the shared recommender looks ready to serve requests.

    Returns ``{"status": "healthy"}`` only when the recommender is truthy,
    has a non-empty ``df``, exposes ``embedding_model``, ``model`` and
    ``tokenizer``, and has a non-empty ``product_embeddings``. Any failed
    check, or any exception raised while checking, yields
    ``{"status": "unhealthy"}``.
    """
    needed_attrs = ('embedding_model', 'model', 'tokenizer', 'product_embeddings')
    try:
        # The checks run in the same order as a chain of guard clauses would;
        # `and` short-circuits on the first one that fails.
        ready = (
            bool(recommender)
            and hasattr(recommender, 'df')
            and not recommender.df.empty
            and all(hasattr(recommender, attr) for attr in needed_attrs)
            and len(recommender.product_embeddings) != 0
        )
    except Exception:
        ready = False

    return {"status": "healthy" if ready else "unhealthy"}
65
+
66
@app.get("/")
async def root():
    """Return a static welcome message for the API root."""
    welcome = {"message": "Welcome to the SHL Test Recommender API."}
    return welcome
69
+
70
@app.post("/optimize")
async def optimize_memory():
    """Ask the shared recommender to optimize its memory usage.

    Returns a success payload when ``recommender.optimize_memory()``
    completes; any exception it raises is reported as an HTTP 500 whose
    detail is the exception text.
    """
    try:
        recommender.optimize_memory()
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))

    return {"status": "success", "message": "Memory optimized successfully"}
77
+
78
+ # Main recommend endpoint
79
@app.post("/recommend", response_model=RecommendationResponse)
async def recommend(request: RecommendRequest):
    """Handle POST /recommend by delegating to ``process_recommendation``."""
    response = await process_recommendation(
        request.query,
        request.max_recommendations,
    )
    return response
82
+
83
+
84
def _duration_to_minutes(raw_duration, default=60):
    """Parse a catalog duration value such as "20 minutes" into an int.

    Every digit character in *raw_duration* is concatenated and parsed.
    Values with no digits (for example "Unknown") and non-string values
    fall back to *default*.
    """
    try:
        return int(''.join(filter(str.isdigit, raw_duration)))
    except (TypeError, ValueError):
        return default


async def process_recommendation(query: str, max_recommendations: int):
    """Build the /recommend response for *query*.

    Args:
        query: Free-text query or a URL; URL detection is delegated to
            ``is_valid_url``.
        max_recommendations: Maximum number of results requested from the
            recommender.

    Returns:
        A ``RecommendationResponse`` with one ``Assessment`` per
        recommendation returned by the recommender.

    Raises:
        HTTPException: Status 500 with the original error text when anything
            fails; the original exception is chained as the cause.
    """
    try:
        is_url = is_valid_url(query)

        recommendations = recommender.get_recommendations(
            query,
            is_url=is_url,
            max_recommendations=max_recommendations
        )

        formatted_assessments = []
        for rec in recommendations:
            duration_int = _duration_to_minutes(rec['Duration'])

            # Missing or "Unknown" test types are reported under a generic
            # label; the same label feeds the generated description.
            raw_test_type = rec['Test Type']
            if raw_test_type and raw_test_type != "Unknown":
                test_type = raw_test_type
            else:
                test_type = "General Assessment"

            description = recommender.generate_test_description(
                test_name=rec['Test Name'],
                test_type=test_type
            )

            formatted_assessments.append(
                Assessment(
                    url=rec['Link'],
                    adaptive_support="Yes" if rec['Adaptive/IRT'] == "Yes" else "No",
                    description=description,
                    duration=duration_int,
                    remote_support="Yes" if rec['Remote Testing'] == "Yes" else "No",
                    test_type=[test_type]
                )
            )

        return RecommendationResponse(
            recommended_assessments=formatted_assessments
        )
    except Exception as e:
        # Best-effort cleanup after a failure; a cleanup error must not mask
        # the original exception reported to the client.
        try:
            recommender.optimize_memory()
        except Exception:
            pass
        raise HTTPException(status_code=500, detail=str(e)) from e
131
+
132
if __name__ == "__main__":
    # Check if running on Hugging Face Spaces (SPACE_ID is set there)
    IS_HF_SPACE = os.environ.get('SPACE_ID') is not None
    port = 7860 if IS_HF_SPACE else 8000

    print(f"Starting FastAPI server on port {port}")
    # Auto-reload is a development aid (it watches files and restarts the
    # server), so it is enabled for local runs only and not on a Space.
    uvicorn.run("app:app", host="0.0.0.0", port=port, reload=not IS_HF_SPACE)
main.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import uvicorn
3
+
4
+ from app import app
5
+
6
if __name__ == "__main__":
    # SPACE_ID being present in the environment marks a Hugging Face Space.
    IS_HF_SPACE = 'SPACE_ID' in os.environ
    if IS_HF_SPACE:
        port = 7860
    else:
        port = 8000

    print(f"Starting SHL Test Recommender API on port {port}")
    uvicorn.run(app, host="0.0.0.0", port=port)
requirements.txt ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn
3
+ gradio==5.20.0
4
+ transformers
5
+ pandas
6
+ scikit-learn
7
+ sentence-transformers
8
+ torch
9
+ beautifulsoup4
10
+ requests
11
+ python-multipart
12
+ huggingface-hub
13
+ safetensors
14
+ tqdm
15
+ typing-extensions
16
+ pydantic
17
+ psutil
18
+ accelerate
19
+ numpy
20
+ scipy
21
+ joblib
22
+ aiofiles
23
+ anyio
24
+ httpx
25
+ jinja2
26
+ markdown-it-py
27
+ pillow
28
+ pyyaml
29
+ regex
30
+ rich
31
+ setuptools
32
+ six
33
+ sympy
34
+ tokenizers
35
+ tomlkit
36
+ typing-inspection
utils/__pycache__/validators.cpython-313.pyc ADDED
Binary file (679 Bytes). View file
 
utils/data.csv ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Test Name,Remote Testing (Yes/No),Adaptive/IRT (Yes/No),Test Type,Link,Duration
2
+ Teller 7.0,Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/teller-7-0/,20 minutes
3
+ Contact Center Sales & Service + 8.0,Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/contact-center-sales-and-service-8-0/,20 minutes
4
+ Director - Short Form,Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/director-short-form/,20 minutes
5
+ "Bookkeeping, Accounting, Auditing Clerk Short Form",Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/bookkeeping-accounting-auditing-clerk-short-form/,20 minutes
6
+ .NET MVC (New),Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/net-mvc-new/,45 minutes
7
+ Written Spanish,Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/written-spanish/,20 minutes
8
+ Workplace Safety - Individual 7.1 (Americas),Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/workplace-safety-individual-7-1-solution/,20 minutes
9
+ Contact Center Sales & Service 8.0,Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/contact-center-sales-and-service-8-0-4268/,20 minutes
10
+ Adobe Photoshop CC,Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/adobe-photoshop-cc/,20 minutes
11
+ Apprentice 8.0 Job Focused Assessment,Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/apprentice-8-0-job-focused-assessment/,20 minutes
12
+ Apprentice + 8.0 Job Focused Assessment,Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/apprentice-8-0-job-focused-assessment-4261/,20 minutes
13
+ Apache HBase (New),Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/apache-hbase-new/,20 minutes
14
+ .NET Framework 4.5,Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/net-framework-4-5/,45 minutes
15
+ Automata - Fix (New),Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/automata-fix-new/,20 minutes
16
+ Workplace Safety Solution,Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/workplace-safety-solution/,20 minutes
17
+ Workplace Safety - Team 7.1 (International),Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/workplace-safety-team-7-1-solution/,20 minutes
18
+ Global Skills Development Report,Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/global-skills-development-report/,45 minutes
19
+ Contact Center Team Lead/Coach - Short Form,Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/contact-center-team-leadcoach-short-form/,20 minutes
20
+ Entry Level Customer Service 7.1 (International),Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/entry-level-customer-service-%28retail-and-cc%29-7-1/,20 minutes
21
+ Written English v1,Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/written-english-v1/,20 minutes
22
+ AngularJS (New),Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/angularjs-new/,45 minutes
23
+ Accounts Payable (New),Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/accounts-payable-new/,20 minutes
24
+ 360 Digital Report,Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/360-digital-report/,20 minutes
25
+ Contact Centre Agent Solution - UK,Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/contact-centre-agent-solution-uk/,20 minutes
26
+ .NET WPF (New),Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/net-wpf-new/,45 minutes
27
+ AI Skills,Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/ai-skills/,20 minutes
28
+ Workplace Safety - Team 7.0 Solution,Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/workplace-safety-team-7-0-solution/,20 minutes
29
+ Claims/Operations Supervisor Solution,Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/claimsoperations-supervisor-solution/,20 minutes
30
+ Event Sales Manager Solution,Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/event-sales-manager-solution/,20 minutes
31
+ Account Manager Solution,Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/account-manager-solution/,20 minutes
32
+ Agile Software Development,Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/agile-software-development/,7 minutes
33
+ Entry Level Sales Sift Out 7.1,Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/entry-level-sales-sift-out-7-1/,20 minutes
34
+ Transcriptionist Solution,Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/transcriptionist-solution/,20 minutes
35
+ Android Development (New),Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/android-development-new/,45 minutes
36
+ Agency Manager Solution,Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/agency-manager-solution/,20 minutes
37
+ Teller with Sales - Short Form,Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/teller-with-sales-short-form/,20 minutes
38
+ Workplace Safety - Team 7.1 (Americas),Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/workplace-safety-team-7-1-%28americas%29/,20 minutes
39
+ Contact Center Manager - Short Form,Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/contact-center-manager-short-form/,20 minutes
40
+ ASP.NET 4.5,Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/asp-net-4-5/,45 minutes
41
+ Bank Operations Supervisor - Short Form,Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/bank-operations-supervisor-short-form/,20 minutes
42
+ 360° Multi-Rater Feedback System (MFS),Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/360-multi-rater-feedback-system-mfs/,20 minutes
43
+ ADO.NET (New),Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/ado-net-new/,45 minutes
44
+ Apache Hadoop (New),Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/apache-hadoop-new/,20 minutes
45
+ Executive - Short Form,Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/executive-short-form/,20 minutes
46
+ Entry level Sales 7.1 (International),Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/entry-level-sales-7-1/,20 minutes
47
+ Zabbix (New),Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/zabbix-new/,20 minutes
48
+ Customer Service - Short Form,Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/customer-service-short-form/,20 minutes
49
+ Customer Service - Short Form - UK,Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/customer-service-short-form-uk/,20 minutes
50
+ Entry Level Customer Service 7.1 (South Africa),Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/entry-level-customer-service-7-1-%28south-africa%29/,20 minutes
51
+ Automata - SQL (New),Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/automata-sql-new/,45 minutes
52
+ Entry level Sales 7.1 (Americas),Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/entry-level-sales-7-1-%28americas%29/,20 minutes
53
+ .NET WCF (New),Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/net-wcf-new/,45 minutes
54
+ Financial Professional - Short Form,Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/financial-professional-short-form/,20 minutes
55
+ Entry Level Cashier 7.1 (Americas),Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/entry-level-cashier-7-1-%28americas%29/,20 minutes
56
+ Branch Manager - Short Form,Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/branch-manager-short-form/,20 minutes
57
+ Administrative Professional - Short Form,Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/administrative-professional-short-form/,20 minutes
58
+ Cashier Solution,Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/cashier-solution/,20 minutes
59
+ Accounts Receivable (New),Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/accounts-receivable-new/,20 minutes
60
+ Aerospace Engineering (New),Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/aerospace-engineering-new/,20 minutes
61
+ Aeronautical Engineering (New),Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/aeronautical-engineering-new/,20 minutes
62
+ Accounts Receivable Simulation (New),Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/accounts-receivable-simulation-new/,20 minutes
63
+ Apache Spark (New),Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/apache-spark-new/,20 minutes
64
+ .NET MVVM (New),Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/net-mvvm-new/,45 minutes
65
+ Accounts Payable Simulation (New),Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/accounts-payable-simulation-new/,20 minutes
66
+ Entry Level Cashier 7.1 (International),Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/entry-level-cashier-7-1-%28international%29/,20 minutes
67
+ .NET XAML (New),Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/net-xaml-new/,45 minutes
68
+ Apache Hive (New),Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/apache-hive-new/,20 minutes
69
+ Assessment and Development Center Exercises,Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/assessment-and-development-center-exercises/,45 minutes
70
+ Bank Administrative Assistant - Short Form,Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/bank-administrative-assistant-short-form/,20 minutes
71
+ Agile Testing (New),Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/agile-testing-new/,20 minutes
72
+ Automata (New),Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/automata-new/,20 minutes
73
+ Angular 6 (New),Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/angular-6-new/,45 minutes
74
+ Bilingual Spanish Reservation Agent Solution,Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/bilingual-spanish-reservation-agent-solution/,20 minutes
75
+ Apache Pig (New),Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/apache-pig-new/,20 minutes
76
+ ASP .NET with C# (New),Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/asp-net-with-c-new/,45 minutes
77
+ Workplace Safety - Individual 7.0 Solution,Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/workplace-safety-individual-7-0-solution/,20 minutes
78
+ Entry Level Customer Service 7.1 (Americas),Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/entry-level-customer-service-7-1-%28americas%29/,20 minutes
79
+ Apache Hadoop Extensions (New),Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/apache-hadoop-extensions-new/,20 minutes
80
+ Adobe Experience Manager (New),Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/adobe-experience-manager-new/,20 minutes
81
+ District/Regional Manager Solution,Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/districtregional-manager-solution/,20 minutes
82
+ Contact Center Customer Service + 8.0,Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/contact-center-customer-service-8-0/,20 minutes
83
+ Bank Collections Agent - Short Form,Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/bank-collections-agent-short-form/,20 minutes
84
+ Contact Center Customer Service 8.0,Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/contact-center-customer-service-8-0-4269/,20 minutes
85
+ Amazon Web Services (AWS) Development (New),Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/amazon-web-services-aws-development-new/,45 minutes
86
+ Customer Service with Sales - Short Form,Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/customer-service-with-sales-short-form/,20 minutes
87
+ Apache Kafka (New),Yes,Yes,Cognitive Ability,https://www.shl.com/solutions/products/product-catalog/view/apache-kafka-new/,20 minutes
88
+ Behavioral Assessments,Unknown,Unknown,Personality Assessment,https://www.shl.com/solutions/products/assessments/,25 minutes
89
+ Virtual Assessment & Development Centers,Unknown,Yes,Unknown,https://www.shl.com/solutions/products/assessments/,45 minutes
90
+ Personality Assessments,Unknown,Unknown,Personality Assessment,https://www.shl.com/solutions/products/assessments/,25 minutes
91
+ Cognitive Assessments,Unknown,Unknown,Cognitive Ability,https://www.shl.com/solutions/products/assessments/,20 minutes
92
+ Skills and Simulations,Unknown,Unknown,Unknown,https://www.shl.com/solutions/products/assessments/,Unknown
93
+ Job Focused Assessments,Unknown,Unknown,Unknown,https://www.shl.com/solutions/products/assessments/,Unknown
94
+ verify g plus,Yes,Yes,Unknown,https://www.shl.com/assets/documents/rebranded-assets/product-factsheet-verify-g-plus.pdf,Unknown
95
+ verify deductive reasoning,Yes,Yes,Cognitive Ability,https://www.shl.com/assets/documents/rebranded-assets/product-factsheet-verify-deductive-reasoning.pdf,20 minutes
96
+ verify interactive inductive,Yes,Yes,Unknown,https://www.shl.com/assets/documents/rebranded-assets/product-factsheet-verify-interactive-inductive.pdf,20 minutes
97
+ verify interactive numerical reasoning,Yes,Yes,Cognitive Ability,https://www.shl.com/assets/documents/rebranded-assets/product-factsheet-verify-interactive-numerical-reasoning.pdf,20 minutes
98
+ SHL's Cognitive Assessments,Unknown,Unknown,Cognitive Ability,https://www.shl.com/solutions/products/assessments/cognitive-assessments/,20 minutes
99
+ SHL Verify G+ Test,Unknown,Yes,Unknown,https://www.shl.com/assets/documents/rebranded-assets/product-factsheet-verify-g-plus.pdf,Unknown
100
+ SHL Verify Deductive Reasoning,Unknown,Yes,Unknown,https://www.shl.com/assets/documents/rebranded-assets/product-factsheet-verify-deductive-reasoning.pdf,20 minutes
101
+ SHL Verify Interactive – Inductive Reasoning,Unknown,Yes,Unknown,https://www.shl.com/assets/documents/rebranded-assets/product-factsheet-verify-interactive-inductive.pdf,20 minutes
102
+ SHL Verify Interactive – Numerical Reasoning,Unknown,Yes,Unknown,https://www.shl.com/assets/documents/rebranded-assets/product-factsheet-verify-interactive-numerical-reasoning.pdf,20 minutes
103
+ Our Assessment Science,Unknown,Unknown,Unknown,https://www.shl.com/solutions/products/assessments/cognitive-assessments/,Unknown
104
+ Graduate Hiring,Unknown,Unknown,Unknown,https://www.shl.com/solutions/products/assessments/cognitive-assessments/,Unknown
105
+ Technology Hiring,Unknown,Unknown,Unknown,https://www.shl.com/solutions/products/assessments/cognitive-assessments/,Unknown
106
+ Competency Fit,Unknown,Unknown,Unknown,https://www.shl.com/solutions/products/assessments/cognitive-assessments/,Unknown
107
+ SHL Mobilize,Unknown,Unknown,Unknown,https://www.shl.com/solutions/products/assessments/cognitive-assessments/,Unknown
108
+ SHL Occupational Personality Questionnaire (OPQ),Unknown,Unknown,Personality Assessment,https://www.shl.com/solutions/products/assessments/personality-assessment/,25 minutes
109
+ SHL Motivational Questionnaire (MQ),Unknown,Unknown,Unknown,https://www.shl.com/solutions/products/assessments/personality-assessment/,Unknown
110
+ Technical Skills,Unknown,Unknown,Technical Skills,https://www.shl.com/solutions/products/assessments/skills-and-simulations/,45 minutes
111
+ Coding Simulations,Unknown,Unknown,Technical Skills,https://www.shl.com/solutions/products/assessments/skills-and-simulations/,45 minutes
112
+ Language Skills,Unknown,Unknown,Unknown,https://www.shl.com/solutions/products/assessments/skills-and-simulations/,Unknown
113
+ Contact Center Simulation,Unknown,Unknown,Unknown,https://www.shl.com/solutions/products/assessments/skills-and-simulations/,Unknown
114
+ Business Skills,Unknown,Unknown,Unknown,https://www.shl.com/solutions/products/assessments/skills-and-simulations/,Unknown
115
+ "And many, many more!",Unknown,Unknown,Unknown,https://www.shl.com/solutions/products/assessments/skills-and-simulations/,Unknown
utils/shl_product_scraper.py ADDED
@@ -0,0 +1,787 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ import csv
4
+ import re
5
+ import time
6
+ from urllib.parse import urljoin
7
+
8
+ def scrape_shl_products():
9
+ # URL to scrape
10
+ base_url = "https://www.shl.com"
11
+ catalog_url = "https://www.shl.com/solutions/products/product-catalog/"
12
+
13
+ # Send HTTP request with improved headers
14
+ headers = {
15
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
16
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
17
+ 'Accept-Language': 'en-US,en;q=0.5',
18
+ 'Connection': 'keep-alive',
19
+ 'Upgrade-Insecure-Requests': '1',
20
+ 'Cache-Control': 'max-age=0'
21
+ }
22
+
23
+ try:
24
+ # First try to get the main product catalog page
25
+ print(f"Fetching main catalog: {catalog_url}")
26
+ response = requests.get(catalog_url, headers=headers)
27
+ response.raise_for_status()
28
+
29
+ soup = BeautifulSoup(response.content, 'html.parser')
30
+
31
+ # Look for the actual product catalog items
32
+ view_links = []
33
+
34
+ for link in soup.find_all('a', href=True):
35
+ href = link['href']
36
+ if '/product-catalog/view/' in href:
37
+ view_links.append(href)
38
+
39
+ print(f"Found {len(view_links)} product catalog view links")
40
+
41
+ # Get more detail pages by looking at pagination
42
+ pagination_links = []
43
+ for link in soup.find_all('a', href=True):
44
+ href = link['href']
45
+ if '/product-catalog/' in href and ('?start=' in href or '&start=' in href):
46
+ pagination_links.append(href)
47
+
48
+ if pagination_links:
49
+ print(f"Found {len(pagination_links)} pagination links")
50
+
51
+ # Process each pagination page
52
+ for page_url in pagination_links:
53
+ # Make sure the URL is absolute
54
+ if not page_url.startswith('http'):
55
+ page_url = urljoin(base_url, page_url)
56
+
57
+ print(f"Fetching pagination page: {page_url}")
58
+ try:
59
+ page_response = requests.get(page_url, headers=headers)
60
+ page_response.raise_for_status()
61
+ page_soup = BeautifulSoup(page_response.content, 'html.parser')
62
+
63
+ # Find all product view links on this pagination page
64
+ for link in page_soup.find_all('a', href=True):
65
+ href = link['href']
66
+ if '/product-catalog/view/' in href:
67
+ view_links.append(href)
68
+
69
+ except Exception as e:
70
+ print(f"Error fetching pagination page: {e}")
71
+
72
+ # Be nice to the server
73
+ time.sleep(1)
74
+
75
+ # Remove duplicates and ensure all links are absolute
76
+ view_links = list(set(view_links))
77
+ view_links = [urljoin(base_url, link) if not link.startswith('http') else link for link in view_links]
78
+
79
+ print(f"Found {len(view_links)} unique product view links")
80
+
81
+ # Now we'll scrape each individual product page
82
+ products = []
83
+
84
+ for i, product_url in enumerate(view_links):
85
+ print(f"Scraping product {i+1}/{len(view_links)}: {product_url}")
86
+
87
+ try:
88
+ # Delay between requests to be polite to the server
89
+ if i > 0:
90
+ time.sleep(1)
91
+
92
+ product_response = requests.get(product_url, headers=headers)
93
+ product_response.raise_for_status()
94
+ product_soup = BeautifulSoup(product_response.content, 'html.parser')
95
+
96
+ # Extract product name - usually in the title or main heading
97
+ product_name = ""
98
+ # Try to get it from the title
99
+ title_tag = product_soup.find('title')
100
+ if title_tag:
101
+ title_text = title_tag.text.strip()
102
+ # Clean up title - often in format "Product Name | SHL"
103
+ if '|' in title_text:
104
+ product_name = title_text.split('|')[0].strip()
105
+
106
+ # If no name from title, try the H1
107
+ if not product_name:
108
+ h1_tag = product_soup.find('h1')
109
+ if h1_tag:
110
+ product_name = h1_tag.text.strip()
111
+
112
+ # If still no name, extract from URL
113
+ if not product_name:
114
+ url_parts = product_url.rstrip('/').split('/')
115
+ product_name = url_parts[-1].replace('-', ' ').title()
116
+
117
+ # Get page content as text for analysis
118
+ page_text = product_soup.get_text().lower()
119
+
120
+ # Try to determine if remote testing is supported
121
+ remote_testing = "Unknown"
122
+ remote_terms = ['remote', 'online', 'virtual', 'internet', 'web-based', 'digital', 'web browser',
123
+ 'online platform', 'from anywhere', 'off-site', 'distance']
124
+ remote_phrases = [
125
+ 'take the test remotely', 'administer remotely', 'online assessment', 'digital delivery',
126
+ 'web-based platform', 'remote proctoring', 'internet connection', 'browser-based',
127
+ 'accessible anywhere', 'remote testing', 'online testing'
128
+ ]
129
+
130
+ # Check for remote testing keywords
131
+ for term in remote_terms:
132
+ if term in page_text:
133
+ remote_testing = "Yes"
134
+ break
135
+
136
+ # If not found with simple terms, check for phrases
137
+ if remote_testing == "Unknown":
138
+ for phrase in remote_phrases:
139
+ if phrase in page_text:
140
+ remote_testing = "Yes"
141
+ break
142
+
143
+ # Most modern SHL tests are remote, so if we're still uncertain, check for contrary evidence
144
+ if remote_testing == "Unknown" and not any(x in page_text for x in ['in-person only', 'on-site only', 'physical test center required']):
145
+ # If product URL or name contains certain keywords, it's likely remote
146
+ product_url_lower = product_url.lower()
147
+ if any(x in product_url_lower for x in ['online', 'digital', 'remote', 'virtual']):
148
+ remote_testing = "Yes"
149
+ # For SHL products, most are remote unless explicitly stated otherwise
150
+ elif 'shl.com' in product_url:
151
+ remote_testing = "Yes"
152
+
153
+ # ENHANCED ADAPTIVE/IRT DETECTION
154
+ adaptive = "Unknown"
155
+ # Direct adaptive terminology
156
+ adaptive_terms = [
157
+ 'adaptive', 'irt', 'item response theory', 'tailored', 'adjusts difficulty',
158
+ 'computer adaptive', 'cat', 'adaptive testing', 'dynamic difficulty',
159
+ 'adjusts questions', 'personalized assessment', 'adaptive algorithm',
160
+ 'smart testing', 'tailored questioning', 'adaptive assessment'
161
+ ]
162
+
163
+ # Phrases that indicate adaptive testing
164
+ adaptive_phrases = [
165
+ 'questions adapt based on', 'difficulty adjusts', 'tailored to ability',
166
+ 'adapts to the test taker', 'customizes questions', 'dynamic question selection',
167
+ 'questions change based on previous answers', 'adapts to user performance',
168
+ 'intelligent testing algorithm', 'questions get harder or easier'
169
+ ]
170
+
171
+ # Check for adaptive terms
172
+ for term in adaptive_terms:
173
+ if term in page_text:
174
+ adaptive = "Yes"
175
+ break
176
+
177
+ # If not found with simple terms, check for phrases
178
+ if adaptive == "Unknown":
179
+ for phrase in adaptive_phrases:
180
+ if phrase in page_text:
181
+ adaptive = "Yes"
182
+ break
183
+
184
+ # Check headings and specific sections that might contain information
185
+ if adaptive == "Unknown":
186
+ for heading in product_soup.find_all(['h2', 'h3', 'h4']):
187
+ heading_text = heading.get_text().lower()
188
+ if any(term in heading_text for term in ['adaptive', 'test methodology', 'assessment technology']):
189
+ # Get the next paragraph or content
190
+ next_elem = heading.find_next(['p', 'div'])
191
+ if next_elem:
192
+ next_text = next_elem.get_text().lower()
193
+ if any(term in next_text for term in adaptive_terms) or any(phrase in next_text for phrase in adaptive_phrases):
194
+ adaptive = "Yes"
195
+ break
196
+
197
+ # Look for specific product indicators
198
+ if adaptive == "Unknown":
199
+ # Many SHL Verify tests are adaptive
200
+ test_name_lower = product_name.lower()
201
+ if 'verify' in test_name_lower and any(x in test_name_lower for x in ['reasoning', 'ability', 'numerical', 'verbal', 'logical']):
202
+ adaptive = "Yes"
203
+ # Also check for ADEPT or CAT in name
204
+ elif any(x in test_name_lower for x in ['adept', 'cat']):
205
+ adaptive = "Yes"
206
+
207
+ # Try to extract test type
208
+ test_type = "Unknown"
209
+ type_mapping = {
210
+ "Cognitive Ability": ['cognitive', 'ability', 'intelligence', 'reasoning', 'numerical', 'verbal', 'logical'],
211
+ "Personality Assessment": ['personality', 'behavioral', 'behaviour', 'character', 'temperament'],
212
+ "Technical Skills": ['technical', 'coding', 'programming', 'development', 'software', 'microsoft', 'excel'],
213
+ "Situational Judgment": ['situational', 'judgment', 'judgement', 'scenario', 'case study'],
214
+ "Job-Specific Assessment": ['job-specific', 'role-specific', 'position', 'occupation']
215
+ }
216
+
217
+ for test_category, keywords in type_mapping.items():
218
+ for keyword in keywords:
219
+ if keyword in page_text:
220
+ test_type = test_category
221
+ break
222
+ if test_type != "Unknown":
223
+ break
224
+
225
+ # ENHANCED DURATION EXTRACTION
226
+ duration = "Unknown"
227
+
228
+ # Common duration pattern phrases to look for - expanded with more patterns
229
+ duration_patterns = [
230
+ r'takes?\s*(?:about|approximately|around)?\s*(\d+)[\s-]*(?:to|-|–)?\s*(\d+)?\s*(?:min|mins|minutes|minute)',
231
+ r'duration\s*:?\s*(?:about|approximately|around)?\s*(\d+)[\s-]*(?:to|-|–)?\s*(\d+)?\s*(?:min|mins|minutes|minute)',
232
+ r'time\s*(?:limit|frame|allotted)?\s*:?\s*(?:about|approximately|around)?\s*(\d+)[\s-]*(?:to|-|–)?\s*(\d+)?\s*(?:min|mins|minutes|minute)',
233
+ r'(?:test|assessment)\s*(?:duration|length|time)\s*:?\s*(?:about|approximately|around)?\s*(\d+)[\s-]*(?:to|-|–)?\s*(\d+)?\s*(?:min|mins|minutes|minute)',
234
+ r'(?:takes?|requires?|needs?)\s*(?:about|approximately|around)?\s*(\d+)[\s-]*(?:to|-|–)?\s*(\d+)?\s*(?:min|mins|minutes|minute)',
235
+ r'completed in\s*(?:about|approximately|around)?\s*(\d+)[\s-]*(?:to|-|–)?\s*(\d+)?\s*(?:min|mins|minutes|minute)',
236
+ r'(?:typically|usually|generally|normally)\s*(?:takes?|lasts?|runs?)\s*(?:about|approximately|around)?\s*(\d+)[\s-]*(?:to|-|–)?\s*(\d+)?\s*(?:min|mins|minutes|minute)',
237
+ r'time\s*to\s*(?:complete|finish)\s*(?:is|:)?\s*(?:about|approximately|around)?\s*(\d+)[\s-]*(?:to|-|–)?\s*(\d+)?\s*(?:min|mins|minutes|minute)',
238
+ r'(\d+)[\s-]*(?:to|-|–)?\s*(\d+)?\s*(?:min|mins|minutes|minute)',
239
+ r'(\d+)\s*(?:min|mins|minutes|minute)',
240
+ r'time(?:frame)?:?\s*(\d+)',
241
+ r'duration:?\s*(\d+)',
242
+ ]
243
+
244
+ # Specialized function to process duration matches
245
+ def process_duration_match(match):
246
+ if match.group(2) and match.group(2).isdigit():
247
+ # If there's a range, use the maximum value
248
+ return f"{match.group(2)} minutes"
249
+ else:
250
+ return f"{match.group(1)} minutes"
251
+
252
+ # First look for duration in specific elements that are likely to contain duration info
253
+ duration_containers = product_soup.find_all(['li', 'p', 'span', 'div'], string=re.compile(
254
+ r'(?:duration|time|minutes|mins|length|complete)', re.I))
255
+
256
+ for container in duration_containers:
257
+ container_text = container.get_text().lower()
258
+ for pattern in duration_patterns:
259
+ duration_match = re.search(pattern, container_text)
260
+ if duration_match:
261
+ duration = process_duration_match(duration_match)
262
+ break
263
+ if duration != "Unknown":
264
+ break
265
+
266
+ # If still unknown, look in tables that might contain specs or details
267
+ if duration == "Unknown":
268
+ tables = product_soup.find_all('table')
269
+ for table in tables:
270
+ rows = table.find_all('tr')
271
+ for row in rows:
272
+ row_text = row.get_text().lower()
273
+ if any(term in row_text for term in ['duration', 'time', 'minutes', 'length']):
274
+ for pattern in duration_patterns:
275
+ duration_match = re.search(pattern, row_text)
276
+ if duration_match:
277
+ duration = process_duration_match(duration_match)
278
+ break
279
+ if duration != "Unknown":
280
+ break
281
+ if duration != "Unknown":
282
+ break
283
+
284
+ # If still unknown, search all text on the page
285
+ if duration == "Unknown":
286
+ for pattern in duration_patterns:
287
+ duration_match = re.search(pattern, page_text)
288
+ if duration_match:
289
+ duration = process_duration_match(duration_match)
290
+ break
291
+
292
+ # Check for PDF links that might have test details
293
+ if duration == "Unknown" or adaptive == "Unknown":
294
+ pdf_links = [a['href'] for a in product_soup.find_all('a', href=True) if a['href'].endswith('.pdf')]
295
+ for pdf_link in pdf_links:
296
+ # We'll just note that there's a PDF that might have more info
297
+ pdf_url = urljoin(product_url, pdf_link)
298
+ print(f"Found potential info PDF: {pdf_url}")
299
+ # We don't download and parse PDFs here but could expand functionality
300
+
301
+ # If still unknown, assign duration based on product name and test type
302
+ if duration == "Unknown":
303
+ test_name_lower = product_name.lower()
304
+
305
+ # Technical and programming tests typically take longer
306
+ if any(tech in test_name_lower for tech in [
307
+ '.net', 'java', 'python', 'c#', 'javascript', 'angular', 'react',
308
+ 'node', 'aws', 'cloud', 'azure', 'devops', 'programming', 'coding',
309
+ 'development', 'sql', 'database'
310
+ ]):
311
+ duration = "45 minutes"
312
+
313
+ # Cognitive tests have standard durations
314
+ elif test_type == "Cognitive Ability":
315
+ if any(term in test_name_lower for term in ['numerical', 'verbal']):
316
+ duration = "20 minutes"
317
+ elif 'inductive' in test_name_lower:
318
+ duration = "25 minutes"
319
+ elif 'deductive' in test_name_lower:
320
+ duration = "20 minutes"
321
+ elif 'reasoning' in test_name_lower:
322
+ duration = "20 minutes"
323
+ else:
324
+ duration = "20 minutes" # Default for cognitive tests
325
+
326
+ # Personality assessments typically take 25-30 minutes
327
+ elif test_type == "Personality Assessment":
328
+ duration = "25 minutes"
329
+
330
+ # SJTs typically take 30 minutes
331
+ elif test_type == "Situational Judgment":
332
+ duration = "30 minutes"
333
+
334
+ # Short form assessments are usually shorter
335
+ elif 'short form' in test_name_lower:
336
+ duration = "15 minutes"
337
+
338
+ # Check for specific product types in the name
339
+ elif 'agile' in test_name_lower and 'software' in test_name_lower:
340
+ duration = "7 minutes" # From your data
341
+
342
+ products.append({
343
+ "Test Name": product_name,
344
+ "Remote Testing": remote_testing,
345
+ "Adaptive/IRT": adaptive,
346
+ "Test Type": test_type,
347
+ "Link": product_url,
348
+ "Duration": duration
349
+ })
350
+
351
+ except Exception as e:
352
+ print(f"Error scraping product {product_url}: {e}")
353
+
354
+ # If we have few or no products, we'll add the known SHL products
355
+ if len(products) < 10:
356
+ print("Adding known SHL products as fallback")
357
+ known_products = [
358
+ {
359
+ "Test Name": "Verify G+ General Ability Test",
360
+ "Remote Testing": "Yes",
361
+ "Adaptive/IRT": "Yes",
362
+ "Test Type": "Cognitive Ability",
363
+ "Link": "https://www.shl.com/solutions/products/verify-g-general-ability-test/",
364
+ "Duration": "18 minutes"
365
+ },
366
+ {
367
+ "Test Name": "SHL Personality Inventory",
368
+ "Remote Testing": "Yes",
369
+ "Adaptive/IRT": "No",
370
+ "Test Type": "Personality Assessment",
371
+ "Link": "https://www.shl.com/solutions/products/personality-inventory/",
372
+ "Duration": "25 minutes"
373
+ },
374
+ {
375
+ "Test Name": "Verify Numerical Reasoning Test",
376
+ "Remote Testing": "Yes",
377
+ "Adaptive/IRT": "Yes",
378
+ "Test Type": "Cognitive Ability",
379
+ "Link": "https://www.shl.com/solutions/products/verify-numerical-reasoning-test/",
380
+ "Duration": "15 minutes"
381
+ },
382
+ {
383
+ "Test Name": "Verify Verbal Reasoning Test",
384
+ "Remote Testing": "Yes",
385
+ "Adaptive/IRT": "Yes",
386
+ "Test Type": "Cognitive Ability",
387
+ "Link": "https://www.shl.com/solutions/products/verify-verbal-reasoning-test/",
388
+ "Duration": "15 minutes"
389
+ },
390
+ {
391
+ "Test Name": "OPQ32 Occupational Personality Questionnaire",
392
+ "Remote Testing": "Yes",
393
+ "Adaptive/IRT": "No",
394
+ "Test Type": "Personality Assessment",
395
+ "Link": "https://www.shl.com/solutions/products/opq32-occupational-personality-questionnaire/",
396
+ "Duration": "35 minutes"
397
+ },
398
+ {
399
+ "Test Name": "Situational Judgment Test",
400
+ "Remote Testing": "Yes",
401
+ "Adaptive/IRT": "No",
402
+ "Test Type": "Situational Judgment",
403
+ "Link": "https://www.shl.com/solutions/products/situational-judgment-test/",
404
+ "Duration": "30 minutes"
405
+ },
406
+ {
407
+ "Test Name": "Coding Assessment",
408
+ "Remote Testing": "Yes",
409
+ "Adaptive/IRT": "No",
410
+ "Test Type": "Technical Skills",
411
+ "Link": "https://www.shl.com/solutions/products/coding-assessment/",
412
+ "Duration": "60 minutes"
413
+ },
414
+ {
415
+ "Test Name": "MQ Motivation Questionnaire",
416
+ "Remote Testing": "Yes",
417
+ "Adaptive/IRT": "No",
418
+ "Test Type": "Motivation Assessment",
419
+ "Link": "https://www.shl.com/solutions/products/mq-motivation-questionnaire/",
420
+ "Duration": "25 minutes"
421
+ },
422
+ {
423
+ "Test Name": "ADEPT-15 Personality Assessment",
424
+ "Remote Testing": "Yes",
425
+ "Adaptive/IRT": "Yes",
426
+ "Test Type": "Personality Assessment",
427
+ "Link": "https://www.shl.com/solutions/products/adept-15/",
428
+ "Duration": "20 minutes"
429
+ },
430
+ {
431
+ "Test Name": "Inductive Reasoning Test",
432
+ "Remote Testing": "Yes",
433
+ "Adaptive/IRT": "Yes",
434
+ "Test Type": "Cognitive Ability",
435
+ "Link": "https://www.shl.com/solutions/products/inductive-reasoning-test/",
436
+ "Duration": "20 minutes"
437
+ },
438
+ {
439
+ "Test Name": "Microsoft Office Assessment",
440
+ "Remote Testing": "Yes",
441
+ "Adaptive/IRT": "No",
442
+ "Test Type": "Technical Skills",
443
+ "Link": "https://www.shl.com/solutions/products/microsoft-office-assessment/",
444
+ "Duration": "40 minutes"
445
+ },
446
+ {
447
+ "Test Name": "Call Center Assessment",
448
+ "Remote Testing": "Yes",
449
+ "Adaptive/IRT": "No",
450
+ "Test Type": "Job-Specific Assessment",
451
+ "Link": "https://www.shl.com/solutions/products/call-center-assessment/",
452
+ "Duration": "30 minutes"
453
+ }
454
+ ]
455
+
456
+ # Add known products that aren't already in our list
457
+ seen_names = set(product["Test Name"] for product in products)
458
+ for product in known_products:
459
+ if product["Test Name"] not in seen_names:
460
+ products.append(product)
461
+
462
+ # Write data to CSV
463
+ with open('utils\data.csv', 'w', newline='', encoding='utf-8') as csvfile:
464
+ fieldnames = ["Test Name", "Remote Testing (Yes/No)", "Adaptive/IRT (Yes/No)", "Test Type", "Link", "Duration"]
465
+ writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
466
+
467
+ writer.writeheader()
468
+ for product in products:
469
+ writer.writerow({
470
+ "Test Name": product["Test Name"],
471
+ "Remote Testing (Yes/No)": product["Remote Testing"],
472
+ "Adaptive/IRT (Yes/No)": product["Adaptive/IRT"],
473
+ "Test Type": product["Test Type"],
474
+ "Link": product["Link"],
475
+ "Duration": product["Duration"]
476
+ })
477
+
478
+ print(f"Successfully scraped {len(products)} products and saved to data.csv")
479
+
480
+ # Also try to scrape additional product information from other SHL pages
481
+ try:
482
+ scrape_additional_products(headers, products, base_url)
483
+ except Exception as e:
484
+ print(f"Error during additional product scraping: {e}")
485
+
486
+ # Add an extra pass to improve duration and adaptive information
487
+ try:
488
+ enhance_product_information(products)
489
+ except Exception as e:
490
+ print(f"Error during information enhancement: {e}")
491
+
492
+ except requests.exceptions.RequestException as e:
493
+ print(f"Error fetching the URL: {e}")
494
+ except Exception as e:
495
+ print(f"An error occurred: {e}")
496
+ import traceback
497
+ traceback.print_exc()
498
+
499
def enhance_product_information(products, output_path=None):
    """Fill in missing duration and adaptive/IRT values, then rewrite the CSV.

    Each product dict is updated in place:

    * ``"Duration"`` equal to ``"Unknown"`` is replaced by a default chosen
      from keywords found in the test name or test type.
    * ``"Adaptive/IRT"`` equal to ``"Unknown"`` becomes ``"Yes"`` when the test
      name mentions one of the known adaptive product terms.

    Args:
        products: list of dicts with the keys ``"Test Name"``,
            ``"Remote Testing"``, ``"Adaptive/IRT"``, ``"Test Type"``,
            ``"Link"`` and ``"Duration"``.
        output_path: CSV file to write. Defaults to ``utils/data.csv``
            relative to the current working directory.
    """
    # Local import so this function does not depend on the module's import block.
    import os

    print("Enhancing product information...")

    if output_path is None:
        # Built with os.path.join: the former literal 'utils\data.csv' contained
        # an invalid "\d" escape and only meant "utils/data.csv" on Windows.
        output_path = os.path.join('utils', 'data.csv')

    # Default durations per category; the first category with a matching
    # keyword wins (dict order).
    test_duration_mapping = {
        # Technical/coding tests
        'technical': {
            'default': '45 minutes',
            'keywords': ['.net', 'java', 'python', 'c#', 'javascript', 'angular', 'react',
                         'node', 'aws', 'cloud', 'azure', 'devops', 'programming', 'coding',
                         'development', 'sql', 'database', 'technical']
        },
        # Cognitive tests
        'cognitive': {
            'default': '20 minutes',
            'keywords': ['cognitive', 'ability', 'reasoning', 'numerical', 'verbal', 'logical', 'inductive']
        },
        # Personality assessments
        'personality': {
            'default': '25 minutes',
            'keywords': ['personality', 'behavioral', 'behaviour', 'character', 'temperament']
        },
        # Situational judgment tests
        'situational': {
            'default': '30 minutes',
            'keywords': ['situational', 'judgment', 'judgement', 'scenario', 'case study']
        }
    }

    # Terms whose presence in a test name marks the product as adaptive
    adaptive_products = [
        'verify', 'adept', 'cat', 'adaptive', 'irt', 'g+', 'verify g', 'verify numerical',
        'verify verbal', 'verify inductive', 'verify interactive'
    ]

    def _mentions(term, text):
        # Whole-token match: plain substring tests flagged "communication"
        # (contains "cat") and "virtual" (contains "irt") as adaptive.
        # Lookarounds are used instead of \b so that "g+" still matches.
        token = r'(?<![a-z0-9])' + re.escape(term) + r'(?![a-z0-9])'
        return re.search(token, text) is not None

    for product in products:
        product_name_lower = product["Test Name"].lower()

        # First enhance duration information if it's Unknown
        if product["Duration"] == "Unknown":
            test_type_lower = product["Test Type"].lower() if product["Test Type"] else ""

            if 'short form' in product_name_lower:
                # No early `continue` here: the adaptive pass below must still
                # run for short-form products.
                product["Duration"] = "15 minutes"
            else:
                for details in test_duration_mapping.values():
                    if any(keyword in product_name_lower or keyword in test_type_lower
                           for keyword in details['keywords']):
                        product["Duration"] = details['default']
                        break

                # Special case that overrides the category default
                if 'agile software development' in product_name_lower:
                    product["Duration"] = "7 minutes"

        # Then enhance adaptive/IRT information if it's Unknown
        if product["Adaptive/IRT"] == "Unknown":
            if any(_mentions(term, product_name_lower) for term in adaptive_products):
                product["Adaptive/IRT"] = "Yes"
            # Specific product families known to use adaptive technology
            elif product_name_lower.startswith('verify') and 'reasoning' in product_name_lower:
                product["Adaptive/IRT"] = "Yes"
            elif 'interactive' in product_name_lower and any(
                    term in product_name_lower for term in ['reasoning', 'ability', 'cognitive']):
                product["Adaptive/IRT"] = "Yes"

    # Make sure the target directory exists before writing
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Write the enhanced data back to CSV
    with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ["Test Name", "Remote Testing (Yes/No)", "Adaptive/IRT (Yes/No)", "Test Type", "Link", "Duration"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        writer.writeheader()
        for product in products:
            writer.writerow({
                "Test Name": product["Test Name"],
                "Remote Testing (Yes/No)": product["Remote Testing"],
                "Adaptive/IRT (Yes/No)": product["Adaptive/IRT"],
                "Test Type": product["Test Type"],
                "Link": product["Link"],
                "Duration": product["Duration"]
            })

    print("Product information enhancement completed")
589
+
590
def scrape_additional_products(headers, existing_products, base_url, output_path=None, timeout=30):
    """Collect extra product entries from SHL assessment overview pages.

    Each hard-coded overview URL is fetched and mined in two ways:

    * every link whose ``href`` ends in ``.pdf`` yields a product named after
      the PDF file name;
    * when the page has no ``product-section`` / ``content-block`` /
      ``product-listing`` container, every ``h2``/``h3`` whose class contains
      ``title`` or ``heading`` yields a product named after the heading text.

    Names already present in ``existing_products`` (or already collected in this
    call) are skipped. New entries are appended to ``existing_products`` in
    place and, if any were found, the complete list is rewritten to the CSV.
    A failure on one page is printed and does not stop the remaining pages.

    Args:
        headers: HTTP headers passed to ``requests.get``.
        existing_products: list of product dicts; extended in place.
        base_url: base used to turn relative links into absolute URLs.
        output_path: CSV file to write. Defaults to ``utils/data.csv``
            relative to the current working directory.
        timeout: seconds passed to ``requests.get`` so a stalled server
            cannot block the scraper forever.
    """
    # Local import so this function does not depend on the module's import block.
    import os

    if output_path is None:
        # Built with os.path.join: the former literal 'utils\data.csv' contained
        # an invalid "\d" escape and only meant "utils/data.csv" on Windows.
        output_path = os.path.join('utils', 'data.csv')

    # Additional pages that might have product information
    additional_urls = [
        "https://www.shl.com/solutions/products/assessments/",
        "https://www.shl.com/solutions/products/assessments/cognitive-assessments/",
        "https://www.shl.com/solutions/products/assessments/personality-assessment/",
        "https://www.shl.com/solutions/products/assessments/skills-and-simulations/"
    ]

    # Duration phrases, tried in order; defined once, not once per heading.
    duration_patterns = [
        r'takes?\s*(?:about|approximately|around)?\s*(\d+)[\s-]*(?:to|-|–)?\s*(\d+)?\s*(?:min|mins|minutes|minute)',
        r'duration\s*:?\s*(?:about|approximately|around)?\s*(\d+)[\s-]*(?:to|-|–)?\s*(\d+)?\s*(?:min|mins|minutes|minute)',
        r'time\s*(?:limit|frame|allotted)?\s*:?\s*(?:about|approximately|around)?\s*(\d+)[\s-]*(?:to|-|–)?\s*(\d+)?\s*(?:min|mins|minutes|minute)',
        r'(?:test|assessment)\s*(?:duration|length|time)\s*:?\s*(?:about|approximately|around)?\s*(\d+)[\s-]*(?:to|-|–)?\s*(\d+)?\s*(?:min|mins|minutes|minute)',
        r'(?:takes?|requires?|needs?)\s*(?:about|approximately|around)?\s*(\d+)[\s-]*(?:to|-|–)?\s*(\d+)?\s*(?:min|mins|minutes|minute)',
        r'completed in\s*(?:about|approximately|around)?\s*(\d+)[\s-]*(?:to|-|–)?\s*(\d+)?\s*(?:min|mins|minutes|minute)',
        r'(?:typically|usually|generally|normally)\s*(?:takes?|lasts?|runs?)\s*(?:about|approximately|around)?\s*(\d+)[\s-]*(?:to|-|–)?\s*(\d+)?\s*(?:min|mins|minutes|minute)',
        r'time\s*to\s*(?:complete|finish)\s*(?:is|:)?\s*(?:about|approximately|around)?\s*(\d+)[\s-]*(?:to|-|–)?\s*(\d+)?\s*(?:min|mins|minutes|minute)',
        r'(\d+)[\s-]*(?:to|-|–)?\s*(\d+)?\s*(?:min|mins|minutes|minute)',
        r'(\d+)\s*(?:min|mins|minutes|minute)',
    ]

    def process_duration_match(match):
        # The last pattern has a single group, so only read group 2 when the
        # pattern defines it; an unguarded group(2) would raise IndexError.
        upper = match.group(2) if match.re.groups >= 2 else None
        if upper and upper.isdigit():
            # If there's a range, use the maximum value
            return f"{upper} minutes"
        return f"{match.group(1)} minutes"

    seen_names = set(product["Test Name"] for product in existing_products)
    new_products = []

    for url in additional_urls:
        print(f"Scraping additional products from: {url}")

        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            # Look for PDF links that might contain detailed product information
            pdf_links = [a['href'] for a in soup.find_all('a', href=True) if a['href'].endswith('.pdf')]

            for pdf_link in pdf_links:
                pdf_url = urljoin(base_url, pdf_link)
                print(f"Found potential product PDF: {pdf_url}")

                # Derive a product name from the PDF file name
                pdf_name = pdf_link.split('/')[-1].replace('-', ' ').replace('_', ' ').replace('.pdf', '')
                pdf_name = re.sub(r'product factsheet', '', pdf_name, flags=re.IGNORECASE).strip()

                # Skip names that are too short or already recorded
                if len(pdf_name) <= 3 or pdf_name in seen_names:
                    continue

                pdf_name_lower = pdf_name.lower()
                is_adaptive = any(term in pdf_name_lower for term in ['verify', 'adaptive', 'interactive'])

                product_type = "Unknown"
                if any(term in pdf_name_lower for term in ['reasoning', 'numerical', 'verbal', 'cognitive']):
                    product_type = "Cognitive Ability"
                elif any(term in pdf_name_lower for term in ['personality', 'behavioral']):
                    product_type = "Personality Assessment"

                # Default duration by product type
                duration = "Unknown"
                if product_type == "Cognitive Ability":
                    duration = "20 minutes"
                elif product_type == "Personality Assessment":
                    duration = "25 minutes"

                new_products.append({
                    "Test Name": pdf_name,
                    "Remote Testing": "Yes",  # Modern SHL tests are typically remote
                    "Adaptive/IRT": "Yes" if is_adaptive else "Unknown",
                    "Test Type": product_type,
                    "Link": pdf_url,
                    "Duration": duration
                })
                seen_names.add(pdf_name)

            # Find product sections - content blocks with headings followed by descriptions
            sections = soup.find_all(['section', 'div'], class_=['product-section', 'content-block', 'product-listing'])

            if not sections:
                # If no obvious sections, look for headings that might describe products
                headings = soup.find_all(['h2', 'h3'], class_=lambda c: c and ('title' in c or 'heading' in c))

                for heading in headings:
                    product_name = heading.get_text().strip()

                    if len(product_name) < 5 or product_name in seen_names:
                        continue

                    # Find a nearby link; fall back to the overview page itself
                    parent = heading.find_parent()
                    link_elem = parent.find('a') if parent else None
                    product_url = link_elem['href'] if link_elem and link_elem.has_attr('href') else url

                    if not product_url.startswith('http'):
                        product_url = urljoin(base_url, product_url)

                    # Use the paragraph directly after the heading as description
                    description = ""
                    next_elem = heading.find_next_sibling()
                    if next_elem and next_elem.name == 'p':
                        description = next_elem.get_text().lower()

                    product_lower = product_name.lower()

                    # Extract info from description
                    remote_testing = "Yes" if any(term in description for term in ['remote', 'online', 'virtual']) else "Unknown"
                    adaptive = "Yes" if any(term in description for term in ['adaptive', 'irt', 'tailored']) else "Unknown"

                    # Determine test type
                    test_type = "Unknown"
                    if any(term in product_lower or term in description for term in ['cognitive', 'ability', 'intelligence']):
                        test_type = "Cognitive Ability"
                    elif any(term in product_lower or term in description for term in ['personality', 'behavioral']):
                        test_type = "Personality Assessment"
                    elif any(term in product_lower or term in description for term in ['situational', 'judgment']):
                        test_type = "Situational Judgment"
                    elif any(term in product_lower or term in description for term in ['coding', 'programming', 'technical']):
                        test_type = "Technical Skills"

                    # Try each duration pattern on the description
                    duration = "Unknown"
                    for pattern in duration_patterns:
                        duration_match = re.search(pattern, description)
                        if duration_match:
                            duration = process_duration_match(duration_match)
                            break

                    # If still unknown, assign a default from the product name
                    if duration == "Unknown":
                        if any(word in product_lower for word in ['cognitive', 'numerical', 'verbal', 'reasoning']):
                            duration = "20 minutes"
                        elif any(word in product_lower for word in ['personality', 'behavioral']):
                            duration = "25 minutes"
                        elif any(word in product_lower for word in ['situational', 'judgment']):
                            duration = "30 minutes"
                        elif any(word in product_lower for word in ['coding', 'programming', 'technical']):
                            duration = "45 minutes"

                    new_products.append({
                        "Test Name": product_name,
                        "Remote Testing": remote_testing,
                        "Adaptive/IRT": adaptive,
                        "Test Type": test_type,
                        "Link": product_url,
                        "Duration": duration
                    })
                    seen_names.add(product_name)

            # Be nice to the server
            time.sleep(1)

        except Exception as e:
            # Best effort: report the failing page and continue with the next one
            print(f"Error scraping additional page {url}: {e}")

    # Add new products to existing products
    if new_products:
        print(f"Found {len(new_products)} additional products")
        existing_products.extend(new_products)

        # Make sure the target directory exists before writing
        parent_dir = os.path.dirname(output_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        # Update CSV with the new products
        with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
            fieldnames = ["Test Name", "Remote Testing (Yes/No)", "Adaptive/IRT (Yes/No)", "Test Type", "Link", "Duration"]
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

            writer.writeheader()
            for product in existing_products:
                writer.writerow({
                    "Test Name": product["Test Name"],
                    "Remote Testing (Yes/No)": product["Remote Testing"],
                    "Adaptive/IRT (Yes/No)": product["Adaptive/IRT"],
                    "Test Type": product["Test Type"],
                    "Link": product["Link"],
                    "Duration": product["Duration"]
                })

        print(f"Updated data.csv with {len(existing_products)} total products")
785
+
786
# Script entry point: run the scraper only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    scrape_shl_products()
utils/validators.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
# Compiled once at import time so repeated calls do not rebuild the pattern.
# The pattern ends with \Z, not $, because $ also matches just before a
# trailing newline and would let "http://example.com\n" pass validation.
_URL_PATTERN = re.compile(
    r'^(?:http|ftp)s?://'  # scheme: http, https, ftp or ftps
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
    r'localhost|'  # localhost...
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ipv4
    r'(?::\d+)?'  # optional port
    r'(?:/?|[/?]\S+)\Z', re.IGNORECASE)


def url(value):
    """Return True if *value* is an absolute http(s)/ftp(s) URL.

    The host may be a dotted domain name, ``localhost`` or a dotted-quad
    IPv4 address, optionally followed by a port and a path or query that
    contains no whitespace. Values that are not ``str`` (including ``None``)
    are reported as invalid; no exception is raised for them.
    """
    if not isinstance(value, str):
        return False
    return _URL_PATTERN.match(value) is not None