import os
import pickle
import sys
from typing import List, Optional

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Absolute path two directory levels above this file; it is appended to the
# module search path and reused below to locate the "models" directory.
project_root = os.path.abspath(
    os.path.join(os.path.dirname(__file__), os.pardir, os.pardir)
)
sys.path.append(project_root)


class AttritionModel:
    """Random-forest attrition classifier with pickle-based persistence.

    The model and its fitted ``ColumnTransformer`` are stored as two pickle
    files under ``<project_root>/models``.  Constructing an instance tries to
    load both; ``train`` fits and saves them; ``predict`` scores a single
    record given as a ``{feature_name: value}`` mapping.
    """

    def __init__(self):
        """Set up paths and feature lists, then try to load saved artifacts.

        Creates the ``models`` directory if it does not exist.  A missing or
        unreadable artifact is reported on stdout and leaves both
        ``self.model`` and ``self.preprocessor`` as ``None``.
        """
        self.model = None
        self.preprocessor = None
        self.model_path = os.path.join(project_root, "models", "attrition_model.pkl")
        self.preprocessor_path = os.path.join(
            project_root, "models", "attrition_preprocessor.pkl"
        )

        os.makedirs(os.path.dirname(self.model_path), exist_ok=True)

        self.numeric_features = [
            'Age', 'DistanceFromHome', 'EnvironmentSatisfaction',
            'JobLevel', 'JobSatisfaction', 'MonthlyIncome',
            'TotalWorkingYears', 'WorkLifeBalance', 'YearsAtCompany'
        ]
        self.categorical_features = ['OverTime']

        self._load_artifacts()

    def _load_artifacts(self):
        """Load model and preprocessor together; keep neither if either fails."""
        try:
            # NOTE(security): pickle.load can execute arbitrary code.  Only
            # load artifacts that were written by this project's own train().
            with open(self.model_path, 'rb') as f:
                loaded_model = pickle.load(f)
            with open(self.preprocessor_path, 'rb') as f:
                loaded_preprocessor = pickle.load(f)
        except FileNotFoundError:
            print("No existing model found. Please train the model first.")
        except Exception as exc:
            # Corrupt or incompatible pickles raise many different exception
            # types; report the real cause instead of hiding it.
            print(f"Could not load existing model ({exc}). Please train the model first.")
        else:
            # Assign only after both loads succeed, so a model is never paired
            # with a missing preprocessor.
            self.model = loaded_model
            self.preprocessor = loaded_preprocessor

    @staticmethod
    def _make_one_hot_encoder():
        """Build a dense one-hot encoder on both old and new scikit-learn."""
        try:
            # scikit-learn >= 1.2 spelling; ``sparse`` was removed in 1.4.
            return OneHotEncoder(drop='first', sparse_output=False)
        except TypeError:
            # Older scikit-learn releases only accept the legacy keyword.
            return OneHotEncoder(drop='first', sparse=False)

    def preprocess_data(self, X):
        """Fit a new preprocessor on ``X`` and return the transformed data.

        Numeric features are standardised and categorical features are one-hot
        encoded (first level dropped); all other columns are discarded.

        Note: this REPLACES ``self.preprocessor`` and refits it on every call,
        so it is meant for training data only.
        """
        numeric_transformer = StandardScaler()
        categorical_transformer = self._make_one_hot_encoder()

        self.preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, self.numeric_features),
                ('cat', categorical_transformer, self.categorical_features)
            ],
            remainder='drop'
        )

        return self.preprocessor.fit_transform(X)

    def train(self, X, y):
        """Fit the preprocessor and the random forest, then pickle both.

        Args:
            X: Tabular data containing the numeric and categorical feature
                columns listed on this instance.
            y: Target labels passed to ``RandomForestClassifier.fit``.
        """
        X_processed = self.preprocess_data(X)

        self.model = RandomForestClassifier(
            n_estimators=100,
            max_depth=10,
            random_state=42
        )
        self.model.fit(X_processed, y)

        with open(self.model_path, 'wb') as f:
            pickle.dump(self.model, f)
        with open(self.preprocessor_path, 'wb') as f:
            pickle.dump(self.preprocessor, f)

    @staticmethod
    def _is_affirmative(value):
        """Return True for 'yes'/'true'/'1' strings or any truthy non-string."""
        if isinstance(value, str):
            return value.lower() in ('yes', 'true', '1')
        return bool(value)

    def _coerce_features(self, features):
        """Convert raw input values: OverTime to 0/1, everything else to float.

        Raises:
            ValueError: If a non-OverTime value cannot be converted to float.
        """
        processed = {}
        for key, value in features.items():
            if key == 'OverTime':
                processed[key] = 1 if self._is_affirmative(value) else 0
                continue
            try:
                processed[key] = float(value)
            except (ValueError, TypeError) as exc:
                raise ValueError(
                    f"Invalid value for feature {key}: {value}. Expected numeric value."
                ) from exc
        return processed

    def _build_input_frame(self, processed_features):
        """Build the one-row, correctly ordered and typed feature frame.

        Raises:
            ValueError: If a required feature is missing or a value is NaN.
        """
        required_columns = self.numeric_features + self.categorical_features

        for col in required_columns:
            if col not in processed_features:
                raise ValueError(f"Missing required feature: {col}")

        # .copy() so the column assignments below act on an independent frame.
        X = pd.DataFrame([processed_features])[required_columns].copy()

        print(f"Input data types before conversion: {X.dtypes}")

        for col in self.numeric_features:
            X[col] = pd.to_numeric(X[col], errors='coerce').astype(np.float64)
        for col in self.categorical_features:
            X[col] = X[col].astype(np.int64)

        print(f"Input data types after conversion: {X.dtypes}")
        print(f"Input data: {X.to_dict('records')}")

        # With a single row there is nothing to impute from (the column mean
        # of a NaN is NaN), so reject the record explicitly.
        nan_columns = [col for col in X.columns if X[col].isnull().any()]
        if nan_columns:
            raise ValueError(f"NaN values are not allowed for features: {nan_columns}")

        return X

    def _fitted_overtime_categories(self):
        """Return the OverTime levels the fitted encoder saw, or None if unknown."""
        try:
            encoder = self.preprocessor.named_transformers_['cat']
            position = self.categorical_features.index('OverTime')
            return list(encoder.categories_[position])
        except (AttributeError, KeyError, IndexError, TypeError, ValueError):
            return None

    def _encode_overtime(self, overtime_flag):
        """Map a boolean onto the label the fitted encoder was trained with.

        If the encoder was fitted on e.g. 'Yes'/'No', the matching string is
        returned; if it was fitted on 0/1 (or cannot be inspected) the integer
        form is returned.
        """
        for category in self._fitted_overtime_categories() or []:
            if self._is_affirmative(category) == overtime_flag:
                return category
        return int(overtime_flag)

    def _frame_for_preprocessor(self, X):
        """Copy ``X`` with OverTime expressed in the encoder's own vocabulary."""
        encoded = X.copy()
        if 'OverTime' in encoded.columns:
            overtime_flag = bool(encoded['OverTime'].iloc[0])
            encoded['OverTime'] = self._encode_overtime(overtime_flag)
        return encoded

    def _run_model(self, values):
        """Score one prepared row and return the prediction/probability dict."""
        prediction = bool(self.model.predict(values)[0])
        probability = float(self.model.predict_proba(values)[0][1])
        return {
            "prediction": prediction,
            "probability": probability
        }

    def _predict_without_preprocessor(self, X, preprocessing_error):
        """Best-effort fallback used when the preprocessor fails to transform.

        NOTE(review): the values passed here bypass the StandardScaler that
        train() applies, so the result is unreliable for a model fitted
        through train().  Kept for backward compatibility.
        """
        try:
            result = self._run_model(X.values)
        except Exception as direct_error:
            print(f"Direct prediction also failed: {str(direct_error)}")
            raise ValueError(
                f"Failed to process input data: {str(preprocessing_error)}"
            ) from direct_error
        print("Used direct prediction as fallback")
        return result

    def predict(self, features):
        """Make a prediction using the trained model.

        Args:
            features: Mapping of feature name to value.  ``OverTime`` accepts
                'yes'/'true'/'1' strings (case-insensitive) or any truthy
                value; every other value must be convertible to float.

        Returns:
            ``{"prediction": bool, "probability": float}`` where probability
            is the second column of ``predict_proba``.

        Raises:
            ValueError: For every failure (model not loaded, missing or
                invalid feature, NaN input, model error); the original
                exception is chained as ``__cause__``.
        """
        try:
            if self.model is None:
                raise ValueError("Model not loaded. Please ensure model file exists and is valid.")

            print(f"Input features: {features}")

            processed_features = self._coerce_features(features)
            print(f"Processed features: {processed_features}")

            X = self._build_input_frame(processed_features)

            if self.preprocessor is None:
                X_processed = X.values
                print("No preprocessor available, using raw values")
            else:
                try:
                    X_processed = self.preprocessor.transform(
                        self._frame_for_preprocessor(X)
                    )
                    print("Preprocessing successful")
                except Exception as preprocessing_error:
                    print(f"Error during preprocessing: {str(preprocessing_error)}")
                    return self._predict_without_preprocessor(X, preprocessing_error)

            result = self._run_model(X_processed)
            print(
                f"Prediction result: {result['prediction']}, "
                f"probability: {result['probability']}"
            )
            return result

        except Exception as e:
            import traceback
            traceback.print_exc()
            raise ValueError(f"Error during prediction: {str(e)}") from e

    def get_feature_importance(self) -> Optional[List[float]]:
        """Return the model's feature importances as floats, or None.

        None is returned when no model is loaded, when the model exposes no
        ``feature_importances_`` attribute, or when reading it fails.
        """
        try:
            if hasattr(self.model, 'feature_importances_'):
                return [float(x) for x in self.model.feature_importances_]
            return None
        except Exception as e:
            print(f"Error getting feature importance: {str(e)}")
            return None


def train_model():
    """Train and save the attrition prediction model.

    Reads ``data/HR-Employee-Attrition.csv`` from the directory two levels
    above this file, selects the feature columns declared on
    ``AttritionModel``, maps the ``Attrition`` column ('Yes' -> 1, 'No' -> 0)
    and calls ``AttritionModel.train``.  Progress is printed to stdout.  Any
    exception is printed together with its traceback and the process exits
    with status 1.
    """
    try:
        attrition_model = AttritionModel()

        # Locate the data file and the models directory relative to this file.
        here = os.path.dirname(os.path.abspath(__file__))
        root_dir = os.path.dirname(os.path.dirname(here))
        csv_path = os.path.join(root_dir, "data", "HR-Employee-Attrition.csv")
        models_dir = os.path.join(root_dir, 'models')

        print(f"Loading data from: {csv_path}")
        print(f"Model will be saved to: {models_dir}")

        if not os.path.exists(csv_path):
            raise FileNotFoundError(f"Data file not found at {csv_path}")

        os.makedirs(models_dir, exist_ok=True)

        print("Loading and preparing data...")
        frame = pd.read_csv(csv_path)

        feature_columns = (
            attrition_model.numeric_features + attrition_model.categorical_features
        )
        print(f"Using features: {feature_columns}")

        label_codes = {'Yes': 1, 'No': 0}
        inputs = frame[feature_columns]
        target = frame['Attrition'].map(label_codes)

        print("Training model...")
        attrition_model.train(inputs, target)
        print("Model trained and saved successfully")

    except Exception as e:
        print(f"Error during model training: {str(e)}")
        import traceback
        print(traceback.format_exc())
        sys.exit(1)


# Script entry point: train and persist the model when this file is run directly.
if __name__ == "__main__":
    train_model()