Spaces:
Sleeping
Sleeping
File size: 10,260 Bytes
3efedb0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 |
import os
import pickle
import sys
from typing import List, Optional

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# Resolve the directory two levels above this file and expose it on the
# import path; the rest of the module builds its data/model paths from it.
project_root = os.path.abspath(
    os.path.join(os.path.dirname(__file__), os.pardir, os.pardir)
)
sys.path.append(project_root)
class AttritionModel:
    """Employee-attrition classifier bundled with its preprocessing step.

    The instance holds two artifacts, both persisted with ``pickle`` under
    ``<project_root>/models``:

    * ``model`` - a ``RandomForestClassifier`` fitted by :meth:`train`.
    * ``preprocessor`` - a ``ColumnTransformer`` that standardizes the numeric
      features and one-hot encodes the categorical ones.

    Both are ``None`` until they are loaded from disk or :meth:`train` runs.
    """

    def __init__(self):
        """Set up paths and feature lists, then try to load saved artifacts."""
        self.model = None
        self.preprocessor = None
        self.model_path = os.path.join(project_root, "models", "attrition_model.pkl")
        self.preprocessor_path = os.path.join(project_root, "models", "attrition_preprocessor.pkl")

        # Create models directory if it doesn't exist
        os.makedirs(os.path.dirname(self.model_path), exist_ok=True)

        # Define the features we'll use
        self.numeric_features = [
            'Age', 'DistanceFromHome', 'EnvironmentSatisfaction',
            'JobLevel', 'JobSatisfaction', 'MonthlyIncome',
            'TotalWorkingYears', 'WorkLifeBalance', 'YearsAtCompany'
        ]
        self.categorical_features = ['OverTime']

        # Try to load existing model and preprocessor.
        # SECURITY: pickle.load can execute arbitrary code; these files must
        # only ever come from a trusted location (they are written by train()).
        try:
            with open(self.model_path, 'rb') as f:
                self.model = pickle.load(f)
            with open(self.preprocessor_path, 'rb') as f:
                self.preprocessor = pickle.load(f)
        except FileNotFoundError:
            print("No existing model found. Please train the model first.")
        except Exception as exc:
            # Corrupt or version-incompatible pickles are a different problem
            # from "nothing trained yet"; report them as such instead of
            # hiding the cause. Loading stays best-effort (no re-raise).
            print(
                f"Could not load existing model ({type(exc).__name__}: {exc}). "
                "Please train the model first."
            )

    def preprocess_data(self, X):
        """Build a new preprocessor, fit it on ``X`` and return the result.

        Note that this replaces ``self.preprocessor`` and refits it on every
        call, so it is meant for training data only; prediction uses the
        already-fitted ``self.preprocessor.transform``.

        Args:
            X: DataFrame containing at least the numeric and categorical
                feature columns; any other column is dropped.

        Returns:
            The transformed feature matrix.
        """
        numeric_transformer = StandardScaler()
        # scikit-learn 1.2 renamed ``sparse`` to ``sparse_output`` and later
        # removed the old keyword; support both spellings.
        try:
            categorical_transformer = OneHotEncoder(drop='first', sparse_output=False)
        except TypeError:
            categorical_transformer = OneHotEncoder(drop='first', sparse=False)

        self.preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, self.numeric_features),
                ('cat', categorical_transformer, self.categorical_features)
            ],
            remainder='drop'  # Drop any columns not specified in features
        )
        return self.preprocessor.fit_transform(X)

    def train(self, X, y):
        """Fit the preprocessor and the random forest, then pickle both.

        Args:
            X: Training DataFrame with the feature columns.
            y: Target labels aligned with ``X``.
        """
        X_processed = self.preprocess_data(X)

        self.model = RandomForestClassifier(
            n_estimators=100,
            max_depth=10,
            random_state=42
        )
        self.model.fit(X_processed, y)

        # Persist both artifacts so a fresh instance can load them in __init__.
        with open(self.model_path, 'wb') as f:
            pickle.dump(self.model, f)
        with open(self.preprocessor_path, 'wb') as f:
            pickle.dump(self.preprocessor, f)

    @staticmethod
    def _coerce_feature(key, value):
        """Convert one raw input value to the numeric form used internally.

        ``OverTime`` becomes a 0/1 flag ('yes', 'true' and '1' count as set,
        case-insensitively); every other feature must be convertible to float.
        """
        if key == 'OverTime':
            if isinstance(value, str):
                return 1 if value.lower() in ['yes', 'true', '1'] else 0
            return 1 if value else 0
        try:
            return float(value)
        except (ValueError, TypeError):
            raise ValueError(f"Invalid value for feature {key}: {value}. Expected numeric value.")

    def _match_encoder_categories(self, X):
        """Return ``X`` with OverTime expressed in the fitted encoder's labels.

        When the one-hot encoder was fitted on string labels (for example
        'Yes'/'No'), feeding it the internal 0/1 flag makes ``transform`` fail
        on an unknown category. If the fitted categories are strings, the flag
        is mapped back to the matching label; otherwise ``X`` is returned
        unchanged.
        """
        if 'OverTime' not in self.categorical_features:
            return X
        try:
            encoder = self.preprocessor.named_transformers_['cat']
            position = self.categorical_features.index('OverTime')
            categories = list(encoder.categories_[position])
        except (AttributeError, KeyError, IndexError, TypeError):
            # Not a fitted ColumnTransformer of the expected shape: leave the
            # data alone and let transform() decide.
            return X
        if not any(isinstance(category, str) for category in categories):
            return X

        truthy = ('yes', 'true', '1')

        def label_for(flag):
            # Prefer the exact spelling the encoder saw during fitting.
            for category in categories:
                if isinstance(category, str) and (category.strip().lower() in truthy) == bool(flag):
                    return category
            return 'Yes' if flag else 'No'

        encoded = X.copy()
        encoded['OverTime'] = [label_for(flag) for flag in X['OverTime']]
        return encoded

    def predict(self, features):
        """Make a prediction using the trained model.

        Args:
            features: Mapping of feature name to raw value. Every name in
                ``numeric_features`` and ``categorical_features`` must be
                present; additional keys are ignored.

        Returns:
            ``{"prediction": bool, "probability": float}`` where the
            probability is the second column of ``predict_proba``.

        Raises:
            ValueError: On any failure (model not loaded, missing or
                non-numeric feature, preprocessing/prediction error). The
                message is prefixed with "Error during prediction: ".
        """
        try:
            if self.model is None:
                raise ValueError("Model not loaded. Please ensure model file exists and is valid.")

            print(f"Input features: {features}")

            # Only the features the model uses are validated and converted,
            # so unrelated extra keys in the payload cannot break prediction.
            required_columns = self.numeric_features + self.categorical_features
            processed_features = {}
            for col in required_columns:
                if col not in features:
                    raise ValueError(f"Missing required feature: {col}")
                processed_features[col] = self._coerce_feature(col, features[col])

            print(f"Processed features: {processed_features}")

            # Single-row frame with columns in the order the preprocessor expects.
            X = pd.DataFrame([processed_features], columns=required_columns)

            print(f"Input data types before conversion: {X.dtypes}")

            for col in self.numeric_features:
                X[col] = pd.to_numeric(X[col], errors='coerce').astype(np.float64)
            for col in self.categorical_features:
                X[col] = X[col].astype(np.int64)

            print(f"Input data types after conversion: {X.dtypes}")
            print(f"Input data: {X.to_dict('records')}")

            if X.isnull().any().any():
                print(f"Warning: NaN values detected in input: {X.isnull().sum()}")
                # NOTE(review): with a single-row frame the column mean of a
                # NaN column is itself NaN, so this cannot actually repair the
                # value; kept for compatibility.
                X = X.fillna(X.mean())

            if self.preprocessor is not None:
                try:
                    X_processed = self.preprocessor.transform(self._match_encoder_categories(X))
                    print("Preprocessing successful")
                except Exception as e:
                    print(f"Error during preprocessing: {str(e)}")
                    # Best-effort fallback: feed the raw numeric values to the
                    # model. NOTE(review): this bypasses the fitted scaling, so
                    # results are only meaningful for a model trained on raw
                    # values.
                    try:
                        X_direct = X.copy()
                        X_direct['OverTime_Yes'] = X_direct['OverTime']
                        X_direct = X_direct.drop('OverTime', axis=1)

                        prediction = bool(self.model.predict(X_direct.values)[0])
                        probability = float(self.model.predict_proba(X_direct.values)[0][1])

                        print("Used direct prediction as fallback")
                        return {
                            "prediction": prediction,
                            "probability": probability
                        }
                    except Exception as direct_error:
                        print(f"Direct prediction also failed: {str(direct_error)}")
                        raise ValueError(f"Failed to process input data: {str(e)}") from direct_error
            else:
                # If no preprocessor, just use the raw values
                X_processed = X.values
                print("No preprocessor available, using raw values")

            prediction = bool(self.model.predict(X_processed)[0])
            probability = float(self.model.predict_proba(X_processed)[0][1])

            print(f"Prediction result: {prediction}, probability: {probability}")

            return {
                "prediction": prediction,
                "probability": probability
            }
        except Exception as e:
            import traceback
            traceback.print_exc()
            raise ValueError(f"Error during prediction: {str(e)}") from e

    def get_feature_importance(self) -> Optional[List[float]]:
        """Return the model's feature importances as plain floats.

        Returns:
            A list of floats, or ``None`` when the model has no
            ``feature_importances_`` attribute (including when no model is
            loaded) or when reading it fails.
        """
        try:
            if hasattr(self.model, 'feature_importances_'):
                return [float(x) for x in self.model.feature_importances_]
            return None
        except Exception as e:
            print(f"Error getting feature importance: {str(e)}")
            return None
def train_model():
    """Fit an AttritionModel on the HR attrition CSV and persist it.

    The CSV is read from ``<root>/data/HR-Employee-Attrition.csv``, where
    ``<root>`` is two directories above this file. Progress is printed to
    stdout. Any failure is printed together with its traceback and the
    process exits with status 1.
    """
    try:
        attrition = AttritionModel()

        # Resolve every location relative to this file.
        here = os.path.dirname(os.path.abspath(__file__))
        root_dir = os.path.dirname(os.path.dirname(here))
        csv_path = os.path.join(root_dir, "data", "HR-Employee-Attrition.csv")
        models_dir = os.path.join(root_dir, 'models')

        print(f"Loading data from: {csv_path}")
        print(f"Model will be saved to: {models_dir}")

        # Fail early with a clear message when the dataset is absent.
        if not os.path.exists(csv_path):
            raise FileNotFoundError(f"Data file not found at {csv_path}")

        os.makedirs(models_dir, exist_ok=True)

        print("Loading and preparing data...")
        frame = pd.read_csv(csv_path)

        # Restrict the inputs to the columns the model declares.
        selected = attrition.numeric_features + attrition.categorical_features
        print(f"Using features: {selected}")
        inputs = frame[selected]
        # Encode the textual target as 1 (Yes) / 0 (No).
        target = frame['Attrition'].map({'Yes': 1, 'No': 0})

        print("Training model...")
        attrition.train(inputs, target)
        print("Model trained and saved successfully")
    except Exception as e:
        print(f"Error during model training: {str(e)}")
        import traceback
        print(traceback.format_exc())
        sys.exit(1)
if __name__ == "__main__":
    # Running this file as a script retrains the model and rewrites the
    # saved artifacts.
    train_model()