Synthack-SyntaxSquad / src /api /attrition_model.py
WickedFaith's picture
Upload 77 files
3efedb0 verified
raw
history blame contribute delete
10.3 kB
import os
import pickle
import sys
from typing import List, Optional

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# Make the repository root (two directory levels above this file) importable.
project_root = os.path.abspath(
    os.path.join(os.path.dirname(__file__), os.pardir, os.pardir)
)
sys.path.append(project_root)
class AttritionModel:
    """Random-forest attrition classifier with pickle-based persistence.

    The model and its fitted preprocessor are stored as two pickle files in
    ``<project_root>/models``. ``train`` writes them and ``__init__`` tries to
    read them back, so a fresh instance is ready to ``predict`` whenever a
    previous training run has saved its artifacts.
    """

    # String spellings (case-insensitive) treated as an affirmative OverTime.
    _TRUTHY_STRINGS = frozenset({'yes', 'true', '1'})

    def __init__(self):
        """Set up paths and feature lists, then load saved artifacts if any."""
        self.model = None
        self.preprocessor = None
        self.model_path = os.path.join(project_root, "models", "attrition_model.pkl")
        self.preprocessor_path = os.path.join(project_root, "models", "attrition_preprocessor.pkl")

        # Create models directory if it doesn't exist
        os.makedirs(os.path.dirname(self.model_path), exist_ok=True)

        # Columns consumed by the preprocessor; everything else is dropped.
        self.numeric_features = [
            'Age', 'DistanceFromHome', 'EnvironmentSatisfaction',
            'JobLevel', 'JobSatisfaction', 'MonthlyIncome',
            'TotalWorkingYears', 'WorkLifeBalance', 'YearsAtCompany'
        ]
        self.categorical_features = ['OverTime']

        # The preprocessor is only looked for once the model itself loaded,
        # mirroring the order in which train() writes the two files.
        self.model = self._load_pickle(self.model_path)
        if self.model is not None:
            self.preprocessor = self._load_pickle(self.preprocessor_path)
        if self.model is None or self.preprocessor is None:
            print("No existing model found. Please train the model first.")

    @staticmethod
    def _load_pickle(path):
        """Return the object pickled at ``path``, or ``None`` if unavailable.

        A missing file is the normal "not trained yet" case and is silent.
        Any other failure (corrupt file, incompatible library version, ...)
        is reported with its real cause instead of being swallowed, while
        KeyboardInterrupt and SystemExit are allowed to propagate.
        """
        try:
            with open(path, 'rb') as f:
                # NOTE(security): unpickling runs code embedded in the file;
                # only load artifacts written by this application.
                return pickle.load(f)
        except FileNotFoundError:
            return None
        except Exception as e:
            print(f"Could not load {path}: {e}")
            return None

    @classmethod
    def _overtime_flag(cls, value):
        """Map an OverTime value (string, bool or number) to 1 or 0."""
        if isinstance(value, str):
            return 1 if value.strip().lower() in cls._TRUTHY_STRINGS else 0
        return 1 if value else 0

    def preprocess_data(self, X):
        """Fit a new preprocessor on ``X`` and return the transformed matrix.

        Numeric features are standardised and categorical features are
        one-hot encoded with the first level dropped. This REPLACES
        ``self.preprocessor``, so it belongs to training only.
        """
        numeric_transformer = StandardScaler()
        try:
            # scikit-learn >= 1.2 spelling; ``sparse`` was removed in 1.4.
            categorical_transformer = OneHotEncoder(drop='first', sparse_output=False)
        except TypeError:
            # Older scikit-learn only understands the original keyword.
            categorical_transformer = OneHotEncoder(drop='first', sparse=False)

        self.preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, self.numeric_features),
                ('cat', categorical_transformer, self.categorical_features)
            ],
            remainder='drop'  # Drop any columns not specified in features
        )
        return self.preprocessor.fit_transform(X)

    def train(self, X, y):
        """Fit the preprocessor and the random forest, then pickle both.

        Args:
            X: DataFrame containing at least the numeric and categorical
                feature columns.
            y: Target labels aligned with ``X``.
        """
        X_processed = self.preprocess_data(X)

        self.model = RandomForestClassifier(
            n_estimators=100,
            max_depth=10,
            random_state=42
        )
        self.model.fit(X_processed, y)

        # Persist both artifacts so a later instance can load them.
        with open(self.model_path, 'wb') as f:
            pickle.dump(self.model, f)
        with open(self.preprocessor_path, 'wb') as f:
            pickle.dump(self.preprocessor, f)

    def _build_input_frame(self, features):
        """Turn a feature mapping into a typed single-row DataFrame.

        Only the required features are read, so unrelated keys in
        ``features`` are ignored rather than rejected.

        Raises:
            ValueError: a required feature is missing, is not numeric, or is
                NaN (a single row has nothing to impute from).
        """
        required_columns = self.numeric_features + self.categorical_features
        for col in required_columns:
            if col not in features:
                raise ValueError(f"Missing required feature: {col}")

        processed_features = {}
        for key in required_columns:
            value = features[key]
            if key == 'OverTime':
                processed_features[key] = self._overtime_flag(value)
                continue
            try:
                number = float(value)
            except (ValueError, TypeError) as exc:
                raise ValueError(
                    f"Invalid value for feature {key}: {value}. Expected numeric value."
                ) from exc
            if np.isnan(number):
                raise ValueError(f"Missing (NaN) value for feature {key}.")
            processed_features[key] = number
        print(f"Processed features: {processed_features}")

        dtypes = {col: np.float64 for col in self.numeric_features}
        dtypes.update({col: np.int64 for col in self.categorical_features})
        return pd.DataFrame([processed_features], columns=required_columns).astype(dtypes)

    def _align_categoricals(self, X):
        """Express the 0/1 OverTime flag in the labels the encoder was fit on.

        ``train`` fits the one-hot encoder on whatever values the training
        frame holds. If those were strings such as 'Yes'/'No', an integer
        flag is an unknown category at transform time, so the flag is mapped
        back onto the fitted labels. Encoders fitted on non-string values,
        and preprocessors that do not expose fitted categories, receive the
        frame unchanged.
        """
        if 'OverTime' not in self.categorical_features:
            return X
        try:
            encoder = self.preprocessor.named_transformers_['cat']
            position = self.categorical_features.index('OverTime')
            known = list(encoder.categories_[position])
        except (AttributeError, KeyError, IndexError, TypeError):
            return X
        if not known or not all(isinstance(label, str) for label in known):
            return X

        by_flag = {self._overtime_flag(label): label for label in known}
        aligned = X.copy()
        aligned['OverTime'] = [
            by_flag.get(flag, 'Yes' if flag else 'No') for flag in X['OverTime']
        ]
        return aligned

    def predict(self, features):
        """Predict attrition for one employee.

        Args:
            features: Mapping of feature name to value. Every numeric and
                categorical feature must be present; other keys are ignored.
                ``OverTime`` accepts 'yes'/'true'/'1' (any case), booleans or
                numbers.

        Returns:
            ``{"prediction": bool, "probability": float}`` where probability
            is the model's score for the positive class.

        Raises:
            ValueError: no model is loaded, the input is invalid, or
                preprocessing / inference fails. A preprocessing failure is
                never papered over by predicting on unscaled values.
        """
        try:
            if self.model is None:
                raise ValueError("Model not loaded. Please ensure model file exists and is valid.")

            print(f"Input features: {features}")
            X = self._build_input_frame(features)
            print(f"Input data: {X.to_dict('records')}")

            if self.preprocessor is not None:
                try:
                    X_processed = self.preprocessor.transform(self._align_categoricals(X))
                except Exception as e:
                    print(f"Error during preprocessing: {str(e)}")
                    raise ValueError(f"Failed to process input data: {str(e)}") from e
                print("Preprocessing successful")
            else:
                # If no preprocessor, just use the raw values
                X_processed = X.values
                print("No preprocessor available, using raw values")

            prediction = bool(self.model.predict(X_processed)[0])
            probability = float(self.model.predict_proba(X_processed)[0][1])
            print(f"Prediction result: {prediction}, probability: {probability}")
            return {
                "prediction": prediction,
                "probability": probability
            }
        except Exception as e:
            import traceback
            traceback.print_exc()
            raise ValueError(f"Error during prediction: {str(e)}") from e

    def get_feature_importance(self) -> Optional[List[float]]:
        """Return the model's feature importances as floats.

        Returns ``None`` when no model is loaded, when the model exposes no
        ``feature_importances_`` attribute, or when reading it fails.
        """
        try:
            importances = getattr(self.model, 'feature_importances_', None)
            if importances is None:
                return None
            return [float(x) for x in importances]
        except Exception as e:
            print(f"Error getting feature importance: {str(e)}")
            return None
def train_model():
    """Fit the attrition classifier on the HR CSV and persist it.

    Reads ``data/HR-Employee-Attrition.csv`` under the repository root,
    selects the model's feature columns, maps the ``Attrition`` column from
    'Yes'/'No' to 1/0 and calls ``AttritionModel.train``. Any failure is
    printed together with its traceback and the process exits with status 1.
    """
    try:
        attrition = AttritionModel()

        # Locate <repo>/data and <repo>/models relative to this file.
        here = os.path.dirname(os.path.abspath(__file__))
        repo_root = os.path.dirname(os.path.dirname(here))
        csv_path = os.path.join(repo_root, "data", "HR-Employee-Attrition.csv")
        output_dir = os.path.join(repo_root, 'models')

        print(f"Loading data from: {csv_path}")
        print(f"Model will be saved to: {output_dir}")

        # Fail early with a clear message when the dataset is absent.
        if not os.path.exists(csv_path):
            raise FileNotFoundError(f"Data file not found at {csv_path}")

        os.makedirs(output_dir, exist_ok=True)

        print("Loading and preparing data...")
        frame = pd.read_csv(csv_path)

        # Restrict the frame to the columns the model is defined over.
        selected = attrition.numeric_features + attrition.categorical_features
        print(f"Using features: {selected}")
        inputs = frame[selected]
        target = frame['Attrition'].map({'Yes': 1, 'No': 0})

        print("Training model...")
        attrition.train(inputs, target)
        print("Model trained and saved successfully")
    except Exception as exc:
        print(f"Error during model training: {str(exc)}")
        import traceback
        print(traceback.format_exc())
        sys.exit(1)


if __name__ == "__main__":
    train_model()