File size: 10,260 Bytes
3efedb0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
import os
import pickle
import sys
from typing import Any, Dict, List, Optional

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Make the project root (two directories above this file's directory) importable.
project_root = os.path.abspath(
    os.path.join(os.path.dirname(__file__), os.pardir, os.pardir)
)
sys.path.append(project_root)

class AttritionModel:
    """Employee-attrition classifier: a random forest plus its fitted preprocessor.

    On construction the instance tries to load a previously pickled model and
    preprocessor from ``<project_root>/models``. If that fails, ``model`` and
    ``preprocessor`` stay ``None`` until :meth:`train` is called.
    """

    def __init__(self):
        self.model = None
        self.preprocessor = None
        self.model_path = os.path.join(project_root, "models", "attrition_model.pkl")
        self.preprocessor_path = os.path.join(project_root, "models", "attrition_preprocessor.pkl")

        # Create models directory if it doesn't exist
        os.makedirs(os.path.dirname(self.model_path), exist_ok=True)

        # Define the features we'll use
        self.numeric_features = [
            'Age', 'DistanceFromHome', 'EnvironmentSatisfaction',
            'JobLevel', 'JobSatisfaction', 'MonthlyIncome',
            'TotalWorkingYears', 'WorkLifeBalance', 'YearsAtCompany'
        ]
        self.categorical_features = ['OverTime']

        # Try to load existing model and preprocessor
        self._load_artifacts()

    def _load_artifacts(self) -> bool:
        """Load the pickled model and preprocessor; return True on success.

        Both artifacts are loaded into locals first and only assigned together,
        so a failure never leaves a model without its matching preprocessor.
        """
        try:
            # NOTE(security): pickle.load can execute arbitrary code. These
            # files must only ever be artifacts written by train().
            with open(self.model_path, 'rb') as f:
                model = pickle.load(f)
            with open(self.preprocessor_path, 'rb') as f:
                preprocessor = pickle.load(f)
        except FileNotFoundError:
            print("No existing model found. Please train the model first.")
            return False
        except Exception as e:
            # Unpickling can raise almost anything (corrupt file, missing
            # class, version mismatch); report it instead of hiding it.
            print(f"Could not load existing model ({e}). Please train the model first.")
            return False

        self.model = model
        self.preprocessor = preprocessor
        return True

    def preprocess_data(self, X):
        """Fit a new preprocessor on ``X`` and return the transformed data.

        Numeric features are standardized and categorical features are one-hot
        encoded (first category dropped); all other columns are discarded.
        This refits ``self.preprocessor`` on every call, so it is intended for
        training data. ``predict`` only calls ``transform`` on the fitted one.
        """
        numeric_transformer = StandardScaler()
        try:
            # scikit-learn >= 1.2 spells the dense-output switch `sparse_output`.
            categorical_transformer = OneHotEncoder(drop='first', sparse_output=False)
        except TypeError:
            # Older scikit-learn releases only know the `sparse` keyword.
            categorical_transformer = OneHotEncoder(drop='first', sparse=False)

        # Combine preprocessing steps
        self.preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, self.numeric_features),
                ('cat', categorical_transformer, self.categorical_features)
            ],
            remainder='drop'  # Drop any columns not specified in features
        )

        return self.preprocessor.fit_transform(X)

    def train(self, X, y):
        """Fit the preprocessor and random forest on ``X``/``y`` and pickle both.

        Args:
            X: Table containing at least the numeric and categorical feature
                columns declared in ``__init__``.
            y: Target labels aligned with the rows of ``X``.
        """
        # Preprocess the data
        X_processed = self.preprocess_data(X)

        # Create and train the model
        self.model = RandomForestClassifier(
            n_estimators=100,
            max_depth=10,
            random_state=42
        )
        self.model.fit(X_processed, y)

        # Save the model and preprocessor
        with open(self.model_path, 'wb') as f:
            pickle.dump(self.model, f)
        with open(self.preprocessor_path, 'wb') as f:
            pickle.dump(self.preprocessor, f)

    def _encoder_expects_labels(self) -> bool:
        """Return True if the fitted OverTime encoder learned string categories."""
        if self.preprocessor is None:
            return False
        try:
            position = self.categorical_features.index('OverTime')
            categories = self.preprocessor.named_transformers_['cat'].categories_[position]
        except (AttributeError, KeyError, IndexError, TypeError, ValueError):
            # Not introspectable: keep the numeric 1/0 encoding.
            return False
        return len(categories) > 0 and all(isinstance(c, str) for c in categories)

    def _encode_overtime(self, value):
        """Normalize an OverTime input to the representation the encoder was fitted on.

        Strings 'yes'/'true'/'1' (case-insensitive) and truthy non-strings mean
        overtime. The result is 'Yes'/'No' when the fitted encoder holds string
        categories, otherwise 1/0.
        """
        if isinstance(value, str):
            flag = value.strip().lower() in ('yes', 'true', '1')
        else:
            flag = bool(value)

        if self._encoder_expects_labels():
            return 'Yes' if flag else 'No'
        return int(flag)

    @staticmethod
    def _to_finite_float(name: str, value: Any) -> float:
        """Convert ``value`` to a finite float or raise ``ValueError`` naming the feature."""
        try:
            number = float(value)
        except (ValueError, TypeError):
            raise ValueError(
                f"Invalid value for feature {name}: {value}. Expected numeric value."
            ) from None
        # NaN/inf cannot be imputed from a single row, so reject them here.
        if not np.isfinite(number):
            raise ValueError(
                f"Invalid value for feature {name}: {value}. Expected numeric value."
            )
        return number

    def _build_input_frame(self, features: Dict[str, Any]) -> pd.DataFrame:
        """Validate ``features`` and return a one-row frame in training column order.

        Keys that are not model features are ignored.
        """
        required_columns = self.numeric_features + self.categorical_features

        for col in required_columns:
            if col not in features:
                raise ValueError(f"Missing required feature: {col}")

        row = {}
        for col in required_columns:
            value = features[col]
            if col == 'OverTime':
                row[col] = self._encode_overtime(value)
                continue
            number = self._to_finite_float(col, value)
            # Any other categorical feature keeps the integer encoding.
            row[col] = int(number) if col in self.categorical_features else number

        print(f"Processed features: {row}")
        return pd.DataFrame([row], columns=required_columns)

    def predict(self, features: Dict[str, Any]) -> Dict[str, Any]:
        """Make a prediction using the trained model.

        Args:
            features: Mapping of feature name to raw value. Every numeric and
                categorical feature declared in ``__init__`` must be present;
                extra keys are ignored.

        Returns:
            ``{"prediction": bool, "probability": float}`` where probability is
            taken from column 1 of ``predict_proba``.

        Raises:
            ValueError: If the model is not loaded, the input is invalid, or
                preprocessing or prediction fails. The original error is chained.
        """
        try:
            if self.model is None:
                raise ValueError("Model not loaded. Please ensure model file exists and is valid.")

            print(f"Input features: {features}")

            X = self._build_input_frame(features)

            if self.preprocessor is not None:
                try:
                    X_processed = self.preprocessor.transform(X)
                except Exception as e:
                    # Do not fall back to raw features: the forest is fitted on
                    # preprocessed (standardized) inputs, so that would return
                    # a confident but meaningless prediction.
                    print(f"Error during preprocessing: {str(e)}")
                    raise ValueError(f"Failed to process input data: {str(e)}") from e
                print("Preprocessing successful")
            else:
                # If no preprocessor, just use the raw values
                X_processed = X.values
                print("No preprocessor available, using raw values")

            # Make prediction
            prediction = bool(self.model.predict(X_processed)[0])
            probability = float(self.model.predict_proba(X_processed)[0][1])

            print(f"Prediction result: {prediction}, probability: {probability}")

            return {
                "prediction": prediction,
                "probability": probability
            }

        except Exception as e:
            import traceback
            traceback.print_exc()
            raise ValueError(f"Error during prediction: {str(e)}") from e

    def get_feature_importance(self) -> Optional[List[float]]:
        """Return the model's feature importances as floats, or None if unavailable.

        None is returned when the model has no ``feature_importances_``
        attribute or when reading it fails.
        """
        try:
            if hasattr(self.model, 'feature_importances_'):
                # Convert feature importances to a list of floats
                return [float(x) for x in self.model.feature_importances_]
            return None
        except Exception as e:
            print(f"Error getting feature importance: {str(e)}")
            return None

def train_model():
    """Train and save the attrition prediction model.

    Reads ``data/HR-Employee-Attrition.csv`` under the project root, selects
    the feature columns declared on ``AttritionModel``, maps the ``Attrition``
    column from 'Yes'/'No' to 1/0 and calls ``AttritionModel.train``, which
    fits and pickles the model.

    Any failure is printed with its traceback and ends the process with exit
    status 1.
    """
    try:
        model = AttritionModel()

        # Get absolute paths
        current_dir = os.path.dirname(os.path.abspath(__file__))
        root_dir = os.path.dirname(os.path.dirname(current_dir))
        data_file = os.path.join(root_dir, "data", "HR-Employee-Attrition.csv")
        model_dir = os.path.join(root_dir, 'models')

        print(f"Loading data from: {data_file}")
        print(f"Model will be saved to: {model_dir}")

        # Ensure data file exists
        if not os.path.exists(data_file):
            raise FileNotFoundError(f"Data file not found at {data_file}")

        # Create models directory if it doesn't exist
        os.makedirs(model_dir, exist_ok=True)

        # Load data
        print("Loading and preparing data...")
        data = pd.read_csv(data_file)

        # Select only the features we want to use
        features = model.numeric_features + model.categorical_features
        print(f"Using features: {features}")

        # Fail with a readable message instead of a bare KeyError.
        missing_columns = [
            col for col in features + ['Attrition'] if col not in data.columns
        ]
        if missing_columns:
            raise ValueError(f"Data file is missing required columns: {missing_columns}")

        X = data[features]
        y = data['Attrition'].map({'Yes': 1, 'No': 0})

        # Series.map turns any label outside the dict into NaN; catch that here
        # rather than letting the classifier fail on it later.
        unmapped = y.isnull()
        if unmapped.any():
            bad_labels = sorted(data.loc[unmapped, 'Attrition'].astype(str).unique())
            raise ValueError(f"Unexpected Attrition labels (expected 'Yes'/'No'): {bad_labels}")
        y = y.astype(int)

        # Train the model
        print("Training model...")
        model.train(X, y)
        print("Model trained and saved successfully")

    except Exception as e:
        print(f"Error during model training: {str(e)}")
        import traceback
        print(traceback.format_exc())
        sys.exit(1)

# Script entry point: running this file directly trains the model and saves it.
if __name__ == "__main__":
    train_model()