Spaces:

WickedFaith
/

Synthack-SyntaxSquad

Sleeping

File size: 10,260 Bytes

3efedb0

import os
import pickle
import sys
from typing import List, Optional

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Make the project root (two directory levels above this file) importable.
project_root = os.path.abspath(
    os.path.join(os.path.dirname(__file__), os.pardir, os.pardir)
)
sys.path.append(project_root)

class AttritionModel:
    """Random-forest attrition classifier bundled with its preprocessing step.

    The fitted classifier and the fitted ``ColumnTransformer`` are pickled to
    ``<project_root>/models`` by :meth:`train` and loaded again, when present,
    by the constructor.
    """

    # Case-insensitive spellings accepted as an affirmative yes/no flag.
    _TRUTHY_STRINGS = frozenset({'yes', 'true', '1'})

    def __init__(self):
        """Set up artifact paths and feature lists, then try to load artifacts.

        Creates the ``models`` directory if it does not exist. A missing or
        unreadable artifact is reported on stdout and leaves the corresponding
        attribute as ``None``; it never raises.
        """
        self.model = None
        self.preprocessor = None
        self.model_path = os.path.join(project_root, "models", "attrition_model.pkl")
        self.preprocessor_path = os.path.join(project_root, "models", "attrition_preprocessor.pkl")

        # Create models directory if it doesn't exist
        os.makedirs(os.path.dirname(self.model_path), exist_ok=True)

        # Define the features we'll use
        self.numeric_features = [
            'Age', 'DistanceFromHome', 'EnvironmentSatisfaction',
            'JobLevel', 'JobSatisfaction', 'MonthlyIncome',
            'TotalWorkingYears', 'WorkLifeBalance', 'YearsAtCompany'
        ]
        self.categorical_features = ['OverTime']

        # NOTE(security): pickle.load can execute arbitrary code. These files
        # must only ever come from a trusted training run of this project.
        try:
            with open(self.model_path, 'rb') as f:
                self.model = pickle.load(f)
            with open(self.preprocessor_path, 'rb') as f:
                self.preprocessor = pickle.load(f)
        except FileNotFoundError:
            print("No existing model found. Please train the model first.")
        except Exception as exc:
            # Corrupt or version-incompatible artifacts: report the reason
            # instead of hiding it, but keep construction non-fatal.
            # KeyboardInterrupt/SystemExit are deliberately not caught.
            print(f"Could not load existing model ({exc}). Please train the model first.")

    def preprocess_data(self, X):
        """Fit a fresh preprocessor on ``X`` and return the transformed data.

        Numeric features are standardized and categorical features are one-hot
        encoded with the first category dropped. Columns not listed in the
        feature lists are discarded. The fitted transformer replaces
        ``self.preprocessor``.

        Args:
            X: DataFrame containing at least the numeric and categorical
                feature columns.

        Returns:
            The dense transformed feature matrix.
        """
        numeric_transformer = StandardScaler()
        try:
            categorical_transformer = OneHotEncoder(drop='first', sparse_output=False)
        except TypeError:
            # scikit-learn < 1.2 only knows the older ``sparse`` keyword.
            categorical_transformer = OneHotEncoder(drop='first', sparse=False)

        self.preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, self.numeric_features),
                ('cat', categorical_transformer, self.categorical_features)
            ],
            remainder='drop'  # Drop any columns not specified in features
        )

        return self.preprocessor.fit_transform(X)

    def train(self, X, y):
        """Fit the preprocessor and classifier, then pickle both to disk.

        Args:
            X: DataFrame with the feature columns.
            y: Target labels aligned with ``X``.
        """
        X_processed = self.preprocess_data(X)

        self.model = RandomForestClassifier(
            n_estimators=100,
            max_depth=10,
            random_state=42
        )
        self.model.fit(X_processed, y)

        with open(self.model_path, 'wb') as f:
            pickle.dump(self.model, f)
        with open(self.preprocessor_path, 'wb') as f:
            pickle.dump(self.preprocessor, f)

    @classmethod
    def _parse_flag(cls, value):
        """Interpret a yes/no style input as a bool.

        Strings count as true when, ignoring case and surrounding whitespace,
        they are one of 'yes', 'true' or '1'; any other string is false.
        Non-strings use normal truthiness.
        """
        if isinstance(value, str):
            return value.strip().lower() in cls._TRUTHY_STRINGS
        return bool(value)

    def _encode_flag(self, column, flag):
        """Return ``flag`` in the representation the prediction path expects.

        Without a preprocessor the raw matrix goes straight to the model, so
        the flag must be numeric. With a preprocessor, the value has to be a
        category the fitted one-hot encoder has seen: 'Yes'/'No' when it was
        fitted on strings, 1/0 when it was fitted on numbers.
        """
        if self.preprocessor is None:
            return int(flag)
        try:
            encoder = self.preprocessor.named_transformers_['cat']
            position = self.categorical_features.index(column)
            known = list(encoder.categories_[position])
        except (AttributeError, KeyError, IndexError, TypeError, ValueError):
            # Preprocessor is not the ColumnTransformer built by this class;
            # fall back to the string labels.
            known = []
        if known and not any(isinstance(item, str) for item in known):
            return int(flag)
        return 'Yes' if flag else 'No'

    def _build_input_frame(self, features):
        """Validate ``features`` and build the one-row model input frame.

        Args:
            features: Mapping from feature name to raw value. Keys that are
                not model features are ignored.

        Returns:
            A one-row DataFrame whose columns are the numeric features
            followed by the categorical features, in that order.

        Raises:
            ValueError: A required feature is missing, or a numeric feature
                is not convertible to a finite float.
        """
        required_columns = self.numeric_features + self.categorical_features
        for column in required_columns:
            if column not in features:
                raise ValueError(f"Missing required feature: {column}")

        row = {}
        for column in self.numeric_features:
            raw = features[column]
            try:
                number = float(raw)
            except (ValueError, TypeError):
                raise ValueError(
                    f"Invalid value for feature {column}: {raw}. Expected numeric value."
                ) from None
            # float() accepts 'nan' and 'inf'; a single row offers no sensible
            # value to impute from, so reject them explicitly.
            if not np.isfinite(number):
                raise ValueError(
                    f"Invalid value for feature {column}: {raw}. Expected finite numeric value."
                )
            row[column] = number

        for column in self.categorical_features:
            row[column] = self._encode_flag(column, self._parse_flag(features[column]))

        return pd.DataFrame([row], columns=required_columns)

    def predict(self, features):
        """Make a prediction using the trained model.

        Args:
            features: Mapping with every numeric feature (anything ``float``
                accepts) and ``OverTime`` (a bool/number, or a string such as
                'Yes'/'No').

        Returns:
            ``{"prediction": bool, "probability": float}`` where the
            probability is the model's estimate for the positive class.

        Raises:
            ValueError: No model is loaded, the input is invalid, or
                preprocessing/prediction fails. The original exception is
                attached as ``__cause__``.
        """
        try:
            if self.model is None:
                raise ValueError("Model not loaded. Please ensure model file exists and is valid.")

            print(f"Input features: {features}")

            X = self._build_input_frame(features)
            print(f"Input data: {X.to_dict('records')}")

            if self.preprocessor is not None:
                # The model was fitted on the preprocessor's output. Feeding
                # it raw values after a transform failure would return
                # meaningless numbers, so a failure here is an error.
                try:
                    X_processed = self.preprocessor.transform(X)
                except Exception as e:
                    print(f"Error during preprocessing: {str(e)}")
                    raise ValueError(f"Failed to process input data: {str(e)}") from e
                print("Preprocessing successful")
            else:
                # If no preprocessor, just use the raw values
                X_processed = X.values
                print("No preprocessor available, using raw values")

            prediction = bool(self.model.predict(X_processed)[0])
            probability = float(self.model.predict_proba(X_processed)[0][1])

            print(f"Prediction result: {prediction}, probability: {probability}")

            return {
                "prediction": prediction,
                "probability": probability
            }

        except Exception as e:
            import traceback
            traceback.print_exc()
            raise ValueError(f"Error during prediction: {str(e)}") from e

    def get_feature_importance(self) -> Optional[List[float]]:
        """Return the model's feature importances as plain floats.

        Returns:
            A list of floats, or ``None`` when no model is loaded, the model
            exposes no ``feature_importances_``, or reading them fails.
        """
        try:
            if hasattr(self.model, 'feature_importances_'):
                return [float(x) for x in self.model.feature_importances_]
            return None
        except Exception as e:
            print(f"Error getting feature importance: {str(e)}")
            return None

def train_model():
    """Fit the attrition model on the HR CSV and persist it.

    Reads ``data/HR-Employee-Attrition.csv`` from two directories above this
    file, trains an ``AttritionModel`` on its feature columns against the
    ``Attrition`` column mapped to 1 ('Yes') / 0 ('No'), and lets the model
    save itself. Any failure is printed together with its traceback and the
    process exits with status 1.
    """
    import traceback

    try:
        attrition = AttritionModel()

        # Resolve the dataset and output locations relative to this file
        here = os.path.dirname(os.path.abspath(__file__))
        root_dir = os.path.dirname(os.path.dirname(here))
        csv_path = os.path.join(root_dir, "data", "HR-Employee-Attrition.csv")
        output_dir = os.path.join(root_dir, 'models')

        print(f"Loading data from: {csv_path}")
        print(f"Model will be saved to: {output_dir}")

        # Fail early with a clear message when the dataset is absent
        if not os.path.exists(csv_path):
            raise FileNotFoundError(f"Data file not found at {csv_path}")

        os.makedirs(output_dir, exist_ok=True)

        print("Loading and preparing data...")
        frame = pd.read_csv(csv_path)

        # Restrict the inputs to the columns the model declares
        selected = attrition.numeric_features + attrition.categorical_features
        print(f"Using features: {selected}")

        inputs = frame[selected]
        target = frame['Attrition'].map({'Yes': 1, 'No': 0})

        print("Training model...")
        attrition.train(inputs, target)
        print("Model trained and saved successfully")

    except Exception as error:
        print(f"Error during model training: {error}")
        print(traceback.format_exc())
        sys.exit(1)

# Script entry point: running this file directly trains the model and saves it.
if __name__ == "__main__":
    train_model()