import pandas as pd import numpy as np from sklearn.model_selection import train_test_split from sklearn.ensemble import RandomForestClassifier from sklearn.preprocessing import StandardScaler, LabelEncoder from sklearn.metrics import accuracy_score, classification_report import joblib import os import logging import shap import sys # Add the project root to the Python path sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) # Now import from src from src.api.loan_model import LoanApprovalModel # Set up logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) class LoanModelTrainer: def __init__(self): self.model = RandomForestClassifier(n_estimators=100, random_state=42) self.scaler = StandardScaler() self.label_encoders = {} def load_data(self, file_path): """Load and preprocess the loan approval dataset.""" logger.info("Loading dataset...") df = pd.read_csv(file_path) # Convert loan status to binary df['loan_status'] = df['loan_status'].map({'Approved': 1, 'Rejected': 0}) # Calculate derived features df['debt_to_income'] = df['loan_amount'] / df['income_annum'] df['total_assets'] = df['residential_assets_value'] + df['commercial_assets_value'] + df['luxury_assets_value'] + df['bank_asset_value'] df['asset_to_loan'] = df['total_assets'] / df['loan_amount'] # Define features numerical_features = [ 'no_of_dependents', 'income_annum', 'loan_amount', 'loan_term', 'cibil_score', 'residential_assets_value', 'commercial_assets_value', 'luxury_assets_value', 'bank_asset_value', 'debt_to_income', 'total_assets', 'asset_to_loan' ] categorical_features = ['education', 'self_employed'] # Encode categorical features for feature in categorical_features: self.label_encoders[feature] = LabelEncoder() df[feature] = self.label_encoders[feature].fit_transform(df[feature]) # Prepare X and y X = df[numerical_features + categorical_features] y = df['loan_status'] return X, y, numerical_features, categorical_features def train(self, X, y, numerical_features, categorical_features): """Train the model and evaluate its performance.""" logger.info("Splitting data into train and test sets...") X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42 ) # Scale numerical features logger.info("Scaling numerical features...") X_train[numerical_features] = self.scaler.fit_transform(X_train[numerical_features]) X_test[numerical_features] = self.scaler.transform(X_test[numerical_features]) # Train the model logger.info("Training the model...") self.model.fit(X_train, y_train) # Evaluate the model logger.info("Evaluating the model...") y_pred = self.model.predict(X_test) accuracy = accuracy_score(y_test, y_pred) report = classification_report(y_test, y_pred) logger.info(f"Model accuracy: {accuracy:.4f}") logger.info("Classification Report:") logger.info(report) return accuracy, report def save_model(self, save_dir='models'): """Save the trained model and preprocessing objects.""" logger.info("Saving model components...") os.makedirs(save_dir, exist_ok=True) # Save model components joblib.dump(self.model, os.path.join(save_dir, 'loan_model.joblib')) joblib.dump(self.scaler, os.path.join(save_dir, 'loan_scaler.joblib')) joblib.dump(self.label_encoders, os.path.join(save_dir, 'loan_label_encoders.joblib')) logger.info("Model components saved successfully.") def train_loan_model(): # Create models directory if it doesn't exist model_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), "models") os.makedirs(model_dir, exist_ok=True) # Load the dataset data_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), "data", "loan_approval_dataset.csv") data = pd.read_csv(data_path) # Clean column names and string values by removing leading/trailing spaces data.columns = data.columns.str.strip() for col in data.select_dtypes(include=['object']).columns: data[col] = data[col].str.strip() # Remove rows with NaN values data = data.dropna() # Convert loan status to binary data['loan_status'] = data['loan_status'].map({'Approved': 1, 'Rejected': 0}) # Separate features and target X = data.drop(['loan_status', 'loan_id'], axis=1) # Also drop loan_id as it's not a feature y = data['loan_status'] print("Dataset shape:", X.shape) print("Number of approved loans:", sum(y == 1)) print("Number of rejected loans:", sum(y == 0)) # Initialize model without loading existing components model = LoanApprovalModel(model_dir=model_dir, load_model=False) # Train the model model.train(X, y) # Save the model model.save(model_dir) print(f"Model trained and saved successfully in {model_dir}!") if __name__ == "__main__": train_loan_model()