import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import joblib
import os
import logging
import sys
# Add the project root to the Python path
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# Now import from src
from src.api.loan_model import LoanApprovalModel
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class LoanModelTrainer:
    def __init__(self):
        self.model = RandomForestClassifier(n_estimators=100, random_state=42)
        self.scaler = StandardScaler()
        self.label_encoders = {}

    def load_data(self, file_path):
        """Load and preprocess the loan approval dataset."""
        logger.info("Loading dataset...")
        df = pd.read_csv(file_path)

        # Strip whitespace from column names and string values; the raw CSV
        # contains padded headers and values (see train_loan_model below),
        # and unstripped labels would make the status mapping produce NaN
        df.columns = df.columns.str.strip()
        for col in df.select_dtypes(include=['object']).columns:
            df[col] = df[col].str.strip()

        # Convert loan status to binary
        df['loan_status'] = df['loan_status'].map({'Approved': 1, 'Rejected': 0})

        # Calculate derived features
        df['debt_to_income'] = df['loan_amount'] / df['income_annum']
        df['total_assets'] = (
            df['residential_assets_value'] + df['commercial_assets_value']
            + df['luxury_assets_value'] + df['bank_asset_value']
        )
        df['asset_to_loan'] = df['total_assets'] / df['loan_amount']

        # Define features
        numerical_features = [
            'no_of_dependents', 'income_annum', 'loan_amount', 'loan_term',
            'cibil_score', 'residential_assets_value', 'commercial_assets_value',
            'luxury_assets_value', 'bank_asset_value', 'debt_to_income',
            'total_assets', 'asset_to_loan'
        ]
        categorical_features = ['education', 'self_employed']

        # Encode categorical features
        for feature in categorical_features:
            self.label_encoders[feature] = LabelEncoder()
            df[feature] = self.label_encoders[feature].fit_transform(df[feature])

        # Prepare X and y
        X = df[numerical_features + categorical_features]
        y = df['loan_status']
        return X, y, numerical_features, categorical_features

    def train(self, X, y, numerical_features, categorical_features):
        """Train the model and evaluate its performance."""
        logger.info("Splitting data into train and test sets...")
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # Copy the splits before mutating them in place, so scaling below
        # does not trigger pandas' SettingWithCopyWarning
        X_train = X_train.copy()
        X_test = X_test.copy()

        # Scale numerical features (fit on train only to avoid leakage)
        logger.info("Scaling numerical features...")
        X_train[numerical_features] = self.scaler.fit_transform(X_train[numerical_features])
        X_test[numerical_features] = self.scaler.transform(X_test[numerical_features])

        # Train the model
        logger.info("Training the model...")
        self.model.fit(X_train, y_train)

        # Evaluate the model
        logger.info("Evaluating the model...")
        y_pred = self.model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred)

        logger.info(f"Model accuracy: {accuracy:.4f}")
        logger.info("Classification Report:")
        logger.info(report)
        return accuracy, report

    def save_model(self, save_dir='models'):
        """Save the trained model and preprocessing objects."""
        logger.info("Saving model components...")
        os.makedirs(save_dir, exist_ok=True)

        # Save model components
        joblib.dump(self.model, os.path.join(save_dir, 'loan_model.joblib'))
        joblib.dump(self.scaler, os.path.join(save_dir, 'loan_scaler.joblib'))
        joblib.dump(self.label_encoders, os.path.join(save_dir, 'loan_label_encoders.joblib'))
        logger.info("Model components saved successfully.")

def train_loan_model():
    # Create the models directory if it doesn't exist
    model_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), "models")
    os.makedirs(model_dir, exist_ok=True)

    # Load the dataset
    data_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), "data", "loan_approval_dataset.csv")
    data = pd.read_csv(data_path)

    # Clean column names and string values by removing leading/trailing spaces
    data.columns = data.columns.str.strip()
    for col in data.select_dtypes(include=['object']).columns:
        data[col] = data[col].str.strip()

    # Remove rows with NaN values
    data = data.dropna()

    # Convert loan status to binary
    data['loan_status'] = data['loan_status'].map({'Approved': 1, 'Rejected': 0})

    # Separate features and target; drop loan_id since it is an identifier,
    # not a predictive feature
    X = data.drop(['loan_status', 'loan_id'], axis=1)
    y = data['loan_status']

    print("Dataset shape:", X.shape)
    print("Number of approved loans:", sum(y == 1))
    print("Number of rejected loans:", sum(y == 0))

    # Initialize the model without loading existing components
    model = LoanApprovalModel(model_dir=model_dir, load_model=False)

    # Train the model
    model.train(X, y)

    # Save the model
    model.save(model_dir)
    print(f"Model trained and saved successfully in {model_dir}!")

if __name__ == "__main__":
    train_loan_model()