# Synthack-SyntaxSquad: src/train_loan_model.py
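"""Training script for the loan approval model.

Defines a standalone LoanModelTrainer (random forest with scaling and label
encoding) and a train_loan_model() entry point that trains and saves the
project's LoanApprovalModel from data/loan_approval_dataset.csv.
"""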
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import joblib
import os
import logging
import shap
import sys
# Add the project root to the Python path
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# Now import from src
from src.api.loan_model import LoanApprovalModel
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class LoanModelTrainer:
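    """Random forest trainer for loan approval data: handles preprocessing, training, evaluation, and persistence."""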
def __init__(self):
self.model = RandomForestClassifier(n_estimators=100, random_state=42)
self.scaler = StandardScaler()
self.label_encoders = {}
def load_data(self, file_path):
"""Load and preprocess the loan approval dataset."""
logger.info("Loading dataset...")
        df = pd.read_csv(file_path)
        # Strip stray whitespace from column names and string values so the status
        # mapping below does not silently produce NaN (mirrors train_loan_model below).
        df.columns = df.columns.str.strip()
        for col in df.select_dtypes(include=['object']).columns:
            df[col] = df[col].str.strip()
        # Convert loan status to binary
        df['loan_status'] = df['loan_status'].map({'Approved': 1, 'Rejected': 0})
# Calculate derived features
df['debt_to_income'] = df['loan_amount'] / df['income_annum']
df['total_assets'] = df['residential_assets_value'] + df['commercial_assets_value'] + df['luxury_assets_value'] + df['bank_asset_value']
df['asset_to_loan'] = df['total_assets'] / df['loan_amount']
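        # These ratios expose repayment capacity directly (loan burden relative to
        # income, and collateral relative to the requested amount) rather than
        # leaving the model to infer them from the raw columns.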
# Define features
numerical_features = [
'no_of_dependents', 'income_annum', 'loan_amount', 'loan_term',
'cibil_score', 'residential_assets_value', 'commercial_assets_value',
'luxury_assets_value', 'bank_asset_value', 'debt_to_income',
'total_assets', 'asset_to_loan'
]
categorical_features = ['education', 'self_employed']
# Encode categorical features
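        # (fitted encoders are stored on self so the identical mapping can be
        # reused when the saved model scores new applications)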
for feature in categorical_features:
self.label_encoders[feature] = LabelEncoder()
df[feature] = self.label_encoders[feature].fit_transform(df[feature])
# Prepare X and y
X = df[numerical_features + categorical_features]
y = df['loan_status']
return X, y, numerical_features, categorical_features
def train(self, X, y, numerical_features, categorical_features):
"""Train the model and evaluate its performance."""
logger.info("Splitting data into train and test sets...")
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42
)
# Scale numerical features
logger.info("Scaling numerical features...")
X_train[numerical_features] = self.scaler.fit_transform(X_train[numerical_features])
X_test[numerical_features] = self.scaler.transform(X_test[numerical_features])
# Train the model
logger.info("Training the model...")
self.model.fit(X_train, y_train)
# Evaluate the model
logger.info("Evaluating the model...")
y_pred = self.model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
logger.info(f"Model accuracy: {accuracy:.4f}")
logger.info("Classification Report:")
logger.info(report)
return accuracy, report
def save_model(self, save_dir='models'):
"""Save the trained model and preprocessing objects."""
logger.info("Saving model components...")
os.makedirs(save_dir, exist_ok=True)
# Save model components
joblib.dump(self.model, os.path.join(save_dir, 'loan_model.joblib'))
joblib.dump(self.scaler, os.path.join(save_dir, 'loan_scaler.joblib'))
joblib.dump(self.label_encoders, os.path.join(save_dir, 'loan_label_encoders.joblib'))
logger.info("Model components saved successfully.")
def train_loan_model():
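    """Train the project's LoanApprovalModel on the bundled dataset and save it to the models directory."""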
# Create models directory if it doesn't exist
model_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), "models")
os.makedirs(model_dir, exist_ok=True)
# Load the dataset
data_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), "data", "loan_approval_dataset.csv")
data = pd.read_csv(data_path)
# Clean column names and string values by removing leading/trailing spaces
data.columns = data.columns.str.strip()
for col in data.select_dtypes(include=['object']).columns:
data[col] = data[col].str.strip()
# Remove rows with NaN values
data = data.dropna()
# Convert loan status to binary
data['loan_status'] = data['loan_status'].map({'Approved': 1, 'Rejected': 0})
# Separate features and target
X = data.drop(['loan_status', 'loan_id'], axis=1) # Also drop loan_id as it's not a feature
y = data['loan_status']
print("Dataset shape:", X.shape)
print("Number of approved loans:", sum(y == 1))
print("Number of rejected loans:", sum(y == 0))
# Initialize model without loading existing components
model = LoanApprovalModel(model_dir=model_dir, load_model=False)
# Train the model
model.train(X, y)
# Save the model
model.save(model_dir)
print(f"Model trained and saved successfully in {model_dir}!")
if __name__ == "__main__":
train_loan_model()