# Training script for the loan approval model.
# NOTE(review): the file previously began with non-Python page residue
# ("Spaces:" / "Sleeping" / "Sleeping") left by an export; replaced with this header.
import pandas as pd | |
import numpy as np | |
from sklearn.model_selection import train_test_split | |
from sklearn.ensemble import RandomForestClassifier | |
from sklearn.preprocessing import StandardScaler, LabelEncoder | |
from sklearn.metrics import accuracy_score, classification_report | |
import joblib | |
import os | |
import logging | |
import shap | |
import sys | |
# Add the project root to the Python path so that `src.*` imports resolve
# when this script is executed directly (rather than as an installed package).
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# Now import from src (depends on the sys.path tweak above).
from src.api.loan_model import LoanApprovalModel
# Set up module-level logging; every function below logs through `logger`.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class LoanModelTrainer:
    """Train, evaluate, and persist a RandomForest loan-approval classifier.

    Holds the model plus the preprocessing objects (scaler and per-column
    label encoders) so they can be saved together and reused at inference time.
    """

    def __init__(self):
        # Fixed random_state so training runs are reproducible.
        self.model = RandomForestClassifier(n_estimators=100, random_state=42)
        self.scaler = StandardScaler()
        # One LabelEncoder per categorical column, keyed by column name.
        self.label_encoders = {}

    def load_data(self, file_path):
        """Load and preprocess the loan approval dataset.

        Args:
            file_path: Path to the loan approval CSV file.

        Returns:
            Tuple ``(X, y, numerical_features, categorical_features)`` where
            ``X`` is the feature DataFrame and ``y`` the binary target
            (1 = Approved, 0 = Rejected).
        """
        logger.info("Loading dataset...")
        df = pd.read_csv(file_path)

        # The raw CSV can contain leading/trailing spaces in both headers and
        # string values (e.g. ' Approved') — see the identical cleanup in
        # train_loan_model(). Strip them first, otherwise the status mapping
        # below silently produces NaN labels.
        df.columns = df.columns.str.strip()
        for col in df.select_dtypes(include=['object']).columns:
            df[col] = df[col].str.strip()

        # Convert loan status to binary; any value outside the mapping becomes
        # NaN, so drop those rows rather than feeding NaN labels to the model.
        df['loan_status'] = df['loan_status'].map({'Approved': 1, 'Rejected': 0})
        df = df.dropna(subset=['loan_status'])

        # Derived ratio features. NOTE(review): assumes income_annum and
        # loan_amount are non-zero in the dataset — a zero would yield inf.
        df['debt_to_income'] = df['loan_amount'] / df['income_annum']
        df['total_assets'] = (
            df['residential_assets_value']
            + df['commercial_assets_value']
            + df['luxury_assets_value']
            + df['bank_asset_value']
        )
        df['asset_to_loan'] = df['total_assets'] / df['loan_amount']

        # Feature definitions.
        numerical_features = [
            'no_of_dependents', 'income_annum', 'loan_amount', 'loan_term',
            'cibil_score', 'residential_assets_value', 'commercial_assets_value',
            'luxury_assets_value', 'bank_asset_value', 'debt_to_income',
            'total_assets', 'asset_to_loan'
        ]
        categorical_features = ['education', 'self_employed']

        # Encode categorical features, keeping the fitted encoder for reuse.
        for feature in categorical_features:
            self.label_encoders[feature] = LabelEncoder()
            df[feature] = self.label_encoders[feature].fit_transform(df[feature])

        X = df[numerical_features + categorical_features]
        y = df['loan_status']
        return X, y, numerical_features, categorical_features

    def train(self, X, y, numerical_features, categorical_features):
        """Train the model and evaluate its performance.

        Args:
            X: Feature DataFrame.
            y: Binary target Series.
            numerical_features: Column names to standardize.
            categorical_features: Column names already label-encoded.

        Returns:
            Tuple ``(accuracy, report)`` computed on a held-out 20% test split.
        """
        logger.info("Splitting data into train and test sets...")
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # Copy before column assignment so we never write into a slice of the
        # original frame (avoids pandas SettingWithCopyWarning).
        X_train = X_train.copy()
        X_test = X_test.copy()

        # Scale numerical features: fit on train only, then apply to test.
        logger.info("Scaling numerical features...")
        X_train[numerical_features] = self.scaler.fit_transform(X_train[numerical_features])
        X_test[numerical_features] = self.scaler.transform(X_test[numerical_features])

        logger.info("Training the model...")
        self.model.fit(X_train, y_train)

        logger.info("Evaluating the model...")
        y_pred = self.model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred)

        logger.info("Model accuracy: %.4f", accuracy)
        logger.info("Classification Report:")
        logger.info(report)
        return accuracy, report

    def save_model(self, save_dir='models'):
        """Save the trained model and preprocessing objects.

        Args:
            save_dir: Directory to write the joblib artifacts into
                (created if missing).
        """
        logger.info("Saving model components...")
        os.makedirs(save_dir, exist_ok=True)
        # Persist the model together with its preprocessing state so
        # inference can reproduce the exact training-time transforms.
        joblib.dump(self.model, os.path.join(save_dir, 'loan_model.joblib'))
        joblib.dump(self.scaler, os.path.join(save_dir, 'loan_scaler.joblib'))
        joblib.dump(self.label_encoders, os.path.join(save_dir, 'loan_label_encoders.joblib'))
        logger.info("Model components saved successfully.")
def train_loan_model():
    """Load the loan dataset, train a LoanApprovalModel, and save it.

    Reads ``data/loan_approval_dataset.csv`` relative to the project root,
    cleans it, and persists the trained model components into ``models/``.

    Raises:
        FileNotFoundError: If the dataset CSV does not exist.
        KeyError: If expected columns ('loan_status', 'loan_id') are missing.
    """
    project_root = os.path.dirname(os.path.dirname(__file__))

    # Create the models directory if it doesn't exist.
    model_dir = os.path.join(project_root, "models")
    os.makedirs(model_dir, exist_ok=True)

    # Load the dataset.
    data_path = os.path.join(project_root, "data", "loan_approval_dataset.csv")
    data = pd.read_csv(data_path)

    # Clean column names and string values by removing leading/trailing
    # spaces — the raw CSV contains values like ' Approved'.
    data.columns = data.columns.str.strip()
    for col in data.select_dtypes(include=['object']).columns:
        data[col] = data[col].str.strip()

    # Remove rows with NaN values.
    data = data.dropna()

    # Convert loan status to binary; values outside the mapping become NaN,
    # so drop those rows rather than training on NaN labels.
    data['loan_status'] = data['loan_status'].map({'Approved': 1, 'Rejected': 0})
    data = data.dropna(subset=['loan_status'])

    # Separate features and target; loan_id is an identifier, not a feature.
    X = data.drop(['loan_status', 'loan_id'], axis=1)
    y = data['loan_status']

    # Use the module logger (consistent with the rest of this file) instead
    # of bare print() calls.
    logger.info("Dataset shape: %s", X.shape)
    logger.info("Number of approved loans: %d", int((y == 1).sum()))
    logger.info("Number of rejected loans: %d", int((y == 0).sum()))

    # Initialize the model without loading existing components, train, save.
    model = LoanApprovalModel(model_dir=model_dir, load_model=False)
    model.train(X, y)
    model.save(model_dir)
    logger.info("Model trained and saved successfully in %s!", model_dir)


if __name__ == "__main__":
    train_loan_model()