"""Train a loan-approval classifier and save the fitted pipeline as a pickle.

The script reads ``data/loan_approval_dataset.csv`` under the project root
(the parent of the directory containing this file), fits a preprocessing +
random-forest pipeline on an 80/20 train/test split, prints the train and
test accuracy, and writes the fitted pipeline to ``models/loan_model.pkl``.

Raises:
    ValueError: if ``loan_status`` contains a value other than ``Approved``
        or ``Rejected`` (after whitespace stripping), including missing
        values.
"""
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Get the project root directory (one level above this script's directory)
project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

# Load the dataset
data_path = os.path.join(project_root, 'data', 'loan_approval_dataset.csv')
print(f"Loading data from: {data_path}")
df = pd.read_csv(data_path)

# Clean column names (remove leading/trailing spaces)
df.columns = df.columns.str.strip()

# Clean string values (remove leading/trailing spaces) so that the label
# mapping and the one-hot categories below see the bare values.
for col in df.select_dtypes(include=['object']).columns:
    df[col] = df[col].str.strip()

# Identify numerical and categorical columns
numerical_features = [
    'no_of_dependents',
    'income_annum',
    'loan_amount',
    'loan_term',
    'cibil_score',
    'residential_assets_value',
    'commercial_assets_value',
    'luxury_assets_value',
    'bank_asset_value',
]
categorical_features = ['education', 'self_employed']

# Prepare features and target
X = df[numerical_features + categorical_features]
_label_to_class = {'Approved': 1, 'Rejected': 0}
y = df['loan_status'].map(_label_to_class)

# Series.map turns every label missing from the dict into NaN. Fail here with
# the offending values instead of letting the estimator reject the NaNs later
# with a message that does not say where they came from.
_unmapped = y.isna()
if _unmapped.any():
    _bad_labels = df.loc[_unmapped, 'loan_status'].unique().tolist()
    raise ValueError(
        f"Unexpected loan_status value(s) {_bad_labels!r}; "
        f"expected one of {sorted(_label_to_class)}"
    )

# Create preprocessing pipeline
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler()),
])

categorical_transformer = Pipeline(steps=[
    # handle_unknown='ignore': categories not seen during fit do not raise
    # at transform time.
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ])

# Create full pipeline: preprocessing and classifier are fitted and pickled
# together as one object.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42)),
])

# Split the data (fixed random_state keeps the split reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the model
model.fit(X_train, y_train)

# Evaluate the model
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print(f"Train accuracy: {train_score:.3f}")
print(f"Test accuracy: {test_score:.3f}")

# Save the model
models_dir = os.path.join(project_root, 'models')
os.makedirs(models_dir, exist_ok=True)

# Save as pickle file
with open(os.path.join(models_dir, 'loan_model.pkl'), 'wb') as f:
    pickle.dump(model, f)

print("Model saved successfully as loan_model.pkl!")