|
import pandas as pd
|
|
import numpy as np
|
|
from sklearn.preprocessing import StandardScaler, OneHotEncoder
|
|
from sklearn.compose import ColumnTransformer
|
|
from sklearn.pipeline import Pipeline
|
|
from sklearn.model_selection import train_test_split
|
|
|
|
def load_data(filepath, **read_csv_kwargs):
    """Load a CSV file into a pandas DataFrame.

    Args:
        filepath: Location of the CSV data; passed straight to
            ``pandas.read_csv`` as its first argument.
        **read_csv_kwargs: Optional keyword arguments forwarded unchanged to
            ``pandas.read_csv`` (for example ``sep=";"``). When none are
            given the call is exactly ``pd.read_csv(filepath)``.

    Returns:
        The DataFrame produced by ``pandas.read_csv``.
    """
    return pd.read_csv(filepath, **read_csv_kwargs)
|
|
|
|
def preprocess_data(df, target_column='loan_approved', *, test_size=0.2,
                    random_state=42):
    """Split ``df`` into train/test sets and build an unfitted preprocessor.

    The target column is separated from the features, the remaining columns
    are grouped by dtype, and a ``ColumnTransformer`` is assembled that
    standard-scales the numeric group and one-hot encodes the categorical
    group. The transformer is only constructed here; it is never fitted.

    Args:
        df: DataFrame holding both the feature columns and the target column.
        target_column: Name of the column to predict.
        test_size: Forwarded to ``train_test_split``; defaults to the
            previously hard-coded 0.2.
        random_state: Forwarded to ``train_test_split``; defaults to the
            previously hard-coded 42.

    Returns:
        A 5-tuple ``(preprocessor, X_train, X_test, y_train, y_test)``.

    Raises:
        KeyError: If ``target_column`` is not a column of ``df``.
    """
    if target_column not in df.columns:
        raise KeyError(
            f"target column {target_column!r} not found in DataFrame"
        )

    X = df.drop(columns=[target_column])
    y = df[target_column]

    # Select every numeric dtype, not just int64/float64: a column that
    # matches neither selection is routed to no transformer, and
    # ColumnTransformer's default remainder='drop' then discards it silently.
    numeric_features = X.select_dtypes(include='number').columns.tolist()
    # 'category' and 'bool' columns are discrete features as well; 'number'
    # does not cover them, so they are one-hot encoded with the object columns.
    categorical_features = X.select_dtypes(
        include=['object', 'category', 'bool']
    ).columns.tolist()

    # Single-step pipelines are kept so the step names ('scaler', 'onehot')
    # stay addressable on the returned preprocessor.
    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler()),
    ])
    categorical_transformer = Pipeline(steps=[
        # handle_unknown='ignore': categories unseen during fit do not raise
        # at transform time.
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features),
        ]
    )

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    return preprocessor, X_train, X_test, y_train, y_test