import pandas as pd import numpy as np from sklearn.preprocessing import StandardScaler, OneHotEncoder from sklearn.compose import ColumnTransformer from sklearn.pipeline import Pipeline from sklearn.model_selection import train_test_split def load_data(filepath): """Load data from CSV file""" return pd.read_csv(filepath) def preprocess_data(df, target_column='loan_approved'): """Preprocess data for model training""" # Split features and target X = df.drop(columns=[target_column]) y = df[target_column] # Identify numeric and categorical columns numeric_features = X.select_dtypes(include=['int64', 'float64']).columns categorical_features = X.select_dtypes(include=['object']).columns # Create preprocessing pipelines numeric_transformer = Pipeline(steps=[ ('scaler', StandardScaler()) ]) categorical_transformer = Pipeline(steps=[ ('onehot', OneHotEncoder(handle_unknown='ignore')) ]) # Combine preprocessing steps preprocessor = ColumnTransformer( transformers=[ ('num', numeric_transformer, numeric_features), ('cat', categorical_transformer, categorical_features) ]) # Split data X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) return preprocessor, X_train, X_test, y_train, y_test