Synthack-SyntaxSquad / src /data /preprocessing.py
WickedFaith's picture
Upload 77 files
3efedb0 verified
raw
history blame contribute delete
1.44 kB
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
def load_data(filepath, **read_csv_kwargs):
    """Load a CSV file into a pandas DataFrame.

    Args:
        filepath: Path, URL, or file-like object accepted by
            ``pandas.read_csv``.
        **read_csv_kwargs: Optional keyword arguments forwarded unchanged to
            ``pandas.read_csv`` (for example ``sep``, ``dtype``, ``encoding``).
            With none given, the call is identical to ``pd.read_csv(filepath)``.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    return pd.read_csv(filepath, **read_csv_kwargs)
def preprocess_data(df, target_column='loan_approved', *, test_size=0.2,
                    random_state=42):
    """Build an unfitted preprocessor and split the data for model training.

    Args:
        df: DataFrame containing the feature columns and the target column.
        target_column: Name of the column to use as the prediction target.
        test_size: Forwarded to ``train_test_split``; the default of 0.2
            matches the previously hard-coded value.
        random_state: Seed forwarded to ``train_test_split``; the default of
            42 matches the previously hard-coded value.

    Returns:
        A tuple ``(preprocessor, X_train, X_test, y_train, y_test)``.
        ``preprocessor`` is a ``ColumnTransformer`` that has NOT been fitted;
        the caller is expected to fit it (typically on ``X_train`` only).
        The X/y splits are returned untransformed.

    Raises:
        KeyError: Raised by ``DataFrame.drop`` / column lookup when
            ``target_column`` is not present in ``df``.
    """
    # Separate the features from the target.
    X = df.drop(columns=[target_column])
    y = df[target_column]

    # Pick columns by dtype. NOTE: only int64/float64 and object columns are
    # selected; columns of any other dtype (bool, category, datetime, ...) are
    # not listed in any transformer and are therefore dropped by the
    # ColumnTransformer, whose default is remainder='drop'.
    numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_features = X.select_dtypes(include=['object']).columns.tolist()

    # Single-step pipelines are kept (rather than bare estimators) so the
    # structure of the returned preprocessor stays the same for callers that
    # reach into its named steps.
    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler()),
    ])
    categorical_transformer = Pipeline(steps=[
        # Categories unseen during fit are encoded as all-zero rows at
        # transform time instead of raising.
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features),
        ])

    # Hold out part of the data for evaluation; a fixed seed keeps the split
    # reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    return preprocessor, X_train, X_test, y_train, y_test