# File size: 1,949 Bytes; revision b9a43be
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
def _fill_with_mode(df, columns):
    """Fill missing values in *columns* of *df* in place with each column's mode.

    Columns without missing values are skipped. Columns that are entirely
    missing are also skipped, because they have no mode to fill with.
    """
    for col in columns:
        if not df[col].isna().any():
            continue
        modes = df[col].mode()
        # mode() ignores missing values, so an all-missing column yields an
        # empty Series; indexing it would raise. Leave such a column as is.
        if modes.empty:
            continue
        df[col] = df[col].fillna(modes.iloc[0])


def _is_object_or_category(series):
    """Return True when *series* has object dtype or a categorical dtype."""
    dtype = series.dtype
    return dtype == "object" or str(dtype).startswith("category")


def preprocess_data(df, target_col, missing_strategy="drop", transformation_map=None):
    """Return a preprocessed copy of *df*; the input frame is not modified.

    The work happens in three steps:

    1. Missing values are handled according to *missing_strategy*.
    2. Per-column transformations from *transformation_map* are applied.
    3. The target column is label encoded when it has object or categorical
       dtype.

    Args:
        df: Input ``pandas.DataFrame``.
        target_col: Name of the target column. Ignored when falsy or when the
            column is not present in the frame.
        missing_strategy: One of

            * ``"drop"`` - drop every row that contains a missing value.
            * ``"mean"`` / ``"median"`` - fill numeric columns with their
              mean / median and non-numeric columns with their mode.
            * ``"mode"`` - fill every column with its mode.

            Any other value leaves missing values untouched. A column that
            is entirely missing has no mode and is left unfilled.
        transformation_map: Optional mapping of column name to transformation
            name. ``"Label Encode"`` replaces the column with integer codes
            from ``LabelEncoder`` (values of columns that are neither object
            nor categorical are converted to ``str`` first). ``"Normalize"``
            rescales the column with ``StandardScaler``. Any other name,
            e.g. ``"No Transformation"``, leaves the column as is.

    Returns:
        The processed ``pandas.DataFrame``.

    Raises:
        KeyError: If *transformation_map* names a column that is not in the
            frame and requests ``"Label Encode"`` or ``"Normalize"`` for it.
    """
    df = df.copy()

    # 1. Handle missing values
    if missing_strategy == "drop":
        df = df.dropna()
    elif missing_strategy in ("mean", "median"):
        numeric_cols = df.select_dtypes(include=["number"]).columns
        non_numeric_cols = df.columns.difference(numeric_cols)
        if missing_strategy == "mean":
            fill_values = df[numeric_cols].mean()
        else:
            fill_values = df[numeric_cols].median()
        df[numeric_cols] = df[numeric_cols].fillna(fill_values)
        # Mean/median are undefined for non-numeric data; fall back to mode.
        _fill_with_mode(df, non_numeric_cols)
    elif missing_strategy == "mode":
        _fill_with_mode(df, df.columns)

    # 2. Apply feature transformations
    if transformation_map:
        for col, transform in transformation_map.items():
            if transform == "Label Encode":
                if _is_object_or_category(df[col]):
                    df[col] = LabelEncoder().fit_transform(df[col])
                else:
                    # Other dtypes are encoded via their string representation.
                    df[col] = LabelEncoder().fit_transform(df[col].astype(str))
            elif transform == "Normalize":
                scaler = StandardScaler()
                # Double brackets keep the 2-D shape the scaler expects.
                df[[col]] = scaler.fit_transform(df[[col]])
            # "No Transformation" = leave column as is

    # 3. Label encode target column if it's a string
    if target_col and target_col in df.columns:
        if _is_object_or_category(df[target_col]):
            df[target_col] = LabelEncoder().fit_transform(df[target_col])

    return df
|