|
|
|
import pandas as pd
|
|
from sklearn.preprocessing import StandardScaler, LabelEncoder
|
|
|
|
def _fill_with_mode(df, columns):
    """Fill missing values in *columns* of *df*, in place, with each column's mode."""
    for col in columns:
        if not df[col].isna().any():
            continue
        modes = df[col].mode()
        # A column made up entirely of missing values has no mode; leave it
        # unfilled rather than failing on the lookup of the first mode.
        if modes.empty:
            continue
        df[col] = df[col].fillna(modes.iloc[0])


def _is_object_or_category(series):
    """Return True when *series* has object dtype or a dtype named ``category...``."""
    # NOTE(review): pandas' dedicated string dtype is not matched by this
    # check -- confirm whether such columns should also be treated as labels.
    return series.dtype == "object" or str(series.dtype).startswith("category")


def preprocess_data(df, target_col, missing_strategy="drop", transformation_map=None):
    """Return a cleaned, transformed copy of *df*; the input frame is not modified.

    Processing happens in three steps:

    1. Missing values are handled according to ``missing_strategy``.
    2. Per-column transformations from ``transformation_map`` are applied.
    3. The target column is label-encoded when it is of object/category dtype.

    Args:
        df: Input ``pandas.DataFrame``.
        target_col: Name of the target column. Ignored when falsy or when the
            column is not present in the frame.
        missing_strategy: One of

            * ``"drop"`` -- drop every row that contains a missing value;
            * ``"mean"`` / ``"median"`` -- fill numeric columns with their
              mean / median and all other columns with their mode;
            * ``"mode"`` -- fill every column with its mode.

            Any other value leaves missing values untouched. A column that
            contains only missing values has no mode and is left unfilled.
        transformation_map: Optional mapping ``{column: transform}`` where
            ``transform`` is ``"Label Encode"`` or ``"Normalize"``. Other
            transform names are ignored.

    Returns:
        The processed copy of ``df``.

    Raises:
        KeyError: If ``transformation_map`` names a column that is not in the
            frame.
    """
    df = df.copy()

    # --- Step 1: missing values -------------------------------------------
    if missing_strategy == "drop":
        df = df.dropna()
    elif missing_strategy in ("mean", "median"):
        numeric_cols = df.select_dtypes(include=["number"]).columns
        non_numeric_cols = df.columns.difference(numeric_cols)

        if missing_strategy == "mean":
            fill_values = df[numeric_cols].mean()
        else:
            fill_values = df[numeric_cols].median()
        df[numeric_cols] = df[numeric_cols].fillna(fill_values)

        # Mean/median are undefined for non-numeric data, so fall back to
        # the most frequent value for those columns.
        _fill_with_mode(df, non_numeric_cols)
    elif missing_strategy == "mode":
        _fill_with_mode(df, df.columns)

    # --- Step 2: per-column transformations -------------------------------
    if transformation_map:
        for col, transform in transformation_map.items():
            if transform == "Label Encode":
                if _is_object_or_category(df[col]):
                    df[col] = LabelEncoder().fit_transform(df[col])
                else:
                    # Other dtypes are encoded via their string form.
                    df[col] = LabelEncoder().fit_transform(df[col].astype(str))
            elif transform == "Normalize":
                scaler = StandardScaler()
                # Double brackets: the scaler expects 2-D input.
                df[[col]] = scaler.fit_transform(df[[col]])

    # --- Step 3: target encoding ------------------------------------------
    if target_col and target_col in df.columns:
        if _is_object_or_category(df[target_col]):
            df[target_col] = LabelEncoder().fit_transform(df[target_col])

    return df
|
|
|
|
|