from typing import Mapping, Optional

import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder


def _fill_with_mode(df, columns):
    """Fill NaNs in each of *columns* of *df* (in place) with that column's mode."""
    for col in columns:
        if not df[col].isna().any():
            continue
        modes = df[col].mode()
        if modes.empty:
            # Column is entirely NaN: there is no mode to fill with, so leave
            # it untouched instead of failing on an empty lookup.
            continue
        df[col] = df[col].fillna(modes.iloc[0])


def _fill_missing(df, missing_strategy):
    """Return *df* with missing values handled according to *missing_strategy*."""
    if missing_strategy == "drop":
        return df.dropna()

    if missing_strategy in ("mean", "median"):
        numeric_cols = df.select_dtypes(include=["number"]).columns
        non_numeric_cols = df.columns.difference(numeric_cols)
        if len(numeric_cols) > 0:
            numeric = df[numeric_cols]
            if missing_strategy == "mean":
                fill_values = numeric.mean()
            else:
                fill_values = numeric.median()
            df[numeric_cols] = numeric.fillna(fill_values)
        # Mean/median are undefined for non-numeric data; use the mode there.
        _fill_with_mode(df, non_numeric_cols)
    elif missing_strategy == "mode":
        _fill_with_mode(df, df.columns)

    # Any other strategy value leaves missing values as they are.
    return df


def _is_object_or_category(series):
    """Return True if *series* has object or categorical dtype."""
    return pd.api.types.is_object_dtype(series) or isinstance(
        series.dtype, pd.CategoricalDtype
    )


def _label_encode(series):
    """Label-encode *series*; non-object/non-categorical data is cast to str first."""
    if _is_object_or_category(series):
        return LabelEncoder().fit_transform(series)
    return LabelEncoder().fit_transform(series.astype(str))


def preprocess_data(
    df: pd.DataFrame,
    target_col: Optional[str],
    missing_strategy: str = "drop",
    transformation_map: Optional[Mapping[str, str]] = None,
) -> pd.DataFrame:
    """Clean and transform a copy of *df* for modelling.

    The input frame is never modified; all work happens on a copy.

    Steps, in order:

    1. Missing values are handled according to ``missing_strategy``.
    2. Per-column transformations from ``transformation_map`` are applied.
    3. The target column is label-encoded if it holds string-like data.

    Args:
        df: Input data.
        target_col: Name of the target column. Ignored if falsy or not a
            column of ``df``.
        missing_strategy: One of:
            ``"drop"`` - drop every row containing a missing value;
            ``"mean"`` / ``"median"`` - fill numeric columns with their
            mean/median and non-numeric columns with their mode;
            ``"mode"`` - fill every column with its mode.
            Any other value leaves missing values untouched. Columns that
            are entirely missing have no mode and are left unfilled.
        transformation_map: Optional mapping of column name to
            ``"Label Encode"`` or ``"Normalize"``. Any other value (e.g.
            ``"No Transformation"``) leaves the column unchanged.

    Returns:
        The processed copy of ``df``.

    Raises:
        KeyError: If ``transformation_map`` names a column not in ``df``.
    """
    df = df.copy()

    # 1. Handle missing values
    df = _fill_missing(df, missing_strategy)

    # 2. Apply feature transformations
    if transformation_map:
        for col, transform in transformation_map.items():
            if transform == "Label Encode":
                df[col] = _label_encode(df[col])
            elif transform == "Normalize":
                scaler = StandardScaler()
                df[[col]] = scaler.fit_transform(df[[col]])
            # "No Transformation" = leave column as is

    # 3. Label encode target column if it's a string
    if target_col and target_col in df.columns:
        target = df[target_col]
        # pandas' dedicated string dtype is not "object", so check for it
        # explicitly; otherwise string targets of that dtype stay unencoded.
        if _is_object_or_category(target) or isinstance(
            target.dtype, pd.StringDtype
        ):
            df[target_col] = _label_encode(target)

    return df