File size: 1,949 Bytes
b9a43be
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46

import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder

def _fill_with_mode(df, columns):
    """Fill NaNs in *columns* of *df* in place with each column's mode.

    Columns without missing values are skipped. Columns that contain no
    observed values at all are also left unchanged, because there is no
    most-frequent value to fill with.
    """
    for col in columns:
        if not df[col].isna().any():
            continue
        modes = df[col].mode()
        # mode() drops NaN by default, so an all-NaN column yields an empty
        # Series; indexing it would raise instead of imputing anything.
        if modes.empty:
            continue
        # Positional access: the first entry is the (smallest) most frequent value.
        df[col] = df[col].fillna(modes.iloc[0])


def _is_object_or_categorical(series):
    """Return True if *series* has ``object`` or categorical dtype."""
    return pd.api.types.is_object_dtype(series.dtype) or isinstance(
        series.dtype, pd.CategoricalDtype
    )


def preprocess_data(df, target_col, missing_strategy="drop", transformation_map=None):
    """Return a preprocessed copy of *df*; the input frame is not modified.

    The steps run in this order:

    1. Missing values are handled according to *missing_strategy*.
    2. Per-column transformations from *transformation_map* are applied.
    3. The target column is label-encoded if it holds string-like data.

    Args:
        df: Input ``pandas.DataFrame``.
        target_col: Name of the target column. Ignored when falsy or when
            the column is not present in *df*.
        missing_strategy: One of:
            ``"drop"`` - drop every row that contains a missing value;
            ``"mean"`` / ``"median"`` - fill numeric columns (including a
            numeric target) with the column mean/median and non-numeric
            columns with their mode;
            ``"mode"`` - fill every column with its mode.
            Any other value leaves missing values untouched. Columns with no
            observed values cannot be imputed and are left as they are.
        transformation_map: Optional mapping of column name to
            ``"Label Encode"`` (integer-encode with ``LabelEncoder``; columns
            that are not object/categorical are cast to ``str`` first) or
            ``"Normalize"`` (scale with ``StandardScaler``). Any other value,
            e.g. ``"No Transformation"``, leaves the column as is.

    Returns:
        The transformed ``pandas.DataFrame``.

    Raises:
        KeyError: If *transformation_map* requests a transformation for a
            column that is not in *df*.
    """
    df = df.copy()

    # 1. Handle missing values
    if missing_strategy == "drop":
        df = df.dropna()
    elif missing_strategy in ("mean", "median"):
        numeric_cols = df.select_dtypes(include=["number"]).columns
        # Guard against frames without numeric columns: there is nothing to
        # aggregate or assign in that case.
        if len(numeric_cols) > 0:
            numeric = df[numeric_cols]
            if missing_strategy == "mean":
                fill_values = numeric.mean()
            else:
                fill_values = numeric.median()
            df[numeric_cols] = numeric.fillna(fill_values)
        # Mean/median are undefined for non-numeric data; fall back to mode.
        _fill_with_mode(df, df.columns.difference(numeric_cols))
    elif missing_strategy == "mode":
        _fill_with_mode(df, df.columns)

    # 2. Apply feature transformations
    if transformation_map:
        for col, transform in transformation_map.items():
            if transform == "Label Encode":
                if _is_object_or_categorical(df[col]):
                    df[col] = LabelEncoder().fit_transform(df[col])
                else:
                    # Encode other dtypes by their string representation.
                    df[col] = LabelEncoder().fit_transform(df[col].astype(str))
            elif transform == "Normalize":
                scaler = StandardScaler()
                # Double brackets: the scaler expects 2-D input.
                df[[col]] = scaler.fit_transform(df[[col]])
            # "No Transformation" = leave column as is

    # 3. Label encode target column if it's a string
    if target_col and target_col in df.columns:
        target = df[target_col]
        # StringDtype covers pandas' dedicated string dtype, which does not
        # compare equal to "object" and would otherwise be left unencoded.
        if _is_object_or_categorical(target) or isinstance(target.dtype, pd.StringDtype):
            df[target_col] = LabelEncoder().fit_transform(target)

    return df