File size: 1,939 Bytes
a8b81f3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import pandas as pd
from sklearn.preprocessing import StandardScaler
import logging
from pathlib import Path

# Configure the root logger at INFO level when this module is imported.
# logging.basicConfig() does nothing if the root logger already has handlers,
# so an application that configured logging earlier keeps its own setup.
logging.basicConfig(level=logging.INFO)
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

def load_and_preprocess_diabetes_data(data_path=None):
    """Load the diabetes CSV, impute zero placeholders, add features and scale.

    Processing steps:

    1. Read the CSV into a DataFrame.
    2. In the columns where a value of 0 is treated as "missing"
       (Glucose, BloodPressure, SkinThickness, Insulin, BMI), replace each 0
       with the median of that column's non-zero values.
    3. Add two derived interaction features, ``GlucoseBMI`` and
       ``GlucoseAge``.
    4. Standardize all feature columns with ``StandardScaler``.

    Args:
        data_path: Optional path (``str`` or ``pathlib.Path``) to the CSV
            file. When ``None`` (the default), the file
            ``datasets/diabetes.csv`` located three directory levels above
            this source file is used.

    Returns:
        A tuple ``(X_scaled, y, scaler)`` where ``X_scaled`` is a DataFrame
        of the standardized features (original eight columns plus the two
        derived ones), ``y`` is the ``Outcome`` column, and ``scaler`` is the
        fitted ``StandardScaler`` instance.

    Raises:
        KeyError: If a required column is missing from the CSV.
        Exception: Any other error raised while reading or processing the
            data (for example a missing file) is logged and re-raised
            unchanged.
    """
    try:
        if data_path is None:
            # Default: the local datasets folder relative to this file.
            data_path = (
                Path(__file__).resolve().parent.parent.parent
                / "datasets"
                / "diabetes.csv"
            )
        df = pd.read_csv(data_path)

        base_features = [
            'Pregnancies',      # Number of times pregnant
            'Glucose',          # Plasma glucose concentration (mg/dL)
            'BloodPressure',    # Diastolic blood pressure (mm Hg)
            'SkinThickness',    # Triceps skin fold thickness (mm)
            'Insulin',          # 2-Hour serum insulin (mu U/ml)
            'BMI',              # Body mass index
            'DiabetesPedigreeFunction',  # Diabetes pedigree function
            'Age',              # Age in years
        ]
        derived_features = ['GlucoseBMI', 'GlucoseAge']
        target_column = 'Outcome'

        # Fail early with one clear message instead of an opaque KeyError
        # raised part-way through the processing below.
        missing_columns = [
            name for name in [*base_features, target_column]
            if name not in df.columns
        ]
        if missing_columns:
            raise KeyError(
                f"Missing required column(s) in {data_path}: {missing_columns}"
            )

        # Handle missing values (0 values in certain columns)
        zero_not_accepted = [
            'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI',
        ]
        for column in zero_not_accepted:
            # Cast to float first: the median may be fractional, and writing
            # a float into an integer column relies on silent upcasting,
            # which newer pandas versions deprecate and then reject.
            df[column] = df[column].astype(float)
            is_valid = df[column] != 0
            df.loc[~is_valid, column] = df.loc[is_valid, column].median()

        # Add some derived features (divisors keep magnitudes moderate).
        df['GlucoseBMI'] = df['Glucose'] * df['BMI'] / 1000
        df['GlucoseAge'] = df['Glucose'] * df['Age'] / 100
        feature_names = base_features + derived_features

        # Separate features and target
        X = df[feature_names]
        y = df[target_column]

        # Scale features; wrap the resulting array back into a DataFrame so
        # the column names are preserved for callers.
        scaler = StandardScaler()
        X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=feature_names)

        return X_scaled, y, scaler

    except Exception as e:
        # Log at this boundary, then propagate so callers can still handle it.
        logger.error("Error in diabetes data preprocessing: %s", e)
        raise