import logging
from pathlib import Path
from typing import Optional, Tuple, Union

import pandas as pd
from sklearn.preprocessing import StandardScaler

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def load_and_preprocess_diabetes_data(
    data_path: Optional[Union[str, Path]] = None,
) -> Tuple[pd.DataFrame, pd.Series, StandardScaler]:
    """Load the diabetes CSV, impute zero placeholders, add features, and scale.

    Processing steps, in order:

    1. Read the CSV into a DataFrame.
    2. In the columns where a zero is treated as a missing value, replace
       each zero with the median of that column's non-zero entries.
    3. Add two derived columns, ``GlucoseBMI`` and ``GlucoseAge``.
    4. Standardize all feature columns with a freshly fitted
       ``StandardScaler``.

    Args:
        data_path: Location of the CSV file. When ``None`` (the default), the
            file ``datasets/diabetes.csv`` is resolved relative to this
            module via ``Path(__file__).resolve().parent.parent.parent``.

    Returns:
        A tuple ``(X_scaled, y, scaler)`` where ``X_scaled`` is a DataFrame of
        the scaled features (the eight base columns plus the two derived
        ones), ``y`` is the ``Outcome`` column, and ``scaler`` is the
        ``StandardScaler`` that was fitted on the features.

    Raises:
        Exception: Any error raised while loading or processing the data is
            logged at ERROR level and then re-raised unchanged.

    Note:
        The scaler is fitted on every row of the file; no train/test split
        happens inside this function.
    """
    try:
        if data_path is None:
            # Load the dataset from the local datasets folder.
            data_path = (
                Path(__file__).resolve().parent.parent.parent
                / "datasets"
                / "diabetes.csv"
            )
        df = pd.read_csv(data_path)

        feature_names = [
            'Pregnancies',               # Number of times pregnant
            'Glucose',                   # Plasma glucose concentration (mg/dL)
            'BloodPressure',             # Diastolic blood pressure (mm Hg)
            'SkinThickness',             # Triceps skin fold thickness (mm)
            'Insulin',                   # 2-Hour serum insulin (mu U/ml)
            'BMI',                       # Body mass index
            'DiabetesPedigreeFunction',  # Diabetes pedigree function
            'Age',                       # Age in years
        ]

        # Handle missing values (0 values in certain columns).
        zero_not_accepted = [
            'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI',
        ]
        for column in zero_not_accepted:
            # Cast to float before imputing: the median may be fractional,
            # and writing a float into an integer column relies on silent
            # upcasting, which recent pandas versions deprecate.
            values = df[column].astype(float)
            is_present = values != 0
            df[column] = values.where(is_present, values[is_present].median())

        # Add some derived features.
        df['GlucoseBMI'] = df['Glucose'] * df['BMI'] / 1000
        df['GlucoseAge'] = df['Glucose'] * df['Age'] / 100
        feature_names.extend(['GlucoseBMI', 'GlucoseAge'])

        # Separate features and target.
        X = df[feature_names]
        y = df['Outcome']

        # Scale features; wrap the resulting array back into a DataFrame so
        # the column names are preserved for callers.
        scaler = StandardScaler()
        X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=feature_names)

        return X_scaled, y, scaler

    except Exception as e:
        logger.error("Error in diabetes data preprocessing: %s", e)
        raise