Spaces:
Sleeping
Sleeping
File size: 1,939 Bytes
a8b81f3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 |
import pandas as pd
from sklearn.preprocessing import StandardScaler
import logging
from pathlib import Path
# Configure the root logger at import time so INFO-level (and higher) records are emitted.
logging.basicConfig(level=logging.INFO)
# Module-level logger, named after this module, used by the functions below.
logger = logging.getLogger(__name__)
def _preprocess_diabetes_frame(df):
    """Impute, feature-engineer, and standardize a raw diabetes DataFrame.

    Args:
        df: DataFrame holding the eight raw feature columns listed below
            plus the ``Outcome`` target column. It is not modified.

    Returns:
        A ``(X_scaled, y, scaler)`` tuple: the standardized feature frame
        (ten columns, same index as ``df``), the ``Outcome`` series, and the
        fitted ``StandardScaler``.

    Raises:
        KeyError: If any required column is absent from ``df``.
    """
    feature_names = [
        'Pregnancies',               # Number of times pregnant
        'Glucose',                   # Plasma glucose concentration (mg/dL)
        'BloodPressure',             # Diastolic blood pressure (mm Hg)
        'SkinThickness',             # Triceps skin fold thickness (mm)
        'Insulin',                   # 2-Hour serum insulin (mu U/ml)
        'BMI',                       # Body mass index
        'DiabetesPedigreeFunction',  # Diabetes pedigree function
        'Age',                       # Age in years
    ]
    # Columns in which a 0 is treated as a missing measurement.
    zero_not_accepted = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

    # Fail early with one message naming every absent column, instead of a
    # bare KeyError on whichever column happens to be touched first.
    missing = [name for name in [*feature_names, 'Outcome'] if name not in df.columns]
    if missing:
        raise KeyError(f"Diabetes dataset is missing required columns: {missing}")

    # Cast the imputed columns to float up front. The median is a float and
    # may be fractional; writing it into an integer column is an
    # incompatible-dtype assignment that newer pandas versions warn about or
    # reject. ``astype`` also returns a new frame, so the caller's DataFrame
    # is left untouched.
    df = df.astype({column: float for column in zero_not_accepted})

    # Replace each 0 with the median of that column's non-zero values.
    for column in zero_not_accepted:
        valid = df[column] != 0
        df.loc[~valid, column] = df.loc[valid, column].median()

    # Derived interaction features, rescaled by fixed constants.
    df['GlucoseBMI'] = df['Glucose'] * df['BMI'] / 1000
    df['GlucoseAge'] = df['Glucose'] * df['Age'] / 100
    feature_names.extend(['GlucoseBMI', 'GlucoseAge'])

    # Separate features and target.
    X = df[feature_names]
    y = df['Outcome']

    # Standardize the features.
    # NOTE: the scaler is fitted on every row of the dataset; a caller that
    # splits into train/test afterwards should be aware of that.
    scaler = StandardScaler()
    X_scaled = pd.DataFrame(
        scaler.fit_transform(X),
        columns=feature_names,
        index=X.index,  # keep X_scaled aligned with y row-for-row
    )
    return X_scaled, y, scaler


def load_and_preprocess_diabetes_data(data_path=None):
    """Load the diabetes CSV and return standardized features and the target.

    Args:
        data_path: Optional path (``str`` or ``Path``) to the CSV file. When
            omitted, ``datasets/diabetes.csv`` located three directory levels
            above this file is used.

    Returns:
        A ``(X_scaled, y, scaler)`` tuple: the standardized feature frame,
        the ``Outcome`` series, and the fitted ``StandardScaler``.

    Raises:
        Exception: Any error raised while locating, reading, or processing
            the data is logged and then re-raised unchanged.
    """
    try:
        if data_path is None:
            # Default: the dataset in the local "datasets" folder.
            data_path = (
                Path(__file__).resolve().parent.parent.parent
                / "datasets"
                / "diabetes.csv"
            )
        df = pd.read_csv(data_path)
        return _preprocess_diabetes_frame(df)
    except Exception as e:
        # Boundary handler: record the failure, then let the caller decide.
        logger.error("Error in diabetes data preprocessing: %s", e)
        raise