import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import os
import pickle

class DataPreprocessor:
    """Load a CSV dataset, impute zero placeholders, split off the target and scale features.

    The scaler fitted during training is pickled to ``<model_dir>/scaler.pkl``
    and the scaled training data is written to ``scaled_csv_path``.
    """

    # Columns in which a value of 0 is treated as "missing" by preprocess_data.
    ZERO_AS_MISSING_FEATURES = ('Glucose', 'BloodPressure', 'SkinThickness', 'BMI', 'Insulin')

    # Name of the label column separated out by split_data.
    TARGET_COLUMN = 'Outcome'

    def __init__(self, model_dir="src/models", scaled_csv_path="data/scaled_data.csv"):
        """Create a preprocessor with an unfitted StandardScaler.

        Args:
            model_dir: Directory that receives ``scaler.pkl`` when training.
            scaled_csv_path: Destination of the scaled training data CSV.
        """
        self.scaler = StandardScaler()
        self.model_dir = model_dir
        self.scaled_csv_path = scaled_csv_path
        # Target most recently returned by split_data; used by scale_features
        # when the caller does not pass ``target`` explicitly.
        self._last_target = None

    def load_data(self, filepath):
        """Read the CSV at ``filepath`` and return it as a DataFrame.

        Raises:
            FileNotFoundError: If ``filepath`` does not exist.
        """
        if not os.path.exists(filepath):
            raise FileNotFoundError(f"The file at {filepath} does not exist.")
        df = pd.read_csv(filepath)
        print("Data loaded successfully.")
        return df

    def preprocess_data(self, df):
        """Replace zeros in ZERO_AS_MISSING_FEATURES with the column's non-zero mean.

        ``df`` is modified in place and the same object is returned. Only
        zeros are replaced; values that are already NaN are left as they are.
        If a column contains no non-zero value its mean is NaN, so its zeros
        become NaN.

        Raises:
            KeyError: If one of the expected columns is absent from ``df``.
        """
        for feature in self.ZERO_AS_MISSING_FEATURES:
            # Mask zeros first so they do not drag the mean down.
            mean_value = df[feature].replace(0, np.nan).mean()
            df[feature] = df[feature].replace(0, mean_value)

        print("Missing values handled.")
        return df

    def split_data(self, df):
        """Return ``(features, target)`` where target is the 'Outcome' column.

        The target is also remembered on the instance so that a following
        ``scale_features(..., is_training=True)`` call can write it next to
        the scaled features without it being passed again.
        """
        features = df.drop(self.TARGET_COLUMN, axis=1)
        target = df[self.TARGET_COLUMN]
        self._last_target = target
        return features, target

    def scale_features(self, features, is_training=False, target=None):
        """Scale ``features`` with the instance's scaler and return the result.

        Args:
            features: DataFrame of feature columns.
            is_training: When true, fit the scaler on ``features``, pickle it
                and save the scaled data as CSV. When false, only transform
                with the already fitted scaler.
            target: Optional labels to store in the 'Outcome' column of the
                saved CSV. A Series is aligned on the index of ``features``;
                other sequences are assigned by position. When omitted, the
                target remembered by ``split_data`` is used if its index
                matches ``features``; otherwise the CSV has no 'Outcome'
                column.

        Returns:
            The scaled feature values as returned by the scaler.
        """
        if not is_training:
            return self.scaler.transform(features)

        scaled_features = self.scaler.fit_transform(features)
        self._save_scaler()
        self._save_scaled_data(features, scaled_features, target)
        return scaled_features

    def _save_scaler(self):
        """Pickle the fitted scaler to ``<model_dir>/scaler.pkl``."""
        if self.model_dir:
            os.makedirs(self.model_dir, exist_ok=True)
        with open(os.path.join(self.model_dir, "scaler.pkl"), 'wb') as f:
            pickle.dump(self.scaler, f)

    def _save_scaled_data(self, features, scaled_features, target):
        """Write the scaled features (plus the target, if known) to ``scaled_csv_path``."""
        # Keep the original index so that a Series target lines up row by row.
        scaled_df = pd.DataFrame(
            scaled_features, columns=features.columns, index=features.index
        )

        if target is None:
            remembered = self._last_target
            # Only reuse the remembered target when it belongs to these rows.
            if remembered is not None and remembered.index.equals(features.index):
                target = remembered
        if target is not None:
            scaled_df[self.TARGET_COLUMN] = target

        # The output directory may not exist yet on a fresh checkout.
        out_dir = os.path.dirname(self.scaled_csv_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        scaled_df.to_csv(self.scaled_csv_path, index=False)
        print("Scaled data saved as csv file.")


if __name__ == "__main__":
    # Script entry point: run the full training-time preprocessing pipeline.
    pipeline = DataPreprocessor()

    # Read the raw CSV, then impute the zero placeholders.
    df = pipeline.load_data("data/preprocessed_data.csv")
    df = pipeline.preprocess_data(df)

    # Separate the label column from the feature columns.
    split = pipeline.split_data(df)
    features, target = split

    # Fit the scaler on the features; this also persists the scaler and the scaled data.
    scaled_features = pipeline.scale_features(features, is_training=True)

    print("Data preprocessing completed.")