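"""Predict gym members' water intake with a scikit-learn MLPRegressor.

Features are selected by dropping columns that correlate weakly with the
target and redundant columns identified through pairwise Pearson p-values;
the MLP is then trained with a manual epoch loop so the test MSE can be
plotted over time.
"""
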
import os
import warnings
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from sklearn.exceptions import ConvergenceWarning
from matplotlib import pyplot as plt



class My_NeuralNetwork:
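    """Neural network (scikit-learn MLPRegressor) that predicts a gym
    member's water intake in liters from the other tracked features."""
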
    def __init__(self):
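        """Load the gym members dataset, scale its numeric features, and
        one-hot encode its categorical ones."""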
        self.MAX_LAYERS = 10
        self.target_column = "Water_Intake (liters)"
        self.model = None

        # default parameters
        self.num_layer = 3
        self.dimension = (32, 32, 32)
        self.correlation_threshold = 0.01
        self.epochs = 300

        # Load the dataset and preprocess it
        csv_file = os.path.join("data", "gym_members_exercise_tracking.csv")
        df = pd.read_csv(csv_file, engine="python")

        df = df.dropna()  # Remove rows with any null cell (just in case)
        # Mark the categorical features with the "category" dtype.
        df["Gender"] = df["Gender"].astype("category")
        df["Workout_Type"] = df["Workout_Type"].astype("category")

        # getting the names of the numerical and categorical columns for later
        numeric_columns = list(df.select_dtypes(exclude=["category"]).columns)
        categorical_columns = list(df.select_dtypes(include=["category"]).columns)

        self.df_original = (
            df.copy()
        )  # keep an unmodified copy; df will have some features removed later

        # remove the target from the list of features
        numeric_columns.remove(self.target_column)

        self.scaler = MinMaxScaler()  # create a new MinMaxScaler
        df_scaled = self.scaler.fit_transform(
            df[numeric_columns]
        )  # scale all the numerical columns using the new MinMaxScaler
        df[numeric_columns] = df_scaled.copy()

        df_encoded = pd.get_dummies(
            df, columns=categorical_columns, drop_first=False
        )  # get one-hot encoding for all categorical values
        df_encoded = df_encoded.astype(
            float
        )  # convert the one-hot encoding into floats (values between 0.0 - 1.0)
        new_columns = [
            col for col in df_encoded.columns if col not in numeric_columns
        ]  # the new one-hot columns, plus the target (removed from the list
        # above and re-added here; train_model removes it from the list again)
        df = df_encoded.copy()  # the dataframe now holds the one-hot encoded features
        numeric_columns.extend(
            new_columns
        )  # add the new columns to the list of numerical columns

        self.numeric_columns = numeric_columns
        self.df = df

    def train_model(self):
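        """Drop weakly correlated and redundant features, then train the MLP
        while recording the test MSE at every epoch."""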
        # FEATURE SELECTION

        correlation_matrix = self.df[self.numeric_columns].corr()
        # p-values of Pearson's correlation coefficient for each pair of features
        p_values = pd.DataFrame(
            np.zeros((len(self.numeric_columns), len(self.numeric_columns))),
            columns=self.numeric_columns,
            index=self.numeric_columns,
        )
        for col1 in self.numeric_columns:
            for col2 in self.numeric_columns:
                if col1 != col2:
                    # scipy.stats.pearsonr returns (coefficient, p-value) for one pair
                    _, p_value = pearsonr(self.df[col1], self.df[col2])
                    p_values.loc[col1, col2] = p_value
                else:
                    # set the diagonal to 1 so a feature is never flagged as
                    # correlated with itself
                    p_values.loc[col1, col2] = 1

        # Identifying variables that are correlated
        # When the p-value is smaller than 0.05, there is likely a “real” relationship between the variables.
        correlated_columns = []
        for i, col1 in enumerate(self.numeric_columns):
            for j, col2 in enumerate(self.numeric_columns):
                if (
                    j > i
                    and p_values.loc[col1, col2] < 0.05
                    and col1 != self.target_column
                    and col2 != self.target_column
                ):
                    correlated_columns.append((col1, col2, p_values.loc[col1, col2]))
        
        # remove the target from the list of features (it was re-added via new_columns in __init__)
        self.numeric_columns.remove(self.target_column)

        # Identify features with a low correlation with the target
        target_corr = correlation_matrix[self.target_column].copy()
        correlation_threshold = self.correlation_threshold
        features_to_remove = target_corr[abs(target_corr) < correlation_threshold].index
        features_to_remove = set(features_to_remove.to_list())

        # Identify redundant features using p-values
        x = {"keep": set(), "remove": set()}
        for corr_duo in correlated_columns:
            # keep the feature with the strongest absolute correlation to the
            # target and mark the other one for removal
            if abs(target_corr[corr_duo[0]]) > abs(target_corr[corr_duo[1]]):
                x["keep"].add(corr_duo[0])
                x["remove"].add(corr_duo[1])
            else:
                x["keep"].add(corr_duo[1])
                x["remove"].add(corr_duo[0])

        # drop from "keep" any feature already flagged by the low-correlation filter
        x["keep"] = x["keep"] - features_to_remove

        # features marked for removal that are not kept elsewhere are redundant
        redundant_features = x["remove"] - x["keep"]

        features_to_remove = features_to_remove.union(redundant_features)

        # Remove the selected features from the dataframe
        for feature in list(features_to_remove):
            self.numeric_columns.remove(feature)
            self.df.drop(feature, axis=1, inplace=True)
        self.features_to_remove = features_to_remove

        print(
            f"List of numerical features that will be used to predict the target ({self.target_column}):"
        )
        print(self.numeric_columns)

        # CREATE & TRAIN MODEL

        # split the data
        X = self.df[self.numeric_columns]
        y = self.df[self.target_column]

        # 20% of the dataset will be used as test data
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

        self.model = MLPRegressor(
            hidden_layer_sizes=self.dimension,
            activation="relu",  # ReLU activation function
            solver="adam",  # Adam optimizer
            max_iter=1,  # One iteration per fit call since the training loop is defined below
            warm_start=True,
        )  # Used to measure the MSE throughout the iterations

        # ignoring the warning raised because we're using a manual loop for training
        warnings.filterwarnings("ignore", category=ConvergenceWarning)

        # Track MSE values during training
        mse_values = []
        epochs = self.epochs

        for epoch in range(epochs):
            # Train the model
            self.model.fit(X_train, y_train)

            # Predict on the test set
            y_pred = self.model.predict(X_test)

            # Evaluate the model
            mse = mean_squared_error(y_test, y_pred)
            mse_values.append(mse)

        # SAVE THE EVOLUTION OF THE MSE THROUGHOUT THE TRAINING
        plt.figure(figsize=(10, 6))
        plt.plot(range(epochs), mse_values, marker="o", linestyle="-")
        plt.title(f"Evolution of MSE During Training. Final MSE = {mse:.4f}")
        plt.xlabel("Epoch")
        plt.ylabel("Mean Squared Error")
        plt.grid(True)
        plt.savefig(os.path.join("app", "NeuralNetwork", "graph.png"))

        print(f"Final epoch MSE: {mse:.4f}")

    def predict(self, input_data: pd.DataFrame) -> float:
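        """Scale a one-hot encoded input row, drop the unused features, and
        return the predicted water intake in liters."""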
        # work on a copy so the caller's DataFrame is not mutated
        input_data = input_data.copy()

        # scale the numeric columns with the scaler fitted during training;
        # the one-hot encoded columns are already in [0, 1] and must be skipped
        one_hot_columns = [
            "Gender_Male",
            "Gender_Female",
            "Workout_Type_Strength",
            "Workout_Type_Yoga",
            "Workout_Type_HIIT",
            "Workout_Type_Cardio",
        ]
        columns_to_scale = [
            col for col in input_data.columns if col not in one_hot_columns
        ]
        input_data[columns_to_scale] = self.scaler.transform(
            input_data[columns_to_scale]
        )

        # keep only the required features for the prediction
        input_data = input_data.drop(self.features_to_remove, axis=1, errors="ignore")

        input_data = input_data[self.numeric_columns]

        print("Prediction using the following input : ")
        print(input_data.to_csv())

        prediction = self.model.predict(input_data)
        return float(prediction[0])  # single-row input, so return a scalar as annotated
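

if __name__ == "__main__":
    # Minimal usage sketch (assumes data/gym_members_exercise_tracking.csv is
    # present, as in __init__). The sample row is taken from the raw dataset and
    # re-encoded the same way __init__ encodes the training data, so no column
    # names are hard-coded here.
    nn = My_NeuralNetwork()
    nn.train_model()

    # One raw row, without the target; because Gender and Workout_Type use the
    # "category" dtype, get_dummies emits every dummy column even for one row.
    sample = nn.df_original.drop(columns=[nn.target_column]).head(1)
    sample = pd.get_dummies(sample, columns=["Gender", "Workout_Type"]).astype(float)

    print(f"Predicted water intake: {nn.predict(sample):.2f} liters")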