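"""Predict gym members' water intake with a scikit-learn MLPRegressor.

Features are selected by dropping columns that correlate weakly with the
target and redundant columns identified through pairwise Pearson p-values;
the MLP is then trained with a manual epoch loop so the test MSE can be
plotted over time.
"""
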
import os
import warnings
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from sklearn.exceptions import ConvergenceWarning
from matplotlib import pyplot as plt



class My_NeuralNetwork:
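    """Neural network (scikit-learn MLPRegressor) that predicts a gym
    member's water intake in liters from the other tracked features."""
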
    def __init__(self):
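        """Load the gym members dataset, scale its numeric features, and
        one-hot encode its categorical ones."""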
        self.MAX_LAYERS = 10
        self.target_column = "Water_Intake (liters)"
        self.model = None

        # default parameters
        self.num_layer = 3
        self.dimension = (32, 32, 32)
        self.correlation_threshold = 0.01
        self.epochs = 300

        # Load the dataset and preprocess it
        csv_file = os.path.join("data", "gym_members_exercise_tracking.csv")
        df = pd.read_csv(csv_file, engine="python")

        df = df.dropna()  # Remove rows with any null cell (just in case)
        # Mark the categorical features with the "category" dtype.
        df["Gender"] = df["Gender"].astype("category")
        df["Workout_Type"] = df["Workout_Type"].astype("category")

        # getting the names of the numerical and categorical columns for later
        numeric_columns = list(df.select_dtypes(exclude=["category"]).columns)
        categorical_columns = list(df.select_dtypes(include=["category"]).columns)

        self.df_original = (
            df.copy()
        )  # keep an unmodified copy; df will have some features removed later

        # remove the target from the list of features
        numeric_columns.remove(self.target_column)

        self.scaler = MinMaxScaler()  # create a new MinMaxScaler
        df_scaled = self.scaler.fit_transform(
            df[numeric_columns]
        )  # scale all the numerical columns using the new MinMaxScaler
        df[numeric_columns] = df_scaled.copy()

        df_encoded = pd.get_dummies(
            df, columns=categorical_columns, drop_first=False
        )  # get one-hot encoding for all categorical values
        df_encoded = df_encoded.astype(
            float
        )  # convert the one-hot encoding into floats (values between 0.0 - 1.0)
        new_columns = [
            col for col in df_encoded.columns if col not in numeric_columns
        ]  # the new one-hot columns, plus the target (removed from the list
        # above and re-added here; train_model removes it from the list again)
        df = df_encoded.copy()  # the dataframe now holds the one-hot encoded features
        numeric_columns.extend(
            new_columns
        )  # add the new columns to the list of numerical columns

        self.numeric_columns = numeric_columns
        self.df = df

    def train_model(self):
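        """Drop weakly correlated and redundant features, then train the MLP
        while recording the test MSE at every epoch."""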
        # FEATURE SELECTION

        correlation_matrix = self.df[self.numeric_columns].corr()
        # p-values of Pearson's correlation coefficient for each pair of features
        p_values = pd.DataFrame(
            np.zeros((len(self.numeric_columns), len(self.numeric_columns))),
            columns=self.numeric_columns,
            index=self.numeric_columns,
        )
        for col1 in self.numeric_columns:
            for col2 in self.numeric_columns:
                if col1 != col2:
                    # scipy.stats.pearsonr returns (coefficient, p-value) for one pair
                    _, p_value = pearsonr(self.df[col1], self.df[col2])
                    p_values.loc[col1, col2] = p_value
                else:
                    # set the diagonal to 1 so a feature is never flagged as
                    # correlated with itself
                    p_values.loc[col1, col2] = 1

        # Identifying variables that are correlated
        # When the p-value is smaller than 0.05, there is likely a “real” relationship between the variables.
        correlated_columns = []
        for i, col1 in enumerate(self.numeric_columns):
            for j, col2 in enumerate(self.numeric_columns):
                if (
                    j > i
                    and p_values.loc[col1, col2] < 0.05
                    and col1 != self.target_column
                    and col2 != self.target_column
                ):
                    correlated_columns.append((col1, col2, p_values.loc[col1, col2]))
        
        # remove the target from the list of features (it was re-added via new_columns in __init__)
        self.numeric_columns.remove(self.target_column)

        # Identify features with a low correlation with the target
        target_corr = correlation_matrix[self.target_column].copy()
        correlation_threshold = self.correlation_threshold
        features_to_remove = target_corr[abs(target_corr) < correlation_threshold].index
        features_to_remove = set(features_to_remove.to_list())

        # Identify redundant features using p-values
        x = {"keep": set(), "remove": set()}
        for corr_duo in correlated_columns:
            # keep the feature with the strongest absolute correlation to the
            # target and mark the other one for removal
            if abs(target_corr[corr_duo[0]]) > abs(target_corr[corr_duo[1]]):
                x["keep"].add(corr_duo[0])
                x["remove"].add(corr_duo[1])
            else:
                x["keep"].add(corr_duo[1])
                x["remove"].add(corr_duo[0])

        # drop from "keep" any feature already flagged by the low-correlation filter
        x["keep"] = x["keep"] - features_to_remove

        # features marked for removal that are not kept elsewhere are redundant
        redundant_features = x["remove"] - x["keep"]

        features_to_remove = features_to_remove.union(redundant_features)

        # Remove the selected features from the dataframe
        for feature in list(features_to_remove):
            self.numeric_columns.remove(feature)
            self.df.drop(feature, axis=1, inplace=True)
        self.features_to_remove = features_to_remove

        print(
            f"List of numerical features that will be used to predict the target ({self.target_column}):"
        )
        print(self.numeric_columns)

        # CREATE & TRAIN MODEL

        # split the data
        X = self.df[self.numeric_columns]
        y = self.df[self.target_column]

        # 20% of the dataset will be used as test data
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

        self.model = MLPRegressor(
            hidden_layer_sizes=self.dimension,
            activation="relu",  # ReLU activation function
            solver="adam",  # Adam optimizer
            max_iter=1,  # One iteration per fit call since the training loop is defined below
            warm_start=True,
        )  # Used to measure the MSE throughout the iterations

        # ignoring the warning raised because we're using a manual loop for training
        warnings.filterwarnings("ignore", category=ConvergenceWarning)

        # Track MSE values during training
        mse_values = []
        epochs = self.epochs

        for epoch in range(epochs):
            # Train the model
            self.model.fit(X_train, y_train)

            # Predict on the test set
            y_pred = self.model.predict(X_test)

            # Evaluate the model
            mse = mean_squared_error(y_test, y_pred)
            mse_values.append(mse)

        # SAVE THE EVOLUTION OF THE MSE THROUGHOUT THE TRAINING
        plt.figure(figsize=(10, 6))
        plt.plot(range(epochs), mse_values, marker="o", linestyle="-")
        plt.title(f"Evolution of MSE During Training. Final MSE = {mse:.4f}")
        plt.xlabel("Epoch")
        plt.ylabel("Mean Squared Error")
        plt.grid(True)
        plt.savefig(os.path.join("app", "NeuralNetwork", "graph.png"))

        print(f"Final epoch MSE: {mse:.4f}")

    def predict(self, input_data: pd.DataFrame) -> float:
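        """Scale a one-hot encoded input row, drop the unused features, and
        return the predicted water intake in liters."""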
        # work on a copy so the caller's DataFrame is not mutated
        input_data = input_data.copy()

        # scale the numeric columns with the scaler fitted during training;
        # the one-hot encoded columns are already in [0, 1] and must be skipped
        one_hot_columns = [
            "Gender_Male",
            "Gender_Female",
            "Workout_Type_Strength",
            "Workout_Type_Yoga",
            "Workout_Type_HIIT",
            "Workout_Type_Cardio",
        ]
        columns_to_scale = [
            col for col in input_data.columns if col not in one_hot_columns
        ]
        input_data[columns_to_scale] = self.scaler.transform(
            input_data[columns_to_scale]
        )

        # keep only the required features for the prediction
        input_data = input_data.drop(self.features_to_remove, axis=1, errors="ignore")

        input_data = input_data[self.numeric_columns]

        print("Prediction using the following input : ")
        print(input_data.to_csv())

        prediction = self.model.predict(input_data)
        return float(prediction[0])  # single-row input, so return a scalar as annotated
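

if __name__ == "__main__":
    # Minimal usage sketch (assumes data/gym_members_exercise_tracking.csv is
    # present, as in __init__). The sample row is taken from the raw dataset and
    # re-encoded the same way __init__ encodes the training data, so no column
    # names are hard-coded here.
    nn = My_NeuralNetwork()
    nn.train_model()

    # One raw row, without the target; because Gender and Workout_Type use the
    # "category" dtype, get_dummies emits every dummy column even for one row.
    sample = nn.df_original.drop(columns=[nn.target_column]).head(1)
    sample = pd.get_dummies(sample, columns=["Gender", "Workout_Type"]).astype(float)

    print(f"Predicted water intake: {nn.predict(sample):.2f} liters")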