mohli committed on
Commit
5332346
·
verified ·
1 Parent(s): 7049970

Changed path of file import

Browse files
Files changed (1) hide show
  1. NeuralNetwork/NeuralNetwork.py +224 -224
NeuralNetwork/NeuralNetwork.py CHANGED
@@ -1,224 +1,224 @@
1
- import os
2
- import warnings
3
- import pandas as pd
4
- import numpy as np
5
- from scipy.stats import pearsonr
6
- from sklearn.preprocessing import MinMaxScaler
7
- from sklearn.model_selection import train_test_split
8
- from sklearn.neural_network import MLPRegressor
9
- from sklearn.metrics import mean_squared_error
10
- from sklearn.exceptions import ConvergenceWarning
11
- from matplotlib import pyplot as plt
12
-
13
-
14
-
15
- class My_NeuralNetwork:
16
- def __init__(self):
17
- self.MAX_LAYERS = 10
18
- self.target_column = "Water_Intake (liters)"
19
- self.model = None
20
-
21
- # default parameters
22
- self.num_layer = 3
23
- self.dimension = (32, 32, 32)
24
- self.correlation_treshold = 0.01
25
- self.epochs = 300
26
-
27
- # Load the dataset and preprocess it
28
- csv_file = os.path.join("app", "data", "gym_members_exercise_tracking.csv")
29
- df = pd.read_csv(csv_file, engine="python")
30
-
31
- df = df.dropna() # Remove rows with any null cell (just in case)
32
- # Assigning some of the features as Category.
33
- df["Gender"] = df["Gender"].astype("category")
34
- df["Workout_Type"] = df["Workout_Type"].astype("category")
35
-
36
- # getting the names of the numerical and categorical columns for later
37
- numeric_columns = list(df.select_dtypes(exclude=["category"]).columns)
38
- categorical_columns = list(df.select_dtypes(include=["category"]).columns)
39
-
40
- self.df_original = (
41
- df.copy()
42
- ) # the df variable will have some features removes later but not this one
43
-
44
- # remove the target to the list of features
45
- numeric_columns.remove(self.target_column)
46
-
47
- self.scaler = MinMaxScaler() # create a new MinMaxScaler
48
- df_scaled = self.scaler.fit_transform(
49
- df[numeric_columns]
50
- ) # scale all the numerical columns using the new MinMaxScaler
51
- df[numeric_columns] = df_scaled.copy()
52
-
53
- df_encoded = pd.get_dummies(
54
- df, columns=categorical_columns, drop_first=False
55
- ) # get one-hot encoding for all categorical values
56
- df_encoded = df_encoded.astype(
57
- float
58
- ) # convert the one-hot encoding into floats (values between 0.0 - 1.0)
59
- new_columns = list(
60
- set(df_encoded.columns) - set(numeric_columns)
61
- ) # get the list of the new columns (former categorical columns)
62
- df = (
63
- df_encoded.copy()
64
- ) # the dataframe is now the one with all the one-hot encoded features
65
- numeric_columns.extend(
66
- new_columns
67
- ) # add the new columns to the list of numerical columns
68
-
69
- self.numeric_columns = numeric_columns
70
- self.df = df
71
-
72
- def train_model(self):
73
- # FEATURE SELECTION
74
-
75
- correlation_matrix = self.df[self.numeric_columns].corr()
76
- # calculating the Pearson’s correlation coefficient p-value for each element in the matrix
77
- p_values = pd.DataFrame(
78
- np.zeros((len(self.numeric_columns), len(self.numeric_columns))),
79
- columns=self.numeric_columns,
80
- index=self.numeric_columns,
81
- )
82
- # Calculate p-values for each pair
83
- for col1 in self.numeric_columns:
84
- for col2 in self.numeric_columns:
85
- if col1 != col2:
86
- _, p_value = pearsonr(
87
- self.df[col1], self.df[col2]
88
- ) # using scipy.stats.pearsonr to get the p-value for one pair of feature
89
- p_values.loc[col1, col2] = p_value
90
- else:
91
- p_values.loc[col1, col2] = (
92
- 1 # Set to 1 to not get the relation when trying to find correlations between features
93
- )
94
-
95
- # Identifying variables that are correlated
96
- # When the p-value is smaller than 0.05, there is likely a “real” relationship between the variables.
97
- correlated_columns = []
98
- for i, col1 in enumerate(self.numeric_columns):
99
- for j, col2 in enumerate(self.numeric_columns):
100
- if (
101
- j > i
102
- and p_values.loc[col1, col2] < 0.05
103
- and col1 != self.target_column
104
- and col2 != self.target_column
105
- ):
106
- correlated_columns.append((col1, col2, p_values.loc[col1, col2]))
107
-
108
- # remove the target to the list of features
109
- self.numeric_columns.remove(self.target_column)
110
-
111
- # Identify features with a low correlation with the target
112
- target_corr = correlation_matrix[self.target_column].copy()
113
- correlation_treshold = self.correlation_treshold
114
- features_to_remove = target_corr[abs(target_corr) < correlation_treshold].index
115
- features_to_remove = set(features_to_remove.to_list())
116
-
117
- # Identify redundant features using p-values
118
- x = {"keep": set(), "remove": set()}
119
- for corr_duo in correlated_columns:
120
- # put the feature with the highest correlation to the target variable in "keep" and the other one in "remove"
121
- if target_corr[corr_duo[0]] > target_corr[corr_duo[1]]:
122
- x["keep"].add(corr_duo[0])
123
- x["remove"].add(corr_duo[1])
124
- else:
125
- x["keep"].add(corr_duo[1])
126
- x["remove"].add(corr_duo[0])
127
-
128
- # remove features that are already removed from "keep"
129
- x["keep"] = x["keep"] - features_to_remove
130
-
131
- # remove features that are in "remove" and not in "keep"
132
- redundant_features = x["remove"] - x["keep"]
133
-
134
- features_to_remove = features_to_remove.union(redundant_features)
135
-
136
- # Remove the selected features from the dataframe
137
- for feature in list(features_to_remove):
138
- self.numeric_columns.remove(feature)
139
- self.df.drop(feature, axis=1, inplace=True)
140
- self.features_to_remove = features_to_remove
141
-
142
- print(
143
- f"List of numerical features that will be used to predict the target ({self.target_column}) :"
144
- )
145
- print(self.numeric_columns)
146
-
147
- # CREATE & TRAIN MODEL
148
-
149
- # split the data
150
- X = self.df[self.numeric_columns]
151
- y = self.df[self.target_column]
152
-
153
- # 20% of the dataset will be use as test data
154
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
155
-
156
- self.model = MLPRegressor(
157
- hidden_layer_sizes=self.dimension,
158
- activation="relu", # ReLU activation function
159
- solver="adam", # Adam optimizer
160
- max_iter=1, # One iteration per fit call since the training loop is defined below
161
- warm_start=True,
162
- ) # Used to measure the MSE throughout the iterations
163
-
164
- # ignoring the warning raised because we're using a manual loop for training
165
- warnings.filterwarnings("ignore", category=ConvergenceWarning)
166
-
167
- # Track MSE values during training
168
- mse_values = []
169
- epochs = self.epochs
170
-
171
- for epoch in range(epochs):
172
- # Train the model
173
- self.model.fit(X_train, y_train)
174
-
175
- # Predict on the test set
176
- y_pred = self.model.predict(X_test)
177
-
178
- # Evaluate the model
179
- mse = mean_squared_error(y_test, y_pred)
180
- mse_values.append(mse)
181
-
182
- # SAVE THE EVOLUTION OF THE MSE THROUGHOUT THE TRAINING
183
- plt.figure(figsize=(10, 6))
184
- plt.plot(range(epochs), mse_values, marker='o', linestyle='-')
185
- plt.title(f"Evolution of MSE During Training. Final MSE = {mse:.4f}")
186
- plt.xlabel('Epoch')
187
- plt.ylabel('Mean Squared Error')
188
- plt.grid(True)
189
- plt.savefig(os.path.join("app", "NeuralNetwork", "graph.png"))
190
-
191
- print(f"Final epoch MSE: {mse:.4f}")
192
-
193
- def predict(self, input_data: pd.DataFrame) -> float:
194
- # scale the input using the scaler used during training
195
- df_used_for_scaling = input_data[
196
- [
197
- col
198
- for col in input_data.columns
199
- if col
200
- not in [
201
- "Gender_Male",
202
- "Gender_Female",
203
- "Workout_Type_Strength",
204
- "Workout_Type_Yoga",
205
- "Workout_Type_HIIT",
206
- "Workout_Type_Cardio",
207
- ]
208
- ]
209
- ]
210
- scaled_input = self.scaler.transform(
211
- input_data[[col for col in df_used_for_scaling.columns]]
212
- )
213
- input_data[df_used_for_scaling.columns] = scaled_input.copy()
214
-
215
- # keep only the required features for the prediction
216
- input_data = input_data.drop(self.features_to_remove, axis=1, errors="ignore")
217
-
218
- input_data = input_data[self.numeric_columns]
219
-
220
- print("Prediction using the following input : ")
221
- print(input_data.to_csv())
222
-
223
- water_intake = self.model.predict(input_data)
224
- return water_intake
 
1
+ import os
2
+ import warnings
3
+ import pandas as pd
4
+ import numpy as np
5
+ from scipy.stats import pearsonr
6
+ from sklearn.preprocessing import MinMaxScaler
7
+ from sklearn.model_selection import train_test_split
8
+ from sklearn.neural_network import MLPRegressor
9
+ from sklearn.metrics import mean_squared_error
10
+ from sklearn.exceptions import ConvergenceWarning
11
+ from matplotlib import pyplot as plt
12
+
13
+
14
+
15
+ class My_NeuralNetwork:
16
+ def __init__(self):
17
+ self.MAX_LAYERS = 10
18
+ self.target_column = "Water_Intake (liters)"
19
+ self.model = None
20
+
21
+ # default parameters
22
+ self.num_layer = 3
23
+ self.dimension = (32, 32, 32)
24
+ self.correlation_treshold = 0.01
25
+ self.epochs = 300
26
+
27
+ # Load the dataset and preprocess it
28
+ csv_file = os.path.join("data", "gym_members_exercise_tracking.csv")
29
+ df = pd.read_csv(csv_file, engine="python")
30
+
31
+ df = df.dropna() # Remove rows with any null cell (just in case)
32
+ # Assigning some of the features as Category.
33
+ df["Gender"] = df["Gender"].astype("category")
34
+ df["Workout_Type"] = df["Workout_Type"].astype("category")
35
+
36
+ # getting the names of the numerical and categorical columns for later
37
+ numeric_columns = list(df.select_dtypes(exclude=["category"]).columns)
38
+ categorical_columns = list(df.select_dtypes(include=["category"]).columns)
39
+
40
+ self.df_original = (
41
+ df.copy()
42
+ ) # the df variable will have some features removed later, but not this one
43
+
44
+ # remove the target from the list of features
45
+ numeric_columns.remove(self.target_column)
46
+
47
+ self.scaler = MinMaxScaler() # create a new MinMaxScaler
48
+ df_scaled = self.scaler.fit_transform(
49
+ df[numeric_columns]
50
+ ) # scale all the numerical columns using the new MinMaxScaler
51
+ df[numeric_columns] = df_scaled.copy()
52
+
53
+ df_encoded = pd.get_dummies(
54
+ df, columns=categorical_columns, drop_first=False
55
+ ) # get one-hot encoding for all categorical values
56
+ df_encoded = df_encoded.astype(
57
+ float
58
+ ) # convert the one-hot encoding into floats (values between 0.0 - 1.0)
59
+ new_columns = list(
60
+ set(df_encoded.columns) - set(numeric_columns)
61
+ ) # get the list of the new columns (former categorical columns)
62
+ df = (
63
+ df_encoded.copy()
64
+ ) # the dataframe is now the one with all the one-hot encoded features
65
+ numeric_columns.extend(
66
+ new_columns
67
+ ) # add the new columns to the list of numerical columns
68
+
69
+ self.numeric_columns = numeric_columns
70
+ self.df = df
71
+
72
+ def train_model(self):
73
+ # FEATURE SELECTION
74
+
75
+ correlation_matrix = self.df[self.numeric_columns].corr()
76
+ # calculating the Pearson’s correlation coefficient p-value for each element in the matrix
77
+ p_values = pd.DataFrame(
78
+ np.zeros((len(self.numeric_columns), len(self.numeric_columns))),
79
+ columns=self.numeric_columns,
80
+ index=self.numeric_columns,
81
+ )
82
+ # Calculate p-values for each pair
83
+ for col1 in self.numeric_columns:
84
+ for col2 in self.numeric_columns:
85
+ if col1 != col2:
86
+ _, p_value = pearsonr(
87
+ self.df[col1], self.df[col2]
88
+ ) # using scipy.stats.pearsonr to get the p-value for one pair of features
89
+ p_values.loc[col1, col2] = p_value
90
+ else:
91
+ p_values.loc[col1, col2] = (
92
+ 1 # Set to 1 to not get the relation when trying to find correlations between features
93
+ )
94
+
95
+ # Identifying variables that are correlated
96
+ # When the p-value is smaller than 0.05, there is likely a “real” relationship between the variables.
97
+ correlated_columns = []
98
+ for i, col1 in enumerate(self.numeric_columns):
99
+ for j, col2 in enumerate(self.numeric_columns):
100
+ if (
101
+ j > i
102
+ and p_values.loc[col1, col2] < 0.05
103
+ and col1 != self.target_column
104
+ and col2 != self.target_column
105
+ ):
106
+ correlated_columns.append((col1, col2, p_values.loc[col1, col2]))
107
+
108
+ # remove the target from the list of features
109
+ self.numeric_columns.remove(self.target_column)
110
+
111
+ # Identify features with a low correlation with the target
112
+ target_corr = correlation_matrix[self.target_column].copy()
113
+ correlation_treshold = self.correlation_treshold
114
+ features_to_remove = target_corr[abs(target_corr) < correlation_treshold].index
115
+ features_to_remove = set(features_to_remove.to_list())
116
+
117
+ # Identify redundant features using p-values
118
+ x = {"keep": set(), "remove": set()}
119
+ for corr_duo in correlated_columns:
120
+ # put the feature with the highest correlation to the target variable in "keep" and the other one in "remove"
121
+ if target_corr[corr_duo[0]] > target_corr[corr_duo[1]]:
122
+ x["keep"].add(corr_duo[0])
123
+ x["remove"].add(corr_duo[1])
124
+ else:
125
+ x["keep"].add(corr_duo[1])
126
+ x["remove"].add(corr_duo[0])
127
+
128
+ # remove features that are already removed from "keep"
129
+ x["keep"] = x["keep"] - features_to_remove
130
+
131
+ # remove features that are in "remove" and not in "keep"
132
+ redundant_features = x["remove"] - x["keep"]
133
+
134
+ features_to_remove = features_to_remove.union(redundant_features)
135
+
136
+ # Remove the selected features from the dataframe
137
+ for feature in list(features_to_remove):
138
+ self.numeric_columns.remove(feature)
139
+ self.df.drop(feature, axis=1, inplace=True)
140
+ self.features_to_remove = features_to_remove
141
+
142
+ print(
143
+ f"List of numerical features that will be used to predict the target ({self.target_column}) :"
144
+ )
145
+ print(self.numeric_columns)
146
+
147
+ # CREATE & TRAIN MODEL
148
+
149
+ # split the data
150
+ X = self.df[self.numeric_columns]
151
+ y = self.df[self.target_column]
152
+
153
+ # 20% of the dataset will be used as test data
154
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
155
+
156
+ self.model = MLPRegressor(
157
+ hidden_layer_sizes=self.dimension,
158
+ activation="relu", # ReLU activation function
159
+ solver="adam", # Adam optimizer
160
+ max_iter=1, # One iteration per fit call since the training loop is defined below
161
+ warm_start=True,
162
+ ) # warm_start=True reuses the previous weights on each fit call, so the MSE can be measured after every iteration
163
+
164
+ # ignoring the warning raised because we're using a manual loop for training
165
+ warnings.filterwarnings("ignore", category=ConvergenceWarning)
166
+
167
+ # Track MSE values during training
168
+ mse_values = []
169
+ epochs = self.epochs
170
+
171
+ for epoch in range(epochs):
172
+ # Train the model
173
+ self.model.fit(X_train, y_train)
174
+
175
+ # Predict on the test set
176
+ y_pred = self.model.predict(X_test)
177
+
178
+ # Evaluate the model
179
+ mse = mean_squared_error(y_test, y_pred)
180
+ mse_values.append(mse)
181
+
182
+ # SAVE THE EVOLUTION OF THE MSE THROUGHOUT THE TRAINING
183
+ plt.figure(figsize=(10, 6))
184
+ plt.plot(range(epochs), mse_values, marker='o', linestyle='-')
185
+ plt.title(f"Evolution of MSE During Training. Final MSE = {mse:.4f}")
186
+ plt.xlabel('Epoch')
187
+ plt.ylabel('Mean Squared Error')
188
+ plt.grid(True)
189
+ plt.savefig(os.path.join("app", "NeuralNetwork", "graph.png"))
190
+
191
+ print(f"Final epoch MSE: {mse:.4f}")
192
+
193
+ def predict(self, input_data: pd.DataFrame) -> float:
194
+ # scale the input using the scaler used during training
195
+ df_used_for_scaling = input_data[
196
+ [
197
+ col
198
+ for col in input_data.columns
199
+ if col
200
+ not in [
201
+ "Gender_Male",
202
+ "Gender_Female",
203
+ "Workout_Type_Strength",
204
+ "Workout_Type_Yoga",
205
+ "Workout_Type_HIIT",
206
+ "Workout_Type_Cardio",
207
+ ]
208
+ ]
209
+ ]
210
+ scaled_input = self.scaler.transform(
211
+ input_data[[col for col in df_used_for_scaling.columns]]
212
+ )
213
+ input_data[df_used_for_scaling.columns] = scaled_input.copy()
214
+
215
+ # keep only the required features for the prediction
216
+ input_data = input_data.drop(self.features_to_remove, axis=1, errors="ignore")
217
+
218
+ input_data = input_data[self.numeric_columns]
219
+
220
+ print("Prediction using the following input : ")
221
+ print(input_data.to_csv())
222
+
223
+ water_intake = self.model.predict(input_data)
224
+ return water_intake