Changed path of file import
NeuralNetwork/NeuralNetwork.py  +224 -224
CHANGED
@@ -1,224 +1,224 @@
 import os
 import warnings
 import pandas as pd
 import numpy as np
 from scipy.stats import pearsonr
 from sklearn.preprocessing import MinMaxScaler
 from sklearn.model_selection import train_test_split
 from sklearn.neural_network import MLPRegressor
 from sklearn.metrics import mean_squared_error
 from sklearn.exceptions import ConvergenceWarning
 from matplotlib import pyplot as plt


 class My_NeuralNetwork:
     def __init__(self):
         self.MAX_LAYERS = 10
         self.target_column = "Water_Intake (liters)"
         self.model = None

         # default parameters
         self.num_layer = 3
         self.dimension = (32, 32, 32)
         self.correlation_treshold = 0.01
         self.epochs = 300

         # Load the dataset and preprocess it
-        csv_file = os.path.join("
+        csv_file = os.path.join("data", "gym_members_exercise_tracking.csv")
         df = pd.read_csv(csv_file, engine="python")

         df = df.dropna()  # Remove rows with any null cell (just in case)
         # Mark some of the features as Category.
         df["Gender"] = df["Gender"].astype("category")
         df["Workout_Type"] = df["Workout_Type"].astype("category")

         # get the names of the numerical and categorical columns for later
         numeric_columns = list(df.select_dtypes(exclude=["category"]).columns)
         categorical_columns = list(df.select_dtypes(include=["category"]).columns)

         self.df_original = (
             df.copy()
         )  # the df variable will have some features removed later but not this one

         # remove the target from the list of features
         numeric_columns.remove(self.target_column)

         self.scaler = MinMaxScaler()  # create a new MinMaxScaler
         df_scaled = self.scaler.fit_transform(
             df[numeric_columns]
         )  # scale all the numerical columns using the new MinMaxScaler
         df[numeric_columns] = df_scaled.copy()

         df_encoded = pd.get_dummies(
             df, columns=categorical_columns, drop_first=False
         )  # get one-hot encodings for all categorical values
         df_encoded = df_encoded.astype(
             float
         )  # convert the one-hot encodings into floats (values between 0.0 - 1.0)
         new_columns = list(
             set(df_encoded.columns) - set(numeric_columns)
         )  # the new columns: the one-hot columns plus the target, which re-enters the list here
         df = (
             df_encoded.copy()
         )  # the dataframe is now the one with all the one-hot encoded features
         numeric_columns.extend(
             new_columns
         )  # add the new columns to the list of numerical columns

         self.numeric_columns = numeric_columns
         self.df = df

     def train_model(self):
         # FEATURE SELECTION

         correlation_matrix = self.df[self.numeric_columns].corr()
         # calculate the p-value of Pearson's correlation coefficient for each element of the matrix
         p_values = pd.DataFrame(
             np.zeros((len(self.numeric_columns), len(self.numeric_columns))),
             columns=self.numeric_columns,
             index=self.numeric_columns,
         )
         # Calculate p-values for each pair
         for col1 in self.numeric_columns:
             for col2 in self.numeric_columns:
                 if col1 != col2:
                     _, p_value = pearsonr(
                         self.df[col1], self.df[col2]
                     )  # using scipy.stats.pearsonr to get the p-value for one pair of features
                     p_values.loc[col1, col2] = p_value
                 else:
                     p_values.loc[col1, col2] = (
                         1  # Set to 1 so a feature is never reported as correlated with itself
                     )

         # Identify variables that are correlated
         # When the p-value is smaller than 0.05, there is likely a “real” relationship between the variables.
         correlated_columns = []
         for i, col1 in enumerate(self.numeric_columns):
             for j, col2 in enumerate(self.numeric_columns):
                 if (
                     j > i
                     and p_values.loc[col1, col2] < 0.05
                     and col1 != self.target_column
                     and col2 != self.target_column
                 ):
                     correlated_columns.append((col1, col2, p_values.loc[col1, col2]))

         # remove the target from the list of features (it was re-added along with the one-hot columns)
         self.numeric_columns.remove(self.target_column)

         # Identify features with a low correlation with the target
         target_corr = correlation_matrix[self.target_column].copy()
         correlation_treshold = self.correlation_treshold
         features_to_remove = target_corr[abs(target_corr) < correlation_treshold].index
         features_to_remove = set(features_to_remove.to_list())

         # Identify redundant features using p-values
         x = {"keep": set(), "remove": set()}
         for corr_duo in correlated_columns:
             # put the feature with the higher correlation to the target variable in "keep" and the other one in "remove"
             if target_corr[corr_duo[0]] > target_corr[corr_duo[1]]:
                 x["keep"].add(corr_duo[0])
                 x["remove"].add(corr_duo[1])
             else:
                 x["keep"].add(corr_duo[1])
                 x["remove"].add(corr_duo[0])

         # drop from "keep" any feature that is already scheduled for removal
         x["keep"] = x["keep"] - features_to_remove

         # features that are in "remove" and not in "keep" are redundant
         redundant_features = x["remove"] - x["keep"]

         features_to_remove = features_to_remove.union(redundant_features)

         # Remove the selected features from the dataframe
         for feature in list(features_to_remove):
             self.numeric_columns.remove(feature)
             self.df.drop(feature, axis=1, inplace=True)
         self.features_to_remove = features_to_remove

         print(
             f"List of numerical features that will be used to predict the target ({self.target_column}):"
         )
         print(self.numeric_columns)

         # CREATE & TRAIN MODEL

         # split the data
         X = self.df[self.numeric_columns]
         y = self.df[self.target_column]

         # 20% of the dataset will be used as test data
         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

         self.model = MLPRegressor(
             hidden_layer_sizes=self.dimension,
             activation="relu",  # ReLU activation function
             solver="adam",  # Adam optimizer
             max_iter=1,  # One iteration per fit call since the training loop is defined below
             warm_start=True,
         )  # Used to measure the MSE throughout the iterations

         # ignore the warning raised because we're using a manual loop for training
         warnings.filterwarnings("ignore", category=ConvergenceWarning)

         # Track MSE values during training
         mse_values = []
         epochs = self.epochs

         for epoch in range(epochs):
             # Train the model
             self.model.fit(X_train, y_train)

             # Predict on the test set
             y_pred = self.model.predict(X_test)

             # Evaluate the model
             mse = mean_squared_error(y_test, y_pred)
             mse_values.append(mse)

         # SAVE THE EVOLUTION OF THE MSE THROUGHOUT THE TRAINING
         plt.figure(figsize=(10, 6))
         plt.plot(range(epochs), mse_values, marker='o', linestyle='-')
         plt.title(f"Evolution of MSE During Training. Final MSE = {mse:.4f}")
         plt.xlabel('Epoch')
         plt.ylabel('Mean Squared Error')
         plt.grid(True)
         plt.savefig(os.path.join("app", "NeuralNetwork", "graph.png"))

         print(f"Final epoch MSE: {mse:.4f}")

     def predict(self, input_data: pd.DataFrame) -> float:
         # scale the input using the scaler fitted during training
         df_used_for_scaling = input_data[
             [
                 col
                 for col in input_data.columns
                 if col
                 not in [
                     "Gender_Male",
                     "Gender_Female",
                     "Workout_Type_Strength",
                     "Workout_Type_Yoga",
                     "Workout_Type_HIIT",
                     "Workout_Type_Cardio",
                 ]
             ]
         ]
         scaled_input = self.scaler.transform(
             input_data[[col for col in df_used_for_scaling.columns]]
         )
         input_data[df_used_for_scaling.columns] = scaled_input.copy()

         # keep only the features required for the prediction
         input_data = input_data.drop(self.features_to_remove, axis=1, errors="ignore")

         input_data = input_data[self.numeric_columns]

         print("Prediction using the following input:")
         print(input_data.to_csv())

         water_intake = self.model.predict(input_data)
         return water_intake
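For reference, a minimal usage sketch of the class above; it is not part of the diff. It assumes the script runs from the repository root (so the relative "data" CSV path and the app/NeuralNetwork output directory both resolve) and that NeuralNetwork is importable as a package. The input row is rebuilt from df_original with the same one-hot encoding that __init__ applies, since predict() expects encoded but unscaled columns.

    import pandas as pd

    from NeuralNetwork.NeuralNetwork import My_NeuralNetwork

    # Load, clean, scale and one-hot encode the dataset, then train.
    nn = My_NeuralNetwork()
    nn.train_model()

    # Rebuild a one-row prediction input from the raw dataframe: drop the
    # target, then apply the same one-hot encoding as __init__. Because
    # "Gender" and "Workout_Type" are category dtype, get_dummies emits a
    # column for every level even when only one row is present.
    row = nn.df_original.drop(columns=[nn.target_column]).iloc[[0]].copy()
    row = pd.get_dummies(row, columns=["Gender", "Workout_Type"], drop_first=False)
    row = row.astype(float)

    # predict() scales the numeric columns itself and drops unused features.
    water_intake = nn.predict(row)
    print(f"Predicted water intake: {water_intake[0]:.2f} liters")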