File size: 8,887 Bytes
5332346 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 |
import os
import warnings
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from sklearn.exceptions import ConvergenceWarning
from matplotlib import pyplot as plt
class My_NeuralNetwork:
    """MLP regressor predicting ``Water_Intake (liters)`` from gym-member data.

    The constructor loads ``data/gym_members_exercise_tracking.csv``, min-max
    scales the numeric features and one-hot encodes the categorical ones.
    ``train_model`` selects features, fits an ``MLPRegressor`` and saves the
    test-MSE curve; ``predict`` applies the same preprocessing to new rows.

    Tunable attributes (set them before calling ``train_model``):
    ``dimension`` (hidden layer sizes), ``correlation_treshold`` (minimum
    absolute correlation with the target) and ``epochs``.
    """

    # Two features are treated as correlated when the p-value of their
    # Pearson correlation is below this level.
    _P_VALUE_THRESHOLD = 0.05

    def __init__(self):
        """Load the dataset and prepare the scaled, one-hot encoded frame."""
        self.MAX_LAYERS = 10
        self.target_column = "Water_Intake (liters)"
        self.model = None

        # default parameters
        self.num_layer = 3
        self.dimension = (32, 32, 32)
        self.correlation_treshold = 0.01
        self.epochs = 300

        # Load the dataset and drop rows with any null cell (just in case).
        csv_file = os.path.join("data", "gym_members_exercise_tracking.csv")
        df = pd.read_csv(csv_file, engine="python").dropna()

        # These two features are handled as categories (one-hot encoded below).
        df["Gender"] = df["Gender"].astype("category")
        df["Workout_Type"] = df["Workout_Type"].astype("category")

        numeric_columns = list(df.select_dtypes(exclude=["category"]).columns)
        categorical_columns = list(df.select_dtypes(include=["category"]).columns)

        # Untouched snapshot: ``df`` is scaled/encoded below, this copy is not.
        self.df_original = df.copy()

        # The target is not scaled, so take it out of the list of features.
        numeric_columns.remove(self.target_column)

        self.scaler = MinMaxScaler()
        df[numeric_columns] = self.scaler.fit_transform(df[numeric_columns])
        # Remember exactly which columns (and in which order) the scaler was
        # fitted on, so ``predict`` can scale new rows the same way.
        self.scaled_columns = list(numeric_columns)

        # One-hot encode the categorical columns and store everything as
        # floats (values between 0.0 and 1.0 for the encoded columns).
        df_encoded = pd.get_dummies(
            df, columns=categorical_columns, drop_first=False
        ).astype(float)

        # Columns not yet listed: the one-hot columns plus the target.
        # Built in DataFrame order (not through a set) so the feature order
        # is the same on every run.
        already_listed = set(numeric_columns)
        new_columns = [
            column for column in df_encoded.columns if column not in already_listed
        ]
        numeric_columns.extend(new_columns)

        self.numeric_columns = numeric_columns
        self.df = df_encoded

        # Pristine state used by ``train_model``: training rebinds ``self.df``
        # and ``self.numeric_columns`` to the selected features, and keeping
        # the full versions lets the model be retrained with new parameters.
        self._all_columns = list(numeric_columns)
        self._full_df = df_encoded

    def train_model(self):
        """Select features, train the network and save the MSE curve.

        Side effects: sets ``self.features_to_remove``, narrows
        ``self.numeric_columns`` and ``self.df`` to the selected features,
        replaces ``self.model`` with a freshly trained ``MLPRegressor``,
        prints the selected features and the final MSE, and writes
        ``app/NeuralNetwork/graph.png``.

        The method always starts from the full encoded dataset, so it can be
        called again after changing ``dimension``, ``epochs`` or
        ``correlation_treshold``.
        """
        # FEATURE SELECTION
        features_to_remove = self._select_features_to_remove(
            self._full_df, self._all_columns
        )
        self.features_to_remove = features_to_remove
        self.numeric_columns = [
            column
            for column in self._all_columns
            if column != self.target_column and column not in features_to_remove
        ]
        # ``drop`` returns a new frame, so the full dataset stays intact.
        self.df = self._full_df.drop(columns=list(features_to_remove))

        print(
            f"List of numerical features that will be used to predict the target ({self.target_column}) :"
        )
        print(self.numeric_columns)

        # CREATE & TRAIN MODEL
        X = self.df[self.numeric_columns]
        y = self.df[self.target_column]
        # 20% of the dataset is used as test data.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

        self.model = MLPRegressor(
            hidden_layer_sizes=self.dimension,
            activation="relu",  # ReLU activation function
            solver="adam",  # Adam optimizer
            # One iteration per fit() call: with warm_start each call resumes
            # from the previous weights, which lets the loop below record the
            # MSE after every iteration.
            max_iter=1,
            warm_start=True,
        )

        mse_values = []
        # Every single-iteration fit() raises a ConvergenceWarning; silence it
        # only for the duration of the loop instead of process-wide.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=ConvergenceWarning)
            for _ in range(self.epochs):
                self.model.fit(X_train, y_train)
                y_pred = self.model.predict(X_test)
                mse_values.append(mean_squared_error(y_test, y_pred))

        # With zero epochs there is no measurement to report.
        final_mse = mse_values[-1] if mse_values else float("nan")

        # SAVE THE EVOLUTION OF THE MSE THROUGHOUT THE TRAINING
        self._save_mse_plot(mse_values, final_mse)
        print(f"Final epoch MSE: {final_mse:.4f}")

    def _select_features_to_remove(self, df: pd.DataFrame, columns: list) -> set:
        """Return the features of ``columns`` that should not feed the model.

        A feature is removed when either
        * its absolute Pearson correlation with the target is below
          ``self.correlation_treshold``, or
        * it is significantly correlated (p-value < 0.05) with another
          feature, is the one of the pair that is less strongly correlated
          with the target, and is never the stronger one of any pair.

        Args:
            df: frame containing every column named in ``columns``.
            columns: candidate column names, including the target column.

        Returns:
            Set of feature names to drop (never contains the target).
        """
        target = self.target_column
        features = [column for column in columns if column != target]

        # Strength of the link with the target, regardless of its sign.
        strength = df[columns].corr()[target].abs()

        # Features with a low correlation with the target.
        weak = {
            feature
            for feature in features
            if strength[feature] < self.correlation_treshold
        }

        # For every correlated pair, the feature more strongly correlated with
        # the target "wins" and the other one "loses". Each unordered pair is
        # evaluated once; the target itself is never part of a pair.
        winners, losers = set(), set()
        for position, first in enumerate(features):
            for second in features[position + 1:]:
                _, p_value = pearsonr(df[first], df[second])
                if not p_value < self._P_VALUE_THRESHOLD:
                    continue
                if strength[first] > strength[second]:
                    winners.add(first)
                    losers.add(second)
                else:
                    winners.add(second)
                    losers.add(first)

        # A winner that is dropped anyway for being weak protects nobody.
        winners -= weak
        # Redundant features: lost at least once and never kept as a winner.
        return weak | (losers - winners)

    @staticmethod
    def _save_mse_plot(mse_values: list, final_mse: float) -> None:
        """Plot the per-epoch test MSE and save it to app/NeuralNetwork/graph.png."""
        figure = plt.figure(figsize=(10, 6))
        try:
            plt.plot(range(len(mse_values)), mse_values, marker="o", linestyle="-")
            plt.title(f"Evolution of MSE During Training. Final MSE = {final_mse:.4f}")
            plt.xlabel("Epoch")
            plt.ylabel("Mean Squared Error")
            plt.grid(True)
            plt.savefig(os.path.join("app", "NeuralNetwork", "graph.png"))
        finally:
            # Release the figure so repeated trainings do not accumulate them.
            plt.close(figure)

    def predict(self, input_data: pd.DataFrame) -> np.ndarray:
        """Predict the water intake for the rows of ``input_data``.

        Args:
            input_data: frame holding the raw (unscaled) numeric features the
                scaler was fitted on plus the one-hot encoded columns. It is
                not modified.

        Returns:
            The array returned by the trained model's ``predict``, one value
            per input row.
        """
        # Work on a copy so the caller's DataFrame is left untouched.
        frame = input_data.copy()

        # Scale with the scaler fitted at construction time, on exactly the
        # columns (and column order) it was fitted on.
        scaled_columns = self.scaled_columns
        frame[scaled_columns] = self.scaler.transform(frame[scaled_columns])

        # Keep only the features selected during training, in training order.
        frame = frame.drop(columns=list(self.features_to_remove), errors="ignore")
        frame = frame[self.numeric_columns]

        print("Prediction using the following input : ")
        print(frame.to_csv())

        water_intake = self.model.predict(frame)
        return water_intake
|