|
import os
|
|
import warnings
|
|
import pandas as pd
|
|
import numpy as np
|
|
from scipy.stats import pearsonr
|
|
from sklearn.preprocessing import MinMaxScaler
|
|
from sklearn.model_selection import train_test_split
|
|
from sklearn.neural_network import MLPRegressor
|
|
from sklearn.metrics import mean_squared_error
|
|
from sklearn.exceptions import ConvergenceWarning
|
|
from matplotlib import pyplot as plt
|
|
|
|
|
|
|
|
class My_NeuralNetwork:
    """MLP regressor that predicts the "Water_Intake (liters)" column.

    The constructor loads the exercise-tracking CSV, min-max scales the
    numeric feature columns and one-hot encodes the categorical ones.
    ``train_model`` prunes weak and redundant features and then fits an
    ``MLPRegressor`` one iteration at a time while recording the test MSE.
    ``predict`` applies the same scaling and column selection to new rows.
    """

    def __init__(self) -> None:
        """Load the dataset and build the scaled, one-hot encoded frame.

        The CSV is read from the relative path
        ``app/data/gym_members_exercise_tracking.csv``; rows containing
        missing values are dropped.
        """
        self.MAX_LAYERS = 10
        self.target_column = "Water_Intake (liters)"
        self.model = None  # set by train_model()

        # Hyper-parameters read by train_model().
        self.num_layer = 3
        self.dimension = (32, 32, 32)
        self.correlation_treshold = 0.01
        self.epochs = 300

        # Filled in by train_model(); defined up front so the attribute
        # always exists.
        self.features_to_remove = set()

        csv_file = os.path.join("app", "data", "gym_members_exercise_tracking.csv")
        df = pd.read_csv(csv_file, engine="python").dropna()

        df["Gender"] = df["Gender"].astype("category")
        df["Workout_Type"] = df["Workout_Type"].astype("category")

        categorical_columns = list(df.select_dtypes(include=["category"]).columns)
        # Every non-categorical column except the target gets scaled.
        scaled_columns = [
            col
            for col in df.select_dtypes(exclude=["category"]).columns
            if col != self.target_column
        ]

        # Snapshot taken after the category conversion but before scaling.
        self.df_original = df.copy()

        self.scaler = MinMaxScaler()
        df[scaled_columns] = self.scaler.fit_transform(df[scaled_columns])
        # Remembered so predict() scales exactly these columns, in this order.
        self._scaled_columns = list(scaled_columns)

        df_encoded = pd.get_dummies(
            df, columns=categorical_columns, drop_first=False
        ).astype(float)

        # Target + dummy columns, kept in frame order so that the resulting
        # column list is identical from one run to the next (a set
        # difference would give an arbitrary order).
        already_listed = set(scaled_columns)
        other_columns = [
            col for col in df_encoded.columns if col not in already_listed
        ]
        all_columns = scaled_columns + other_columns

        # Pristine copies used by train_model(), which lets it be called
        # again (e.g. after changing a hyper-parameter) without tripping
        # over columns removed by a previous run.
        self._all_columns = tuple(all_columns)
        self._full_df = df_encoded

        # Before training: every column, target included.
        # After training: only the features fed to the model.
        self.numeric_columns = all_columns
        self.df = df_encoded

    def _select_features(self):
        """Choose the model inputs from the full encoded frame.

        A feature is dropped when either
        * the magnitude of its correlation with the target is below
          ``self.correlation_treshold``, or
        * it is significantly correlated (Pearson p-value < 0.05) with at
          least one other feature that is more strongly tied to the target,
          and it never "wins" such a comparison itself.

        Returns:
            tuple: ``(kept, removed)`` where ``kept`` is the ordered list of
            retained feature names and ``removed`` is the set of dropped ones.
        """
        target = self.target_column
        data = self._full_df
        features = [col for col in self._all_columns if col != target]

        target_corr = data[list(self._all_columns)].corr()[target]
        # Magnitude is what matters: a strong negative correlation is as
        # informative as a strong positive one.
        strength = target_corr[features].abs()

        weak = set(strength[strength < self.correlation_treshold].index)

        winners, losers = set(), set()
        # Each unordered feature pair is tested once; the target is not a
        # candidate for removal so it is left out entirely.
        for position, first in enumerate(features):
            for second in features[position + 1:]:
                _, p_value = pearsonr(data[first], data[second])
                if p_value < 0.05:  # False for NaN, so such pairs are skipped
                    if strength[first] > strength[second]:
                        winners.add(first)
                        losers.add(second)
                    else:
                        winners.add(second)
                        losers.add(first)

        # A weak feature cannot protect itself by having won a comparison.
        redundant = losers - (winners - weak)
        removed = weak | redundant
        kept = [col for col in features if col not in removed]
        return kept, removed

    def _save_mse_plot(self, mse_values, final_mse) -> None:
        """Write the per-epoch MSE curve to ``app/NeuralNetwork/graph.png``."""
        fig = plt.figure(figsize=(10, 6))
        try:
            plt.plot(range(len(mse_values)), mse_values, marker='o', linestyle='-')
            plt.title(f"Evolution of MSE During Training. Final MSE = {final_mse:.4f}")
            plt.xlabel('Epoch')
            plt.ylabel('Mean Squared Error')
            plt.grid(True)
            plt.savefig(os.path.join("app", "NeuralNetwork", "graph.png"))
        finally:
            # Without this every training run would leave a figure open.
            plt.close(fig)

    def train_model(self) -> None:
        """Select features, fit the network and save the MSE curve.

        Side effects: sets ``self.features_to_remove``, narrows
        ``self.numeric_columns`` to the selected features, drops the removed
        columns from ``self.df``, replaces ``self.model``, prints the feature
        list and the final MSE, and writes ``app/NeuralNetwork/graph.png``.
        The method may be called repeatedly; each call starts again from the
        full encoded frame built by the constructor.
        """
        kept, removed = self._select_features()
        self.features_to_remove = removed
        self.numeric_columns = kept
        self.df = self._full_df.drop(columns=list(removed))

        print(
            f"List of numerical features that will be used to predict the target ({self.target_column}) :"
        )
        print(self.numeric_columns)

        X = self.df[self.numeric_columns]
        y = self.df[self.target_column]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

        # max_iter=1 + warm_start=True: each fit() call continues training
        # for one more iteration, so the test MSE can be sampled per epoch.
        self.model = MLPRegressor(
            hidden_layer_sizes=self.dimension,
            activation="relu",
            solver="adam",
            max_iter=1,
            warm_start=True,
        )

        mse_values = []
        # Stopping after one iteration always triggers a ConvergenceWarning;
        # silence it only for the duration of the loop instead of globally.
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=ConvergenceWarning)
            for _ in range(self.epochs):
                self.model.fit(X_train, y_train)
                y_pred = self.model.predict(X_test)
                mse_values.append(mean_squared_error(y_test, y_pred))

        # With zero epochs there is no measurement to report.
        final_mse = mse_values[-1] if mse_values else float("nan")

        self._save_mse_plot(mse_values, final_mse)
        print(f"Final epoch MSE: {final_mse:.4f}")

    def predict(self, input_data: pd.DataFrame) -> np.ndarray:
        """Predict the target for the rows of ``input_data``.

        Args:
            input_data: frame holding every column the scaler was fitted on
                plus the one-hot columns used by the model. It is not
                modified.

        Returns:
            The array returned by the underlying model's ``predict``.

        Raises:
            RuntimeError: if ``train_model`` has not been called yet.
        """
        if self.model is None:
            raise RuntimeError("train_model() must be called before predict()")

        # Work on a copy: scaling the caller's frame in place would corrupt
        # it and double-scale the values on a second call.
        prepared = input_data.copy()

        scaled_columns = list(self._scaled_columns)
        prepared[scaled_columns] = self.scaler.transform(prepared[scaled_columns])

        # Selecting the trained feature list also discards every column that
        # feature selection removed.
        prepared = prepared[self.numeric_columns]

        print("Prediction using the following input : ")
        print(prepared.to_csv())

        water_intake = self.model.predict(prepared)
        return water_intake
|
|
|