import os
import warnings

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy.stats import pearsonr
from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import MinMaxScaler


class My_NeuralNetwork:
    """MLP regressor that predicts ``Water_Intake (liters)`` from a gym CSV.

    Construction loads the CSV, drops rows with missing values, min-max scales
    every non-target numeric column and one-hot encodes ``Gender`` and
    ``Workout_Type``. ``train_model`` then selects features and fits the
    network; ``predict`` applies the same scaling to new rows.

    Attributes:
        MAX_LAYERS: Constant set to 10 (not read inside this class).
        target_column: Name of the column being predicted.
        model: The fitted ``MLPRegressor``, or ``None`` before training.
        num_layer: Set to 3 (not read inside this class).
        dimension: ``hidden_layer_sizes`` passed to ``MLPRegressor``.
        correlation_treshold: Features whose absolute correlation with the
            target is below this value are discarded. The misspelling is kept
            because it is a public attribute name.
        epochs: Number of single-iteration ``fit`` calls made while training.
        random_state: Optional integer seed for the split and the network.
        scaler: ``MinMaxScaler`` fitted on ``scaled_columns``.
        scaled_columns: Columns the scaler was fitted on, in fit order.
        numeric_columns: Before training, all model columns including the
            target; after training, only the selected feature columns.
        features_to_remove: Set of columns discarded by feature selection.
        df_original: Copy of the cleaned data before scaling/encoding.
        df: Scaled and one-hot encoded data used for training.
    """

    def __init__(self, csv_file=None, *, random_state=None):
        """Load and preprocess the training data.

        Args:
            csv_file: Path of the CSV to load. Defaults to
                ``data/gym_members_exercise_tracking.csv``.
            random_state: Optional integer seed that makes the train/test
                split and the network initialisation reproducible. ``None``
                (default) keeps them random.
        """
        self.MAX_LAYERS = 10
        self.target_column = "Water_Intake (liters)"
        self.model = None

        self.num_layer = 3
        self.dimension = (32, 32, 32)
        self.correlation_treshold = 0.01
        self.epochs = 300
        self.random_state = random_state
        # Defined up front so the attribute exists even before training.
        self.features_to_remove = set()

        if csv_file is None:
            csv_file = os.path.join("data", "gym_members_exercise_tracking.csv")
        df = pd.read_csv(csv_file, engine="python")
        df = df.dropna()

        for column in ("Gender", "Workout_Type"):
            df[column] = df[column].astype("category")

        numeric_columns = list(df.select_dtypes(exclude=["category"]).columns)
        categorical_columns = list(df.select_dtypes(include=["category"]).columns)

        self.df_original = df.copy()

        # The target is left unscaled so predictions stay in its own units.
        numeric_columns.remove(self.target_column)
        # Remember exactly what the scaler saw (and in which order) so that
        # predict() can reproduce it without guessing from column names.
        self.scaled_columns = list(numeric_columns)

        self.scaler = MinMaxScaler()
        df[numeric_columns] = self.scaler.fit_transform(df[numeric_columns])

        df = pd.get_dummies(df, columns=categorical_columns, drop_first=False)
        df = df.astype(float)

        # Append the target and the dummy columns in DataFrame order; a set
        # difference here would make the column order vary between runs.
        already_listed = set(numeric_columns)
        numeric_columns.extend(
            column for column in df.columns if column not in already_listed
        )

        self.numeric_columns = numeric_columns
        self.df = df

    def _select_features(self, significance_level=0.05):
        """Drop weak and redundant features from ``df`` and ``numeric_columns``.

        A feature is weak when its absolute Pearson correlation with the
        target is below ``correlation_treshold``. Two features are treated as
        redundant when the p-value of their Pearson correlation is below
        ``significance_level``; of such a pair, the one with the smaller
        absolute correlation to the target is a removal candidate, unless it
        wins another pairing.
        """
        target = self.target_column
        features = [column for column in self.numeric_columns if column != target]

        target_corr = self.df[features + [target]].corr()[target].drop(target)
        weak = set(target_corr[target_corr.abs() < self.correlation_treshold].index)

        keep, discard = set(), set()
        for position, first in enumerate(features):
            # Pearson's test is symmetric, so each unordered pair is tested once.
            for second in features[position + 1:]:
                _, p_value = pearsonr(self.df[first], self.df[second])
                # Written as a negation so a NaN p-value (constant column)
                # is treated as "not significant".
                if not p_value < significance_level:
                    continue
                # Compare magnitudes: a strongly negative predictor is as
                # informative as a strongly positive one.
                if abs(target_corr[first]) > abs(target_corr[second]):
                    winner, loser = first, second
                else:
                    winner, loser = second, first
                keep.add(winner)
                discard.add(loser)

        keep -= weak
        removed = weak | (discard - keep)

        self.numeric_columns = [
            column for column in features if column not in removed
        ]
        self.df = self.df.drop(columns=list(removed))
        self.features_to_remove = removed

    def train_model(self):
        """Select features, fit the network and save the MSE curve.

        The network is trained one iteration per ``fit`` call (``max_iter=1``
        with ``warm_start=True``) so the test-set MSE can be recorded after
        every epoch. The curve is written to ``app/NeuralNetwork/graph.png``
        and the final MSE is printed.
        """
        # Selection removes the target from ``numeric_columns``, so its
        # presence means selection has not run yet. Skipping it on later
        # calls makes retraining safe.
        if self.target_column in self.numeric_columns:
            self._select_features()

        print(
            f"List of numerical features that will be used to predict the target ({self.target_column}) :"
        )
        print(self.numeric_columns)

        X = self.df[self.numeric_columns]
        y = self.df[self.target_column]

        # A single RandomState instance is shared so its state advances across
        # the repeated fit() calls instead of being re-seeded every epoch.
        rng = (
            None
            if self.random_state is None
            else np.random.RandomState(self.random_state)
        )
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=rng
        )

        self.model = MLPRegressor(
            hidden_layer_sizes=self.dimension,
            activation="relu",
            solver="adam",
            max_iter=1,
            warm_start=True,
            random_state=rng,
        )

        mse_values = []
        # Every one-iteration fit() reports non-convergence; silence that only
        # for the duration of the loop rather than for the whole process.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=ConvergenceWarning)
            for _ in range(self.epochs):
                self.model.fit(X_train, y_train)
                y_pred = self.model.predict(X_test)
                mse_values.append(mean_squared_error(y_test, y_pred))

        # With epochs == 0 there is no measurement to report.
        final_mse = mse_values[-1] if mse_values else float("nan")

        graph_path = os.path.join("app", "NeuralNetwork", "graph.png")
        os.makedirs(os.path.dirname(graph_path), exist_ok=True)

        figure = plt.figure(figsize=(10, 6))
        try:
            plt.plot(range(len(mse_values)), mse_values, marker="o", linestyle="-")
            plt.title(f"Evolution of MSE During Training. Final MSE = {final_mse:.4f}")
            plt.xlabel("Epoch")
            plt.ylabel("Mean Squared Error")
            plt.grid(True)
            plt.savefig(graph_path)
        finally:
            # Release the figure so repeated training does not accumulate them.
            plt.close(figure)

        print(f"Final epoch MSE: {final_mse:.4f}")

    def predict(self, input_data: pd.DataFrame) -> np.ndarray:
        """Predict the target for already one-hot encoded input rows.

        Args:
            input_data: Frame containing every column in ``scaled_columns``
                (raw, unscaled values) and every column in
                ``numeric_columns``. It is not modified.

        Returns:
            Whatever ``self.model.predict`` returns for the prepared rows
            (for ``MLPRegressor``, an array with one value per input row).

        Raises:
            RuntimeError: If ``train_model`` has not been called yet.
        """
        if self.model is None:
            raise RuntimeError("train_model() must be called before predict().")

        # Work on a copy so the caller's frame keeps its raw values.
        frame = input_data.copy()

        # Scale exactly the columns the scaler was fitted on, in fit order.
        frame[self.scaled_columns] = self.scaler.transform(
            frame[self.scaled_columns]
        )

        # Selecting the trained feature list also discards removed features.
        features = frame[self.numeric_columns]

        print("Prediction using the following input : ")
        print(features.to_csv())

        water_intake = self.model.predict(features)
        return water_intake