import os
import warnings
from itertools import combinations
from typing import Optional

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy.stats import pearsonr
from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import MinMaxScaler


class My_NeuralNetwork:
    """MLP regressor that predicts the water-intake column of the gym dataset.

    Construction loads the CSV file, min-max scales every non-categorical
    column except the target, and one-hot encodes the categorical columns.
    ``train_model`` then selects features and fits the network, and
    ``predict`` applies the same scaling and feature selection to new rows.

    The tunable attributes (``dimension``, ``correlation_treshold``,
    ``epochs``) may be changed between calls to ``train_model``; every call
    restarts from the full encoded dataset.
    """

    # Two features are considered correlated with each other when the p-value
    # of their Pearson correlation is below this level.
    P_VALUE_THRESHOLD = 0.05

    def __init__(self, csv_file: Optional[str] = None) -> None:
        """Load and preprocess the dataset.

        Args:
            csv_file: Path of the CSV file to load. Defaults to
                ``data/gym_members_exercise_tracking.csv``.

        Raises:
            ValueError: If the target column is absent from the dataset.
        """
        self.MAX_LAYERS = 10
        self.target_column = "Water_Intake (liters)"
        self.model = None

        # default parameters
        self.num_layer = 3
        self.dimension = (32, 32, 32)
        self.correlation_treshold = 0.01
        self.epochs = 300

        # Nothing has been discarded until train_model() has run.
        self.features_to_remove = set()

        # Load the dataset and preprocess it
        if csv_file is None:
            csv_file = os.path.join("data", "gym_members_exercise_tracking.csv")
        df = pd.read_csv(csv_file, engine="python")
        df = df.dropna()  # Remove rows with any null cell (just in case)

        if self.target_column not in df.columns:
            raise ValueError(
                f"Target column {self.target_column!r} not found in {csv_file!r}."
            )

        # Mark the categorical features so they can be told apart by dtype.
        df["Gender"] = df["Gender"].astype("category")
        df["Workout_Type"] = df["Workout_Type"].astype("category")
        categorical_columns = list(df.select_dtypes(include=["category"]).columns)

        # Every non-categorical column except the target gets scaled.
        scaled_columns = [
            col
            for col in df.select_dtypes(exclude=["category"]).columns
            if col != self.target_column
        ]

        # Unscaled, un-encoded snapshot; never modified afterwards.
        self.df_original = df.copy()

        self.scaler = MinMaxScaler()
        df[scaled_columns] = self.scaler.fit_transform(df[scaled_columns])

        # One-hot encode the categorical columns and store everything as floats.
        df = pd.get_dummies(df, columns=categorical_columns, drop_first=False)
        df = df.astype(float)

        # Pristine state from which every training run restarts.
        self._scaled_columns = scaled_columns
        self._encoded_df = df
        self._all_columns = list(df.columns)  # deterministic order, target included

        # Public working state; train_model() narrows both to the selected features.
        self.numeric_columns = list(self._all_columns)
        self.df = df.copy()

    def _select_features_to_remove(self, data: pd.DataFrame, candidates: list) -> set:
        """Return the candidate features that should not be fed to the model.

        A feature is discarded when either
          * the absolute value of its correlation with the target is below
            ``self.correlation_treshold``, or
          * it is significantly correlated with another feature (Pearson
            p-value below ``P_VALUE_THRESHOLD``) and is never the more
            target-relevant member of such a pair.

        Args:
            data: Encoded dataframe containing the candidates and the target.
            candidates: Feature column names, target excluded.
        """
        target = self.target_column
        target_corr = data[[*candidates, target]].corr()[target]

        # Relevance is the magnitude of the correlation: a strong negative
        # correlation is as informative as a strong positive one.
        relevance = target_corr.abs().to_dict()

        weak = {
            col for col in candidates if relevance[col] < self.correlation_treshold
        }

        keep, drop = set(), set()
        # The p-value is symmetric, so each unordered pair is tested once.
        for first, second in combinations(candidates, 2):
            _, p_value = pearsonr(data[first], data[second])
            if not p_value < self.P_VALUE_THRESHOLD:
                continue
            # Of two correlated features, keep the one most related to the target.
            if relevance[first] > relevance[second]:
                keep.add(first)
                drop.add(second)
            else:
                keep.add(second)
                drop.add(first)

        # A weak feature cannot rescue itself by winning a pairwise comparison.
        keep -= weak
        redundant = drop - keep
        return weak | redundant

    @staticmethod
    def _save_mse_plot(mse_values: list) -> None:
        """Save the per-epoch test MSE curve to ``app/NeuralNetwork/graph.png``."""
        output_path = os.path.join("app", "NeuralNetwork", "graph.png")
        os.makedirs(os.path.dirname(output_path), exist_ok=True)

        fig, ax = plt.subplots(figsize=(10, 6))
        try:
            ax.plot(range(len(mse_values)), mse_values, marker="o", linestyle="-")
            ax.set_title(
                f"Evolution of MSE During Training. Final MSE = {mse_values[-1]:.4f}"
            )
            ax.set_xlabel("Epoch")
            ax.set_ylabel("Mean Squared Error")
            ax.grid(True)
            fig.savefig(output_path)
        finally:
            # Release the figure so repeated training runs do not accumulate them.
            plt.close(fig)

    def train_model(self) -> None:
        """Select features, train the MLP, and save the MSE curve.

        After a successful run ``self.model`` holds the trained regressor,
        ``self.numeric_columns`` the features it expects,
        ``self.features_to_remove`` the discarded features, and ``self.df``
        the encoded dataset without the discarded features. The method can be
        called repeatedly.

        Raises:
            ValueError: If ``self.epochs`` is below 1 or if feature selection
                leaves no feature to train on.
        """
        if self.epochs < 1:
            raise ValueError("epochs must be at least 1 to train the model.")

        target = self.target_column
        data = self._encoded_df.copy()
        candidates = [col for col in self._all_columns if col != target]

        # FEATURE SELECTION
        features_to_remove = self._select_features_to_remove(data, candidates)
        selected = [col for col in candidates if col not in features_to_remove]
        if not selected:
            raise ValueError(
                "Feature selection removed every feature; "
                "lower correlation_treshold and retry."
            )

        print(
            f"List of numerical features that will be used to predict the target ({self.target_column}) :"
        )
        print(selected)

        # CREATE & TRAIN MODEL
        X = data[selected]
        y = data[target]
        # 20% of the dataset will be used as test data
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

        model = MLPRegressor(
            hidden_layer_sizes=self.dimension,
            activation="relu",  # ReLU activation function
            solver="adam",  # Adam optimizer
            max_iter=1,  # one iteration per fit() call; the loop below drives training
            warm_start=True,  # each fit() call continues from the previous weights
        )

        # Track the test MSE after every epoch.
        mse_values = []
        # Fitting with max_iter=1 warns about non-convergence on every call;
        # silence that only for the duration of the manual loop.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=ConvergenceWarning)
            for _ in range(self.epochs):
                model.fit(X_train, y_train)
                mse_values.append(mean_squared_error(y_test, model.predict(X_test)))

        # Publish the new state only once training has succeeded, so the model
        # and the feature list always agree.
        self.features_to_remove = features_to_remove
        self.numeric_columns = selected
        self.df = data.drop(columns=list(features_to_remove))
        self.model = model

        # SAVE THE EVOLUTION OF THE MSE THROUGHOUT THE TRAINING
        self._save_mse_plot(mse_values)
        print(f"Final epoch MSE: {mse_values[-1]:.4f}")

    def predict(self, input_data: pd.DataFrame) -> np.ndarray:
        """Predict the target for already one-hot-encoded, unscaled rows.

        Args:
            input_data: Rows holding the raw (unscaled) numeric columns and the
                one-hot columns. The frame is not modified.

        Returns:
            The array returned by the underlying regressor, one value per row.

        Raises:
            RuntimeError: If ``train_model`` has not been called yet.
            ValueError: If a column required by the scaler is missing.
        """
        if self.model is None:
            raise RuntimeError("train_model() must be called before predict().")

        missing = [col for col in self._scaled_columns if col not in input_data.columns]
        if missing:
            raise ValueError(f"input_data is missing required columns: {missing}")

        # Work on a copy so the caller's frame is not scaled in place.
        features = input_data.copy()

        # Scale with the scaler fitted in __init__, using the fitted column order.
        features[self._scaled_columns] = self.scaler.transform(
            features[self._scaled_columns]
        )

        # keep only the required features for the prediction
        features = features[self.numeric_columns]

        print("Prediction using the following input : ")
        print(features.to_csv())

        return self.model.predict(features)