# ICS5110 / NeuralNetwork / NeuralNetwork.py
# (Hugging Face page residue preserved as a comment: author "mohli",
#  commit "Changed path of file import", 5332346 verified)
import os
import warnings
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from sklearn.exceptions import ConvergenceWarning
from matplotlib import pyplot as plt
class My_NeuralNetwork:
    """Feed-forward regressor (sklearn MLPRegressor) that predicts a gym
    member's water intake from ``gym_members_exercise_tracking.csv``.

    ``__init__`` loads and preprocesses the dataset (scaling + one-hot
    encoding), ``train_model`` performs feature selection and the manual
    training loop, and ``predict`` scores new raw rows.
    """

    def __init__(self):
        # Upper bound on hidden layers (exposed for code that configures the net).
        self.MAX_LAYERS = 10
        self.target_column = "Water_Intake (liters)"
        self.model = None
        # Default hyper-parameters.
        self.num_layer = 3
        self.dimension = (32, 32, 32)  # hidden layer sizes
        # NOTE(review): historic misspelling ("treshold") kept — it is a public
        # attribute that external code may already set.
        self.correlation_treshold = 0.01
        self.epochs = 300
        # Load the dataset and preprocess it.
        csv_file = os.path.join("data", "gym_members_exercise_tracking.csv")
        df = pd.read_csv(csv_file, engine="python")
        df = df.dropna()  # remove rows with any null cell (just in case)
        # Mark the non-numeric features as pandas categories.
        df["Gender"] = df["Gender"].astype("category")
        df["Workout_Type"] = df["Workout_Type"].astype("category")
        # Record the numeric / categorical column names for later use.
        numeric_columns = list(df.select_dtypes(exclude=["category"]).columns)
        categorical_columns = list(df.select_dtypes(include=["category"]).columns)
        # Keep an untouched copy: `df` loses features during selection, this one does not.
        self.df_original = df.copy()
        # Exclude the target from the feature list so it is NOT scaled below.
        numeric_columns.remove(self.target_column)
        # Scale every numeric feature to [0, 1]; the fitted scaler is kept for predict().
        self.scaler = MinMaxScaler()
        df[numeric_columns] = self.scaler.fit_transform(df[numeric_columns])
        # One-hot encode the categorical features, as floats in {0.0, 1.0}.
        df_encoded = pd.get_dummies(df, columns=categorical_columns, drop_first=False)
        df_encoded = df_encoded.astype(float)
        # Columns created by the encoding. Because the target was removed from
        # numeric_columns above, this set also re-adds the target column —
        # train_model() relies on that when building the correlation matrix.
        new_columns = list(set(df_encoded.columns) - set(numeric_columns))
        numeric_columns.extend(new_columns)
        self.numeric_columns = numeric_columns
        self.df = df_encoded

    def train_model(self):
        """Select features, then train the MLP regressor.

        Side effects: prunes ``self.numeric_columns`` and ``self.df`` in place,
        stores the dropped features in ``self.features_to_remove``, saves an
        MSE-per-epoch plot to app/NeuralNetwork/graph.png, and prints progress.
        NOTE(review): because columns are removed in place, calling this method
        twice on the same instance will fail — confirm callers train only once.
        """
        # --- FEATURE SELECTION ---
        correlation_matrix = self.df[self.numeric_columns].corr()
        # Pearson p-value for every pair of features.
        p_values = pd.DataFrame(
            np.zeros((len(self.numeric_columns), len(self.numeric_columns))),
            columns=self.numeric_columns,
            index=self.numeric_columns,
        )
        for col1 in self.numeric_columns:
            for col2 in self.numeric_columns:
                if col1 != col2:
                    # scipy.stats.pearsonr returns (statistic, p-value).
                    _, p_value = pearsonr(self.df[col1], self.df[col2])
                    p_values.loc[col1, col2] = p_value
                else:
                    # Set the diagonal to 1 so a feature is never flagged as
                    # significantly correlated with itself.
                    p_values.loc[col1, col2] = 1
        # Pairs with p < 0.05 likely have a real relationship (candidate redundancy).
        correlated_columns = []
        for i, col1 in enumerate(self.numeric_columns):
            for j, col2 in enumerate(self.numeric_columns):
                if (
                    j > i  # visit each unordered pair once
                    and p_values.loc[col1, col2] < 0.05
                    and col1 != self.target_column
                    and col2 != self.target_column
                ):
                    correlated_columns.append((col1, col2, p_values.loc[col1, col2]))
        # Drop the target from the feature list (it was re-added by the one-hot
        # step in __init__; it must not be used as a predictor).
        self.numeric_columns.remove(self.target_column)
        # Features with (almost) no correlation with the target are removed.
        target_corr = correlation_matrix[self.target_column].copy()
        features_to_remove = set(
            target_corr[abs(target_corr) < self.correlation_treshold].index.to_list()
        )
        # For each significantly-correlated pair, keep the feature with the
        # stronger correlation to the target and mark the other as redundant.
        keep, remove = set(), set()
        for col1, col2, _ in correlated_columns:
            if target_corr[col1] > target_corr[col2]:
                keep.add(col1)
                remove.add(col2)
            else:
                keep.add(col2)
                remove.add(col1)
        keep -= features_to_remove  # an already-removed feature cannot be "kept"
        redundant_features = remove - keep  # redundant unless some pair keeps it
        features_to_remove = features_to_remove.union(redundant_features)
        # Remove the selected features from the feature list and the dataframe.
        for feature in list(features_to_remove):
            self.numeric_columns.remove(feature)
            self.df.drop(feature, axis=1, inplace=True)
        self.features_to_remove = features_to_remove
        print(
            f"List of numerical features that will be used to predict the target ({self.target_column}) :"
        )
        print(self.numeric_columns)
        # --- CREATE & TRAIN MODEL ---
        X = self.df[self.numeric_columns]
        y = self.df[self.target_column]
        # 20% of the dataset is held out as test data.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
        self.model = MLPRegressor(
            hidden_layer_sizes=self.dimension,
            activation="relu",  # ReLU activation function
            solver="adam",  # Adam optimizer
            max_iter=1,  # one optimisation pass per fit() call; loop below drives training
            warm_start=True,  # keep weights between fit() calls to measure MSE per epoch
        )
        mse_values = []
        mse = float("nan")  # defined even if self.epochs == 0 (used in title/print below)
        # Each 1-iteration fit() raises ConvergenceWarning; silence it locally
        # instead of mutating the process-wide warning filters (the original
        # filterwarnings call leaked past this method).
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=ConvergenceWarning)
            for epoch in range(self.epochs):
                # Train the model for one more iteration.
                self.model.fit(X_train, y_train)
                # Evaluate on the held-out test set.
                y_pred = self.model.predict(X_test)
                mse = mean_squared_error(y_test, y_pred)
                mse_values.append(mse)
        # --- SAVE THE EVOLUTION OF THE MSE THROUGHOUT THE TRAINING ---
        plt.figure(figsize=(10, 6))
        plt.plot(range(self.epochs), mse_values, marker='o', linestyle='-')
        plt.title(f"Evolution of MSE During Training. Final MSE = {mse:.4f}")
        plt.xlabel('Epoch')
        plt.ylabel('Mean Squared Error')
        plt.grid(True)
        plt.savefig(os.path.join("app", "NeuralNetwork", "graph.png"))
        plt.close()  # release the figure instead of leaking it between calls
        print(f"Final epoch MSE: {mse:.4f}")

    def predict(self, input_data: pd.DataFrame) -> float:
        """Predict water intake (liters) for the given raw feature rows.

        ``input_data`` must contain the raw numeric columns that were scaled
        during training plus the one-hot Gender / Workout_Type columns.
        NOTE(review): despite the ``-> float`` annotation this returns the raw
        ``model.predict`` output (one value per input row), exactly as the
        original did — kept for caller compatibility.
        """
        # Work on a copy: the original implementation scaled the caller's
        # dataframe in place, a surprising side effect for callers.
        input_data = input_data.copy()
        # One-hot columns are already 0/1 and were never seen by the scaler.
        one_hot_columns = {
            "Gender_Male",
            "Gender_Female",
            "Workout_Type_Strength",
            "Workout_Type_Yoga",
            "Workout_Type_HIIT",
            "Workout_Type_Cardio",
        }
        scale_columns = [c for c in input_data.columns if c not in one_hot_columns]
        # Scale the numeric inputs with the scaler fitted during training.
        input_data[scale_columns] = self.scaler.transform(input_data[scale_columns])
        # Keep only the features the model was trained on, in training order.
        input_data = input_data.drop(self.features_to_remove, axis=1, errors="ignore")
        input_data = input_data[self.numeric_columns]
        print("Prediction using the following input : ")
        print(input_data.to_csv())
        water_intake = self.model.predict(input_data)
        return water_intake