Spaces:

mohli
/

ICS5110

Sleeping

App Files Files Community

ICS5110 / NeuralNetwork /NeuralNetwork.py

mohli

Upload 19 files

d17844e verified 4 months ago

raw

history blame

9.12 kB

	import os
	import warnings
	import pandas as pd
	import numpy as np
	from scipy.stats import pearsonr
	from sklearn.preprocessing import MinMaxScaler
	from sklearn.model_selection import train_test_split
	from sklearn.neural_network import MLPRegressor
	from sklearn.metrics import mean_squared_error
	from sklearn.exceptions import ConvergenceWarning
	from matplotlib import pyplot as plt



	class My_NeuralNetwork:
	    """MLP regressor predicting "Water_Intake (liters)" from the gym-members CSV.

	    Typical use: instantiate (loads and preprocesses the dataset), optionally
	    adjust ``dimension``, ``correlation_treshold`` or ``epochs``, call
	    ``train_model()``, then call ``predict()`` with a one-hot encoded frame.
	    ``train_model()`` may be called repeatedly; each call restarts from the
	    preprocessed dataset built in ``__init__``.
	    """

	    def __init__(self):
	        """Load the dataset, min-max scale the numeric features and one-hot encode the rest.

	        Reads ``app/data/gym_members_exercise_tracking.csv`` relative to the
	        current working directory.
	        """
	        # NOTE(review): MAX_LAYERS and num_layer are not read anywhere in this
	        # class; presumably they are consumed by the caller - confirm.
	        self.MAX_LAYERS = 10
	        self.target_column = "Water_Intake (liters)"
	        self.model = None
	        # Populated by train_model(); defined here so the attribute always exists.
	        self.features_to_remove = set()

	        # default parameters
	        self.num_layer = 3
	        self.dimension = (32, 32, 32)
	        self.correlation_treshold = 0.01
	        self.epochs = 300

	        # Load the dataset and preprocess it
	        csv_file = os.path.join("app", "data", "gym_members_exercise_tracking.csv")
	        df = pd.read_csv(csv_file, engine="python")
	        df = df.dropna()  # remove rows with any null cell (just in case)

	        # Mark the two textual features as categorical so they can be told
	        # apart from the numeric ones below.
	        df["Gender"] = df["Gender"].astype("category")
	        df["Workout_Type"] = df["Workout_Type"].astype("category")

	        numeric_columns = list(df.select_dtypes(exclude=["category"]).columns)
	        categorical_columns = list(df.select_dtypes(include=["category"]).columns)

	        # Untouched copy of the cleaned data (unscaled, not encoded).
	        self.df_original = df.copy()

	        # The target must not be scaled, so take it out before fitting the scaler.
	        numeric_columns.remove(self.target_column)

	        self.scaler = MinMaxScaler()
	        df[numeric_columns] = self.scaler.fit_transform(df[numeric_columns])
	        # Remember exactly which columns (and in which order) the scaler was
	        # fitted on, so predict() can feed it the same layout.
	        self.scaled_columns = list(numeric_columns)

	        # One-hot encode every categorical column and store the flags as floats.
	        df = pd.get_dummies(df, columns=categorical_columns, drop_first=False).astype(float)

	        # Append the columns not yet listed (the target and the one-hot
	        # columns) in dataframe order, which keeps the feature order
	        # deterministic from run to run.
	        already_listed = set(numeric_columns)
	        numeric_columns.extend(col for col in df.columns if col not in already_listed)

	        # Pristine state that train_model() starts from on every call.
	        self._all_columns = tuple(numeric_columns)
	        self._prepared_df = df

	        self.numeric_columns = numeric_columns
	        self.df = df

	    def _select_features(self, df, columns):
	        """Return the set of feature names that should be dropped before training.

	        A feature is dropped when either
	        * the absolute value of its Pearson correlation with the target is
	          below ``self.correlation_treshold``, or
	        * it is significantly correlated (p-value < 0.05) with another feature
	          that is more strongly correlated with the target, and it is not
	          itself the preferred member of some other correlated pair.

	        Args:
	            df: frame containing every name in ``columns``.
	            columns: candidate column names, including ``self.target_column``.
	        """
	        target = self.target_column
	        features = [col for col in columns if col != target]

	        # Correlation of each feature with the target; the target's own entry
	        # is dropped so it can never be flagged for removal.
	        target_corr = df[columns].corr()[target].drop(target)
	        # Strength is the magnitude of the correlation: a strongly negative
	        # relationship is as informative as a strongly positive one.
	        strength = target_corr.abs()
	        weak = set(strength[strength < self.correlation_treshold].index)

	        keep, discard = set(), set()
	        # Only unordered feature pairs are needed, so each p-value is computed once.
	        for position, first in enumerate(features):
	            for second in features[position + 1:]:
	                _, p_value = pearsonr(df[first], df[second])
	                if not p_value < 0.05:
	                    continue
	                # Of two related features, prefer the one that tracks the target best.
	                if strength[first] > strength[second]:
	                    keep.add(first)
	                    discard.add(second)
	                else:
	                    keep.add(second)
	                    discard.add(first)

	        # A weak feature is removed even if it won one of the comparisons.
	        keep -= weak
	        return weak | (discard - keep)

	    def train_model(self):
	        """Select features, train the MLP and save the MSE-per-epoch curve.

	        Side effects: sets ``self.features_to_remove``, ``self.numeric_columns``
	        (the selected feature names), ``self.df`` (prepared data without the
	        removed features) and ``self.model``; prints the selected features and
	        the final MSE; writes ``app/NeuralNetwork/graph.png``.

	        Raises:
	            ValueError: if ``self.epochs`` is smaller than 1.
	        """
	        epochs = self.epochs
	        if epochs < 1:
	            raise ValueError("epochs must be at least 1 to train the model")

	        # FEATURE SELECTION
	        # Always start from the full preprocessed dataset so that calling this
	        # method again (e.g. after changing a parameter) works.
	        all_columns = list(self._all_columns)
	        features_to_remove = self._select_features(self._prepared_df, all_columns)
	        self.features_to_remove = features_to_remove
	        self.numeric_columns = [
	            col
	            for col in all_columns
	            if col != self.target_column and col not in features_to_remove
	        ]
	        self.df = self._prepared_df.drop(columns=list(features_to_remove))

	        print(
	            f"List of numerical features that will be used to predict the target ({self.target_column}) :"
	        )
	        print(self.numeric_columns)

	        # CREATE & TRAIN MODEL
	        X = self.df[self.numeric_columns]
	        y = self.df[self.target_column]

	        # 20% of the dataset is held out as test data
	        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

	        # max_iter=1 together with warm_start=True makes every fit() call run a
	        # single additional iteration, so the MSE can be recorded after each one.
	        self.model = MLPRegressor(
	            hidden_layer_sizes=self.dimension,
	            activation="relu",
	            solver="adam",
	            max_iter=1,
	            warm_start=True,
	        )

	        mse_values = []
	        # Each one-iteration fit() emits a ConvergenceWarning; silence it only
	        # for the duration of the loop instead of changing the global filters.
	        with warnings.catch_warnings():
	            warnings.filterwarnings("ignore", category=ConvergenceWarning)
	            for _ in range(epochs):
	                self.model.fit(X_train, y_train)
	                y_pred = self.model.predict(X_test)
	                mse_values.append(mean_squared_error(y_test, y_pred))
	        final_mse = mse_values[-1]

	        # SAVE THE EVOLUTION OF THE MSE THROUGHOUT THE TRAINING
	        fig = plt.figure(figsize=(10, 6))
	        try:
	            plt.plot(range(epochs), mse_values, marker="o", linestyle="-")
	            plt.title(f"Evolution of MSE During Training. Final MSE = {final_mse:.4f}")
	            plt.xlabel("Epoch")
	            plt.ylabel("Mean Squared Error")
	            plt.grid(True)
	            plt.savefig(os.path.join("app", "NeuralNetwork", "graph.png"))
	        finally:
	            # pyplot keeps figures alive until closed; release it on every path.
	            plt.close(fig)

	        print(f"Final epoch MSE: {final_mse:.4f}")

	    def predict(self, input_data: pd.DataFrame) -> np.ndarray:
	        """Predict the target for already one-hot encoded, unscaled input rows.

	        Args:
	            input_data: frame holding at least the columns the scaler was
	                fitted on plus the one-hot columns kept by feature selection.
	                It is not modified.

	        Returns:
	            Whatever ``self.model.predict`` returns for the prepared rows
	            (one prediction per input row).

	        Raises:
	            RuntimeError: if ``train_model()`` has not been called yet.
	        """
	        if self.model is None:
	            raise RuntimeError("train_model() must be called before predict()")

	        # Work on a copy: scaling in place would corrupt the caller's frame and
	        # double-scale it if the same frame were passed again.
	        features = input_data.copy()

	        # Scale with the scaler fitted in __init__, using the fitted column order.
	        scaled_columns = self.scaled_columns
	        features[scaled_columns] = self.scaler.transform(features[scaled_columns])

	        # Keep only the features the model was trained on, in training order.
	        features = features[self.numeric_columns]

	        print("Prediction using the following input : ")
	        print(features.to_csv())

	        return self.model.predict(features)