# ICS5110 / NeuralNetwork / NeuralNetwork.py
# (Hugging Face page residue preserved as a comment: author "mohli",
#  commit "Changed path of file import", 5332346 verified)
import os
import warnings
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from sklearn.exceptions import ConvergenceWarning
from matplotlib import pyplot as plt
class My_NeuralNetwork:
    """Feed-forward regressor (sklearn MLPRegressor) that predicts a gym
    member's water intake from ``gym_members_exercise_tracking.csv``.

    ``__init__`` loads and preprocesses the dataset (scaling + one-hot
    encoding), ``train_model`` performs feature selection and the manual
    training loop, and ``predict`` scores new raw rows.
    """

    def __init__(self):
        # Upper bound on hidden layers (exposed for code that configures the net).
        self.MAX_LAYERS = 10
        self.target_column = "Water_Intake (liters)"
        self.model = None
        # Default hyper-parameters.
        self.num_layer = 3
        self.dimension = (32, 32, 32)  # hidden layer sizes
        # NOTE(review): historic misspelling ("treshold") kept — it is a public
        # attribute that external code may already set.
        self.correlation_treshold = 0.01
        self.epochs = 300
        # Load the dataset and preprocess it.
        csv_file = os.path.join("data", "gym_members_exercise_tracking.csv")
        df = pd.read_csv(csv_file, engine="python")
        df = df.dropna()  # remove rows with any null cell (just in case)
        # Mark the non-numeric features as pandas categories.
        df["Gender"] = df["Gender"].astype("category")
        df["Workout_Type"] = df["Workout_Type"].astype("category")
        # Record the numeric / categorical column names for later use.
        numeric_columns = list(df.select_dtypes(exclude=["category"]).columns)
        categorical_columns = list(df.select_dtypes(include=["category"]).columns)
        # Keep an untouched copy: `df` loses features during selection, this one does not.
        self.df_original = df.copy()
        # Exclude the target from the feature list so it is NOT scaled below.
        numeric_columns.remove(self.target_column)
        # Scale every numeric feature to [0, 1]; the fitted scaler is kept for predict().
        self.scaler = MinMaxScaler()
        df[numeric_columns] = self.scaler.fit_transform(df[numeric_columns])
        # One-hot encode the categorical features, as floats in {0.0, 1.0}.
        df_encoded = pd.get_dummies(df, columns=categorical_columns, drop_first=False)
        df_encoded = df_encoded.astype(float)
        # Columns created by the encoding. Because the target was removed from
        # numeric_columns above, this set also re-adds the target column —
        # train_model() relies on that when building the correlation matrix.
        new_columns = list(set(df_encoded.columns) - set(numeric_columns))
        numeric_columns.extend(new_columns)
        self.numeric_columns = numeric_columns
        self.df = df_encoded

    def train_model(self):
        """Select features, then train the MLP regressor.

        Side effects: prunes ``self.numeric_columns`` and ``self.df`` in place,
        stores the dropped features in ``self.features_to_remove``, saves an
        MSE-per-epoch plot to app/NeuralNetwork/graph.png, and prints progress.
        NOTE(review): because columns are removed in place, calling this method
        twice on the same instance will fail — confirm callers train only once.
        """
        # --- FEATURE SELECTION ---
        correlation_matrix = self.df[self.numeric_columns].corr()
        # Pearson p-value for every pair of features.
        p_values = pd.DataFrame(
            np.zeros((len(self.numeric_columns), len(self.numeric_columns))),
            columns=self.numeric_columns,
            index=self.numeric_columns,
        )
        for col1 in self.numeric_columns:
            for col2 in self.numeric_columns:
                if col1 != col2:
                    # scipy.stats.pearsonr returns (statistic, p-value).
                    _, p_value = pearsonr(self.df[col1], self.df[col2])
                    p_values.loc[col1, col2] = p_value
                else:
                    # Set the diagonal to 1 so a feature is never flagged as
                    # significantly correlated with itself.
                    p_values.loc[col1, col2] = 1
        # Pairs with p < 0.05 likely have a real relationship (candidate redundancy).
        correlated_columns = []
        for i, col1 in enumerate(self.numeric_columns):
            for j, col2 in enumerate(self.numeric_columns):
                if (
                    j > i  # visit each unordered pair once
                    and p_values.loc[col1, col2] < 0.05
                    and col1 != self.target_column
                    and col2 != self.target_column
                ):
                    correlated_columns.append((col1, col2, p_values.loc[col1, col2]))
        # Drop the target from the feature list (it was re-added by the one-hot
        # step in __init__; it must not be used as a predictor).
        self.numeric_columns.remove(self.target_column)
        # Features with (almost) no correlation with the target are removed.
        target_corr = correlation_matrix[self.target_column].copy()
        features_to_remove = set(
            target_corr[abs(target_corr) < self.correlation_treshold].index.to_list()
        )
        # For each significantly-correlated pair, keep the feature with the
        # stronger correlation to the target and mark the other as redundant.
        keep, remove = set(), set()
        for col1, col2, _ in correlated_columns:
            if target_corr[col1] > target_corr[col2]:
                keep.add(col1)
                remove.add(col2)
            else:
                keep.add(col2)
                remove.add(col1)
        keep -= features_to_remove  # an already-removed feature cannot be "kept"
        redundant_features = remove - keep  # redundant unless some pair keeps it
        features_to_remove = features_to_remove.union(redundant_features)
        # Remove the selected features from the feature list and the dataframe.
        for feature in list(features_to_remove):
            self.numeric_columns.remove(feature)
            self.df.drop(feature, axis=1, inplace=True)
        self.features_to_remove = features_to_remove
        print(
            f"List of numerical features that will be used to predict the target ({self.target_column}) :"
        )
        print(self.numeric_columns)
        # --- CREATE & TRAIN MODEL ---
        X = self.df[self.numeric_columns]
        y = self.df[self.target_column]
        # 20% of the dataset is held out as test data.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
        self.model = MLPRegressor(
            hidden_layer_sizes=self.dimension,
            activation="relu",  # ReLU activation function
            solver="adam",  # Adam optimizer
            max_iter=1,  # one optimisation pass per fit() call; loop below drives training
            warm_start=True,  # keep weights between fit() calls to measure MSE per epoch
        )
        mse_values = []
        mse = float("nan")  # defined even if self.epochs == 0 (used in title/print below)
        # Each 1-iteration fit() raises ConvergenceWarning; silence it locally
        # instead of mutating the process-wide warning filters (the original
        # filterwarnings call leaked past this method).
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=ConvergenceWarning)
            for epoch in range(self.epochs):
                # Train the model for one more iteration.
                self.model.fit(X_train, y_train)
                # Evaluate on the held-out test set.
                y_pred = self.model.predict(X_test)
                mse = mean_squared_error(y_test, y_pred)
                mse_values.append(mse)
        # --- SAVE THE EVOLUTION OF THE MSE THROUGHOUT THE TRAINING ---
        plt.figure(figsize=(10, 6))
        plt.plot(range(self.epochs), mse_values, marker='o', linestyle='-')
        plt.title(f"Evolution of MSE During Training. Final MSE = {mse:.4f}")
        plt.xlabel('Epoch')
        plt.ylabel('Mean Squared Error')
        plt.grid(True)
        plt.savefig(os.path.join("app", "NeuralNetwork", "graph.png"))
        plt.close()  # release the figure instead of leaking it between calls
        print(f"Final epoch MSE: {mse:.4f}")

    def predict(self, input_data: pd.DataFrame) -> float:
        """Predict water intake (liters) for the given raw feature rows.

        ``input_data`` must contain the raw numeric columns that were scaled
        during training plus the one-hot Gender / Workout_Type columns.
        NOTE(review): despite the ``-> float`` annotation this returns the raw
        ``model.predict`` output (one value per input row), exactly as the
        original did — kept for caller compatibility.
        """
        # Work on a copy: the original implementation scaled the caller's
        # dataframe in place, a surprising side effect for callers.
        input_data = input_data.copy()
        # One-hot columns are already 0/1 and were never seen by the scaler.
        one_hot_columns = {
            "Gender_Male",
            "Gender_Female",
            "Workout_Type_Strength",
            "Workout_Type_Yoga",
            "Workout_Type_HIIT",
            "Workout_Type_Cardio",
        }
        scale_columns = [c for c in input_data.columns if c not in one_hot_columns]
        # Scale the numeric inputs with the scaler fitted during training.
        input_data[scale_columns] = self.scaler.transform(input_data[scale_columns])
        # Keep only the features the model was trained on, in training order.
        input_data = input_data.drop(self.features_to_remove, axis=1, errors="ignore")
        input_data = input_data[self.numeric_columns]
        print("Prediction using the following input : ")
        print(input_data.to_csv())
        water_intake = self.model.predict(input_data)
        return water_intake