Changed path of file import
NeuralNetwork/NeuralNetwork.py  +224 -224
CHANGED
@@ -1,224 +1,224 @@
 import os
 import warnings
 import pandas as pd
 import numpy as np
 from scipy.stats import pearsonr
 from sklearn.preprocessing import MinMaxScaler
 from sklearn.model_selection import train_test_split
 from sklearn.neural_network import MLPRegressor
 from sklearn.metrics import mean_squared_error
 from sklearn.exceptions import ConvergenceWarning
 from matplotlib import pyplot as plt


 class My_NeuralNetwork:
     def __init__(self):
         self.MAX_LAYERS = 10
         self.target_column = "Water_Intake (liters)"
         self.model = None

         # default parameters
         self.num_layer = 3
         self.dimension = (32, 32, 32)
         self.correlation_treshold = 0.01
         self.epochs = 300

         # Load the dataset and preprocess it
-        csv_file = os.path.join("
+        csv_file = os.path.join("data", "gym_members_exercise_tracking.csv")
         df = pd.read_csv(csv_file, engine="python")

         df = df.dropna()  # Remove rows with any null cell (just in case)
         # Mark some of the features as Category.
         df["Gender"] = df["Gender"].astype("category")
         df["Workout_Type"] = df["Workout_Type"].astype("category")

         # get the names of the numerical and categorical columns for later
         numeric_columns = list(df.select_dtypes(exclude=["category"]).columns)
         categorical_columns = list(df.select_dtypes(include=["category"]).columns)

         self.df_original = (
             df.copy()
         )  # the df variable will have some features removed later but not this one

         # remove the target from the list of features
         numeric_columns.remove(self.target_column)

         self.scaler = MinMaxScaler()  # create a new MinMaxScaler
         df_scaled = self.scaler.fit_transform(
             df[numeric_columns]
         )  # scale all the numerical columns using the new MinMaxScaler
         df[numeric_columns] = df_scaled.copy()

         df_encoded = pd.get_dummies(
             df, columns=categorical_columns, drop_first=False
         )  # get one-hot encodings for all categorical values
         df_encoded = df_encoded.astype(
             float
         )  # convert the one-hot encodings into floats (values between 0.0 - 1.0)
         new_columns = list(
             set(df_encoded.columns) - set(numeric_columns)
         )  # the new columns: the one-hot columns plus the target, which re-enters the list here
         df = (
             df_encoded.copy()
         )  # the dataframe is now the one with all the one-hot encoded features
         numeric_columns.extend(
             new_columns
         )  # add the new columns to the list of numerical columns

         self.numeric_columns = numeric_columns
         self.df = df

     def train_model(self):
         # FEATURE SELECTION

         correlation_matrix = self.df[self.numeric_columns].corr()
         # calculate the p-value of Pearson's correlation coefficient for each element of the matrix
         p_values = pd.DataFrame(
             np.zeros((len(self.numeric_columns), len(self.numeric_columns))),
             columns=self.numeric_columns,
             index=self.numeric_columns,
         )
         # Calculate p-values for each pair
         for col1 in self.numeric_columns:
             for col2 in self.numeric_columns:
                 if col1 != col2:
                     _, p_value = pearsonr(
                         self.df[col1], self.df[col2]
                     )  # using scipy.stats.pearsonr to get the p-value for one pair of features
                     p_values.loc[col1, col2] = p_value
                 else:
                     p_values.loc[col1, col2] = (
                         1  # Set to 1 so a feature is never reported as correlated with itself
                     )

         # Identify variables that are correlated
         # When the p-value is smaller than 0.05, there is likely a “real” relationship between the variables.
         correlated_columns = []
         for i, col1 in enumerate(self.numeric_columns):
             for j, col2 in enumerate(self.numeric_columns):
                 if (
                     j > i
                     and p_values.loc[col1, col2] < 0.05
                     and col1 != self.target_column
                     and col2 != self.target_column
                 ):
                     correlated_columns.append((col1, col2, p_values.loc[col1, col2]))

         # remove the target from the list of features (it was re-added along with the one-hot columns)
         self.numeric_columns.remove(self.target_column)

         # Identify features with a low correlation with the target
         target_corr = correlation_matrix[self.target_column].copy()
         correlation_treshold = self.correlation_treshold
         features_to_remove = target_corr[abs(target_corr) < correlation_treshold].index
         features_to_remove = set(features_to_remove.to_list())

         # Identify redundant features using p-values
         x = {"keep": set(), "remove": set()}
         for corr_duo in correlated_columns:
             # put the feature with the higher correlation to the target variable in "keep" and the other one in "remove"
             if target_corr[corr_duo[0]] > target_corr[corr_duo[1]]:
                 x["keep"].add(corr_duo[0])
                 x["remove"].add(corr_duo[1])
             else:
                 x["keep"].add(corr_duo[1])
                 x["remove"].add(corr_duo[0])

         # drop from "keep" any feature that is already scheduled for removal
         x["keep"] = x["keep"] - features_to_remove

         # features that are in "remove" and not in "keep" are redundant
         redundant_features = x["remove"] - x["keep"]

         features_to_remove = features_to_remove.union(redundant_features)

         # Remove the selected features from the dataframe
         for feature in list(features_to_remove):
             self.numeric_columns.remove(feature)
             self.df.drop(feature, axis=1, inplace=True)
         self.features_to_remove = features_to_remove

         print(
             f"List of numerical features that will be used to predict the target ({self.target_column}):"
         )
         print(self.numeric_columns)

         # CREATE & TRAIN MODEL

         # split the data
         X = self.df[self.numeric_columns]
         y = self.df[self.target_column]

         # 20% of the dataset will be used as test data
         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

         self.model = MLPRegressor(
             hidden_layer_sizes=self.dimension,
             activation="relu",  # ReLU activation function
             solver="adam",  # Adam optimizer
             max_iter=1,  # One iteration per fit call since the training loop is defined below
             warm_start=True,
         )  # Used to measure the MSE throughout the iterations

         # ignore the warning raised because we're using a manual loop for training
         warnings.filterwarnings("ignore", category=ConvergenceWarning)

         # Track MSE values during training
         mse_values = []
         epochs = self.epochs

         for epoch in range(epochs):
             # Train the model
             self.model.fit(X_train, y_train)

             # Predict on the test set
             y_pred = self.model.predict(X_test)

             # Evaluate the model
             mse = mean_squared_error(y_test, y_pred)
             mse_values.append(mse)

         # SAVE THE EVOLUTION OF THE MSE THROUGHOUT THE TRAINING
         plt.figure(figsize=(10, 6))
         plt.plot(range(epochs), mse_values, marker='o', linestyle='-')
         plt.title(f"Evolution of MSE During Training. Final MSE = {mse:.4f}")
         plt.xlabel('Epoch')
         plt.ylabel('Mean Squared Error')
         plt.grid(True)
         plt.savefig(os.path.join("app", "NeuralNetwork", "graph.png"))

         print(f"Final epoch MSE: {mse:.4f}")

     def predict(self, input_data: pd.DataFrame) -> float:
         # scale the input using the scaler fitted during training
         df_used_for_scaling = input_data[
             [
                 col
                 for col in input_data.columns
                 if col
                 not in [
                     "Gender_Male",
                     "Gender_Female",
                     "Workout_Type_Strength",
                     "Workout_Type_Yoga",
                     "Workout_Type_HIIT",
                     "Workout_Type_Cardio",
                 ]
             ]
         ]
         scaled_input = self.scaler.transform(
             input_data[[col for col in df_used_for_scaling.columns]]
         )
         input_data[df_used_for_scaling.columns] = scaled_input.copy()

         # keep only the features required for the prediction
         input_data = input_data.drop(self.features_to_remove, axis=1, errors="ignore")

         input_data = input_data[self.numeric_columns]

         print("Prediction using the following input:")
         print(input_data.to_csv())

         water_intake = self.model.predict(input_data)
         return water_intake
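For reference, a minimal usage sketch of the class above; it is not part of the diff. It assumes the script runs from the repository root (so the relative "data" CSV path and the app/NeuralNetwork output directory both resolve) and that NeuralNetwork is importable as a package. The input row is rebuilt from df_original with the same one-hot encoding that __init__ applies, since predict() expects encoded but unscaled columns.

    import pandas as pd

    from NeuralNetwork.NeuralNetwork import My_NeuralNetwork

    # Load, clean, scale and one-hot encode the dataset, then train.
    nn = My_NeuralNetwork()
    nn.train_model()

    # Rebuild a one-row prediction input from the raw dataframe: drop the
    # target, then apply the same one-hot encoding as __init__. Because
    # "Gender" and "Workout_Type" are category dtype, get_dummies emits a
    # column for every level even when only one row is present.
    row = nn.df_original.drop(columns=[nn.target_column]).iloc[[0]].copy()
    row = pd.get_dummies(row, columns=["Gender", "Workout_Type"], drop_first=False)
    row = row.astype(float)

    # predict() scales the numeric columns itself and drops unused features.
    water_intake = nn.predict(row)
    print(f"Predicted water intake: {water_intake[0]:.2f} liters")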