import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import LabelEncoder, StandardScaler


class KNNModel:
    """K-nearest-neighbours regressor for the ``Fat_Percentage`` column.

    Typical workflow::

        model = KNNModel()
        X_train, X_test, y_train, y_test = model.load_and_preprocess_data()
        model.find_optimal_k(X_train, y_train)
        model.train_model(X_train, y_train)
        r2, mse = model.evaluate_model(X_test, y_test)
    """

    def __init__(self):
        """Set up default paths and empty model state; no I/O happens here."""
        self.target_column = "Fat_Percentage"
        self.data_path = os.path.join("data", "gym_members_exercise_tracking.csv")
        self.data = None  # DataFrame loaded by load_and_preprocess_data()
        self.selected_features = None  # feature names chosen by correlation
        self.label_encoders = {}  # column name -> fitted LabelEncoder
        self.scaler = StandardScaler()
        self.knn = None  # fitted KNeighborsRegressor, set by train_model()
        self.optimal_k = None  # set by find_optimal_k()

    def load_and_preprocess_data(self):
        """Load the CSV, encode categoricals, select features, scale and split.

        Features are the columns whose absolute correlation with the target
        is at least 0.5. The scaler is fitted on the training split only so
        that no test-set statistics leak into training.

        Returns:
            The tuple ``(X_train, X_test, y_train, y_test)`` where the ``X``
            parts are scaled arrays and the ``y`` parts are taken from the
            target column.

        Raises:
            ValueError: If a required categorical column is missing, or if no
                feature reaches the correlation threshold.
        """
        self.data = pd.read_csv(self.data_path)

        # Kept as a separate up-front check so a missing 'Experience_Level'
        # is reported before any other missing categorical column.
        if "Experience_Level" not in self.data.columns:
            raise ValueError("'Experience_Level' column not found in the dataset.")

        # Label-encode the categorical columns, keeping the encoders so the
        # same mapping can be reused later.
        categorical_features = ["Gender", "Experience_Level", "Workout_Type"]
        for feature in categorical_features:
            if feature not in self.data.columns:
                raise ValueError(f"'{feature}' column not found in the dataset.")
            encoder = LabelEncoder()
            self.data[feature] = encoder.fit_transform(self.data[feature])
            self.label_encoders[feature] = encoder

        # Restrict to numeric/bool columns: newer pandas raises when corr()
        # encounters a column it cannot convert to float.
        numeric_data = self.data.select_dtypes(include=[np.number, "bool"])
        target_corr = numeric_data.corr()[self.target_column].sort_values(
            ascending=False
        )
        strongly_correlated = target_corr[target_corr.abs() >= 0.5].index
        # Filter rather than list.remove(): the target is absent from the
        # selection when its self-correlation is NaN (constant target).
        self.selected_features = [
            name for name in strongly_correlated if name != self.target_column
        ]
        if not self.selected_features:
            raise ValueError(
                f"No feature has an absolute correlation of at least 0.5 "
                f"with '{self.target_column}'."
            )

        X = self.data[self.selected_features]
        y = self.data[self.target_column]

        # Split first, then fit the scaler on the training rows only.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )
        X_train_scaled = self.scaler.fit_transform(X_train)
        X_test_scaled = self.scaler.transform(X_test)
        return X_train_scaled, X_test_scaled, y_train, y_test

    def find_optimal_k(self, X_train, y_train, k_range=20):
        """Pick the k in ``1..k_range`` with the best 5-fold cross-validated R^2.

        Also saves a plot of R^2 against k to ``KNN/optimal_k_plot.png``,
        creating the ``KNN`` directory if needed.

        Args:
            X_train: Training features.
            y_train: Training targets.
            k_range: Largest number of neighbours to try (inclusive).

        Returns:
            The selected k, which is also stored in ``self.optimal_k``.

        Raises:
            ValueError: If ``k_range`` is smaller than 1 or every candidate k
                produced a NaN score.
        """
        if k_range < 1:
            raise ValueError("k_range must be at least 1.")

        k_values = range(1, k_range + 1)
        cv_scores = np.array(
            [
                cross_val_score(
                    KNeighborsRegressor(n_neighbors=k),
                    X_train,
                    y_train,
                    cv=5,
                    scoring="r2",
                ).mean()
                for k in k_values
            ]
        )

        # A k larger than a fold's sample count yields NaN scores; plain
        # argmax would select such a NaN entry, so ignore NaNs explicitly.
        if np.all(np.isnan(cv_scores)):
            raise ValueError("Cross-validation produced no valid score for any k.")
        self.optimal_k = k_values[int(np.nanargmax(cv_scores))]

        # Plot k vs. R^2
        plot_dir = "KNN"
        os.makedirs(plot_dir, exist_ok=True)
        figure = plt.figure(figsize=(10, 6))
        try:
            plt.plot(k_values, cv_scores, marker="o", linestyle="-")
            plt.xlabel("Number of Neighbors (k)")
            plt.ylabel("Cross-Validated R^2")
            plt.title("R^2 vs. k")
            plt.grid()
            plt.savefig(os.path.join(plot_dir, "optimal_k_plot.png"))
        finally:
            # Always release the figure, even when saving fails.
            plt.close(figure)

        return self.optimal_k

    def train_model(self, X_train, y_train):
        """Fit a ``KNeighborsRegressor`` using the previously selected k.

        Raises:
            ValueError: If ``find_optimal_k()`` has not been run yet.
        """
        if self.optimal_k is None:
            raise ValueError("Optimal k is not set. Run find_optimal_k() first.")

        self.knn = KNeighborsRegressor(n_neighbors=self.optimal_k)
        self.knn.fit(X_train, y_train)

    def evaluate_model(self, X_test, y_test):
        """Score the trained model on held-out data.

        Returns:
            The tuple ``(r2, mse)`` computed from the model's predictions.

        Raises:
            ValueError: If the model has not been trained yet.
        """
        if self.knn is None:
            raise ValueError("Model is not trained. Run train_model() first.")

        y_pred = self.knn.predict(X_test)
        return r2_score(y_test, y_pred), mean_squared_error(y_test, y_pred)

    def predict(self, input_data: pd.DataFrame) -> float:
        """Predict the target for the first row of ``input_data``.

        When ``input_data`` is a DataFrame and features have been selected,
        only the selected feature columns are used, in the order the scaler
        was fitted with; extra columns are ignored.

        Raises:
            ValueError: If the model has not been trained yet, or a selected
                feature column is missing from ``input_data``.
        """
        if self.knn is None:
            raise ValueError("Model is not trained. Run train_model() first.")

        if isinstance(input_data, pd.DataFrame) and self.selected_features:
            missing = [
                name
                for name in self.selected_features
                if name not in input_data.columns
            ]
            if missing:
                raise ValueError(f"Input data is missing feature columns: {missing}")
            input_data = input_data[self.selected_features]

        input_scaled = self.scaler.transform(input_data)
        return float(self.knn.predict(input_scaled)[0])