|
import os |
|
import pandas as pd |
|
import numpy as np |
|
from sklearn.preprocessing import LabelEncoder, StandardScaler |
|
from sklearn.model_selection import train_test_split, cross_val_score |
|
from sklearn.neighbors import KNeighborsRegressor |
|
from sklearn.metrics import r2_score, mean_squared_error |
|
import matplotlib.pyplot as plt |
|
|
|
class KNNModel:
    """K-nearest-neighbours regressor for the gym members exercise dataset.

    Intended call order: ``load_and_preprocess_data`` -> ``find_optimal_k``
    -> ``train_model`` -> ``evaluate_model`` / ``predict``.
    """

    def __init__(self):
        """Set the dataset location, the target column and empty model state."""
        self.target_column = "Fat_Percentage"
        self.data_path = os.path.join("data", "gym_members_exercise_tracking.csv")
        # Populated by load_and_preprocess_data().
        self.data = None
        self.selected_features = None
        # Maps categorical column name -> fitted LabelEncoder.
        self.label_encoders = {}
        self.scaler = StandardScaler()
        # Populated by find_optimal_k() and train_model() respectively.
        self.knn = None
        self.optimal_k = None

    def load_and_preprocess_data(self):
        """Load the CSV, encode, select features, split and scale.

        Categorical columns are label-encoded, features whose absolute
        correlation with the target is at least 0.5 are kept, the data is
        split 80/20 (``random_state=42``) and the features are standardised.

        Returns:
            ``(X_train, X_test, y_train, y_test)`` where the ``X`` parts are
            scaled arrays and the ``y`` parts are pandas Series.

        Raises:
            ValueError: If a required categorical column is missing or no
                feature passes the correlation threshold.
        """
        self.data = pd.read_csv(self.data_path)

        categorical_features = ["Gender", "Experience_Level", "Workout_Type"]

        # Validate every required column before encoding anything, so a
        # missing column never leaves self.data partially encoded.
        for feature in categorical_features:
            if feature not in self.data.columns:
                raise ValueError(f"'{feature}' column not found in the dataset.")

        for feature in categorical_features:
            encoder = LabelEncoder()
            self.data[feature] = encoder.fit_transform(self.data[feature])
            self.label_encoders[feature] = encoder

        # Restrict to numeric columns: recent pandas versions raise when
        # corr() meets a column it cannot convert to float.
        numeric_data = self.data.select_dtypes(include="number")
        target_corr = numeric_data.corr()[self.target_column].sort_values(
            ascending=False
        )
        # NOTE(review): the selection below still looks at the whole dataset
        # (test rows included); consider selecting on the training rows only.
        strongly_correlated = target_corr[target_corr.abs() >= 0.5].index
        # Filtering (instead of list.remove) also copes with a NaN
        # self-correlation, e.g. when the target column is constant.
        self.selected_features = [
            name for name in strongly_correlated if name != self.target_column
        ]
        if not self.selected_features:
            raise ValueError(
                "No feature has an absolute correlation of at least 0.5 "
                f"with '{self.target_column}'."
            )

        X = self.data[self.selected_features]
        y = self.data[self.target_column]

        # Split BEFORE scaling so the scaler's mean/std come from the
        # training rows only; fitting on everything leaks test statistics.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )
        X_train_scaled = self.scaler.fit_transform(X_train)
        X_test_scaled = self.scaler.transform(X_test)

        return X_train_scaled, X_test_scaled, y_train, y_test

    def find_optimal_k(self, X_train, y_train, k_range=20):
        """Pick the k with the best 5-fold cross-validated R^2 and plot the curve.

        Args:
            X_train: Training features.
            y_train: Training targets.
            k_range: Largest k to try; values 1..k_range are evaluated, capped
                at the number of samples available in a training fold.

        Returns:
            The selected number of neighbours (also stored in ``optimal_k``).

        Raises:
            ValueError: If ``k_range`` is below 1, there are too few samples
                to cross-validate, or no k produced a valid score.
        """
        if k_range < 1:
            raise ValueError("k_range must be at least 1.")

        n_folds = 5
        n_samples = len(X_train)
        # A KNN model cannot use more neighbours than it has fitted samples;
        # the smallest training fold is n_samples minus the largest test fold.
        largest_test_fold = -(-n_samples // n_folds)  # ceiling division
        max_k = min(k_range, n_samples - largest_test_fold)
        if max_k < 1:
            raise ValueError(
                "Not enough training samples for 5-fold cross-validation."
            )

        k_values = range(1, max_k + 1)
        cv_scores = [
            cross_val_score(
                KNeighborsRegressor(n_neighbors=k),
                X_train,
                y_train,
                cv=n_folds,
                scoring="r2",
            ).mean()
            for k in k_values
        ]

        # nanargmax ignores failed (NaN) scores; plain argmax would treat a
        # NaN as the maximum and select a k that could not be evaluated.
        self.optimal_k = k_values[int(np.nanargmax(cv_scores))]

        plot_dir = "KNN"
        # savefig does not create missing directories.
        os.makedirs(plot_dir, exist_ok=True)
        figure = plt.figure(figsize=(10, 6))
        try:
            plt.plot(k_values, cv_scores, marker="o", linestyle="-")
            plt.xlabel("Number of Neighbors (k)")
            plt.ylabel("Cross-Validated R^2")
            plt.title("R^2 vs. k")
            plt.grid()
            plt.savefig(os.path.join(plot_dir, "optimal_k_plot.png"))
        finally:
            # Release the figure even when saving fails.
            plt.close(figure)

        return self.optimal_k

    def train_model(self, X_train, y_train):
        """Fit a KNN regressor using the previously selected ``optimal_k``.

        Raises:
            ValueError: If ``find_optimal_k`` has not set ``optimal_k`` yet.
        """
        if not self.optimal_k:
            raise ValueError("Optimal k is not set. Run find_optimal_k() first.")
        self.knn = KNeighborsRegressor(n_neighbors=self.optimal_k)
        self.knn.fit(X_train, y_train)

    def evaluate_model(self, X_test, y_test):
        """Return ``(r2, mse)`` of the trained model on the given test data.

        Raises:
            ValueError: If the model has not been trained.
        """
        if self.knn is None:
            raise ValueError("Model is not trained. Run train_model() first.")
        y_pred = self.knn.predict(X_test)
        return r2_score(y_test, y_pred), mean_squared_error(y_test, y_pred)

    def predict(self, input_data: pd.DataFrame) -> float:
        """Predict the target for the first row of ``input_data``.

        When ``input_data`` is a DataFrame containing every selected feature,
        those columns are picked out in training order, so extra columns are
        ignored. Non-numeric values in label-encoded columns are converted
        with the encoders fitted during preprocessing. The caller's frame is
        never modified.

        Raises:
            ValueError: If the model has not been trained.
        """
        if self.knn is None:
            raise ValueError("Model is not trained. Run train_model() first.")

        features = input_data
        if isinstance(input_data, pd.DataFrame):
            # Work on a copy so encoding never leaks back to the caller.
            features = input_data.copy()
            selected = self.selected_features
            if selected and all(name in features.columns for name in selected):
                features = features.loc[:, selected].copy()
            for name, encoder in self.label_encoders.items():
                if name in features.columns and not pd.api.types.is_numeric_dtype(
                    features[name]
                ):
                    features[name] = encoder.transform(features[name])

        input_scaled = self.scaler.transform(features)
        return float(self.knn.predict(input_scaled)[0])