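# Module overview (a brief description added for readability; it only restates what
# the code below does): a KNN regression model that predicts Fat_Percentage from
# data/gym_members_exercise_tracking.csv using correlation-based feature selection.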
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score, mean_squared_error
import matplotlib.pyplot as plt
class KNNModel:
    def __init__(self):
        self.target_column = "Fat_Percentage"
        self.data_path = os.path.join("data", "gym_members_exercise_tracking.csv")  # Updated path
        self.data = None
        self.selected_features = None
        self.label_encoders = {}
        self.scaler = StandardScaler()
        self.knn = None
        self.optimal_k = None

    def load_and_preprocess_data(self):
        # Load data
        self.data = pd.read_csv(self.data_path)

        # Check if 'Experience_Level' column exists
        if 'Experience_Level' not in self.data.columns:
            raise ValueError("'Experience_Level' column not found in the dataset.")

        # Encode categorical features
        categorical_features = ['Gender', 'Experience_Level', 'Workout_Type']
        for feature in categorical_features:
            if feature in self.data.columns:  # Ensure the column exists
                le = LabelEncoder()
                self.data[feature] = le.fit_transform(self.data[feature])
                self.label_encoders[feature] = le  # Save the encoder
            else:
                raise ValueError(f"'{feature}' column not found in the dataset.")

        # Compute correlations and select features strongly correlated with the target
        correlation_matrix = self.data.corr()
        target_corr = correlation_matrix[self.target_column].sort_values(ascending=False)
        self.selected_features = target_corr[abs(target_corr) >= 0.5].index.tolist()
        self.selected_features.remove(self.target_column)

        # Prepare dataset
        X = self.data[self.selected_features]
        y = self.data[self.target_column]
        X_scaled = self.scaler.fit_transform(X)

        # Split into training and testing sets
        return train_test_split(X_scaled, y, test_size=0.2, random_state=42)

    def find_optimal_k(self, X_train, y_train, k_range=20):
        k_values = range(1, k_range + 1)
        cv_scores = []
        for k in k_values:
            knn = KNeighborsRegressor(n_neighbors=k)
            scores = cross_val_score(knn, X_train, y_train, cv=5, scoring='r2')
            cv_scores.append(scores.mean())

        # Find optimal k (highest mean cross-validated R^2)
        self.optimal_k = k_values[np.argmax(cv_scores)]

        # Plot k vs. R^2
        plt.figure(figsize=(10, 6))
        plt.plot(k_values, cv_scores, marker='o', linestyle='-')
        plt.xlabel("Number of Neighbors (k)")
        plt.ylabel("Cross-Validated R^2")
        plt.title("R^2 vs. k")
        plt.grid()
        os.makedirs("KNN", exist_ok=True)  # Ensure the output directory exists before saving
        plt.savefig(os.path.join("KNN", "optimal_k_plot.png"))
        plt.close()
        return self.optimal_k

    def train_model(self, X_train, y_train):
        if not self.optimal_k:
            raise ValueError("Optimal k is not set. Run find_optimal_k() first.")
        self.knn = KNeighborsRegressor(n_neighbors=self.optimal_k)
        self.knn.fit(X_train, y_train)

    def evaluate_model(self, X_test, y_test):
        if not self.knn:
            raise ValueError("Model is not trained. Run train_model() first.")
        y_pred = self.knn.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        return r2, mse

    def predict(self, input_data: pd.DataFrame) -> float:
        if not self.knn:
            raise ValueError("Model is not trained. Run train_model() first.")
        input_scaled = self.scaler.transform(input_data)
        return self.knn.predict(input_scaled)[0]
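

# Usage sketch (an illustrative addition, not part of the original file): runs the full
# pipeline end to end. It assumes the script is executed from the repository root so that
# data/gym_members_exercise_tracking.csv and the KNN/ output directory resolve correctly.
if __name__ == "__main__":
    model = KNNModel()
    X_train, X_test, y_train, y_test = model.load_and_preprocess_data()
    model.find_optimal_k(X_train, y_train)
    model.train_model(X_train, y_train)
    r2, mse = model.evaluate_model(X_test, y_test)
    print(f"Optimal k: {model.optimal_k}, R^2: {r2:.3f}, MSE: {mse:.3f}")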