import pandas as pd import numpy as np import pickle import os import sys import logging from sklearn.preprocessing import StandardScaler from sklearn.ensemble import RandomForestClassifier from sklearn.model_selection import train_test_split from typing import List logger = logging.getLogger(__name__) class DiabetesModel: def __init__(self): self.model = None self.scaler = None self.feature_names = None self.model_metrics = None # Get the project root directory self.project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) # Set paths for model files self.model_path = os.path.join(self.project_root, 'models', 'diabetes_model.pkl') self.feature_names_path = os.path.join(self.project_root, 'models', 'diabetes_feature_names.pkl') self.model_metrics_path = os.path.join(self.project_root, 'models', 'diabetes_model_metrics.pkl') # Default feature names if not loaded from file self.default_feature_names = [ 'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age' ] # Initialize feature names first self.feature_names = self.default_feature_names # Load the model and related files self.load_model() def load_model(self): """Load the trained model and related files from disk.""" try: # Try to load feature names first if os.path.exists(self.feature_names_path): try: with open(self.feature_names_path, 'rb') as f: self.feature_names = pickle.load(f, encoding='latin1') logger.info("Feature names loaded successfully") except Exception as e: logger.warning(f"Error loading feature names: {str(e)}. Using defaults.") self.feature_names = self.default_feature_names else: logger.warning("Feature names file not found, using defaults") self.feature_names = self.default_feature_names # Try to load the model if os.path.exists(self.model_path): try: with open(self.model_path, 'rb') as f: model_data = pickle.load(f, encoding='latin1') if isinstance(model_data, dict): self.model = model_data.get('model') self.scaler = model_data.get('scaler') if self.model is None or self.scaler is None: raise ValueError("Model or scaler missing from loaded data") else: self.model = model_data # Create a new scaler if not found in model data self.scaler = StandardScaler() logger.warning("Model loaded but scaler not found. Creating new scaler.") logger.info("Model loaded successfully") except Exception as e: logger.error(f"Error loading model: {str(e)}") raise ValueError(f"Failed to load diabetes model: {str(e)}") else: logger.error("Model file not found.") raise FileNotFoundError(f"Diabetes model file not found at {self.model_path}") # Try to load model metrics if os.path.exists(self.model_metrics_path): try: with open(self.model_metrics_path, 'rb') as f: self.model_metrics = pickle.load(f, encoding='latin1') logger.info("Model metrics loaded successfully") except Exception as e: logger.warning(f"Error loading model metrics: {str(e)}") self.model_metrics = None else: logger.warning("Model metrics file not found") self.model_metrics = None except Exception as e: logger.error(f"Error in load_model: {str(e)}") raise ValueError(f"Failed to load diabetes model: {str(e)}") # Remove the _create_dummy_model method entirely def _create_dummy_model(self): """Create a dummy model for testing purposes.""" try: logger.warning("Creating dummy model") self.model = RandomForestClassifier(n_estimators=100, random_state=42) self.scaler = StandardScaler() # Create dummy data to fit the scaler and model dummy_data = pd.DataFrame(np.random.randn(100, len(self.feature_names)), columns=self.feature_names) self.scaler.fit(dummy_data) # Fit the model with dummy data dummy_target = np.random.randint(0, 2, 100) self.model.fit(dummy_data, dummy_target) logger.info("Dummy model created successfully") except Exception as e: logger.error(f"Error creating dummy model: {str(e)}") raise def save_model(self): """Save the model and scaler together in one file.""" try: # Create a dictionary containing both model and scaler model_data = { 'model': self.model, 'scaler': self.scaler } # Save to file with open(self.model_path, 'wb') as f: pickle.dump(model_data, f) logger.info("Model and scaler saved successfully") except Exception as e: logger.error(f"Error saving model: {str(e)}") raise def predict(self, features): """Make a prediction using the trained model.""" try: if self.model is None: raise ValueError("Model not loaded. Please ensure model file exists and is valid.") print(f"Input features for diabetes prediction: {features}") # Convert string inputs to appropriate numeric types processed_features = {} for key, value in features.items(): try: processed_features[key] = float(value) except (ValueError, TypeError): # Handle conversion errors raise ValueError(f"Invalid value for feature {key}: {value}. Expected numeric value.") # Create DataFrame with processed values X = pd.DataFrame([processed_features]) # Ensure all required columns are present required_columns = [ 'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age' ] for col in required_columns: if col not in X.columns: raise ValueError(f"Missing required feature: {col}") # Ensure columns are in the correct order X = X[required_columns] # Convert all data to float64 to ensure compatibility X = X.astype(float) # Scale features if scaler is available if hasattr(self, 'scaler') and self.scaler is not None: X_scaled = self.scaler.transform(X) else: X_scaled = X.values # Make prediction prediction = bool(self.model.predict(X_scaled)[0]) # Get probability - handle different model types if hasattr(self.model, 'predict_proba'): # For models that provide probability proba = self.model.predict_proba(X_scaled)[0] # Make sure we get the probability for the positive class (index 1) probability = float(proba[1]) if len(proba) > 1 else float(proba[0]) else: # For models that don't provide probability probability = 0.5 + (float(self.model.decision_function(X_scaled)[0]) / 10) probability = max(0, min(1, probability)) # Clamp between 0 and 1 return { "prediction": prediction, "probability": probability } except Exception as e: import traceback traceback.print_exc() raise ValueError(f"Error during prediction: {str(e)}") def get_feature_importance(self) -> List[float]: """Get the feature importance scores as a list of floats.""" try: if hasattr(self.model, 'feature_importances_'): # Convert feature importances to a list of floats importances = [float(x) for x in self.model.feature_importances_] # Ensure we have the same number of importances as features if len(importances) == len(self.feature_names): return importances # If we can't get valid feature importances, return None logger.warning("Could not get valid feature importances") return None except Exception as e: logger.error(f"Error getting feature importance: {str(e)}") return None def get_model_metrics(self): """Get the model metrics.""" return self.model_metrics if self.model_metrics else None def train_model(self, X, y): """Train the model with the given data.""" try: logger.info("Starting model training...") # Initialize the scaler and scale the features self.scaler = StandardScaler() X_scaled = self.scaler.fit_transform(X) # Initialize and train the model self.model = RandomForestClassifier( n_estimators=100, max_depth=10, random_state=42 ) self.model.fit(X_scaled, y) # Calculate and store model metrics train_score = self.model.score(X_scaled, y) feature_importance = self.model.feature_importances_ self.model_metrics = { 'train_score': train_score, 'feature_importance': feature_importance.tolist() } # Save the model, scaler, and metrics self.save_model() self.save_metrics() self.save_feature_names() logger.info(f"Model trained successfully. Training score: {train_score:.4f}") return True except Exception as e: logger.error(f"Error in train_model: {str(e)}") raise def save_metrics(self): """Save model metrics to file.""" try: with open(self.model_metrics_path, 'wb') as f: pickle.dump(self.model_metrics, f) logger.info("Model metrics saved successfully") except Exception as e: logger.error(f"Error saving model metrics: {str(e)}") raise def save_feature_names(self): """Save feature names to file.""" try: with open(self.feature_names_path, 'wb') as f: pickle.dump(self.feature_names, f) logger.info("Feature names saved successfully") except Exception as e: logger.error(f"Error saving feature names: {str(e)}") raise def train_model(): """Train and save the diabetes prediction model""" try: model = DiabetesModel() # Get absolute paths current_dir = os.path.dirname(os.path.abspath(__file__)) project_root = os.path.dirname(os.path.dirname(current_dir)) data_file = os.path.join(project_root, "data", "diabetes.csv") model_dir = os.path.join(project_root, 'models') print(f"Loading data from: {data_file}") print(f"Model will be saved to: {model_dir}") # Ensure data file exists if not os.path.exists(data_file): raise FileNotFoundError(f"Data file not found at {data_file}") # Create models directory if it doesn't exist os.makedirs(model_dir, exist_ok=True) # Load data print("Loading and preparing data...") data = pd.read_csv(data_file) # Select features and target X = data[model.feature_names] y = data['Outcome'] # Train the model print("Training model...") model.train_model(X, y) print("Model trained and saved successfully") except Exception as e: print(f"Error during model training: {str(e)}") import traceback print(traceback.format_exc()) sys.exit(1) if __name__ == "__main__": train_model()