WickedFaith committed
Commit 80d5e09 · verified · Parent: 39c7581

Update src/api/loan_model.py

Files changed (1): src/api/loan_model.py (+272 -287)
src/api/loan_model.py CHANGED
@@ -1,288 +1,273 @@
-import os
-import joblib
-import numpy as np
-import pandas as pd
-from sklearn.ensemble import RandomForestClassifier
-from sklearn.preprocessing import StandardScaler, LabelEncoder
-import shap
-import logging
-from typing import Dict, Any, List, Optional, Tuple
-
-# Configure logging
-logging.basicConfig(
-    level=logging.INFO,
-    format='%(asctime)s - %(levelname)s - %(message)s'
-)
-logger = logging.getLogger(__name__)
-
-class LoanApprovalModel:
-    """Loan approval model for predicting loan application outcomes."""
-
-    def __init__(self, model_dir: str = "models", load_model: bool = True):
-        """Initialize the loan approval model.
-
-        Args:
-            model_dir (str): Directory containing the trained model components
-            load_model (bool): Whether to load existing model components
-        """
-        self.model_dir = model_dir
-        self.model = None
-        self.scaler = StandardScaler()
-        self.feature_names = None
-        self.explainer = None
-
-        # Initialize label encoders for categorical columns
-        self.categorical_columns = ['education', 'self_employed']
-        self.label_encoders = {}
-        for col in self.categorical_columns:
-            self.label_encoders[col] = LabelEncoder()
-
-        # Load model components if requested
-        if load_model:
-            self.load_components()
-
-    def load_components(self):
-        """Load the trained model and preprocessing components."""
-        try:
-            logger.info("Loading model components...")
-
-            # Load model
-            model_path = os.path.join(self.model_dir, 'loan_model.joblib')
-            if not os.path.exists(model_path):
-                raise FileNotFoundError(f"Model file not found at {model_path}")
-            self.model = joblib.load(model_path)
-
-            # Load scaler
-            scaler_path = os.path.join(self.model_dir, 'loan_scaler.joblib')
-            if not os.path.exists(scaler_path):
-                raise FileNotFoundError(f"Scaler file not found at {scaler_path}")
-            self.scaler = joblib.load(scaler_path)
-
-            # Load label encoders
-            encoders_path = os.path.join(self.model_dir, 'loan_label_encoders.joblib')
-            if not os.path.exists(encoders_path):
-                raise FileNotFoundError(f"Label encoders file not found at {encoders_path}")
-            self.label_encoders = joblib.load(encoders_path)
-
-            # Load feature names
-            features_path = os.path.join(self.model_dir, 'loan_feature_names.joblib')
-            if not os.path.exists(features_path):
-                raise FileNotFoundError(f"Feature names file not found at {features_path}")
-            self.feature_names = joblib.load(features_path)
-
-            # Try to load explainer if available
-            explainer_path = os.path.join(self.model_dir, 'loan_explainer.joblib')
-            if os.path.exists(explainer_path):
-                self.explainer = joblib.load(explainer_path)
-
-            logger.info("Model components loaded successfully")
-
-        except Exception as e:
-            logger.error(f"Error loading model components: {str(e)}")
-            raise
-
-    def save(self, output_dir: str = "models") -> None:
-        """Save model components to disk.
-
-        Args:
-            output_dir (str): Directory to save model components
-        """
-        try:
-            os.makedirs(output_dir, exist_ok=True)
-
-            # Save model
-            model_path = os.path.join(output_dir, "loan_model.joblib")
-            joblib.dump(self.model, model_path)
-
-            # Save scaler
-            scaler_path = os.path.join(output_dir, "loan_scaler.joblib")
-            joblib.dump(self.scaler, scaler_path)
-
-            # Save label encoders
-            encoders_path = os.path.join(output_dir, "loan_label_encoders.joblib")
-            joblib.dump(self.label_encoders, encoders_path)
-
-            # Save feature names
-            features_path = os.path.join(output_dir, "loan_feature_names.joblib")
-            joblib.dump(self.feature_names, features_path)
-
-            # Save explainer if available
-            if self.explainer is not None:
-                explainer_path = os.path.join(output_dir, "loan_explainer.joblib")
-                joblib.dump(self.explainer, explainer_path)
-
-            logger.info(f"Model components saved to {output_dir}")
-
-        except Exception as e:
-            logger.error(f"Error saving model components: {str(e)}")
-            raise
-
-    def train(self, X: pd.DataFrame, y: pd.Series) -> None:
-        """Train the loan approval model.
-
-        Args:
-            X (pd.DataFrame): Training features
-            y (pd.Series): Target values
-        """
-        try:
-            # Store feature names
-            self.feature_names = list(X.columns)
-
-            # Preprocess features
-            X_processed = self._preprocess_features(X, is_training=True)
-
-            # Initialize and train model
-            logger.info("Training RandomForestClassifier...")
-            self.model = RandomForestClassifier(
-                n_estimators=200,
-                max_depth=10,
-                min_samples_split=5,
-                min_samples_leaf=2,
-                random_state=42
-            )
-
-            # Fit the model
-            self.model.fit(X_processed, y)
-
-            # Initialize SHAP explainer
-            logger.info("Initializing SHAP explainer...")
-            self.explainer = shap.TreeExplainer(self.model)
-
-            logger.info("Model trained successfully")
-
-        except Exception as e:
-            logger.error(f"Error training model: {str(e)}")
-            raise
-
-    def predict(self, features: Dict[str, Any]) -> Tuple[str, float, Dict[str, float]]:
-        """Make a prediction for loan approval.
-
-        Args:
-            features (Dict[str, Any]): Input features for prediction
-
-        Returns:
-            Tuple[str, float, Dict[str, float]]: Prediction result, probability, and feature importance
-        """
-        try:
-            # Validate required features
-            required_features = [
-                'no_of_dependents', 'education', 'self_employed', 'income_annum',
-                'loan_amount', 'loan_term', 'cibil_score', 'residential_assets_value',
-                'commercial_assets_value', 'luxury_assets_value', 'bank_asset_value'
-            ]
-
-            missing_features = [f for f in required_features if f not in features]
-            if missing_features:
-                raise ValueError(f"Missing required features: {missing_features}")
-
-            # Calculate derived features
-            features = features.copy()  # Create a copy to avoid modifying the input
-            features['debt_to_income'] = features['loan_amount'] / features['income_annum']
-            features['total_assets'] = (
-                features['residential_assets_value'] +
-                features['commercial_assets_value'] +
-                features['luxury_assets_value'] +
-                features['bank_asset_value']
-            )
-            features['asset_to_loan'] = features['total_assets'] / features['loan_amount']
-
-            # Create DataFrame with all required features
-            X = pd.DataFrame([features])
-
-            # Ensure all required features are present
-            required_features = self.feature_names
-            missing_features = set(required_features) - set(X.columns)
-            if missing_features:
-                raise ValueError(f"Missing required features after preprocessing: {missing_features}")
-
-            # Reorder columns to match training data
-            X = X[required_features]
-
-            # Encode categorical features first
-            for feature in ['education', 'self_employed']:
-                try:
-                    X[feature] = self.label_encoders[feature].transform(X[feature].astype(str))
-                except Exception as e:
-                    raise ValueError(f"Error encoding {feature}: {str(e)}. Valid values are: {self.label_encoders[feature].classes_}")
-
-            # Scale numerical features
-            numerical_features = [f for f in X.columns if f not in ['education', 'self_employed']]
-            X[numerical_features] = self.scaler.transform(X[numerical_features])
-
-            # Make prediction
-            prediction = self.model.predict(X)[0]
-            probability = self.model.predict_proba(X)[0][1]  # Probability of approval
-
-            # Calculate feature importance
-            feature_importance = dict(zip(self.feature_names, self.model.feature_importances_))
-
-            # Map prediction to string
-            result = "Approved" if prediction == 1 else "Rejected"
-
-            return result, probability, feature_importance
-
-        except Exception as e:
-            logger.error(f"Error making prediction: {str(e)}")
-            logger.exception("Detailed traceback:")
-            raise
-
-    def _preprocess_features(self, X: pd.DataFrame, is_training: bool = False) -> pd.DataFrame:
-        """Preprocess features for model training or prediction.
-
-        Args:
-            X (pd.DataFrame): Input features
-            is_training (bool): Whether preprocessing is for training
-
-        Returns:
-            pd.DataFrame: Preprocessed features
-        """
-        try:
-            # Create copy to avoid modifying original data
-            df = X.copy()
-
-            # Encode categorical variables
-            for col in self.categorical_columns:
-                if col in df.columns:
-                    if is_training:
-                        df[col] = self.label_encoders[col].fit_transform(df[col])
-                    else:
-                        df[col] = self.label_encoders[col].transform(df[col])
-
-            # Scale numerical features
-            numerical_features = [f for f in df.columns if f not in self.categorical_columns]
-            if is_training:
-                df[numerical_features] = self.scaler.fit_transform(df[numerical_features])
-            else:
-                df[numerical_features] = self.scaler.transform(df[numerical_features])
-
-            return df
-
-        except Exception as e:
-            logger.error(f"Error preprocessing features: {str(e)}")
-            raise
-
-    def get_feature_importance(self):
-        """Return feature importance values from the model."""
-        try:
-            if self.model is None:
-                print("Model not loaded, cannot get feature importance")
-                return None
-
-            # For tree-based models like RandomForest, we can get feature importance directly
-            if hasattr(self.model, 'feature_importances_'):
-                # Return the feature importances as a list
-                return self.model.feature_importances_.tolist()
-            elif hasattr(self.model, 'coef_'):
-                # For linear models, use coefficients as importance
-                return np.abs(self.model.coef_[0]).tolist()
-            else:
-                # Create dummy feature importance if not available
-                print("Feature importance not available in model, returning dummy values")
-                # Create dummy values for each feature
-                feature_count = len(self.feature_names) if hasattr(self, 'feature_names') else 10
-                return [0.1] * feature_count
-        except Exception as e:
-            print(f"Error getting feature importance: {str(e)}")
-            # Return dummy values as fallback
-            feature_count = len(self.feature_names) if hasattr(self, 'feature_names') else 10
-            return [0.1] * feature_count
+import os
+import joblib
+import numpy as np
+import pandas as pd
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.preprocessing import StandardScaler, LabelEncoder
+import shap
+import logging
+from typing import Dict, Any, List, Optional, Tuple
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+class LoanApprovalModel:
+    """Loan approval model for predicting loan application outcomes."""
+
+    def __init__(self, model_dir: str = "models", load_model: bool = True):
+        """Initialize the loan approval model.
+
+        Args:
+            model_dir (str): Directory containing the trained model components
+            load_model (bool): Whether to load existing model components
+        """
+        self.model_dir = model_dir
+        self.model = None
+        self.scaler = StandardScaler()
+        self.feature_names = None
+        self.explainer = None
+
+        # Initialize label encoders for categorical columns
+        self.categorical_columns = ['education', 'self_employed']
+        self.label_encoders = {}
+        for col in self.categorical_columns:
+            self.label_encoders[col] = LabelEncoder()
+
+        # Load model components if requested
+        if load_model:
+            self.load_components()
+
+    def load_components(self):
+        """Load the trained model and preprocessing components."""
+        try:
+            logger.info("Loading model components...")
+
+            # Load the core components written by save()
+            self.model = joblib.load(os.path.join(self.model_dir, 'loan_model.joblib'))
+            self.scaler = joblib.load(os.path.join(self.model_dir, 'loan_scaler.joblib'))
+            self.label_encoders = joblib.load(os.path.join(self.model_dir, 'loan_label_encoders.joblib'))
+            self.feature_names = joblib.load(os.path.join(self.model_dir, 'loan_feature_names.joblib'))
+
+            # Try to load the explainer, degrading gracefully if it is missing or unreadable
+            try:
+                explainer_path = os.path.join(self.model_dir, 'loan_explainer.joblib')
+                if os.path.exists(explainer_path):
+                    self.explainer = joblib.load(explainer_path)
+                else:
+                    self.explainer = None
+                    logger.warning("Explainer file not found. Explanations will be limited.")
+            except Exception as explainer_error:
+                logger.error(f"Error loading explainer: {str(explainer_error)}")
+                self.explainer = None
+                logger.warning("Continuing without explainer. Explanations will be limited.")
+
+            logger.info("Model components loaded successfully")
+        except Exception as e:
+            logger.error(f"Error loading model components: {str(e)}")
+            raise ValueError(f"Failed to load model components: {str(e)}")
+
+    def save(self, output_dir: str = "models") -> None:
+        """Save model components to disk.
+
+        Args:
+            output_dir (str): Directory to save model components
+        """
+        try:
+            os.makedirs(output_dir, exist_ok=True)
+
+            # Save model
+            model_path = os.path.join(output_dir, "loan_model.joblib")
+            joblib.dump(self.model, model_path)
+
+            # Save scaler
+            scaler_path = os.path.join(output_dir, "loan_scaler.joblib")
+            joblib.dump(self.scaler, scaler_path)
+
+            # Save label encoders
+            encoders_path = os.path.join(output_dir, "loan_label_encoders.joblib")
+            joblib.dump(self.label_encoders, encoders_path)
+
+            # Save feature names
+            features_path = os.path.join(output_dir, "loan_feature_names.joblib")
+            joblib.dump(self.feature_names, features_path)
+
+            # Save explainer if available
+            if self.explainer is not None:
+                explainer_path = os.path.join(output_dir, "loan_explainer.joblib")
+                joblib.dump(self.explainer, explainer_path)
+
+            logger.info(f"Model components saved to {output_dir}")
+
+        except Exception as e:
+            logger.error(f"Error saving model components: {str(e)}")
+            raise
+
+    def train(self, X: pd.DataFrame, y: pd.Series) -> None:
+        """Train the loan approval model.
+
+        Args:
+            X (pd.DataFrame): Training features
+            y (pd.Series): Target values
+        """
+        try:
+            # Store feature names
+            self.feature_names = list(X.columns)
+
+            # Preprocess features
+            X_processed = self._preprocess_features(X, is_training=True)
+
+            # Initialize and train model
+            logger.info("Training RandomForestClassifier...")
+            self.model = RandomForestClassifier(
+                n_estimators=200,
+                max_depth=10,
+                min_samples_split=5,
+                min_samples_leaf=2,
+                random_state=42
+            )
+
+            # Fit the model
+            self.model.fit(X_processed, y)
+
+            # Initialize SHAP explainer
+            logger.info("Initializing SHAP explainer...")
+            self.explainer = shap.TreeExplainer(self.model)
+
+            logger.info("Model trained successfully")
+
+        except Exception as e:
+            logger.error(f"Error training model: {str(e)}")
+            raise
+
+    def predict(self, features: Dict[str, Any]) -> Tuple[str, float, Dict[str, float]]:
+        """Make a prediction for loan approval.
+
+        Args:
+            features (Dict[str, Any]): Input features for prediction
+
+        Returns:
+            Tuple[str, float, Dict[str, float]]: Prediction result, probability, and feature importance
+        """
+        try:
+            # Validate required features
+            required_features = [
+                'no_of_dependents', 'education', 'self_employed', 'income_annum',
+                'loan_amount', 'loan_term', 'cibil_score', 'residential_assets_value',
+                'commercial_assets_value', 'luxury_assets_value', 'bank_asset_value'
+            ]
+
+            missing_features = [f for f in required_features if f not in features]
+            if missing_features:
+                raise ValueError(f"Missing required features: {missing_features}")
+
+            # Calculate derived features
+            features = features.copy()  # Create a copy to avoid modifying the input
+            features['debt_to_income'] = features['loan_amount'] / features['income_annum']
+            features['total_assets'] = (
+                features['residential_assets_value'] +
+                features['commercial_assets_value'] +
+                features['luxury_assets_value'] +
+                features['bank_asset_value']
+            )
+            features['asset_to_loan'] = features['total_assets'] / features['loan_amount']
+
+            # Create DataFrame with all required features
+            X = pd.DataFrame([features])
+
+            # Ensure all required features are present
+            required_features = self.feature_names
+            missing_features = set(required_features) - set(X.columns)
+            if missing_features:
+                raise ValueError(f"Missing required features after preprocessing: {missing_features}")
+
+            # Reorder columns to match training data
+            X = X[required_features]
+
+            # Encode categorical features first
+            for feature in ['education', 'self_employed']:
+                try:
+                    X[feature] = self.label_encoders[feature].transform(X[feature].astype(str))
+                except Exception as e:
+                    raise ValueError(f"Error encoding {feature}: {str(e)}. Valid values are: {self.label_encoders[feature].classes_}")
+
+            # Scale numerical features
+            numerical_features = [f for f in X.columns if f not in ['education', 'self_employed']]
+            X[numerical_features] = self.scaler.transform(X[numerical_features])
+
+            # Make prediction
+            prediction = self.model.predict(X)[0]
+            probability = self.model.predict_proba(X)[0][1]  # Probability of approval
+
+            # Calculate feature importance
+            feature_importance = dict(zip(self.feature_names, self.model.feature_importances_))
+
+            # Map prediction to string
+            result = "Approved" if prediction == 1 else "Rejected"
+
+            return result, probability, feature_importance
+
+        except Exception as e:
+            logger.error(f"Error making prediction: {str(e)}")
+            logger.exception("Detailed traceback:")
+            raise
+
+    def _preprocess_features(self, X: pd.DataFrame, is_training: bool = False) -> pd.DataFrame:
+        """Preprocess features for model training or prediction.
+
+        Args:
+            X (pd.DataFrame): Input features
+            is_training (bool): Whether preprocessing is for training
+
+        Returns:
+            pd.DataFrame: Preprocessed features
+        """
+        try:
+            # Create copy to avoid modifying original data
+            df = X.copy()
+
+            # Encode categorical variables
+            for col in self.categorical_columns:
+                if col in df.columns:
+                    if is_training:
+                        df[col] = self.label_encoders[col].fit_transform(df[col])
+                    else:
+                        df[col] = self.label_encoders[col].transform(df[col])
+
+            # Scale numerical features
+            numerical_features = [f for f in df.columns if f not in self.categorical_columns]
+            if is_training:
+                df[numerical_features] = self.scaler.fit_transform(df[numerical_features])
+            else:
+                df[numerical_features] = self.scaler.transform(df[numerical_features])
+
+            return df
+
+        except Exception as e:
+            logger.error(f"Error preprocessing features: {str(e)}")
+            raise
+
+    def get_feature_importance(self):
+        """Return feature importance values from the model."""
+        try:
+            if self.model is None:
+                print("Model not loaded, cannot get feature importance")
+                return None
+
+            # For tree-based models like RandomForest, we can get feature importance directly
+            if hasattr(self.model, 'feature_importances_'):
+                # Return the feature importances as a list
+                return self.model.feature_importances_.tolist()
+            elif hasattr(self.model, 'coef_'):
+                # For linear models, use coefficients as importance
+                return np.abs(self.model.coef_[0]).tolist()
+            else:
+                # Create dummy feature importance if not available
+                print("Feature importance not available in model, returning dummy values")
+                # Create dummy values for each feature
+                feature_count = len(self.feature_names) if hasattr(self, 'feature_names') else 10
+                return [0.1] * feature_count
+        except Exception as e:
+            print(f"Error getting feature importance: {str(e)}")
+            # Return dummy values as fallback
+            feature_count = len(self.feature_names) if hasattr(self, 'feature_names') else 10
+            return [0.1] * feature_count
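
For reference, a minimal usage sketch of the updated class (not part of the commit). It assumes the repository root is on PYTHONPATH and that trained artifacts already exist under models/; the applicant values are illustrative, and the category strings must match the classes the label encoders saw during training.

    from src.api.loan_model import LoanApprovalModel

    # Load a previously trained model and its preprocessing components from models/.
    model = LoanApprovalModel(model_dir="models", load_model=True)

    # All eleven required keys must be present; debt_to_income, total_assets, and
    # asset_to_loan are derived inside predict(). The values below are made up.
    applicant = {
        'no_of_dependents': 2,
        'education': 'Graduate',   # must be a class known to the 'education' encoder
        'self_employed': 'No',     # must be a class known to the 'self_employed' encoder
        'income_annum': 9600000,
        'loan_amount': 2990000,
        'loan_term': 12,
        'cibil_score': 778,
        'residential_assets_value': 2400000,
        'commercial_assets_value': 1760000,
        'luxury_assets_value': 2270000,
        'bank_asset_value': 800000,
    }

    result, probability, importance = model.predict(applicant)
    print(f"{result} (approval probability: {probability:.2f})")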