# NOTE: removed scraped page residue ("Spaces: / Sleeping / Sleeping") —
# hosting-UI text that was captured with the file, not Python code.
# --- Imports --------------------------------------------------------------
import pickle

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Candidate models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb

# Class-imbalance handling
from imblearn.over_sampling import SMOTE

# --- Data loading & exploratory plots -------------------------------------
df = pd.read_csv('churn.csv')

sns.set_style(style="whitegrid")
plt.figure(figsize=(12, 10))
# Exploratory plots are disabled; uncomment the sns.* lines (and plt.show())
# to regenerate them.
#sns.countplot(x='Exited', data=df)
plt.title('Churn Distribution')
#sns.histplot(data=df, x='Age', kde=True)
plt.title('Age Distribution')
#sns.scatterplot(data=df, x='CreditScore', y='Age', hue='Exited')
plt.title('Credit Score vs Age')
#sns.boxplot(data=df, x='Exited', y='Balance')
plt.title('Balance vs Churn')
#sns.boxplot(x='Exited', y='CreditScore', data=df)
plt.title('Credit Score vs Churn')
#plt.show()

# --- Feature engineering ---------------------------------------------------
# Drop identifier columns and the target from the feature matrix.
features = df.drop(columns=['Exited', 'RowNumber', 'CustomerId', 'Surname'])
# Rough customer-lifetime-value proxy (scaling constant kept from original).
features["CLV"] = df["Balance"] * df["EstimatedSalary"] / 100000
features["AgeGroup"] = pd.cut(
    df["Age"],
    bins=[0, 30, 45, 60, 100],
    labels=["Young", "MiddleAged", "Senior", "Elderly"],
)
features["TenureAgeRatio"] = df["Tenure"] / df["Age"]
features = pd.get_dummies(features, columns=['Geography', 'Gender', 'AgeGroup'])
target = df['Exited']

# --- Train/test split & scaling --------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
# BUG FIX: the test set must be transformed with the statistics learned on
# the training split; the original called fit_transform here, which re-fits
# the scaler on test data (train/test leakage, inconsistent scaling).
X_test = scaler.transform(X_test)

# --- SMOTE oversampling (training split only; consumed by the disabled
# XGBoost-on-resampled-data run further below) ------------------------------
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# --- Baseline: logistic regression -----------------------------------------
lr_model = LogisticRegression(random_state=42)
lr_model.fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)
lr_accuracy = accuracy_score(y_test, lr_pred)
def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Fit *model* on the training split and print its test-set metrics.

    Prints the model's accuracy, the full classification report, and a
    separator line. Returns None; the model is fitted in place.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    score = accuracy_score(y_test, predictions)
    print(f"{model.__class__.__name__} Accuracy: {score}")
    print(f"\nClassification Report:\n{classification_report(y_test, predictions)}")
    print(f"--------------------------------")
def evaluate_and_save_model(model, X_train, y_train, X_test, y_test, file_name):
    """Fit *model*, print its test-set metrics, and pickle it to *file_name*.

    Same reporting as evaluate_model, plus serialization of the fitted
    model with pickle. Returns None.
    """
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)
    acc = accuracy_score(y_test, predicted)
    print(f"{model.__class__.__name__} Accuracy: {acc}")
    print(f"\nClassification Report:\n{classification_report(y_test, predicted)}")
    print(f"--------------------------------")
    with open(file_name, 'wb') as file:
        pickle.dump(model, file)
    print(f"Model saved to {file_name}")
""" | |
xgb_model = xgb.XGBClassifier(random_state=42) | |
#evaluate_and_save_model(xgb_model, X_train, y_train, X_test, y_test, 'xgb_model.pkl') | |
evaluate_model(xgb_model, X_train, y_train, X_test, y_test) | |
evaluate_and_save_model(xgb_model, X_resampled, y_resampled, X_test, y_test, 'xgb_model_resampled.pkl') | |
dt_model = DecisionTreeClassifier(random_state=42) | |
#evaluate_and_save_model(dt_model, X_train, y_train, X_test, y_test, 'dt_model.pkl') | |
evaluate_model(dt_model, X_train, y_train, X_test, y_test) | |
rf_model = RandomForestClassifier(random_state=42) | |
evaluate_and_save_model(rf_model, X_train, y_train, X_test, y_test, 'rf_model.pkl') | |
nb_model = GaussianNB() | |
evaluate_and_save_model(nb_model, X_train, y_train, X_test, y_test, 'nb_model.pkl') | |
svm_model = SVC(random_state=42) | |
evaluate_and_save_model(svm_model, X_train, y_train, X_test, y_test, 'svm_model.pkl') | |
knn_model = KNeighborsClassifier() | |
evaluate_and_save_model(knn_model, X_train, y_train, X_test, y_test, 'knn_model.pkl') | |
#Feature Importance | |
feature_imporance = xgb_model.feature_importances_ | |
feature_names = features.columns | |
feature_importance_df = pd.DataFrame({ | |
'Feature': feature_names, 'Importance': feature_imporance | |
}) | |
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False) | |
""" | |
# Voting Classifier — also disabled via a bare string literal.
# NOTE(review): with voting='hard' the probability=True flag on SVC is
# unnecessary (predict_proba is only needed for soft voting); it slows
# SVC fitting for no benefit here.
"""
voting_model = VotingClassifier(
    estimators=[
        ('xgb', xgb.XGBClassifier(random_state=42)),
        ('rf', RandomForestClassifier(random_state=42)),
        ('svm', SVC(random_state=42, probability=True)),
    ],
    voting='hard',
)
evaluate_and_save_model(voting_model, X_train, y_train, X_test, y_test, 'voting_model.pkl')
"""

# Feature-importance plot — disabled; it depends on feature_importance_df
# produced by the disabled sweep above, so enable that block first.
"""
plt.figure(figsize=(10, 6))
plt.barh(feature_importance_df['Feature'], feature_importance_df['Importance'])
plt.xticks(rotation=90)
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Feature Importance')
"""