"""Customer-churn modelling pipeline.

Reads ``churn.csv``, runs (mostly disabled) exploratory plots, engineers
features, splits and scales the data, balances the training split with SMOTE
and fits a logistic-regression baseline.  Earlier model experiments
(XGBoost, decision tree, random forest, naive Bayes, SVM, KNN, a voting
ensemble and a feature-importance plot) are kept below in disabled
triple-quoted sections for reference.
"""
import pickle

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from imblearn.over_sampling import SMOTE  # class-imbalance handling
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier


def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Fit *model* on the training split and print its test-set accuracy
    plus a full classification report.

    Parameters
    ----------
    model : sklearn-style estimator with ``fit``/``predict``.
    X_train, y_train : training features / labels.
    X_test, y_test : held-out features / labels used for the report.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model.__class__.__name__} Accuracy: {accuracy}")
    print(f"\nClassification Report:\n{classification_report(y_test, y_pred)}")
    print("--------------------------------")


def evaluate_and_save_model(model, X_train, y_train, X_test, y_test, file_name):
    """Evaluate *model* (see :func:`evaluate_model`) and pickle the fitted
    estimator to *file_name*.
    """
    # Delegate instead of duplicating the fit/report logic (the original
    # repeated evaluate_model's body verbatim).
    evaluate_model(model, X_train, y_train, X_test, y_test)
    with open(file_name, 'wb') as file:
        pickle.dump(model, file)
    print(f"Model saved to {file_name}")


df = pd.read_csv('churn.csv')

# --- Exploratory plots (individual plot calls disabled; titles/figure kept
# --- exactly as in the original, which never calls plt.show()) -------------
sns.set_style(style="whitegrid")
plt.figure(figsize=(12, 10))
#sns.countplot(x='Exited', data=df)
plt.title('Churn Distribution')
#sns.histplot(data=df, x='Age', kde=True)
plt.title('Age Distribution')
#sns.scatterplot(data=df, x='CreditScore', y='Age', hue='Exited')
plt.title('Credit Score vs Age')
#sns.boxplot(data=df, x='Exited', y='Balance')
plt.title('Balance vs Churn')
#sns.boxplot(x='Exited', y='CreditScore', data=df)
plt.title('Credit Score vs Churn')
#plt.show()

# --- Feature engineering ---------------------------------------------------
# Drop the label and pure-identifier columns that carry no signal.
features = df.drop(columns=['Exited', 'RowNumber', 'CustomerId', 'Surname'])
# Crude customer-lifetime-value proxy: balance weighted by salary.
features["CLV"] = df["Balance"] * df["EstimatedSalary"] / 100000
# Bucket ages into ordinal groups before one-hot encoding them.
features["AgeGroup"] = pd.cut(df["Age"], bins=[0, 30, 45, 60, 100],
                              labels=["Young", "MiddleAged", "Senior", "Elderly"])
features["TenureAgeRatio"] = df["Tenure"] / df["Age"]
features = pd.get_dummies(features, columns=['Geography', 'Gender', 'AgeGroup'])
target = df['Exited']

# --- Train/test split and scaling -----------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
# BUG FIX: the original called scaler.fit_transform(X_test), re-fitting the
# scaler on test-set statistics (data leakage, and train/test end up on
# different scales).  The test set must only be transformed with the
# parameters learned from the training set.
X_test = scaler.transform(X_test)

# --- SMOTE oversampling ----------------------------------------------------
# NOTE(review): the resampled data is only consumed by the disabled XGBoost
# experiment below; the active logistic regression trains on the original
# (imbalanced) X_train, matching the original script's behaviour.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# --- Baseline model: logistic regression ----------------------------------
lr_model = LogisticRegression(random_state=42)
lr_model.fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)
lr_accuracy = accuracy_score(y_test, lr_pred)

# --- Disabled experiments (kept verbatim for reference) --------------------
"""
xgb_model = xgb.XGBClassifier(random_state=42)
#evaluate_and_save_model(xgb_model, X_train, y_train, X_test, y_test, 'xgb_model.pkl')
evaluate_model(xgb_model, X_train, y_train, X_test, y_test)
evaluate_and_save_model(xgb_model, X_resampled, y_resampled, X_test, y_test, 'xgb_model_resampled.pkl')

dt_model = DecisionTreeClassifier(random_state=42)
#evaluate_and_save_model(dt_model, X_train, y_train, X_test, y_test, 'dt_model.pkl')
evaluate_model(dt_model, X_train, y_train, X_test, y_test)

rf_model = RandomForestClassifier(random_state=42)
evaluate_and_save_model(rf_model, X_train, y_train, X_test, y_test, 'rf_model.pkl')

nb_model = GaussianNB()
evaluate_and_save_model(nb_model, X_train, y_train, X_test, y_test, 'nb_model.pkl')

svm_model = SVC(random_state=42)
evaluate_and_save_model(svm_model, X_train, y_train, X_test, y_test, 'svm_model.pkl')

knn_model = KNeighborsClassifier()
evaluate_and_save_model(knn_model, X_train, y_train, X_test, y_test, 'knn_model.pkl')

# Feature importance (typo 'feature_imporance' fixed)
feature_importance = xgb_model.feature_importances_
feature_names = features.columns
feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importance
})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
"""

# Voting Classifier
"""
voting_model = VotingClassifier(
    estimators=[('xgb', xgb.XGBClassifier(random_state=42)),
                ('rf', RandomForestClassifier(random_state=42)),
                ('svm', SVC(random_state=42, probability=True))],
    voting='hard'
)
evaluate_and_save_model(voting_model, X_train, y_train, X_test, y_test, 'voting_model.pkl')
"""

"""
plt.figure(figsize=(10, 6))
plt.barh(feature_importance_df['Feature'], feature_importance_df['Importance'])
plt.xticks(rotation=90)
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Feature Importance')
"""