|
|
|
"""Final Project_Uma Namboothiripad.ipynb |
|
|
|
Automatically generated by Colaboratory. |
|
|
|
Original file is located at |
|
https://colab.research.google.com/drive/1-8ctPDO-F1GYjztowygbfzJI2i5vZT9J |
|
|
|
In my current department, we are facing high levels of employee turnover. We have limited budgets, and some employees do not believe there is opportunity for growth. The recent turnover has caused stress among the existing employees. Leadership is concerned about how to stop this loss of talent. Despite what they think, a pizza party just won't fix this.
|
""" |
|
|
|
from google.colab import drive |
|
drive.mount('/content/drive') |
|
|
|
import numpy as np |
|
import pandas as pd |
|
from matplotlib import pyplot as plt |
|
import seaborn as sns |
|
import lightgbm as lgb |
|
from statsmodels.tsa.holtwinters import SimpleExpSmoothing |
|
from sklearn.metrics import mean_absolute_error |
|
from statsmodels.tsa.holtwinters import ExponentialSmoothing |
|
from statsmodels.tsa.statespace.sarimax import SARIMAX |
|
from statsmodels.tsa.arima.model import ARIMA  # statsmodels.tsa.arima_model was deprecated and later removed
|
from statsmodels.tsa.seasonal import seasonal_decompose |
|
import statsmodels.api as sm |
|
import itertools |
|
from sklearn.model_selection import train_test_split |
|
!pip install -U scikit-learn
|
|
|
|
|
import warnings

pd.set_option('display.max_columns', None)

pd.set_option('display.width', 500)

warnings.simplefilter(action='ignore', category=FutureWarning)

from pandas_profiling import ProfileReport

from scipy.stats import norm
|
|
|
|
|
|
|
|
|
|
|
|
|
!pip install pycaret |
|
|
|
!pip install datasets |
|
|
|
from datasets import load_dataset

import multiprocessing as mp

import concurrent.futures

import time
|
|
|
url="https://huggingface.co/datasets/namb0010/Turnover/raw/main/finalproject.csv" |
|
df0 = pd.read_csv(url) |
|
|
|
df0.head() |
|
|
|
"""A report to give me an idea of what the data looks like. |
|
|
|
Some additional information on some of the values in the data. For Education, the codes mean: 1 = Below College, 2 = College, 3 = Bachelor, 4 = Masters, 5 = PhD.
|
""" |
|
|
|
profile = ProfileReport(df0) |
|
profile |
|
|
|
"""drop the data that won't matter for attrition. Everyone works the standard hours and the other information is just identifying""" |
|
|
|
df0=df0.drop(['EmployeeCount', 'EmployeeID', 'StandardHours'], axis = 1) |
|
|
|
"""**Feature Engineering of the data**[link text](https://)""" |
|
|
|
df0.describe() |
|
|
|
"""Looked to see if the data has any NA's. """ |
|
|
|
df0.isna().sum() |
|
|
|
"""Before I change any of the values, I am going to make a copy of the dataframe just to reduce any issues I will have 2 hours from now. """ |
|
|
|
df = df0.copy(deep=True) |
|
|
|
"""The NAs was only 19 of the rows. I am replacing it with the average which was 2.69""" |
|
|
|
df=df.replace(np.nan, "2.69") |
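"""A more robust alternative (a sketch): fill each numeric column with its own mean rather than one hardcoded value. Here it is a no-op, since the NAs above are already filled."""

df = df.fillna(df.mean(numeric_only=True))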
|
|
|
df.dtypes |
|
|
|
df['NumCompaniesWorked']=df['NumCompaniesWorked'].astype(float) |
|
|
|
df['TotalWorkingYears']=df['TotalWorkingYears'].astype(float) |
|
|
|
"""Turn the categorical values into ordinal """ |
|
|
|
cat_cols = ['Attrition', 'BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus'] |
|
|
|
from tqdm import tqdm |
|
|
|
from sklearn.preprocessing import OrdinalEncoder |
|
|
|
|
|
# Encode each categorical column in place; tqdm shows progress.
for v in tqdm(cat_cols):

    df[v] = OrdinalEncoder(dtype="int").fit_transform(df[[v]])
|
|
|
df.dtypes |
|
|
|
df.describe() |
|
|
|
"""Notes on the categorical data that was turned into ordinal, so I don't forget what they mean tomorrow. |
|
1. Attrition - 0: No, 1: Yes |
|
2. Business Travel - 0: Non-Travel, 1: Travel Frequently, 2: Travel-Rarely |
|
3. Department- 0: HR, 1: Student Services, 2: Recruitment |
|
4. Gender - 0: Female, 1: Male |
|
5. Marital Status- 0: Divorced, 1: Married, 2: Single |
|
""" |
|
|
|
df.head() |
|
|
|
"""Break up the data into training and testing """ |
|
|
|
# train_test_split was already imported above; hold out 20% of the rows as a final test set.
train, test = train_test_split(df, test_size=0.2)
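"""Attrition is typically imbalanced (far more stayers than leavers), so a stratified split keeps that ratio consistent across train and test. A sketch of that variant (train_s/test_s are just illustrative names):"""

train_s, test_s = train_test_split(df, test_size=0.2, stratify=df['Attrition'], random_state=42)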
|
|
|
train.describe() |
|
|
|
test.describe() |
|
|
|
"""Overall looking at some of the data, some initial observations. |
|
|
|
-most staff live close to work, farthest out is 30 miles |
|
-some disparity in income, percent raise |
|
-Looks like most people have gotten promotions in the last two years; but have people who haven't been promoted for 15 |
|
-some outliers with the current manager - some have worked together for a long time |
|
|
|
Overall - nothing that makes me concerned about the quality of the data. Most companies have a pay disparity from the highest earner to the lowest. With the variety of educational backgrounds, from no college to Phd, we can expect that reflected in salary in this data. Therefore, I am not removing anything from this dataset. |
|
|
|
""" |
|
|
|
train['DistanceFromHome'].hist(bins=25, figsize=(10,5)) |
|
|
|
train['MonthlyIncome'].hist(bins=25, figsize=(10,5)) |
|
|
|
train['PercentSalaryHike'].hist(bins=25, figsize=(10,5)) |
|
|
|
train['TotalWorkingYears'].hist(bins=25, figsize=(10,5)) |
|
|
|
train['YearsAtCompany'].hist(bins=25, figsize=(10,5)) |
|
|
|
train['YearsSinceLastPromotion'].hist(bins=25, figsize=(10,5)) |
|
|
|
train['YearsWithCurrManager'].hist(bins=25, figsize=(10,5)) |
|
|
|
"""Remember that this was categorical data before so this is 18 different job titles. If you want to see the more categorical data, go to the profile report at the top. """ |
|
|
|
train['JobRole'].hist(bins=25, figsize=(10,5)) |
|
|
|
df.corr() |
|
|
|
"""Some notes on what the data indicates about the variables and relationships |
|
-Age and Attrition |
|
- Marital status and Attrition |
|
- Total Working Years and Attrition |
|
-Years at Company and Attrition |
|
-Years with Current Manager and Attrition |
|
|
|
As expected with some of the other relationships. Age and years at the company, Age and number of companies worked, Age and Years Since Last Promotion, age and Years with Current Manager, department with education role, number of companies worked vs last promotion, number of companies worked vs years with current manager, years at company vs years with current manager, years at company vs years since last promotion |
|
""" |
|
|
|
dfCorr = df.corr() |
|
# Keep only correlations with |r| >= 0.10 and mask the self-correlation diagonal.
filteredDf = dfCorr[((dfCorr >= .10) | (dfCorr <= -.10)) & (dfCorr !=1.000)]
|
plt.figure(figsize=(30,10)) |
|
sns.heatmap(filteredDf, annot=True, cmap="Reds") |
|
plt.show() |
|
|
|
import matplotlib.cm as cm |
|
|
|
sns.countplot(x='MaritalStatus',hue='Attrition', data=df0) |
|
plt.title("Attrition and Marital Status") |
|
plt.autoscale(enable=True) |
|
plt.show() |
|
|
|
plt.autoscale(enable=True) |
|
sns.countplot(x='Age',hue='Attrition', data=df0) |
|
plt.title("Attrition and Age") |
|
plt.show() |
|
|
|
plt.autoscale(enable=True) |
|
sns.countplot(x='YearsAtCompany',hue='Attrition', data=df0) |
|
plt.title("Attrition and Years working at company") |
|
plt.show() |
|
|
|
plt.autoscale(enable=True) |
|
sns.countplot(x='TotalWorkingYears',hue='Attrition', data=df0) |
|
plt.title("Attrition and Total Working Years") |
|
plt.show() |
|
|
|
"""staff stay with they like their manager""" |
|
|
|
plt.autoscale(enable=True) |
|
sns.countplot(x='YearsWithCurrManager',hue='Attrition', data=df0) |
|
plt.title("Attrition and Do You like your boss") |
|
plt.show() |
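"""To quantify the manager effect, compare average years with the current manager between leavers and stayers; a quick sketch."""

print(df0.groupby('Attrition')['YearsWithCurrManager'].mean())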
|
|
|
"""I was surprised that the raise did not really impact it """ |
|
|
|
plt.autoscale(enable=True) |
|
sns.countplot(x='PercentSalaryHike',hue='Attrition', data=df) |
|
plt.title("Attrition and Raise") |
|
plt.show() |
|
|
|
"""I was surprised that the monthly income did not impact this as much """ |
|
|
|
df.plot.scatter(x='MonthlyIncome', y= 'Attrition') |
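"""A clearer view than the scatter (a sketch): attrition rate by monthly-income quartile."""

income_quartile = pd.qcut(df['MonthlyIncome'], 4, labels=['Q1 (low)', 'Q2', 'Q3', 'Q4 (high)'])

print(df.groupby(income_quartile)['Attrition'].mean())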
|
|
|
"""The correlations did not indicate it above but I biased with the data |
|
|
|
Start dividing up the training data and testing data for model creation |
|
""" |
|
|
|
X = train.drop('Attrition',axis=1) |
|
y = train['Attrition'] |
|
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) |
|
|
|
from sklearn.metrics import accuracy_score, log_loss |
|
from sklearn.neighbors import KNeighborsClassifier |
|
from sklearn.svm import SVC, LinearSVC, NuSVC |
|
from sklearn.tree import DecisionTreeClassifier |
|
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier |
|
from sklearn.naive_bayes import GaussianNB |
|
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis |
|
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis |
|
from xgboost import XGBClassifier |
|
import xgboost |
|
|
|
classifiers = [ |
|
KNeighborsClassifier(3), |
|
SVC(kernel="rbf", C=0.025, probability=True), |
|
|
|
DecisionTreeClassifier(), |
|
RandomForestClassifier(), |
|
XGBClassifier(), |
|
AdaBoostClassifier(), |
|
GradientBoostingClassifier(), |
|
GaussianNB(), |
|
LinearDiscriminantAnalysis(), |
|
QuadraticDiscriminantAnalysis()] |
|
|
|
log_cols=["Attrition", "Accuracy", "Log Loss"] |
|
log = pd.DataFrame(columns=log_cols) |
|
|
|
for clf in classifiers: |
|
clf.fit(X_train, y_train) |
|
name = clf.__class__.__name__ |
|
|
|
print("="*30) |
|
print(name) |
|
|
|
print('****Results****') |
|
train_predictions = clf.predict(X_test) |
|
acc = accuracy_score(y_test, train_predictions) |
|
print("Accuracy: {:.4%}".format(acc)) |
|
|
|
train_predictions = clf.predict_proba(X_test) |
|
ll = log_loss(y_test, train_predictions) |
|
print("Log Loss: {}".format(ll)) |
|
|
|
    log_entry = pd.DataFrame([[name, acc*100, ll]], columns=log_cols)

    log = pd.concat([log, log_entry], ignore_index=True)  # DataFrame.append was removed in pandas 2.0
|
|
|
print("="*30) |
|
|
|
from pycaret.classification import * |
|
s = setup(train, target = 'Attrition', silent = True)  # silent=True skips the dtype confirmation prompt (pycaret 2.x API)
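"""pycaret can also rank every available estimator by cross-validated accuracy before committing to one; an optional sketch (this step can take a while)."""

best = compare_models()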
|
|
|
new_model = create_model('et')  # 'et' = Extra Trees classifier
|
|
|
tuned_new_model = tune_model(new_model) |
|
|
|
plot_model(tuned_new_model) |
|
|
|
plot_model(tuned_new_model, plot = 'feature') |
|
|
|
tuned_new_model.get_params() |
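"""The feature plot above can also be read numerically; a sketch, assuming pycaret 2.x where get_config('X_train') returns the transformed training frame."""

importances = pd.Series(tuned_new_model.feature_importances_, index=get_config('X_train').columns)

print(importances.sort_values(ascending=False).head(10))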
|
|
|
classifiers1 = [

    RandomForestClassifier()

]



log_cols = ["Classifier", "Accuracy", "Log Loss"]

log = pd.DataFrame(columns=log_cols)

# Drop the low-importance features identified above (duplicates removed from the original list).
drop_cols = ['TrainingTimesLastYear', 'Department', 'Gender', 'EducationField', 'DistanceFromHome', 'Education', 'JobRole', 'PercentSalaryHike', 'BusinessTravel', 'NumCompaniesWorked']

X_train = X_train.drop(columns=drop_cols)

X_test = X_test.drop(columns=drop_cols)
|
|
|
|
|
for clf in classifiers1: |
|
clf.fit(X_train, y_train) |
|
name = clf.__class__.__name__ |
|
|
|
print("="*30) |
|
print(name) |
|
|
|
print('****Results****') |
|
train_predictions = clf.predict(X_test) |
|
acc = accuracy_score(y_test, train_predictions) |
|
print("Accuracy: {:.4%}".format(acc)) |
|
|
|
train_predictions = clf.predict_proba(X_test) |
|
ll = log_loss(y_test, train_predictions) |
|
print("Log Loss: {}".format(ll)) |
|
|
|
    log_entry = pd.DataFrame([[name, acc*100, ll]], columns=log_cols)

    log = pd.concat([log, log_entry], ignore_index=True)  # DataFrame.append was removed in pandas 2.0
|
|
|
print("="*30) |
|
|
|
|
|
|
|
|
|
# Assign the result (drop returns a new frame) and list each column once.
df = df.drop(['BusinessTravel', 'Gender', 'EducationField', 'DistanceFromHome', 'Education', 'JobRole', 'PercentSalaryHike', 'Department', 'NumCompaniesWorked', 'TrainingTimesLastYear'], axis = 1)
|
|
|
model = RandomForestClassifier()

# A hypothetical new employee, expressed in the encoded feature values.
df_new = {'Age': 25, 'JobLevel': 3, 'MaritalStatus': 1, 'MonthlyIncome': 13000, 'TotalWorkingYears': 4, 'YearsAtCompany': 2, 'YearsSinceLastPromotion': 1, 'YearsWithCurrManager': 2}

df = pd.concat([df, pd.DataFrame([df_new])], ignore_index=True)  # DataFrame.append was removed in pandas 2.0
|
|
|
model.fit(X_train, y_train)
|
|
|
"""Okay lets see if this will predict anything. I will be # the code below. """ |
|
|
|
|
|
|
|
# X_test3 was undefined; build a one-row frame for the hypothetical new employee instead (assumes df_new matches X_train's reduced columns).
X_new = pd.DataFrame([df_new])[X_train.columns]

Y = model.predict(X_new)

print(Y)
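"""For a single hypothetical employee, the class probability is more informative than the hard label; a quick follow-up sketch."""

print(model.predict_proba(X_new))  # predicted probability of [stay (0), leave (1)]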
|
|
|
!pip freeze > requirements.txt |