{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "view-in-github"
},
"source": [
"
"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "GmfPRLGyV_JO"
},
"source": [
"# **Predictive modelling for Diabetes**\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "jh4WhkrODvKy"
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "YDRBsHbCEIow"
},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"from sklearn.svm import SVC\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier\n",
"from sklearn.neighbors import KNeighborsClassifier\n",
"\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.model_selection import train_test_split,cross_val_score\n",
"from sklearn.metrics import accuracy_score\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 206
},
"id": "I8uKq3nIFLaT",
"outputId": "ddce1aff-46f8-4265-afff-dc657021c58d"
},
"outputs": [
{
"data": {
"application/vnd.google.colaboratory.intrinsic+json": {
"summary": "{\n \"name\": \"df\",\n \"rows\": 768,\n \"fields\": [\n {\n \"column\": \"Pregnancies\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 3,\n \"min\": 0,\n \"max\": 17,\n \"num_unique_values\": 17,\n \"samples\": [\n 6,\n 1,\n 3\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Glucose\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 31,\n \"min\": 0,\n \"max\": 199,\n \"num_unique_values\": 136,\n \"samples\": [\n 151,\n 101,\n 112\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"BloodPressure\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 19,\n \"min\": 0,\n \"max\": 122,\n \"num_unique_values\": 47,\n \"samples\": [\n 86,\n 46,\n 85\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"SkinThickness\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 15,\n \"min\": 0,\n \"max\": 99,\n \"num_unique_values\": 51,\n \"samples\": [\n 7,\n 12,\n 48\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Insulin\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 115,\n \"min\": 0,\n \"max\": 846,\n \"num_unique_values\": 186,\n \"samples\": [\n 52,\n 41,\n 183\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"BMI\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 7.884160320375446,\n \"min\": 0.0,\n \"max\": 67.1,\n \"num_unique_values\": 248,\n \"samples\": [\n 19.9,\n 31.0,\n 38.1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"DiabetesPedigreeFunction\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.3313285950127749,\n \"min\": 0.078,\n \"max\": 2.42,\n \"num_unique_values\": 517,\n \"samples\": [\n 1.731,\n 0.426,\n 0.138\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Age\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 11,\n \"min\": 21,\n 
\"max\": 81,\n \"num_unique_values\": 52,\n \"samples\": [\n 60,\n 47,\n 72\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Outcome\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n \"max\": 1,\n \"num_unique_values\": 2,\n \"samples\": [\n 0,\n 1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}",
"type": "dataframe",
"variable_name": "df"
},
"text/html": [
"\n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Pregnancies | \n",
" Glucose | \n",
" BloodPressure | \n",
" SkinThickness | \n",
" Insulin | \n",
" BMI | \n",
" DiabetesPedigreeFunction | \n",
" Age | \n",
" Outcome | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 6 | \n",
" 148 | \n",
" 72 | \n",
" 35 | \n",
" 0 | \n",
" 33.6 | \n",
" 0.627 | \n",
" 50 | \n",
" 1 | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" 85 | \n",
" 66 | \n",
" 29 | \n",
" 0 | \n",
" 26.6 | \n",
" 0.351 | \n",
" 31 | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" 8 | \n",
" 183 | \n",
" 64 | \n",
" 0 | \n",
" 0 | \n",
" 23.3 | \n",
" 0.672 | \n",
" 32 | \n",
" 1 | \n",
"
\n",
" \n",
" 3 | \n",
" 1 | \n",
" 89 | \n",
" 66 | \n",
" 23 | \n",
" 94 | \n",
" 28.1 | \n",
" 0.167 | \n",
" 21 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" 0 | \n",
" 137 | \n",
" 40 | \n",
" 35 | \n",
" 168 | \n",
" 43.1 | \n",
" 2.288 | \n",
" 33 | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"text/plain": [
" Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
"0 6 148 72 35 0 33.6 \n",
"1 1 85 66 29 0 26.6 \n",
"2 8 183 64 0 0 23.3 \n",
"3 1 89 66 23 94 28.1 \n",
"4 0 137 40 35 168 43.1 \n",
"\n",
" DiabetesPedigreeFunction Age Outcome \n",
"0 0.627 50 1 \n",
"1 0.351 31 0 \n",
"2 0.672 32 1 \n",
"3 0.167 21 0 \n",
"4 2.288 33 1 "
]
},
"execution_count": 118,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.read_csv(\"/content/drive/MyDrive/datascience/diabetes (1).csv\")\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "IutjpRqFDSs1"
},
"source": [
"# **DATA PREPROCESSING & EDA**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 424
},
"id": "DUM3XEcUGE1W",
"outputId": "9678230b-5106-4ede-bad6-dd095edf186e"
},
"outputs": [
{
"data": {
"application/vnd.google.colaboratory.intrinsic+json": {
"summary": "{\n \"name\": \"df\",\n \"rows\": 768,\n \"fields\": [\n {\n \"column\": \"Pregnancies\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 3,\n \"min\": 0,\n \"max\": 17,\n \"num_unique_values\": 17,\n \"samples\": [\n 6,\n 1,\n 3\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Glucose\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 31,\n \"min\": 0,\n \"max\": 199,\n \"num_unique_values\": 136,\n \"samples\": [\n 151,\n 101,\n 112\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"BloodPressure\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 19,\n \"min\": 0,\n \"max\": 122,\n \"num_unique_values\": 47,\n \"samples\": [\n 86,\n 46,\n 85\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"SkinThickness\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 15,\n \"min\": 0,\n \"max\": 99,\n \"num_unique_values\": 51,\n \"samples\": [\n 7,\n 12,\n 48\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Insulin\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 115,\n \"min\": 0,\n \"max\": 846,\n \"num_unique_values\": 186,\n \"samples\": [\n 52,\n 41,\n 183\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"BMI\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 7.884160320375446,\n \"min\": 0.0,\n \"max\": 67.1,\n \"num_unique_values\": 248,\n \"samples\": [\n 19.9,\n 31.0,\n 38.1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"DiabetesPedigreeFunction\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.3313285950127749,\n \"min\": 0.078,\n \"max\": 2.42,\n \"num_unique_values\": 517,\n \"samples\": [\n 1.731,\n 0.426,\n 0.138\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Age\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 11,\n \"min\": 21,\n 
\"max\": 81,\n \"num_unique_values\": 52,\n \"samples\": [\n 60,\n 47,\n 72\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Outcome\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n \"max\": 1,\n \"num_unique_values\": 2,\n \"samples\": [\n 0,\n 1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}",
"type": "dataframe",
"variable_name": "df"
},
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Pregnancies | \n",
" Glucose | \n",
" BloodPressure | \n",
" SkinThickness | \n",
" Insulin | \n",
" BMI | \n",
" DiabetesPedigreeFunction | \n",
" Age | \n",
" Outcome | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 6 | \n",
" 148 | \n",
" 72 | \n",
" 35 | \n",
" 0 | \n",
" 33.6 | \n",
" 0.627 | \n",
" 50 | \n",
" 1 | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" 85 | \n",
" 66 | \n",
" 29 | \n",
" 0 | \n",
" 26.6 | \n",
" 0.351 | \n",
" 31 | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" 8 | \n",
" 183 | \n",
" 64 | \n",
" 0 | \n",
" 0 | \n",
" 23.3 | \n",
" 0.672 | \n",
" 32 | \n",
" 1 | \n",
"
\n",
" \n",
" 3 | \n",
" 1 | \n",
" 89 | \n",
" 66 | \n",
" 23 | \n",
" 94 | \n",
" 28.1 | \n",
" 0.167 | \n",
" 21 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" 0 | \n",
" 137 | \n",
" 40 | \n",
" 35 | \n",
" 168 | \n",
" 43.1 | \n",
" 2.288 | \n",
" 33 | \n",
" 1 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 763 | \n",
" 10 | \n",
" 101 | \n",
" 76 | \n",
" 48 | \n",
" 180 | \n",
" 32.9 | \n",
" 0.171 | \n",
" 63 | \n",
" 0 | \n",
"
\n",
" \n",
" 764 | \n",
" 2 | \n",
" 122 | \n",
" 70 | \n",
" 27 | \n",
" 0 | \n",
" 36.8 | \n",
" 0.340 | \n",
" 27 | \n",
" 0 | \n",
"
\n",
" \n",
" 765 | \n",
" 5 | \n",
" 121 | \n",
" 72 | \n",
" 23 | \n",
" 112 | \n",
" 26.2 | \n",
" 0.245 | \n",
" 30 | \n",
" 0 | \n",
"
\n",
" \n",
" 766 | \n",
" 1 | \n",
" 126 | \n",
" 60 | \n",
" 0 | \n",
" 0 | \n",
" 30.1 | \n",
" 0.349 | \n",
" 47 | \n",
" 1 | \n",
"
\n",
" \n",
" 767 | \n",
" 1 | \n",
" 93 | \n",
" 70 | \n",
" 31 | \n",
" 0 | \n",
" 30.4 | \n",
" 0.315 | \n",
" 23 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
768 rows × 9 columns
\n",
"
\n",
"
\n",
"
\n"
],
"text/plain": [
" Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
"0 6 148 72 35 0 33.6 \n",
"1 1 85 66 29 0 26.6 \n",
"2 8 183 64 0 0 23.3 \n",
"3 1 89 66 23 94 28.1 \n",
"4 0 137 40 35 168 43.1 \n",
".. ... ... ... ... ... ... \n",
"763 10 101 76 48 180 32.9 \n",
"764 2 122 70 27 0 36.8 \n",
"765 5 121 72 23 112 26.2 \n",
"766 1 126 60 0 0 30.1 \n",
"767 1 93 70 31 0 30.4 \n",
"\n",
" DiabetesPedigreeFunction Age Outcome \n",
"0 0.627 50 1 \n",
"1 0.351 31 0 \n",
"2 0.672 32 1 \n",
"3 0.167 21 0 \n",
"4 2.288 33 1 \n",
".. ... ... ... \n",
"763 0.171 63 0 \n",
"764 0.340 27 0 \n",
"765 0.245 30 0 \n",
"766 0.349 47 1 \n",
"767 0.315 23 0 \n",
"\n",
"[768 rows x 9 columns]"
]
},
"execution_count": 119,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "wObmQoogGBMC",
"outputId": "415e4ed0-f346-4830-d9f6-9caa121ca94c"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Pregnancies 0\n",
"Glucose 0\n",
"BloodPressure 0\n",
"SkinThickness 0\n",
"Insulin 0\n",
"BMI 0\n",
"DiabetesPedigreeFunction 0\n",
"Age 0\n",
"Outcome 0\n",
"dtype: int64\n",
"shape of dataset : { (768, 9) }\n",
"Pregnancies int64\n",
"Glucose int64\n",
"BloodPressure int64\n",
"SkinThickness int64\n",
"Insulin int64\n",
"BMI float64\n",
"DiabetesPedigreeFunction float64\n",
"Age int64\n",
"Outcome int64\n",
"dtype: object\n"
]
}
],
"source": [
"print(df.isnull().sum())\n",
"print(\"shape of dataset : {\",df.shape,\"}\")\n",
"print(df.dtypes)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "lbssz2twXmdg",
"outputId": "0bdc4ab1-359c-41d4-f4d4-3d380254d4ba"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Pregnancies Glucose BloodPressure SkinThickness Insulin \\\n",
"count 768.000000 768.000000 768.000000 768.000000 768.000000 \n",
"mean 3.845052 120.894531 69.105469 20.536458 79.799479 \n",
"std 3.369578 31.972618 19.355807 15.952218 115.244002 \n",
"min 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
"25% 1.000000 99.000000 62.000000 0.000000 0.000000 \n",
"50% 3.000000 117.000000 72.000000 23.000000 30.500000 \n",
"75% 6.000000 140.250000 80.000000 32.000000 127.250000 \n",
"max 17.000000 199.000000 122.000000 99.000000 846.000000 \n",
"\n",
" BMI DiabetesPedigreeFunction Age Outcome \n",
"count 768.000000 768.000000 768.000000 768.000000 \n",
"mean 31.992578 0.471876 33.240885 0.348958 \n",
"std 7.884160 0.331329 11.760232 0.476951 \n",
"min 0.000000 0.078000 21.000000 0.000000 \n",
"25% 27.300000 0.243750 24.000000 0.000000 \n",
"50% 32.000000 0.372500 29.000000 0.000000 \n",
"75% 36.600000 0.626250 41.000000 1.000000 \n",
"max 67.100000 2.420000 81.000000 1.000000 \n"
]
}
],
"source": [
"print(df.describe())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 620
},
"id": "XQRDkoCOC_2U",
"outputId": "b53619dd-72db-4617-8a6a-a5d8c5328578"
},
"outputs": [
{
"data": {
"text/plain": [
""
]
},
"execution_count": 122,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "",
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"sns.heatmap(df.corr(),annot=True,cmap='coolwarm')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 770
},
"id": "aUO6GmVABb9J",
"outputId": "b539b19d-c7f1-4c86-d255-e1837e979ffe"
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
":6: MatplotlibDeprecationWarning: Auto-removal of overlapping axes is deprecated since 3.6 and will be removed two minor releases later; explicitly call ax.remove() as needed.\n",
" plt.subplot((int(length)//2), 3, j + 1)\n"
]
},
{
"data": {
"image/png": "",
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import itertools\n",
"col = df.columns[:8]\n",
"plt.subplots(figsize = (20, 15))\n",
"length =int(len(col))\n",
"for i, j in itertools.zip_longest(col, range(length)):\n",
" plt.subplot((int(length)//2), 3, j + 1)\n",
" plt.subplots_adjust(wspace = 0.1,hspace = 0.5)\n",
" df[i].hist(bins = 20)\n",
" plt.title(i)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 518
},
"id": "trHYk66OKkUj",
"outputId": "b5b31bf0-374d-4017-eea9-91f89ddc0e88"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"total patients: 768\n",
"patients cured: 500\n",
"patients not cured: 268\n"
]
},
{
"data": {
"text/plain": [
""
]
},
"execution_count": 124,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "",
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"print(\"total patients:\",df.shape[0])\n",
"print(\"patients cured:\",df['Outcome'].value_counts()[0])\n",
"print(\"patients not cured:\",df['Outcome'].value_counts()[1])\n",
"sns.countplot(x='Outcome',data=df,hue='Outcome')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 300
},
"id": "GsfLWX6ZT3mf",
"outputId": "b8c6821d-1380-4143-b80d-c50507ff6eb1"
},
"outputs": [
{
"data": {
"application/vnd.google.colaboratory.intrinsic+json": {
"summary": "{\n \"name\": \"df\",\n \"rows\": 8,\n \"fields\": [\n {\n \"column\": \"Pregnancies\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 269.85223453356366,\n \"min\": 0.0,\n \"max\": 768.0,\n \"num_unique_values\": 8,\n \"samples\": [\n 3.8450520833333335,\n 3.0,\n 768.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Glucose\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 243.73802348295857,\n \"min\": 0.0,\n \"max\": 768.0,\n \"num_unique_values\": 8,\n \"samples\": [\n 120.89453125,\n 117.0,\n 768.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"BloodPressure\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 252.8525053581062,\n \"min\": 0.0,\n \"max\": 768.0,\n \"num_unique_values\": 8,\n \"samples\": [\n 69.10546875,\n 72.0,\n 768.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"SkinThickness\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 263.7684730531098,\n \"min\": 0.0,\n \"max\": 768.0,\n \"num_unique_values\": 7,\n \"samples\": [\n 768.0,\n 20.536458333333332,\n 32.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Insulin\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 350.26059167945886,\n \"min\": 0.0,\n \"max\": 846.0,\n \"num_unique_values\": 7,\n \"samples\": [\n 768.0,\n 79.79947916666667,\n 127.25\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"BMI\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 262.05117817552093,\n \"min\": 0.0,\n \"max\": 768.0,\n \"num_unique_values\": 8,\n \"samples\": [\n 31.992578124999998,\n 32.0,\n 768.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"DiabetesPedigreeFunction\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 271.3005221658502,\n \"min\": 0.078,\n \"max\": 768.0,\n \"num_unique_values\": 8,\n \"samples\": 
[\n 0.47187630208333325,\n 0.3725,\n 768.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Age\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 260.1941178528413,\n \"min\": 11.760231540678685,\n \"max\": 768.0,\n \"num_unique_values\": 8,\n \"samples\": [\n 33.240885416666664,\n 29.0,\n 768.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Outcome\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 271.3865920388932,\n \"min\": 0.0,\n \"max\": 768.0,\n \"num_unique_values\": 5,\n \"samples\": [\n 0.3489583333333333,\n 1.0,\n 0.47695137724279896\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}",
"type": "dataframe"
},
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Pregnancies | \n",
" Glucose | \n",
" BloodPressure | \n",
" SkinThickness | \n",
" Insulin | \n",
" BMI | \n",
" DiabetesPedigreeFunction | \n",
" Age | \n",
" Outcome | \n",
"
\n",
" \n",
" \n",
" \n",
" count | \n",
" 768.000000 | \n",
" 768.000000 | \n",
" 768.000000 | \n",
" 768.000000 | \n",
" 768.000000 | \n",
" 768.000000 | \n",
" 768.000000 | \n",
" 768.000000 | \n",
" 768.000000 | \n",
"
\n",
" \n",
" mean | \n",
" 3.845052 | \n",
" 120.894531 | \n",
" 69.105469 | \n",
" 20.536458 | \n",
" 79.799479 | \n",
" 31.992578 | \n",
" 0.471876 | \n",
" 33.240885 | \n",
" 0.348958 | \n",
"
\n",
" \n",
" std | \n",
" 3.369578 | \n",
" 31.972618 | \n",
" 19.355807 | \n",
" 15.952218 | \n",
" 115.244002 | \n",
" 7.884160 | \n",
" 0.331329 | \n",
" 11.760232 | \n",
" 0.476951 | \n",
"
\n",
" \n",
" min | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.078000 | \n",
" 21.000000 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" 25% | \n",
" 1.000000 | \n",
" 99.000000 | \n",
" 62.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 27.300000 | \n",
" 0.243750 | \n",
" 24.000000 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" 50% | \n",
" 3.000000 | \n",
" 117.000000 | \n",
" 72.000000 | \n",
" 23.000000 | \n",
" 30.500000 | \n",
" 32.000000 | \n",
" 0.372500 | \n",
" 29.000000 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" 75% | \n",
" 6.000000 | \n",
" 140.250000 | \n",
" 80.000000 | \n",
" 32.000000 | \n",
" 127.250000 | \n",
" 36.600000 | \n",
" 0.626250 | \n",
" 41.000000 | \n",
" 1.000000 | \n",
"
\n",
" \n",
" max | \n",
" 17.000000 | \n",
" 199.000000 | \n",
" 122.000000 | \n",
" 99.000000 | \n",
" 846.000000 | \n",
" 67.100000 | \n",
" 2.420000 | \n",
" 81.000000 | \n",
" 1.000000 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"text/plain": [
" Pregnancies Glucose BloodPressure SkinThickness Insulin \\\n",
"count 768.000000 768.000000 768.000000 768.000000 768.000000 \n",
"mean 3.845052 120.894531 69.105469 20.536458 79.799479 \n",
"std 3.369578 31.972618 19.355807 15.952218 115.244002 \n",
"min 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
"25% 1.000000 99.000000 62.000000 0.000000 0.000000 \n",
"50% 3.000000 117.000000 72.000000 23.000000 30.500000 \n",
"75% 6.000000 140.250000 80.000000 32.000000 127.250000 \n",
"max 17.000000 199.000000 122.000000 99.000000 846.000000 \n",
"\n",
" BMI DiabetesPedigreeFunction Age Outcome \n",
"count 768.000000 768.000000 768.000000 768.000000 \n",
"mean 31.992578 0.471876 33.240885 0.348958 \n",
"std 7.884160 0.331329 11.760232 0.476951 \n",
"min 0.000000 0.078000 21.000000 0.000000 \n",
"25% 27.300000 0.243750 24.000000 0.000000 \n",
"50% 32.000000 0.372500 29.000000 0.000000 \n",
"75% 36.600000 0.626250 41.000000 1.000000 \n",
"max 67.100000 2.420000 81.000000 1.000000 "
]
},
"execution_count": 125,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.describe()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 70
},
"id": "zgcPA5LUUTf-",
"outputId": "5128a423-61e0-4a91-8de4-27a7072cd34c"
},
"outputs": [
{
"data": {
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "string"
},
"text/plain": [
"\"mean_glucose = df['Glucose'].mean()\\ndf['Glucose'] = df['Glucose'].replace(0,mean_glucose)\\nmean_bp = df['BloodPressure'].mean()\\ndf['BloodPressure'] = df['BloodPressure'].replace(0,mean_bp)\\nmean_skin = df['SkinThickness'].mean()\\ndf['SkinThickness'] = df['SkinThickness'].replace(0,mean_skin)\\nmean_bmi = df['BMI'].mean()\\ndf['BMI'] = df['BMI'].replace(0,mean_bmi)\\ndf['Insulin'] = df['Insulin'].replace(0,df['Insulin'].mean())\""
]
},
"execution_count": 126,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"'''mean_glucose = df['Glucose'].mean()\n",
"df['Glucose'] = df['Glucose'].replace(0,mean_glucose)\n",
"mean_bp = df['BloodPressure'].mean()\n",
"df['BloodPressure'] = df['BloodPressure'].replace(0,mean_bp)\n",
"mean_skin = df['SkinThickness'].mean()\n",
"df['SkinThickness'] = df['SkinThickness'].replace(0,mean_skin)\n",
"mean_bmi = df['BMI'].mean()\n",
"df['BMI'] = df['BMI'].replace(0,mean_bmi)\n",
"df['Insulin'] = df['Insulin'].replace(0,df['Insulin'].mean())'''"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 143
},
"id": "heFpz7LnMVuT",
"outputId": "9918c59c-7d1b-4106-b46f-5919bc2aa834"
},
"outputs": [
{
"data": {
"application/vnd.google.colaboratory.intrinsic+json": {
"summary": "{\n \"name\": \"df\",\n \"rows\": 2,\n \"fields\": [\n {\n \"column\": \"Outcome\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n \"max\": 1,\n \"num_unique_values\": 2,\n \"samples\": [\n 1,\n 0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Pregnancies\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1.108511248584296,\n \"min\": 3.298,\n \"max\": 4.865671641791045,\n \"num_unique_values\": 2,\n \"samples\": [\n 4.865671641791045,\n 3.298\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Glucose\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 22.116505963980842,\n \"min\": 109.98,\n \"max\": 141.25746268656715,\n \"num_unique_values\": 2,\n \"samples\": [\n 141.25746268656715,\n 109.98\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"BloodPressure\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1.8672051632998017,\n \"min\": 68.184,\n \"max\": 70.82462686567165,\n \"num_unique_values\": 2,\n \"samples\": [\n 70.82462686567165,\n 68.184\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"SkinThickness\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1.7678935989570275,\n \"min\": 19.664,\n \"max\": 22.16417910447761,\n \"num_unique_values\": 2,\n \"samples\": [\n 22.16417910447761,\n 19.664\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Insulin\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 22.304849659757796,\n \"min\": 68.792,\n \"max\": 100.33582089552239,\n \"num_unique_values\": 2,\n \"samples\": [\n 100.33582089552239,\n 68.792\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"BMI\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 3.4212211239962618,\n \"min\": 30.3042,\n \"max\": 35.14253731343284,\n \"num_unique_values\": 2,\n \"samples\": [\n 
35.14253731343284,\n 30.3042\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"DiabetesPedigreeFunction\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.08539445753677459,\n \"min\": 0.429734,\n \"max\": 0.5505,\n \"num_unique_values\": 2,\n \"samples\": [\n 0.5505,\n 0.429734\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Age\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 4.155782645191446,\n \"min\": 31.19,\n \"max\": 37.06716417910448,\n \"num_unique_values\": 2,\n \"samples\": [\n 37.06716417910448,\n 31.19\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}",
"type": "dataframe"
},
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Pregnancies | \n",
" Glucose | \n",
" BloodPressure | \n",
" SkinThickness | \n",
" Insulin | \n",
" BMI | \n",
" DiabetesPedigreeFunction | \n",
" Age | \n",
"
\n",
" \n",
" Outcome | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 3.298000 | \n",
" 109.980000 | \n",
" 68.184000 | \n",
" 19.664000 | \n",
" 68.792000 | \n",
" 30.304200 | \n",
" 0.429734 | \n",
" 31.190000 | \n",
"
\n",
" \n",
" 1 | \n",
" 4.865672 | \n",
" 141.257463 | \n",
" 70.824627 | \n",
" 22.164179 | \n",
" 100.335821 | \n",
" 35.142537 | \n",
" 0.550500 | \n",
" 37.067164 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"text/plain": [
" Pregnancies Glucose BloodPressure SkinThickness Insulin \\\n",
"Outcome \n",
"0 3.298000 109.980000 68.184000 19.664000 68.792000 \n",
"1 4.865672 141.257463 70.824627 22.164179 100.335821 \n",
"\n",
" BMI DiabetesPedigreeFunction Age \n",
"Outcome \n",
"0 30.304200 0.429734 31.190000 \n",
"1 35.142537 0.550500 37.067164 "
]
},
"execution_count": 127,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.groupby('Outcome').mean()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "IAjvmnW-N838"
},
"outputs": [],
"source": [
"# lets split data into input and output\n",
"X = df.drop(columns='Outcome',axis=1)\n",
"Y = df['Outcome']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 424
},
"id": "Hjs8ungwTGk1",
"outputId": "b74e6e82-286b-4332-9c02-e18913bf065a"
},
"outputs": [
{
"data": {
"application/vnd.google.colaboratory.intrinsic+json": {
"summary": "{\n \"name\": \"X\",\n \"rows\": 768,\n \"fields\": [\n {\n \"column\": \"Pregnancies\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 3,\n \"min\": 0,\n \"max\": 17,\n \"num_unique_values\": 17,\n \"samples\": [\n 6,\n 1,\n 3\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Glucose\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 31,\n \"min\": 0,\n \"max\": 199,\n \"num_unique_values\": 136,\n \"samples\": [\n 151,\n 101,\n 112\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"BloodPressure\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 19,\n \"min\": 0,\n \"max\": 122,\n \"num_unique_values\": 47,\n \"samples\": [\n 86,\n 46,\n 85\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"SkinThickness\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 15,\n \"min\": 0,\n \"max\": 99,\n \"num_unique_values\": 51,\n \"samples\": [\n 7,\n 12,\n 48\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Insulin\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 115,\n \"min\": 0,\n \"max\": 846,\n \"num_unique_values\": 186,\n \"samples\": [\n 52,\n 41,\n 183\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"BMI\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 7.884160320375446,\n \"min\": 0.0,\n \"max\": 67.1,\n \"num_unique_values\": 248,\n \"samples\": [\n 19.9,\n 31.0,\n 38.1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"DiabetesPedigreeFunction\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.3313285950127749,\n \"min\": 0.078,\n \"max\": 2.42,\n \"num_unique_values\": 517,\n \"samples\": [\n 1.731,\n 0.426,\n 0.138\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Age\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 11,\n \"min\": 21,\n 
\"max\": 81,\n \"num_unique_values\": 52,\n \"samples\": [\n 60,\n 47,\n 72\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}",
"type": "dataframe",
"variable_name": "X"
},
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Pregnancies | \n",
" Glucose | \n",
" BloodPressure | \n",
" SkinThickness | \n",
" Insulin | \n",
" BMI | \n",
" DiabetesPedigreeFunction | \n",
" Age | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 6 | \n",
" 148 | \n",
" 72 | \n",
" 35 | \n",
" 0 | \n",
" 33.6 | \n",
" 0.627 | \n",
" 50 | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" 85 | \n",
" 66 | \n",
" 29 | \n",
" 0 | \n",
" 26.6 | \n",
" 0.351 | \n",
" 31 | \n",
"
\n",
" \n",
" 2 | \n",
" 8 | \n",
" 183 | \n",
" 64 | \n",
" 0 | \n",
" 0 | \n",
" 23.3 | \n",
" 0.672 | \n",
" 32 | \n",
"
\n",
" \n",
" 3 | \n",
" 1 | \n",
" 89 | \n",
" 66 | \n",
" 23 | \n",
" 94 | \n",
" 28.1 | \n",
" 0.167 | \n",
" 21 | \n",
"
\n",
" \n",
" 4 | \n",
" 0 | \n",
" 137 | \n",
" 40 | \n",
" 35 | \n",
" 168 | \n",
" 43.1 | \n",
" 2.288 | \n",
" 33 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 763 | \n",
" 10 | \n",
" 101 | \n",
" 76 | \n",
" 48 | \n",
" 180 | \n",
" 32.9 | \n",
" 0.171 | \n",
" 63 | \n",
"
\n",
" \n",
" 764 | \n",
" 2 | \n",
" 122 | \n",
" 70 | \n",
" 27 | \n",
" 0 | \n",
" 36.8 | \n",
" 0.340 | \n",
" 27 | \n",
"
\n",
" \n",
" 765 | \n",
" 5 | \n",
" 121 | \n",
" 72 | \n",
" 23 | \n",
" 112 | \n",
" 26.2 | \n",
" 0.245 | \n",
" 30 | \n",
"
\n",
" \n",
" 766 | \n",
" 1 | \n",
" 126 | \n",
" 60 | \n",
" 0 | \n",
" 0 | \n",
" 30.1 | \n",
" 0.349 | \n",
" 47 | \n",
"
\n",
" \n",
" 767 | \n",
" 1 | \n",
" 93 | \n",
" 70 | \n",
" 31 | \n",
" 0 | \n",
" 30.4 | \n",
" 0.315 | \n",
" 23 | \n",
"
\n",
" \n",
"
\n",
"
768 rows × 8 columns
\n",
"
\n",
"
\n",
"
\n"
],
"text/plain": [
" Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
"0 6 148 72 35 0 33.6 \n",
"1 1 85 66 29 0 26.6 \n",
"2 8 183 64 0 0 23.3 \n",
"3 1 89 66 23 94 28.1 \n",
"4 0 137 40 35 168 43.1 \n",
".. ... ... ... ... ... ... \n",
"763 10 101 76 48 180 32.9 \n",
"764 2 122 70 27 0 36.8 \n",
"765 5 121 72 23 112 26.2 \n",
"766 1 126 60 0 0 30.1 \n",
"767 1 93 70 31 0 30.4 \n",
"\n",
" DiabetesPedigreeFunction Age \n",
"0 0.627 50 \n",
"1 0.351 31 \n",
"2 0.672 32 \n",
"3 0.167 21 \n",
"4 2.288 33 \n",
".. ... ... \n",
"763 0.171 63 \n",
"764 0.340 27 \n",
"765 0.245 30 \n",
"766 0.349 47 \n",
"767 0.315 23 \n",
"\n",
"[768 rows x 8 columns]"
]
},
"execution_count": 129,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "KVo0hIBJTpBV",
"outputId": "86074766-bdfd-4a39-dea9-f01252a99ff7"
},
"outputs": [
{
"data": {
"text/plain": [
"0 1\n",
"1 0\n",
"2 1\n",
"3 0\n",
"4 1\n",
" ..\n",
"763 0\n",
"764 0\n",
"765 0\n",
"766 1\n",
"767 0\n",
"Name: Outcome, Length: 768, dtype: int64"
]
},
"execution_count": 130,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Y"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "7ldH2ZHmTpyc"
},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "aceWsw5EWsJG"
},
"source": [
"# **data standardization**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "2OPL2gcUWyG2",
"outputId": "9f92b4c3-8c67-4ea1-8da0-6574da661a22"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[ 0.63994726 0.84832379 0.14964075 ... 0.20401277 0.46849198\n",
" 1.4259954 ]\n",
" [-0.84488505 -1.12339636 -0.16054575 ... -0.68442195 -0.36506078\n",
" -0.19067191]\n",
" [ 1.23388019 1.94372388 -0.26394125 ... -1.10325546 0.60439732\n",
" -0.10558415]\n",
" ...\n",
" [ 0.3429808 0.00330087 0.14964075 ... -0.73518964 -0.68519336\n",
" -0.27575966]\n",
" [-0.84488505 0.1597866 -0.47073225 ... -0.24020459 -0.37110101\n",
" 1.17073215]\n",
" [-0.84488505 -0.8730192 0.04624525 ... -0.20212881 -0.47378505\n",
" -0.87137393]]\n"
]
}
],
"source": [
"# Standardize every feature column with StandardScaler and keep the fitted\n",
"# scaler so new records can be transformed the same way later.\n",
"# NOTE(review): the scaler is fitted on all rows before the train/test split\n",
"# further down, so test-row statistics influence the scaling.\n",
"scaler = StandardScaler()\n",
"standardized_data = scaler.fit_transform(X)\n",
"print(standardized_data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "x_gFAglxXp92"
},
"outputs": [],
"source": [
"# Model inputs: Y is the Outcome column of df, X is the standardized feature matrix.\n",
"Y = df['Outcome']\n",
"X = standardized_data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "6-W0mdSSYoRX",
"outputId": "f8ff13a4-b941-4fcf-f731-a85004106f94"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[ 0.63994726 0.84832379 0.14964075 ... 0.20401277 0.46849198\n",
" 1.4259954 ]\n",
" [-0.84488505 -1.12339636 -0.16054575 ... -0.68442195 -0.36506078\n",
" -0.19067191]\n",
" [ 1.23388019 1.94372388 -0.26394125 ... -1.10325546 0.60439732\n",
" -0.10558415]\n",
" ...\n",
" [ 0.3429808 0.00330087 0.14964075 ... -0.73518964 -0.68519336\n",
" -0.27575966]\n",
" [-0.84488505 0.1597866 -0.47073225 ... -0.24020459 -0.37110101\n",
" 1.17073215]\n",
" [-0.84488505 -0.8730192 0.04624525 ... -0.20212881 -0.47378505\n",
" -0.87137393]]\n",
"0 1\n",
"1 0\n",
"2 1\n",
"3 0\n",
"4 1\n",
" ..\n",
"763 0\n",
"764 0\n",
"765 0\n",
"766 1\n",
"767 0\n",
"Name: Outcome, Length: 768, dtype: int64\n"
]
}
],
"source": [
"# Show the standardized features followed by the target, one after the other.\n",
"print(X, Y, sep='\\n')"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "TOJ_uN6BZHH3"
},
"source": [
"# **train test split**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "ax0hMd8KZNnF"
},
"outputs": [],
"source": [
"# Hold out 20% of the rows for testing; stratifying on Y keeps the class ratio\n",
"# the same in both splits, and the fixed seed makes the split repeatable.\n",
"X_train, X_test, Y_train, Y_test = train_test_split(\n",
"    X,\n",
"    Y,\n",
"    test_size=0.2,\n",
"    stratify=Y,\n",
"    random_state=2,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "icLZgMTWQI24",
"outputId": "a1922138-d704-4700-c31e-e6942ecaa381"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(768, 8) (614, 8) (154, 8)\n"
]
}
],
"source": [
"# Sanity-check the shapes of the full, training and test feature matrices.\n",
"print(*(features.shape for features in (X, X_train, X_test)))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "i3OlGu879zxn"
},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "cFGiWPKG94G2"
},
"source": [
"# **ALGORITHM SELECTION**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "35NpJ2__bHjH"
},
"outputs": [],
"source": [
"# Candidate classifiers, otherwise left at their default hyperparameters.\n",
"# The tree-based estimators are randomized; without a fixed seed their scores\n",
"# (and therefore the 'best model' ranking) change from run to run.\n",
"SEED = 2  # same value as the random_state used for the train/test split\n",
"\n",
"models = {\n",
"    'Logistic Regression': LogisticRegression(),\n",
"    'Decision Tree': DecisionTreeClassifier(random_state=SEED),\n",
"    'Random Forest': RandomForestClassifier(random_state=SEED),\n",
"    'Gradient Boosting': GradientBoostingClassifier(random_state=SEED),\n",
"    'Support Vector Machine': SVC(),\n",
"    'k-Nearest Neighbors': KNeighborsClassifier()\n",
"}"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Q4qgGA9IsrY3"
},
"source": [
"**using single train_test_split data**\n",
"\n",
"\n",
"---\n",
"\n",
"A single split can give a biased estimate, because the score depends on which rows happen to land in the test set."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "y-uIuy4J9024",
"outputId": "c70d883a-5d3d-4321-c9e0-4a90939180da"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Logistic Regression: 0.7597\n",
"Decision Tree: 0.6753\n",
"Random Forest: 0.7468\n",
"Gradient Boosting: 0.7078\n",
"Support Vector Machine: 0.7273\n",
"k-Nearest Neighbors: 0.7208\n",
"\n",
"Best model: Logistic Regression with accuracy: 0.7597\n"
]
}
],
"source": [
"# Fit every candidate on the training split and score it on the held-out test split.\n",
"results = {}\n",
"for name, model in models.items():\n",
"    y_pred = model.fit(X_train, Y_train).predict(X_test)\n",
"    accuracy = accuracy_score(Y_test, y_pred)\n",
"    results[name] = accuracy\n",
"    print(f'{name}: {accuracy:.4f}')\n",
"\n",
"# Report the highest-scoring model (the first one wins on ties).\n",
"best_model_name = max(results, key=lambda candidate: results[candidate])\n",
"print(f'\\nBest model: {best_model_name} with accuracy: {results[best_model_name]:.4f}')"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "or3n5f1kralG"
},
"source": [
"**using cross validation**\n",
"\n",
"Gives a more generalized accuracy estimate (preferable)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "xQWDewExqSMo",
"outputId": "16111655-afd0-48c5-d65e-7a604a38f7aa"
},
"outputs": [
{
"data": {
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "string"
},
"text/plain": [
"'dir(cross_val_score)\\nhelp(cross_val_score)'"
]
},
"execution_count": 138,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"'''dir(cross_val_score)\n",
"help(cross_val_score)'''"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "0d2q6CHE907S",
"outputId": "582b9393-f5ab-4455-fcff-adc817cdc97d"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Logistic Regression: 0.7709\n",
"Decision Tree: 0.7124\n",
"Random Forest: 0.7696\n",
"Gradient Boosting: 0.7566\n",
"Support Vector Machine: 0.7709\n",
"k-Nearest Neighbors: 0.7331\n",
"\n",
"Best model: Support Vector Machine with accuracy: 0.7709\n"
]
}
],
"source": [
"# Score every candidate with 5-fold cross-validation and keep the mean fold score.\n",
"# NOTE(review): X and Y are the full dataset here, so the rows that form the\n",
"# held-out test split also take part in this comparison.\n",
"results = {}\n",
"for name, model in models.items():\n",
"    scores = cross_val_score(model, X, Y, cv=5)\n",
"    mean_score = scores.mean()\n",
"    results[name] = mean_score\n",
"    print(f'{name}: {mean_score:.4f}')\n",
"\n",
"# Report the highest-scoring model (the first one wins on ties).\n",
"best_model_name = max(results, key=lambda candidate: results[candidate])\n",
"print(f'\\nBest model: {best_model_name} with accuracy: {results[best_model_name]:.4f}')"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "VC5XZl2wgzE6"
},
"source": [
"# **SELECTING THE BEST CLASSIFICATION ALGORITHM**\n",
"---\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"id": "W8r84zBnkmcQ"
},
"outputs": [],
"source": [
"#dir(classifier)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "jy7zOvsXbKFP"
},
"source": [
"**Training the model**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "f7MDqkdaB-Ta"
},
"outputs": [],
"source": [
"from sklearn.model_selection import GridSearchCV"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "auusAbz9zKft"
},
"source": [
"# Grid search on the two best-scoring algorithms: 1) Logistic Regression and 2) SVM\n",
"---\n",
"**To find the best hyperparameters for both algorithms**\n",
"---\n",
"\n",
"**1) Logistic Regression**\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ApPfWQOm5oom",
"outputId": "c73c0d49-56e6-47e8-a649-abbad95a400b"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Best hyperparameters for Logistic Regression: {'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}\n",
"Best cross-validation score for Logistic Regression: 0.7708853238265002\n"
]
}
],
"source": [
"# Hyperparameter grid for logistic regression. Both liblinear and saga support\n",
"# the l1 and l2 penalties, so every combination in the grid is valid.\n",
"param_grid = {\n",
"    'penalty': ['l1', 'l2'],\n",
"    'C': [0.001, 0.01, 0.1, 1, 10, 100],\n",
"    'solver': ['liblinear', 'saga']\n",
"}\n",
"\n",
"# liblinear and saga shuffle the data internally, so fix the seed for repeatable\n",
"# results; the larger iteration budget gives saga room to converge.\n",
"logistic_reg = LogisticRegression(random_state=2, max_iter=1000)\n",
"\n",
"# Tune on the training split only: searching on the full X, Y would let the\n",
"# test rows influence the chosen hyperparameters.\n",
"grid_search = GridSearchCV(logistic_reg, param_grid, cv=5, scoring='accuracy')\n",
"grid_search.fit(X_train, Y_train)\n",
"\n",
"print(\"Best hyperparameters for Logistic Regression:\", grid_search.best_params_)\n",
"print(\"Best cross-validation score for Logistic Regression:\", grid_search.best_score_)\n",
"# To inspect every combination:\n",
"# pd.DataFrame(grid_search.cv_results_)[['param_C', 'param_penalty', 'mean_test_score']]"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "dl0x49IL0z0W"
},
"source": [
"**2)SVM**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "0xm9D40-iNoz",
"outputId": "ba3a6360-b649-4e1b-fd00-99a0730d17f0"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Best parameters for SVM: {'C': 1, 'kernel': 'linear'}\n",
"Best cross-validation score for SVM: 0.7734827264239028\n"
]
}
],
"source": [
"# Untuned SVC that serves as the template estimator for the grid search.\n",
"classifier_svc = SVC()\n",
"\n",
"# Search over the regularization strength and two kernels with 5-fold CV.\n",
"param_grid_svc = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}\n",
"grid_search_svc = GridSearchCV(classifier_svc, param_grid_svc, cv=5, scoring='accuracy')\n",
"\n",
"# Tune on the training split only: searching on the full X, Y would let the\n",
"# test rows influence the chosen hyperparameters.\n",
"grid_search_svc.fit(X_train, Y_train)\n",
"\n",
"print(\"Best parameters for SVM:\", grid_search_svc.best_params_)\n",
"print(\"Best cross-validation score for SVM:\", grid_search_svc.best_score_)\n",
"\n",
"# Full per-combination results, kept for inspection.\n",
"grid_search_svc_df = pd.DataFrame(grid_search_svc.cv_results_)\n",
"#grid_search_svc_df[['param_C', 'param_kernel', 'mean_test_score']]"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "-6P74cmj2qFj"
},
"source": [
"**conclusion**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "3PqL2BUVn-O3",
"outputId": "9ea372ce-7ae3-4b5f-adf3-56453493bc7e"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"best parameter for logistic regression {'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}\n",
"score : 0.7708853238265002\n",
"\n",
"Best parameters for SVM: {'C': 1, 'kernel': 'linear'}\n",
"Best cross-validation score for SVM: 0.7734827264239028\n"
]
}
],
"source": [
"# Side-by-side summary of both grid searches: best parameters and best CV score.\n",
"summary_lines = (\n",
"    (\"best parameter for logistic regression\", grid_search.best_params_),\n",
"    (\"score :\", grid_search.best_score_),\n",
"    (\"\\nBest parameters for SVM:\", grid_search_svc.best_params_),\n",
"    (\"Best cross-validation score for SVM:\", grid_search_svc.best_score_),\n",
")\n",
"for label, value in summary_lines:\n",
"    print(label, value)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "krf_bnXa57BZ"
},
"source": [
"**Best parameters for SVM: {'C': 1, 'kernel': 'linear'}**\n",
"\n",
"---\n",
"\n",
"\n",
"**Best cross-validation score for SVM: 0.7734827264239028**"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "fmEMZaZeR0ex"
},
"source": [
"# **Train the final SVM model with the best hyperparameters**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "7mHU3Y70n9Gv",
"outputId": "f2acd065-e1e7-481d-dfa3-93eaf5b73061"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Final accuracy of the SVM model: 0.7727\n"
]
}
],
"source": [
"# Build the final SVM from the hyperparameters the grid search selected, rather\n",
"# than a hand-copied C/kernel pair that goes stale if the search result changes.\n",
"best_svc = SVC(**grid_search_svc.best_params_)\n",
"\n",
"# Fit on the training split only so the test split stays unseen by this model.\n",
"best_svc.fit(X_train, Y_train)\n",
"\n",
"y_pred = best_svc.predict(X_test)\n",
"\n",
"final_accuracy = accuracy_score(Y_test, y_pred)\n",
"print(f'Final accuracy of the SVM model: {final_accuracy:.4f}')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "mKL5EI8Hl9Y2"
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "FoQGL2t8XSYC",
"outputId": "ceabb1e7-3927-4d05-8da5-6d6758da37e9"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[ 0.3429808 1.41167241 0.14964075 -0.09637905 0.82661621 -0.78595734\n",
" 0.34768723 1.51108316]]\n",
"[1]\n",
"person is diabetic\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/lib/python3.10/dist-packages/sklearn/base.py:465: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names\n",
" warnings.warn(\n"
]
}
],
"source": [
"# One patient record; the values must follow the column order the scaler was fitted on.\n",
"input_data = (5,166,72,19,175,25.8,0.587,51)\n",
"\n",
"# Wrap the record in a one-row DataFrame that carries the scaler's feature names.\n",
"# Passing a bare NumPy array makes sklearn warn that X does not have valid feature names.\n",
"input_df = pd.DataFrame([input_data], columns=scaler.feature_names_in_)\n",
"\n",
"std_data = scaler.transform(input_df)\n",
"print(std_data)\n",
"\n",
"# Predict with best_svc, the tuned model trained above, and not with classifier_svc,\n",
"# which is the untuned default SVC() that only served as the grid-search template.\n",
"prediction = best_svc.predict(std_data)\n",
"print(prediction)\n",
"\n",
"# predict returns one label per input row; there is a single row here.\n",
"if prediction[0] == 0:\n",
"    print('person is not diabetic')\n",
"else:\n",
"    print('person is diabetic')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "VX42kAr4aJpf",
"outputId": "9b116d47-e3c2-4595-fa95-bede447d443b"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" precision recall f1-score support\n",
"\n",
" 0 0.78 0.91 0.84 100\n",
" 1 0.76 0.52 0.62 54\n",
"\n",
" accuracy 0.77 154\n",
" macro avg 0.77 0.71 0.73 154\n",
"weighted avg 0.77 0.77 0.76 154\n",
"\n"
]
}
],
"source": [
"from sklearn.metrics import classification_report\n",
"\n",
"# Predict from best_svc directly instead of reusing the shared y_pred name, which the\n",
"# model-comparison loop also assigns; this keeps the report tied to the final model.\n",
"print(classification_report(Y_test, best_svc.predict(X_test)))"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "DPHI_WvUamLM"
},
"source": [
"**using svm**\n",
"\n",
"**accuracy 77.27%**\n"
]
}
],
"metadata": {
"colab": {
"authorship_tag": "ABX9TyPJKK4eqKym/2KAkTgYfQay",
"include_colab_link": true,
"mount_file_id": "1_DzATjkNqJwnRO7TaioUaShWd-AVDou1",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}