{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Machine Learning Intro - Python\n",
    "\n",
    "First, we import the packages we need:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "from scipy.io import arff # to read arff files\n",
    "import pandas as pd # for data manipulation\n",
    "import numpy as np\n",
    "from sklearn.neural_network import MLPClassifier\n",
    "from sklearn import preprocessing, svm\n",
    "from sklearn.feature_selection import SelectKBest\n",
    "from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier\n",
    "from sklearn.naive_bayes import GaussianNB\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV\n",
    "from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, classification_report, confusion_matrix, roc_curve, auc, roc_auc_score"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now, we load the dataset into a dataframe and display the first 30 rows:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>preg</th>\n",
       "      <th>plas</th>\n",
       "      <th>pres</th>\n",
       "      <th>skin</th>\n",
       "      <th>insu</th>\n",
       "      <th>mass</th>\n",
       "      <th>pedi</th>\n",
       "      <th>age</th>\n",
       "      <th>class</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>6.0</td>\n",
       "      <td>148.0</td>\n",
       "      <td>72.0</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>33.6</td>\n",
       "      <td>0.627</td>\n",
       "      <td>50.0</td>\n",
       "      <td>tested_positive</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1.0</td>\n",
       "      <td>85.0</td>\n",
       "      <td>66.0</td>\n",
       "      <td>29.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>26.6</td>\n",
       "      <td>0.351</td>\n",
       "      <td>31.0</td>\n",
       "      <td>tested_negative</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>8.0</td>\n",
       "      <td>183.0</td>\n",
       "      <td>64.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>23.3</td>\n",
       "      <td>0.672</td>\n",
       "      <td>32.0</td>\n",
       "      <td>tested_positive</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1.0</td>\n",
       "      <td>89.0</td>\n",
       "      <td>66.0</td>\n",
       "      <td>23.0</td>\n",
       "      <td>94.0</td>\n",
       "      <td>28.1</td>\n",
       "      <td>0.167</td>\n",
       "      <td>21.0</td>\n",
       "      <td>tested_negative</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.0</td>\n",
       "      <td>137.0</td>\n",
       "      <td>40.0</td>\n",
       "      <td>35.0</td>\n",
       "      <td>168.0</td>\n",
       "      <td>43.1</td>\n",
       "      <td>2.288</td>\n",
       "      <td>33.0</td>\n",
       "      <td>tested_positive</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>5.0</td>\n",
       "      <td>116.0</td>\n",
       "      <td>74.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>25.6</td>\n",
       "      <td>0.201</td>\n",
       "      <td>30.0</td>\n",
       "      <td>tested_negative</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>3.0</td>\n",
       "      <td>78.0</td>\n",
       "      <td>50.0</td>\n",
       "      <td>32.0</td>\n",
       "      <td>88.0</td>\n",
       "      <td>31.0</td>\n",
       "      <td>0.248</td>\n",
       "      <td>26.0</td>\n",
       "      <td>tested_positive</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>10.0</td>\n",
       "      <td>115.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>35.3</td>\n",
       "      <td>0.134</td>\n",
       "      <td>29.0</td>\n",
       "      <td>tested_negative</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>2.0</td>\n",
       "      <td>197.0</td>\n",
       "      <td>70.0</td>\n",
       "      <td>45.0</td>\n",
       "      <td>543.0</td>\n",
       "      <td>30.5</td>\n",
       "      <td>0.158</td>\n",
       "      <td>53.0</td>\n",
       "      <td>tested_positive</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>8.0</td>\n",
       "      <td>125.0</td>\n",
       "      <td>96.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.232</td>\n",
       "      <td>54.0</td>\n",
       "      <td>tested_positive</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>4.0</td>\n",
       "      <td>110.0</td>\n",
       "      <td>92.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>37.6</td>\n",
       "      <td>0.191</td>\n",
       "      <td>30.0</td>\n",
       "      <td>tested_negative</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>10.0</td>\n",
       "      <td>168.0</td>\n",
       "      <td>74.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>38.0</td>\n",
       "      <td>0.537</td>\n",
       "      <td>34.0</td>\n",
       "      <td>tested_positive</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>10.0</td>\n",
       "      <td>139.0</td>\n",
       "      <td>80.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>27.1</td>\n",
       "      <td>1.441</td>\n",
       "      <td>57.0</td>\n",
       "      <td>tested_negative</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>1.0</td>\n",
       "      <td>189.0</td>\n",
       "      <td>60.0</td>\n",
       "      <td>23.0</td>\n",
       "      <td>846.0</td>\n",
       "      <td>30.1</td>\n",
       "      <td>0.398</td>\n",
       "      <td>59.0</td>\n",
       "      <td>tested_positive</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>5.0</td>\n",
       "      <td>166.0</td>\n",
       "      <td>72.0</td>\n",
       "      <td>19.0</td>\n",
       "      <td>175.0</td>\n",
       "      <td>25.8</td>\n",
       "      <td>0.587</td>\n",
       "      <td>51.0</td>\n",
       "      <td>tested_positive</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>7.0</td>\n",
       "      <td>100.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>30.0</td>\n",
       "      <td>0.484</td>\n",
       "      <td>32.0</td>\n",
       "      <td>tested_positive</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>0.0</td>\n",
       "      <td>118.0</td>\n",
       "      <td>84.0</td>\n",
       "      <td>47.0</td>\n",
       "      <td>230.0</td>\n",
       "      <td>45.8</td>\n",
       "      <td>0.551</td>\n",
       "      <td>31.0</td>\n",
       "      <td>tested_positive</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>7.0</td>\n",
       "      <td>107.0</td>\n",
       "      <td>74.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>29.6</td>\n",
       "      <td>0.254</td>\n",
       "      <td>31.0</td>\n",
       "      <td>tested_positive</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>1.0</td>\n",
       "      <td>103.0</td>\n",
       "      <td>30.0</td>\n",
       "      <td>38.0</td>\n",
       "      <td>83.0</td>\n",
       "      <td>43.3</td>\n",
       "      <td>0.183</td>\n",
       "      <td>33.0</td>\n",
       "      <td>tested_negative</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>1.0</td>\n",
       "      <td>115.0</td>\n",
       "      <td>70.0</td>\n",
       "      <td>30.0</td>\n",
       "      <td>96.0</td>\n",
       "      <td>34.6</td>\n",
       "      <td>0.529</td>\n",
       "      <td>32.0</td>\n",
       "      <td>tested_positive</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>3.0</td>\n",
       "      <td>126.0</td>\n",
       "      <td>88.0</td>\n",
       "      <td>41.0</td>\n",
       "      <td>235.0</td>\n",
       "      <td>39.3</td>\n",
       "      <td>0.704</td>\n",
       "      <td>27.0</td>\n",
       "      <td>tested_negative</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>8.0</td>\n",
       "      <td>99.0</td>\n",
       "      <td>84.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>35.4</td>\n",
       "      <td>0.388</td>\n",
       "      <td>50.0</td>\n",
       "      <td>tested_negative</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>7.0</td>\n",
       "      <td>196.0</td>\n",
       "      <td>90.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>39.8</td>\n",
       "      <td>0.451</td>\n",
       "      <td>41.0</td>\n",
       "      <td>tested_positive</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>9.0</td>\n",
       "      <td>119.0</td>\n",
       "      <td>80.0</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>29.0</td>\n",
       "      <td>0.263</td>\n",
       "      <td>29.0</td>\n",
       "      <td>tested_positive</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>11.0</td>\n",
       "      <td>143.0</td>\n",
       "      <td>94.0</td>\n",
       "      <td>33.0</td>\n",
       "      <td>146.0</td>\n",
       "      <td>36.6</td>\n",
       "      <td>0.254</td>\n",
       "      <td>51.0</td>\n",
       "      <td>tested_positive</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>10.0</td>\n",
       "      <td>125.0</td>\n",
       "      <td>70.0</td>\n",
       "      <td>26.0</td>\n",
       "      <td>115.0</td>\n",
       "      <td>31.1</td>\n",
       "      <td>0.205</td>\n",
       "      <td>41.0</td>\n",
       "      <td>tested_positive</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>7.0</td>\n",
       "      <td>147.0</td>\n",
       "      <td>76.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>39.4</td>\n",
       "      <td>0.257</td>\n",
       "      <td>43.0</td>\n",
       "      <td>tested_positive</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>1.0</td>\n",
       "      <td>97.0</td>\n",
       "      <td>66.0</td>\n",
       "      <td>15.0</td>\n",
       "      <td>140.0</td>\n",
       "      <td>23.2</td>\n",
       "      <td>0.487</td>\n",
       "      <td>22.0</td>\n",
       "      <td>tested_negative</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>13.0</td>\n",
       "      <td>145.0</td>\n",
       "      <td>82.0</td>\n",
       "      <td>19.0</td>\n",
       "      <td>110.0</td>\n",
       "      <td>22.2</td>\n",
       "      <td>0.245</td>\n",
       "      <td>57.0</td>\n",
       "      <td>tested_negative</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>5.0</td>\n",
       "      <td>117.0</td>\n",
       "      <td>92.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>34.1</td>\n",
       "      <td>0.337</td>\n",
       "      <td>38.0</td>\n",
       "      <td>tested_negative</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    preg   plas  pres  skin   insu  mass   pedi   age            class\n",
       "0    6.0  148.0  72.0  35.0    0.0  33.6  0.627  50.0  tested_positive\n",
       "1    1.0   85.0  66.0  29.0    0.0  26.6  0.351  31.0  tested_negative\n",
       "2    8.0  183.0  64.0   0.0    0.0  23.3  0.672  32.0  tested_positive\n",
       "3    1.0   89.0  66.0  23.0   94.0  28.1  0.167  21.0  tested_negative\n",
       "4    0.0  137.0  40.0  35.0  168.0  43.1  2.288  33.0  tested_positive\n",
       "5    5.0  116.0  74.0   0.0    0.0  25.6  0.201  30.0  tested_negative\n",
       "6    3.0   78.0  50.0  32.0   88.0  31.0  0.248  26.0  tested_positive\n",
       "7   10.0  115.0   0.0   0.0    0.0  35.3  0.134  29.0  tested_negative\n",
       "8    2.0  197.0  70.0  45.0  543.0  30.5  0.158  53.0  tested_positive\n",
       "9    8.0  125.0  96.0   0.0    0.0   0.0  0.232  54.0  tested_positive\n",
       "10   4.0  110.0  92.0   0.0    0.0  37.6  0.191  30.0  tested_negative\n",
       "11  10.0  168.0  74.0   0.0    0.0  38.0  0.537  34.0  tested_positive\n",
       "12  10.0  139.0  80.0   0.0    0.0  27.1  1.441  57.0  tested_negative\n",
       "13   1.0  189.0  60.0  23.0  846.0  30.1  0.398  59.0  tested_positive\n",
       "14   5.0  166.0  72.0  19.0  175.0  25.8  0.587  51.0  tested_positive\n",
       "15   7.0  100.0   0.0   0.0    0.0  30.0  0.484  32.0  tested_positive\n",
       "16   0.0  118.0  84.0  47.0  230.0  45.8  0.551  31.0  tested_positive\n",
       "17   7.0  107.0  74.0   0.0    0.0  29.6  0.254  31.0  tested_positive\n",
       "18   1.0  103.0  30.0  38.0   83.0  43.3  0.183  33.0  tested_negative\n",
       "19   1.0  115.0  70.0  30.0   96.0  34.6  0.529  32.0  tested_positive\n",
       "20   3.0  126.0  88.0  41.0  235.0  39.3  0.704  27.0  tested_negative\n",
       "21   8.0   99.0  84.0   0.0    0.0  35.4  0.388  50.0  tested_negative\n",
       "22   7.0  196.0  90.0   0.0    0.0  39.8  0.451  41.0  tested_positive\n",
       "23   9.0  119.0  80.0  35.0    0.0  29.0  0.263  29.0  tested_positive\n",
       "24  11.0  143.0  94.0  33.0  146.0  36.6  0.254  51.0  tested_positive\n",
       "25  10.0  125.0  70.0  26.0  115.0  31.1  0.205  41.0  tested_positive\n",
       "26   7.0  147.0  76.0   0.0    0.0  39.4  0.257  43.0  tested_positive\n",
       "27   1.0   97.0  66.0  15.0  140.0  23.2  0.487  22.0  tested_negative\n",
       "28  13.0  145.0  82.0  19.0  110.0  22.2  0.245  57.0  tested_negative\n",
       "29   5.0  117.0  92.0   0.0    0.0  34.1  0.337  38.0  tested_negative"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# read the ARFF file; loadarff returns a (records, metadata) tuple\n",
    "data = arff.loadarff('diabetes.arff')\n",
    "\n",
    "# build a dataframe from the records (first tuple element)\n",
    "df = pd.DataFrame(data[0])\n",
    "\n",
    "# the class label is loaded as a byte object: decode it into a utf-8 string\n",
    "df['class'] = df['class'].str.decode('utf-8')\n",
    "\n",
    "# display the first 30 rows\n",
    "df.head(30)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a first step, we could explore the dataset:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>preg</th>\n",
       "      <th>plas</th>\n",
       "      <th>pres</th>\n",
       "      <th>skin</th>\n",
       "      <th>insu</th>\n",
       "      <th>mass</th>\n",
       "      <th>pedi</th>\n",
       "      <th>age</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>768.000000</td>\n",
       "      <td>768.000000</td>\n",
       "      <td>768.000000</td>\n",
       "      <td>768.000000</td>\n",
       "      <td>768.000000</td>\n",
       "      <td>768.000000</td>\n",
       "      <td>768.000000</td>\n",
       "      <td>768.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>3.845052</td>\n",
       "      <td>120.894531</td>\n",
       "      <td>69.105469</td>\n",
       "      <td>20.536458</td>\n",
       "      <td>79.799479</td>\n",
       "      <td>31.992578</td>\n",
       "      <td>0.471876</td>\n",
       "      <td>33.240885</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>3.369578</td>\n",
       "      <td>31.972618</td>\n",
       "      <td>19.355807</td>\n",
       "      <td>15.952218</td>\n",
       "      <td>115.244002</td>\n",
       "      <td>7.884160</td>\n",
       "      <td>0.331329</td>\n",
       "      <td>11.760232</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.078000</td>\n",
       "      <td>21.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>99.000000</td>\n",
       "      <td>62.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>27.300000</td>\n",
       "      <td>0.243750</td>\n",
       "      <td>24.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>3.000000</td>\n",
       "      <td>117.000000</td>\n",
       "      <td>72.000000</td>\n",
       "      <td>23.000000</td>\n",
       "      <td>30.500000</td>\n",
       "      <td>32.000000</td>\n",
       "      <td>0.372500</td>\n",
       "      <td>29.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>6.000000</td>\n",
       "      <td>140.250000</td>\n",
       "      <td>80.000000</td>\n",
       "      <td>32.000000</td>\n",
       "      <td>127.250000</td>\n",
       "      <td>36.600000</td>\n",
       "      <td>0.626250</td>\n",
       "      <td>41.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>17.000000</td>\n",
       "      <td>199.000000</td>\n",
       "      <td>122.000000</td>\n",
       "      <td>99.000000</td>\n",
       "      <td>846.000000</td>\n",
       "      <td>67.100000</td>\n",
       "      <td>2.420000</td>\n",
       "      <td>81.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             preg        plas        pres        skin        insu        mass  \\\n",
       "count  768.000000  768.000000  768.000000  768.000000  768.000000  768.000000   \n",
       "mean     3.845052  120.894531   69.105469   20.536458   79.799479   31.992578   \n",
       "std      3.369578   31.972618   19.355807   15.952218  115.244002    7.884160   \n",
       "min      0.000000    0.000000    0.000000    0.000000    0.000000    0.000000   \n",
       "25%      1.000000   99.000000   62.000000    0.000000    0.000000   27.300000   \n",
       "50%      3.000000  117.000000   72.000000   23.000000   30.500000   32.000000   \n",
       "75%      6.000000  140.250000   80.000000   32.000000  127.250000   36.600000   \n",
       "max     17.000000  199.000000  122.000000   99.000000  846.000000   67.100000   \n",
       "\n",
       "             pedi         age  \n",
       "count  768.000000  768.000000  \n",
       "mean     0.471876   33.240885  \n",
       "std      0.331329   11.760232  \n",
       "min      0.078000   21.000000  \n",
       "25%      0.243750   24.000000  \n",
       "50%      0.372500   29.000000  \n",
       "75%      0.626250   41.000000  \n",
       "max      2.420000   81.000000  "
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Number of positive samples:'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "268"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'Number of negative samples:'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "500"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'Percentage of positive samples:'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "34.89583333333333"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# class balance: number of samples per label\n",
    "\n",
    "nr_pos_samples = df.loc[df['class'] == 'tested_positive', 'class'].count()\n",
    "nr_neg_samples = df.loc[df['class'] == 'tested_negative', 'class'].count()\n",
    "\n",
    "display('Number of positive samples:')\n",
    "display(nr_pos_samples)\n",
    "\n",
    "display('Number of negative samples:')\n",
    "display(nr_neg_samples)\n",
    "\n",
    "# share of positive samples among all labelled rows, in percent\n",
    "display('Percentage of positive samples:')\n",
    "display((nr_pos_samples / df['class'].count()) * 100)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Preprocessing: Are there missing values? Do you need to transform data (e.g. dates, ...)?\n",
    "\n",
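    "In this case, we mark zero values as missing values, since we assume that a value of exactly 0 is not a valid measurement. Note that this assumption is questionable for `preg` (presumably the number of pregnancies), where 0 is a plausible value, so treating its zeros as missing should be double-checked."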
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>preg</th>\n",
       "      <th>plas</th>\n",
       "      <th>pres</th>\n",
       "      <th>skin</th>\n",
       "      <th>insu</th>\n",
       "      <th>mass</th>\n",
       "      <th>pedi</th>\n",
       "      <th>age</th>\n",
       "      <th>class</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>6.0</td>\n",
       "      <td>148.0</td>\n",
       "      <td>72.0</td>\n",
       "      <td>35.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>33.6</td>\n",
       "      <td>0.627</td>\n",
       "      <td>50.0</td>\n",
       "      <td>tested_positive</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1.0</td>\n",
       "      <td>85.0</td>\n",
       "      <td>66.0</td>\n",
       "      <td>29.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>26.6</td>\n",
       "      <td>0.351</td>\n",
       "      <td>31.0</td>\n",
       "      <td>tested_negative</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>8.0</td>\n",
       "      <td>183.0</td>\n",
       "      <td>64.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>23.3</td>\n",
       "      <td>0.672</td>\n",
       "      <td>32.0</td>\n",
       "      <td>tested_positive</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1.0</td>\n",
       "      <td>89.0</td>\n",
       "      <td>66.0</td>\n",
       "      <td>23.0</td>\n",
       "      <td>94.0</td>\n",
       "      <td>28.1</td>\n",
       "      <td>0.167</td>\n",
       "      <td>21.0</td>\n",
       "      <td>tested_negative</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>NaN</td>\n",
       "      <td>137.0</td>\n",
       "      <td>40.0</td>\n",
       "      <td>35.0</td>\n",
       "      <td>168.0</td>\n",
       "      <td>43.1</td>\n",
       "      <td>2.288</td>\n",
       "      <td>33.0</td>\n",
       "      <td>tested_positive</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>5.0</td>\n",
       "      <td>116.0</td>\n",
       "      <td>74.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>25.6</td>\n",
       "      <td>0.201</td>\n",
       "      <td>30.0</td>\n",
       "      <td>tested_negative</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>3.0</td>\n",
       "      <td>78.0</td>\n",
       "      <td>50.0</td>\n",
       "      <td>32.0</td>\n",
       "      <td>88.0</td>\n",
       "      <td>31.0</td>\n",
       "      <td>0.248</td>\n",
       "      <td>26.0</td>\n",
       "      <td>tested_positive</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>10.0</td>\n",
       "      <td>115.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>35.3</td>\n",
       "      <td>0.134</td>\n",
       "      <td>29.0</td>\n",
       "      <td>tested_negative</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>2.0</td>\n",
       "      <td>197.0</td>\n",
       "      <td>70.0</td>\n",
       "      <td>45.0</td>\n",
       "      <td>543.0</td>\n",
       "      <td>30.5</td>\n",
       "      <td>0.158</td>\n",
       "      <td>53.0</td>\n",
       "      <td>tested_positive</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>8.0</td>\n",
       "      <td>125.0</td>\n",
       "      <td>96.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.232</td>\n",
       "      <td>54.0</td>\n",
       "      <td>tested_positive</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>4.0</td>\n",
       "      <td>110.0</td>\n",
       "      <td>92.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>37.6</td>\n",
       "      <td>0.191</td>\n",
       "      <td>30.0</td>\n",
       "      <td>tested_negative</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>10.0</td>\n",
       "      <td>168.0</td>\n",
       "      <td>74.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>38.0</td>\n",
       "      <td>0.537</td>\n",
       "      <td>34.0</td>\n",
       "      <td>tested_positive</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>10.0</td>\n",
       "      <td>139.0</td>\n",
       "      <td>80.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>27.1</td>\n",
       "      <td>1.441</td>\n",
       "      <td>57.0</td>\n",
       "      <td>tested_negative</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>1.0</td>\n",
       "      <td>189.0</td>\n",
       "      <td>60.0</td>\n",
       "      <td>23.0</td>\n",
       "      <td>846.0</td>\n",
       "      <td>30.1</td>\n",
       "      <td>0.398</td>\n",
       "      <td>59.0</td>\n",
       "      <td>tested_positive</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>5.0</td>\n",
       "      <td>166.0</td>\n",
       "      <td>72.0</td>\n",
       "      <td>19.0</td>\n",
       "      <td>175.0</td>\n",
       "      <td>25.8</td>\n",
       "      <td>0.587</td>\n",
       "      <td>51.0</td>\n",
       "      <td>tested_positive</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>7.0</td>\n",
       "      <td>100.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>30.0</td>\n",
       "      <td>0.484</td>\n",
       "      <td>32.0</td>\n",
       "      <td>tested_positive</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>NaN</td>\n",
       "      <td>118.0</td>\n",
       "      <td>84.0</td>\n",
       "      <td>47.0</td>\n",
       "      <td>230.0</td>\n",
       "      <td>45.8</td>\n",
       "      <td>0.551</td>\n",
       "      <td>31.0</td>\n",
       "      <td>tested_positive</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>7.0</td>\n",
       "      <td>107.0</td>\n",
       "      <td>74.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>29.6</td>\n",
       "      <td>0.254</td>\n",
       "      <td>31.0</td>\n",
       "      <td>tested_positive</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>1.0</td>\n",
       "      <td>103.0</td>\n",
       "      <td>30.0</td>\n",
       "      <td>38.0</td>\n",
       "      <td>83.0</td>\n",
       "      <td>43.3</td>\n",
       "      <td>0.183</td>\n",
       "      <td>33.0</td>\n",
       "      <td>tested_negative</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>1.0</td>\n",
       "      <td>115.0</td>\n",
       "      <td>70.0</td>\n",
       "      <td>30.0</td>\n",
       "      <td>96.0</td>\n",
       "      <td>34.6</td>\n",
       "      <td>0.529</td>\n",
       "      <td>32.0</td>\n",
       "      <td>tested_positive</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>3.0</td>\n",
       "      <td>126.0</td>\n",
       "      <td>88.0</td>\n",
       "      <td>41.0</td>\n",
       "      <td>235.0</td>\n",
       "      <td>39.3</td>\n",
       "      <td>0.704</td>\n",
       "      <td>27.0</td>\n",
       "      <td>tested_negative</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>8.0</td>\n",
       "      <td>99.0</td>\n",
       "      <td>84.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>35.4</td>\n",
       "      <td>0.388</td>\n",
       "      <td>50.0</td>\n",
       "      <td>tested_negative</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>7.0</td>\n",
       "      <td>196.0</td>\n",
       "      <td>90.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>39.8</td>\n",
       "      <td>0.451</td>\n",
       "      <td>41.0</td>\n",
       "      <td>tested_positive</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>9.0</td>\n",
       "      <td>119.0</td>\n",
       "      <td>80.0</td>\n",
       "      <td>35.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>29.0</td>\n",
       "      <td>0.263</td>\n",
       "      <td>29.0</td>\n",
       "      <td>tested_positive</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>11.0</td>\n",
       "      <td>143.0</td>\n",
       "      <td>94.0</td>\n",
       "      <td>33.0</td>\n",
       "      <td>146.0</td>\n",
       "      <td>36.6</td>\n",
       "      <td>0.254</td>\n",
       "      <td>51.0</td>\n",
       "      <td>tested_positive</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>10.0</td>\n",
       "      <td>125.0</td>\n",
       "      <td>70.0</td>\n",
       "      <td>26.0</td>\n",
       "      <td>115.0</td>\n",
       "      <td>31.1</td>\n",
       "      <td>0.205</td>\n",
       "      <td>41.0</td>\n",
       "      <td>tested_positive</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>7.0</td>\n",
       "      <td>147.0</td>\n",
       "      <td>76.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>39.4</td>\n",
       "      <td>0.257</td>\n",
       "      <td>43.0</td>\n",
       "      <td>tested_positive</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>1.0</td>\n",
       "      <td>97.0</td>\n",
       "      <td>66.0</td>\n",
       "      <td>15.0</td>\n",
       "      <td>140.0</td>\n",
       "      <td>23.2</td>\n",
       "      <td>0.487</td>\n",
       "      <td>22.0</td>\n",
       "      <td>tested_negative</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>13.0</td>\n",
       "      <td>145.0</td>\n",
       "      <td>82.0</td>\n",
       "      <td>19.0</td>\n",
       "      <td>110.0</td>\n",
       "      <td>22.2</td>\n",
       "      <td>0.245</td>\n",
       "      <td>57.0</td>\n",
       "      <td>tested_negative</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>5.0</td>\n",
       "      <td>117.0</td>\n",
       "      <td>92.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>34.1</td>\n",
       "      <td>0.337</td>\n",
       "      <td>38.0</td>\n",
       "      <td>tested_negative</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    preg   plas  pres  skin   insu  mass   pedi   age            class\n",
       "0    6.0  148.0  72.0  35.0    NaN  33.6  0.627  50.0  tested_positive\n",
       "1    1.0   85.0  66.0  29.0    NaN  26.6  0.351  31.0  tested_negative\n",
       "2    8.0  183.0  64.0   NaN    NaN  23.3  0.672  32.0  tested_positive\n",
       "3    1.0   89.0  66.0  23.0   94.0  28.1  0.167  21.0  tested_negative\n",
       "4    NaN  137.0  40.0  35.0  168.0  43.1  2.288  33.0  tested_positive\n",
       "5    5.0  116.0  74.0   NaN    NaN  25.6  0.201  30.0  tested_negative\n",
       "6    3.0   78.0  50.0  32.0   88.0  31.0  0.248  26.0  tested_positive\n",
       "7   10.0  115.0   NaN   NaN    NaN  35.3  0.134  29.0  tested_negative\n",
       "8    2.0  197.0  70.0  45.0  543.0  30.5  0.158  53.0  tested_positive\n",
       "9    8.0  125.0  96.0   NaN    NaN   NaN  0.232  54.0  tested_positive\n",
       "10   4.0  110.0  92.0   NaN    NaN  37.6  0.191  30.0  tested_negative\n",
       "11  10.0  168.0  74.0   NaN    NaN  38.0  0.537  34.0  tested_positive\n",
       "12  10.0  139.0  80.0   NaN    NaN  27.1  1.441  57.0  tested_negative\n",
       "13   1.0  189.0  60.0  23.0  846.0  30.1  0.398  59.0  tested_positive\n",
       "14   5.0  166.0  72.0  19.0  175.0  25.8  0.587  51.0  tested_positive\n",
       "15   7.0  100.0   NaN   NaN    NaN  30.0  0.484  32.0  tested_positive\n",
       "16   NaN  118.0  84.0  47.0  230.0  45.8  0.551  31.0  tested_positive\n",
       "17   7.0  107.0  74.0   NaN    NaN  29.6  0.254  31.0  tested_positive\n",
       "18   1.0  103.0  30.0  38.0   83.0  43.3  0.183  33.0  tested_negative\n",
       "19   1.0  115.0  70.0  30.0   96.0  34.6  0.529  32.0  tested_positive\n",
       "20   3.0  126.0  88.0  41.0  235.0  39.3  0.704  27.0  tested_negative\n",
       "21   8.0   99.0  84.0   NaN    NaN  35.4  0.388  50.0  tested_negative\n",
       "22   7.0  196.0  90.0   NaN    NaN  39.8  0.451  41.0  tested_positive\n",
       "23   9.0  119.0  80.0  35.0    NaN  29.0  0.263  29.0  tested_positive\n",
       "24  11.0  143.0  94.0  33.0  146.0  36.6  0.254  51.0  tested_positive\n",
       "25  10.0  125.0  70.0  26.0  115.0  31.1  0.205  41.0  tested_positive\n",
       "26   7.0  147.0  76.0   NaN    NaN  39.4  0.257  43.0  tested_positive\n",
       "27   1.0   97.0  66.0  15.0  140.0  23.2  0.487  22.0  tested_negative\n",
       "28  13.0  145.0  82.0  19.0  110.0  22.2  0.245  57.0  tested_negative\n",
       "29   5.0  117.0  92.0   NaN    NaN  34.1  0.337  38.0  tested_negative"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# mark missing values as NaN\n",
    "df = df.replace(0, np.NaN)\n",
    "df.head(30)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now we want to run machine learning, and we define several classifier pipelines that we want to try out.\n",
    "\n",
    "The classifier pipeline have various possibilities for configuring. Here we try out:\n",
    "* Imputation: This preprocessing step replaces missing values with a value, in this case with the mean. This usually improves the performance of the classifier. There are several possibilities to deal with missing values. https://machinelearningmastery.com/handle-missing-data-python/\n",
    "* Scaling: The standard scales normalizes the features, such that each feature's values are distributed between -1 and +1. This helps to prevent that a certain features is weighted more only because it has larger values than another one.\n",
    "* Feature Selection: only use a subset of the features compared to all\n",
    "* Classifiers: try out different classifiers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# define classifiers and parameters to evaluate\n",
    "steps_decision_forest = [\n",
    "    (\"imputation\", preprocessing.Imputer(missing_values='NaN', strategy='mean', axis=0)),\n",
    "    (\"scaling\", preprocessing.StandardScaler()),\n",
    "    (\"feature_selection\", SelectKBest()),\n",
    "    (\"classifier\", RandomForestClassifier(n_estimators = 500))\n",
    "]\n",
    "\n",
    "parameters_decision_forest = dict(\n",
    "    feature_selection__k = [2, 4, 6, 'all'],\n",
    "    classifier__n_estimators = [100, 250, 500, 1000],\n",
    "    classifier__max_features = [\"sqrt\", \"log2\", 0.25, 0.5],\n",
    "    classifier__min_samples_leaf = [5, 10, 30, 50]\n",
    ")\n",
    "\n",
    "steps_gradient_boost = [\n",
    "    (\"imputation\", preprocessing.Imputer(missing_values='NaN', strategy='mean', axis=0)),\n",
    "    (\"scaling\", preprocessing.StandardScaler()),\n",
    "    (\"feature_selection\", SelectKBest()),\n",
    "    (\"classifier\", GradientBoostingClassifier(n_estimators = 500))\n",
    "]\n",
    "\n",
    "parameters_gradient_boost = dict(\n",
    "    feature_selection__k = [2, 4, 6, 'all'],\n",
    "    classifier__n_estimators = [100, 250, 500, 1000],\n",
    "    classifier__loss = [\"deviance\", \"exponential\"],\n",
    "    classifier__learning_rate = [0.05, 0.1, 0.15],\n",
    "    classifier__max_features = [\"sqrt\", \"log2\", 0.25, 0.5],\n",
    "    classifier__min_samples_leaf = [5, 10, 30, 50]\n",
    ")\n",
    "\n",
    "steps_svm = [\n",
    "    (\"imputation\", preprocessing.Imputer(missing_values='NaN', strategy='mean', axis=0)),\n",
    "    (\"scaling\", preprocessing.StandardScaler()),\n",
    "    (\"feature_selection\", SelectKBest(k=30)),\n",
    "    (\"classifier\", svm.SVC(kernel=\"rbf\"))\n",
    "]\n",
    "\n",
    "parameters_svm = dict(\n",
    "    feature_selection__k = [2, 4, 6, 'all'],\n",
    "    classifier__kernel = [\"rbf\", \"linear\", \"poly\", \"sigmoid\"],\n",
    "    classifier__C = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],\n",
    "    classifier__gamma = [0.01, 0.02, 0.03, 0.04, 0.05, 0.10, 0.2, 0.3, 0.4, 0.5]\n",
    ")\n",
    "\n",
    "steps_neural_network = [\n",
    "    (\"imputation\", preprocessing.Imputer(missing_values='NaN', strategy='mean', axis=0)),\n",
    "    (\"scaling\", preprocessing.StandardScaler()),\n",
    "    (\"feature_selection\", SelectKBest()),\n",
    "    (\"classifier\", MLPClassifier(solver=\"lbfgs\"))\n",
    "]\n",
    "\n",
    "parameters_neural_network = dict(\n",
    "    feature_selection__k = [2, 4, 6, 8, 'all'],\n",
    "    classifier__solver = [\"lbfgs\", \"sgd\"],\n",
    "    classifier__alpha = [0.00001, 0.0001, 0.001, 0.01, 1],\n",
    "    classifier__activation = [\"identity\", \"logistic\", \"tanh\", \"relu\"],\n",
    "    classifier__learning_rate = [\"constant\", \"invscaling\", \"adaptive\"]\n",
    ")\n",
    "\n",
    "steps_naive_bayes = [\n",
    "    (\"imputation\", preprocessing.Imputer(missing_values='NaN', strategy='mean', axis=0)),\n",
    "    (\"scaling\", preprocessing.StandardScaler()),\n",
    "    (\"feature_selection\", SelectKBest(k=30)),\n",
    "    (\"classifier\", GaussianNB())\n",
    "]\n",
    "\n",
    "parameters_naive_bayes =  dict(\n",
    "    feature_selection__k = [2, 4, 6, 8, 'all']\n",
    ")\n",
    "\n",
    "# all our classifiers with parameters\n",
    "pipelines = [(steps_decision_forest, parameters_decision_forest),\n",
    "             (steps_gradient_boost, parameters_gradient_boost),\n",
    "             (steps_svm, parameters_svm),\n",
    "             (steps_neural_network, parameters_neural_network),\n",
    "             (steps_naive_bayes, parameters_naive_bayes)]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Predict the class value for each pipeline and print out the results:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Classifier: '"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n",
       "            max_depth=None, max_features='auto', max_leaf_nodes=None,\n",
       "            min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "            min_samples_leaf=1, min_samples_split=2,\n",
       "            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,\n",
       "            oob_score=False, random_state=None, verbose=0,\n",
       "            warm_start=False)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'Starting Trial 0'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Best parameters:  {'feature_selection__k': 4, 'classifier__n_estimators': 1000, 'classifier__min_samples_leaf': 50, 'classifier__max_features': 'log2'}\n",
      "Best parameters:  {'feature_selection__k': 2, 'classifier__n_estimators': 500, 'classifier__min_samples_leaf': 30, 'classifier__max_features': 'log2'}\n",
      "Best parameters:  {'feature_selection__k': 4, 'classifier__n_estimators': 1000, 'classifier__min_samples_leaf': 50, 'classifier__max_features': 'log2'}\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'Starting Trial 1'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Best parameters:  {'feature_selection__k': 'all', 'classifier__n_estimators': 100, 'classifier__min_samples_leaf': 30, 'classifier__max_features': 0.5}\n",
      "Best parameters:  {'feature_selection__k': 2, 'classifier__n_estimators': 250, 'classifier__min_samples_leaf': 10, 'classifier__max_features': 0.25}\n",
      "Best parameters:  {'feature_selection__k': 'all', 'classifier__n_estimators': 250, 'classifier__min_samples_leaf': 10, 'classifier__max_features': 'sqrt'}\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'Starting Trial 2'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Best parameters:  {'feature_selection__k': 6, 'classifier__n_estimators': 500, 'classifier__min_samples_leaf': 5, 'classifier__max_features': 0.25}\n",
      "Best parameters:  {'feature_selection__k': 6, 'classifier__n_estimators': 250, 'classifier__min_samples_leaf': 10, 'classifier__max_features': 0.25}\n",
      "Best parameters:  {'feature_selection__k': 'all', 'classifier__n_estimators': 1000, 'classifier__min_samples_leaf': 5, 'classifier__max_features': 'sqrt'}\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'Scores of classification:'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "\"Classifier: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\\n            max_depth=None, max_features='auto', max_leaf_nodes=None,\\n            min_impurity_decrease=0.0, min_impurity_split=None,\\n            min_samples_leaf=1, min_samples_split=2,\\n            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,\\n            oob_score=False, random_state=None, verbose=0,\\n            warm_start=False)\""
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'Accuracy: 0.7530381944444443 +/- 0.006917264518450201'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'Weighted Precision: 0.7461195899106623 +/- 0.007625903647639041'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'Weighted Recall: 0.7530381944444443 +/- 0.006917264518450201'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'Precision Tested Positive: 0.6922865353037767 +/- 0.012454712762053214'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'Precision Tested Negative: 0.7749741071799529 +/- 0.005430011754841385'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'Recall Tested Positive: 0.5261194029850746 +/- 0.013279948074665806'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'Recall Tested Negative: 0.8746666666666667 +/- 0.005249338582674546'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'Confusion Matrix: '"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "array([[437.33333333,  62.66666667],\n",
       "       [127.        , 141.        ]])"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'Classifier: '"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "GradientBoostingClassifier(criterion='friedman_mse', init=None,\n",
       "              learning_rate=0.1, loss='deviance', max_depth=3,\n",
       "              max_features=None, max_leaf_nodes=None,\n",
       "              min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "              min_samples_leaf=1, min_samples_split=2,\n",
       "              min_weight_fraction_leaf=0.0, n_estimators=500,\n",
       "              presort='auto', random_state=None, subsample=1.0, verbose=0,\n",
       "              warm_start=False)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'Starting Trial 0'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Best parameters:  {'feature_selection__k': 'all', 'classifier__n_estimators': 100, 'classifier__min_samples_leaf': 10, 'classifier__max_features': 0.25, 'classifier__loss': 'exponential', 'classifier__learning_rate': 0.1}\n",
      "Best parameters:  {'feature_selection__k': 'all', 'classifier__n_estimators': 100, 'classifier__min_samples_leaf': 50, 'classifier__max_features': 'log2', 'classifier__loss': 'exponential', 'classifier__learning_rate': 0.15}\n",
      "Best parameters:  {'feature_selection__k': 'all', 'classifier__n_estimators': 100, 'classifier__min_samples_leaf': 30, 'classifier__max_features': 0.25, 'classifier__loss': 'deviance', 'classifier__learning_rate': 0.15}\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'Starting Trial 1'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Best parameters:  {'feature_selection__k': 6, 'classifier__n_estimators': 100, 'classifier__min_samples_leaf': 30, 'classifier__max_features': 0.5, 'classifier__loss': 'exponential', 'classifier__learning_rate': 0.15}\n",
      "Best parameters:  {'feature_selection__k': 'all', 'classifier__n_estimators': 250, 'classifier__min_samples_leaf': 5, 'classifier__max_features': 0.25, 'classifier__loss': 'deviance', 'classifier__learning_rate': 0.05}\n",
      "Best parameters:  {'feature_selection__k': 'all', 'classifier__n_estimators': 100, 'classifier__min_samples_leaf': 30, 'classifier__max_features': 0.5, 'classifier__loss': 'deviance', 'classifier__learning_rate': 0.1}\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'Starting Trial 2'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Best parameters:  {'feature_selection__k': 'all', 'classifier__n_estimators': 100, 'classifier__min_samples_leaf': 50, 'classifier__max_features': 0.5, 'classifier__loss': 'deviance', 'classifier__learning_rate': 0.1}\n",
      "Best parameters:  {'feature_selection__k': 6, 'classifier__n_estimators': 100, 'classifier__min_samples_leaf': 10, 'classifier__max_features': 'log2', 'classifier__loss': 'exponential', 'classifier__learning_rate': 0.05}\n",
      "Best parameters:  {'feature_selection__k': 4, 'classifier__n_estimators': 100, 'classifier__min_samples_leaf': 30, 'classifier__max_features': 'log2', 'classifier__loss': 'deviance', 'classifier__learning_rate': 0.15}\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'Scores of classification:'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "\"Classifier: GradientBoostingClassifier(criterion='friedman_mse', init=None,\\n              learning_rate=0.1, loss='deviance', max_depth=3,\\n              max_features=None, max_leaf_nodes=None,\\n              min_impurity_decrease=0.0, min_impurity_split=None,\\n              min_samples_leaf=1, min_samples_split=2,\\n              min_weight_fraction_leaf=0.0, n_estimators=500,\\n              presort='auto', random_state=None, subsample=1.0, verbose=0,\\n              warm_start=False)\""
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'Accuracy: 0.7565104166666666 +/- 0.00637887953849786'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'Weighted Precision: 0.7518041356776776 +/- 0.005900446495549101'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'Weighted Recall: 0.7565104166666666 +/- 0.00637887953849786'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'Precision Tested Positive: 0.6703557829662151 +/- 0.015150367018807323'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'Precision Tested Negative: 0.7954604527310214 +/- 0.001451126541547686'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'Recall Tested Positive: 0.595771144278607 +/- 0.00465380271986807'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'Recall Tested Negative: 0.8426666666666667 +/- 0.011585431464655188'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'Confusion Matrix: '"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "array([[421.33333333,  78.66666667],\n",
       "       [108.33333333, 159.66666667]])"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'Classifier: '"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,\n",
       "  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',\n",
       "  max_iter=-1, probability=False, random_state=None, shrinking=True,\n",
       "  tol=0.001, verbose=False)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'Starting Trial 0'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Best parameters:  {'feature_selection__k': 'all', 'classifier__kernel': 'sigmoid', 'classifier__gamma': 0.01, 'classifier__C': 1}\n",
      "Best parameters:  {'feature_selection__k': 6, 'classifier__kernel': 'linear', 'classifier__gamma': 0.5, 'classifier__C': 5}\n",
      "Best parameters:  {'feature_selection__k': 2, 'classifier__kernel': 'linear', 'classifier__gamma': 0.02, 'classifier__C': 3}\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'Starting Trial 1'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Best parameters:  {'feature_selection__k': 2, 'classifier__kernel': 'rbf', 'classifier__gamma': 0.04, 'classifier__C': 9}\n",
      "Best parameters:  {'feature_selection__k': 6, 'classifier__kernel': 'sigmoid', 'classifier__gamma': 0.01, 'classifier__C': 1}\n",
      "Best parameters:  {'feature_selection__k': 6, 'classifier__kernel': 'rbf', 'classifier__gamma': 0.05, 'classifier__C': 10}\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'Starting Trial 2'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Best parameters:  {'feature_selection__k': 4, 'classifier__kernel': 'sigmoid', 'classifier__gamma': 0.05, 'classifier__C': 1}\n",
      "Best parameters:  {'feature_selection__k': 'all', 'classifier__kernel': 'rbf', 'classifier__gamma': 0.01, 'classifier__C': 7}\n",
      "Best parameters:  {'feature_selection__k': 2, 'classifier__kernel': 'linear', 'classifier__gamma': 0.01, 'classifier__C': 8}\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'Scores of classification:'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "\"Classifier: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,\\n  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',\\n  max_iter=-1, probability=False, random_state=None, shrinking=True,\\n  tol=0.001, verbose=False)\""
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'Accuracy: 0.7625868055555555 +/- 0.006495932963149201'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'Weighted Precision: 0.7571512667777728 +/- 0.006957592093905651'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'Weighted Recall: 0.7625868055555555 +/- 0.006495932963149201'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'Precision Tested Positive: 0.7202504897449863 +/- 0.00796907915109877'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'Precision Tested Negative: 0.7769300832673465 +/- 0.006648553094210303'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'Recall Tested Positive: 0.5223880597014925 +/- 0.018279774199874473'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'Recall Tested Negative: 0.8913333333333333 +/- 0.0018856180831641283'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'Confusion Matrix: '"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "array([[445.66666667,  54.33333333],\n",
       "       [128.        , 140.        ]])"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'Classifier: '"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,\n",
       "       beta_2=0.999, early_stopping=False, epsilon=1e-08,\n",
       "       hidden_layer_sizes=(100,), learning_rate='constant',\n",
       "       learning_rate_init=0.001, max_iter=200, momentum=0.9,\n",
       "       nesterovs_momentum=True, power_t=0.5, random_state=None,\n",
       "       shuffle=True, solver='lbfgs', tol=0.0001, validation_fraction=0.1,\n",
       "       verbose=False, warm_start=False)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'Starting Trial 0'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Best parameters:  {'feature_selection__k': 'all', 'classifier__solver': 'sgd', 'classifier__learning_rate': 'constant', 'classifier__alpha': 0.01, 'classifier__activation': 'tanh'}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\manuz\\Miniconda3\\lib\\site-packages\\sklearn\\neural_network\\multilayer_perceptron.py:564: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.\n",
      "  % self.max_iter, ConvergenceWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Best parameters:  {'feature_selection__k': 6, 'classifier__solver': 'lbfgs', 'classifier__learning_rate': 'constant', 'classifier__alpha': 0.0001, 'classifier__activation': 'identity'}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\manuz\\Miniconda3\\lib\\site-packages\\sklearn\\neural_network\\multilayer_perceptron.py:564: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.\n",
      "  % self.max_iter, ConvergenceWarning)\n",
      "C:\\Users\\manuz\\Miniconda3\\lib\\site-packages\\sklearn\\neural_network\\multilayer_perceptron.py:564: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.\n",
      "  % self.max_iter, ConvergenceWarning)\n",
      "C:\\Users\\manuz\\Miniconda3\\lib\\site-packages\\sklearn\\neural_network\\multilayer_perceptron.py:564: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.\n",
      "  % self.max_iter, ConvergenceWarning)\n",
      "C:\\Users\\manuz\\Miniconda3\\lib\\site-packages\\sklearn\\neural_network\\multilayer_perceptron.py:564: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.\n",
      "  % self.max_iter, ConvergenceWarning)\n",
      "C:\\Users\\manuz\\Miniconda3\\lib\\site-packages\\sklearn\\neural_network\\multilayer_perceptron.py:564: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.\n",
      "  % self.max_iter, ConvergenceWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Best parameters:  {'feature_selection__k': 'all', 'classifier__solver': 'lbfgs', 'classifier__learning_rate': 'invscaling', 'classifier__alpha': 1, 'classifier__activation': 'logistic'}\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'Starting Trial 1'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\manuz\\Miniconda3\\lib\\site-packages\\sklearn\\neural_network\\multilayer_perceptron.py:564: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.\n",
      "  % self.max_iter, ConvergenceWarning)\n",
      "C:\\Users\\manuz\\Miniconda3\\lib\\site-packages\\sklearn\\neural_network\\multilayer_perceptron.py:564: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.\n",
      "  % self.max_iter, ConvergenceWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Best parameters:  {'feature_selection__k': 'all', 'classifier__solver': 'sgd', 'classifier__learning_rate': 'adaptive', 'classifier__alpha': 1e-05, 'classifier__activation': 'identity'}\n",
      "Best parameters:  {'feature_selection__k': 'all', 'classifier__solver': 'lbfgs', 'classifier__learning_rate': 'adaptive', 'classifier__alpha': 1, 'classifier__activation': 'identity'}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\manuz\\Miniconda3\\lib\\site-packages\\sklearn\\neural_network\\multilayer_perceptron.py:564: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.\n",
      "  % self.max_iter, ConvergenceWarning)\n",
      "C:\\Users\\manuz\\Miniconda3\\lib\\site-packages\\sklearn\\neural_network\\multilayer_perceptron.py:564: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.\n",
      "  % self.max_iter, ConvergenceWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Best parameters:  {'feature_selection__k': 4, 'classifier__solver': 'sgd', 'classifier__learning_rate': 'constant', 'classifier__alpha': 0.001, 'classifier__activation': 'relu'}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\manuz\\Miniconda3\\lib\\site-packages\\sklearn\\neural_network\\multilayer_perceptron.py:564: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.\n",
      "  % self.max_iter, ConvergenceWarning)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'Starting Trial 2'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Best parameters:  {'feature_selection__k': 6, 'classifier__solver': 'lbfgs', 'classifier__learning_rate': 'constant', 'classifier__alpha': 1, 'classifier__activation': 'identity'}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\manuz\\Miniconda3\\lib\\site-packages\\sklearn\\neural_network\\multilayer_perceptron.py:564: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.\n",
      "  % self.max_iter, ConvergenceWarning)\n",
      "C:\\Users\\manuz\\Miniconda3\\lib\\site-packages\\sklearn\\neural_network\\multilayer_perceptron.py:564: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.\n",
      "  % self.max_iter, ConvergenceWarning)\n",
      "C:\\Users\\manuz\\Miniconda3\\lib\\site-packages\\sklearn\\neural_network\\multilayer_perceptron.py:564: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.\n",
      "  % self.max_iter, ConvergenceWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Best parameters:  {'feature_selection__k': 'all', 'classifier__solver': 'lbfgs', 'classifier__learning_rate': 'adaptive', 'classifier__alpha': 1, 'classifier__activation': 'identity'}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\manuz\\Miniconda3\\lib\\site-packages\\sklearn\\neural_network\\multilayer_perceptron.py:564: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.\n",
      "  % self.max_iter, ConvergenceWarning)\n",
      "C:\\Users\\manuz\\Miniconda3\\lib\\site-packages\\sklearn\\neural_network\\multilayer_perceptron.py:564: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.\n",
      "  % self.max_iter, ConvergenceWarning)\n",
      "C:\\Users\\manuz\\Miniconda3\\lib\\site-packages\\sklearn\\neural_network\\multilayer_perceptron.py:564: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.\n",
      "  % self.max_iter, ConvergenceWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Best parameters:  {'feature_selection__k': 2, 'classifier__solver': 'sgd', 'classifier__learning_rate': 'adaptive', 'classifier__alpha': 1e-05, 'classifier__activation': 'relu'}\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'Scores of classification:'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "\"Classifier: MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,\\n       beta_2=0.999, early_stopping=False, epsilon=1e-08,\\n       hidden_layer_sizes=(100,), learning_rate='constant',\\n       learning_rate_init=0.001, max_iter=200, momentum=0.9,\\n       nesterovs_momentum=True, power_t=0.5, random_state=None,\\n       shuffle=True, solver='lbfgs', tol=0.0001, validation_fraction=0.1,\\n       verbose=False, warm_start=False)\""
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'Accuracy: 0.7673611111111112 +/- 0.0070787788325955385'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'Weighted Precision: 0.7618461993180677 +/- 0.0075941082106042125'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'Weighted Recall: 0.7673611111111112 +/- 0.0070787788325955385'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'Precision Tested Positive: 0.6999569408494454 +/- 0.010800002546392365'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'Precision Tested Negative: 0.7950188418572494 +/- 0.006475560416198352'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'Recall Tested Positive: 0.5833333333333334 +/- 0.015634085932806636'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'Recall Tested Negative: 0.866 +/- 0.00489897948556636'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'Confusion Matrix: '"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "array([[433.        ,  67.        ],\n",
       "       [111.66666667, 156.33333333]])"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'Classifier: '"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "GaussianNB(priors=None)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'Starting Trial 0'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Best parameters:  {'feature_selection__k': 4}\n",
      "Best parameters:  {'feature_selection__k': 2}\n",
      "Best parameters:  {'feature_selection__k': 4}\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'Starting Trial 1'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Best parameters:  {'feature_selection__k': 8}\n",
      "Best parameters:  {'feature_selection__k': 2}\n",
      "Best parameters:  {'feature_selection__k': 2}\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'Starting Trial 2'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Best parameters:  {'feature_selection__k': 2}\n",
      "Best parameters:  {'feature_selection__k': 4}\n",
      "Best parameters:  {'feature_selection__k': 2}\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'Scores of classification:'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'Classifier: GaussianNB(priors=None)'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'Accuracy: 0.75 +/- 0.006639348324990605'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'Weighted Precision: 0.7437151410614428 +/- 0.006537020314428177'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'Weighted Recall: 0.75 +/- 0.006639348324990605'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'Precision Tested Positive: 0.6689852488588031 +/- 0.014896082027820307'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'Precision Tested Negative: 0.7837703632820575 +/- 0.0024749828971138833'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'Recall Tested Positive: 0.5621890547263683 +/- 0.0046538027198680795'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'Recall Tested Negative: 0.8506666666666667 +/- 0.010370899457402705'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'Confusion Matrix: '"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "array([[425.33333333,  74.66666667],\n",
       "       [117.33333333, 150.66666667]])"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# split into training/test set\n",
    "stratifiedFolds = StratifiedKFold(n_splits=3, shuffle=True)\n",
    "\n",
    "X = df.drop(columns=[\"class\"]).values\n",
    "y = df[\"class\"].values\n",
    "\n",
    "for steps in pipelines:\n",
    "    pipeline = Pipeline(steps[0])\n",
    "    parameters = steps[1]\n",
    "    display(\"Classifier: \", pipeline.named_steps.classifier)\n",
    "\n",
    "    numTrials = 3\n",
    "    accuracies = []\n",
    "    weighted_precisions = []\n",
    "    weighted_recalls = []\n",
    "    precisions_positive = []\n",
    "    precisions_negative = []\n",
    "    recalls_positive = []\n",
    "    recalls_negative = []\n",
    "\n",
    "    totals_originalclass = []\n",
    "    totals_predictedclass = []  \n",
    "\n",
    "    for i in range(numTrials):\n",
    "        display(\"Starting Trial \" + str(i))\n",
    "        originalclass = []\n",
    "        predictedclass = []\n",
    "\n",
    "        for train_index, test_index in stratifiedFolds.split(X, y):\n",
    "            # Need to specify the positive label here as the default metric auc_roc only works for binary label.\n",
    "            #def custom_auc(ground_truth, predictions):\n",
    "            #    fpr, tpr, _ = roc_curve(ground_truth, predictions[:, 1], pos_label='Switch')\n",
    "            #    return auc(fpr, tpr)\n",
    "\n",
    "            #roc_auc_scorer = make_scorer(custom_auc, greater_is_better=True, needs_proba=True)\n",
    "\n",
    "            randomSearch = RandomizedSearchCV(pipeline, parameters, cv=5, scoring=None, n_iter=5)\n",
    "            randomSearch.fit(X[train_index], y=y[train_index])\n",
    "            print(\"Best parameters: \", randomSearch.best_params_)\n",
    "\n",
    "            y_pred = randomSearch.predict(X[test_index])\n",
    "            originalclass.extend(y[test_index])\n",
    "            predictedclass.extend(y_pred)\n",
    "            totals_originalclass.extend(y[test_index])\n",
    "            totals_predictedclass.extend(y_pred)\n",
    "\n",
    "        # after each complete trial, store results\n",
    "        trial_accuracy = accuracy_score(originalclass, predictedclass)\n",
    "        accuracies.extend([trial_accuracy])\n",
    "\n",
    "        trial_weighted_precision = precision_score(originalclass, predictedclass, average=\"weighted\")\n",
    "        weighted_precisions.extend([trial_weighted_precision])\n",
    "        trial_precision_positive = precision_score(originalclass, predictedclass, average=\"binary\", pos_label=\"tested_positive\")\n",
    "        precisions_positive.extend([trial_precision_positive])\n",
    "        trial_precision_negative = precision_score(originalclass, predictedclass, average=\"binary\", pos_label=\"tested_negative\")\n",
    "        precisions_negative.extend([trial_precision_negative])\n",
    "\n",
    "        trial_weighted_recall = recall_score(originalclass, predictedclass, average=\"weighted\")\n",
    "        weighted_recalls.extend([trial_weighted_recall])\n",
    "        trial_recall_positive = recall_score(originalclass, predictedclass, average=\"binary\", pos_label=\"tested_positive\")\n",
    "        recalls_positive.extend([trial_recall_positive])\n",
    "        trial_recall_negative = recall_score(originalclass, predictedclass, average=\"binary\", pos_label=\"tested_negative\")\n",
    "        recalls_negative.extend([trial_recall_negative])\n",
    "\n",
    "    display('Scores of classification:')\n",
    "    display(\"Classifier: \" + str(pipeline.named_steps.classifier))\n",
    "    display(\"Accuracy: \" + str(np.mean(accuracies)) + \" +/- \" + str(np.std(accuracies)))\n",
    "    display(\"Weighted Precision: \" + str(np.mean(weighted_precisions)) + \" +/- \" + str(np.std(weighted_precisions)))\n",
    "    display(\"Weighted Recall: \" + str(np.mean(weighted_recalls)) + \" +/- \" + str(np.std(weighted_recalls)))\n",
    "    display(\"Precision Tested Positive: \" + str(np.mean(precisions_positive)) + \" +/- \" + str(np.std(precisions_positive)))\n",
    "    display(\"Precision Tested Negative: \" + str(np.mean(precisions_negative)) + \" +/- \" + str(np.std(precisions_negative)))\n",
    "    display(\"Recall Tested Positive: \" + str(np.mean(recalls_positive)) + \" +/- \" + str(np.std(recalls_positive)))\n",
    "    display(\"Recall Tested Negative: \" + str(np.mean(recalls_negative)) + \" +/- \" + str(np.std(recalls_negative)))\n",
    "    conf_matrix = confusion_matrix(totals_originalclass, totals_predictedclass) / numTrials\n",
    "    display(\"Confusion Matrix: \")\n",
    "    display(conf_matrix)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}