powers/Sundog-ML-Course: materials/NaiveBayes.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Naive Bayes (the easy way)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We'll cheat by using sklearn.naive_bayes to train a spam classifier! Most of the code is just loading our training data into a pandas DataFrame that we can play with:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import io\n",
    "import numpy\n",
    "import pandas as pd\n",
    "from pandas import DataFrame\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "\n",
    "def readFiles(path):\n",
    "    for root, dirnames, filenames in os.walk(path):\n",
    "        for filename in filenames:\n",
    "            path = os.path.join(root, filename)\n",
    "\n",
    "            inBody = False\n",
    "            lines = []\n",
    "            f = io.open(path, 'r', encoding='latin1')\n",
    "            for line in f:\n",
    "                if inBody:\n",
    "                    lines.append(line)\n",
    "                elif line == '\\n':\n",
    "                    inBody = True\n",
    "            f.close()\n",
    "            message = '\\n'.join(lines)\n",
    "            yield path, message\n",
    "\n",
    "\n",
    "def dataFrameFromDirectory(path, classification):\n",
    "    rows = []\n",
    "    index = []\n",
    "    for filename, message in readFiles(path):\n",
    "        rows.append({'message': message, 'class': classification})\n",
    "        index.append(filename)\n",
    "\n",
    "    return DataFrame(rows, index=index)\n",
    "\n",
    "data = DataFrame({'message': [], 'class': []})\n",
    "\n",
    "data = pd.concat([data, dataFrameFromDirectory(\"emails/spam\", \"spam\")]);\n",
    "data = pd.concat([data, dataFrameFromDirectory(\"emails/ham\", \"ham\")])\n",
    "\n",
    "#For Pandas 1.3:\n",
    "#data = data.append(dataFrameFromDirectory('emails/spam', 'spam'))\n",
    "#data = data.append(dataFrameFromDirectory('emails/ham', 'ham'))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's have a look at that DataFrame:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>message</th>\n",
       "      <th>class</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>emails/spam\\00001.7848dde101aa985090474a91ec93fcf0</th>\n",
       "      <td>&lt;!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Tr...</td>\n",
       "      <td>spam</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>emails/spam\\00002.d94f1b97e48ed3b553b3508d116e6a09</th>\n",
       "      <td>1) Fight The Risk of Cancer!\\n\\nhttp://www.adc...</td>\n",
       "      <td>spam</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>emails/spam\\00003.2ee33bc6eacdb11f38d052c44819ba6c</th>\n",
       "      <td>1) Fight The Risk of Cancer!\\n\\nhttp://www.adc...</td>\n",
       "      <td>spam</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>emails/spam\\00004.eac8de8d759b7e74154f142194282724</th>\n",
       "      <td>##############################################...</td>\n",
       "      <td>spam</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>emails/spam\\00005.57696a39d7d84318ce497886896bf90d</th>\n",
       "      <td>I thought you might like these:\\n\\n1) Slim Dow...</td>\n",
       "      <td>spam</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                                                              message  \\\n",
       "emails/spam\\00001.7848dde101aa985090474a91ec93fcf0  <!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Tr...   \n",
       "emails/spam\\00002.d94f1b97e48ed3b553b3508d116e6a09  1) Fight The Risk of Cancer!\\n\\nhttp://www.adc...   \n",
       "emails/spam\\00003.2ee33bc6eacdb11f38d052c44819ba6c  1) Fight The Risk of Cancer!\\n\\nhttp://www.adc...   \n",
       "emails/spam\\00004.eac8de8d759b7e74154f142194282724  ##############################################...   \n",
       "emails/spam\\00005.57696a39d7d84318ce497886896bf90d  I thought you might like these:\\n\\n1) Slim Dow...   \n",
       "\n",
       "                                                   class  \n",
       "emails/spam\\00001.7848dde101aa985090474a91ec93fcf0  spam  \n",
       "emails/spam\\00002.d94f1b97e48ed3b553b3508d116e6a09  spam  \n",
       "emails/spam\\00003.2ee33bc6eacdb11f38d052c44819ba6c  spam  \n",
       "emails/spam\\00004.eac8de8d759b7e74154f142194282724  spam  \n",
       "emails/spam\\00005.57696a39d7d84318ce497886896bf90d  spam  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now we will use a CountVectorizer to split up each message into its list of words, and throw that into a MultinomialNB classifier. Call fit() and we've got a trained spam filter ready to go! It's just that easy."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<style>#sk-container-id-1 {color: black;background-color: white;}#sk-container-id-1 pre{padding: 0;}#sk-container-id-1 div.sk-toggleable {background-color: white;}#sk-container-id-1 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-1 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-1 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-1 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-1 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-1 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-1 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-1 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-1 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-1 div.sk-label:hover label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-1 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-1 div.sk-item {position: relative;z-index: 1;}#sk-container-id-1 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-1 div.sk-item::before, #sk-container-id-1 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-1 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-1 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-1 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-1 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-1 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-1 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-1 div.sk-label-container {text-align: center;}#sk-container-id-1 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-1 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-1\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>MultinomialNB()</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-1\" type=\"checkbox\" checked><label for=\"sk-estimator-id-1\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">MultinomialNB</label><div class=\"sk-toggleable__content\"><pre>MultinomialNB()</pre></div></div></div></div></div>"
      ],
      "text/plain": [
       "MultinomialNB()"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vectorizer = CountVectorizer()\n",
    "counts = vectorizer.fit_transform(data['message'].values)\n",
    "\n",
    "classifier = MultinomialNB()\n",
    "targets = data['class'].values\n",
    "classifier.fit(counts, targets)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's try it out:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['spam', 'ham'], dtype='<U4')"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "examples = ['Free Viagra now!!!', \"Hi Bob, how about a game of golf tomorrow?\"]\n",
    "example_counts = vectorizer.transform(examples)\n",
    "predictions = classifier.predict(example_counts)\n",
    "predictions"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Activity"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Our data set is small, so our spam classifier isn't actually very good. Try running some different test emails through it and see if you get the results you expect.\n",
    "\n",
    "If you really want to challenge yourself, try applying train/test to this spam classifier - see how well it can predict some subset of the ham and spam emails."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}