mirror of https://github.com/kubeflow/examples.git
1823 lines
194 KiB
Plaintext
1823 lines
194 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"id": "IvufQ90W_ILK",
|
||
"tags": []
|
||
},
|
||
"source": [
|
||
"### Basic Intro \n",
|
||
"\n",
|
||
"In this competition, you’re challenged to build a machine learning model that predicts which Tweets are about real disasters and which one’s aren’t.\n",
|
||
"\n",
|
||
""
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"id": "EG9UlaLI_ILQ",
|
||
"tags": []
|
||
},
|
||
"source": [
|
||
"## What's in this kernel?\n",
|
||
"- Basic EDA\n",
|
||
"- Data Cleaning\n",
|
||
"- Baseline Model"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"id": "Z7CAwrRLgliq",
|
||
"tags": []
|
||
},
|
||
"source": [
|
||
"# Unzipping the file"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"id": "OypqduLZ_ILS",
|
||
"tags": []
|
||
},
|
||
"source": [
|
||
"### Importing required Libraries."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 45,
|
||
"metadata": {
|
||
"tags": [
|
||
"block:"
|
||
]
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Defaulting to user installation because normal site-packages is not writeable\n",
|
||
"Requirement already satisfied: seaborn in /home/jovyan/.local/lib/python3.6/site-packages (from -r requirements.txt (line 1)) (0.11.2)\n",
|
||
"Requirement already satisfied: nltk in /home/jovyan/.local/lib/python3.6/site-packages (from -r requirements.txt (line 2)) (3.6.7)\n",
|
||
"Requirement already satisfied: sklearn in /home/jovyan/.local/lib/python3.6/site-packages (from -r requirements.txt (line 3)) (0.0)\n",
|
||
"Requirement already satisfied: collection in /home/jovyan/.local/lib/python3.6/site-packages (from -r requirements.txt (line 4)) (0.1.6)\n",
|
||
"Requirement already satisfied: gensim in /home/jovyan/.local/lib/python3.6/site-packages (from -r requirements.txt (line 5)) (4.1.2)\n",
|
||
"Requirement already satisfied: keras in /home/jovyan/.local/lib/python3.6/site-packages (from -r requirements.txt (line 6)) (2.6.0)\n",
|
||
"Requirement already satisfied: tensorflow in /home/jovyan/.local/lib/python3.6/site-packages (from -r requirements.txt (line 7)) (2.6.2)\n",
|
||
"Requirement already satisfied: pyspellchecker in /home/jovyan/.local/lib/python3.6/site-packages (from -r requirements.txt (line 8)) (0.6.3)\n",
|
||
"Collecting zipfile36\n",
|
||
" Downloading zipfile36-0.1.3-py3-none-any.whl (20 kB)\n",
|
||
"Collecting wget\n",
|
||
" Downloading wget-3.2.zip (10 kB)\n",
|
||
" Preparing metadata (setup.py) ... \u001b[?25ldone\n",
|
||
"\u001b[?25hRequirement already satisfied: pandas>=0.23 in /usr/local/lib/python3.6/dist-packages (from seaborn->-r requirements.txt (line 1)) (1.1.5)\n",
|
||
"Requirement already satisfied: scipy>=1.0 in /usr/local/lib/python3.6/dist-packages (from seaborn->-r requirements.txt (line 1)) (1.5.4)\n",
|
||
"Requirement already satisfied: matplotlib>=2.2 in /usr/local/lib/python3.6/dist-packages (from seaborn->-r requirements.txt (line 1)) (3.3.4)\n",
|
||
"Requirement already satisfied: numpy>=1.15 in /usr/local/lib/python3.6/dist-packages (from seaborn->-r requirements.txt (line 1)) (1.19.5)\n",
|
||
"Requirement already satisfied: joblib in /usr/local/lib/python3.6/dist-packages (from nltk->-r requirements.txt (line 2)) (1.1.0)\n",
|
||
"Requirement already satisfied: click in /usr/local/lib/python3.6/dist-packages (from nltk->-r requirements.txt (line 2)) (7.1.2)\n",
|
||
"Requirement already satisfied: regex>=2021.8.3 in /home/jovyan/.local/lib/python3.6/site-packages (from nltk->-r requirements.txt (line 2)) (2022.3.15)\n",
|
||
"Requirement already satisfied: tqdm in /home/jovyan/.local/lib/python3.6/site-packages (from nltk->-r requirements.txt (line 2)) (4.64.0)\n",
|
||
"Requirement already satisfied: scikit-learn in /usr/local/lib/python3.6/dist-packages (from sklearn->-r requirements.txt (line 3)) (0.23.2)\n",
|
||
"Requirement already satisfied: smart-open>=1.8.1 in /home/jovyan/.local/lib/python3.6/site-packages (from gensim->-r requirements.txt (line 5)) (5.2.1)\n",
|
||
"Requirement already satisfied: dataclasses in /usr/local/lib/python3.6/dist-packages (from gensim->-r requirements.txt (line 5)) (0.8)\n",
|
||
"Requirement already satisfied: opt-einsum~=3.3.0 in /home/jovyan/.local/lib/python3.6/site-packages (from tensorflow->-r requirements.txt (line 7)) (3.3.0)\n",
|
||
"Requirement already satisfied: wrapt~=1.12.1 in /home/jovyan/.local/lib/python3.6/site-packages (from tensorflow->-r requirements.txt (line 7)) (1.12.1)\n",
|
||
"Requirement already satisfied: google-pasta~=0.2 in /home/jovyan/.local/lib/python3.6/site-packages (from tensorflow->-r requirements.txt (line 7)) (0.2.0)\n",
|
||
"Requirement already satisfied: typing-extensions~=3.7.4 in /home/jovyan/.local/lib/python3.6/site-packages (from tensorflow->-r requirements.txt (line 7)) (3.7.4.3)\n",
|
||
"Requirement already satisfied: h5py~=3.1.0 in /home/jovyan/.local/lib/python3.6/site-packages (from tensorflow->-r requirements.txt (line 7)) (3.1.0)\n",
|
||
"Requirement already satisfied: protobuf>=3.9.2 in /usr/local/lib/python3.6/dist-packages (from tensorflow->-r requirements.txt (line 7)) (3.19.3)\n",
|
||
"Requirement already satisfied: six~=1.15.0 in /home/jovyan/.local/lib/python3.6/site-packages (from tensorflow->-r requirements.txt (line 7)) (1.15.0)\n",
|
||
"Requirement already satisfied: wheel~=0.35 in /home/jovyan/.local/lib/python3.6/site-packages (from tensorflow->-r requirements.txt (line 7)) (0.37.1)\n",
|
||
"Requirement already satisfied: clang~=5.0 in /home/jovyan/.local/lib/python3.6/site-packages (from tensorflow->-r requirements.txt (line 7)) (5.0)\n",
|
||
"Requirement already satisfied: grpcio<2.0,>=1.37.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow->-r requirements.txt (line 7)) (1.43.0)\n",
|
||
"Requirement already satisfied: gast==0.4.0 in /home/jovyan/.local/lib/python3.6/site-packages (from tensorflow->-r requirements.txt (line 7)) (0.4.0)\n",
|
||
"Requirement already satisfied: flatbuffers~=1.12.0 in /home/jovyan/.local/lib/python3.6/site-packages (from tensorflow->-r requirements.txt (line 7)) (1.12)\n",
|
||
"Requirement already satisfied: termcolor~=1.1.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow->-r requirements.txt (line 7)) (1.1.0)\n",
|
||
"Requirement already satisfied: keras-preprocessing~=1.1.2 in /home/jovyan/.local/lib/python3.6/site-packages (from tensorflow->-r requirements.txt (line 7)) (1.1.2)\n",
|
||
"Requirement already satisfied: astunparse~=1.6.3 in /home/jovyan/.local/lib/python3.6/site-packages (from tensorflow->-r requirements.txt (line 7)) (1.6.3)\n",
|
||
"Requirement already satisfied: tensorflow-estimator<2.7,>=2.6.0 in /home/jovyan/.local/lib/python3.6/site-packages (from tensorflow->-r requirements.txt (line 7)) (2.6.0)\n",
|
||
"Requirement already satisfied: tensorboard<2.7,>=2.6.0 in /home/jovyan/.local/lib/python3.6/site-packages (from tensorflow->-r requirements.txt (line 7)) (2.6.0)\n",
|
||
"Requirement already satisfied: absl-py~=0.10 in /usr/local/lib/python3.6/dist-packages (from tensorflow->-r requirements.txt (line 7)) (0.11.0)\n",
|
||
"Requirement already satisfied: cached-property in /home/jovyan/.local/lib/python3.6/site-packages (from h5py~=3.1.0->tensorflow->-r requirements.txt (line 7)) (1.5.2)\n",
|
||
"Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib>=2.2->seaborn->-r requirements.txt (line 1)) (2.8.2)\n",
|
||
"Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib>=2.2->seaborn->-r requirements.txt (line 1)) (1.3.1)\n",
|
||
"Requirement already satisfied: pillow>=6.2.0 in /usr/local/lib/python3.6/dist-packages (from matplotlib>=2.2->seaborn->-r requirements.txt (line 1)) (8.4.0)\n",
|
||
"Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.3 in /usr/local/lib/python3.6/dist-packages (from matplotlib>=2.2->seaborn->-r requirements.txt (line 1)) (3.0.6)\n",
|
||
"Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.6/dist-packages (from matplotlib>=2.2->seaborn->-r requirements.txt (line 1)) (0.11.0)\n",
|
||
"Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.6/dist-packages (from pandas>=0.23->seaborn->-r requirements.txt (line 1)) (2021.3)\n",
|
||
"Requirement already satisfied: requests<3,>=2.21.0 in /usr/local/lib/python3.6/dist-packages (from tensorboard<2.7,>=2.6.0->tensorflow->-r requirements.txt (line 7)) (2.27.1)\n",
|
||
"Requirement already satisfied: werkzeug>=0.11.15 in /home/jovyan/.local/lib/python3.6/site-packages (from tensorboard<2.7,>=2.6.0->tensorflow->-r requirements.txt (line 7)) (2.0.3)\n",
|
||
"Requirement already satisfied: google-auth-oauthlib<0.5,>=0.4.1 in /home/jovyan/.local/lib/python3.6/site-packages (from tensorboard<2.7,>=2.6.0->tensorflow->-r requirements.txt (line 7)) (0.4.6)\n",
|
||
"Requirement already satisfied: tensorboard-data-server<0.7.0,>=0.6.0 in /home/jovyan/.local/lib/python3.6/site-packages (from tensorboard<2.7,>=2.6.0->tensorflow->-r requirements.txt (line 7)) (0.6.1)\n",
|
||
"Requirement already satisfied: google-auth<2,>=1.6.3 in /usr/local/lib/python3.6/dist-packages (from tensorboard<2.7,>=2.6.0->tensorflow->-r requirements.txt (line 7)) (1.35.0)\n",
|
||
"Requirement already satisfied: setuptools>=41.0.0 in /usr/local/lib/python3.6/dist-packages (from tensorboard<2.7,>=2.6.0->tensorflow->-r requirements.txt (line 7)) (59.6.0)\n",
|
||
"Requirement already satisfied: tensorboard-plugin-wit>=1.6.0 in /home/jovyan/.local/lib/python3.6/site-packages (from tensorboard<2.7,>=2.6.0->tensorflow->-r requirements.txt (line 7)) (1.8.1)\n",
|
||
"Requirement already satisfied: markdown>=2.6.8 in /home/jovyan/.local/lib/python3.6/site-packages (from tensorboard<2.7,>=2.6.0->tensorflow->-r requirements.txt (line 7)) (3.3.6)\n",
|
||
"Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.6/dist-packages (from scikit-learn->sklearn->-r requirements.txt (line 3)) (3.0.0)\n",
|
||
"Requirement already satisfied: importlib-resources in /usr/local/lib/python3.6/dist-packages (from tqdm->nltk->-r requirements.txt (line 2)) (5.4.0)\n",
|
||
"Requirement already satisfied: rsa<5,>=3.1.4 in /usr/local/lib/python3.6/dist-packages (from google-auth<2,>=1.6.3->tensorboard<2.7,>=2.6.0->tensorflow->-r requirements.txt (line 7)) (4.8)\n",
|
||
"Requirement already satisfied: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.6/dist-packages (from google-auth<2,>=1.6.3->tensorboard<2.7,>=2.6.0->tensorflow->-r requirements.txt (line 7)) (0.2.8)\n",
|
||
"Requirement already satisfied: cachetools<5.0,>=2.0.0 in /usr/local/lib/python3.6/dist-packages (from google-auth<2,>=1.6.3->tensorboard<2.7,>=2.6.0->tensorflow->-r requirements.txt (line 7)) (4.2.4)\n",
|
||
"Requirement already satisfied: requests-oauthlib>=0.7.0 in /usr/local/lib/python3.6/dist-packages (from google-auth-oauthlib<0.5,>=0.4.1->tensorboard<2.7,>=2.6.0->tensorflow->-r requirements.txt (line 7)) (1.3.0)\n",
|
||
"Requirement already satisfied: importlib-metadata>=4.4 in /usr/local/lib/python3.6/dist-packages (from markdown>=2.6.8->tensorboard<2.7,>=2.6.0->tensorflow->-r requirements.txt (line 7)) (4.8.3)\n",
|
||
"Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests<3,>=2.21.0->tensorboard<2.7,>=2.6.0->tensorflow->-r requirements.txt (line 7)) (1.26.8)\n",
|
||
"Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.6/dist-packages (from requests<3,>=2.21.0->tensorboard<2.7,>=2.6.0->tensorflow->-r requirements.txt (line 7)) (2.0.10)\n",
|
||
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests<3,>=2.21.0->tensorboard<2.7,>=2.6.0->tensorflow->-r requirements.txt (line 7)) (2021.10.8)\n",
|
||
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests<3,>=2.21.0->tensorboard<2.7,>=2.6.0->tensorflow->-r requirements.txt (line 7)) (3.3)\n",
|
||
"Requirement already satisfied: zipp>=3.1.0 in /usr/local/lib/python3.6/dist-packages (from importlib-resources->tqdm->nltk->-r requirements.txt (line 2)) (3.6.0)\n",
|
||
"Requirement already satisfied: pyasn1<0.5.0,>=0.4.6 in /usr/local/lib/python3.6/dist-packages (from pyasn1-modules>=0.2.1->google-auth<2,>=1.6.3->tensorboard<2.7,>=2.6.0->tensorflow->-r requirements.txt (line 7)) (0.4.8)\n",
|
||
"Requirement already satisfied: oauthlib>=3.0.0 in /usr/local/lib/python3.6/dist-packages (from requests-oauthlib>=0.7.0->google-auth-oauthlib<0.5,>=0.4.1->tensorboard<2.7,>=2.6.0->tensorflow->-r requirements.txt (line 7)) (3.1.1)\n",
|
||
"Building wheels for collected packages: wget\n",
|
||
" Building wheel for wget (setup.py) ... \u001b[?25ldone\n",
|
||
"\u001b[?25h Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9675 sha256=7494539bc3322689677bdc668e46be6e09fc409ca4c7942e315d8d83afb78000\n",
|
||
" Stored in directory: /home/jovyan/.cache/pip/wheels/90/1d/93/c863ee832230df5cfc25ca497b3e88e0ee3ea9e44adc46ac62\n",
|
||
"Successfully built wget\n",
|
||
"Installing collected packages: zipfile36, wget\n",
|
||
"Successfully installed wget-3.2 zipfile36-0.1.3\n",
|
||
"Note: you may need to restart the kernel to use updated packages.\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"pip install -r requirements.txt"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"tags": []
|
||
},
|
||
"source": [
|
||
"# Importing Libraries"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"metadata": {
|
||
"colab": {
|
||
"base_uri": "https://localhost:8080/"
|
||
},
|
||
"id": "BfxrbE88_ILS",
|
||
"outputId": "b71ee66f-a5d2-46ad-924a-d4f1ea986aa5",
|
||
"tags": [
|
||
"imports"
|
||
]
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...\n",
|
||
"[nltk_data] Package stopwords is already up-to-date!\n",
|
||
"[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...\n",
|
||
"[nltk_data] Package punkt is already up-to-date!\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import re\n",
|
||
"import nltk\n",
|
||
"import gensim\n",
|
||
"import string\n",
|
||
"import numpy as np\n",
|
||
"import pandas as pd\n",
|
||
"import seaborn as sns\n",
|
||
"import matplotlib.pyplot as plt\n",
|
||
"\n",
|
||
"from tqdm import tqdm\n",
|
||
"\n",
|
||
"from nltk.tokenize import word_tokenize\n",
|
||
"\n",
|
||
"from nltk.corpus import stopwords\n",
|
||
"from nltk.util import ngrams\n",
|
||
"from sklearn.model_selection import train_test_split\n",
|
||
"from sklearn.feature_extraction.text import CountVectorizer\n",
|
||
"from collections import defaultdict\n",
|
||
"from collections import Counter\n",
|
||
"\n",
|
||
"from tensorflow.keras.preprocessing.text import Tokenizer\n",
|
||
"from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
|
||
"from tensorflow.keras.models import Sequential\n",
|
||
"from tensorflow.keras.layers import Embedding,LSTM,Dense,SpatialDropout1D\n",
|
||
"from tensorflow.keras.initializers import Constant\n",
|
||
"from tensorflow.keras.optimizers import Adam\n",
|
||
"\n",
|
||
"nltk.download('stopwords')\n",
|
||
"nltk.download('punkt')\n",
|
||
"stop=set(stopwords.words('english'))\n",
|
||
"plt.style.use('ggplot')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"metadata": {
|
||
"id": "zVmloKw7_ILV",
|
||
"tags": []
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"import os\n",
|
||
"#os.listdir('../input/glove-global-vectors-for-word-representation/glove.6B.100d.txt')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"id": "2Ut5Ko9G_ILW",
|
||
"tags": []
|
||
},
|
||
"source": [
|
||
"## Load data"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"metadata": {
|
||
"colab": {
|
||
"base_uri": "https://localhost:8080/",
|
||
"height": 143
|
||
},
|
||
"id": "I8aHh7_l_ILW",
|
||
"outputId": "0f90d9ba-0307-4917-d128-cbb3b49b77fa",
|
||
"tags": [
|
||
"block:load_data"
|
||
]
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>id</th>\n",
|
||
" <th>keyword</th>\n",
|
||
" <th>location</th>\n",
|
||
" <th>text</th>\n",
|
||
" <th>target</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>Our Deeds are the Reason of this #earthquake M...</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>4</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>Forest fire near La Ronge Sask. Canada</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>5</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>All residents asked to 'shelter in place' are ...</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" id keyword location text \\\n",
|
||
"0 1 NaN NaN Our Deeds are the Reason of this #earthquake M... \n",
|
||
"1 4 NaN NaN Forest fire near La Ronge Sask. Canada \n",
|
||
"2 5 NaN NaN All residents asked to 'shelter in place' are ... \n",
|
||
"\n",
|
||
" target \n",
|
||
"0 1 \n",
|
||
"1 1 \n",
|
||
"2 1 "
|
||
]
|
||
},
|
||
"execution_count": 4,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"tweet= pd.read_csv('./data/train.csv')\n",
|
||
"test=pd.read_csv('./data/test.csv')\n",
|
||
"tweet.head(3)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"metadata": {
|
||
"colab": {
|
||
"base_uri": "https://localhost:8080/"
|
||
},
|
||
"id": "WX64jB83_ILX",
|
||
"outputId": "314e9aad-8fb4-4c60-cb5d-3f91027a024c",
|
||
"tags": []
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"There are 7613 rows and 5 columns in train\n",
|
||
"There are 3263 rows and 4 columns in train\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"print('There are {} rows and {} columns in train'.format(tweet.shape[0],tweet.shape[1]))\n",
|
||
"print('There are {} rows and {} columns in train'.format(test.shape[0],test.shape[1]))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"id": "ebUgRyjU_ILY",
|
||
"tags": []
|
||
},
|
||
"source": [
|
||
"## Class distribution"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"id": "KrbMbAtc_ILZ",
|
||
"tags": []
|
||
},
|
||
"source": [
|
||
"Before we begin with anything else,let's check the class distribution.There are only two classes 0 and 1."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"metadata": {
|
||
"colab": {
|
||
"base_uri": "https://localhost:8080/",
|
||
"height": 337
|
||
},
|
||
"id": "7nfbgY5l_ILa",
|
||
"outputId": "e06bfc19-14a8-4c5d-af3b-6434a97af380",
|
||
"tags": [
|
||
"block:",
|
||
"prev:load_data"
|
||
]
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"/home/jovyan/.local/lib/python3.6/site-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.\n",
|
||
" FutureWarning\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"Text(0, 0.5, 'samples')"
|
||
]
|
||
},
|
||
"execution_count": 6,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
},
|
||
{
|
||
"data": {
|
||
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAY4AAAD4CAYAAAD7CAEUAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAAAULklEQVR4nO3df0zU9x3H8dcdaPlxFjgQEXSJTM0CE2mkLZq1MnrLmtJkRBvNOpfS1XSGDIPNXMEma/dHDZ2zEEFrN4na1Mx1pvqXydyFgGmpC4wfizJrnW7WiEHuixRQh3fc/qBedNr2PsL9QJ6P/+57fO/el3yTJ5/73n3P5vf7/QIAIEj2SA8AAJhaCAcAwAjhAAAYIRwAACOEAwBghHAAAIzERnqAcLh06VKkRwCAKSUzM/Mr72PFAQAwQjgAAEYIBwDACOEAABghHAAAI4QDAGCEcAAAjBAOAIARwgEAMDItvjk+Ub2b10d6BEShudv2RHoEICJYcQAAjBAOAIARwgEAMEI4AABGCAcAwAjhAAAYIRwAACOEAwBghHAAAIwQDgCAEcIBADBCOAAARsJ6kcOxsTFVVVXJ6XSqqqpKfX19qqur09DQkLKzs1VRUaHY2FjdvHlTDQ0NOnfunGbNmqXKykqlp6dLkg4fPqympibZ7Xa9+OKLys/PD+dLAIBpL6wrjqNHjyorKytw+/3331dJSYnq6+uVmJiopqYmSVJTU5MSExNVX1+vkpISHThwQJJ08eJFtba26u2339Zrr72mxsZGjY2NhfMlAMC0F7ZweDwedXR06KmnnpIk+f1+nTp1SoWFhZKkoqIitbW1SZLa29tVVFQkSSosLNTJkyfl9/vV1tamFStWaMaMGUpPT1dGRobOnj0brpcAAFAY36rat2+f1q1bp+vXr0uShoaGlJCQoJiYGEmS0+mUZVmSJMuylJqaKkmKiYlRQkKChoaGZFmWFi1aFHjM2/e5ndvtltvtliTV1NQoLS1tQrP3TmhvPKgmelwBU1VYwvH3v/9dSUlJys7O1qlTp0L+fC6XSy6XK3C7v78/5M+J6YfjCg+yzMzMr7wvLOH49NNP1d7ers7OTo2Ojur69evat2+frl27Jp/Pp5iYGFmWJafTKWl8JeHxeJSamiqfz6dr165p1qxZge233L4PACA8wnKO4/nnn9fu3bu1c+dOVVZW6rvf/a42btyo3NxcnThxQpLU3NysgoICSdKyZcvU3NwsSTpx4oRyc3Nls9lUUFCg1tZW3bx5U319fert7dXChQvD8RIAAF+K6G+O/+QnP1FdXZ0OHjyoBQsWqLi4WJJUXFyshoYGVVRUyOFwqLKyUpI0f/58LV++XK+88orsdrteeukl2e18FQUAwsnm9/v9kR4i1C5dujSh/Xs3r5+kSfAgmbttT6RHAELm685x8O86AMAI4QAAGCEcAAAjhAMAYIRwAACMEA4AgBHCAQAwQjgAAEYIBwDACOEAABghHAAAI4QDAGCEcAAAjBAOAIARwgEAMEI4AABGCAcAwAjhAAAYIRwAACOEAwBghHAAAIwQDgCAEcIBADBCOAAARggHAMAI4QAAGCEcAAAjhAMAYIRwAACMxEZ6AAD3r2z/J5EeAVFo3wvLQ/r4rDgAAEYIBwDACOEAABghHAAAI4QDAGCEcAAAjBAOAIARwgEAMEI4AABGwvLN8dHRUb3++uvyer3y+XwqLCzUmjVr1NfXp7q6Og0NDSk7O1sVFRWKjY3VzZs31dDQoHPnzmnWrFmqrKxUenq6JOnw4cNqamqS3W7Xiy++qPz8/HC8BADAl8Ky4pgxY4Zef/11bdu2Tb/97W/V1dWlM2fO6P3331dJSYnq6+uVmJiopqYmSVJTU5MSExNVX1+vkpISHThwQJJ08eJFtba26u2339Zrr72mxsZGjY2NheMlAAC+FJZw2Gw2xcXFSZJ8Pp98Pp9sNptOnTqlwsJCSVJRUZHa2tokSe
3t7SoqKpIkFRYW6uTJk/L7/Wpra9OKFSs0Y8YMpaenKyMjQ2fPng3HSwAAfClsFzkcGxvTq6++qsuXL+uHP/yh5syZo4SEBMXExEiSnE6nLMuSJFmWpdTUVElSTEyMEhISNDQ0JMuytGjRosBj3r7P7dxut9xutySppqZGaWlpE5q9d0J740E10eMKCJVQH5thC4fdbte2bds0MjKi3/3ud7p06VLInsvlcsnlcgVu9/f3h+y5MH1xXCFaTcaxmZmZ+ZX3hf1TVYmJicrNzdWZM2d07do1+Xw+SeOrDKfTKWl8JeHxeCSNv7V17do1zZo1647t/78PACA8whKOL774QiMjI5LGP2H1j3/8Q1lZWcrNzdWJEyckSc3NzSooKJAkLVu2TM3NzZKkEydOKDc3VzabTQUFBWptbdXNmzfV19en3t5eLVy4MBwvAQDwpbC8VTUwMKCdO3dqbGxMfr9fy5cv17JlyzRv3jzV1dXp4MGDWrBggYqLiyVJxcXFamhoUEVFhRwOhyorKyVJ8+fP1/Lly/XKK6/IbrfrpZdekt3OV1EAIJxsfr/fH+khQm2i51N6N6+fpEnwIJm7bU+kR+AXAHFPk/ELgFF1jgMAMLURDgCAEcIBADBCOAAARggHAMAI4QAAGAn6exwnT55Uenq60tPTNTAwoAMHDshut+v5559XcnJyCEcEAESToFccjY2NgS/bvffee4Er3L777rshGw4AEH2CXnFYlqW0tDT5fD51d3dr165dio2N1c9//vNQzgcAiDJBhyM+Pl5Xr17V559/rnnz5ikuLk5er1derzeU8wEAokzQ4Xj66adVXV0tr9ersrIySdLp06eVlZUVqtkAAFEo6HCUlpbqsccek91uV0ZGhqTxy59v2LAhZMMBAKKP0cdxb32iqrW1VdJ4ONLT00MyGAAgOgW94rhw4YLeeustzZgxQx6PRytWrFBPT49aWlq0adOmUM4IAIgiQa84/vCHP2jt2rWqq6tTbOx4b3JycnT69OmQDQcAiD5Bh+PixYt64okn7tgWFxen0dHRSR8KABC9gg7H7Nmzde7cuTu2nT17NnCiHAAwPQR9jmPt2rWqqanRD37wA3m9Xh0+fFh//etf+QIgAEwzQa84li1bpi1btuiLL75QTk6Orly5ol/+8pdaunRpKOcDAESZoFcckrRgwQKtX8/vbwPAdPa14fjTn/4U1IOsXbt2UoYBAES/rw2Hx+MJ1xwAgCnia8NRXl4erjkAAFOE0TmO3t5effLJJ7IsS06nU8uXL9fcuXNDNRsAIAoF/amqjz76SL/61a/0n//8R3Fxcbpw4YJeffVVffTRR6GcDwAQZYJecRw8eFDV1dXKyckJbPvnP/+phoYGfe973wvJcACA6BP0iuP69etavHjxHdsWLVqkGzduTPpQAIDoFXQ4nn32Wf3xj38MXJtqdHRUBw8e1LPPPhuy4QAA0Sfot6qOHTumq1ev6ujRo3I4HBoeHpYkJScn69ixY4G/e+eddyZ/SgBA1Ag6HBUVFaGcAwAwRQQdjttPigMApq+gw+Hz+fTxxx/r/Pnzd50Q5wq5ADB9BB2O+vp6XbhwQfn5+UpKSgrlTACAKBZ0OLq6uvTOO+8oPj4+lPMAAKJc0B/HnT9/fuCTVACA6SvoFccvfvEL7d69W0uXLr3rraqVK1dO+mAAgOgUdDiam5t1+vRpjYyMaObMmYHtNpuNcADANBJ0OI4ePaq33npL8+bNC+U8AIAoF3Q4kpOTlZaWdl9P0t/fr507d+rq1auy2WxyuVx65plnNDw8rNraWl25ckWzZ8/Wpk2b5HA45Pf7tXfvXnV2duqhhx5SeXm5srOzJY2vfD788ENJ0qpVq1RUVHRfMwEA7k/Q4SgpKVF9fb1+9KMf3XWOY86cOV+7b0xMjH76058qOztb169fV1VVlfLy8tTc3KwlS5aotLRUR44c0ZEjR7Ru3Tp1dnbq8uXL2rFjhz777DPt2b
NHW7du1fDwsA4dOqSamhpJUlVVlQoKCuRwOO7jpQMA7kfQ4WhsbJQktbe333XfN/02eUpKilJSUiRJ8fHxysrKkmVZamtr0xtvvCFp/AT7G2+8oXXr1qm9vV1PPvmkbDabFi9erJGREQ0MDOjUqVPKy8sLhCIvL09dXV1c1h0AwijocHxTHILV19en8+fPa+HChRocHAwEJTk5WYODg5Iky7LueFssNTVVlmXJsiylpqYGtjudTlmWdddzuN1uud1uSVJNTc19v8V2S++E9saDaqLHFRAqoT42jX46dqJu3Lih7du3q6ysTAkJCXfcZ7PZZLPZJuV5XC6XXC5X4HZ/f/+kPC5wO44rRKvJODYzMzO/8j6ja1X95S9/UU9Pj4aGhu647ze/+c037u/1erV9+3Y98cQTevzxxyVJSUlJGhgYUEpKigYGBvTwww9LGl9J3P7CPR6PnE6nnE6nenp6Atsty+LiiwAQZkF/c3z//v1yu93KycnRuXPn9Pjjj2twcFC5ubnfuK/f79fu3buVlZV1xw8/FRQUqKWlRZLU0tKiRx99NLD9+PHj8vv9OnPmjBISEpSSkqL8/Hx1d3dreHhYw8PD6u7uVn5+vuFLBgBMRNArjr/97W968803lZaWpg8++EDPPPOMli5dqt///vffuO+nn36q48eP61vf+pY2b94sSfrxj3+s0tJS1dbWqqmpKfBxXEl65JFH1NHRoY0bN2rmzJkqLy+XJDkcDq1evVrV1dWSpOeee45PVAFAmAUdjtHR0cCJ6ZkzZ+q///2vsrKy9O9///sb9/3Od76jDz744J73/frXv75rm81m0/r16+/598XFxSouLg52bADAJAs6HFlZWfrXv/6lhQsXKjs7W3/+858VHx8vp9MZyvkAAFEm6HMcZWVliomJkSS98MILOn/+vDo6OvTyyy+HbDgAQPQJesVx48YNpaenS5Li4uKUkpIiu92uuXPnhmw4AED0CXrF0djYKLt9/M/fe+89+Xw+2Ww2vfvuuyEbDgAQfYJecdz6NrfP51N3d7d27dql2NhYfm8cAKaZoMMRHx+vq1ev6vPPP9e8efMUFxcnr9crr9cbyvkAAFEm6HA8/fTTqq6ultfrVVlZmSTp9OnTysrKCtVsAIAoFHQ4SktL9dhjj8lutysjI0PS+KVBNmzYELLhAADRx+gih/9/0auvuwgWAODBFPSnqgAAkAgHAMAQ4QAAGCEcAAAjhAMAYIRwAACMEA4AgBHCAQAwQjgAAEYIBwDACOEAABghHAAAI4QDAGCEcAAAjBAOAIARwgEAMEI4AABGCAcAwAjhAAAYIRwAACOEAwBghHAAAIwQDgCAEcIBADBCOAAARggHAMAI4QAAGCEcAAAjhAMAYIRwAACMxIbjSXbt2qWOjg4lJSVp+/btkqTh4WHV1tbqypUrmj17tjZt2iSHwyG/36+9e/eqs7NTDz30kMrLy5WdnS1Jam5u1ocffihJWrVqlYqKisIxPgDgNmFZcRQVFWnLli13bDty5IiWLFmiHTt2aMmSJTpy5IgkqbOzU5cvX9aOHTv08ssva8+ePZLGQ3Po0CFt3bpVW7du1aFDhzQ8PByO8QEAtwlLOHJycuRwOO7Y1tbWppUrV0qSVq5cqba2NklSe3u7nnzySdlsNi1evFgjIyMaGBhQV1eX8vLy5HA45HA4lJeXp66urnCMDwC4TVjeqrqXwcFBpaSkSJKSk5M1ODgoSbIsS2lpaYG/S01NlWVZsixLqampge1Op1OWZd3zsd1ut9xutySppqbmjse7H70T2hsPqokeV0CohPrYjFg4bmez2WSz2Sbt8Vwul1wuV+B2f3//pD02cAvHFaLVZBybmZmZX3lfxD5VlZSUpIGBAUnSwMCAHn74YUnjK4nbX7TH45HT6ZTT6ZTH4wlstyxLTqczvEMDACIXjoKCArW0tEiSWlpa9Oijjwa2Hz9+XH6/X2fOnFFCQoJSUlKUn5+v7u
5uDQ8Pa3h4WN3d3crPz4/U+AAwbYXlraq6ujr19PRoaGhIGzZs0Jo1a1RaWqra2lo1NTUFPo4rSY888og6Ojq0ceNGzZw5U+Xl5ZIkh8Oh1atXq7q6WpL03HPP3XXCHQAQeja/3++P9BChdunSpQnt37t5/SRNggfJ3G17Ij2CyvZ/EukREIX2vbB8wo8Rlec4AABTE+EAABghHAAAI4QDAGCEcAAAjBAOAIARwgEAMEI4AABGCAcAwAjhAAAYIRwAACOEAwBghHAAAIwQDgCAEcIBADBCOAAARggHAMAI4QAAGCEcAAAjhAMAYIRwAACMEA4AgBHCAQAwQjgAAEYIBwDACOEAABghHAAAI4QDAGCEcAAAjBAOAIARwgEAMEI4AABGCAcAwAjhAAAYIRwAACOEAwBghHAAAIwQDgCAEcIBADASG+kB7kdXV5f27t2rsbExPfXUUyotLY30SAAwbUy5FcfY2JgaGxu1ZcsW1dbW6uOPP9bFixcjPRYATBtTLhxnz55VRkaG5syZo9jYWK1YsUJtbW2RHgsApo0p91aVZVlKTU0N3E5NTdVnn312x9+43W653W5JUk1NjTIzMyf0nJkHjk5ofyBUjlWvjvQImIam3IojGC6XSzU1NaqpqYn0KA+cqqqqSI8A3BPHZvhMuXA4nU55PJ7AbY/HI6fTGcGJAGB6mXLh+Pa3v63e3l719fXJ6/WqtbVVBQUFkR4LAKaNKXeOIyYmRj/72c/05ptvamxsTN///vc1f/78SI81bbhcrkiPANwTx2b42Px+vz/SQwAApo4p91YVACCyCAcAwMiUO8eByOFSL4hGu3btUkdHh5KSkrR9+/ZIjzMtsOJAULjUC6JVUVGRtmzZEukxphXCgaBwqRdEq5ycHDkcjkiPMa0QDgTlXpd6sSwrghMBiBTCAQAwQjgQFC71AuAWwoGgcKkXALfwzXEEraOjQ/v37w9c6mXVqlWRHglQXV2denp6NDQ0pKSkJK1Zs0bFxcWRHuuBRjgAAEZ4qwoAYIRwAACMEA4AgBHCAQAwQjgAAEYIBwDACOEAABj5H8vwodaQVQ2uAAAAAElFTkSuQmCC\n",
|
||
"text/plain": [
|
||
"<Figure size 432x288 with 1 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"x=tweet.target.value_counts()\n",
|
||
"sns.barplot(x.index,x)\n",
|
||
"plt.gca().set_ylabel('samples')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"id": "P-nrzD3F_ILa",
|
||
"tags": []
|
||
},
|
||
"source": [
|
||
"ohh,as expected ! There is a class distribution.There are more tweets with class 0 ( No disaster) than class 1 ( disaster tweets)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"id": "b_ZgxS9Y_ILb",
|
||
"tags": []
|
||
},
|
||
"source": [
|
||
"## Exploratory Data Analysis of tweets"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"id": "bwdYFByW_ILb",
|
||
"tags": []
|
||
},
|
||
"source": [
|
||
"First,we will do very basic analysis,that is character level,word level and sentence level analysis."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"id": "KDzMFkEa_ILb",
|
||
"tags": []
|
||
},
|
||
"source": [
|
||
"### Number of characters in tweets"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 7,
|
||
"metadata": {
|
||
"colab": {
|
||
"base_uri": "https://localhost:8080/",
|
||
"height": 356
|
||
},
|
||
"id": "GPRFNeTn_ILc",
|
||
"outputId": "40b4b4a3-2cb7-46df-8547-ccf2b6f5cd0e",
|
||
"tags": [
|
||
"block:eda_data",
|
||
"prev:load_data"
|
||
]
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"image/png": "\n",
|
||
"text/plain": [
|
||
"<Figure size 720x360 with 2 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"fig,(ax1,ax2)=plt.subplots(1,2,figsize=(10,5))\n",
|
||
"tweet_len=tweet[tweet['target']==1]['text'].str.len()\n",
|
||
"ax1.hist(tweet_len,color='red')\n",
|
||
"ax1.set_title('disaster tweets')\n",
|
||
"tweet_len=tweet[tweet['target']==0]['text'].str.len()\n",
|
||
"ax2.hist(tweet_len,color='green')\n",
|
||
"ax2.set_title('Not disaster tweets')\n",
|
||
"fig.suptitle('Characters in tweets')\n",
|
||
"plt.show()\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"id": "kMl21V0z_ILc",
|
||
"tags": []
|
||
},
|
||
"source": [
|
||
"The distribution of both seems to be almost same.120 t0 140 characters in a tweet are the most common among both."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"id": "6TeqztFR_ILd",
|
||
"tags": []
|
||
},
|
||
"source": [
|
||
"### Number of words in a tweet"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 8,
|
||
"metadata": {
|
||
"colab": {
|
||
"base_uri": "https://localhost:8080/",
|
||
"height": 356
|
||
},
|
||
"id": "v7SANwhp_ILd",
|
||
"outputId": "70e79903-970e-4735-8246-00acb2753453",
|
||
"tags": []
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"image/png": "\n",
|
||
"text/plain": [
|
||
"<Figure size 720x360 with 2 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"fig,(ax1,ax2)=plt.subplots(1,2,figsize=(10,5))\n",
|
||
"tweet_len=tweet[tweet['target']==1]['text'].str.split().map(lambda x: len(x))\n",
|
||
"ax1.hist(tweet_len,color='red')\n",
|
||
"ax1.set_title('disaster tweets')\n",
|
||
"tweet_len=tweet[tweet['target']==0]['text'].str.split().map(lambda x: len(x))\n",
|
||
"ax2.hist(tweet_len,color='green')\n",
|
||
"ax2.set_title('Not disaster tweets')\n",
|
||
"fig.suptitle('Words in a tweet')\n",
|
||
"plt.show()\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"id": "pEgUshSi_ILe",
|
||
"tags": []
|
||
},
|
||
"source": [
|
||
"### Average word length in a tweet"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 9,
|
||
"metadata": {
|
||
"colab": {
|
||
"base_uri": "https://localhost:8080/",
|
||
"height": 480
|
||
},
|
||
"id": "CVXe5YTx_ILe",
|
||
"outputId": "7feaae49-1c61-4038-fc3f-7cd24a3c3bf7",
|
||
"tags": []
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"/home/jovyan/.local/lib/python3.6/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).\n",
|
||
" warnings.warn(msg, FutureWarning)\n",
|
||
"/home/jovyan/.local/lib/python3.6/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).\n",
|
||
" warnings.warn(msg, FutureWarning)\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"Text(0.5, 0.98, 'Average word length in each tweet')"
|
||
]
|
||
},
|
||
"execution_count": 9,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
},
|
||
{
|
||
"data": {
|
||
"image/png": "\n",
|
||
"text/plain": [
|
||
"<Figure size 720x360 with 2 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"fig,(ax1,ax2)=plt.subplots(1,2,figsize=(10,5))\n",
|
||
"word=tweet[tweet['target']==1]['text'].str.split().apply(lambda x : [len(i) for i in x])\n",
|
||
"sns.distplot(word.map(lambda x: np.mean(x)),ax=ax1,color='red')\n",
|
||
"ax1.set_title('disaster')\n",
|
||
"word=tweet[tweet['target']==0]['text'].str.split().apply(lambda x : [len(i) for i in x])\n",
|
||
"sns.distplot(word.map(lambda x: np.mean(x)),ax=ax2,color='green')\n",
|
||
"ax2.set_title('Not disaster')\n",
|
||
"fig.suptitle('Average word length in each tweet')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 10,
|
||
"metadata": {
|
||
"id": "6NRUZcUo_ILf",
|
||
"tags": []
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"def create_corpus(target):\n",
|
||
" corpus=[]\n",
|
||
" \n",
|
||
" for x in tweet[tweet['target']==target]['text'].str.split():\n",
|
||
" for i in x:\n",
|
||
" corpus.append(i)\n",
|
||
" return corpus"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"id": "nwxtu1Sm_ILf",
|
||
"tags": []
|
||
},
|
||
"source": [
|
||
"### Common stopwords in tweets"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"id": "GxyIXyPe_ILf",
|
||
"tags": []
|
||
},
|
||
"source": [
|
||
"First we will analyze tweets with class 0."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 11,
|
||
"metadata": {
|
||
"id": "wD2Z8SC__ILg",
|
||
"tags": []
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"corpus=create_corpus(0)\n",
|
||
"\n",
|
||
"dic=defaultdict(int)\n",
|
||
"for word in corpus:\n",
|
||
" if word in stop:\n",
|
||
" dic[word]+=1\n",
|
||
" \n",
|
||
"top=sorted(dic.items(), key=lambda x:x[1],reverse=True)[:10] \n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 12,
|
||
"metadata": {
|
||
"colab": {
|
||
"base_uri": "https://localhost:8080/",
|
||
"height": 286
|
||
},
|
||
"id": "1T2dSQXK_ILg",
|
||
"outputId": "1712a0cc-34c5-43af-881c-403b8929014d",
|
||
"tags": []
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"<BarContainer object of 10 artists>"
|
||
]
|
||
},
|
||
"execution_count": 12,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
},
|
||
{
|
||
"data": {
|
||
"image/png": "\n",
|
||
"text/plain": [
|
||
"<Figure size 432x288 with 1 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"x,y=zip(*top)\n",
|
||
"plt.bar(x,y)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"id": "Dr55rAMk_ILg",
|
||
"tags": []
|
||
},
|
||
"source": [
|
||
"Now,we will analyze tweets with class 1."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 13,
|
||
"metadata": {
|
||
"colab": {
|
||
"base_uri": "https://localhost:8080/",
|
||
"height": 282
|
||
},
|
||
"id": "Hl4hcagJ_ILh",
|
||
"outputId": "ec6b8a74-0432-4d09-bdd1-1cbba1d90442",
|
||
"tags": []
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"<BarContainer object of 10 artists>"
|
||
]
|
||
},
|
||
"execution_count": 13,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
},
|
||
{
|
||
"data": {
|
||
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAD4CAYAAAAAczaOAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAAAWjElEQVR4nO3da2xT5x3H8Z+xl7S5xw6whcvA5VZYGIyERq1G0uBXwCq0bkxrYeoYRTRbEIHSMVahXUqJNtGk4bJqpUonirQx1mWUUbWz0oSNFslZQgcZBVbQRC8QEqchIdziPHuBsModO06c8Xw/r/Dx8fn/n5zDL0+Oj48dxhgjAIAVBsW7AQBA/yH0AcAihD4AWITQBwCLEPoAYBFCHwAs4op3A7fzySef9FutrKwstbS09Fs9alOb2tTuC9nZ2Td9jpk+AFiE0AcAixD6AGARQh8ALELoA4BFCH0AsAihDwAWIfQBwCKEPgBYZMB/Irc3Qk8+EtH6pyLcvvPlnRG+AgDii5k+AFjkrp7pxxN/ZQAYiJjpA4BFCH0AsAihDwAWIfQBwCK3fSN38+bNamhoUHp6utavXy9J6uzsVHl5uU6fPq3BgwertLRUKSkpMsaoqqpKjY2NSkxMVHFxsbxerySptrZWr7/+uiTpm9/8pgoLC/tuVACAG7rtTL+wsFCrV6++all1dbVycnJUWVmpnJwcVVdXS5IaGxt18uRJVVZWavHixdqyZYuky78kduzYoeeff17PP/+8duzYoc7OztiPBgBwS7cN/YkTJyolJeWqZYFAQAUFBZKkgoICBQIBSVJ9fb1mzJghh8OhcePG6ezZs2pra9P+/fs1efJkpaSkKCUlRZMnT9b+/ftjPxoAwC1FdZ1+e3u7MjMzJUkZGRlqb2+XJAWDQWVlZYXX83g8CgaDCgaD8ng84eVut1vBYPCG2/b7/fL7/ZKksrKyq7YXqUivfY/UrXqLZ+1IuVyumG6P2tSm9sCpfa1efzjL4XDI4XDEohdJks/nk8/nCz+O1xcZ34l49hbL2jZ9YTS1qW1D7Zh/MXp6erra2tokSW1tbUpLS5N0eQb/+YG1trbK7XbL7XartbU1vDwYDMrtdkdTGgDQC1GFfm5ururq6iRJdXV1ysvLCy/fs2ePjDE6cuSIkpKSlJmZqSlTpuj9999XZ2enOjs79f7772vKlCkxGwQA4M7c9vRORUWF/v3vf6ujo0NLlizRvHnzNHfuXJWXl6umpiZ8yaYkTZ06VQ0NDVq6dKkSEhJUXFwsSUpJSdGjjz6qn/zkJ5Kkb33rW9e9OQwA6Hu3Df1ly5bdcPmaNWuuW+ZwOLRo0aIbrl9UVKSioqLIugMAxBSfyAUAixD6AGARQh8ALELoA4BFCH0AsAihDwAWIfQBwCKEPgBYhNAHAIsQ+gBgEUIfACxC6AOARQh9ALAIoQ8AFiH0AcAihD4AWKTXX4yOgSf05CMRrX8qihrOl3dG8SoA8cZMHwAsQugDgEUIfQCwCKEPABYh9AHAIoQ+AFiE0AcAixD6AGARQh8ALELoA4BFCH0AsAihDwAWIfQBwCK9usvmrl27VFNTI4fDoREjRqi4uFifffaZKioq1NHRIa/Xq5KSErlcLl26dEkbN27UsWPHlJqaqmXLlmnIkCGxGgcA4A5EPdMPBoN68803VVZWpvXr16unp0fvvvuuXnvtNc2ePVsbNmxQcnKyampqJEk1NTVKTk7Whg0bNHv2bG3bti1mgwAA3Jlend7p6enRxYsXFQqFdPHiRWVkZKipqUn5+fmSpMLCQgUCAUlSfX29CgsLJUn5+fk6ePCgjDG96x4AEJGoT++43W594xvf0FNPPaWEhAR99atfldfrVVJSkpxOZ3idYDAo6fJfBh6PR5LkdDqVlJSkjo4OpaWlXbVdv98vv98vSSorK1NWVla0LUb15SCRuFVvd3Pt29WPhMvlitm2qE1tat9e1KHf2dmpQC
CgTZs2KSkpSS+88IL279/f64Z8Pp98Pl/4cUtLS6+32Vfi2Vu8fy6xqp+VlRW3sVCb2ndr7ezs7Js+F/XpnQMHDmjIkCFKS0uTy+XSAw88oMOHD6urq0uhUEjS5dm92+2WdHnW39raKkkKhULq6upSampqtOUBAFGIOvSzsrJ09OhRXbhwQcYYHThwQMOHD9ekSZO0b98+SVJtba1yc3MlSdOmTVNtba0kad++fZo0aZIcDkfvRwAAuGNRn94ZO3as8vPz9eMf/1hOp1OjRo2Sz+fT1772NVVUVOj3v/+9Ro8eraKiIklSUVGRNm7cqJKSEqWkpGjZsmWxGgMA4A716jr9efPmad68eVctGzp0qNatW3fdugkJCVq+fHlvygEAeolP5AKARQh9ALAIoQ8AFiH0AcAihD4AWITQBwCLEPoAYBFCHwAs0qsPZwHXCj35SETrR3NHUOfLO6N4FQCJmT4AWIXQBwCLEPoAYBFCHwAswhu5uGvwJjJwe8z0AcAihD4AWITQBwCLEPoAYBFCHwAsQugDgEUIfQCwCKEPABYh9AHAIoQ+AFiE0AcAixD6AGARQh8ALELoA4BFCH0AsAihDwAW6dWXqJw9e1YvvfSSTpw4IYfDoaeeekrZ2dkqLy/X6dOnNXjwYJWWliolJUXGGFVVVamxsVGJiYkqLi6W1+uN1TgAAHegVzP9qqoqTZkyRRUVFfr1r3+tYcOGqbq6Wjk5OaqsrFROTo6qq6slSY2NjTp58qQqKyu1ePFibdmyJRb9AwAiEHXod3V16dChQyoqKpIkuVwuJScnKxAIqKCgQJJUUFCgQCAgSaqvr9eMGTPkcDg0btw4nT17Vm1tbTEYAgDgTkV9eqe5uVlpaWnavHmz/vvf/8rr9eqJJ55Qe3u7MjMzJUkZGRlqb2+XJAWDQWVlZYVf7/F4FAwGw+sCAPpe1KEfCoV0/PhxLVy4UGPHjlVVVVX4VM4VDodDDocjou36/X75/X5JUllZ2VW/KCIVzRdfR+JWvd3NtW9V39bakXK5XDHbFrWpHYmoQ9/j8cjj8Wjs2LGSpPz8fFVXVys9PV1tbW3KzMxUW1ub0tLSJElut1stLS3h17e2tsrtdl+3XZ/PJ5/PF378+dcMNPHsLd4/F1vHHqvaWVlZcRsHte/+2tnZ2Td9Lupz+hkZGfJ4PPrkk08kSQcOHNDw4cOVm5ururo6SVJdXZ3y8vIkSbm5udqzZ4+MMTpy5IiSkpI4tQMA/axXl2wuXLhQlZWV6u7u1pAhQ1RcXCxjjMrLy1VTUxO+ZFOSpk6dqoaGBi1dulQJCQkqLi6OyQAAAHeuV6E/atQolZWVXbd8zZo11y1zOBxatGhRb8oBAHqJT+QCgEUIfQCwCKEPABYh9AHAIoQ+AFiE0AcAixD6AGARQh8ALELoA4BFCH0AsAihDwAWIfQBwCKEPgBYhNAHAIsQ+gBgEUIfACxC6AOARQh9ALAIoQ8AFiH0AcAihD4AWITQBwCLuOLdAHA3CD35SETrn4qihvPlnVG8CrgaM30AsAihDwAWIfQBwCKEPgBYhNAHAIsQ+gBgEUIfACxC6AOARXr94ayenh6tWrVKbrdbq1atUnNzsyoqKtTR0SGv16uSkhK5XC5dunRJGzdu1LFjx5Samqply5ZpyJAhsRgDAOAO9Xqmv3v3bg0bNiz8+LXXXtPs2bO1YcMGJScnq6amRpJUU1Oj5ORkbdiwQbNnz9a2bdt6WxoAEKFehX5ra6saGho0c+ZMSZIxRk1NTcrPz5ckFRYWKhAISJLq6+tVWFgoScrPz9fBgwdljOlNeQBAhHp1eufVV1/V/Pnzde7cOUlSR0eHkpKS5HQ6JUlut1vBYFCSFAwG5fF4JElOp1NJSUnq6OhQWlraVdv0+/3y+/2SpLKyMmVlZUXdXzT3N4nErXq7m2vfqj61+792pFwuV8y2Re2BX/taUYf+P//5T6Wnp8vr9aqpqSlmDfl8Pvl8vv
DjlpaWmG071uLZW7x/LraO/W6onZWVFbdxULt/ZGdn3/S5qEP/8OHDqq+vV2Njoy5evKhz587p1VdfVVdXl0KhkJxOp4LBoNxut6TLs/7W1lZ5PB6FQiF1dXUpNTU12vIAgChEHfqPPfaYHnvsMUlSU1OT3njjDS1dulQvvPCC9u3bp4ceeki1tbXKzc2VJE2bNk21tbUaN26c9u3bp0mTJsnhcMRmFIDFuK0zIhHz6/Qff/xx7dq1SyUlJers7FRRUZEkqaioSJ2dnSopKdGuXbv0+OOPx7o0AOA2YvIlKpMmTdKkSZMkSUOHDtW6deuuWychIUHLly+PRTkAQJT45iwAUePU0v8fbsMAABYh9AHAIoQ+AFiE0AcAixD6AGARQh8ALELoA4BFCH0AsAihDwAWIfQBwCKEPgBYhNAHAItwwzUA/5e42Vt0mOkDgEUIfQCwCKEPABYh9AHAIoQ+AFiEq3cAIEL/z1cOMdMHAIsQ+gBgEUIfACxC6AOARQh9ALAIoQ8AFiH0AcAihD4AWITQBwCLEPoAYJGob8PQ0tKiTZs26bPPPpPD4ZDP59OsWbPU2dmp8vJynT59WoMHD1ZpaalSUlJkjFFVVZUaGxuVmJio4uJieb3eWI4FAHAbUc/0nU6nFixYoPLycq1du1ZvvfWWPvroI1VXVysnJ0eVlZXKyclRdXW1JKmxsVEnT55UZWWlFi9erC1btsRqDACAOxR16GdmZoZn6vfee6+GDRumYDCoQCCggoICSVJBQYECgYAkqb6+XjNmzJDD4dC4ceN09uxZtbW1xWAIAIA7FZO7bDY3N+v48eMaM2aM2tvblZmZKUnKyMhQe3u7JCkYDCorKyv8Go/Ho2AwGF73Cr/fL7/fL0kqKyu76jWRiubOdpG4VW93c+1b1ac2tandt7V7q9ehf/78ea1fv15PPPGEkpKSrnrO4XDI4XBEtD2fzyefzxd+3NLS0tsW+0w8e4v3z8XWsVOb2v8PtbOzs2/6XK+u3unu7tb69ev19a9/XQ888IAkKT09PXzapq2tTWlpaZIkt9t91SBaW1vldrt7Ux4AEKGoQ98Yo5deeknDhg3TnDlzwstzc3NVV1cnSaqrq1NeXl54+Z49e2SM0ZEjR5SUlHTdqR0AQN+K+vTO4cOHtWfPHo0cOVIrV66UJH33u9/V3LlzVV5erpqamvAlm5I0depUNTQ0aOnSpUpISFBxcXFsRgAAuGNRh/6ECRO0ffv2Gz63Zs2a65Y5HA4tWrQo2nIAgBjgE7kAYBFCHwAsQugDgEUIfQCwCKEPABYh9AHAIoQ+AFiE0AcAixD6AGARQh8ALELoA4BFCH0AsAihDwAWIfQBwCKEPgBYhNAHAIsQ+gBgEUIfACxC6AOARQh9ALAIoQ8AFiH0AcAihD4AWITQBwCLEPoAYBFCHwAsQugDgEUIfQCwCKEPABYh9AHAIoQ+AFjE1d8F9+/fr6qqKvX09GjmzJmaO3duf7cAANbq15l+T0+PXnnlFa1evVrl5eXau3evPvroo/5sAQCs1q+h/5///Edf/OIXNXToULlcLj344IMKBAL92QIAWM1hjDH9VWzfvn3av3+/lixZIknas2ePjh49qh/84Afhdfx+v/x+vySprKysv1oDACsMuDdyfT6fysrK4hL4q1at6vea1KY2tandn/o19N1ut1pbW8OPW1tb5Xa7+7MFALBav4b+fffdp08//VTNzc3q7u7Wu+++q9zc3P5sAQCs1q+XbDqdTi1cuFBr165VT0+PHn74YY0YMaI/W7gln89HbWpTm9p3Ve1r9esbuQCA+Bpwb+QCAPoOoQ8AFrEq9M+ePau33npLktTU1BTXzwE8++yzcat9M7t371ZpaakqKyvj3UrMfX7fD3QLFiyIdwsxMRCOp4H2/+z111+Pdwv2hf7bb78d7zYkSc8991y8W7jO22+/rWeffVZLly6NdysxN5D2vS0iOZ5CoVCf9DDQ/p/9+c
9/jncLdr2RW1FRoUAgoOzsbLlcLiUmJio1NVUnTpyQ1+tVSUmJHA6Hjh07pt/97nc6f/680tLSVFxcrMzMzJj2smDBAm3dulVNTU364x//eMM++tKuXbv0zjvvSJKKior08ccf65133lF2drYefvhhzZkzp0/rS9KvfvUrtba26tKlS5o1a1afXuHw+X0/efJkSZdv/idJjz76qB588MGY1rvR2BYsWKBZs2apoaFBCQkJWrlypTIyMtTc3KwXX3xR58+fV15env76179q69atMevl2n2dl5endevWafz48Tpy5IjcbreeeeYZJSQkxKzmb3/72/DxVFhYqEOHDqm5uVmJiYlavHixvvzlL2v79u06deqUmpub5fF4tGzZspjVv+LK/7O2tjZVVFSoq6tLPT09WrRoke6///6Y1/u8a4+BU6dOaefOnRo5cqRGjBgRv8mVscipU6fM8uXLjTHGHDx40Hzve98zLS0tJhQKmdWrV5tDhw6ZS5cumZ/+9Kemvb3dGGPM3r17zaZNm2Ley/z582/ZR1/68MMPzfLly825c+fMuXPnTGlpqTl27JgpLi4Oj7s/dHR0GGOMuXDhglm+fLk5c+ZMn9X6/L5/7733zC9+8QsTCoVMW1ubWbJkiQkGgzGtd6Oxffvb3zaBQMAYY8zWrVvNjh07jDHGlJWVmdraWmOMMW+++Wb42IiFm+3r73znO+b48ePGGGPWr19v6urqYlbziivH0yuvvGK2b99ujDHmwIED5umnnzbGGPOHP/zBPPPMM+bChQsxr33FlZ/lzp07zZ/+9CdjjDGhUMh0dXX1Wc0rbnQMxHLfRqvfb608kIwZM0Yej0eSNGrUKDU3NyspKUknTpzQL3/5S0mX7wwa61n+nfQxYcKEPqv3wQcfaPr06brnnnskSdOnT9ehQ4f6rN7N7N69O3zDvZaWFn366adKTU3t87offPCBHnroIQ0aNEgZGRmaOHGiPvzww5h+UPBGY3O5XJo2bZokyev16l//+pck6fDhw1qxYoUkacaMGdq2bVvM+rjZvh4yZIhGjRoV7uX06dMxq3mjHq6M7ytf+Yo6OzvV1dUlScrNzY3pXxg3c9999+k3v/mNuru7NX369PDY+9KNjoGBwOrQ/8IXvhD+96BBg9TT0yNJGj58uNauXRv3Pu5mTU1NOnDggJ577jklJibqZz/7mS5duhTvtmLiZmNzOp3h03aDBg266jx2X5/Ou9a1x9zFixf7tf4ViYmJ/VJn4sSJ+vnPf66GhgZt2rRJc+bMUUFBQZ/VG8jHt1Vv5N577706d+7cLdfJzs7WmTNndOTIEUlSd3e3Tpw40R/t9ZsJEyYoEAjowoULOn/+vAKBQJ+f37xWV1eXkpOTlZiYqI8//lhHjx7t03qf3/f333+/3nvvPfX09OjMmTM6dOiQxowZE7NakY5t/Pjx2rt3ryTpH//4R8z6kAbGvp4wYYL+/ve/S7ochqmpqUpKSurXHk6fPq2MjAz5fD7NnDlTx48f79N6NzsGXC6Xuru7+7T27Vg1009NTdX48eO1YsUKJSQkKD09/bp1XC6XVqxYoaqqKnV1dSkUCmnWrFkD6nYRveX1elVYWKjVq1dLuvzm3ujRo/u1hylTpuhvf/ubSktL9aUvfUljx47t03qf3/dTpkzRyJEjtXLlSknS/PnzlZGREbNakY7t+9//vl588UX95S9/UV5eXsz6kG68r5OTk2Na43bmzZunzZs36+mnn1ZiYqJ++MMf9mt96fIvmzfeeENOp1P33HOPfvSjH/VpvZsdAzNnztTKlSs1evTouL2Ra9XVOwBgO6tO7wCA7Qh9ALAIoQ8AFiH0AcAihD4AWITQBwCLEPoAYJH/AZ8ymjzCQezSAAAAAElFTkSuQmCC\n",
|
||
"text/plain": [
|
||
"<Figure size 432x288 with 1 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"\n",
|
||
"\n",
|
||
"corpus=create_corpus(1)\n",
|
||
"\n",
|
||
"dic=defaultdict(int)\n",
|
||
"for word in corpus:\n",
|
||
" if word in stop:\n",
|
||
" dic[word]+=1\n",
|
||
"\n",
|
||
"top=sorted(dic.items(), key=lambda x:x[1],reverse=True)[:10] \n",
|
||
" \n",
|
||
"\n",
|
||
"\n",
|
||
"x,y=zip(*top)\n",
|
||
"plt.bar(x,y)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"id": "UaqkbAMx_ILh",
|
||
"tags": []
|
||
},
|
||
"source": [
|
||
"In both of them,\"the\" dominates which is followed by \"a\" in class 0 and \"in\" in class 1."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"id": "QxQNuTVe_ILh",
|
||
"tags": []
|
||
},
|
||
"source": [
|
||
"### Analyzing punctuations."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"id": "MgkVXm4X_ILi",
|
||
"tags": []
|
||
},
|
||
"source": [
|
||
"First let's check tweets indicating real disaster."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 14,
|
||
"metadata": {
|
||
"colab": {
|
||
"base_uri": "https://localhost:8080/",
|
||
"height": 337
|
||
},
|
||
"id": "3fJJLs_C_ILi",
|
||
"outputId": "2882472e-95c0-44f1-dda9-794d43257ed8",
|
||
"tags": []
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"<BarContainer object of 18 artists>"
|
||
]
|
||
},
|
||
"execution_count": 14,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
},
|
||
{
|
||
"data": {
|
||
"image/png": "\n",
|
||
"text/plain": [
|
||
"<Figure size 720x360 with 1 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"plt.figure(figsize=(10,5))\n",
|
||
"corpus=create_corpus(1)\n",
|
||
"\n",
|
||
"dic=defaultdict(int)\n",
|
||
"import string\n",
|
||
"special = string.punctuation\n",
|
||
"for i in (corpus):\n",
|
||
" if i in special:\n",
|
||
" dic[i]+=1\n",
|
||
" \n",
|
||
"x,y=zip(*dic.items())\n",
|
||
"plt.bar(x,y)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"id": "plgBvyxS_ILi",
|
||
"tags": []
|
||
},
|
||
"source": [
|
||
"Now,we will move on to class 0."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 15,
|
||
"metadata": {
|
||
"colab": {
|
||
"base_uri": "https://localhost:8080/",
|
||
"height": 337
|
||
},
|
||
"id": "e_1ffbrD_ILi",
|
||
"outputId": "d721a57b-9bb1-4271-ef9e-631f074801c4",
|
||
"tags": []
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"<BarContainer object of 20 artists>"
|
||
]
|
||
},
|
||
"execution_count": 15,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
},
|
||
{
|
||
"data": {
|
||
"image/png": "\n",
|
||
"text/plain": [
|
||
"<Figure size 720x360 with 1 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"plt.figure(figsize=(10,5))\n",
|
||
"corpus=create_corpus(0)\n",
|
||
"\n",
|
||
"dic=defaultdict(int)\n",
|
||
"import string\n",
|
||
"special = string.punctuation\n",
|
||
"for i in (corpus):\n",
|
||
" if i in special:\n",
|
||
" dic[i]+=1\n",
|
||
" \n",
|
||
"x,y=zip(*dic.items())\n",
|
||
"plt.bar(x,y,color='green')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"id": "EQYD1pao_ILj",
|
||
"tags": []
|
||
},
|
||
"source": [
|
||
"### Common words ?"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 16,
|
||
"metadata": {
|
||
"id": "L6y5aUjS_ILj",
|
||
"tags": []
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"\n",
|
||
"counter=Counter(corpus)\n",
|
||
"most=counter.most_common()\n",
|
||
"x=[]\n",
|
||
"y=[]\n",
|
||
"for word,count in most[:40]:\n",
|
||
" if (word not in stop) :\n",
|
||
" x.append(word)\n",
|
||
" y.append(count)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 17,
|
||
"metadata": {
|
||
"colab": {
|
||
"base_uri": "https://localhost:8080/",
|
||
"height": 282
|
||
},
|
||
"id": "z5YAMS8Y_ILj",
|
||
"outputId": "08fe71ae-23ec-4ba5-ade3-ebf90e6a1ac6",
|
||
"tags": []
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"<AxesSubplot:>"
|
||
]
|
||
},
|
||
"execution_count": 17,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
},
|
||
{
|
||
"data": {
|
||
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYUAAAD4CAYAAAAD6PrjAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAAAUUUlEQVR4nO3df2zU9eHH8ddBW8q1tHA9CrZSWUtZFjBma1G+OgeTi19ChC2bM7iM6Xf5zh91dEWqXEMYGwuzQijMwdaijRjiXJgCBpK57VRQpoxSrBlMC0zQIpRyHLSF0h/Xvr9/kL3jDfiC0t7nrvd8/NXez9e9805ffd/n/blzGWOMAACQNMTpAACA2EEpAAAsSgEAYFEKAACLUgAAWJQCAMBKcjrA9Tp+/LjTEWKG1+tVMBh0OkbMYDwiMR6REnk8cnJyrngdKwUAgBX3K4XkV99wOkLMaJWU7HSIGMJ4RGI8IsXzePR8664Be2xWCgAAi1IAAFiUAgDAohQAABalAACwKAUAgEUpAACsmC2FefPmOR0BABJOzJYCACD6KAUAgBV3H3MRCAQUCAQkSZWVlQ6nAYDBJe5KwefzyefzOR0DAAYl3j4CAFiUAgDAohQAAFbMlsLGjRudjgAACSdmSwEAEH2UAgDAohQAABalAACwKAUAgBV3ZzT/p55v3eV0hJjh9XoVDAadjhEzGI9IjEckxuPyWCkAACxKAQBgUQoAAItSAABYcX+g+dNtDzsdIWZ86nSAKMqdXeN0BGBQYqUAALAoBQCARSkAACxKAQBgUQoAAItSAABYlAIAwKIUAAAWpQAAsCgFAIAVdx9zEQgEFAgEJEmVlZUOpwGAwSXuSsHn88nn8zkdAwAGpZgshddee02vv/66JKmiokIej8fhRACQGGKyFGbOnKmZM2c6HQMAEg4HmgEAFqUAALAoBQCARSkAACxKAQBgUQoAAItSAABYMXmewueRO7vG6Qgxw+v1KhgMOh0DQBxjpQAAsCgFAIBFKQAALEoBAGDF/YHmba//j9MR8BmzZzzvdAQA14GVAgDAohQAABalAACwKAUAgEUpAAAsSgEAYFEKAABrwM5TaG9v17JlyyRJZ8+e1ZAhQ5SRkaFTp05p1KhRWr169UA9NQDgCxqwUhgxYoRWrlwpSdq0aZNSU1M1Z84ctbS06Omnnx6opwUAXAdHzmju6+tTdXW1Dh48KI/HoyeffFIpKSlqbm5WbW2t2traNGzYMD388MPKzc11IiIAJCRHjimcOHFCM2fOVFVVldxut3bv3i1JWr9+vX70ox/p6aef1rx58/Tcc89dct9AICC/3y+/3x/t2AAw6DmyUsjOztb48eMlSfn5+Tp16pQ6OzvV2Nioqqoqe7twOHzJfX0+n3w+X7SiAkBCcaQUkpOT7c9DhgxRd3e3+vr6lJaWZo9DAACiL2a2pLrdbmVnZ+vdd9+VJBljdPToUWdDAUCCiamPzi4tLdWzzz6rzZs3KxwO64477rBvMwEABp7LGGOcDnE9ajb+t9MR8Bmx9H0KXq9XwWDQ6Rgxg/GIlMjjkZOTc8XrYubtIwCA8ygFAIBFKQAALEoBAGBRCgAAK6a2pH4RsbTbxWmJvJsCQP9gpQAAsCgFAIBFKQAALEoBAGBRCgAAK+53H/3vnlVOR4hbz9260OkIAGIMKwUAgEUpAAAsSgEAYFEKAACLUgAAWJQCAMAa8FKYN2+eJCkUCmnVqovbR3fs2KHa2tqBfmoAwOcUtZWCx+PRwoXsiweAWBa1UmhpablsKezbt0+LFy9WW1ub3n//fS1evFiLFi1SVVWVOjs7oxUPACCHz2jes2ePtm/froqKCvX19Wnz5s1asmSJUlNTtXXrVm3fvl333ntvxH0CgYACgYAkqbKy0onYADBoOVYK+/fv10cffaTFixfL7Xarvr5ex44d05IlSyRJ4XBYEydOvOR+Pp9PPp8v2nEBICE4VgpjxoxRS0
uLTpw4oYKCAhljdPPNN6usrMypSACQ8Bzbkjp69GgtXLhQa9euVVNTkyZOnKjGxkY1NzdLkjo7O3X8+HGn4gFAQnL0mEJubq5KS0tVVVWlRYsW6bHHHtOvf/1r9fT0SJLmzp2rnJwcJyMCQEJxGWOM0yGux6ytbHP9ogb7R2d7vV4Fg0GnY8QMxiNSIo/H//fPNmc0AwAsSgEAYFEKAACLUgAAWJQCAMBydEtqfxjsO2g+j0TeTQGgf7BSAABYlAIAwKIUAAAWpQAAsCgFAIAV97uPHnrnTacjOGr97d90OgKAQYSVAgDAohQAABalAACwKAUAgEUpAAAsSgEAYFEKAADrms5TaGtr0+rVq9Xe3q7k5GQtXbpUqampA50NABBl11QKf/nLX/SVr3xF9913n0KhkJKS4v6cNwDAZVzTX/ekpCSdOHFCkuTxeOzlnZ2dWrFihc6fP69wOKy5c+dqypQpamlp0a9+9SsVFhbq4MGDKigo0PTp0/XHP/5Rra2tKi0t1YQJE7Rp0yadPHlSzc3Nam9v15w5c+Tz+SRJTzzxhFauXDkALxkAcCXXVApjx47Vq6++qoKCAt1999328uTkZJWXl8vtdqutrU2LFy9WcXGxJKm5uVmPP/64brzxRlVUVGjXrl1atmyZ9u7dq82bN+vJJ5+UJH3yySdavny5Ojs7tWjRIn3ta1+Tx+O5YiEEAgEFAgFJUmVl5XW9eABApKuWQigU0pYtW/TMM89o+fLlysjI0NSpU1VeXq5ly5bppZde0gcffCCXy6VQKKTW1lZJUnZ2tvLy8iRJ48aN08033yyXy6W8vDydOnXKPn5xcbFSUlKUkpKiSZMm6fDhw7r11luvmMfn89nVBACgf121FD788EPl5eVpxIgR8vv9+uUvf6nW1laNHj1ae/bsUVtbmyorK5WUlKTHHntM3d3dki6uIv7N5XLZ310ul/r6+iKu+6z//B0AED1X3ZJ600036cCBAwqFQho5cqQeeOAB1dbW6utf/7o6OjqUmZmppKQk7d+/P2IFcK3q6urU3d2t9vZ2HThwQAUFBZKksrKyz/1YAIDrc9WVQm5urubOnavly5crKSlJmZmZKisr04svvqgFCxbob3/7mxYuXKiCggLl5uZ+7gA33XSTfvGLX6i9vV3f/e535fF41NbWJmPMF3pBAIAvzmUc/Ou7adMmpaamas6cORGX19fX6+TJk5o1a9ZVH+Oel18cqHhx4bPfp+D1ehUMBh1ME1sYj0iMR6REHo+cnJwrXheTJxwUFRU5HQEAEpKjpXDfffc5+fQAgP/AZx8BACxKAQBgUQoAACsmDzR/Hp/dfQMAuD6sFAAAFqUAALAoBQCARSkAAKy4P9Bc+m6z0xEc8cx/jXU6AoBBiJUCAMCiFAAAFqUAALAoBQCARSkAACxKAQBgUQoAACtqpTBv3jxJUktLi37+859H62kBAJ8DKwUAgBX1M5qHDBmi9PR0SdKOHTu0Z88edXV1qbm5WbNnz1Y4HNZbb72l5ORkVVRU2NsCAAZe1FcKXq9X5eXl9vempiaVl5frqaee0ksvvaSUlBStWLFChYWF2rlz5yX3DwQC8vv98vv90YwNAAnB8c8+mjRpkoYPH67hw4fL7XaruLhYkpSXl6dPPvnkktv7fD75fL5oxwSAhOD4MYXk5GT785AhQ5SUlGR/7u3tdSoWACQkx0sBABA7KAUAgOUyxhinQ1yPe1/Z53QER1zu+xS8Xq+CwaADaWIT4xGJ8YiUyOORk5NzxetYKQAALEoBAGBRCgAAi1IAAFiUAgDAcvyM5ut1uV04AIAvhpUCAMCiFAAAFqUAALAoBQCAFfcHmt9/O8XpCI645c5upyMAGIRYKQAALEoBAGBRCgAAi1IAAFiUAgDAohQAABalAACwHD1P4dNPP9Xvfvc7XbhwQenp6VqwYIHWr1+vkydPSpIeeeQRFRYWOhkRABKK4yevzZ8/X2PGjNHvf/97BQ
IBzZo1S5MnT1ZDQ4P+8Ic/aMmSJU5HBICE4Wgp5Obm2p97enqUnp6uyZMnS5LC4bCSk5OdigYACSkmjik0NDSooaFBM2bMkCQFg0G98MIL+t73vnfJbQOBgPx+v/x+f7RjAsCg5/jbR319faqurtbSpUuVlpYmSdqwYYPuvfdeFRQUXHJ7n88nn88X7ZgAkBAcXymcOXNGbrdbN9xwg73s448/1le/+lUHUwFAYnK8FNLS0vTDH/4w4rIHHnhAbrfboUQAkLgcL4WOjg69/vrrEZf99a9/VVdXl0OJACBxOX5MwePxaOHChRGXVVRUOJQGABKb4ysFAEDsoBQAABalAACwKAUAgEUpAAAsx3cfXa9b7ux2OgIADBqsFAAAFqUAALAoBQCARSkAAKy4P9A85KU2pyNcl777M5yOAAAWKwUAgEUpAAAsSgEAYFEKAACLUgAAWJQCAMCiFAAAVsyVwp49e3Ts2DGnYwBAQoq5Uqirq6MUAMAhUTmj+eWXX9bbb7+tjIwMZWVlKT8/X7feeqtqa2vV1tamYcOG6eGHH9a5c+e0d+9e/fOf/9Qrr7yihQsXauzYsdGICABQFErh8OHD+vvf/66VK1eqt7dXixYtUn5+vtavX68f//jHuuGGG3To0CE999xzWrp0qYqLi1VUVKSpU6de9vECgYACgYAkqbKycqDjA0BCGfBSaGxs1JQpU5SSkiJJKioqUk9PjxobG1VVVWVvFw6Hr+nxfD6ffD7fgGQFgETnyAfiGWOUlpamlStXOvH0AIArGPADzV/+8pdVX1+v7u5udXZ2at++fUpJSVF2drbeffddSRdL4ujRo5Kk4cOH68KFCwMdCwBwGQO+UpgwYYKKior0xBNPKDMzU+PGjZPb7VZpaameffZZbd68WeFwWHfccYfGjx+v22+/XTU1NfrTn/6kxx9/nAPNABBFLmOMGegn6ezsVGpqqrq6urR06VI99NBDys/P75fHbl71Yb88jlP68/sUvF6vgsFgvz1evGM8IjEekRJ5PHJycq54XVSOKdTU1OjYsWPq6enRtGnT+q0QAAD9Kyql8NOf/jQaTwMAuE4xd0YzAMA5lAIAwKIUAACWIyev9af+3L0DAImOlQIAwKIUAABWVE5eAwDEh7heKfj9fqcjxBTGIxLjEYnxiMR4XF5clwIAoH9RCgAAK65LgS/bicR4RGI8IjEekRiPy+NAMwDAiuuVAgCgf1EKAAArbj/moqGhQc8//7z6+vo0Y8YMffvb33Y60oALBoNat26dzp49K5fLJZ/Pp1mzZuncuXNavXq1Tp06pdGjR2vBggVKT0+XMUbPP/+83nvvPQ0bNkwlJSWD7rss+vr65Pf75fF45Pf71dLSojVr1qi9vV35+fmaP3++kpKS1NPTo7Vr1+qjjz7SiBEjVFZWpuzsbKfj97vz58+rurpaTU1NcrlcevTRR5WTk5Ow82P79u1644035HK5NG7cOJWUlOjs2bMJPUeuysSh3t5e85Of/MQ0Nzebnp4eU15ebpqampyONeBCoZD517/+ZYwxpqOjw5SWlpqmpiazceNGs2XLFmOMMVu2bDEbN240xhhTX19vli9fbvr6+kxjY6OpqKhwKvqA2bZtm1mzZo156qmnjDHGrFq1yuzatcsYY0xNTY3585//bIwx5rXXXjM1NTXGGGN27dplqqqqnAk8wH7zm9+YQCBgjDGmp6fHnDt3LmHnx+nTp01JSYnp6uoyxlycG2+++WbCz5Gricu3jw4fPqyxY8dqzJgxSkpK0u233666ujqnYw24UaNG2f/khg8frtzcXIVCIdXV1WnatGmSpGnTptmx2Lt3r77xjW/I5XJp4sSJOn/+vM6cOeNY/v52+vRp7du3TzNmzJAkGWN04MABTZ06VZI0ffr0iLGYPn26JGnq1Knav3+/zCDbY9HR0aEPPvhAd911lyQpKSlJaWlpCTs/pIsrye7ubvX29qq7u1sjR45M6DlyLe
Ly7aNQKKSsrCz7e1ZWlg4dOuRgouhraWnRkSNHNGHCBLW2tmrUqFGSpJEjR6q1tVXSxXHyer32PllZWQqFQva28W7Dhg36wQ9+oAsXLkiS2tvb5Xa7NXToUEmSx+NRKBSSFDlnhg4dKrfbrfb2dmVkDJ5P2W1paVFGRoZ++9vf6uOPP1Z+fr4efPDBhJ0fHo9Hs2fP1qOPPqqUlBTdcsstys/PT+g5ci3icqWQ6Do7O7Vq1So9+OCDcrvdEde5XC65XC6HkkVPfX29MjMzB9174Nejt7dXR44c0d13360VK1Zo2LBh2rp1a8RtEmV+SNK5c+dUV1endevWqaamRp2dnWpoaHA6VsyLy5WCx+PR6dOn7e+nT5+Wx+NxMFH0hMNhrVq1Snfeeaduu+02SVJmZqbOnDmjUaNG6cyZM/Y/G4/Ho2AwaO87mMapsbFRe/fu1Xvvvafu7m5duHBBGzZsUEdHh3p7ezV06FCFQiH7ev89Z7KystTb26uOjg6NGDHC4VfRv7KyspSVlaXCwkJJF98C2bp1a0LOD0n6xz/+oezsbPt6b7vtNjU2Nib0HLkWcblSKCgo0IkTJ9TS0qJwOKx33nlHxcXFTscacMYYVVdXKzc3V/fcc4+9vLi4WDt37pQk7dy5U1OmTLGXv/XWWzLG6ODBg3K73YPmrYHvf//7qq6u1rp161RWVqbJkyertLRUkyZN0u7duyVJO3bssPOiqKhIO3bskCTt3r1bkyZNGnT/MY8cOVJZWVk6fvy4pIt/FG+88caEnB+S5PV6dejQIXV1dckYY8cjkefItYjbM5r37dunF154QX19ffrmN7+p73znO05HGnAffvihfvaznykvL89O1vvvv1+FhYVavXq1gsHgJVsOa2tr9f777yslJUUlJSUqKChw+FX0vwMHDmjbtm3y+/06efKk1qxZo3PnzulLX/qS5s+fr+TkZHV3d2vt2rU6cuSI0tPTVVZWpjFjxjgdvd8dPXpU1dXVCofDys7OVklJiYwxCTs/Nm3apHfeeUdDhw7V+PHj9cgjjygUCiX0HLmauC0FAED/i8u3jwAAA4NSAABYlAIAwKIUAAAWpQAAsCgFAIBFKQAArP8D7z3sLzPbcv4AAAAASUVORK5CYII=\n",
|
||
"text/plain": [
|
||
"<Figure size 432x288 with 1 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"sns.barplot(x=y,y=x)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"id": "u3tZhA9d_ILk",
|
||
"tags": []
|
||
},
|
||
"source": [
|
||
"Lot of cleaning needed !"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"id": "8_PnwdSs_ILk",
|
||
"tags": []
|
||
},
|
||
"source": [
|
||
"### Ngram analysis"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"id": "tAIgaAvE_ILk",
|
||
"tags": []
|
||
},
|
||
"source": [
|
||
"we will do a bigram (n=2) analysis over the tweets.Let's check the most common bigrams in tweets."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 18,
|
||
"metadata": {
|
||
"id": "N15pbWGx_ILk",
|
||
"tags": []
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"def get_top_tweet_bigrams(corpus, n=None):\n",
|
||
" vec = CountVectorizer(ngram_range=(2, 2)).fit(corpus)\n",
|
||
" bag_of_words = vec.transform(corpus)\n",
|
||
" sum_words = bag_of_words.sum(axis=0) \n",
|
||
" words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]\n",
|
||
" words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)\n",
|
||
" return words_freq[:n]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 19,
|
||
"metadata": {
|
||
"colab": {
|
||
"base_uri": "https://localhost:8080/",
|
||
"height": 337
|
||
},
|
||
"id": "-McUR_eY_ILk",
|
||
"outputId": "a341561b-f94c-4fe2-ea77-8481508a3872",
|
||
"tags": []
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"<AxesSubplot:>"
|
||
]
|
||
},
|
||
"execution_count": 19,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
},
|
||
{
|
||
"data": {
|
||
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAmsAAAEvCAYAAAAabYYDAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAAAkK0lEQVR4nO3df3RU9Z3/8ddMfhIhQJIGDWghQQmILMvGSLFCgNHa6LFLdVmXhrNY1OUbFYMhNHCWjVZT0CypS4EiINSD2lO0sC4UAaeSWg6mjSBQA0lKgDUsIZMxSAJJSDJzv39wnDaFCCSTzIfJ8/HX3Ln3c+/78tac1/ncuffaLMuyBAAAACPZA10AAAAAOkZYAwAAMBhhDQAAwGCENQAAAIMR1gAAAAxGWAMAADAYYQ0AAMBgoYEuoDudOnUq0CWgh8TFxcntdge6DPQAet270O/eo7f3OiEhocN1zKwBAAAYLKhn1sLe+zDQJaCHnJUUFugi0CPode9Cv3sPU3vd+r0pgS6BmTUAAACTEdYAAAAMRlgDAAAwGGENAADAYIQ1AAAAgxHWAAAADHbNYc3lcik7O/uy64qKilRXV+db/s1vfqMLFy50vjoAAIBezq8za0VFRTpz5oxvefv27YQ1AACALujUQ3G9Xq9Wr16tiooKxcTEaMGCBdq/f78qKyu1fPlyhYeHa/Lkyaqrq9MLL7yg6Oho5eXlaebMmZo6daoOHTqkAQMGKCsrS9HR0e32/eWXX2rt2rVyuVySpMcff1wjRozQtm3btHv3bknSlClT9MADD3Tx1AEAAMzXqbBWXV2tZ599VnPmzFFhYaGKi4s1ceJE7dixQzNnzlRSUpKki5dB8/LyfIHswoULSkpK0qxZs/Tuu+/qnXfe0ezZs9vte8OGDRo1apRycnLk9XrV3NysY8eOaffu3crPz5ckLVq0SKNGjdKwYcO6cu4AAADG69Rl0Pj4eA0dOlSSlJiYqNra2qsaZ7PZNGHCBEnSPffco7Kysku2+eyzz3TfffddLM5uV1RUlMrKypSamqrIyEhFRkYqNTVVR44cuWSs0+lUbm6ucnNzO3NaAAAAxunUzFpY2F/e3mW329XS0tKpg9tstk6N64jD4ZDD4fDrPgEAAALJrzcYREZGqqmpqd1yc3Ozb9myLBUXF0uS9uzZo+Tk5Ev2cccdd2jXrl2SLv42rrGxUcnJySopKdGFCxfU3NyskpISjRw50p+lAwAAGKlTM2sdSUtL09q1axUeHq78/Hw5HA7l5+crJiZGeXl5ioiI0NGjR7V582ZFR0dr3rx5l+xj1qxZWrNmjT788EPZ7XY98cQTuu2225SWlqZFixZJuniDAb9XAwAAvYHNsiyrpw42c+ZMbdy4sacOp9qfv9ljxwIAAMGn9XtTeuQ4CQkJHa7jDQYAAAAG69Gw1pOzagAAAMGAmTUAAACDEdYAAAAMRlgDAAAwmF8f3WGanrqDA4EXFxcnt9sd6DLQA+h170K/ew963TFm1gAAAAxGWAMAADAYYQ0AAMBghDUAAACDEdYAAAAMFtR3g9Zv+X+BLsEvoqf9PNAlAACAAGFmDQAAwGCENQAAAIMR1gAAAAxGWAMAADAYYQ0AAMBghDUAAACDfW1Yc7lcys7Ovuy6oqIi1dXV+ZZ/85vf6MKFC/6tDgAAoJfr9MxaUVGRzpw541vevn07YQ0AAMDPrvhQXK/Xq9WrV6uiokIxMTFasGCB9u/fr8rKSi1fvlzh4eGaPHmy6urq9MILLyg6Olp5eXmaOXOmpk6dqkOHDmnAgAHKyspSdHS0tm/frg8++EAhISEaMmSIsrKyLjnem2++qYMHD8pms2nq1Kn67ne/qz/96U/auHGjPB6PkpKS9MQTTygsLKy7/l0AAACMcMWwVl1drWeffVZz5sxRYWGhiouLNXHiRO3YsUMzZ85UUlKSpIuXQfPy8hQdHS1JunDhgpKSkjRr1iy9++67eu
eddzR79my99957WrFihcLCwnT+/PlLjud0OlVbW6tXXnlFISEhOnfunFpaWrRq1SotXrxYCQkJWrFihXbt2qUHHnjAz/8cAAAAZrniZdD4+HgNHTpUkpSYmKja2tqr2rHNZtOECRMkSffcc4/KysokSbfccouWL1+ujz76SCEhIZeMO3TokO69917fur59++rUqVOKj49XQkKCJGnSpEk6cuTIJWOdTqdyc3OVm5t7VTUCAACY7ooza399qdFut6ulpaVTB7LZbJKkhQsX6vDhw9q3b5+2bNmi//zP/7xsaOsMh8Mhh8Phl30BAACYoNM3GERGRqqpqandcnNzs2/ZsiwVFxdLkvbs2aPk5GR5vV653W6NHj1aP/jBD9TY2NhujCSNGTNGH3zwgTwejyTp3LlzSkhIkMvl0unTpyVJH330kUaNGtXZ0gEAAK4bV5xZ60haWprWrl2r8PBw5efny+FwKD8/XzExMcrLy1NERISOHj2qzZs3Kzo6WvPmzZPX69XPfvYzNTY2SpK++93v6oYbbmi336lTp6q6ulrz589XaGiopk6dqvvvv1+ZmZkqLCz03WBw7733du3MAQAArgM2y7Ks7tjxzJkztXHjxu7Y9VUrW/m9gB7fX6Kn/TzQJRgvLi5Obrc70GWgB9Dr3oV+9x69vddf/S7/cniDAQAAgMG6LawFelYNAAAgGDCzBgAAYDDCGgAAgMEIawAAAAYjrAEAABis089Zux7wyAsAAHC9Y2YNAADAYIQ1AAAAgxHWAAAADEZYAwAAMFhQ32Cwd+cPA13CJSZ8Z32gSwAAANcRZtYAAAAMRlgDAAAwGGENAADAYIQ1AAAAgxHWAAAADEZYAwAAMJjfw9q///u/X9P2RUVFqqur8y0/9dRTqq+v93dZAAAA1yW/h7WXXnrpmrYvKirSmTNn/F0GAABAUPD7Q3FnzpypjRs3qrS0VO+884769eunqqoqJSYm6plnnpHNZvNtW1xcrMrKSi1fvlzh4eHKz8+XJO3YsUP79u1TW1ubnnvuOQ0ePFjNzc1av369qqqq5PF49E//9E+68847/V0+AACAUbr1N2vHjx/XrFmzVFhYqJqaGpWXl7dbP378eCUlJWnu3LkqKChQeHi4JKlfv356+eWXdd9992nr1q2SpM2bN2v06NFasmSJ8vLy9Oabb6q5ubk7ywcAAAi4bn3d1PDhwxUbGytJGjp0qFwul5KTk6847q677pIkJSYm6o9//KMk6dChQ9q3b58vvLW0tMjtdmvIkCG+cU6nU06nU5K0dOlSv54LAABAIHRrWAsLC/N9ttvt8nq9VzUuNDTUN8bj8UiSLMtSdna2EhISOhzncDjkcDi6UDEAAIBZAv7ojsjISDU1NV1xu7/7u7/T+++/L8uyJF28xAoAABDsAh7W0tLStHbtWuXk5KilpaXD7R555BF5PB7Nnz9fzz33nH71q1/1YJUAAACBYbO+mqoKQu9uuD/QJVxiwnfWB7qEoBQXFye32x3oMtAD6HXvQr97j97e66/7mVfAZ9YAAADQMcIaAACAwQhrAAAABiOsAQAAGIywBgAAYDDCGgAAgMG69Q0GgcZjMgAAwPWOmTUAAACDEdYAAAAMRlgDAAAwGGENAADAYEF9g8GaPY8F7NhPfntDwI4NAACCBzNrAAAABiOsAQAAGIywBgAAYDDCGgAAgMEIawAAAAYjrAEAABisR8La9u3bNW/ePC1fvrzd9ydOnND+/ft9y5s2bdL//M//9ERJAAAA14Ueec7arl27tHjxYsXGxrb7/sSJE6qsrNS4ceN6ogwAAIDrjl/D2rZt27R7925J0pQpU/TAAw9ozZo1qqmp0U9+8hNNnjxZDz74oCSpra1Nv/rVr9TS0qKysjJNmzZNknTy5Ek9//zzcrvdSk9PV3p6uiTpo48+0vvvv6+2tjbdeuutevzxx2W3cxUXAAAEN7+FtWPHjmn37t3Kz8+XJC1atEijRo3Sk0
8+qYMHDyovL0/R0dF/OXBoqP75n/9ZlZWVmj17tqSLl0FPnTqlvLw8NTU1KSsrS/fdd59Onz6tvXv36sUXX1RoaKjWrVun3//+95o0aZK/ygcAADCS38JaWVmZUlNTFRkZKUlKTU3VkSNHNGzYsGvaz7hx4xQWFqawsDD1799fZ8+e1Weffabjx49r4cKFkqSWlpZ2we8rTqdTTqdTkrR06dIunhEAAEDgGfdu0NDQv5Rkt9vl8XhkWZYmTZqkGTNmfO1Yh8Mhh8PR3SUCAAD0GL/96Cs5OVklJSW6cOGCmpubVVJSopEjR37tmMjISDU1NV1x33fccYeKi4t19uxZSdK5c+dUW1vrl7oBAABM5reZtcTERKWlpWnRokWSLt5gcKVLoKNHj9Z7772nnJwc3w0GlzNkyBA9+uijeumll2RZlkJCQjR79mx94xvf8Ff5AAAARrJZlmUFuoju8vym7wTs2E9+e0PAjt0bxcXFye12B7oM9AB63bvQ796jt/c6ISGhw3U8+wIAAMBghDUAAACDEdYAAAAMRlgDAAAwGGENAADAYIQ1AAAAgxn3BgN/4vEZAADgesfMGgAAgMEIawAAAAYjrAEAABiMsAYAAGCwoL7B4PE/rO+R46y764c9chwAAND7MLMGAABgMMIaAACAwQhrAAAABiOsAQAAGIywBgAAYDDCGgAAgMG6FNY+//xzFRcX+6sWAAAA/I1OhzXLsrRp0yb94Q9/kMvluupxK1euvGzAc7lc2rNnj2/5xIkT2r9/f2fLAwAACAqdDmtut1sPPfSQfvjDH+r06dNdLqS2tvaSsPbpp592eb8AAADXs069wcDlcunll1/WsmXLJF0MVmVlZZo+ffpVjT98+LC2bdumL7/8UhkZGRo/frzefvttnTx5Ujk5Obr77ru1c+dOtbS0qKysTNOmTdPJkydVU1Oj06dPq6GhQQ899JAcDkdnygcAALhuBOR1U19++aV+/OMf69SpU3r55Zc1fvx4zZgxQ1u3blVubq4kacCAAaqsrNTs2bMlSZs2bdLnn3+u/Px8NTc360c/+pHGjRunmJgY336dTqecTqckaenSpT1/YgAAAH4WkLB25513ym63a8iQITp79uxVj0tJSVF4eLjCw8N1++236+jRo0pNTfWtdzgczLYBAICg0qnfrIWEhMjr9fqWW1tbr2l8WFiY77NlWVc9zmazfe0yAABAsOlUWOvfv7/q6+vV0NCg1tZWv9y12adPHzU1NfmWIyMj2y1LUklJiVpaWtTQ0KDS0lIlJSV1+bgAAAAm69Rl0NDQUD388MNatGiRYmJilJCQ0OVCbrnlFtntduXk5GjSpElKS0vTe++9p5ycHE2bNk2S9M1vflMvvPCCGhoa9PDDD7f7vRoAAEAwslnXch0ygDZt2qTIyEg99NBDVz0mfctL3VjRX6y764c9chx0LC4uTm63O9BloAfQ696Ffvcevb3XXzfxxeumAAAADObXu0HXrVun8vLydt+lp6dr8uTJXd731T7DDQAAIJj4Naw9/vjj/twdAABAr8dlUAAAAIMR1gAAAAwWkDcY9BTu0gQAANc7ZtYAAAAMRlgDAAAwGGENAADAYIQ1AAAAgxHWAAAADBbUd4M+uXd3p8eumdD1ty4AAAB0FTNrAAAABiOsAQAAGIywBgAAYDDCGgAAgMEIawAAAAYjrAEAABgs4GHtj3/8o06ePOlbfv7551VZWRnAigAAAMwR8LBWUlLSLqwBAADgL/zyUNxt27Zp9+6LD6CdMmWKHnjgAblcLi1ZskQjRoxQRUWFYmJitGDBAoWHh/vGlZeX65NPPtHhw4f161//WtnZ2ZKkjz/+WOvWrVNjY6PmzJmjkSNHyuv16q233tLhw4fV2tqq73znO7r33nv9UT4AAICxujyzduzYMe3evVv5+fnKz8/Xb3/7Wx0/flySVF1drfvvv1+FhYWKiopScXFxu7EjRoxQSkqKZs6cqYKCAt
14442SJK/XqyVLluhf//Vf9e6770qSPvzwQ0VFRWnJkiVasmSJfvvb38rlcrXbn9PpVG5urnJzc7t6WgAAAEbo8sxaWVmZUlNTFRkZKUlKTU3VkSNHlJKSovj4eA0dOlSSlJiYqNra2qvaZ2pqqm/MV4Hs4MGD+vzzz32Br7GxUdXV1YqPj/eNczgccjgcXT0lAAAAY3Tru0HDwsJ8n+12u1paWq5pnN1ul9frlSRZlqXHHntMY8eO9XudAAAApuryZdDk5GSVlJTowoULam5uVklJiUaOHHnV4/v06aOmpqYrbjd27Fjt2rVLbW1tkqRTp06pubm503UDAABcD7o8s5aYmKi0tDQtWrRI0sUbDIYNG3bJ78k6MmHCBL322mt6//339dxzz3W43ZQpU+RyufSjH/1IkhQdHa2cnJyulg8AAGA0m2VZVqCL6C4PvvtWp8eumTDZj5Wgu8XFxcntdge6DPQAet270O/eo7f3OiEhocN1AX/OGgAAADpGWAMAADAYYQ0AAMBghDUAAACDEdYAAAAM1q0PxQ007ugEAADXO2bWAAAADEZYAwAAMBhhDQAAwGCENQAAAIMR1gAAAAwW1HeDPvXxsU6PXfmtRD9WAgAA0DnMrAEAABiMsAYAAGAwwhoAAIDBCGsAAAAGI6wBAAAYjLAGAABgML+FtfPnz2vnzp1dGlNaWqqlS5f6qyQAAIDrnl/D2q5du7p9DAAAQG/it4fivv322zp9+rRycnI0ZswYZWRk6M0339SBAwckSQ8//LAmTJjwtWPGjRun5uZmLVu2TFVVVUpMTNQzzzwjm82mY8eO6Y033lBzc7Oio6OVmZmpgQMH+qt8AAAAI/ktrM2YMUNVVVUqKCiQJBUXF+vEiRMqKChQfX29Fi5cqJEjR7YLWH87prS0VMePH1dhYaEGDhyoxYsXq7y8XMOHD9f69eu1YMECRUdHa+/evfrlL3+pzMxMf5UPAABgpG573VRZWZnuvvtu2e12DRgwQKNGjVJlZaVSUlK+dtzw4cMVGxsrSRo6dKhcLpeioqJUVVWlF198UZLk9XovO6vmdDrldDolid++AQCAoGDcu0HDwsJ8n+12u7xeryRpyJAhys/P/9qxDodDDoejW+sDAADoSX67waBPnz5qamryLY8cOVIff/yxvF6v6uvrdeTIEQ0fPvxrx3QkISFB9fX1qqiokCS1tbWpqqrKX6UDAAAYy28za/369dOIESOUnZ2tsWPHKiMjQxUVFcrJyZEkZWRkaMCAAV87Zty4cZcvMjRU2dnZ2rBhgxobG+XxeJSenq6bb77ZX+UDAAAYyWZZlhXoIrrLtF/v6fTYld9K9GMl6G5xcXFyu92BLgM9gF73LvS79+jtvU5ISOhwHW8wAAAAMBhhDQAAwGCENQAAAIMR1gAAAAxGWAMAADAYYQ0AAMBgxr3BwJ94/AYAALjeMbMGAABgMMIaAACAwQhrAAAABiOsAQAAGCyobzDY8oeQax4z7S5PN1QCAADQOcysAQAAGIywBgAAYDDCGgAAgMEIawAAAAYjrAEAABiMsAYAAGCwbgtr58+f186dO69pjMvlUnZ2djdVBAAAcP3p1rC2a9eu7to9AABAr9BtD8V9++23dfr0aeXk5GjMmDHKyMjQm2++qQMHDkiSHn74YU2YMOGScR6PR8uXL9fx48c1ZMgQPf3004qIiNCxY8f0xhtvqLm5WdHR0crMzNTAgQO7q3wAAAAjdFtYmzFjhqqqqlRQUCBJKi4u1okTJ1RQUKD6+notXLhQI0eOvCRwnTp1SnPmzFFycrJWrVqlnTt3Kj09XevXr9eCBQsUHR2tvXv36pe//KUyMzO7q3wAAAAj9NjrpsrKynT33XfLbrdrwIABGjVqlCorK5WSktJuu9jYWCUnJ0uSJk6cqO3bt2vs2LGqqqrSiy++KEnyer2XnVVzOp1yOp2SpKVLl3bzGQEAAHQ/494NarPZLrs8ZMgQ5efnf+
1Yh8Mhh8PRbbUBAAD0tG67waBPnz5qamryLY8cOVIff/yxvF6v6uvrdeTIEQ0fPvyScW63WxUVFZKkPXv2KDk5WQkJCaqvr/d939bWpqqqqu4qHQAAwBjdNrPWr18/jRgxQtnZ2Ro7dqwyMjJUUVGhnJwcSVJGRoYGDBhwybiEhATt2LFDP//5zzV48GDdd999Cg0NVXZ2tjZs2KDGxkZ5PB6lp6fr5ptv7q7yAQAAjGCzLMsKdBHdZeWWmmseM+0uTzdUgu4WFxcnt9sd6DLQA+h170K/e4/e3uuEhIQO1/EGAwAAAIMR1gAAAAxGWAMAADAYYQ0AAMBghDUAAACDEdYAAAAMZtwbDPyJx3AAAIDrHTNrAAAABiOsAQAAGIywBgAAYDDCGgAAgMGC+gaDmu1Xf3qD0tu6sRIAAIDOYWYNAADAYIQ1AAAAgxHWAAAADEZYAwAAMBhhDQAAwGCENQAAAIN1Kaxt375d8+bN0/Llyzs13uVyac+ePb7loqIivf76610pCQAAIKh06Tlru3bt0uLFixUbG3tV23s8HoWEhPiWa2trtWfPHn3729/uShkAAABBq9Nhbc2aNaqpqdFPfvITTZ48WWlpaVq1apVcLpciIiL05JNP6pvf/KY2bdqkmpoauVwuxcbGKisry7ePt99+WydPnlROTo4mTZqkvn376syZM8rPz1dNTY1SU1OVkZEhSTp48KA2bdqktrY2DRo0SJmZmYqMjOzyPwAAAIDJOh3WnnzySR08eFB5eXmKjo7W+vXrNWzYMC1YsECfffaZVqxYoYKCAknSyZMn9eKLLyo8PLzdPmbMmKGtW7cqNzdX0sXLoCdOnNArr7yi0NBQZWVl6f7771d4eLg2b96sxYsXKzIyUv/93/+tbdu26ZFHHunCqQMAAJjPb6+bKisrU3Z2tiRp9OjROnfunBobGyVJKSkplwS1jowePVpRUVGSpCFDhsjtduv8+fM6efKkFi9eLElqa2vTbbfddslYp9Mpp9MpSVq6dGmXzwkAACDQeuTdoBEREVe9bVhYmO+z3W6Xx+ORZVm644472l1CvRyHwyGHw9HZMgEAAIzjt0d3JCcn6/e//70kqbS0VP369fPNkHWkT58+ampquuK+b7vtNpWXl+v06dOSpObmZp06darrRQMAABjObzNr06dP16pVqzR//nxFREToqaeeuuKYW265RXa7vd0NBpcTHR2tp556Sv/1X/+l1tZWSdKjjz6qhIQEf5UPAABgJJtlWVagi+gun65zXfW2g9LburESdLe4uDi53e5Al4EeQK97F/rde/T2Xn/dBBRvMAAAADAYYQ0AAMBghDUAAACDEdYAAAAMRlgDAAAwGGENAADAYD3yBoNA4XEcAADgesfMGgAAgMEIawAAAAYjrAEAABiMsAYAAGCwoL7BIGRT9VVt55l+UzdXAgAA0DnMrAEAABiMsAYAAGAwwhoAAIDBCGsAAAAGI6wBAAAYjLAGAABgsB4Ja5s3b76qdS6XS9nZ2T1REgAAwHWhR8Lali1bOrUOAACgt/PrQ3FfeeUVffHFF2ptbVV6erocDofeeusttbS0KCcnRzfffLPmzp3r2/5v1z366KPyer1avXq1KioqFBMTowULFig8PFynT5/W66+/rvr6ekVEROjf/u3fNHjwYH+WDwAAYBy/hrXMzEz17dtXLS0tWrhwoe666y794Ac/0I4dO1RQUHDJ9n+7zuVyqbq6Ws8++6zmzJmjwsJCFRcXa+LEiVqzZo2eeOIJ3XTTTfrzn/+sdevWKS8vz5/lAwAAGMevYW379u0qKSmRJLndblVXV6tfv37XtI/4+HgNHTpUkpSYmKja2lo1NzervLxchYWFvu3a2touGet0OuV0OiVJS5cu7eRZAAAAmMNvYa20tFR/+tOf9NJLLykiIkLPP/+8Wltbr3k/YWFhvs92u10tLS3yer264YYbLjs799ccDoccDsc1HxMAAMBUfrvBoL
GxUTfccIMiIiL0f//3f/rzn//sWxcaGnrZmbArrftKVFSU4uPj9fHHH0uSLMvSiRMn/FU6AACAsfw2szZ27Fh98MEHmjdvnm666SbdeuutvnVTp05VTk6Ohg0b1u4Gg79d9+ijj3a4/7lz52rt2rXavHmz2tradPfdd/sulwIAAAQrm2VZVqCL6C41r+67qu0802/q5krQ3eLi4uR2uwNdBnoAve5d6Hfv0dt7nZCQ0OE63mAAAABgMMIaAACAwQhrAAAABiOsAQAAGIywBgAAYDDCGgAAgMH8+rop0/BIDgAAcL1jZg0AAMBghDUAAACDBfUbDAAAAK53QTuzlpubG+gS0IPod+9Br3sX+t170OuOBW1YAwAACAaENQAAAIMFbVhzOByBLgE9iH73HvS6d6HfvQe97hg3GAAAABgsaGfWAAAAgkFQvsHgwIED2rBhg7xer6ZOnap//Md/DHRJuEarVq3S/v371b9/fy1btkySdO7cOf30pz9VbW2tvvGNb2jevHnq27evLMvShg0b9OmnnyoiIkKZmZlKTEyUJBUVFWnz5s2SpO9///tKS0sL1CmhA263WytXrtSXX34pm80mh8Oh9PR0+h2kWlpalJeXp7a2Nnk8Ho0fP17Tp0+Xy+XSq6++qoaGBiUmJuqZZ55RaGioWltbtWLFCh07dkz9+vVTVlaW4uPjJUlbtmzRhx9+KLvdrscee0xjx44N7Mnhsrxer3JzcxUTE6Pc3Fx63RlWkPF4PNbTTz9tnT592mptbbXmz59vVVVVBbosXKPS0lKrsrLSeu6553zfbdy40dqyZYtlWZa1ZcsWa+PGjZZlWda+ffus/Px8y+v1WuXl5dbChQsty7KshoYG66mnnrIaGhrafYZZ6urqrMrKSsuyLKuxsdGaO3euVVVVRb+DlNfrtZqamizLsqzW1lZr4cKFVnl5ubVs2TJrz549lmVZ1muvvWbt3LnTsizL2rFjh/Xaa69ZlmVZe/bssQoLCy3Lsqyqqipr/vz5VktLi1VTU2M9/fTTlsfjCcAZ4Uq2bt1qvfrqq9aSJUssy7LodScE3WXQo0eP6sYbb9SgQYMUGhqqCRMmqKSkJNBl4RqNGjVKffv2bfddSUmJJk2aJEmaNGmSr6+ffPKJJk6cKJvNpttuu03nz5/XmTNndODAAY0ZM0Z9+/ZV3759NWbMGB04cKCnTwVXMHDgQN/MWJ8+fTR48GDV1dXR7yBls9kUGRkpSfJ4PPJ4PLLZbCotLdX48eMlSWlpae36/dUM6fjx4/XZZ5/JsiyVlJRowoQJCgsLU3x8vG688UYdPXo0IOeEjn3xxRfav3+/pk6dKkmyLIted0LQhbW6ujrFxsb6lmNjY1VXVxfAiuAvZ8+e1cCBAyVJAwYM0NmzZyVd7HlcXJxvu696/rf/LcTExPDfguFcLpeOHz+u4cOH0+8g5vV6lZOTo8cff1x33HGHBg0apKioKIWEhEhq37u/7mtISIiioqLU0NBAv68Tv/jFL5SRkSGbzSZJamhooNedEHRhDb2DzWbz/c+P4NDc3Kxly5Zp1qxZioqKareOfgcXu92ugoICrV69WpWVlTp16lSgS0I32Ldvn/r37++bOUfnBd0NBjExMfriiy98y1988YViYmICWBH8pX///jpz5owGDhyoM2fOKDo6WtLFnrvdbt92X/U8JiZGhw8f9n1fV1enUaNG9XjduLK2tjYtW7ZM99xzj+666y5J9Ls3uOGGG3T77beroqJCjY2N8ng8CgkJUV1dne/v9ld/02NjY+XxeNTY2Kh+/fpd8rf+r8fADOXl5frkk0/06aefqqWlRU1NTfrFL35Brzsh6GbWkpKSVF1dLZfLpba2Nu3du1cpKSmBLgt+kJKSot/97neSpN/97ne68847fd9/9NFHsixLFRUVioqK0sCBAzV27FgdPHhQ586d07lz53Tw4MHedwfRdcCyLK1evVqDBw/Wgw8+6Puefgen+v
p6nT9/XtLFO0MPHTqkwYMH6/bbb1dxcbGki3f1fvV3+x/+4R9UVFQkSSouLtbtt98um82mlJQU7d27V62trXK5XKqurtbw4cMDck64vBkzZmj16tVauXKlsrKyNHr0aM2dO5ded0JQPhR3//79euONN+T1ejV58mR9//vfD3RJuEavvvqqDh8+rIaGBvXv31/Tp0/XnXfeqZ/+9Kdyu92XPMrh9ddf18GDBxUeHq7MzEwlJSVJkj788ENt2bJF0sVHOUyePDmQp4XLKCsr03/8x3/olltu8V3q/Jd/+Rfdeuut9DsI/e///q9Wrlwpr9cry7L0rW99S4888ohqamr06quv6ty5cxo2bJieeeYZhYWFqaWlRStWrNDx48fVt29fZWVladCgQZKkzZs3a/fu3bLb7Zo1a5b+/u//PsBnh46UlpZq69atys3NpdedEJRhDQAAIFgE3WVQAACAYEJYAwAAMBhhDQAAwGCENQAAAIMR1gAAAAxGWAMAADAYYQ0AAMBghDUAAACD/X8cl/h8MorqFgAAAABJRU5ErkJggg==\n",
|
||
"text/plain": [
|
||
"<Figure size 720x360 with 1 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"plt.figure(figsize=(10,5))\n",
|
||
"top_tweet_bigrams=get_top_tweet_bigrams(tweet['text'])[:10]\n",
|
||
"x,y=map(list,zip(*top_tweet_bigrams))\n",
|
||
"sns.barplot(x=y,y=x)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"id": "OgTvY4ZA_ILl",
|
||
"tags": []
|
||
},
|
||
"source": [
|
||
"We will need a lot of cleaning here."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"id": "f2YFV_JY_ILl",
|
||
"tags": []
|
||
},
|
||
"source": [
|
||
"## Data Cleaning\n",
|
||
"As we know, Twitter tweets always have to be cleaned before we go on to modelling. So we will do some basic cleaning, such as spelling correction and removing punctuation, HTML tags, emojis, etc. So let's start."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 20,
|
||
"metadata": {
|
||
"colab": {
|
||
"base_uri": "https://localhost:8080/"
|
||
},
|
||
"id": "giSY7dMs_ILl",
|
||
"outputId": "70649954-fe83-4897-b49a-956426d87fbb",
|
||
"tags": [
|
||
"block:preprocess_data",
|
||
"prev:eda_data"
|
||
]
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"(10876, 5)"
|
||
]
|
||
},
|
||
"execution_count": 20,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"df=pd.concat([tweet,test])\n",
|
||
"df.shape"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"id": "qIOyVrSB_ILm",
|
||
"tags": []
|
||
},
|
||
"source": [
|
||
"### Removing URLs"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 21,
|
||
"metadata": {
|
||
"id": "VaEIZCtG_ILm",
|
||
"tags": [
|
||
"block:",
|
||
"prev:pre",
|
||
"prev:preprocess_data"
|
||
]
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"def remove_URL(text):\n",
|
||
" url = re.compile(r'https?://\\S+|www\\.\\S+')\n",
|
||
" return url.sub(r'',text)\n",
|
||
"\n",
|
||
"df['text']=df['text'].apply(lambda x : remove_URL(x))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"id": "MJnMSv_C_ILm",
|
||
"tags": []
|
||
},
|
||
"source": [
|
||
"### Removing HTML tags"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 22,
|
||
"metadata": {
|
||
"id": "UrB9R_Fe_ILn",
|
||
"tags": [
|
||
"block:"
|
||
]
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"def remove_html(text):\n",
|
||
" html=re.compile(r'<.*?>')\n",
|
||
" return html.sub(r'',text)\n",
|
||
"df['text']=df['text'].apply(lambda x : remove_html(x))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"id": "1c20CxzO_ILn",
|
||
"tags": []
|
||
},
|
||
"source": [
|
||
"### Removing Emojis"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 23,
|
||
"metadata": {
|
||
"id": "G5EHnHOC_ILo",
|
||
"tags": [
|
||
"block:"
|
||
]
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Reference : https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b\n",
|
||
"def remove_emoji(text):\n",
|
||
" emoji_pattern = re.compile(\"[\"\n",
|
||
" u\"\\U0001F600-\\U0001F64F\" # emoticons\n",
|
||
" u\"\\U0001F300-\\U0001F5FF\" # symbols & pictographs\n",
|
||
" u\"\\U0001F680-\\U0001F6FF\" # transport & map symbols\n",
|
||
" u\"\\U0001F1E0-\\U0001F1FF\" # flags (iOS)\n",
|
||
" u\"\\U00002702-\\U000027B0\"\n",
|
||
" u\"\\U000024C2-\\U0001F251\"\n",
|
||
" \"]+\", flags=re.UNICODE)\n",
|
||
" return emoji_pattern.sub(r'', text)\n",
|
||
"\n",
|
||
"df['text']=df['text'].apply(lambda x: remove_emoji(x))\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"id": "YoiLOdZe_ILo",
|
||
"tags": []
|
||
},
|
||
"source": [
|
||
"### Removing punctuation"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 24,
|
||
"metadata": {
|
||
"id": "C4hZZQBV_ILp",
|
||
"tags": [
|
||
"block:",
|
||
"prev:preprocess_data"
|
||
]
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"def remove_punct(text):\n",
|
||
" table=str.maketrans('','',string.punctuation)\n",
|
||
" return text.translate(table)\n",
|
||
"\n",
|
||
"df['text']=df['text'].apply(lambda x : remove_punct(x))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"id": "uNkNHy7W_ILp",
|
||
"tags": []
|
||
},
|
||
"source": [
|
||
"### Spelling Correction\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"id": "DAjtB__8_ILp",
|
||
"tags": []
|
||
},
|
||
"source": [
|
||
"Even if I'm not good at spelling, I can correct it with Python :) I will use `pyspellchecker` to do that."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"tags": []
|
||
},
|
||
"source": [
|
||
"## Corpus Creation"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 27,
|
||
"metadata": {
|
||
"colab": {
|
||
"base_uri": "https://localhost:8080/"
|
||
},
|
||
"id": "QWKIMprt_ILr",
|
||
"outputId": "e2c533cf-3700-4df4-b4c0-a41428e4a5b0",
|
||
"tags": [
|
||
"block:corpus_creation",
|
||
"prev:preprocess_data"
|
||
]
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"100%|██████████| 10876/10876 [00:02<00:00, 4629.46it/s]\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"def create_corpus(df):\n",
|
||
" corpus=[]\n",
|
||
" for tweet in tqdm(df['text']):\n",
|
||
" words=[word.lower() for word in word_tokenize(tweet) if((word.isalpha()==1) & (word not in stop))]\n",
|
||
" corpus.append(words)\n",
|
||
" return corpus\n",
|
||
"corpus=create_corpus(df)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"tags": []
|
||
},
|
||
"source": [
|
||
"## Download GloVe"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"metadata": {
|
||
"colab": {
|
||
"base_uri": "https://localhost:8080/"
|
||
},
|
||
"id": "rUdI7CrYfsOc",
|
||
"outputId": "23e87b84-e26e-4b41-af73-7565eabee6a6",
|
||
"tags": [
|
||
"block:"
|
||
]
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"# download files\n",
|
||
"import wget\n",
|
||
"import zipfile\n",
|
||
"wget.download(\"http://nlp.stanford.edu/data/glove.6B.zip\", './glove.6B.zip')\n",
|
||
" \n",
|
||
"with zipfile.ZipFile(\"glove.6B.zip\", 'r') as zip_ref:\n",
|
||
" zip_ref.extractall(\"./\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"tags": []
|
||
},
|
||
"source": [
|
||
"## Embedding Step"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 30,
|
||
"metadata": {
|
||
"id": "vvY4lcdn_ILr",
|
||
"tags": [
|
||
"block:embedding_step",
|
||
"prev:corpus_creation"
|
||
]
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"embedding_dict={}\n",
|
||
"with open(\"./glove.6B.100d.txt\",'r') as f:\n",
|
||
" for line in f:\n",
|
||
" values=line.split()\n",
|
||
" word=values[0]\n",
|
||
" vectors=np.asarray(values[1:],'float32')\n",
|
||
" embedding_dict[word]=vectors\n",
|
||
"f.close()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 31,
|
||
"metadata": {
|
||
"id": "tIqnmcc6_ILr",
|
||
"tags": [
|
||
"block:"
|
||
]
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"MAX_LEN=50\n",
|
||
"tokenizer_obj=Tokenizer()\n",
|
||
"tokenizer_obj.fit_on_texts(corpus)\n",
|
||
"sequences=tokenizer_obj.texts_to_sequences(corpus)\n",
|
||
"\n",
|
||
"tweet_pad=pad_sequences(sequences,maxlen=MAX_LEN,truncating='post',padding='post')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 32,
|
||
"metadata": {
|
||
"colab": {
|
||
"base_uri": "https://localhost:8080/"
|
||
},
|
||
"id": "B4n1eHMp_ILs",
|
||
"outputId": "2c027b1c-6dfe-41af-89ce-663e38c227e9",
|
||
"tags": [
|
||
"block:"
|
||
]
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Number of unique words: 20342\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"word_index=tokenizer_obj.word_index\n",
|
||
"print('Number of unique words:',len(word_index))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 33,
|
||
"metadata": {
|
||
"colab": {
|
||
"base_uri": "https://localhost:8080/"
|
||
},
|
||
"id": "MBONMhCm_ILs",
|
||
"outputId": "3ce6f1b4-802f-43d7-cd0b-14a79fe9346c",
|
||
"tags": [
|
||
"block:"
|
||
]
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"100%|██████████| 20342/20342 [00:00<00:00, 310979.08it/s]\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"num_words=len(word_index)+1\n",
|
||
"embedding_matrix=np.zeros((num_words,100))\n",
|
||
"\n",
|
||
"for word,i in tqdm(word_index.items()):\n",
|
||
" if i > num_words:\n",
|
||
" continue\n",
|
||
" \n",
|
||
" emb_vec=embedding_dict.get(word)\n",
|
||
" if emb_vec is not None:\n",
|
||
" embedding_matrix[i]=emb_vec\n",
|
||
" "
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"id": "Sh1bYaFO_ILs",
|
||
"tags": []
|
||
},
|
||
"source": [
|
||
"## Baseline Model"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 34,
|
||
"metadata": {
|
||
"id": "0ox_ger4_ILs",
|
||
"tags": [
|
||
"block:final_model",
|
||
"prev:embedding_step"
|
||
]
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"model=Sequential()\n",
|
||
"\n",
|
||
"embedding=Embedding(num_words,100,embeddings_initializer=Constant(embedding_matrix),\n",
|
||
" input_length=MAX_LEN,trainable=False)\n",
|
||
"\n",
|
||
"model.add(embedding)\n",
|
||
"model.add(SpatialDropout1D(0.2))\n",
|
||
"model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))\n",
|
||
"model.add(Dense(1, activation='sigmoid'))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 35,
|
||
"metadata": {
|
||
"colab": {
|
||
"base_uri": "https://localhost:8080/"
|
||
},
|
||
"id": "gNcE-6N0_ILt",
|
||
"outputId": "77f6865e-3642-410b-a263-a75060e0431a",
|
||
"tags": []
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Model: \"sequential\"\n",
|
||
"_________________________________________________________________\n",
|
||
"Layer (type) Output Shape Param # \n",
|
||
"=================================================================\n",
|
||
"embedding (Embedding) (None, 50, 100) 2034300 \n",
|
||
"_________________________________________________________________\n",
|
||
"spatial_dropout1d (SpatialDr (None, 50, 100) 0 \n",
|
||
"_________________________________________________________________\n",
|
||
"lstm (LSTM) (None, 64) 42240 \n",
|
||
"_________________________________________________________________\n",
|
||
"dense (Dense) (None, 1) 65 \n",
|
||
"=================================================================\n",
|
||
"Total params: 2,076,605\n",
|
||
"Trainable params: 42,305\n",
|
||
"Non-trainable params: 2,034,300\n",
|
||
"_________________________________________________________________\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"model.summary()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 36,
|
||
"metadata": {
|
||
"id": "7-iK95sN_ILt",
|
||
"tags": []
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"train=tweet_pad[:tweet.shape[0]]\n",
|
||
"final_test=tweet_pad[tweet.shape[0]:]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 37,
|
||
"metadata": {
|
||
"colab": {
|
||
"base_uri": "https://localhost:8080/"
|
||
},
|
||
"id": "-wNkHpK__ILt",
|
||
"outputId": "0d287614-192a-425d-a6a3-a1eb6b757e0a",
|
||
"tags": []
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Shape of train (6471, 50)\n",
|
||
"Shape of Validation (1142, 50)\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"X_train,X_test,y_train,y_test=train_test_split(train,tweet['target'].values,test_size=0.15)\n",
|
||
"print('Shape of train',X_train.shape)\n",
|
||
"print(\"Shape of Validation \",X_test.shape)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"tags": []
|
||
},
|
||
"source": [
|
||
"## Training Model"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 38,
|
||
"metadata": {
|
||
"colab": {
|
||
"base_uri": "https://localhost:8080/"
|
||
},
|
||
"id": "gIFqAxRP_ILt",
|
||
"outputId": "1228be0f-5fe5-4df1-e48f-7707f81bf6ef",
|
||
"tags": [
|
||
"block:train_model",
|
||
"prev:final_model"
|
||
]
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Epoch 1/5\n",
|
||
"1618/1618 - 118s - loss: 0.6912 - accuracy: 0.5670 - val_loss: 0.6862 - val_accuracy: 0.5841\n",
|
||
"Epoch 2/5\n",
|
||
"1618/1618 - 114s - loss: 0.6043 - accuracy: 0.6840 - val_loss: 0.5433 - val_accuracy: 0.7671\n",
|
||
"Epoch 3/5\n",
|
||
"1618/1618 - 112s - loss: 0.5460 - accuracy: 0.7456 - val_loss: 0.5189 - val_accuracy: 0.7785\n",
|
||
"Epoch 4/5\n",
|
||
"1618/1618 - 112s - loss: 0.5329 - accuracy: 0.7535 - val_loss: 0.5038 - val_accuracy: 0.7846\n",
|
||
"Epoch 5/5\n",
|
||
"1618/1618 - 111s - loss: 0.5153 - accuracy: 0.7645 - val_loss: 0.4931 - val_accuracy: 0.7881\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=1e-5), metrics=['accuracy'])\n",
|
||
"history=model.fit(X_train,y_train,batch_size=4,epochs=5,validation_data=(X_test,y_test),verbose=2)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"id": "5VdeRpW6_ILu",
|
||
"tags": []
|
||
},
|
||
"source": [
|
||
"## Making our submission"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 39,
|
||
"metadata": {
|
||
"id": "C2nwzCGZ_ILu",
|
||
"tags": []
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"sample_sub=pd.read_csv('./data/sample_submission.csv')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 40,
|
||
"metadata": {
|
||
"id": "skzE80GX_ILu",
|
||
"tags": []
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"y_pre=model.predict(final_test)\n",
|
||
"y_pre=np.round(y_pre).astype(int).reshape(3263)\n",
|
||
"sub=pd.DataFrame({'id':sample_sub['id'].values.tolist(),'target':y_pre})\n",
|
||
"sub.to_csv('submission.csv',index=False)\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 41,
|
||
"metadata": {
|
||
"id": "vhkRPayj_ILu",
|
||
"outputId": "cbbca861-390f-4040-9830-79bb6a34e967",
|
||
"tags": []
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>id</th>\n",
|
||
" <th>target</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>2</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>3</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>9</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>11</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" id target\n",
|
||
"0 0 1\n",
|
||
"1 2 1\n",
|
||
"2 3 1\n",
|
||
"3 9 1\n",
|
||
"4 11 1"
|
||
]
|
||
},
|
||
"execution_count": 41,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"sub.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {
|
||
"tags": []
|
||
},
|
||
"outputs": [],
|
||
"source": []
|
||
}
|
||
],
|
||
"metadata": {
|
||
"colab": {
|
||
"name": "nlp-getting-started.ipynb",
|
||
"provenance": []
|
||
},
|
||
"kernelspec": {
|
||
"display_name": "Python 3",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"kubeflow_notebook": {
|
||
"autosnapshot": true,
|
||
"docker_image": "",
|
||
"experiment": {
|
||
"id": "new",
|
||
"name": "trial-with-kale"
|
||
},
|
||
"experiment_name": "trial-with-kale",
|
||
"katib_metadata": {
|
||
"algorithm": {
|
||
"algorithmName": "grid"
|
||
},
|
||
"maxFailedTrialCount": 3,
|
||
"maxTrialCount": 12,
|
||
"objective": {
|
||
"objectiveMetricName": "",
|
||
"type": "minimize"
|
||
},
|
||
"parallelTrialCount": 3,
|
||
"parameters": []
|
||
},
|
||
"katib_run": false,
|
||
"pipeline_description": "An NLP pipeline for disaster detection using tweets",
|
||
"pipeline_name": "nlp-getting-started",
|
||
"snapshot_volumes": true,
|
||
"steps_defaults": [
|
||
"label:access-ml-pipeline:true",
|
||
"label:access-rok:true"
|
||
],
|
||
"volume_access_mode": "rwm",
|
||
"volumes": [
|
||
{
|
||
"annotations": [],
|
||
"mount_point": "/home/jovyan",
|
||
"name": "nlp-getting-started-kale-workspace-6tp8v",
|
||
"size": 20,
|
||
"size_type": "Gi",
|
||
"snapshot": false,
|
||
"type": "clone"
|
||
}
|
||
]
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.6.9"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 4
|
||
}
|