{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import logging\n",
    "import glob\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "pd.set_option('display.max_colwidth', 500)\n",
    "logger = logging.getLogger()\n",
    "logger.setLevel(logging.WARNING)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Download Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Ensure that the github-issues-data volume is mounted in /mnt\n",
    "!ls -la /mnt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Set the path for the data directory\n",
    "%env DATA_DIR=/mnt/github-issues-data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# Download the github-issues.zip training data to /mnt/github-issues-data\n",
    "!wget --directory-prefix=${DATA_DIR} https://storage.googleapis.com/kubeflow-examples/github-issue-summarization-data/github-issues.zip\n",
    "\n",
    "# Unzip the file into the /mnt/github-issues-data directory\n",
    "!unzip ${DATA_DIR}/github-issues.zip -d ${DATA_DIR}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Create a symlink from <current_directory>/github-issues-data to /mnt/github-issues-data\n",
    "!ln -sf ${DATA_DIR} github-issues-data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Make sure that the github-issues-data symlink was created\n",
    "!ls -lh github-issues-data/github_issues.csv"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Process Data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Split the data into training and test sets, and preview the data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "data_file = 'github-issues-data/github_issues.csv'\n",
    "\n",
    "# Read in the data, sampling 2,000 rows for tutorial speed.\n",
    "# Set use_sample_data to False to train on the entire dataset.\n",
    "use_sample_data = True\n",
    "\n",
    "if use_sample_data:\n",
    "    training_data_size = 2000\n",
    "    traindf, testdf = train_test_split(pd.read_csv(data_file).sample(n=training_data_size),\n",
    "                                       test_size=.10)\n",
    "else:\n",
    "    traindf, testdf = train_test_split(pd.read_csv(data_file), test_size=.10)\n",
    "\n",
    "# Print stats about the shape of the data\n",
    "print(f'Train: {traindf.shape[0]:,} rows {traindf.shape[1]:,} columns')\n",
    "print(f'Test: {testdf.shape[0]:,} rows {testdf.shape[1]:,} columns')\n",
    "\n",
    "# Preview the data\n",
    "traindf.head(3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Convert to lists in preparation for modeling**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "train_body_raw = traindf.body.tolist()\n",
    "train_title_raw = traindf.issue_title.tolist()\n",
    "\n",
    "# Preview the first element\n",
    "train_body_raw[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Pre-Process Data For Deep Learning\n",
    "\n",
    "See [this repo](https://github.com/hamelsmu/ktext) for documentation on the ktext package."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "%reload_ext autoreload\n",
    "%autoreload 2\n",
    "from ktext.preprocess import processor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "%%time\n",
    "# Clean, tokenize, and pad / truncate so that each document length = 70.\n",
    "# Also, retain only the top 8,000 words in the vocabulary; the remaining\n",
    "# words are mapped to index 1, a shared index for rare words.\n",
    "body_pp = processor(keep_n=8000, padding_maxlen=70)\n",
    "train_body_vecs = body_pp.fit_transform(train_body_raw)"
   ]
  },
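  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a quick sanity check (an addition to the tutorial, not part of the original flow), `fit_transform` should return a 2-D array with one row per document and 70 columns, matching `padding_maxlen=70`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sanity check: one row per issue body, each padded/truncated to length 70\n",
    "print(type(train_body_vecs))\n",
    "print(train_body_vecs.shape)"
   ]
  },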
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Look at one example of processed issue bodies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "print('\\noriginal string:\\n', train_body_raw[0], '\\n')\n",
    "print('after pre-processing:\\n', train_body_vecs[0], '\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Instantiate a text processor for the titles, with some different parameters:\n",
    "# append_indicators=True appends the tokens '_start_' and '_end_' to each\n",
    "# document.\n",
    "# padding='post' means that zero padding is appended to the end of each\n",
    "# document (as opposed to the default, which is 'pre').\n",
    "title_pp = processor(append_indicators=True, keep_n=4500,\n",
    "                     padding_maxlen=12, padding='post')\n",
    "\n",
    "# Process the title data\n",
    "train_title_vecs = title_pp.fit_transform(train_title_raw)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Look at one example of processed issue titles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "print('\\noriginal string:\\n', train_title_raw[0])\n",
    "print('after pre-processing:\\n', train_title_vecs[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Serialize all of this to disk for later use."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import dill as dpickle\n",
    "import numpy as np\n",
    "\n",
    "# Save the preprocessors\n",
    "with open('body_pp.dpkl', 'wb') as f:\n",
    "    dpickle.dump(body_pp, f)\n",
    "\n",
    "with open('title_pp.dpkl', 'wb') as f:\n",
    "    dpickle.dump(title_pp, f)\n",
    "\n",
    "# Save the processed data\n",
    "np.save('train_title_vecs.npy', train_title_vecs)\n",
    "np.save('train_body_vecs.npy', train_body_vecs)"
   ]
  },
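  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A minimal round-trip check (added for illustration, not part of the original tutorial): reload the serialized artifacts to verify they were written correctly."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Round-trip check: reload the pickled preprocessor and a saved array\n",
    "with open('body_pp.dpkl', 'rb') as f:\n",
    "    body_pp_reloaded = dpickle.load(f)\n",
    "\n",
    "train_body_vecs_reloaded = np.load('train_body_vecs.npy')\n",
    "assert train_body_vecs_reloaded.shape == train_body_vecs.shape"
   ]
  },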
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Define Model Architecture"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load the data from disk into variables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from seq2seq_utils import load_decoder_inputs, load_encoder_inputs, load_text_processor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "encoder_input_data, doc_length = load_encoder_inputs('train_body_vecs.npy')\n",
    "decoder_input_data, decoder_target_data = load_decoder_inputs('train_title_vecs.npy')"
   ]
  },
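  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick look at the loaded arrays (an added check, not part of the original flow). Assuming `load_decoder_inputs` follows the usual teacher-forcing convention, `decoder_input_data` is each title sequence without its final token, and `decoder_target_data` is the same sequence shifted one step to the left."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Inspect the shapes of the arrays that feed the model\n",
    "print('encoder input: ', encoder_input_data.shape, ' doc_length:', doc_length)\n",
    "print('decoder input: ', decoder_input_data.shape)\n",
    "print('decoder target:', decoder_target_data.shape)"
   ]
  },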
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "num_encoder_tokens, body_pp = load_text_processor('body_pp.dpkl')\n",
    "num_decoder_tokens, title_pp = load_text_processor('title_pp.dpkl')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Define Model Architecture"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "from keras.models import Model\n",
    "from keras.layers import Input, LSTM, GRU, Dense, Embedding, Bidirectional, BatchNormalization\n",
    "from keras import optimizers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Arbitrarily set the latent dimension for the embeddings and hidden units\n",
    "latent_dim = 300\n",
    "\n",
    "##### Define Model Architecture #####\n",
    "\n",
    "########################\n",
    "#### Encoder Model ####\n",
    "encoder_inputs = Input(shape=(doc_length,), name='Encoder-Input')\n",
    "\n",
    "# Word embedding for the encoder (ex: issue bodies)\n",
    "x = Embedding(num_encoder_tokens, latent_dim, name='Body-Word-Embedding', mask_zero=False)(encoder_inputs)\n",
    "x = BatchNormalization(name='Encoder-Batchnorm-1')(x)\n",
    "\n",
    "# Intermediate GRU layer (optional)\n",
    "#x = GRU(latent_dim, name='Encoder-Intermediate-GRU', return_sequences=True)(x)\n",
    "#x = BatchNormalization(name='Encoder-Batchnorm-2')(x)\n",
    "\n",
    "# We do not need the `encoder_output`, just the hidden state.\n",
    "_, state_h = GRU(latent_dim, return_state=True, name='Encoder-Last-GRU')(x)\n",
    "\n",
    "# Encapsulate the encoder as a separate entity so we can just\n",
    "# encode without decoding if we want to.\n",
    "encoder_model = Model(inputs=encoder_inputs, outputs=state_h, name='Encoder-Model')\n",
    "\n",
    "seq2seq_encoder_out = encoder_model(encoder_inputs)\n",
    "\n",
    "########################\n",
    "#### Decoder Model ####\n",
    "decoder_inputs = Input(shape=(None,), name='Decoder-Input')  # for teacher forcing\n",
    "\n",
    "# Word embedding for the decoder (ex: issue titles)\n",
    "dec_emb = Embedding(num_decoder_tokens, latent_dim, name='Decoder-Word-Embedding', mask_zero=False)(decoder_inputs)\n",
    "dec_bn = BatchNormalization(name='Decoder-Batchnorm-1')(dec_emb)\n",
    "\n",
    "# Set up the decoder, using the encoder's final hidden state as the initial state.\n",
    "decoder_gru = GRU(latent_dim, return_state=True, return_sequences=True, name='Decoder-GRU')\n",
    "decoder_gru_output, _ = decoder_gru(dec_bn, initial_state=seq2seq_encoder_out)\n",
    "x = BatchNormalization(name='Decoder-Batchnorm-2')(decoder_gru_output)\n",
    "\n",
    "# Dense layer for prediction\n",
    "decoder_dense = Dense(num_decoder_tokens, activation='softmax', name='Final-Output-Dense')\n",
    "decoder_outputs = decoder_dense(x)\n",
    "\n",
    "########################\n",
    "#### Seq2Seq Model ####\n",
    "seq2seq_Model = Model([encoder_inputs, decoder_inputs], decoder_outputs)\n",
    "\n",
    "seq2seq_Model.compile(optimizer=optimizers.Nadam(lr=0.001), loss='sparse_categorical_crossentropy')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Examine Model Architecture Summary**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from seq2seq_utils import viz_model_architecture\n",
    "seq2seq_Model.summary()\n",
    "viz_model_architecture(seq2seq_Model)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Train Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from keras.callbacks import CSVLogger, ModelCheckpoint\n",
    "\n",
    "script_name_base = 'tutorial_seq2seq'\n",
    "csv_logger = CSVLogger('{:}.log'.format(script_name_base))\n",
    "model_checkpoint = ModelCheckpoint('{:}.epoch{{epoch:02d}}-val{{val_loss:.5f}}.hdf5'.format(script_name_base),\n",
    "                                   save_best_only=True)\n",
    "\n",
    "batch_size = 1200\n",
    "epochs = 7\n",
    "history = seq2seq_Model.fit([encoder_input_data, decoder_input_data], np.expand_dims(decoder_target_data, -1),\n",
    "                            batch_size=batch_size,\n",
    "                            epochs=epochs,\n",
    "                            validation_split=0.12, callbacks=[csv_logger, model_checkpoint])"
   ]
  },
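  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "An optional sketch (not in the original tutorial) that plots the training and validation loss recorded in the Keras `History` object, to help judge convergence:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plot training vs. validation loss from the History returned by fit()\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "plt.plot(history.history['loss'], label='training loss')\n",
    "plt.plot(history.history['val_loss'], label='validation loss')\n",
    "plt.xlabel('epoch')\n",
    "plt.ylabel('sparse categorical cross-entropy')\n",
    "plt.legend()\n",
    "plt.show()"
   ]
  },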
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Save the model\n",
    "seq2seq_Model.save('seq2seq_model_tutorial.h5')"
   ]
  },
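  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick check (added for illustration) that the saved file can be loaded back with Keras:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Verify that the saved model can be reloaded from disk\n",
    "from keras.models import load_model\n",
    "\n",
    "reloaded_model = load_model('seq2seq_model_tutorial.h5')\n",
    "reloaded_model.summary()"
   ]
  },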
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# See Example Results On Holdout Set\n",
    "\n",
    "It is useful to see examples of real predictions on a holdout set to get a sense of the model's performance. We will also evaluate the model numerically in the following section."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from seq2seq_utils import Seq2Seq_Inference\n",
    "seq2seq_inf = Seq2Seq_Inference(encoder_preprocessor=body_pp,\n",
    "                                decoder_preprocessor=title_pp,\n",
    "                                seq2seq_model=seq2seq_Model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# This method displays predictions on random rows of the holdout set\n",
    "seq2seq_inf.demo_model_predictions(n=50, issue_df=testdf)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "# Evaluate Model: BLEU Score\n",
    "\n",
    "For sequence-to-sequence tasks such as this one (and especially machine translation), it is common to measure the quality of results using the [BLEU Score](https://en.wikipedia.org/wiki/BLEU). The convenience function illustrated below uses [NLTK's corpus_bleu](https://www.nltk.org/api/nltk.translate.html#nltk.translate.bleu_score.corpus_bleu) and reports the average of BLEU-1, BLEU-2, BLEU-3, and BLEU-4."
   ]
  },
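  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For intuition, here is a minimal sketch (with hypothetical toy data, not drawn from this dataset) of how an average of BLEU-1 through BLEU-4 can be computed directly with NLTK's `corpus_bleu`; the `weights` argument controls which n-gram orders are used. This mirrors what `evaluate_model` reports, assuming it averages the four cumulative scores the same way."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: average of cumulative BLEU-1..BLEU-4 via NLTK's corpus_bleu.\n",
    "# corpus_bleu expects a list of reference lists (each reference is a token\n",
    "# list) and a list of hypothesis token lists. Toy data for illustration.\n",
    "from nltk.translate.bleu_score import corpus_bleu\n",
    "\n",
    "references = [[['fix', 'login', 'bug', 'on', 'mobile']]]\n",
    "hypotheses = [['fix', 'login', 'bug', 'on', 'android']]\n",
    "\n",
    "weights_per_order = [(1, 0, 0, 0),\n",
    "                     (0.5, 0.5, 0, 0),\n",
    "                     (1/3, 1/3, 1/3, 0),\n",
    "                     (0.25, 0.25, 0.25, 0.25)]\n",
    "scores = [corpus_bleu(references, hypotheses, weights=w) for w in weights_per_order]\n",
    "print('average BLEU-1..4:', sum(scores) / len(scores))"
   ]
  },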
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Convenience function that generates predictions on the holdout set and calculates the BLEU score\n",
    "bleu_score = seq2seq_inf.evaluate_model(holdout_bodies=testdf.body.tolist(),\n",
    "                                        holdout_titles=testdf.issue_title.tolist(),\n",
    "                                        max_len_title=12)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(f'BLEU Score (avg of BLEU 1-4) on Holdout Set: {bleu_score * 100}')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  },
  "toc": {
   "nav_menu": {
    "height": "263px",
    "width": "352px"
   },
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": true,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}