Merge remote-tracking branch 'upstream/master' into contributing

Michelle Casbon committed 2018-03-02 11:50:58 -05:00
commit 0ad22a29e1
24 changed files with 77298 additions and 2 deletions

.gitignore (new file, 41 lines)

@@ -0,0 +1,41 @@
# pkg and bin directories currently contain build artifacts
# only so we exclude them.
bin/
vendor/
.vscode/
# Compiled python files.
*.pyc
# Emacs temporary files
*~
# Other temporary files
.DS_Store
# temporary files from emacs flymd-mode
flymd.*
# vim .swp files
*.swp
# Files created by Gogland IDE
.idea/
# Exclude wheel files for now.
# The only wheel file is the TF wheel, which is quite large;
# we don't want to check it into source control.
*.whl
# Bazel files
**/bazel-*
# Examples egg
examples/tf_sample/tf_sample.egg-info/
examples/.ipynb_checkpoints/
**/.ipynb_checkpoints
# pyenv
.python-version

OWNERS (new file, 21 lines)

@@ -0,0 +1,21 @@
# TODO(jlewi): We should probably have OWNERS files in subdirectories that
# list approvers for individual components (e.g. Seldon folks for the Seldon component)
approvers:
- ankushagarwal
- DjangoPeng
- gaocegege
- jlewi
- llunn
- ScorpioCPH
reviewers:
- ankushagarwal
- DjangoPeng
- gaocegege
- Jimexist
- jlewi
- llunn
- nkashy1
- ScorpioCPH
- texasmichelle
- wbuchwalter
- zjj2wry

README.md

@@ -1,2 +1,3 @@
# examples
A repository to host extended examples and tutorials
## A repository to host extended examples and tutorials for kubeflow.
1. [GitHub issue summarization using sequence-to-sequence learning](./issue_summarization_github_issues) by [Hamel Husain](https://github.com/hamelsmu)

issue_summarization_github_issues/README.md

@@ -0,0 +1,30 @@
# [WIP] End-to-End Kubeflow tutorial using a Sequence-to-Sequence model

This example demonstrates how you can use `kubeflow` end-to-end to train and
serve a Sequence-to-Sequence model on an existing Kubernetes cluster. This
tutorial is based upon @hamelsmu's article ["How To Create Data Products That
Are Magical Using Sequence-to-Sequence
Models"](https://medium.com/@hamelhusain/how-to-create-data-products-that-are-magical-using-sequence-to-sequence-models-703f86a231f8).

## Goals

There are two primary goals for this tutorial:

* An end-to-end Kubeflow example
* An end-to-end Sequence-to-Sequence model

By the end of this tutorial, you should know how to:

* Set up a Kubeflow cluster on an existing Kubernetes deployment
* Spin up a Jupyter Notebook on the cluster
* Spin up shared persistent storage across the cluster to store large
  datasets
* Train a Sequence-to-Sequence model on the cluster using TensorFlow and
  GPUs
* Serve the model using TensorFlow Serving

## Steps

1. [Set up a Kubeflow cluster](setup_a_kubeflow_cluster.md)
1. [Training the model](training_the_model.md)
1. [Teardown](teardown.md)

issue_summarization_github_issues/notebooks/Tutorial.ipynb

@@ -0,0 +1,510 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"toc": true
},
"source": [
"<h1>Table of Contents<span class=\"tocSkip\"></span></h1>\n",
"<div class=\"toc\"><ul class=\"toc-item\"><li><span><a href=\"#Process-Data\" data-toc-modified-id=\"Process-Data-1\"><span class=\"toc-item-num\">1&nbsp;&nbsp;</span>Process Data</a></span></li><li><span><a href=\"#Pre-Process-Data-For-Deep-Learning\" data-toc-modified-id=\"Pre-Process-Data-For-Deep-Learning-2\"><span class=\"toc-item-num\">2&nbsp;&nbsp;</span>Pre-Process Data For Deep Learning</a></span><ul class=\"toc-item\"><li><ul class=\"toc-item\"><li><ul class=\"toc-item\"><li><span><a href=\"#Look-at-one-example-of-processed-issue-bodies\" data-toc-modified-id=\"Look-at-one-example-of-processed-issue-bodies-2.0.0.1\"><span class=\"toc-item-num\">2.0.0.1&nbsp;&nbsp;</span>Look at one example of processed issue bodies</a></span></li><li><span><a href=\"#Look-at-one-example-of-processed-issue-titles\" data-toc-modified-id=\"Look-at-one-example-of-processed-issue-titles-2.0.0.2\"><span class=\"toc-item-num\">2.0.0.2&nbsp;&nbsp;</span>Look at one example of processed issue titles</a></span></li></ul></li></ul></li></ul></li><li><span><a href=\"#Define-Model-Architecture\" data-toc-modified-id=\"Define-Model-Architecture-3\"><span class=\"toc-item-num\">3&nbsp;&nbsp;</span>Define Model Architecture</a></span><ul class=\"toc-item\"><li><ul class=\"toc-item\"><li><span><a href=\"#Load-the-data-from-disk-into-variables\" data-toc-modified-id=\"Load-the-data-from-disk-into-variables-3.0.1\"><span class=\"toc-item-num\">3.0.1&nbsp;&nbsp;</span>Load the data from disk into variables</a></span></li><li><span><a href=\"#Define-Model-Architecture\" data-toc-modified-id=\"Define-Model-Architecture-3.0.2\"><span class=\"toc-item-num\">3.0.2&nbsp;&nbsp;</span>Define Model Architecture</a></span></li></ul></li></ul></li><li><span><a href=\"#Train-Model\" data-toc-modified-id=\"Train-Model-4\"><span class=\"toc-item-num\">4&nbsp;&nbsp;</span>Train Model</a></span></li><li><span><a href=\"#See-Results-On-Holdout-Set\" data-toc-modified-id=\"See-Results-On-Holdout-Set-5\"><span class=\"toc-item-num\">5&nbsp;&nbsp;</span>See Results On Holdout Set</a></span></li><li><span><a href=\"#Feature-Extraction-Demo\" data-toc-modified-id=\"Feature-Extraction-Demo-6\"><span class=\"toc-item-num\">6&nbsp;&nbsp;</span>Feature Extraction Demo</a></span><ul class=\"toc-item\"><li><ul class=\"toc-item\"><li><span><a href=\"#Example-1:-Issues-Installing-Python-Packages\" data-toc-modified-id=\"Example-1:-Issues-Installing-Python-Packages-6.0.1\"><span class=\"toc-item-num\">6.0.1&nbsp;&nbsp;</span>Example 1: Issues Installing Python Packages</a></span></li><li><span><a href=\"#Example-2:--Issues-asking-for-feature-improvements\" data-toc-modified-id=\"Example-2:--Issues-asking-for-feature-improvements-6.0.2\"><span class=\"toc-item-num\">6.0.2&nbsp;&nbsp;</span>Example 2: Issues asking for feature improvements</a></span></li></ul></li></ul></li></ul></div>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import logging\n",
"import glob\n",
"from sklearn.model_selection import train_test_split\n",
"pd.set_option('display.max_colwidth', 500)\n",
"logger = logging.getLogger()\n",
"logger.setLevel(logging.WARNING)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Download Data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Ensure that the github-issues-data volume is mounted in /mnt\n",
"!ls -la /mnt"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"# Download the github-issues.zip training data to /mnt/github-issues-data\n",
"!wget --directory-prefix=/mnt/github-issues-data https://storage.googleapis.com/kubeflow-examples/github-issue-summarization-data/github-issues.zip\n",
"\n",
"# Unzip the file into /mnt/github-issues-data directory\n",
"!unzip /mnt/github-issues-data/github-issues.zip -d /mnt/github-issues-data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Create a symlink from <current_directory>/github-issues-data to /mnt/github-issues-data\n",
"!ln -sf /mnt/github-issues-data github-issues-data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Make sure that the github-issues-data symlink is created\n",
"!ls -lh github-issues-data/github_issues.csv"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Process Data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Split data into train and test set and preview data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data_file='github-issues-data/github_issues.csv'\n",
"\n",
"# read in data sample 2000 rows (for speed of tutorial)\n",
"# Set this to False to train on the entire dataset\n",
"use_sample_data=True\n",
"\n",
"if use_sample_data:\n",
" training_data_size=2000\n",
" traindf, testdf = train_test_split(pd.read_csv(data_file).sample(n=training_data_size), \n",
" test_size=.10)\n",
"else:\n",
" traindf, testdf = train_test_split(pd.read_csv(data_file),test_size=.10)\n",
"\n",
"\n",
"#print out stats about shape of data\n",
"print(f'Train: {traindf.shape[0]:,} rows {traindf.shape[1]:,} columns')\n",
"print(f'Test: {testdf.shape[0]:,} rows {testdf.shape[1]:,} columns')\n",
"\n",
"# preview data\n",
"traindf.head(3)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Convert to lists in preparation for modeling**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"train_body_raw = traindf.body.tolist()\n",
"train_title_raw = traindf.issue_title.tolist()\n",
"#preview output of first element\n",
"train_body_raw[0]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Pre-Process Data For Deep Learning\n",
"\n",
"See [this repo](https://github.com/hamelsmu/ktext) for documentation on the ktext package"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%reload_ext autoreload\n",
"%autoreload 2\n",
"from ktext.preprocess import processor"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"# Clean, tokenize, and apply padding / truncating such that each document length = 70\n",
"# also, retain only the top 8,000 words in the vocabulary and set the remaining words\n",
"# to 1 which will become common index for rare words \n",
"body_pp = processor(keep_n=8000, padding_maxlen=70)\n",
"train_body_vecs = body_pp.fit_transform(train_body_raw)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Look at one example of processed issue bodies"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print('\\noriginal string:\\n', train_body_raw[0], '\\n')\n",
"print('after pre-processing:\\n', train_body_vecs[0], '\\n')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Instantiate a text processor for the titles, with some different parameters\n",
"# append_indicators = True appends the tokens '_start_' and '_end_' to each\n",
"# document\n",
"# padding = 'post' means that zero padding is appended to the end of the \n",
"# of the document (as opposed to the default which is 'pre')\n",
"title_pp = processor(append_indicators=True, keep_n=4500, \n",
" padding_maxlen=12, padding ='post')\n",
"\n",
"# process the title data\n",
"train_title_vecs = title_pp.fit_transform(train_title_raw)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Look at one example of processed issue titles"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print('\\noriginal string:\\n', train_title_raw[0])\n",
"print('after pre-processing:\\n', train_title_vecs[0])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Serialize all of this to disk for later use"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import dill as dpickle\n",
"import numpy as np\n",
"\n",
"# Save the preprocessor\n",
"with open('body_pp.dpkl', 'wb') as f:\n",
" dpickle.dump(body_pp, f)\n",
"\n",
"with open('title_pp.dpkl', 'wb') as f:\n",
" dpickle.dump(title_pp, f)\n",
"\n",
"# Save the processed data\n",
"np.save('train_title_vecs.npy', train_title_vecs)\n",
"np.save('train_body_vecs.npy', train_body_vecs)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Define Model Architecture"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Load the data from disk into variables"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from seq2seq_utils import load_decoder_inputs, load_encoder_inputs, load_text_processor"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"encoder_input_data, doc_length = load_encoder_inputs('train_body_vecs.npy')\n",
"decoder_input_data, decoder_target_data = load_decoder_inputs('train_title_vecs.npy')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"num_encoder_tokens, body_pp = load_text_processor('body_pp.dpkl')\n",
"num_decoder_tokens, title_pp = load_text_processor('title_pp.dpkl')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Define Model Architecture"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"%matplotlib inline\n",
"from keras.models import Model\n",
"from keras.layers import Input, LSTM, GRU, Dense, Embedding, Bidirectional, BatchNormalization\n",
"from keras import optimizers"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#arbitrarly set latent dimension for embedding and hidden units\n",
"latent_dim = 300\n",
"\n",
"##### Define Model Architecture ######\n",
"\n",
"########################\n",
"#### Encoder Model ####\n",
"encoder_inputs = Input(shape=(doc_length,), name='Encoder-Input')\n",
"\n",
"# Word embeding for encoder (ex: Issue Body)\n",
"x = Embedding(num_encoder_tokens, latent_dim, name='Body-Word-Embedding', mask_zero=False)(encoder_inputs)\n",
"x = BatchNormalization(name='Encoder-Batchnorm-1')(x)\n",
"\n",
"# Intermediate GRU layer (optional)\n",
"#x = GRU(latent_dim, name='Encoder-Intermediate-GRU', return_sequences=True)(x)\n",
"#x = BatchNormalization(name='Encoder-Batchnorm-2')(x)\n",
"\n",
"# We do not need the `encoder_output` just the hidden state.\n",
"_, state_h = GRU(latent_dim, return_state=True, name='Encoder-Last-GRU')(x)\n",
"\n",
"# Encapsulate the encoder as a separate entity so we can just \n",
"# encode without decoding if we want to.\n",
"encoder_model = Model(inputs=encoder_inputs, outputs=state_h, name='Encoder-Model')\n",
"\n",
"seq2seq_encoder_out = encoder_model(encoder_inputs)\n",
"\n",
"########################\n",
"#### Decoder Model ####\n",
"decoder_inputs = Input(shape=(None,), name='Decoder-Input') # for teacher forcing\n",
"\n",
"# Word Embedding For Decoder (ex: Issue Titles)\n",
"dec_emb = Embedding(num_decoder_tokens, latent_dim, name='Decoder-Word-Embedding', mask_zero=False)(decoder_inputs)\n",
"dec_bn = BatchNormalization(name='Decoder-Batchnorm-1')(dec_emb)\n",
"\n",
"# Set up the decoder, using `decoder_state_input` as initial state.\n",
"decoder_gru = GRU(latent_dim, return_state=True, return_sequences=True, name='Decoder-GRU')\n",
"decoder_gru_output, _ = decoder_gru(dec_bn, initial_state=seq2seq_encoder_out)\n",
"x = BatchNormalization(name='Decoder-Batchnorm-2')(decoder_gru_output)\n",
"\n",
"# Dense layer for prediction\n",
"decoder_dense = Dense(num_decoder_tokens, activation='softmax', name='Final-Output-Dense')\n",
"decoder_outputs = decoder_dense(x)\n",
"\n",
"########################\n",
"#### Seq2Seq Model ####\n",
"\n",
"#seq2seq_decoder_out = decoder_model([decoder_inputs, seq2seq_encoder_out])\n",
"seq2seq_Model = Model([encoder_inputs, decoder_inputs], decoder_outputs)\n",
"\n",
"\n",
"seq2seq_Model.compile(optimizer=optimizers.Nadam(lr=0.001), loss='sparse_categorical_crossentropy')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"** Examine Model Architecture Summary **"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from seq2seq_utils import viz_model_architecture\n",
"seq2seq_Model.summary()\n",
"viz_model_architecture(seq2seq_Model)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Train Model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from keras.callbacks import CSVLogger, ModelCheckpoint\n",
"\n",
"script_name_base = 'tutorial_seq2seq'\n",
"csv_logger = CSVLogger('{:}.log'.format(script_name_base))\n",
"model_checkpoint = ModelCheckpoint('{:}.epoch{{epoch:02d}}-val{{val_loss:.5f}}.hdf5'.format(script_name_base),\n",
" save_best_only=True)\n",
"\n",
"batch_size = 1200\n",
"epochs = 7\n",
"history = seq2seq_Model.fit([encoder_input_data, decoder_input_data], np.expand_dims(decoder_target_data, -1),\n",
" batch_size=batch_size,\n",
" epochs=epochs,\n",
" validation_split=0.12, callbacks=[csv_logger, model_checkpoint])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#save model\n",
"seq2seq_Model.save('seq2seq_model_tutorial.h5')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# See Results On Holdout Set"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from seq2seq_utils import Seq2Seq_Inference\n",
"seq2seq_inf = Seq2Seq_Inference(encoder_preprocessor=body_pp,\n",
" decoder_preprocessor=title_pp,\n",
" seq2seq_model=seq2seq_Model)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"# this method displays the predictions on random rows of the holdout set\n",
"seq2seq_inf.demo_model_predictions(n=50, issue_df=testdf)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
},
"toc": {
"nav_menu": {
"height": "263px",
"width": "352px"
},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": true,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}
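The notebook above saves the fitted preprocessors (`body_pp.dpkl`, `title_pp.dpkl`) and the trained model (`seq2seq_model_tutorial.h5`) to disk. As a rough sketch of how those artifacts could be reloaded later to summarize a new issue, using the helpers defined in `seq2seq_utils.py` below, something like the following should work (the example issue body is made up, and this assumes it runs in the same directory the files were written to):

```
from keras.models import load_model
from seq2seq_utils import Seq2Seq_Inference, load_text_processor

# Reload the preprocessors and the trained seq2seq model saved by the notebook.
num_encoder_tokens, body_pp = load_text_processor('body_pp.dpkl')
num_decoder_tokens, title_pp = load_text_processor('title_pp.dpkl')
seq2seq_Model = load_model('seq2seq_model_tutorial.h5')

# Wrap everything in the inference helper from seq2seq_utils.py.
seq2seq_inf = Seq2Seq_Inference(encoder_preprocessor=body_pp,
                                decoder_preprocessor=title_pp,
                                seq2seq_model=seq2seq_Model)

# Generate a title for a new (hypothetical) issue body.
issue_body = ('installing the package with pip fails on python 3.6 '
              'with a missing dependency error')
_, predicted_title = seq2seq_inf.generate_issue_title(issue_body)
print(predicted_title)
```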

issue_summarization_github_issues/notebooks/seq2seq_utils.py

@@ -0,0 +1,425 @@
from matplotlib import pyplot as plt
import tensorflow as tf
from keras import backend as K
from keras.layers import Input
from keras.models import Model
from IPython.display import SVG, display
from keras.utils.vis_utils import model_to_dot
import logging
import numpy as np
import dill as dpickle
from annoy import AnnoyIndex
from tqdm import tqdm, tqdm_notebook
from random import random
from nltk.translate.bleu_score import corpus_bleu
def load_text_processor(fname='title_pp.dpkl'):
"""
Load preprocessors from disk.
Parameters
----------
fname: str
file name of ktext.processor object
Returns
-------
num_tokens : int
size of vocabulary loaded into ktext.processor
pp : ktext.processor
the processor you are trying to load
Typical Usage:
-------------
num_decoder_tokens, title_pp = load_text_processor(fname='title_pp.dpkl')
num_encoder_tokens, body_pp = load_text_processor(fname='body_pp.dpkl')
"""
# Load files from disk
with open(fname, 'rb') as f:
pp = dpickle.load(f)
num_tokens = max(pp.id2token.keys()) + 1
print(f'Size of vocabulary for {fname}: {num_tokens:,}')
return num_tokens, pp
def load_decoder_inputs(decoder_np_vecs='train_title_vecs.npy'):
"""
Load decoder inputs.
Parameters
----------
decoder_np_vecs : str
filename of serialized numpy.array of decoder input (issue title)
Returns
-------
decoder_input_data : numpy.array
The data fed to the decoder as input during training for teacher forcing.
This is the same as `decoder_np_vecs` except the last position.
decoder_target_data : numpy.array
The data that the decoder data is trained to generate (issue title).
Calculated by sliding `decoder_np_vecs` one position forward.
"""
vectorized_title = np.load(decoder_np_vecs)
# For Decoder Input, you don't need the last word as that is only for prediction
# when we are training using Teacher Forcing.
decoder_input_data = vectorized_title[:, :-1]
# Decoder Target Data Is Ahead By 1 Time Step From Decoder Input Data (Teacher Forcing)
decoder_target_data = vectorized_title[:, 1:]
print(f'Shape of decoder input: {decoder_input_data.shape}')
print(f'Shape of decoder target: {decoder_target_data.shape}')
return decoder_input_data, decoder_target_data
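# Illustration of the teacher-forcing slicing in load_decoder_inputs above: if a
# padded, vectorized title row is [_start_, w1, w2, _end_, 0], decoder_input_data
# keeps [_start_, w1, w2, _end_] while decoder_target_data keeps [w1, w2, _end_, 0],
# i.e. the target is the same sequence shifted one step ahead of the input.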
def load_encoder_inputs(encoder_np_vecs='train_body_vecs.npy'):
"""
Load variables & data that are inputs to encoder.
Parameters
----------
encoder_np_vecs : str
filename of serialized numpy.array of encoder input (issue body)
Returns
-------
encoder_input_data : numpy.array
The issue body
doc_length : int
The standard document length of the input for the encoder after padding
the shape of this array will be (num_examples, doc_length)
"""
vectorized_body = np.load(encoder_np_vecs)
# Encoder input is simply the body of the issue text
encoder_input_data = vectorized_body
doc_length = encoder_input_data.shape[1]
print(f'Shape of encoder input: {encoder_input_data.shape}')
return encoder_input_data, doc_length
def viz_model_architecture(model):
"""Visualize model architecture in Jupyter notebook."""
display(SVG(model_to_dot(model).create(prog='dot', format='svg')))
def free_gpu_mem():
"""Attempt to free gpu memory."""
K.get_session().close()
cfg = K.tf.ConfigProto()
cfg.gpu_options.allow_growth = True
K.set_session(K.tf.Session(config=cfg))
def test_gpu():
"""Run a toy computation task in tensorflow to test GPU."""
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.Session(config=config)
hello = tf.constant('Hello, TensorFlow!')
print(session.run(hello))
def plot_model_training_history(history_object):
"""Plots model train vs. validation loss."""
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.plot(history_object.history['loss'])
plt.plot(history_object.history['val_loss'])
plt.legend(['train', 'test'], loc='upper left')
plt.show()
def extract_encoder_model(model):
"""
Extract the encoder from the original Sequence to Sequence Model.
Returns a keras model object that has one input (body of issue) and one
output (encoding of issue, which is the last hidden state).
Input:
-----
model: keras model object
Returns:
-----
keras model object
"""
encoder_model = model.get_layer('Encoder-Model')
return encoder_model
def extract_decoder_model(model):
"""
Extract the decoder from the original model.
Inputs:
------
model: keras model object
Returns:
-------
A Keras model object with the following inputs and outputs:
Inputs of Keras Model That Is Returned:
1: the embedding index for the last predicted word or the <Start> indicator
2: the last hidden state, or in the case of the first word the hidden state from the encoder
Outputs of Keras Model That Is Returned:
1. Prediction (class probabilities) for the next word
2. The hidden state of the decoder, to be fed back into the decoder at the next time step
Implementation Notes:
----------------------
Must extract relevant layers and reconstruct part of the computation graph
to allow for different inputs as we are not going to use teacher forcing at
inference time.
"""
# the latent dimension is the same throughout the architecture so we are going to
# cheat and grab the latent dimension of the embedding because that is the same as what is
# output from the decoder
latent_dim = model.get_layer('Decoder-Word-Embedding').output_shape[-1]
# Reconstruct the input into the decoder
decoder_inputs = model.get_layer('Decoder-Input').input
dec_emb = model.get_layer('Decoder-Word-Embedding')(decoder_inputs)
dec_bn = model.get_layer('Decoder-Batchnorm-1')(dec_emb)
# Instead of setting the initial state from the encoder and forgetting about it, during inference
# we are not doing teacher forcing, so we will have to have a feedback loop from predictions back into
# the GRU, thus we define this input layer for the state so we can add this capability
gru_inference_state_input = Input(shape=(latent_dim,), name='hidden_state_input')
# we need to reuse the weights that is why we are getting this
# If you inspect the decoder GRU that we created for training, it will take as input
# 2 tensors -> (1) is the embedding layer output for the teacher forcing
# (which will now be the last step's prediction, and will be _start_ on the first time step)
# (2) is the state, which we will initialize with the encoder on the first time step, but then
# grab the state after the first prediction and feed that back in again.
gru_out, gru_state_out = model.get_layer('Decoder-GRU')([dec_bn, gru_inference_state_input])
# Reconstruct dense layers
dec_bn2 = model.get_layer('Decoder-Batchnorm-2')(gru_out)
dense_out = model.get_layer('Final-Output-Dense')(dec_bn2)
decoder_model = Model([decoder_inputs, gru_inference_state_input],
[dense_out, gru_state_out])
return decoder_model
class Seq2Seq_Inference(object):
def __init__(self,
encoder_preprocessor,
decoder_preprocessor,
seq2seq_model):
self.pp_body = encoder_preprocessor
self.pp_title = decoder_preprocessor
self.seq2seq_model = seq2seq_model
self.encoder_model = extract_encoder_model(seq2seq_model)
self.decoder_model = extract_decoder_model(seq2seq_model)
self.default_max_len_title = self.pp_title.padding_maxlen
self.nn = None
self.rec_df = None
def generate_issue_title(self,
raw_input_text,
max_len_title=None):
"""
Use the seq2seq model to generate a title given the body of an issue.
Inputs
------
raw_input: str
The body of the issue text as an input string
max_len_title: int (optional)
The maximum length of the title the model will generate
"""
if max_len_title is None:
max_len_title = self.default_max_len_title
# get the encoder's features for the decoder
raw_tokenized = self.pp_body.transform([raw_input_text])
body_encoding = self.encoder_model.predict(raw_tokenized)
# we want to save the encoder's embedding before its updated by decoder
# because we can use that as an embedding for other tasks.
original_body_encoding = body_encoding
state_value = np.array(self.pp_title.token2id['_start_']).reshape(1, 1)
decoded_sentence = []
stop_condition = False
while not stop_condition:
preds, st = self.decoder_model.predict([state_value, body_encoding])
# We are going to ignore indices 0 (padding) and indices 1 (unknown)
# Argmax will return the integer index corresponding to the
# prediction + 2 b/c we chopped off first two
pred_idx = np.argmax(preds[:, :, 2:]) + 2
# retrieve word from index prediction
pred_word_str = self.pp_title.id2token[pred_idx]
if pred_word_str == '_end_' or len(decoded_sentence) >= max_len_title:
stop_condition = True
break
decoded_sentence.append(pred_word_str)
# update the decoder for the next word
body_encoding = st
state_value = np.array(pred_idx).reshape(1, 1)
return original_body_encoding, ' '.join(decoded_sentence)
def print_example(self,
i,
body_text,
title_text,
url,
threshold):
"""
Prints an example of the model's prediction for manual inspection.
"""
if i:
print('\n\n==============================================')
print(f'============== Example # {i} =================\n')
if url:
print(url)
print(f"Issue Body:\n {body_text} \n")
if title_text:
print(f"Original Title:\n {title_text}")
emb, gen_title = self.generate_issue_title(body_text)
print(f"\n****** Machine Generated Title (Prediction) ******:\n {gen_title}")
if self.nn:
# return neighbors and distances
n, d = self.nn.get_nns_by_vector(emb.flatten(), n=4,
include_distances=True)
neighbors = n[1:]
dist = d[1:]
if min(dist) <= threshold:
cols = ['issue_url', 'issue_title', 'body']
dfcopy = self.rec_df.iloc[neighbors][cols].copy(deep=True)
dfcopy['dist'] = dist
similar_issues_df = dfcopy.query(f'dist <= {threshold}')
print("\n**** Similar Issues (using encoder embedding) ****:\n")
display(similar_issues_df)
def demo_model_predictions(self,
n,
issue_df,
threshold=1):
"""
Pick n random Issues and display predictions.
Input:
------
n : int
Number of issues to display from issue_df
issue_df : pandas DataFrame
DataFrame that contains two columns: `body` and `issue_title`.
threshold : float
distance threshold for recommendation of similar issues.
Returns:
--------
None
Prints the original issue body and the model's prediction.
"""
# Extract body and title from DF
body_text = issue_df.body.tolist()
title_text = issue_df.issue_title.tolist()
url = issue_df.issue_url.tolist()
demo_list = np.random.randint(low=1, high=len(body_text), size=n)
for i in demo_list:
self.print_example(i,
body_text=body_text[i],
title_text=title_text[i],
url=url[i],
threshold=threshold)
def prepare_recommender(self, vectorized_array, original_df):
"""
Use the annoy library to build recommender
Parameters
----------
vectorized_array : List[List[int]]
This is the list of list of integers that represents your corpus
that is fed into the seq2seq model for training.
original_df : pandas.DataFrame
This is the original dataframe that has the columns
['issue_url', 'issue_title', 'body']
Returns
-------
annoy.AnnoyIndex object (see https://github.com/spotify/annoy)
"""
self.rec_df = original_df
emb = self.encoder_model.predict(x=vectorized_array,
batch_size=vectorized_array.shape[0]//200)
f = emb.shape[1]
self.nn = AnnoyIndex(f)
logging.warning('Adding embeddings')
for i in tqdm(range(len(emb))):
self.nn.add_item(i, emb[i])
logging.warning('Building trees for similarity lookup.')
self.nn.build(50)
return self.nn
def set_recsys_data(self, original_df):
self.rec_df = original_df
def set_recsys_annoyobj(self, annoyobj):
self.nn = annoyobj
def evaluate_model(self, holdout_bodies, holdout_titles):
"""
Method for calculating BLEU Score.
Parameters
----------
holdout_bodies : List[str]
These are the issue bodies that we want to summarize
holdout_titles : List[str]
This is the ground truth we are trying to predict --> issue titles
Returns
-------
bleu : float
The BLEU Score
"""
actual, predicted = list(), list()
assert len(holdout_bodies) == len(holdout_titles)
num_examples = len(holdout_bodies)
logging.warning('Generating predictions.')
# step over the whole set TODO: parallelize this
for i in tqdm_notebook(range(num_examples)):
_, yhat = self.generate_issue_title(holdout_bodies[i])
actual.append(self.pp_title.process_text([holdout_titles[i]])[0])
predicted.append(self.pp_title.process_text([yhat])[0])
# calculate BLEU score
logging.warning('Calculating BLEU.')
bleu = corpus_bleu(actual, predicted)
return bleu
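For reference, a rough sketch of how the evaluation and recommender utilities above might be called from the tutorial notebook, reusing the `seq2seq_inf`, `traindf`, and `testdf` objects built there (the row count and threshold are illustrative, not prescribed):

```
import numpy as np

# Score the model on the holdout set with corpus BLEU.
bleu = seq2seq_inf.evaluate_model(holdout_bodies=testdf.body.tolist(),
                                  holdout_titles=testdf.issue_title.tolist())
print('BLEU on holdout set: {:.4f}'.format(bleu))

# Build an Annoy index over the encoder embeddings of the training bodies so
# that demo_model_predictions can also surface similar issues.
train_body_vecs = np.load('train_body_vecs.npy')
seq2seq_inf.prepare_recommender(vectorized_array=train_body_vecs,
                                original_df=traindf)
seq2seq_inf.demo_model_predictions(n=10, issue_df=testdf, threshold=1)
```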

requirements.txt

@@ -0,0 +1,233 @@
alabaster==0.7.10
altair==1.2.1
anaconda-client==1.6.5
anaconda-navigator==1.6.8
anaconda-project==0.8.0
annoy==1.10.0
asn1crypto==0.22.0
astroid==1.5.3
astropy==2.0.2
Babel==2.5.0
backports.functools-lru-cache==1.4
backports.shutil-get-terminal-size==1.0.0
bcolz==1.1.2
beautifulsoup4==4.6.0
bitarray==0.8.1
bkcharts==0.2
blaze==0.11.3
bleach==1.5.0
bokeh==0.12.7
boto==2.48.0
boto3==1.5.14
botocore==1.8.28
Bottleneck==1.2.1
bz2file==0.98
cachetools==2.0.1
certifi==2017.11.5
cffi==1.10.0
chardet==3.0.4
click==6.7
cloudpickle==0.4.0
clyent==1.2.2
colorama==0.3.9
conda==4.4.6
conda-build==3.0.23
conda-verify==2.0.0
contextlib2==0.5.5
cryptography==2.0.3
cycler==0.10.0
cymem==1.31.2
Cython==0.26.1
cytoolz==0.9.0
dask==0.16.1
datashape==0.5.4
decorator==4.2.1
dill==0.2.7.1
distributed==1.20.2
docopt==0.6.2
docutils==0.14
en-core-web-sm==2.0.0
entrypoints==0.2.3
et-xmlfile==1.0.1
fastcache==1.0.2
fastparquet==0.1.3
filelock==2.0.12
Flask==0.12.2
Flask-Cors==3.0.3
ftfy==4.4.3
future==0.16.0
gensim==3.2.0
gevent==1.2.2
glob2==0.5
gmpy2==2.0.8
graphviz==0.8.1
greenlet==0.4.12
h5py==2.7.1
hdfs==2.1.0
heapdict==1.0.0
html5lib==1.0.1
idna==2.6
ijson==2.3
imageio==2.2.0
imagesize==0.7.1
ipykernel==4.6.1
ipython==6.2.1
ipython-genutils==0.2.0
ipywidgets==7.0.0
isort==4.2.15
isoweek==1.3.3
itsdangerous==0.24
jdcal==1.3
jedi==0.11.0
Jinja2==2.9.6
jmespath==0.9.3
jsonschema==2.6.0
jupyter-client==5.1.0
jupyter-console==5.2.0
jupyter-core==4.3.0
jupyterlab==0.27.0
jupyterlab-launcher==0.4.0
Keras==2.1.2
ktext==0.27
lazy-object-proxy==1.3.1
llvmlite==0.20.0
locket==0.2.0
lxml==3.8.0
Markdown==2.6.9
MarkupSafe==1.0
matplotlib==2.1.0
mccabe==0.6.1
mistune==0.7.4
more-itertools==4.0.1
mpmath==0.19
msgpack==0.5.1
msgpack-numpy==0.4.2
msgpack-python==0.5.1
multipledispatch==0.4.9
multiprocess==0.70.5
murmurhash==0.28.0
navigator-updater==0.1.0
nbconvert==5.3.1
nbformat==4.4.0
networkx==2.0
nltk==3.2.5
nose==1.3.7
notebook==5.0.0
numba==0.35.0+10.g143f70e90
numexpr==2.6.2
numpy==1.14.0
numpydoc==0.7.0
odo==0.5.1
olefile==0.44
openpyxl==2.4.8
packaging==16.8
pandas==0.22.0
pandas-summary==0.0.41
pandocfilters==1.4.2
parso==0.1.0
partd==0.3.8
path.py==10.3.1
pathlib==1.0.1
pathlib2==2.3.0
pathos==0.2.1
patsy==0.4.1
pep8==1.7.0
pexpect==4.3.0
pickleshare==0.7.4
Pillow==4.3.0
pkginfo==1.4.1
plac==0.9.6
ply==3.10
pox==0.2.3
ppft==1.6.4.7.1
preshed==1.0.0
prompt-toolkit==1.0.15
protobuf==3.5.0
psutil==5.2.2
ptyprocess==0.5.2
py==1.4.34
pyarrow==0.8.0
pycodestyle==2.3.1
pycosat==0.6.3
pycparser==2.18
pycrypto==2.6.1
pycurl==7.43.0
pydot==1.2.3
pydot-ng==1.0.0
pyemd==0.4.4
pyflakes==1.5.0
Pygments==2.2.0
PyHive==0.5.0
pylint==1.7.2
pyodbc==4.0.17
pyOpenSSL==17.2.0
pyparsing==2.2.0
Pyphen==0.9.4
PySocks==1.6.7
pytest==3.2.1
python-dateutil==2.6.1
python-Levenshtein==0.12.0
pytz==2017.3
PyWavelets==0.5.2
PyYAML==3.12
pyzmq==16.0.2
QtAwesome==0.4.4
qtconsole==4.3.1
QtPy==1.3.1
regex==2017.4.5
requests==2.18.4
rope==0.10.5
ruamel-yaml==0.11.14
s3transfer==0.1.12
scikit-image==0.13.0
scikit-learn==0.19.1
scipy==1.0.0
seaborn==0.8
simplegeneric==0.8.1
singledispatch==3.4.0.3
six==1.11.0
sklearn-pandas==1.6.0
smart-open==1.5.6
snowballstemmer==1.2.1
sortedcollections==0.5.3
sortedcontainers==1.5.7
spacy==2.0.5
Sphinx==1.6.3
sphinxcontrib-websupport==1.0.1
spyder==3.2.3
SQLAlchemy==1.1.13
statsmodels==0.8.0
sympy==1.1.1
tables==3.4.2
tabulate==0.8.2
tblib==1.3.2
tensorflow-gpu==1.3.0
tensorflow-tensorboard==0.1.8
termcolor==1.1.0
terminado==0.6
testpath==0.3.1
textacy==0.5.0
thinc==6.10.2
thrift==0.10.0
toolz==0.9.0
torch==0.2.0.post4
torchtext==0.2.0
torchvision==0.1.9
tornado==4.5.2
tqdm==4.19.5
traitlets==4.3.2
typing==3.6.2
ujson==1.35
unicodecsv==0.14.1
Unidecode==1.0.22
urllib3==1.22
vega==0.4.4
wcwidth==0.1.7
webencodings==0.5.1
Werkzeug==0.12.2
widgetsnbextension==3.0.2
wrapt==1.10.11
xlrd==1.1.0
XlsxWriter==0.9.8
xlwt==1.3.0
zict==0.1.3

issue_summarization_github_issues/setup_a_kubeflow_cluster.md

@@ -0,0 +1,66 @@
# Set up Kubeflow

In this part, you will set up Kubeflow on an existing Kubernetes cluster.

## Requirements

* A Kubernetes cluster
* The `kubectl` CLI pointing to the Kubernetes cluster
  * Make sure that you can run `kubectl get nodes` from your terminal
    successfully
* The ksonnet CLI: [ks](https://ksonnet.io/#get-started)

Refer to the [user
guide](https://github.com/kubeflow/kubeflow/blob/master/user_guide.md) for
instructions on how to set up Kubeflow on your Kubernetes cluster. Specifically,
complete the [Deploy
Kubeflow](https://github.com/kubeflow/kubeflow/blob/master/user_guide.md#deploy-kubeflow)
and [Bringing up a
Notebook](https://github.com/kubeflow/kubeflow/blob/master/user_guide.md#bringing-up-a-notebook)
sections.

After completing that, you should have the following ready:

* A ksonnet app in a directory named `my-kubeflow`
* Output similar to this for `kubectl get pods`:

```
NAME                              READY     STATUS    RESTARTS   AGE
ambassador-7987df44b9-4pht8       2/2       Running   0          1m
ambassador-7987df44b9-dh5h6       2/2       Running   0          1m
ambassador-7987df44b9-qrgsm       2/2       Running   0          1m
tf-hub-0                          1/1       Running   0          1m
tf-job-operator-78757955b-qkg7s   1/1       Running   0          1m
```

* A Jupyter Notebook accessible at `http://127.0.0.1:8000`

## Provision storage for training data

We need shared persistent storage for our training data, since containers'
filesystems are ephemeral and don't have a lot of storage space.

The [Advanced
Customization](https://github.com/kubeflow/kubeflow/blob/master/user_guide.md#advanced-customization)
section of the [user
guide](https://github.com/kubeflow/kubeflow/blob/master/user_guide.md) has
instructions on how to provision a cluster-wide shared NFS.

For this example, provision a `10GB` NFS mount with the name
`github-issues-data`.

After the NFS mount is ready, delete the `tf-hub-0` pod so that it gets
recreated and picks up the NFS mount. You can delete it by running
`kubectl delete pod tf-hub-0 -n=${NAMESPACE}`.

At this point you should have a 10GB mount `/mnt/github-issues-data` in your
Jupyter Notebook pod. Check this by running `!df` in your Jupyter Notebook.
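If you prefer to verify the mount from a Python cell rather than with `!df`, a minimal sketch along these lines should work (it only uses the standard library; the exact free-space figure it prints will vary):

```
import os

mount_path = '/mnt/github-issues-data'

# The directory should exist and be a separate mount point inside the pod.
print('exists:', os.path.isdir(mount_path))
print('is a mount point:', os.path.ismount(mount_path))

# Report free space in GB; it should be roughly the 10GB provisioned above.
stats = os.statvfs(mount_path)
print('free space: {:.1f} GB'.format(stats.f_bavail * stats.f_frsize / 1e9))
```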
## Summary

* We created a ksonnet app for our Kubeflow deployment
* We created a disk for storing our training data
* We deployed the kubeflow-core component to our Kubernetes cluster
* We connected to JupyterHub and spawned a new Jupyter Notebook

Next: [Training the model using our cluster](training_the_model.md)

(BigQuery SQL query used to build the GitHub issues dataset)

@@ -0,0 +1,30 @@
SELECT
url as issue_url
-- replace more than one white-space character in a row with a single space
, REGEXP_REPLACE(title, r"\s{2,}", ' ') as issue_title
, REGEXP_REPLACE(body, r"\s{2,}", ' ') as body
FROM(
SELECT
JSON_EXTRACT(payload, '$.issue.html_url') as url
-- extract the title and body removing parentheses, brackets, and quotes
, LOWER(TRIM(REGEXP_REPLACE(JSON_EXTRACT(payload, '$.issue.title'), r"\\n|\(|\)|\[|\]|#|\*|`", ' '))) as title
, LOWER(TRIM(REGEXP_REPLACE(JSON_EXTRACT(payload, '$.issue.body'), r"\\n|\(|\)|\[|\]|#|\*|`", ' '))) as body
FROM `githubarchive.day.2017*`
WHERE
-- all days in 2017 (note: querying these tables costs money!)
_TABLE_SUFFIX BETWEEN '0101' and '1231'
and type="IssuesEvent"
-- Only want the issue at a specific point otherwise will have duplicates
and JSON_EXTRACT(payload, '$.action') = "\"opened\""
) as tbl
WHERE
-- the body must be at least 6 words long and the title at least 3 words long
-- this is an arbitrary way to filter out empty or sparse issues
ARRAY_LENGTH(SPLIT(body, ' ')) >= 6
and ARRAY_LENGTH(SPLIT(title, ' ')) >= 3
-- filter out issues that have really long titles or bodies
-- (these are outliers, and will slow tokenization down).
and LENGTH(title) <= 400
and LENGTH(body) <= 2000
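The tutorial itself downloads a pre-built `github-issues.zip` from Google Cloud Storage, so running this query is optional. If you wanted to regenerate the CSV yourself, a rough sketch using the `google-cloud-bigquery` Python client (not listed in the tutorial's requirements, so it would need to be installed separately) might look like the following, assuming the query above is saved as `github_issues.sql` (a hypothetical filename):

```
from google.cloud import bigquery

# Hypothetical project id; replace with your own GCP project.
client = bigquery.Client(project='my-gcp-project')

with open('github_issues.sql') as f:
    query = f.read()

# Run the query (standard SQL) and pull the rows into a pandas DataFrame.
# Note that scanning the githubarchive tables incurs BigQuery charges.
df = client.query(query).to_dataframe()

# Write out the columns the tutorial expects: issue_url, issue_title, body.
df.to_csv('github_issues.csv', index=False)
print('wrote {:,} rows'.format(len(df)))
```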

issue_summarization_github_issues/teardown.md

@@ -0,0 +1,20 @@
# Teardown

Delete the Kubernetes namespace:

```
kubectl delete namespace ${NAMESPACE}
```

Delete the persistent disk (PD) backing the NFS mount:

```
gcloud --project=${PROJECT} compute disks delete --zone=${ZONE} ${PD_DISK_NAME}
```

Delete the ksonnet app directory:

```
rm -rf my-kubeflow
```

issue_summarization_github_issues/training_the_model.md

@@ -0,0 +1,9 @@
# Training the model

By this point, you should have a Jupyter Notebook running at `http://127.0.0.1:8000`.

Open the Jupyter Notebook interface and create a new Terminal by clicking on New -> Terminal. In the Terminal, clone this git repo by executing: `git clone https://github.com/kubeflow/examples.git`.

Now you have all the code required to complete this tutorial in the `examples/issue_summarization_github_issues/notebooks` folder. Navigate to this folder. Here you should see two files: `Tutorial.ipynb` and `seq2seq_utils.py`. Open `Tutorial.ipynb`; it contains a complete walk-through of how to download the training data, preprocess it, and train the model.

Next: [Serving the model](serving_the_model.md)

prow_config.yaml (new file, 7 lines)

@@ -0,0 +1,7 @@
# This file configures the workflows to trigger in our Prow jobs.
# see kubeflow/testing/py/run_e2e_workflow.py
workflows:
- app_dir: kubeflow/examples/test/workflows
component: workflows
name: examples-e2e

(ksonnet incubator registry manifest)

@@ -0,0 +1,39 @@
apiVersion: "0.1"
gitVersion:
commitSha: 422d521c05aa905df949868143b26445f5e4eda5
refSpec: master
kind: ksonnet.io/registry
libraries:
apache:
path: apache
version: master
efk:
path: efk
version: master
mariadb:
path: mariadb
version: master
memcached:
path: memcached
version: master
mongodb:
path: mongodb
version: master
mysql:
path: mysql
version: master
nginx:
path: nginx
version: master
node:
path: node
version: master
postgres:
path: postgres
version: master
redis:
path: redis
version: master
tomcat:
path: tomcat
version: master

test/workflows/app.yaml (new file, 11 lines)

@@ -0,0 +1,11 @@
apiVersion: 0.0.1
kind: ksonnet.io/app
name: test-infra
registries:
incubator:
gitVersion:
commitSha: 422d521c05aa905df949868143b26445f5e4eda5
refSpec: master
protocol: github
uri: github.com/ksonnet/parts/tree/master/incubator
version: 0.0.1

test/workflows/components/params.libsonnet

@@ -0,0 +1,17 @@
{
global: {
// User-defined global parameters; accessible to all component and environments, Ex:
// replicas: 4,
},
components: {
// Component-level parameters, defined initially from 'ks prototype use ...'
// Each object below should correspond to a component in the components/ directory
workflows: {
bucket: "mlkube-testing_temp",
name: "kubeflow-examples-presubmit-test-374-6e32",
namespace: "kubeflow-test-infra",
prow: "JOB_NAME=kubeflow-examples-presubmit-test,JOB_TYPE=presubmit,PULL_NUMBER=209,REPO_NAME=examples,REPO_OWNER=kubeflow,BUILD_NUMBER=997a",
prow_env: "JOB_NAME=kubeflow-examples-presubmit-test,JOB_TYPE=presubmit,PULL_NUMBER=374,REPO_NAME=examples,REPO_OWNER=kubeflow,BUILD_NUMBER=6e32",
},
},
}

test/workflows/components/workflows.jsonnet

@@ -0,0 +1,14 @@
local params = std.extVar("__ksonnet/params").components.workflows;
local k = import 'k.libsonnet';
local workflows = import 'workflows.libsonnet';
local namespace = params.namespace;
// TODO(jlewi): Can we make name default to some random unique value?
// I didn't see any routines in the standard library for datetime or random.
local name = params.name;
local prowEnv = workflows.parseEnv(params.prow_env);
local bucket = params.bucket;
std.prune(k.core.v1.list.new([workflows.parts(namespace, name).e2e(prowEnv, bucket)]))

test/workflows/components/workflows.libsonnet

@@ -0,0 +1,244 @@
{
// TODO(https://github.com/ksonnet/ksonnet/issues/222): Taking namespace as an argument is a work around for the fact that ksonnet
// doesn't support automatically piping in the namespace from the environment to prototypes.
// convert a list of two items into a map representing an environment variable
// TODO(jlewi): Should we move this into kubeflow/core/util.libsonnet
listToMap:: function(v)
{
name: v[0],
value: v[1],
},
// Function to turn a comma-separated list of prow environment variables into a list of name/value maps.
parseEnv:: function(v)
local pieces = std.split(v, ",");
if v != "" && std.length(pieces) > 0 then
std.map(
function(i) $.listToMap(std.split(i, "=")),
std.split(v, ",")
)
else [],
parts(namespace, name):: {
// Workflow to run the e2e test.
e2e(prow_env, bucket):
// mountPath is the directory where the volume to store the test data
// should be mounted.
local mountPath = "/mnt/" + "test-data-volume";
// testDir is the root directory for all data for a particular test run.
local testDir = mountPath + "/" + name;
// outputDir is the directory to sync to GCS to contain the output for this job.
local outputDir = testDir + "/output";
local artifactsDir = outputDir + "/artifacts";
local goDir = testDir + "/go";
// Source directory where all repos should be checked out
local srcRootDir = testDir + "/src";
// The directory containing the kubeflow/examples repo
local srcDir = srcRootDir + "/kubeflow/examples";
local image = "gcr.io/mlkube-testing/test-worker";
// The name of the NFS volume claim to use for test files.
// local nfsVolumeClaim = "kubeflow-testing";
local nfsVolumeClaim = "nfs-external";
// The name to use for the volume to use to contain test data.
local dataVolume = "kubeflow-test-volume";
local versionTag = name;
// The directory within the kubeflow_testing submodule containing
// py scripts to use.
local kubeflowExamplesPy = srcDir;
local kubeflowTestingPy = srcRootDir + "/kubeflow/testing/py";
local project = "mlkube-testing";
// GKE cluster to use
// We need to truncate the cluster to no more than 40 characters because
// cluster names can be a max of 40 characters.
// We expect the suffix of the cluster name to be unique salt.
// We prepend a z because cluster name must start with an alphanumeric character
// and if we cut the prefix we might end up starting with "-" or other invalid
// character for first character.
local cluster =
if std.length(name) > 40 then
"z" + std.substr(name, std.length(name) - 39, 39)
else
name;
local zone = "us-east1-d";
local chart = srcDir + "/bin/examples-chart-0.2.1-" + versionTag + ".tgz";
{
// Build an Argo template to execute a particular command.
// step_name: Name for the template
// command: List to pass as the container command.
buildTemplate(step_name, command):: {
name: step_name,
container: {
command: command,
image: image,
workingDir: srcDir,
env: [
{
// Add the source directories to the python path.
name: "PYTHONPATH",
value: kubeflowExamplesPy + ":" + kubeflowTestingPy,
},
{
// Set the GOPATH
name: "GOPATH",
value: goDir,
},
{
name: "GOOGLE_APPLICATION_CREDENTIALS",
value: "/secret/gcp-credentials/key.json",
},
{
name: "GIT_TOKEN",
valueFrom: {
secretKeyRef: {
name: "github-token",
key: "github_token",
},
},
},
{
name: "EXTRA_REPOS",
value: "kubeflow/testing@HEAD",
},
] + prow_env,
volumeMounts: [
{
name: dataVolume,
mountPath: mountPath,
},
{
name: "github-token",
mountPath: "/secret/github-token",
},
{
name: "gcp-credentials",
mountPath: "/secret/gcp-credentials",
},
],
},
}, // buildTemplate
apiVersion: "argoproj.io/v1alpha1",
kind: "Workflow",
metadata: {
name: name,
namespace: namespace,
},
// TODO(jlewi): Use OnExit to run cleanup steps.
spec: {
entrypoint: "e2e",
volumes: [
{
name: "github-token",
secret: {
secretName: "github-token",
},
},
{
name: "gcp-credentials",
secret: {
secretName: "kubeflow-testing-credentials",
},
},
{
name: dataVolume,
persistentVolumeClaim: {
claimName: nfsVolumeClaim,
},
},
], // volumes
// onExit specifies the template that should always run when the workflow completes.
onExit: "exit-handler",
templates: [
{
name: "e2e",
steps: [
[{
name: "checkout",
template: "checkout",
}],
[
{
name: "create-pr-symlink",
template: "create-pr-symlink",
},
{
name: "py-test",
template: "py-test",
},
{
name: "py-lint",
template: "py-lint",
},
],
],
},
{
name: "exit-handler",
steps: [
[{
name: "copy-artifacts",
template: "copy-artifacts",
}],
],
},
{
name: "checkout",
container: {
command: [
"/usr/local/bin/checkout.sh",
srcRootDir,
],
env: prow_env + [{
name: "EXTRA_REPOS",
value: "kubeflow/testing@HEAD",
}],
image: image,
volumeMounts: [
{
name: dataVolume,
mountPath: mountPath,
},
],
},
}, // checkout
$.parts(namespace, name).e2e(prow_env, bucket).buildTemplate("py-test", [
"python",
"-m",
"kubeflow.testing.py_checks",
"test",
"--src_dir=" + srcDir,
"--project=mlkube-testing",
"--junit_path=" + artifactsDir + "/junit_pycheckstest.xml",
]), // py test
$.parts(namespace, name).e2e(prow_env, bucket).buildTemplate("py-lint", [
"python",
"-m",
"kubeflow.testing.py_checks",
"lint",
"--src_dir=" + srcDir,
"--project=mlkube-testing",
"--junit_path=" + artifactsDir + "/junit_pycheckslint.xml",
]), // py lint
$.parts(namespace, name).e2e(prow_env, bucket).buildTemplate("create-pr-symlink", [
"python",
"-m",
"kubeflow.testing.prow_artifacts",
"--artifacts_dir=" + outputDir,
"create_pr_symlink",
"--bucket=" + bucket,
]), // create-pr-symlink
$.parts(namespace, name).e2e(prow_env, bucket).buildTemplate("copy-artifacts", [
"python",
"-m",
"kubeflow.testing.prow_artifacts",
"--artifacts_dir=" + outputDir,
"copy_artifacts",
"--bucket=" + bucket,
]), // copy-artifacts
], // templates
},
}, // e2e
}, // parts
}

test/workflows/environments/base.libsonnet

@@ -0,0 +1,4 @@
local components = std.extVar("__ksonnet/components");
components + {
// Insert user-specified overrides here.
}

k.libsonnet (ksonnet helper library)

@@ -0,0 +1,80 @@
local k8s = import "k8s.libsonnet";
local apps = k8s.apps;
local core = k8s.core;
local extensions = k8s.extensions;
local hidden = {
mapContainers(f):: {
local podContainers = super.spec.template.spec.containers,
spec+: {
template+: {
spec+: {
// IMPORTANT: This overwrites the 'containers' field
// for this deployment.
containers: std.map(f, podContainers),
},
},
},
},
mapContainersWithName(names, f) ::
local nameSet =
if std.type(names) == "array"
then std.set(names)
else std.set([names]);
local inNameSet(name) = std.length(std.setInter(nameSet, std.set([name]))) > 0;
self.mapContainers(
function(c)
if std.objectHas(c, "name") && inNameSet(c.name)
then f(c)
else c
),
};
k8s + {
apps:: apps + {
v1beta1:: apps.v1beta1 + {
local v1beta1 = apps.v1beta1,
daemonSet:: v1beta1.daemonSet + {
mapContainers(f):: hidden.mapContainers(f),
mapContainersWithName(names, f):: hidden.mapContainersWithName(names, f),
},
deployment:: v1beta1.deployment + {
mapContainers(f):: hidden.mapContainers(f),
mapContainersWithName(names, f):: hidden.mapContainersWithName(names, f),
},
},
},
core:: core + {
v1:: core.v1 + {
list:: {
new(items)::
{apiVersion: "v1"} +
{kind: "List"} +
self.items(items),
items(items):: if std.type(items) == "array" then {items+: items} else {items+: [items]},
},
},
},
extensions:: extensions + {
v1beta1:: extensions.v1beta1 + {
local v1beta1 = extensions.v1beta1,
daemonSet:: v1beta1.daemonSet + {
mapContainers(f):: hidden.mapContainers(f),
mapContainersWithName(names, f):: hidden.mapContainersWithName(names, f),
},
deployment:: v1beta1.deployment + {
mapContainers(f):: hidden.mapContainers(f),
mapContainersWithName(names, f):: hidden.mapContainersWithName(names, f),
},
},
},
}

File diff suppressed because it is too large.

File diff suppressed because it is too large.

(ksonnet environment main.jsonnet)

@@ -0,0 +1,7 @@
local base = import "../base.libsonnet";
local k = import "k.libsonnet";
base + {
// Insert user-specified overrides here. For example if a component is named "nginx-deployment", you might have something like:
// "nginx-deployment"+: k.deployment.mixin.metadata.labels({foo: "bar"})
}

(ksonnet environment params.libsonnet)

@@ -0,0 +1,10 @@
local params = import "../../components/params.libsonnet";
params + {
components +: {
// Insert component parameter overrides here. Ex:
// guestbook +: {
// name: "guestbook-dev",
// replicas: params.global.replicas,
// },
},
}

(ksonnet environment spec.json)

@@ -0,0 +1,4 @@
{
"server": "https://35.196.185.88",
"namespace": "kubeflow-test-infra"
}