Merge remote-tracking branch 'upstream/master' into contributing

Michelle Casbon committed 2018-03-02 11:50:58 -05:00
commit 0ad22a29e1
24 changed files with 77298 additions and 2 deletions

.gitignore (new file, 41 lines)

@@ -0,0 +1,41 @@
# pkg and bin directories currently contain build artifacts
# only so we exclude them.
bin/
vendor/
.vscode/
# Compiled python files.
*.pyc
# Emacs temporary files
*~
# Other temporary files
.DS_Store
# temporary files from emacs flymd-mode
flymd.*
# vim .swp files
*.swp
# Files created by Gogland IDE
.idea/
# Exclude wheel files for now.
# The only wheel file is the TF wheel, which is quite large;
# we don't want to check it into source control.
*.whl
# Bazel files
**/bazel-*
# Examples egg
examples/tf_sample/tf_sample.egg-info/
examples/.ipynb_checkpoints/
**/.ipynb_checkpoints
# pyenv
.python-version

OWNERS (new file, 21 lines)

@@ -0,0 +1,21 @@
# TODO(jlewi): We should probably have OWNERS files in subdirectories that
# list approvers for individual components (e.g. Seldon folks for the Seldon component)
approvers:
- ankushagarwal
- DjangoPeng
- gaocegege
- jlewi
- llunn
- ScorpioCPH
reviewers:
- ankushagarwal
- DjangoPeng
- gaocegege
- Jimexist
- jlewi
- llunn
- nkashy1
- ScorpioCPH
- texasmichelle
- wbuchwalter
- zjj2wry

README.md

@@ -1,2 +1,3 @@
# examples
A repository to host extended examples and tutorials
## A repository to host extended examples and tutorials for kubeflow.
1. [GitHub issue summarization using sequence-to-sequence learning](./issue_summarization_github_issues) by [Hamel Husain](https://github.com/hamelsmu)

issue_summarization_github_issues/README.md

@@ -0,0 +1,30 @@
# [WIP] End-to-End Kubeflow tutorial using a Sequence-to-Sequence model

This example demonstrates how you can use `kubeflow` end-to-end to train and
serve a Sequence-to-Sequence model on an existing Kubernetes cluster. This
tutorial is based upon @hamelsmu's article ["How To Create Data Products That
Are Magical Using Sequence-to-Sequence
Models"](https://medium.com/@hamelhusain/how-to-create-data-products-that-are-magical-using-sequence-to-sequence-models-703f86a231f8).

## Goals

There are two primary goals for this tutorial:

* An end-to-end Kubeflow example
* An end-to-end Sequence-to-Sequence model

By the end of this tutorial, you should know how to:

* Set up a Kubeflow cluster on an existing Kubernetes deployment
* Spin up a Jupyter Notebook on the cluster
* Spin up shared persistent storage across the cluster to store large
  datasets
* Train a Sequence-to-Sequence model on the cluster using TensorFlow and
  GPUs
* Serve the model using TensorFlow Serving

## Steps

1. [Set up a Kubeflow cluster](setup_a_kubeflow_cluster.md)
1. [Training the model](training_the_model.md)
1. [Teardown](teardown.md)

issue_summarization_github_issues/notebooks/Tutorial.ipynb

@@ -0,0 +1,510 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"toc": true
},
"source": [
"<h1>Table of Contents<span class=\"tocSkip\"></span></h1>\n",
"<div class=\"toc\"><ul class=\"toc-item\"><li><span><a href=\"#Process-Data\" data-toc-modified-id=\"Process-Data-1\"><span class=\"toc-item-num\">1&nbsp;&nbsp;</span>Process Data</a></span></li><li><span><a href=\"#Pre-Process-Data-For-Deep-Learning\" data-toc-modified-id=\"Pre-Process-Data-For-Deep-Learning-2\"><span class=\"toc-item-num\">2&nbsp;&nbsp;</span>Pre-Process Data For Deep Learning</a></span><ul class=\"toc-item\"><li><ul class=\"toc-item\"><li><ul class=\"toc-item\"><li><span><a href=\"#Look-at-one-example-of-processed-issue-bodies\" data-toc-modified-id=\"Look-at-one-example-of-processed-issue-bodies-2.0.0.1\"><span class=\"toc-item-num\">2.0.0.1&nbsp;&nbsp;</span>Look at one example of processed issue bodies</a></span></li><li><span><a href=\"#Look-at-one-example-of-processed-issue-titles\" data-toc-modified-id=\"Look-at-one-example-of-processed-issue-titles-2.0.0.2\"><span class=\"toc-item-num\">2.0.0.2&nbsp;&nbsp;</span>Look at one example of processed issue titles</a></span></li></ul></li></ul></li></ul></li><li><span><a href=\"#Define-Model-Architecture\" data-toc-modified-id=\"Define-Model-Architecture-3\"><span class=\"toc-item-num\">3&nbsp;&nbsp;</span>Define Model Architecture</a></span><ul class=\"toc-item\"><li><ul class=\"toc-item\"><li><span><a href=\"#Load-the-data-from-disk-into-variables\" data-toc-modified-id=\"Load-the-data-from-disk-into-variables-3.0.1\"><span class=\"toc-item-num\">3.0.1&nbsp;&nbsp;</span>Load the data from disk into variables</a></span></li><li><span><a href=\"#Define-Model-Architecture\" data-toc-modified-id=\"Define-Model-Architecture-3.0.2\"><span class=\"toc-item-num\">3.0.2&nbsp;&nbsp;</span>Define Model Architecture</a></span></li></ul></li></ul></li><li><span><a href=\"#Train-Model\" data-toc-modified-id=\"Train-Model-4\"><span class=\"toc-item-num\">4&nbsp;&nbsp;</span>Train Model</a></span></li><li><span><a href=\"#See-Results-On-Holdout-Set\" data-toc-modified-id=\"See-Results-On-Holdout-Set-5\"><span class=\"toc-item-num\">5&nbsp;&nbsp;</span>See Results On Holdout Set</a></span></li><li><span><a href=\"#Feature-Extraction-Demo\" data-toc-modified-id=\"Feature-Extraction-Demo-6\"><span class=\"toc-item-num\">6&nbsp;&nbsp;</span>Feature Extraction Demo</a></span><ul class=\"toc-item\"><li><ul class=\"toc-item\"><li><span><a href=\"#Example-1:-Issues-Installing-Python-Packages\" data-toc-modified-id=\"Example-1:-Issues-Installing-Python-Packages-6.0.1\"><span class=\"toc-item-num\">6.0.1&nbsp;&nbsp;</span>Example 1: Issues Installing Python Packages</a></span></li><li><span><a href=\"#Example-2:--Issues-asking-for-feature-improvements\" data-toc-modified-id=\"Example-2:--Issues-asking-for-feature-improvements-6.0.2\"><span class=\"toc-item-num\">6.0.2&nbsp;&nbsp;</span>Example 2: Issues asking for feature improvements</a></span></li></ul></li></ul></li></ul></div>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import logging\n",
"import glob\n",
"from sklearn.model_selection import train_test_split\n",
"pd.set_option('display.max_colwidth', 500)\n",
"logger = logging.getLogger()\n",
"logger.setLevel(logging.WARNING)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Download Data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Ensure that the github-issues-data volume is mounted in /mnt\n",
"!ls -la /mnt"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"# Download the github-issues.zip training data to /mnt/github-issues-data\n",
"!wget --directory-prefix=/mnt/github-issues-data https://storage.googleapis.com/kubeflow-examples/github-issue-summarization-data/github-issues.zip\n",
"\n",
"# Unzip the file into /mnt/github-issues-data directory\n",
"!unzip /mnt/github-issues-data/github-issues.zip -d /mnt/github-issues-data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Create a symlink from <current_directory>/github-issues-data to /mnt/github-issues-data\n",
"!ln -sf /mnt/github-issues-data github-issues-data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Make sure that the github-issues-data symlink is created\n",
"!ls -lh github-issues-data/github_issues.csv"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Process Data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Split data into train and test set and preview data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data_file='github-issues-data/github_issues.csv'\n",
"\n",
"# read in data sample 2000 rows (for speed of tutorial)\n",
"# Set this to False to train on the entire dataset\n",
"use_sample_data=True\n",
"\n",
"if use_sample_data:\n",
" training_data_size=2000\n",
" traindf, testdf = train_test_split(pd.read_csv(data_file).sample(n=training_data_size), \n",
" test_size=.10)\n",
"else:\n",
" traindf, testdf = train_test_split(pd.read_csv(data_file),test_size=.10)\n",
"\n",
"\n",
"#print out stats about shape of data\n",
"print(f'Train: {traindf.shape[0]:,} rows {traindf.shape[1]:,} columns')\n",
"print(f'Test: {testdf.shape[0]:,} rows {testdf.shape[1]:,} columns')\n",
"\n",
"# preview data\n",
"traindf.head(3)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Convert to lists in preparation for modeling**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"train_body_raw = traindf.body.tolist()\n",
"train_title_raw = traindf.issue_title.tolist()\n",
"#preview output of first element\n",
"train_body_raw[0]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Pre-Process Data For Deep Learning\n",
"\n",
"See [this repo](https://github.com/hamelsmu/ktext) for documentation on the ktext package"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%reload_ext autoreload\n",
"%autoreload 2\n",
"from ktext.preprocess import processor"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"# Clean, tokenize, and apply padding / truncating such that each document length = 70\n",
"# also, retain only the top 8,000 words in the vocabulary and set the remaining words\n",
"# to 1 which will become common index for rare words \n",
"body_pp = processor(keep_n=8000, padding_maxlen=70)\n",
"train_body_vecs = body_pp.fit_transform(train_body_raw)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Look at one example of processed issue bodies"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print('\\noriginal string:\\n', train_body_raw[0], '\\n')\n",
"print('after pre-processing:\\n', train_body_vecs[0], '\\n')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Instantiate a text processor for the titles, with some different parameters\n",
"# append_indicators = True appends the tokens '_start_' and '_end_' to each\n",
"# document\n",
"# padding = 'post' means that zero padding is appended to the end of the \n",
"# of the document (as opposed to the default which is 'pre')\n",
"title_pp = processor(append_indicators=True, keep_n=4500, \n",
" padding_maxlen=12, padding ='post')\n",
"\n",
"# process the title data\n",
"train_title_vecs = title_pp.fit_transform(train_title_raw)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Look at one example of processed issue titles"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print('\\noriginal string:\\n', train_title_raw[0])\n",
"print('after pre-processing:\\n', train_title_vecs[0])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Serialize all of this to disk for later use"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import dill as dpickle\n",
"import numpy as np\n",
"\n",
"# Save the preprocessor\n",
"with open('body_pp.dpkl', 'wb') as f:\n",
" dpickle.dump(body_pp, f)\n",
"\n",
"with open('title_pp.dpkl', 'wb') as f:\n",
" dpickle.dump(title_pp, f)\n",
"\n",
"# Save the processed data\n",
"np.save('train_title_vecs.npy', train_title_vecs)\n",
"np.save('train_body_vecs.npy', train_body_vecs)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Define Model Architecture"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Load the data from disk into variables"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from seq2seq_utils import load_decoder_inputs, load_encoder_inputs, load_text_processor"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"encoder_input_data, doc_length = load_encoder_inputs('train_body_vecs.npy')\n",
"decoder_input_data, decoder_target_data = load_decoder_inputs('train_title_vecs.npy')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"num_encoder_tokens, body_pp = load_text_processor('body_pp.dpkl')\n",
"num_decoder_tokens, title_pp = load_text_processor('title_pp.dpkl')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Define Model Architecture"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"%matplotlib inline\n",
"from keras.models import Model\n",
"from keras.layers import Input, LSTM, GRU, Dense, Embedding, Bidirectional, BatchNormalization\n",
"from keras import optimizers"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#arbitrarly set latent dimension for embedding and hidden units\n",
"latent_dim = 300\n",
"\n",
"##### Define Model Architecture ######\n",
"\n",
"########################\n",
"#### Encoder Model ####\n",
"encoder_inputs = Input(shape=(doc_length,), name='Encoder-Input')\n",
"\n",
"# Word embeding for encoder (ex: Issue Body)\n",
"x = Embedding(num_encoder_tokens, latent_dim, name='Body-Word-Embedding', mask_zero=False)(encoder_inputs)\n",
"x = BatchNormalization(name='Encoder-Batchnorm-1')(x)\n",
"\n",
"# Intermediate GRU layer (optional)\n",
"#x = GRU(latent_dim, name='Encoder-Intermediate-GRU', return_sequences=True)(x)\n",
"#x = BatchNormalization(name='Encoder-Batchnorm-2')(x)\n",
"\n",
"# We do not need the `encoder_output` just the hidden state.\n",
"_, state_h = GRU(latent_dim, return_state=True, name='Encoder-Last-GRU')(x)\n",
"\n",
"# Encapsulate the encoder as a separate entity so we can just \n",
"# encode without decoding if we want to.\n",
"encoder_model = Model(inputs=encoder_inputs, outputs=state_h, name='Encoder-Model')\n",
"\n",
"seq2seq_encoder_out = encoder_model(encoder_inputs)\n",
"\n",
"########################\n",
"#### Decoder Model ####\n",
"decoder_inputs = Input(shape=(None,), name='Decoder-Input') # for teacher forcing\n",
"\n",
"# Word Embedding For Decoder (ex: Issue Titles)\n",
"dec_emb = Embedding(num_decoder_tokens, latent_dim, name='Decoder-Word-Embedding', mask_zero=False)(decoder_inputs)\n",
"dec_bn = BatchNormalization(name='Decoder-Batchnorm-1')(dec_emb)\n",
"\n",
"# Set up the decoder, using `decoder_state_input` as initial state.\n",
"decoder_gru = GRU(latent_dim, return_state=True, return_sequences=True, name='Decoder-GRU')\n",
"decoder_gru_output, _ = decoder_gru(dec_bn, initial_state=seq2seq_encoder_out)\n",
"x = BatchNormalization(name='Decoder-Batchnorm-2')(decoder_gru_output)\n",
"\n",
"# Dense layer for prediction\n",
"decoder_dense = Dense(num_decoder_tokens, activation='softmax', name='Final-Output-Dense')\n",
"decoder_outputs = decoder_dense(x)\n",
"\n",
"########################\n",
"#### Seq2Seq Model ####\n",
"\n",
"#seq2seq_decoder_out = decoder_model([decoder_inputs, seq2seq_encoder_out])\n",
"seq2seq_Model = Model([encoder_inputs, decoder_inputs], decoder_outputs)\n",
"\n",
"\n",
"seq2seq_Model.compile(optimizer=optimizers.Nadam(lr=0.001), loss='sparse_categorical_crossentropy')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"** Examine Model Architecture Summary **"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from seq2seq_utils import viz_model_architecture\n",
"seq2seq_Model.summary()\n",
"viz_model_architecture(seq2seq_Model)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Train Model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from keras.callbacks import CSVLogger, ModelCheckpoint\n",
"\n",
"script_name_base = 'tutorial_seq2seq'\n",
"csv_logger = CSVLogger('{:}.log'.format(script_name_base))\n",
"model_checkpoint = ModelCheckpoint('{:}.epoch{{epoch:02d}}-val{{val_loss:.5f}}.hdf5'.format(script_name_base),\n",
" save_best_only=True)\n",
"\n",
"batch_size = 1200\n",
"epochs = 7\n",
"history = seq2seq_Model.fit([encoder_input_data, decoder_input_data], np.expand_dims(decoder_target_data, -1),\n",
" batch_size=batch_size,\n",
" epochs=epochs,\n",
" validation_split=0.12, callbacks=[csv_logger, model_checkpoint])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#save model\n",
"seq2seq_Model.save('seq2seq_model_tutorial.h5')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# See Results On Holdout Set"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from seq2seq_utils import Seq2Seq_Inference\n",
"seq2seq_inf = Seq2Seq_Inference(encoder_preprocessor=body_pp,\n",
" decoder_preprocessor=title_pp,\n",
" seq2seq_model=seq2seq_Model)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"# this method displays the predictions on random rows of the holdout set\n",
"seq2seq_inf.demo_model_predictions(n=50, issue_df=testdf)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
},
"toc": {
"nav_menu": {
"height": "263px",
"width": "352px"
},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": true,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}
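The notebook above saves the fitted preprocessors (`body_pp.dpkl`, `title_pp.dpkl`) and the trained model (`seq2seq_model_tutorial.h5`) to disk. As a rough sketch of how those artifacts could be reloaded later to summarize a new issue, using the helpers defined in `seq2seq_utils.py` below, something like the following should work (the example issue body is made up, and this assumes it runs in the same directory the files were written to):

```
from keras.models import load_model
from seq2seq_utils import Seq2Seq_Inference, load_text_processor

# Reload the preprocessors and the trained seq2seq model saved by the notebook.
num_encoder_tokens, body_pp = load_text_processor('body_pp.dpkl')
num_decoder_tokens, title_pp = load_text_processor('title_pp.dpkl')
seq2seq_Model = load_model('seq2seq_model_tutorial.h5')

# Wrap everything in the inference helper from seq2seq_utils.py.
seq2seq_inf = Seq2Seq_Inference(encoder_preprocessor=body_pp,
                                decoder_preprocessor=title_pp,
                                seq2seq_model=seq2seq_Model)

# Generate a title for a new (hypothetical) issue body.
issue_body = ('installing the package with pip fails on python 3.6 '
              'with a missing dependency error')
_, predicted_title = seq2seq_inf.generate_issue_title(issue_body)
print(predicted_title)
```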

issue_summarization_github_issues/notebooks/seq2seq_utils.py

@@ -0,0 +1,425 @@
from matplotlib import pyplot as plt
import tensorflow as tf
from keras import backend as K
from keras.layers import Input
from keras.models import Model
from IPython.display import SVG, display
from keras.utils.vis_utils import model_to_dot
import logging
import numpy as np
import dill as dpickle
from annoy import AnnoyIndex
from tqdm import tqdm, tqdm_notebook
from random import random
from nltk.translate.bleu_score import corpus_bleu
def load_text_processor(fname='title_pp.dpkl'):
"""
Load preprocessors from disk.
Parameters
----------
fname: str
file name of ktext.processor object
Returns
-------
num_tokens : int
size of vocabulary loaded into ktext.processor
pp : ktext.processor
the processor you are trying to load
Typical Usage:
-------------
num_decoder_tokens, title_pp = load_text_processor(fname='title_pp.dpkl')
num_encoder_tokens, body_pp = load_text_processor(fname='body_pp.dpkl')
"""
# Load files from disk
with open(fname, 'rb') as f:
pp = dpickle.load(f)
num_tokens = max(pp.id2token.keys()) + 1
print(f'Size of vocabulary for {fname}: {num_tokens:,}')
return num_tokens, pp
def load_decoder_inputs(decoder_np_vecs='train_title_vecs.npy'):
"""
Load decoder inputs.
Parameters
----------
decoder_np_vecs : str
filename of serialized numpy.array of decoder input (issue title)
Returns
-------
decoder_input_data : numpy.array
The data fed to the decoder as input during training for teacher forcing.
This is the same as `decoder_np_vecs` except the last position.
decoder_target_data : numpy.array
The data that the decoder data is trained to generate (issue title).
Calculated by sliding `decoder_np_vecs` one position forward.
"""
vectorized_title = np.load(decoder_np_vecs)
# For Decoder Input, you don't need the last word as that is only for prediction
# when we are training using Teacher Forcing.
decoder_input_data = vectorized_title[:, :-1]
# Decoder Target Data Is Ahead By 1 Time Step From Decoder Input Data (Teacher Forcing)
decoder_target_data = vectorized_title[:, 1:]
print(f'Shape of decoder input: {decoder_input_data.shape}')
print(f'Shape of decoder target: {decoder_target_data.shape}')
return decoder_input_data, decoder_target_data
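# Illustration of the teacher-forcing slicing in load_decoder_inputs above: if a
# padded, vectorized title row is [_start_, w1, w2, _end_, 0], decoder_input_data
# keeps [_start_, w1, w2, _end_] while decoder_target_data keeps [w1, w2, _end_, 0],
# i.e. the target is the same sequence shifted one step ahead of the input.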
def load_encoder_inputs(encoder_np_vecs='train_body_vecs.npy'):
"""
Load variables & data that are inputs to encoder.
Parameters
----------
encoder_np_vecs : str
filename of serialized numpy.array of encoder input (issue body)
Returns
-------
encoder_input_data : numpy.array
The issue body
doc_length : int
The standard document length of the input for the encoder after padding
the shape of this array will be (num_examples, doc_length)
"""
vectorized_body = np.load(encoder_np_vecs)
# Encoder input is simply the body of the issue text
encoder_input_data = vectorized_body
doc_length = encoder_input_data.shape[1]
print(f'Shape of encoder input: {encoder_input_data.shape}')
return encoder_input_data, doc_length
def viz_model_architecture(model):
"""Visualize model architecture in Jupyter notebook."""
display(SVG(model_to_dot(model).create(prog='dot', format='svg')))
def free_gpu_mem():
"""Attempt to free gpu memory."""
K.get_session().close()
cfg = K.tf.ConfigProto()
cfg.gpu_options.allow_growth = True
K.set_session(K.tf.Session(config=cfg))
def test_gpu():
"""Run a toy computation task in tensorflow to test GPU."""
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.Session(config=config)
hello = tf.constant('Hello, TensorFlow!')
print(session.run(hello))
def plot_model_training_history(history_object):
"""Plots model train vs. validation loss."""
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.plot(history_object.history['loss'])
plt.plot(history_object.history['val_loss'])
plt.legend(['train', 'test'], loc='upper left')
plt.show()
def extract_encoder_model(model):
"""
Extract the encoder from the original Sequence to Sequence Model.
Returns a keras model object that has one input (body of issue) and one
output (encoding of issue, which is the last hidden state).
Input:
-----
model: keras model object
Returns:
-----
keras model object
"""
encoder_model = model.get_layer('Encoder-Model')
return encoder_model
def extract_decoder_model(model):
"""
Extract the decoder from the original model.
Inputs:
------
model: keras model object
Returns:
-------
A Keras model object with the following inputs and outputs:
Inputs of Keras Model That Is Returned:
1: the embedding index for the last predicted word or the <Start> indicator
2: the last hidden state, or in the case of the first word the hidden state from the encoder
Outputs of Keras Model That Is Returned:
1. Prediction (class probabilities) for the next word
2. The hidden state of the decoder, to be fed back into the decoder at the next time step
Implementation Notes:
----------------------
Must extract relevant layers and reconstruct part of the computation graph
to allow for different inputs as we are not going to use teacher forcing at
inference time.
"""
# the latent dimension is the same throughout the architecture so we are going to
# cheat and grab the latent dimension of the embedding because that is the same as what is
# output from the decoder
latent_dim = model.get_layer('Decoder-Word-Embedding').output_shape[-1]
# Reconstruct the input into the decoder
decoder_inputs = model.get_layer('Decoder-Input').input
dec_emb = model.get_layer('Decoder-Word-Embedding')(decoder_inputs)
dec_bn = model.get_layer('Decoder-Batchnorm-1')(dec_emb)
# Instead of setting the initial state from the encoder and forgetting about it, during inference
# we are not doing teacher forcing, so we will have to have a feedback loop from predictions back into
# the GRU, thus we define this input layer for the state so we can add this capability
gru_inference_state_input = Input(shape=(latent_dim,), name='hidden_state_input')
# we need to reuse the weights that is why we are getting this
# If you inspect the decoder GRU that we created for training, it will take as input
# 2 tensors -> (1) is the embedding layer output for the teacher forcing
# (which will now be the last step's prediction, and will be _start_ on the first time step)
# (2) is the state, which we will initialize with the encoder on the first time step, but then
# grab the state after the first prediction and feed that back in again.
gru_out, gru_state_out = model.get_layer('Decoder-GRU')([dec_bn, gru_inference_state_input])
# Reconstruct dense layers
dec_bn2 = model.get_layer('Decoder-Batchnorm-2')(gru_out)
dense_out = model.get_layer('Final-Output-Dense')(dec_bn2)
decoder_model = Model([decoder_inputs, gru_inference_state_input],
[dense_out, gru_state_out])
return decoder_model
class Seq2Seq_Inference(object):
def __init__(self,
encoder_preprocessor,
decoder_preprocessor,
seq2seq_model):
self.pp_body = encoder_preprocessor
self.pp_title = decoder_preprocessor
self.seq2seq_model = seq2seq_model
self.encoder_model = extract_encoder_model(seq2seq_model)
self.decoder_model = extract_decoder_model(seq2seq_model)
self.default_max_len_title = self.pp_title.padding_maxlen
self.nn = None
self.rec_df = None
def generate_issue_title(self,
raw_input_text,
max_len_title=None):
"""
Use the seq2seq model to generate a title given the body of an issue.
Inputs
------
raw_input: str
The body of the issue text as an input string
max_len_title: int (optional)
The maximum length of the title the model will generate
"""
if max_len_title is None:
max_len_title = self.default_max_len_title
# get the encoder's features for the decoder
raw_tokenized = self.pp_body.transform([raw_input_text])
body_encoding = self.encoder_model.predict(raw_tokenized)
# we want to save the encoder's embedding before its updated by decoder
# because we can use that as an embedding for other tasks.
original_body_encoding = body_encoding
state_value = np.array(self.pp_title.token2id['_start_']).reshape(1, 1)
decoded_sentence = []
stop_condition = False
while not stop_condition:
preds, st = self.decoder_model.predict([state_value, body_encoding])
# We are going to ignore indices 0 (padding) and indices 1 (unknown)
# Argmax will return the integer index corresponding to the
# prediction + 2 b/c we chopped off first two
pred_idx = np.argmax(preds[:, :, 2:]) + 2
# retrieve word from index prediction
pred_word_str = self.pp_title.id2token[pred_idx]
if pred_word_str == '_end_' or len(decoded_sentence) >= max_len_title:
stop_condition = True
break
decoded_sentence.append(pred_word_str)
# update the decoder for the next word
body_encoding = st
state_value = np.array(pred_idx).reshape(1, 1)
return original_body_encoding, ' '.join(decoded_sentence)
def print_example(self,
i,
body_text,
title_text,
url,
threshold):
"""
Prints an example of the model's prediction for manual inspection.
"""
if i:
print('\n\n==============================================')
print(f'============== Example # {i} =================\n')
if url:
print(url)
print(f"Issue Body:\n {body_text} \n")
if title_text:
print(f"Original Title:\n {title_text}")
emb, gen_title = self.generate_issue_title(body_text)
print(f"\n****** Machine Generated Title (Prediction) ******:\n {gen_title}")
if self.nn:
# return neighbors and distances
n, d = self.nn.get_nns_by_vector(emb.flatten(), n=4,
include_distances=True)
neighbors = n[1:]
dist = d[1:]
if min(dist) <= threshold:
cols = ['issue_url', 'issue_title', 'body']
dfcopy = self.rec_df.iloc[neighbors][cols].copy(deep=True)
dfcopy['dist'] = dist
similar_issues_df = dfcopy.query(f'dist <= {threshold}')
print("\n**** Similar Issues (using encoder embedding) ****:\n")
display(similar_issues_df)
def demo_model_predictions(self,
n,
issue_df,
threshold=1):
"""
Pick n random Issues and display predictions.
Input:
------
n : int
Number of issues to display from issue_df
issue_df : pandas DataFrame
DataFrame that contains two columns: `body` and `issue_title`.
threshold : float
distance threshold for recommendation of similar issues.
Returns:
--------
None
Prints the original issue body and the model's prediction.
"""
# Extract body and title from DF
body_text = issue_df.body.tolist()
title_text = issue_df.issue_title.tolist()
url = issue_df.issue_url.tolist()
demo_list = np.random.randint(low=1, high=len(body_text), size=n)
for i in demo_list:
self.print_example(i,
body_text=body_text[i],
title_text=title_text[i],
url=url[i],
threshold=threshold)
def prepare_recommender(self, vectorized_array, original_df):
"""
Use the annoy library to build recommender
Parameters
----------
vectorized_array : List[List[int]]
This is the list of list of integers that represents your corpus
that is fed into the seq2seq model for training.
original_df : pandas.DataFrame
This is the original dataframe that has the columns
['issue_url', 'issue_title', 'body']
Returns
-------
annoy.AnnoyIndex object (see https://github.com/spotify/annoy)
"""
self.rec_df = original_df
emb = self.encoder_model.predict(x=vectorized_array,
batch_size=vectorized_array.shape[0]//200)
f = emb.shape[1]
self.nn = AnnoyIndex(f)
logging.warning('Adding embeddings')
for i in tqdm(range(len(emb))):
self.nn.add_item(i, emb[i])
logging.warning('Building trees for similarity lookup.')
self.nn.build(50)
return self.nn
def set_recsys_data(self, original_df):
self.rec_df = original_df
def set_recsys_annoyobj(self, annoyobj):
self.nn = annoyobj
def evaluate_model(self, holdout_bodies, holdout_titles):
"""
Method for calculating BLEU Score.
Parameters
----------
holdout_bodies : List[str]
These are the issue bodies that we want to summarize
holdout_titles : List[str]
This is the ground truth we are trying to predict --> issue titles
Returns
-------
bleu : float
The BLEU Score
"""
actual, predicted = list(), list()
assert len(holdout_bodies) == len(holdout_titles)
num_examples = len(holdout_bodies)
logging.warning('Generating predictions.')
# step over the whole set TODO: parallelize this
for i in tqdm_notebook(range(num_examples)):
_, yhat = self.generate_issue_title(holdout_bodies[i])
actual.append(self.pp_title.process_text([holdout_titles[i]])[0])
predicted.append(self.pp_title.process_text([yhat])[0])
# calculate BLEU score
logging.warning('Calculating BLEU.')
bleu = corpus_bleu(actual, predicted)
return bleu
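For reference, a rough sketch of how the evaluation and recommender utilities above might be called from the tutorial notebook, reusing the `seq2seq_inf`, `traindf`, and `testdf` objects built there (the row count and threshold are illustrative, not prescribed):

```
import numpy as np

# Score the model on the holdout set with corpus BLEU.
bleu = seq2seq_inf.evaluate_model(holdout_bodies=testdf.body.tolist(),
                                  holdout_titles=testdf.issue_title.tolist())
print('BLEU on holdout set: {:.4f}'.format(bleu))

# Build an Annoy index over the encoder embeddings of the training bodies so
# that demo_model_predictions can also surface similar issues.
train_body_vecs = np.load('train_body_vecs.npy')
seq2seq_inf.prepare_recommender(vectorized_array=train_body_vecs,
                                original_df=traindf)
seq2seq_inf.demo_model_predictions(n=10, issue_df=testdf, threshold=1)
```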

requirements.txt

@@ -0,0 +1,233 @@
alabaster==0.7.10
altair==1.2.1
anaconda-client==1.6.5
anaconda-navigator==1.6.8
anaconda-project==0.8.0
annoy==1.10.0
asn1crypto==0.22.0
astroid==1.5.3
astropy==2.0.2
Babel==2.5.0
backports.functools-lru-cache==1.4
backports.shutil-get-terminal-size==1.0.0
bcolz==1.1.2
beautifulsoup4==4.6.0
bitarray==0.8.1
bkcharts==0.2
blaze==0.11.3
bleach==1.5.0
bokeh==0.12.7
boto==2.48.0
boto3==1.5.14
botocore==1.8.28
Bottleneck==1.2.1
bz2file==0.98
cachetools==2.0.1
certifi==2017.11.5
cffi==1.10.0
chardet==3.0.4
click==6.7
cloudpickle==0.4.0
clyent==1.2.2
colorama==0.3.9
conda==4.4.6
conda-build==3.0.23
conda-verify==2.0.0
contextlib2==0.5.5
cryptography==2.0.3
cycler==0.10.0
cymem==1.31.2
Cython==0.26.1
cytoolz==0.9.0
dask==0.16.1
datashape==0.5.4
decorator==4.2.1
dill==0.2.7.1
distributed==1.20.2
docopt==0.6.2
docutils==0.14
en-core-web-sm==2.0.0
entrypoints==0.2.3
et-xmlfile==1.0.1
fastcache==1.0.2
fastparquet==0.1.3
filelock==2.0.12
Flask==0.12.2
Flask-Cors==3.0.3
ftfy==4.4.3
future==0.16.0
gensim==3.2.0
gevent==1.2.2
glob2==0.5
gmpy2==2.0.8
graphviz==0.8.1
greenlet==0.4.12
h5py==2.7.1
hdfs==2.1.0
heapdict==1.0.0
html5lib==1.0.1
idna==2.6
ijson==2.3
imageio==2.2.0
imagesize==0.7.1
ipykernel==4.6.1
ipython==6.2.1
ipython-genutils==0.2.0
ipywidgets==7.0.0
isort==4.2.15
isoweek==1.3.3
itsdangerous==0.24
jdcal==1.3
jedi==0.11.0
Jinja2==2.9.6
jmespath==0.9.3
jsonschema==2.6.0
jupyter-client==5.1.0
jupyter-console==5.2.0
jupyter-core==4.3.0
jupyterlab==0.27.0
jupyterlab-launcher==0.4.0
Keras==2.1.2
ktext==0.27
lazy-object-proxy==1.3.1
llvmlite==0.20.0
locket==0.2.0
lxml==3.8.0
Markdown==2.6.9
MarkupSafe==1.0
matplotlib==2.1.0
mccabe==0.6.1
mistune==0.7.4
more-itertools==4.0.1
mpmath==0.19
msgpack==0.5.1
msgpack-numpy==0.4.2
msgpack-python==0.5.1
multipledispatch==0.4.9
multiprocess==0.70.5
murmurhash==0.28.0
navigator-updater==0.1.0
nbconvert==5.3.1
nbformat==4.4.0
networkx==2.0
nltk==3.2.5
nose==1.3.7
notebook==5.0.0
numba==0.35.0+10.g143f70e90
numexpr==2.6.2
numpy==1.14.0
numpydoc==0.7.0
odo==0.5.1
olefile==0.44
openpyxl==2.4.8
packaging==16.8
pandas==0.22.0
pandas-summary==0.0.41
pandocfilters==1.4.2
parso==0.1.0
partd==0.3.8
path.py==10.3.1
pathlib==1.0.1
pathlib2==2.3.0
pathos==0.2.1
patsy==0.4.1
pep8==1.7.0
pexpect==4.3.0
pickleshare==0.7.4
Pillow==4.3.0
pkginfo==1.4.1
plac==0.9.6
ply==3.10
pox==0.2.3
ppft==1.6.4.7.1
preshed==1.0.0
prompt-toolkit==1.0.15
protobuf==3.5.0
psutil==5.2.2
ptyprocess==0.5.2
py==1.4.34
pyarrow==0.8.0
pycodestyle==2.3.1
pycosat==0.6.3
pycparser==2.18
pycrypto==2.6.1
pycurl==7.43.0
pydot==1.2.3
pydot-ng==1.0.0
pyemd==0.4.4
pyflakes==1.5.0
Pygments==2.2.0
PyHive==0.5.0
pylint==1.7.2
pyodbc==4.0.17
pyOpenSSL==17.2.0
pyparsing==2.2.0
Pyphen==0.9.4
PySocks==1.6.7
pytest==3.2.1
python-dateutil==2.6.1
python-Levenshtein==0.12.0
pytz==2017.3
PyWavelets==0.5.2
PyYAML==3.12
pyzmq==16.0.2
QtAwesome==0.4.4
qtconsole==4.3.1
QtPy==1.3.1
regex==2017.4.5
requests==2.18.4
rope==0.10.5
ruamel-yaml==0.11.14
s3transfer==0.1.12
scikit-image==0.13.0
scikit-learn==0.19.1
scipy==1.0.0
seaborn==0.8
simplegeneric==0.8.1
singledispatch==3.4.0.3
six==1.11.0
sklearn-pandas==1.6.0
smart-open==1.5.6
snowballstemmer==1.2.1
sortedcollections==0.5.3
sortedcontainers==1.5.7
spacy==2.0.5
Sphinx==1.6.3
sphinxcontrib-websupport==1.0.1
spyder==3.2.3
SQLAlchemy==1.1.13
statsmodels==0.8.0
sympy==1.1.1
tables==3.4.2
tabulate==0.8.2
tblib==1.3.2
tensorflow-gpu==1.3.0
tensorflow-tensorboard==0.1.8
termcolor==1.1.0
terminado==0.6
testpath==0.3.1
textacy==0.5.0
thinc==6.10.2
thrift==0.10.0
toolz==0.9.0
torch==0.2.0.post4
torchtext==0.2.0
torchvision==0.1.9
tornado==4.5.2
tqdm==4.19.5
traitlets==4.3.2
typing==3.6.2
ujson==1.35
unicodecsv==0.14.1
Unidecode==1.0.22
urllib3==1.22
vega==0.4.4
wcwidth==0.1.7
webencodings==0.5.1
Werkzeug==0.12.2
widgetsnbextension==3.0.2
wrapt==1.10.11
xlrd==1.1.0
XlsxWriter==0.9.8
xlwt==1.3.0
zict==0.1.3

issue_summarization_github_issues/setup_a_kubeflow_cluster.md

@@ -0,0 +1,66 @@
# Set up Kubeflow

In this part, you will set up Kubeflow on an existing Kubernetes cluster.

## Requirements

* A Kubernetes cluster
* The `kubectl` CLI pointing to the Kubernetes cluster
  * Make sure that you can run `kubectl get nodes` from your terminal
    successfully
* The ksonnet CLI: [ks](https://ksonnet.io/#get-started)

Refer to the [user
guide](https://github.com/kubeflow/kubeflow/blob/master/user_guide.md) for
instructions on how to set up Kubeflow on your Kubernetes cluster. Specifically,
complete the [Deploy
Kubeflow](https://github.com/kubeflow/kubeflow/blob/master/user_guide.md#deploy-kubeflow)
and [Bringing up a
Notebook](https://github.com/kubeflow/kubeflow/blob/master/user_guide.md#bringing-up-a-notebook)
sections.

After completing that, you should have the following ready:

* A ksonnet app in a directory named `my-kubeflow`
* Output similar to this for `kubectl get pods`:

```
NAME                              READY     STATUS    RESTARTS   AGE
ambassador-7987df44b9-4pht8       2/2       Running   0          1m
ambassador-7987df44b9-dh5h6       2/2       Running   0          1m
ambassador-7987df44b9-qrgsm       2/2       Running   0          1m
tf-hub-0                          1/1       Running   0          1m
tf-job-operator-78757955b-qkg7s   1/1       Running   0          1m
```

* A Jupyter Notebook accessible at `http://127.0.0.1:8000`

## Provision storage for training data

We need shared persistent storage for our training data, since containers'
filesystems are ephemeral and don't have a lot of storage space.

The [Advanced
Customization](https://github.com/kubeflow/kubeflow/blob/master/user_guide.md#advanced-customization)
section of the [user
guide](https://github.com/kubeflow/kubeflow/blob/master/user_guide.md) has
instructions on how to provision a cluster-wide shared NFS.

For this example, provision a `10GB` NFS mount with the name
`github-issues-data`.

After the NFS mount is ready, delete the `tf-hub-0` pod so that it gets
recreated and picks up the NFS mount. You can delete it by running
`kubectl delete pod tf-hub-0 -n=${NAMESPACE}`.

At this point you should have a 10GB mount `/mnt/github-issues-data` in your
Jupyter Notebook pod. Check this by running `!df` in your Jupyter Notebook.
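If you prefer to verify the mount from a Python cell rather than with `!df`, a minimal sketch along these lines should work (it only uses the standard library; the exact free-space figure it prints will vary):

```
import os

mount_path = '/mnt/github-issues-data'

# The directory should exist and be a separate mount point inside the pod.
print('exists:', os.path.isdir(mount_path))
print('is a mount point:', os.path.ismount(mount_path))

# Report free space in GB; it should be roughly the 10GB provisioned above.
stats = os.statvfs(mount_path)
print('free space: {:.1f} GB'.format(stats.f_bavail * stats.f_frsize / 1e9))
```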
## Summary

* We created a ksonnet app for our Kubeflow deployment
* We created a disk for storing our training data
* We deployed the kubeflow-core component to our Kubernetes cluster
* We connected to JupyterHub and spawned a new Jupyter Notebook

Next: [Training the model using our cluster](training_the_model.md)

(BigQuery SQL query used to build the GitHub issues dataset)

@@ -0,0 +1,30 @@
SELECT
url as issue_url
-- replace more than one white-space character in a row with a single space
, REGEXP_REPLACE(title, r"\s{2,}", ' ') as issue_title
, REGEXP_REPLACE(body, r"\s{2,}", ' ') as body
FROM(
SELECT
JSON_EXTRACT(payload, '$.issue.html_url') as url
-- extract the title and body removing parentheses, brackets, and quotes
, LOWER(TRIM(REGEXP_REPLACE(JSON_EXTRACT(payload, '$.issue.title'), r"\\n|\(|\)|\[|\]|#|\*|`", ' '))) as title
, LOWER(TRIM(REGEXP_REPLACE(JSON_EXTRACT(payload, '$.issue.body'), r"\\n|\(|\)|\[|\]|#|\*|`", ' '))) as body
FROM `githubarchive.day.2017*`
WHERE
-- all days in 2017 (note: querying these tables costs money!)
_TABLE_SUFFIX BETWEEN '0101' and '1231'
and type="IssuesEvent"
-- Only want the issue at a specific point otherwise will have duplicates
and JSON_EXTRACT(payload, '$.action') = "\"opened\""
) as tbl
WHERE
-- the body must be at least 6 words long and the title at least 3 words long
-- this is an arbitrary way to filter out empty or sparse issues
ARRAY_LENGTH(SPLIT(body, ' ')) >= 6
and ARRAY_LENGTH(SPLIT(title, ' ')) >= 3
-- filter out issues that have really long titles or bodies
-- (these are outliers, and will slow tokenization down).
and LENGTH(title) <= 400
and LENGTH(body) <= 2000
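The tutorial itself downloads a pre-built `github-issues.zip` from Google Cloud Storage, so running this query is optional. If you wanted to regenerate the CSV yourself, a rough sketch using the `google-cloud-bigquery` Python client (not listed in the tutorial's requirements, so it would need to be installed separately) might look like the following, assuming the query above is saved as `github_issues.sql` (a hypothetical filename):

```
from google.cloud import bigquery

# Hypothetical project id; replace with your own GCP project.
client = bigquery.Client(project='my-gcp-project')

with open('github_issues.sql') as f:
    query = f.read()

# Run the query (standard SQL) and pull the rows into a pandas DataFrame.
# Note that scanning the githubarchive tables incurs BigQuery charges.
df = client.query(query).to_dataframe()

# Write out the columns the tutorial expects: issue_url, issue_title, body.
df.to_csv('github_issues.csv', index=False)
print('wrote {:,} rows'.format(len(df)))
```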

issue_summarization_github_issues/teardown.md

@@ -0,0 +1,20 @@
# Teardown

Delete the Kubernetes namespace:

```
kubectl delete namespace ${NAMESPACE}
```

Delete the persistent disk (PD) backing the NFS mount:

```
gcloud --project=${PROJECT} compute disks delete --zone=${ZONE} ${PD_DISK_NAME}
```

Delete the ksonnet app directory:

```
rm -rf my-kubeflow
```

issue_summarization_github_issues/training_the_model.md

@@ -0,0 +1,9 @@
# Training the model

By this point, you should have a Jupyter Notebook running at `http://127.0.0.1:8000`.

Open the Jupyter Notebook interface and create a new Terminal by clicking on New -> Terminal. In the Terminal, clone this git repo by executing: `git clone https://github.com/kubeflow/examples.git`.

Now you have all the code required to complete this tutorial in the `examples/issue_summarization_github_issues/notebooks` folder. Navigate to this folder. Here you should see two files: `Tutorial.ipynb` and `seq2seq_utils.py`. Open `Tutorial.ipynb`; it contains a complete walk-through of how to download the training data, preprocess it, and train the model.

Next: [Serving the model](serving_the_model.md)

prow_config.yaml (new file, 7 lines)

@@ -0,0 +1,7 @@
# This file configures the workflows to trigger in our Prow jobs.
# see kubeflow/testing/py/run_e2e_workflow.py
workflows:
- app_dir: kubeflow/examples/test/workflows
component: workflows
name: examples-e2e

(ksonnet incubator registry manifest)

@@ -0,0 +1,39 @@
apiVersion: "0.1"
gitVersion:
commitSha: 422d521c05aa905df949868143b26445f5e4eda5
refSpec: master
kind: ksonnet.io/registry
libraries:
apache:
path: apache
version: master
efk:
path: efk
version: master
mariadb:
path: mariadb
version: master
memcached:
path: memcached
version: master
mongodb:
path: mongodb
version: master
mysql:
path: mysql
version: master
nginx:
path: nginx
version: master
node:
path: node
version: master
postgres:
path: postgres
version: master
redis:
path: redis
version: master
tomcat:
path: tomcat
version: master

test/workflows/app.yaml (new file, 11 lines)

@@ -0,0 +1,11 @@
apiVersion: 0.0.1
kind: ksonnet.io/app
name: test-infra
registries:
incubator:
gitVersion:
commitSha: 422d521c05aa905df949868143b26445f5e4eda5
refSpec: master
protocol: github
uri: github.com/ksonnet/parts/tree/master/incubator
version: 0.0.1

test/workflows/components/params.libsonnet

@@ -0,0 +1,17 @@
{
global: {
// User-defined global parameters; accessible to all component and environments, Ex:
// replicas: 4,
},
components: {
// Component-level parameters, defined initially from 'ks prototype use ...'
// Each object below should correspond to a component in the components/ directory
workflows: {
bucket: "mlkube-testing_temp",
name: "kubeflow-examples-presubmit-test-374-6e32",
namespace: "kubeflow-test-infra",
prow: "JOB_NAME=kubeflow-examples-presubmit-test,JOB_TYPE=presubmit,PULL_NUMBER=209,REPO_NAME=examples,REPO_OWNER=kubeflow,BUILD_NUMBER=997a",
prow_env: "JOB_NAME=kubeflow-examples-presubmit-test,JOB_TYPE=presubmit,PULL_NUMBER=374,REPO_NAME=examples,REPO_OWNER=kubeflow,BUILD_NUMBER=6e32",
},
},
}

test/workflows/components/workflows.jsonnet

@@ -0,0 +1,14 @@
local params = std.extVar("__ksonnet/params").components.workflows;
local k = import 'k.libsonnet';
local workflows = import 'workflows.libsonnet';
local namespace = params.namespace;
// TODO(jlewi): Can we make name default to some random unique value?
// I didn't see any routines in the standard library for datetime or random.
local name = params.name;
local prowEnv = workflows.parseEnv(params.prow_env);
local bucket = params.bucket;
std.prune(k.core.v1.list.new([workflows.parts(namespace, name).e2e(prowEnv, bucket)]))

test/workflows/components/workflows.libsonnet

@@ -0,0 +1,244 @@
{
// TODO(https://github.com/ksonnet/ksonnet/issues/222): Taking namespace as an argument is a work around for the fact that ksonnet
// doesn't support automatically piping in the namespace from the environment to prototypes.
// convert a list of two items into a map representing an environment variable
// TODO(jlewi): Should we move this into kubeflow/core/util.libsonnet
listToMap:: function(v)
{
name: v[0],
value: v[1],
},
// Function to turn a comma-separated list of prow environment variables into a list of name/value maps.
parseEnv:: function(v)
local pieces = std.split(v, ",");
if v != "" && std.length(pieces) > 0 then
std.map(
function(i) $.listToMap(std.split(i, "=")),
std.split(v, ",")
)
else [],
parts(namespace, name):: {
// Workflow to run the e2e test.
e2e(prow_env, bucket):
// mountPath is the directory where the volume to store the test data
// should be mounted.
local mountPath = "/mnt/" + "test-data-volume";
// testDir is the root directory for all data for a particular test run.
local testDir = mountPath + "/" + name;
// outputDir is the directory to sync to GCS to contain the output for this job.
local outputDir = testDir + "/output";
local artifactsDir = outputDir + "/artifacts";
local goDir = testDir + "/go";
// Source directory where all repos should be checked out
local srcRootDir = testDir + "/src";
// The directory containing the kubeflow/examples repo
local srcDir = srcRootDir + "/kubeflow/examples";
local image = "gcr.io/mlkube-testing/test-worker";
// The name of the NFS volume claim to use for test files.
// local nfsVolumeClaim = "kubeflow-testing";
local nfsVolumeClaim = "nfs-external";
// The name to use for the volume to use to contain test data.
local dataVolume = "kubeflow-test-volume";
local versionTag = name;
// The directory within the kubeflow_testing submodule containing
// py scripts to use.
local kubeflowExamplesPy = srcDir;
local kubeflowTestingPy = srcRootDir + "/kubeflow/testing/py";
local project = "mlkube-testing";
// GKE cluster to use
// We need to truncate the cluster to no more than 40 characters because
// cluster names can be a max of 40 characters.
// We expect the suffix of the cluster name to be unique salt.
// We prepend a z because cluster name must start with an alphanumeric character
// and if we cut the prefix we might end up starting with "-" or other invalid
// character for first character.
local cluster =
if std.length(name) > 40 then
"z" + std.substr(name, std.length(name) - 39, 39)
else
name;
local zone = "us-east1-d";
local chart = srcDir + "/bin/examples-chart-0.2.1-" + versionTag + ".tgz";
{
// Build an Argo template to execute a particular command.
// step_name: Name for the template
// command: List to pass as the container command.
buildTemplate(step_name, command):: {
name: step_name,
container: {
command: command,
image: image,
workingDir: srcDir,
env: [
{
// Add the source directories to the python path.
name: "PYTHONPATH",
value: kubeflowExamplesPy + ":" + kubeflowTestingPy,
},
{
// Set the GOPATH
name: "GOPATH",
value: goDir,
},
{
name: "GOOGLE_APPLICATION_CREDENTIALS",
value: "/secret/gcp-credentials/key.json",
},
{
name: "GIT_TOKEN",
valueFrom: {
secretKeyRef: {
name: "github-token",
key: "github_token",
},
},
},
{
name: "EXTRA_REPOS",
value: "kubeflow/testing@HEAD",
},
] + prow_env,
volumeMounts: [
{
name: dataVolume,
mountPath: mountPath,
},
{
name: "github-token",
mountPath: "/secret/github-token",
},
{
name: "gcp-credentials",
mountPath: "/secret/gcp-credentials",
},
],
},
}, // buildTemplate
apiVersion: "argoproj.io/v1alpha1",
kind: "Workflow",
metadata: {
name: name,
namespace: namespace,
},
// TODO(jlewi): Use OnExit to run cleanup steps.
spec: {
entrypoint: "e2e",
volumes: [
{
name: "github-token",
secret: {
secretName: "github-token",
},
},
{
name: "gcp-credentials",
secret: {
secretName: "kubeflow-testing-credentials",
},
},
{
name: dataVolume,
persistentVolumeClaim: {
claimName: nfsVolumeClaim,
},
},
], // volumes
// onExit specifies the template that should always run when the workflow completes.
onExit: "exit-handler",
templates: [
{
name: "e2e",
steps: [
[{
name: "checkout",
template: "checkout",
}],
[
{
name: "create-pr-symlink",
template: "create-pr-symlink",
},
{
name: "py-test",
template: "py-test",
},
{
name: "py-lint",
template: "py-lint",
},
],
],
},
{
name: "exit-handler",
steps: [
[{
name: "copy-artifacts",
template: "copy-artifacts",
}],
],
},
{
name: "checkout",
container: {
command: [
"/usr/local/bin/checkout.sh",
srcRootDir,
],
env: prow_env + [{
name: "EXTRA_REPOS",
value: "kubeflow/testing@HEAD",
}],
image: image,
volumeMounts: [
{
name: dataVolume,
mountPath: mountPath,
},
],
},
}, // checkout
$.parts(namespace, name).e2e(prow_env, bucket).buildTemplate("py-test", [
"python",
"-m",
"kubeflow.testing.py_checks",
"test",
"--src_dir=" + srcDir,
"--project=mlkube-testing",
"--junit_path=" + artifactsDir + "/junit_pycheckstest.xml",
]), // py test
$.parts(namespace, name).e2e(prow_env, bucket).buildTemplate("py-lint", [
"python",
"-m",
"kubeflow.testing.py_checks",
"lint",
"--src_dir=" + srcDir,
"--project=mlkube-testing",
"--junit_path=" + artifactsDir + "/junit_pycheckslint.xml",
]), // py lint
$.parts(namespace, name).e2e(prow_env, bucket).buildTemplate("create-pr-symlink", [
"python",
"-m",
"kubeflow.testing.prow_artifacts",
"--artifacts_dir=" + outputDir,
"create_pr_symlink",
"--bucket=" + bucket,
]), // create-pr-symlink
$.parts(namespace, name).e2e(prow_env, bucket).buildTemplate("copy-artifacts", [
"python",
"-m",
"kubeflow.testing.prow_artifacts",
"--artifacts_dir=" + outputDir,
"copy_artifacts",
"--bucket=" + bucket,
]), // copy-artifacts
], // templates
},
}, // e2e
}, // parts
}

test/workflows/environments/base.libsonnet

@@ -0,0 +1,4 @@
local components = std.extVar("__ksonnet/components");
components + {
// Insert user-specified overrides here.
}

k.libsonnet (ksonnet helper library)

@@ -0,0 +1,80 @@
local k8s = import "k8s.libsonnet";
local apps = k8s.apps;
local core = k8s.core;
local extensions = k8s.extensions;
local hidden = {
mapContainers(f):: {
local podContainers = super.spec.template.spec.containers,
spec+: {
template+: {
spec+: {
// IMPORTANT: This overwrites the 'containers' field
// for this deployment.
containers: std.map(f, podContainers),
},
},
},
},
mapContainersWithName(names, f) ::
local nameSet =
if std.type(names) == "array"
then std.set(names)
else std.set([names]);
local inNameSet(name) = std.length(std.setInter(nameSet, std.set([name]))) > 0;
self.mapContainers(
function(c)
if std.objectHas(c, "name") && inNameSet(c.name)
then f(c)
else c
),
};
k8s + {
apps:: apps + {
v1beta1:: apps.v1beta1 + {
local v1beta1 = apps.v1beta1,
daemonSet:: v1beta1.daemonSet + {
mapContainers(f):: hidden.mapContainers(f),
mapContainersWithName(names, f):: hidden.mapContainersWithName(names, f),
},
deployment:: v1beta1.deployment + {
mapContainers(f):: hidden.mapContainers(f),
mapContainersWithName(names, f):: hidden.mapContainersWithName(names, f),
},
},
},
core:: core + {
v1:: core.v1 + {
list:: {
new(items)::
{apiVersion: "v1"} +
{kind: "List"} +
self.items(items),
items(items):: if std.type(items) == "array" then {items+: items} else {items+: [items]},
},
},
},
extensions:: extensions + {
v1beta1:: extensions.v1beta1 + {
local v1beta1 = extensions.v1beta1,
daemonSet:: v1beta1.daemonSet + {
mapContainers(f):: hidden.mapContainers(f),
mapContainersWithName(names, f):: hidden.mapContainersWithName(names, f),
},
deployment:: v1beta1.deployment + {
mapContainers(f):: hidden.mapContainers(f),
mapContainersWithName(names, f):: hidden.mapContainersWithName(names, f),
},
},
},
}

File diff suppressed because it is too large.

File diff suppressed because it is too large.

(ksonnet environment main.jsonnet)

@@ -0,0 +1,7 @@
local base = import "../base.libsonnet";
local k = import "k.libsonnet";
base + {
// Insert user-specified overrides here. For example if a component is named "nginx-deployment", you might have something like:
// "nginx-deployment"+: k.deployment.mixin.metadata.labels({foo: "bar"})
}

(ksonnet environment params.libsonnet)

@@ -0,0 +1,10 @@
local params = import "../../components/params.libsonnet";
params + {
components +: {
// Insert component parameter overrides here. Ex:
// guestbook +: {
// name: "guestbook-dev",
// replicas: params.global.replicas,
// },
},
}

(ksonnet environment spec.json)

@@ -0,0 +1,4 @@
{
"server": "https://35.196.185.88",
"namespace": "kubeflow-test-infra"
}