mirror of https://github.com/kubeflow/examples.git
Merge remote-tracking branch 'upstream/master' into contributing
commit 0ad22a29e1
@ -0,0 +1,41 @@
|
|||
# pkg and bin directories currently contain build artifacts
|
||||
# only so we exclude them.
|
||||
bin/
|
||||
vendor/
|
||||
|
||||
.vscode/
|
||||
|
||||
# Compiled python files.
|
||||
*.pyc
|
||||
|
||||
# Emacs temporary files
|
||||
*~
|
||||
|
||||
# Other temporary files
|
||||
.DS_Store
|
||||
|
||||
# temporary files from emacs flymd-mode
|
||||
flymd.*
|
||||
|
||||
# vim .swp files
|
||||
.swp
|
||||
|
||||
# Files created by Gogland IDE
|
||||
.idea/
|
||||
|
||||
# Exclude wheel files for now.
|
||||
# The only wheel file is the TF wheel one which is quite large.
|
||||
# We don't want to check that into source control because it could be
|
||||
# quite large.
|
||||
*.whl
|
||||
|
||||
# Bazel files
|
||||
**/bazel-*
|
||||
# Examples egg
|
||||
examples/tf_sample/tf_sample.egg-info/
|
||||
examples/.ipynb_checkpoints/
|
||||
|
||||
**/.ipynb_checkpoints
|
||||
|
||||
# pyenv
|
||||
.python-version
|
||||
|
|
@ -0,0 +1,21 @@
|
|||
# TODO(jlewi): We should probably have OWNERs files in subdirectories that
|
||||
# list approvers for individual components (e.g. Seldon folks for Seldon component)
|
||||
approvers:
|
||||
- ankushagarwal
|
||||
- DjangoPeng
|
||||
- gaocegege
|
||||
- jlewi
|
||||
- llunn
|
||||
- ScorpioCPH
|
||||
reviewers:
|
||||
- ankushagarwal
|
||||
- DjangoPeng
|
||||
- gaocegege
|
||||
- Jimexist
|
||||
- jlewi
|
||||
- llunn
|
||||
- nkashy1
|
||||
- ScorpioCPH
|
||||
- texasmichelle
|
||||
- wbuchwalter
|
||||
- zjj2wry
|
||||
|
|
@ -1,2 +1,3 @@
# examples
A repository to host extended examples and tutorials
## A repository to host extended examples and tutorials for Kubeflow.

1. [GitHub issue summarization using sequence-to-sequence learning](./issue_summarization_github_issues) by [Hamel Husain](https://github.com/hamelsmu)

@ -0,0 +1,30 @@
# [WIP] End-to-End Kubeflow tutorial using a Sequence-to-Sequence model

This example demonstrates how you can use `kubeflow` end-to-end to train and
serve a Sequence-to-Sequence model on an existing Kubernetes cluster. This
tutorial is based on @hamelsmu's article ["How To Create Data Products That
Are Magical Using Sequence-to-Sequence
Models"](https://medium.com/@hamelhusain/how-to-create-data-products-that-are-magical-using-sequence-to-sequence-models-703f86a231f8).

## Goals

There are two primary goals for this tutorial:

* An end-to-end Kubeflow example
* An end-to-end Sequence-to-Sequence model

By the end of this tutorial, you should know how to:

* Set up a Kubeflow cluster on an existing Kubernetes deployment
* Spawn a Jupyter Notebook on the cluster
* Provision shared persistent storage across the cluster to store large
  datasets
* Train a Sequence-to-Sequence model with TensorFlow on the cluster using
  GPUs (a minimal sketch of this step follows below)
* Serve the model using TensorFlow Serving

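To make the training step concrete before you set anything up, here is a condensed, illustrative sketch of what the tutorial notebook (`Tutorial.ipynb` in this example) does: preprocess issue bodies and titles with `ktext`, then train a GRU encoder-decoder in Keras. The CSV path assumes the `github-issues-data` symlink that the notebook creates, and the hyperparameters mirror the notebook's defaults; treat this as orientation, not a replacement for running the notebook on the cluster.

```python
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from ktext.preprocess import processor
from keras.models import Model
from keras.layers import Input, GRU, Dense, Embedding, BatchNormalization
from keras import optimizers

# Load the scraped GitHub issues (path assumes the notebook's symlink).
traindf, testdf = train_test_split(
    pd.read_csv('github-issues-data/github_issues.csv'), test_size=0.10)

# Clean, tokenize, and pad the issue bodies (encoder input) and titles (decoder target).
body_pp = processor(keep_n=8000, padding_maxlen=70)
title_pp = processor(append_indicators=True, keep_n=4500, padding_maxlen=12, padding='post')
train_body_vecs = body_pp.fit_transform(traindf.body.tolist())
train_title_vecs = title_pp.fit_transform(traindf.issue_title.tolist())

num_encoder_tokens = max(body_pp.id2token.keys()) + 1
num_decoder_tokens = max(title_pp.id2token.keys()) + 1
latent_dim = 300

# Encoder: embed the issue body and keep only the final GRU hidden state.
encoder_inputs = Input(shape=(train_body_vecs.shape[1],), name='Encoder-Input')
x = Embedding(num_encoder_tokens, latent_dim, name='Body-Word-Embedding')(encoder_inputs)
x = BatchNormalization(name='Encoder-Batchnorm-1')(x)
_, state_h = GRU(latent_dim, return_state=True, name='Encoder-Last-GRU')(x)
encoder_model = Model(encoder_inputs, state_h, name='Encoder-Model')

# Decoder: teacher forcing on the shifted issue titles, seeded with the encoder state.
decoder_inputs = Input(shape=(None,), name='Decoder-Input')
dec = Embedding(num_decoder_tokens, latent_dim, name='Decoder-Word-Embedding')(decoder_inputs)
dec = BatchNormalization(name='Decoder-Batchnorm-1')(dec)
dec, _ = GRU(latent_dim, return_state=True, return_sequences=True,
             name='Decoder-GRU')(dec, initial_state=encoder_model(encoder_inputs))
dec = BatchNormalization(name='Decoder-Batchnorm-2')(dec)
decoder_outputs = Dense(num_decoder_tokens, activation='softmax', name='Final-Output-Dense')(dec)

seq2seq_model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
seq2seq_model.compile(optimizer=optimizers.Nadam(lr=0.001),
                      loss='sparse_categorical_crossentropy')
seq2seq_model.fit([train_body_vecs, train_title_vecs[:, :-1]],
                  np.expand_dims(train_title_vecs[:, 1:], -1),
                  batch_size=1200, epochs=7, validation_split=0.12)
```

The notebook additionally saves the preprocessors and model to disk and rebuilds an inference-only decoder via `seq2seq_utils.py`.
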
## Steps

1. [Set up a Kubeflow cluster](setup_a_kubeflow_cluster.md)
1. [Training the model](training_the_model.md)
1. [Teardown](teardown.md)

@ -0,0 +1,510 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"toc": true
|
||||
},
|
||||
"source": [
|
||||
"<h1>Table of Contents<span class=\"tocSkip\"></span></h1>\n",
|
||||
"<div class=\"toc\"><ul class=\"toc-item\"><li><span><a href=\"#Process-Data\" data-toc-modified-id=\"Process-Data-1\"><span class=\"toc-item-num\">1 </span>Process Data</a></span></li><li><span><a href=\"#Pre-Process-Data-For-Deep-Learning\" data-toc-modified-id=\"Pre-Process-Data-For-Deep-Learning-2\"><span class=\"toc-item-num\">2 </span>Pre-Process Data For Deep Learning</a></span><ul class=\"toc-item\"><li><ul class=\"toc-item\"><li><ul class=\"toc-item\"><li><span><a href=\"#Look-at-one-example-of-processed-issue-bodies\" data-toc-modified-id=\"Look-at-one-example-of-processed-issue-bodies-2.0.0.1\"><span class=\"toc-item-num\">2.0.0.1 </span>Look at one example of processed issue bodies</a></span></li><li><span><a href=\"#Look-at-one-example-of-processed-issue-titles\" data-toc-modified-id=\"Look-at-one-example-of-processed-issue-titles-2.0.0.2\"><span class=\"toc-item-num\">2.0.0.2 </span>Look at one example of processed issue titles</a></span></li></ul></li></ul></li></ul></li><li><span><a href=\"#Define-Model-Architecture\" data-toc-modified-id=\"Define-Model-Architecture-3\"><span class=\"toc-item-num\">3 </span>Define Model Architecture</a></span><ul class=\"toc-item\"><li><ul class=\"toc-item\"><li><span><a href=\"#Load-the-data-from-disk-into-variables\" data-toc-modified-id=\"Load-the-data-from-disk-into-variables-3.0.1\"><span class=\"toc-item-num\">3.0.1 </span>Load the data from disk into variables</a></span></li><li><span><a href=\"#Define-Model-Architecture\" data-toc-modified-id=\"Define-Model-Architecture-3.0.2\"><span class=\"toc-item-num\">3.0.2 </span>Define Model Architecture</a></span></li></ul></li></ul></li><li><span><a href=\"#Train-Model\" data-toc-modified-id=\"Train-Model-4\"><span class=\"toc-item-num\">4 </span>Train Model</a></span></li><li><span><a href=\"#See-Results-On-Holdout-Set\" data-toc-modified-id=\"See-Results-On-Holdout-Set-5\"><span class=\"toc-item-num\">5 </span>See Results On Holdout Set</a></span></li><li><span><a href=\"#Feature-Extraction-Demo\" data-toc-modified-id=\"Feature-Extraction-Demo-6\"><span class=\"toc-item-num\">6 </span>Feature Extraction Demo</a></span><ul class=\"toc-item\"><li><ul class=\"toc-item\"><li><span><a href=\"#Example-1:-Issues-Installing-Python-Packages\" data-toc-modified-id=\"Example-1:-Issues-Installing-Python-Packages-6.0.1\"><span class=\"toc-item-num\">6.0.1 </span>Example 1: Issues Installing Python Packages</a></span></li><li><span><a href=\"#Example-2:--Issues-asking-for-feature-improvements\" data-toc-modified-id=\"Example-2:--Issues-asking-for-feature-improvements-6.0.2\"><span class=\"toc-item-num\">6.0.2 </span>Example 2: Issues asking for feature improvements</a></span></li></ul></li></ul></li></ul></div>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import logging\n",
|
||||
"import glob\n",
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"pd.set_option('display.max_colwidth', 500)\n",
|
||||
"logger = logging.getLogger()\n",
|
||||
"logger.setLevel(logging.WARNING)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Download Data"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Ensure that the github-issues-data volume is mounted in /mnt\n",
|
||||
"!ls -la /mnt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Download the github-issues.zip training data to /mnt/github-issues-data\n",
|
||||
"!wget --directory-prefix=/mnt/github-issues-data https://storage.googleapis.com/kubeflow-examples/github-issue-summarization-data/github-issues.zip\n",
|
||||
"\n",
|
||||
"# Unzip the file into /mnt/github-issues-data directory\n",
|
||||
"!unzip /mnt/github-issues-data/github-issues.zip -d /mnt/github-issues-data"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Create a symlink from <current_directory>/github-issues-data to /mnt/github-issues-data\n",
|
||||
"!ln -sf /mnt/github-issues-data github-issues-data"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Make sure that the github-issues-data symlink is created\n",
|
||||
"!ls -lh github-issues-data/github_issues.csv"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Process Data"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Split data into train and test set and preview data"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data_file='github-issues-data/github_issues.csv'\n",
|
||||
"\n",
|
||||
"# read in data sample 2000 rows (for speed of tutorial)\n",
|
||||
"# Set this to False to train on the entire dataset\n",
|
||||
"use_sample_data=True\n",
|
||||
"\n",
|
||||
"if use_sample_data:\n",
|
||||
" training_data_size=2000\n",
|
||||
" traindf, testdf = train_test_split(pd.read_csv(data_file).sample(n=training_data_size), \n",
|
||||
" test_size=.10)\n",
|
||||
"else:\n",
|
||||
" traindf, testdf = train_test_split(pd.read_csv(data_file),test_size=.10)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"#print out stats about shape of data\n",
|
||||
"print(f'Train: {traindf.shape[0]:,} rows {traindf.shape[1]:,} columns')\n",
|
||||
"print(f'Test: {testdf.shape[0]:,} rows {testdf.shape[1]:,} columns')\n",
|
||||
"\n",
|
||||
"# preview data\n",
|
||||
"traindf.head(3)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Convert to lists in preparation for modeling**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"train_body_raw = traindf.body.tolist()\n",
|
||||
"train_title_raw = traindf.issue_title.tolist()\n",
|
||||
"#preview output of first element\n",
|
||||
"train_body_raw[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Pre-Process Data For Deep Learning\n",
|
||||
"\n",
|
||||
"See [this repo](https://github.com/hamelsmu/ktext) for documentation on the ktext package"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%reload_ext autoreload\n",
|
||||
"%autoreload 2\n",
|
||||
"from ktext.preprocess import processor"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%%time\n",
|
||||
"# Clean, tokenize, and apply padding / truncating such that each document length = 70\n",
|
||||
"# also, retain only the top 8,000 words in the vocabulary and set the remaining words\n",
|
||||
"# to 1 which will become common index for rare words \n",
|
||||
"body_pp = processor(keep_n=8000, padding_maxlen=70)\n",
|
||||
"train_body_vecs = body_pp.fit_transform(train_body_raw)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Look at one example of processed issue bodies"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print('\\noriginal string:\\n', train_body_raw[0], '\\n')\n",
|
||||
"print('after pre-processing:\\n', train_body_vecs[0], '\\n')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Instantiate a text processor for the titles, with some different parameters\n",
|
||||
"# append_indicators = True appends the tokens '_start_' and '_end_' to each\n",
|
||||
"# document\n",
|
||||
"# padding = 'post' means that zero padding is appended to the end of the \n",
|
||||
"# of the document (as opposed to the default which is 'pre')\n",
|
||||
"title_pp = processor(append_indicators=True, keep_n=4500, \n",
|
||||
" padding_maxlen=12, padding ='post')\n",
|
||||
"\n",
|
||||
"# process the title data\n",
|
||||
"train_title_vecs = title_pp.fit_transform(train_title_raw)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Look at one example of processed issue titles"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print('\\noriginal string:\\n', train_title_raw[0])\n",
|
||||
"print('after pre-processing:\\n', train_title_vecs[0])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Serialize all of this to disk for later use"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import dill as dpickle\n",
|
||||
"import numpy as np\n",
|
||||
"\n",
|
||||
"# Save the preprocessor\n",
|
||||
"with open('body_pp.dpkl', 'wb') as f:\n",
|
||||
" dpickle.dump(body_pp, f)\n",
|
||||
"\n",
|
||||
"with open('title_pp.dpkl', 'wb') as f:\n",
|
||||
" dpickle.dump(title_pp, f)\n",
|
||||
"\n",
|
||||
"# Save the processed data\n",
|
||||
"np.save('train_title_vecs.npy', train_title_vecs)\n",
|
||||
"np.save('train_body_vecs.npy', train_body_vecs)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Define Model Architecture"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Load the data from disk into variables"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from seq2seq_utils import load_decoder_inputs, load_encoder_inputs, load_text_processor"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"encoder_input_data, doc_length = load_encoder_inputs('train_body_vecs.npy')\n",
|
||||
"decoder_input_data, decoder_target_data = load_decoder_inputs('train_title_vecs.npy')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"num_encoder_tokens, body_pp = load_text_processor('body_pp.dpkl')\n",
|
||||
"num_decoder_tokens, title_pp = load_text_processor('title_pp.dpkl')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Define Model Architecture"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%matplotlib inline\n",
|
||||
"from keras.models import Model\n",
|
||||
"from keras.layers import Input, LSTM, GRU, Dense, Embedding, Bidirectional, BatchNormalization\n",
|
||||
"from keras import optimizers"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#arbitrarly set latent dimension for embedding and hidden units\n",
|
||||
"latent_dim = 300\n",
|
||||
"\n",
|
||||
"##### Define Model Architecture ######\n",
|
||||
"\n",
|
||||
"########################\n",
|
||||
"#### Encoder Model ####\n",
|
||||
"encoder_inputs = Input(shape=(doc_length,), name='Encoder-Input')\n",
|
||||
"\n",
|
||||
"# Word embeding for encoder (ex: Issue Body)\n",
|
||||
"x = Embedding(num_encoder_tokens, latent_dim, name='Body-Word-Embedding', mask_zero=False)(encoder_inputs)\n",
|
||||
"x = BatchNormalization(name='Encoder-Batchnorm-1')(x)\n",
|
||||
"\n",
|
||||
"# Intermediate GRU layer (optional)\n",
|
||||
"#x = GRU(latent_dim, name='Encoder-Intermediate-GRU', return_sequences=True)(x)\n",
|
||||
"#x = BatchNormalization(name='Encoder-Batchnorm-2')(x)\n",
|
||||
"\n",
|
||||
"# We do not need the `encoder_output` just the hidden state.\n",
|
||||
"_, state_h = GRU(latent_dim, return_state=True, name='Encoder-Last-GRU')(x)\n",
|
||||
"\n",
|
||||
"# Encapsulate the encoder as a separate entity so we can just \n",
|
||||
"# encode without decoding if we want to.\n",
|
||||
"encoder_model = Model(inputs=encoder_inputs, outputs=state_h, name='Encoder-Model')\n",
|
||||
"\n",
|
||||
"seq2seq_encoder_out = encoder_model(encoder_inputs)\n",
|
||||
"\n",
|
||||
"########################\n",
|
||||
"#### Decoder Model ####\n",
|
||||
"decoder_inputs = Input(shape=(None,), name='Decoder-Input') # for teacher forcing\n",
|
||||
"\n",
|
||||
"# Word Embedding For Decoder (ex: Issue Titles)\n",
|
||||
"dec_emb = Embedding(num_decoder_tokens, latent_dim, name='Decoder-Word-Embedding', mask_zero=False)(decoder_inputs)\n",
|
||||
"dec_bn = BatchNormalization(name='Decoder-Batchnorm-1')(dec_emb)\n",
|
||||
"\n",
|
||||
"# Set up the decoder, using `decoder_state_input` as initial state.\n",
|
||||
"decoder_gru = GRU(latent_dim, return_state=True, return_sequences=True, name='Decoder-GRU')\n",
|
||||
"decoder_gru_output, _ = decoder_gru(dec_bn, initial_state=seq2seq_encoder_out)\n",
|
||||
"x = BatchNormalization(name='Decoder-Batchnorm-2')(decoder_gru_output)\n",
|
||||
"\n",
|
||||
"# Dense layer for prediction\n",
|
||||
"decoder_dense = Dense(num_decoder_tokens, activation='softmax', name='Final-Output-Dense')\n",
|
||||
"decoder_outputs = decoder_dense(x)\n",
|
||||
"\n",
|
||||
"########################\n",
|
||||
"#### Seq2Seq Model ####\n",
|
||||
"\n",
|
||||
"#seq2seq_decoder_out = decoder_model([decoder_inputs, seq2seq_encoder_out])\n",
|
||||
"seq2seq_Model = Model([encoder_inputs, decoder_inputs], decoder_outputs)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"seq2seq_Model.compile(optimizer=optimizers.Nadam(lr=0.001), loss='sparse_categorical_crossentropy')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"** Examine Model Architecture Summary **"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from seq2seq_utils import viz_model_architecture\n",
|
||||
"seq2seq_Model.summary()\n",
|
||||
"viz_model_architecture(seq2seq_Model)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Train Model"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from keras.callbacks import CSVLogger, ModelCheckpoint\n",
|
||||
"\n",
|
||||
"script_name_base = 'tutorial_seq2seq'\n",
|
||||
"csv_logger = CSVLogger('{:}.log'.format(script_name_base))\n",
|
||||
"model_checkpoint = ModelCheckpoint('{:}.epoch{{epoch:02d}}-val{{val_loss:.5f}}.hdf5'.format(script_name_base),\n",
|
||||
" save_best_only=True)\n",
|
||||
"\n",
|
||||
"batch_size = 1200\n",
|
||||
"epochs = 7\n",
|
||||
"history = seq2seq_Model.fit([encoder_input_data, decoder_input_data], np.expand_dims(decoder_target_data, -1),\n",
|
||||
" batch_size=batch_size,\n",
|
||||
" epochs=epochs,\n",
|
||||
" validation_split=0.12, callbacks=[csv_logger, model_checkpoint])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#save model\n",
|
||||
"seq2seq_Model.save('seq2seq_model_tutorial.h5')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# See Results On Holdout Set"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from seq2seq_utils import Seq2Seq_Inference\n",
|
||||
"seq2seq_inf = Seq2Seq_Inference(encoder_preprocessor=body_pp,\n",
|
||||
" decoder_preprocessor=title_pp,\n",
|
||||
" seq2seq_model=seq2seq_Model)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# this method displays the predictions on random rows of the holdout set\n",
|
||||
"seq2seq_inf.demo_model_predictions(n=50, issue_df=testdf)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.3"
|
||||
},
|
||||
"toc": {
|
||||
"nav_menu": {
|
||||
"height": "263px",
|
||||
"width": "352px"
|
||||
},
|
||||
"number_sections": true,
|
||||
"sideBar": true,
|
||||
"skip_h1_title": false,
|
||||
"title_cell": "Table of Contents",
|
||||
"title_sidebar": "Contents",
|
||||
"toc_cell": true,
|
||||
"toc_position": {},
|
||||
"toc_section_display": true,
|
||||
"toc_window_display": false
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
|
|
@ -0,0 +1,425 @@
|
|||
from matplotlib import pyplot as plt
|
||||
import tensorflow as tf
|
||||
from keras import backend as K
|
||||
from keras.layers import Input
|
||||
from keras.models import Model
|
||||
from IPython.display import SVG, display
|
||||
from keras.utils.vis_utils import model_to_dot
|
||||
import logging
|
||||
import numpy as np
|
||||
import dill as dpickle
|
||||
from annoy import AnnoyIndex
|
||||
from tqdm import tqdm, tqdm_notebook
|
||||
from random import random
|
||||
from nltk.translate.bleu_score import corpus_bleu
|
||||
|
||||
|
||||
def load_text_processor(fname='title_pp.dpkl'):
|
||||
"""
|
||||
Load preprocessors from disk.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
fname: str
|
||||
        file name of ktext.processor object
|
||||
|
||||
Returns
|
||||
-------
|
||||
num_tokens : int
|
||||
size of vocabulary loaded into ktext.processor
|
||||
pp : ktext.processor
|
||||
the processor you are trying to load
|
||||
|
||||
Typical Usage:
|
||||
-------------
|
||||
|
||||
num_decoder_tokens, title_pp = load_text_processor(fname='title_pp.dpkl')
|
||||
num_encoder_tokens, body_pp = load_text_processor(fname='body_pp.dpkl')
|
||||
|
||||
"""
|
||||
# Load files from disk
|
||||
with open(fname, 'rb') as f:
|
||||
pp = dpickle.load(f)
|
||||
|
||||
num_tokens = max(pp.id2token.keys()) + 1
|
||||
print(f'Size of vocabulary for {fname}: {num_tokens:,}')
|
||||
return num_tokens, pp
|
||||
|
||||
|
||||
def load_decoder_inputs(decoder_np_vecs='train_title_vecs.npy'):
|
||||
"""
|
||||
Load decoder inputs.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
decoder_np_vecs : str
|
||||
filename of serialized numpy.array of decoder input (issue title)
|
||||
|
||||
Returns
|
||||
-------
|
||||
decoder_input_data : numpy.array
|
||||
The data fed to the decoder as input during training for teacher forcing.
|
||||
This is the same as `decoder_np_vecs` except the last position.
|
||||
decoder_target_data : numpy.array
|
||||
        The data that the decoder is trained to generate (issue title).
|
||||
Calculated by sliding `decoder_np_vecs` one position forward.
|
||||
|
||||
"""
|
||||
vectorized_title = np.load(decoder_np_vecs)
|
||||
# For Decoder Input, you don't need the last word as that is only for prediction
|
||||
# when we are training using Teacher Forcing.
|
||||
decoder_input_data = vectorized_title[:, :-1]
|
||||
|
||||
# Decoder Target Data Is Ahead By 1 Time Step From Decoder Input Data (Teacher Forcing)
|
||||
decoder_target_data = vectorized_title[:, 1:]
|
||||
|
||||
print(f'Shape of decoder input: {decoder_input_data.shape}')
|
||||
print(f'Shape of decoder target: {decoder_target_data.shape}')
|
||||
return decoder_input_data, decoder_target_data
|
||||
|
||||
|
||||
def load_encoder_inputs(encoder_np_vecs='train_body_vecs.npy'):
|
||||
"""
|
||||
Load variables & data that are inputs to encoder.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
encoder_np_vecs : str
|
||||
        filename of serialized numpy.array of encoder input (issue body)
|
||||
|
||||
Returns
|
||||
-------
|
||||
encoder_input_data : numpy.array
|
||||
The issue body
|
||||
doc_length : int
|
||||
The standard document length of the input for the encoder after padding
|
||||
the shape of this array will be (num_examples, doc_length)
|
||||
|
||||
"""
|
||||
vectorized_body = np.load(encoder_np_vecs)
|
||||
# Encoder input is simply the body of the issue text
|
||||
encoder_input_data = vectorized_body
|
||||
doc_length = encoder_input_data.shape[1]
|
||||
print(f'Shape of encoder input: {encoder_input_data.shape}')
|
||||
return encoder_input_data, doc_length
|
||||
|
||||
|
||||
def viz_model_architecture(model):
|
||||
"""Visualize model architecture in Jupyter notebook."""
|
||||
display(SVG(model_to_dot(model).create(prog='dot', format='svg')))
|
||||
|
||||
|
||||
def free_gpu_mem():
|
||||
"""Attempt to free gpu memory."""
|
||||
K.get_session().close()
|
||||
cfg = K.tf.ConfigProto()
|
||||
cfg.gpu_options.allow_growth = True
|
||||
K.set_session(K.tf.Session(config=cfg))
|
||||
|
||||
|
||||
def test_gpu():
|
||||
"""Run a toy computation task in tensorflow to test GPU."""
|
||||
config = tf.ConfigProto()
|
||||
config.gpu_options.allow_growth = True
|
||||
session = tf.Session(config=config)
|
||||
hello = tf.constant('Hello, TensorFlow!')
|
||||
print(session.run(hello))
|
||||
|
||||
|
||||
def plot_model_training_history(history_object):
|
||||
"""Plots model train vs. validation loss."""
|
||||
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.plot(history_object.history['loss'])
    plt.plot(history_object.history['val_loss'])
    plt.legend(['train', 'validation'], loc='upper left')
|
||||
plt.show()
|
||||
|
||||
|
||||
def extract_encoder_model(model):
|
||||
"""
|
||||
Extract the encoder from the original Sequence to Sequence Model.
|
||||
|
||||
Returns a keras model object that has one input (body of issue) and one
|
||||
output (encoding of issue, which is the last hidden state).
|
||||
|
||||
Input:
|
||||
-----
|
||||
model: keras model object
|
||||
|
||||
Returns:
|
||||
-----
|
||||
keras model object
|
||||
|
||||
"""
|
||||
encoder_model = model.get_layer('Encoder-Model')
|
||||
return encoder_model
|
||||
|
||||
|
||||
def extract_decoder_model(model):
|
||||
"""
|
||||
Extract the decoder from the original model.
|
||||
|
||||
Inputs:
|
||||
------
|
||||
model: keras model object
|
||||
|
||||
Returns:
|
||||
-------
|
||||
A Keras model object with the following inputs and outputs:
|
||||
|
||||
Inputs of Keras Model That Is Returned:
|
||||
1: the embedding index for the last predicted word or the <Start> indicator
|
||||
2: the last hidden state, or in the case of the first word the hidden state from the encoder
|
||||
|
||||
Outputs of Keras Model That Is Returned:
|
||||
1. Prediction (class probabilities) for the next word
|
||||
2. The hidden state of the decoder, to be fed back into the decoder at the next time step
|
||||
|
||||
Implementation Notes:
|
||||
----------------------
|
||||
Must extract relevant layers and reconstruct part of the computation graph
|
||||
to allow for different inputs as we are not going to use teacher forcing at
|
||||
inference time.
|
||||
|
||||
"""
|
||||
# the latent dimension is the same throughout the architecture so we are going to
|
||||
# cheat and grab the latent dimension of the embedding because that is the same as what is
|
||||
# output from the decoder
|
||||
latent_dim = model.get_layer('Decoder-Word-Embedding').output_shape[-1]
|
||||
|
||||
# Reconstruct the input into the decoder
|
||||
decoder_inputs = model.get_layer('Decoder-Input').input
|
||||
dec_emb = model.get_layer('Decoder-Word-Embedding')(decoder_inputs)
|
||||
dec_bn = model.get_layer('Decoder-Batchnorm-1')(dec_emb)
|
||||
|
||||
    # During inference we are not doing teacher forcing, so instead of setting the
    # initial state from the encoder and forgetting about it, we need a feedback loop
    # that passes the predicted state back into the GRU. This input layer carries that state.
|
||||
gru_inference_state_input = Input(shape=(latent_dim,), name='hidden_state_input')
|
||||
|
||||
    # We need to reuse the trained weights, which is why we fetch this layer from the model.
|
||||
# If you inspect the decoder GRU that we created for training, it will take as input
|
||||
# 2 tensors -> (1) is the embedding layer output for the teacher forcing
|
||||
# (which will now be the last step's prediction, and will be _start_ on the first time step)
|
||||
# (2) is the state, which we will initialize with the encoder on the first time step, but then
|
||||
# grab the state after the first prediction and feed that back in again.
|
||||
gru_out, gru_state_out = model.get_layer('Decoder-GRU')([dec_bn, gru_inference_state_input])
|
||||
|
||||
# Reconstruct dense layers
|
||||
dec_bn2 = model.get_layer('Decoder-Batchnorm-2')(gru_out)
|
||||
dense_out = model.get_layer('Final-Output-Dense')(dec_bn2)
|
||||
decoder_model = Model([decoder_inputs, gru_inference_state_input],
|
||||
[dense_out, gru_state_out])
|
||||
return decoder_model
|
||||
|
||||
|
||||
class Seq2Seq_Inference(object):
|
||||
def __init__(self,
|
||||
encoder_preprocessor,
|
||||
decoder_preprocessor,
|
||||
seq2seq_model):
|
||||
|
||||
self.pp_body = encoder_preprocessor
|
||||
self.pp_title = decoder_preprocessor
|
||||
self.seq2seq_model = seq2seq_model
|
||||
self.encoder_model = extract_encoder_model(seq2seq_model)
|
||||
self.decoder_model = extract_decoder_model(seq2seq_model)
|
||||
self.default_max_len_title = self.pp_title.padding_maxlen
|
||||
self.nn = None
|
||||
self.rec_df = None
|
||||
|
||||
def generate_issue_title(self,
|
||||
raw_input_text,
|
||||
max_len_title=None):
|
||||
"""
|
||||
Use the seq2seq model to generate a title given the body of an issue.
|
||||
|
||||
Inputs
|
||||
------
|
||||
raw_input: str
|
||||
The body of the issue text as an input string
|
||||
|
||||
max_len_title: int (optional)
|
||||
The maximum length of the title the model will generate
|
||||
|
||||
"""
|
||||
if max_len_title is None:
|
||||
max_len_title = self.default_max_len_title
|
||||
# get the encoder's features for the decoder
|
||||
raw_tokenized = self.pp_body.transform([raw_input_text])
|
||||
body_encoding = self.encoder_model.predict(raw_tokenized)
|
||||
        # we want to save the encoder's embedding before it's updated by the decoder
|
||||
# because we can use that as an embedding for other tasks.
|
||||
original_body_encoding = body_encoding
|
||||
state_value = np.array(self.pp_title.token2id['_start_']).reshape(1, 1)
|
||||
|
||||
decoded_sentence = []
|
||||
stop_condition = False
|
||||
while not stop_condition:
|
||||
preds, st = self.decoder_model.predict([state_value, body_encoding])
|
||||
|
||||
# We are going to ignore indices 0 (padding) and indices 1 (unknown)
|
||||
# Argmax will return the integer index corresponding to the
|
||||
# prediction + 2 b/c we chopped off first two
|
||||
pred_idx = np.argmax(preds[:, :, 2:]) + 2
|
||||
|
||||
# retrieve word from index prediction
|
||||
pred_word_str = self.pp_title.id2token[pred_idx]
|
||||
|
||||
if pred_word_str == '_end_' or len(decoded_sentence) >= max_len_title:
|
||||
stop_condition = True
|
||||
break
|
||||
decoded_sentence.append(pred_word_str)
|
||||
|
||||
# update the decoder for the next word
|
||||
body_encoding = st
|
||||
state_value = np.array(pred_idx).reshape(1, 1)
|
||||
|
||||
return original_body_encoding, ' '.join(decoded_sentence)
|
||||
|
||||
|
||||
def print_example(self,
|
||||
i,
|
||||
body_text,
|
||||
title_text,
|
||||
url,
|
||||
threshold):
|
||||
"""
|
||||
Prints an example of the model's prediction for manual inspection.
|
||||
"""
|
||||
if i:
|
||||
print('\n\n==============================================')
|
||||
print(f'============== Example # {i} =================\n')
|
||||
|
||||
if url:
|
||||
print(url)
|
||||
|
||||
print(f"Issue Body:\n {body_text} \n")
|
||||
|
||||
if title_text:
|
||||
print(f"Original Title:\n {title_text}")
|
||||
|
||||
emb, gen_title = self.generate_issue_title(body_text)
|
||||
print(f"\n****** Machine Generated Title (Prediction) ******:\n {gen_title}")
|
||||
|
||||
if self.nn:
|
||||
# return neighbors and distances
|
||||
n, d = self.nn.get_nns_by_vector(emb.flatten(), n=4,
|
||||
include_distances=True)
|
||||
neighbors = n[1:]
|
||||
dist = d[1:]
|
||||
|
||||
if min(dist) <= threshold:
|
||||
cols = ['issue_url', 'issue_title', 'body']
|
||||
dfcopy = self.rec_df.iloc[neighbors][cols].copy(deep=True)
|
||||
dfcopy['dist'] = dist
|
||||
similar_issues_df = dfcopy.query(f'dist <= {threshold}')
|
||||
|
||||
print("\n**** Similar Issues (using encoder embedding) ****:\n")
|
||||
display(similar_issues_df)
|
||||
|
||||
|
||||
def demo_model_predictions(self,
|
||||
n,
|
||||
issue_df,
|
||||
threshold=1):
|
||||
"""
|
||||
Pick n random Issues and display predictions.
|
||||
|
||||
Input:
|
||||
------
|
||||
n : int
|
||||
Number of issues to display from issue_df
|
||||
issue_df : pandas DataFrame
|
||||
DataFrame that contains two columns: `body` and `issue_title`.
|
||||
threshold : float
|
||||
distance threshold for recommendation of similar issues.
|
||||
|
||||
Returns:
|
||||
--------
|
||||
None
|
||||
Prints the original issue body and the model's prediction.
|
||||
"""
|
||||
# Extract body and title from DF
|
||||
body_text = issue_df.body.tolist()
|
||||
title_text = issue_df.issue_title.tolist()
|
||||
url = issue_df.issue_url.tolist()
|
||||
|
||||
demo_list = np.random.randint(low=1, high=len(body_text), size=n)
|
||||
for i in demo_list:
|
||||
self.print_example(i,
|
||||
body_text=body_text[i],
|
||||
title_text=title_text[i],
|
||||
url=url[i],
|
||||
threshold=threshold)
|
||||
|
||||
def prepare_recommender(self, vectorized_array, original_df):
|
||||
"""
|
||||
Use the annoy library to build recommender
|
||||
|
||||
Parameters
|
||||
----------
|
||||
vectorized_array : List[List[int]]
|
||||
This is the list of list of integers that represents your corpus
|
||||
that is fed into the seq2seq model for training.
|
||||
original_df : pandas.DataFrame
|
||||
This is the original dataframe that has the columns
|
||||
['issue_url', 'issue_title', 'body']
|
||||
|
||||
Returns
|
||||
-------
|
||||
annoy.AnnoyIndex object (see https://github.com/spotify/annoy)
|
||||
"""
|
||||
self.rec_df = original_df
|
||||
emb = self.encoder_model.predict(x=vectorized_array,
|
||||
batch_size=vectorized_array.shape[0]//200)
|
||||
|
||||
f = emb.shape[1]
|
||||
self.nn = AnnoyIndex(f)
|
||||
logging.warning('Adding embeddings')
|
||||
for i in tqdm(range(len(emb))):
|
||||
self.nn.add_item(i, emb[i])
|
||||
logging.warning('Building trees for similarity lookup.')
|
||||
self.nn.build(50)
|
||||
return self.nn
|
||||
|
||||
def set_recsys_data(self, original_df):
|
||||
self.rec_df = original_df
|
||||
|
||||
def set_recsys_annoyobj(self, annoyobj):
|
||||
self.nn = annoyobj
|
||||
|
||||
def evaluate_model(self, holdout_bodies, holdout_titles):
|
||||
"""
|
||||
Method for calculating BLEU Score.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
holdout_bodies : List[str]
|
||||
These are the issue bodies that we want to summarize
|
||||
holdout_titles : List[str]
|
||||
This is the ground truth we are trying to predict --> issue titles
|
||||
|
||||
Returns
|
||||
-------
|
||||
bleu : float
|
||||
The BLEU Score
|
||||
|
||||
"""
|
||||
actual, predicted = list(), list()
|
||||
assert len(holdout_bodies) == len(holdout_titles)
|
||||
num_examples = len(holdout_bodies)
|
||||
|
||||
logging.warning('Generating predictions.')
|
||||
# step over the whole set TODO: parallelize this
|
||||
for i in tqdm_notebook(range(num_examples)):
|
||||
_, yhat = self.generate_issue_title(holdout_bodies[i])
|
||||
|
||||
actual.append(self.pp_title.process_text([holdout_titles[i]])[0])
|
||||
predicted.append(self.pp_title.process_text([yhat])[0])
|
||||
# calculate BLEU score
|
||||
logging.warning('Calculating BLEU.')
|
||||
bleu = corpus_bleu(actual, predicted)
|
||||
return bleu
|
||||
|
|
@ -0,0 +1,233 @@
|
|||
alabaster==0.7.10
|
||||
altair==1.2.1
|
||||
anaconda-client==1.6.5
|
||||
anaconda-navigator==1.6.8
|
||||
anaconda-project==0.8.0
|
||||
annoy==1.10.0
|
||||
asn1crypto==0.22.0
|
||||
astroid==1.5.3
|
||||
astropy==2.0.2
|
||||
Babel==2.5.0
|
||||
backports.functools-lru-cache==1.4
|
||||
backports.shutil-get-terminal-size==1.0.0
|
||||
bcolz==1.1.2
|
||||
beautifulsoup4==4.6.0
|
||||
bitarray==0.8.1
|
||||
bkcharts==0.2
|
||||
blaze==0.11.3
|
||||
bleach==1.5.0
|
||||
bokeh==0.12.7
|
||||
boto==2.48.0
|
||||
boto3==1.5.14
|
||||
botocore==1.8.28
|
||||
Bottleneck==1.2.1
|
||||
bz2file==0.98
|
||||
cachetools==2.0.1
|
||||
certifi==2017.11.5
|
||||
cffi==1.10.0
|
||||
chardet==3.0.4
|
||||
click==6.7
|
||||
cloudpickle==0.4.0
|
||||
clyent==1.2.2
|
||||
colorama==0.3.9
|
||||
conda==4.4.6
|
||||
conda-build==3.0.23
|
||||
conda-verify==2.0.0
|
||||
contextlib2==0.5.5
|
||||
cryptography==2.0.3
|
||||
cycler==0.10.0
|
||||
cymem==1.31.2
|
||||
Cython==0.26.1
|
||||
cytoolz==0.9.0
|
||||
dask==0.16.1
|
||||
datashape==0.5.4
|
||||
decorator==4.2.1
|
||||
dill==0.2.7.1
|
||||
distributed==1.20.2
|
||||
docopt==0.6.2
|
||||
docutils==0.14
|
||||
en-core-web-sm==2.0.0
|
||||
entrypoints==0.2.3
|
||||
et-xmlfile==1.0.1
|
||||
fastcache==1.0.2
|
||||
fastparquet==0.1.3
|
||||
filelock==2.0.12
|
||||
Flask==0.12.2
|
||||
Flask-Cors==3.0.3
|
||||
ftfy==4.4.3
|
||||
future==0.16.0
|
||||
gensim==3.2.0
|
||||
gevent==1.2.2
|
||||
glob2==0.5
|
||||
gmpy2==2.0.8
|
||||
graphviz==0.8.1
|
||||
greenlet==0.4.12
|
||||
h5py==2.7.1
|
||||
hdfs==2.1.0
|
||||
heapdict==1.0.0
|
||||
html5lib==1.0.1
|
||||
idna==2.6
|
||||
ijson==2.3
|
||||
imageio==2.2.0
|
||||
imagesize==0.7.1
|
||||
ipykernel==4.6.1
|
||||
ipython==6.2.1
|
||||
ipython-genutils==0.2.0
|
||||
ipywidgets==7.0.0
|
||||
isort==4.2.15
|
||||
isoweek==1.3.3
|
||||
itsdangerous==0.24
|
||||
jdcal==1.3
|
||||
jedi==0.11.0
|
||||
Jinja2==2.9.6
|
||||
jmespath==0.9.3
|
||||
jsonschema==2.6.0
|
||||
jupyter-client==5.1.0
|
||||
jupyter-console==5.2.0
|
||||
jupyter-core==4.3.0
|
||||
jupyterlab==0.27.0
|
||||
jupyterlab-launcher==0.4.0
|
||||
Keras==2.1.2
|
||||
ktext==0.27
|
||||
lazy-object-proxy==1.3.1
|
||||
llvmlite==0.20.0
|
||||
locket==0.2.0
|
||||
lxml==3.8.0
|
||||
Markdown==2.6.9
|
||||
MarkupSafe==1.0
|
||||
matplotlib==2.1.0
|
||||
mccabe==0.6.1
|
||||
mistune==0.7.4
|
||||
more-itertools==4.0.1
|
||||
mpmath==0.19
|
||||
msgpack==0.5.1
|
||||
msgpack-numpy==0.4.2
|
||||
msgpack-python==0.5.1
|
||||
multipledispatch==0.4.9
|
||||
multiprocess==0.70.5
|
||||
murmurhash==0.28.0
|
||||
navigator-updater==0.1.0
|
||||
nbconvert==5.3.1
|
||||
nbformat==4.4.0
|
||||
networkx==2.0
|
||||
nltk==3.2.5
|
||||
nose==1.3.7
|
||||
notebook==5.0.0
|
||||
numba==0.35.0+10.g143f70e90
|
||||
numexpr==2.6.2
|
||||
numpy==1.14.0
|
||||
numpydoc==0.7.0
|
||||
odo==0.5.1
|
||||
olefile==0.44
|
||||
openpyxl==2.4.8
|
||||
packaging==16.8
|
||||
pandas==0.22.0
|
||||
pandas-summary==0.0.41
|
||||
pandocfilters==1.4.2
|
||||
parso==0.1.0
|
||||
partd==0.3.8
|
||||
path.py==10.3.1
|
||||
pathlib==1.0.1
|
||||
pathlib2==2.3.0
|
||||
pathos==0.2.1
|
||||
patsy==0.4.1
|
||||
pep8==1.7.0
|
||||
pexpect==4.3.0
|
||||
pickleshare==0.7.4
|
||||
Pillow==4.3.0
|
||||
pkginfo==1.4.1
|
||||
plac==0.9.6
|
||||
ply==3.10
|
||||
pox==0.2.3
|
||||
ppft==1.6.4.7.1
|
||||
preshed==1.0.0
|
||||
prompt-toolkit==1.0.15
|
||||
protobuf==3.5.0
|
||||
psutil==5.2.2
|
||||
ptyprocess==0.5.2
|
||||
py==1.4.34
|
||||
pyarrow==0.8.0
|
||||
pycodestyle==2.3.1
|
||||
pycosat==0.6.3
|
||||
pycparser==2.18
|
||||
pycrypto==2.6.1
|
||||
pycurl==7.43.0
|
||||
pydot==1.2.3
|
||||
pydot-ng==1.0.0
|
||||
pyemd==0.4.4
|
||||
pyflakes==1.5.0
|
||||
Pygments==2.2.0
|
||||
PyHive==0.5.0
|
||||
pylint==1.7.2
|
||||
pyodbc==4.0.17
|
||||
pyOpenSSL==17.2.0
|
||||
pyparsing==2.2.0
|
||||
Pyphen==0.9.4
|
||||
PySocks==1.6.7
|
||||
pytest==3.2.1
|
||||
python-dateutil==2.6.1
|
||||
python-Levenshtein==0.12.0
|
||||
pytz==2017.3
|
||||
PyWavelets==0.5.2
|
||||
PyYAML==3.12
|
||||
pyzmq==16.0.2
|
||||
QtAwesome==0.4.4
|
||||
qtconsole==4.3.1
|
||||
QtPy==1.3.1
|
||||
regex==2017.4.5
|
||||
requests==2.18.4
|
||||
rope==0.10.5
|
||||
ruamel-yaml==0.11.14
|
||||
s3transfer==0.1.12
|
||||
scikit-image==0.13.0
|
||||
scikit-learn==0.19.1
|
||||
scipy==1.0.0
|
||||
seaborn==0.8
|
||||
simplegeneric==0.8.1
|
||||
singledispatch==3.4.0.3
|
||||
six==1.11.0
|
||||
sklearn-pandas==1.6.0
|
||||
smart-open==1.5.6
|
||||
snowballstemmer==1.2.1
|
||||
sortedcollections==0.5.3
|
||||
sortedcontainers==1.5.7
|
||||
spacy==2.0.5
|
||||
Sphinx==1.6.3
|
||||
sphinxcontrib-websupport==1.0.1
|
||||
spyder==3.2.3
|
||||
SQLAlchemy==1.1.13
|
||||
statsmodels==0.8.0
|
||||
sympy==1.1.1
|
||||
tables==3.4.2
|
||||
tabulate==0.8.2
|
||||
tblib==1.3.2
|
||||
tensorflow-gpu==1.3.0
|
||||
tensorflow-tensorboard==0.1.8
|
||||
termcolor==1.1.0
|
||||
terminado==0.6
|
||||
testpath==0.3.1
|
||||
textacy==0.5.0
|
||||
thinc==6.10.2
|
||||
thrift==0.10.0
|
||||
toolz==0.9.0
|
||||
torch==0.2.0.post4
|
||||
torchtext==0.2.0
|
||||
torchvision==0.1.9
|
||||
tornado==4.5.2
|
||||
tqdm==4.19.5
|
||||
traitlets==4.3.2
|
||||
typing==3.6.2
|
||||
ujson==1.35
|
||||
unicodecsv==0.14.1
|
||||
Unidecode==1.0.22
|
||||
urllib3==1.22
|
||||
vega==0.4.4
|
||||
wcwidth==0.1.7
|
||||
webencodings==0.5.1
|
||||
Werkzeug==0.12.2
|
||||
widgetsnbextension==3.0.2
|
||||
wrapt==1.10.11
|
||||
xlrd==1.1.0
|
||||
XlsxWriter==0.9.8
|
||||
xlwt==1.3.0
|
||||
zict==0.1.3
|
||||
|
|
@ -0,0 +1,66 @@
# Setup Kubeflow

In this part, you will set up Kubeflow on an existing Kubernetes cluster.

## Requirements

* A Kubernetes cluster
* `kubectl` CLI pointing to the Kubernetes cluster
  * Make sure that you can run `kubectl get nodes` from your terminal
    successfully
* The ksonnet CLI: [ks](https://ksonnet.io/#get-started)

Refer to the [user
guide](https://github.com/kubeflow/kubeflow/blob/master/user_guide.md) for
instructions on how to set up Kubeflow on your Kubernetes cluster. Specifically,
complete the [Deploy
Kubeflow](https://github.com/kubeflow/kubeflow/blob/master/user_guide.md#deploy-kubeflow)
section and the [Bringing up a
Notebook](https://github.com/kubeflow/kubeflow/blob/master/user_guide.md#bringing-up-a-notebook)
section.

After completing that, you should have the following ready:

* A ksonnet app in a directory named `my-kubeflow`
* Output similar to this for `kubectl get pods`

```
NAME                              READY     STATUS    RESTARTS   AGE
ambassador-7987df44b9-4pht8       2/2       Running   0          1m
ambassador-7987df44b9-dh5h6       2/2       Running   0          1m
ambassador-7987df44b9-qrgsm       2/2       Running   0          1m
tf-hub-0                          1/1       Running   0          1m
tf-job-operator-78757955b-qkg7s   1/1       Running   0          1m
```

* A Jupyter Notebook accessible at `http://127.0.0.1:8000`

## Provision storage for training data

We need a shared persistent disk to store our training data, since containers'
filesystems are ephemeral and don't have a lot of storage space.

The [Advanced
Customization](https://github.com/kubeflow/kubeflow/blob/master/user_guide.md#advanced-customization)
section of the [user
guide](https://github.com/kubeflow/kubeflow/blob/master/user_guide.md) has
instructions on how to provision a cluster-wide shared NFS.

For this example, provision a `10GB` NFS mount with the name
`github-issues-data`.

After the NFS is ready, delete the `tf-hub-0` pod so that it gets recreated and
picks up the NFS mount. You can delete it by running `kubectl delete pod
tf-hub-0 -n=${NAMESPACE}`.

At this point you should have a 10GB mount `/mnt/github-issues-data` in your
Jupyter Notebook pod. Check this by running `!df` in your Jupyter Notebook.

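If you prefer checking from Python rather than shelling out with `!df`, the short sketch below does the same check with the standard library; the mount path is the one described above.

```python
# Minimal check (runnable in a notebook cell) that the shared NFS volume is mounted.
import os
import shutil

mount_path = '/mnt/github-issues-data'  # path described in this guide
assert os.path.isdir(mount_path), f'{mount_path} is not mounted'

total, used, free = shutil.disk_usage(mount_path)
print(f'{mount_path}: {total / 2**30:.1f} GiB total, {free / 2**30:.1f} GiB free')
```
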
## Summary

* We created a ksonnet app for our Kubeflow deployment
* We created a disk for storing our training data
* We deployed the kubeflow-core component to our Kubernetes cluster
* We connected to JupyterHub and spawned a new Jupyter notebook

Next: [Training the model using our cluster](training_the_model.md)

@ -0,0 +1,30 @@
SELECT
  url as issue_url
  -- replace more than one white-space character in a row with a single space
  , REGEXP_REPLACE(title, r"\s{2,}", ' ') as issue_title
  , REGEXP_REPLACE(body, r"\s{2,}", ' ') as body

FROM(
  SELECT
    JSON_EXTRACT(payload, '$.issue.html_url') as url
    -- extract the title and body removing parentheses, brackets, and quotes
    , LOWER(TRIM(REGEXP_REPLACE(JSON_EXTRACT(payload, '$.issue.title'), r"\\n|\(|\)|\[|\]|#|\*|`", ' '))) as title
    , LOWER(TRIM(REGEXP_REPLACE(JSON_EXTRACT(payload, '$.issue.body'), r"\\n|\(|\)|\[|\]|#|\*|`", ' '))) as body
  FROM `githubarchive.day.2017*`
  WHERE
    -- restrict which daily tables in 2017 are scanned (it costs money to query these tables!!)
    _TABLE_SUFFIX BETWEEN '0101' and '1231'
    and type="IssuesEvent"
    -- Only want the issue at the point it was opened, otherwise there will be duplicates
    and JSON_EXTRACT(payload, '$.action') = "\"opened\""
  ) as tbl

WHERE
  -- the body must be at least 6 words long and the title at least 3 words long
  -- this is an arbitrary way to filter out empty or sparse issues
  ARRAY_LENGTH(SPLIT(body, ' ')) >= 6
  and ARRAY_LENGTH(SPLIT(title, ' ')) >= 3
  -- filter out issues that have really long titles or bodies
  -- (these are outliers, and will slow tokenization down)
  and LENGTH(title) <= 400
  and LENGTH(body) <= 2000

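The training notebook downloads a pre-built `github-issues.zip` from Google Cloud Storage, so you do not need to run this query yourself. If you want to regenerate the dataset, the sketch below shows one possible way to run the query with the `google-cloud-bigquery` client and write the `github_issues.csv` file the notebook expects; the project ID, query file name, and output path are placeholders, and the client API may vary with the library version you install.

```python
# Hypothetical sketch: run the GitHub Archive query above and export the result
# as the github_issues.csv file read by Tutorial.ipynb.
# Assumes google-cloud-bigquery and pandas are installed and that
# application-default credentials are configured; 'my-gcp-project' is a placeholder.
from google.cloud import bigquery

QUERY_FILE = 'github_issues.sql'   # the query shown above, saved locally
OUTPUT_CSV = 'github_issues.csv'   # file name expected by the notebook

client = bigquery.Client(project='my-gcp-project')

with open(QUERY_FILE) as f:
    sql = f.read()

# Run the query (standard SQL) and pull the results into a DataFrame.
df = client.query(sql).to_dataframe()

# Persist the issue_url / issue_title / body columns for the notebook.
df.to_csv(OUTPUT_CSV, index=False)
print('Wrote {:,} rows to {}'.format(len(df), OUTPUT_CSV))
```
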
@ -0,0 +1,20 @@
# Teardown

Delete the Kubernetes namespace:

```
kubectl delete namespace ${NAMESPACE}
```

Delete the persistent disk backing the NFS mount:

```
gcloud --project=${PROJECT} compute disks delete --zone=${ZONE} ${PD_DISK_NAME}
```

Delete the ksonnet app directory:

```
rm -rf my-kubeflow
```

@ -0,0 +1,9 @@
# Training the model

By this point, you should have a Jupyter Notebook running at `http://127.0.0.1:8000`.

Open the Jupyter Notebook interface and create a new Terminal by clicking on New -> Terminal. In the Terminal, clone this git repo by executing: `git clone https://github.com/kubeflow/examples.git`.

Now you should have all the code required to complete this tutorial in the `examples/issue_summarization_github_issues/notebooks` folder. Navigate to this folder. Here you should see two files: `Tutorial.ipynb` and `seq2seq_utils.py`. Open `Tutorial.ipynb` - it contains a complete walk-through of downloading the training data, preprocessing it, and training the model. A short sketch of how the notebook uses `seq2seq_utils.py` follows below.

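As a quick orientation before opening the notebook, the condensed sketch below shows how the notebook's saved artifacts are loaded back with the helpers in `seq2seq_utils.py`; it assumes you have already run the preprocessing and training cells, which write `train_body_vecs.npy`, `train_title_vecs.npy`, `body_pp.dpkl`, `title_pp.dpkl`, and `seq2seq_model_tutorial.h5`. The issue body passed to `generate_issue_title` is just an illustrative string.

```python
# Condensed sketch of the post-training workflow from Tutorial.ipynb.
from keras.models import load_model
from seq2seq_utils import (load_decoder_inputs, load_encoder_inputs,
                           load_text_processor, Seq2Seq_Inference)

# Reload the vectorized training data and the fitted ktext preprocessors.
encoder_input_data, doc_length = load_encoder_inputs('train_body_vecs.npy')
decoder_input_data, decoder_target_data = load_decoder_inputs('train_title_vecs.npy')
num_encoder_tokens, body_pp = load_text_processor('body_pp.dpkl')
num_decoder_tokens, title_pp = load_text_processor('title_pp.dpkl')

# Reload the trained model saved at the end of the training section.
seq2seq_Model = load_model('seq2seq_model_tutorial.h5')

# Wrap everything in the inference helper and generate a title for a raw issue body.
seq2seq_inf = Seq2Seq_Inference(encoder_preprocessor=body_pp,
                                decoder_preprocessor=title_pp,
                                seq2seq_model=seq2seq_Model)
_, generated_title = seq2seq_inf.generate_issue_title(
    'jupyter notebook fails to start after upgrading to the latest release')
print(generated_title)
```
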
Next: [Serving the model](serving_the_model.md)

@ -0,0 +1,7 @@
|
|||
# This file configures the workflows to trigger in our Prow jobs.
|
||||
# see kubeflow/testing/py/run_e2e_workflow.py
|
||||
workflows:
|
||||
- app_dir: kubeflow/examples/test/workflows
|
||||
component: workflows
|
||||
name: examples-e2e
|
||||
|
||||
|
|
@ -0,0 +1,39 @@
|
|||
apiVersion: "0.1"
|
||||
gitVersion:
|
||||
commitSha: 422d521c05aa905df949868143b26445f5e4eda5
|
||||
refSpec: master
|
||||
kind: ksonnet.io/registry
|
||||
libraries:
|
||||
apache:
|
||||
path: apache
|
||||
version: master
|
||||
efk:
|
||||
path: efk
|
||||
version: master
|
||||
mariadb:
|
||||
path: mariadb
|
||||
version: master
|
||||
memcached:
|
||||
path: memcached
|
||||
version: master
|
||||
mongodb:
|
||||
path: mongodb
|
||||
version: master
|
||||
mysql:
|
||||
path: mysql
|
||||
version: master
|
||||
nginx:
|
||||
path: nginx
|
||||
version: master
|
||||
node:
|
||||
path: node
|
||||
version: master
|
||||
postgres:
|
||||
path: postgres
|
||||
version: master
|
||||
redis:
|
||||
path: redis
|
||||
version: master
|
||||
tomcat:
|
||||
path: tomcat
|
||||
version: master
|
||||
|
|
@ -0,0 +1,11 @@
|
|||
apiVersion: 0.0.1
|
||||
kind: ksonnet.io/app
|
||||
name: test-infra
|
||||
registries:
|
||||
incubator:
|
||||
gitVersion:
|
||||
commitSha: 422d521c05aa905df949868143b26445f5e4eda5
|
||||
refSpec: master
|
||||
protocol: github
|
||||
uri: github.com/ksonnet/parts/tree/master/incubator
|
||||
version: 0.0.1
|
||||
|
|
@ -0,0 +1,17 @@
|
|||
{
|
||||
global: {
|
||||
    // User-defined global parameters; accessible to all components and environments, Ex:
|
||||
// replicas: 4,
|
||||
},
|
||||
components: {
|
||||
// Component-level parameters, defined initially from 'ks prototype use ...'
|
||||
// Each object below should correspond to a component in the components/ directory
|
||||
workflows: {
|
||||
bucket: "mlkube-testing_temp",
|
||||
name: "kubeflow-examples-presubmit-test-374-6e32",
|
||||
namespace: "kubeflow-test-infra",
|
||||
prow: "JOB_NAME=kubeflow-examples-presubmit-test,JOB_TYPE=presubmit,PULL_NUMBER=209,REPO_NAME=examples,REPO_OWNER=kubeflow,BUILD_NUMBER=997a",
|
||||
prow_env: "JOB_NAME=kubeflow-examples-presubmit-test,JOB_TYPE=presubmit,PULL_NUMBER=374,REPO_NAME=examples,REPO_OWNER=kubeflow,BUILD_NUMBER=6e32",
|
||||
},
|
||||
},
|
||||
}
|
||||
|
|
@ -0,0 +1,14 @@
|
|||
local params = std.extVar("__ksonnet/params").components.workflows;
|
||||
|
||||
local k = import 'k.libsonnet';
|
||||
local workflows = import 'workflows.libsonnet';
|
||||
local namespace = params.namespace;
|
||||
|
||||
// TODO(jlewi): Can we make name default to some random unique value?
|
||||
// I didn't see any routines in the standard library for datetime or random.
|
||||
local name = params.name;
|
||||
|
||||
local prowEnv = workflows.parseEnv(params.prow_env);
|
||||
local bucket = params.bucket;
|
||||
std.prune(k.core.v1.list.new([workflows.parts(namespace, name).e2e(prowEnv, bucket)]))
|
||||
|
||||
|
|
@ -0,0 +1,244 @@
|
|||
{
|
||||
// TODO(https://github.com/ksonnet/ksonnet/issues/222): Taking namespace as an argument is a work around for the fact that ksonnet
|
||||
// doesn't support automatically piping in the namespace from the environment to prototypes.
|
||||
|
||||
// convert a list of two items into a map representing an environment variable
|
||||
// TODO(jlewi): Should we move this into kubeflow/core/util.libsonnet
|
||||
listToMap:: function(v)
|
||||
{
|
||||
name: v[0],
|
||||
value: v[1],
|
||||
},
|
||||
|
||||
// Function to turn comma separated list of prow environment variables into a dictionary.
|
||||
parseEnv:: function(v)
|
||||
local pieces = std.split(v, ",");
|
||||
if v != "" && std.length(pieces) > 0 then
|
||||
std.map(
|
||||
function(i) $.listToMap(std.split(i, "=")),
|
||||
std.split(v, ",")
|
||||
)
|
||||
else [],
|
||||
|
||||
parts(namespace, name):: {
|
||||
// Workflow to run the e2e test.
|
||||
e2e(prow_env, bucket):
|
||||
// mountPath is the directory where the volume to store the test data
|
||||
// should be mounted.
|
||||
local mountPath = "/mnt/" + "test-data-volume";
|
||||
// testDir is the root directory for all data for a particular test run.
|
||||
local testDir = mountPath + "/" + name;
|
||||
// outputDir is the directory to sync to GCS to contain the output for this job.
|
||||
local outputDir = testDir + "/output";
|
||||
local artifactsDir = outputDir + "/artifacts";
|
||||
local goDir = testDir + "/go";
|
||||
// Source directory where all repos should be checked out
|
||||
local srcRootDir = testDir + "/src";
|
||||
// The directory containing the kubeflow/examples repo
|
||||
local srcDir = srcRootDir + "/kubeflow/examples";
|
||||
local image = "gcr.io/mlkube-testing/test-worker";
|
||||
// The name of the NFS volume claim to use for test files.
|
||||
// local nfsVolumeClaim = "kubeflow-testing";
|
||||
local nfsVolumeClaim = "nfs-external";
|
||||
// The name to use for the volume to use to contain test data.
|
||||
local dataVolume = "kubeflow-test-volume";
|
||||
local versionTag = name;
|
||||
// The directory within the kubeflow_testing submodule containing
|
||||
// py scripts to use.
|
||||
local kubeflowExamplesPy = srcDir;
|
||||
local kubeflowTestingPy = srcRootDir + "/kubeflow/testing/py";
|
||||
|
||||
local project = "mlkube-testing";
|
||||
// GKE cluster to use
|
||||
      // We need to truncate the cluster name to no more than 40 characters because
|
||||
// cluster names can be a max of 40 characters.
|
||||
// We expect the suffix of the cluster name to be unique salt.
|
||||
// We prepend a z because cluster name must start with an alphanumeric character
|
||||
// and if we cut the prefix we might end up starting with "-" or other invalid
|
||||
// character for first character.
|
||||
local cluster =
|
||||
if std.length(name) > 40 then
|
||||
"z" + std.substr(name, std.length(name) - 39, 39)
|
||||
else
|
||||
name;
|
||||
local zone = "us-east1-d";
|
||||
local chart = srcDir + "/bin/examples-chart-0.2.1-" + versionTag + ".tgz";
|
||||
{
|
||||
// Build an Argo template to execute a particular command.
|
||||
// step_name: Name for the template
|
||||
// command: List to pass as the container command.
|
||||
buildTemplate(step_name, command):: {
|
||||
name: step_name,
|
||||
container: {
|
||||
command: command,
|
||||
image: image,
|
||||
workingDir: srcDir,
|
||||
env: [
|
||||
{
|
||||
// Add the source directories to the python path.
|
||||
name: "PYTHONPATH",
|
||||
value: kubeflowExamplesPy + ":" + kubeflowTestingPy,
|
||||
},
|
||||
{
|
||||
// Set the GOPATH
|
||||
name: "GOPATH",
|
||||
value: goDir,
|
||||
},
|
||||
{
|
||||
name: "GOOGLE_APPLICATION_CREDENTIALS",
|
||||
value: "/secret/gcp-credentials/key.json",
|
||||
},
|
||||
{
|
||||
name: "GIT_TOKEN",
|
||||
valueFrom: {
|
||||
secretKeyRef: {
|
||||
name: "github-token",
|
||||
key: "github_token",
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "EXTRA_REPOS",
|
||||
value: "kubeflow/testing@HEAD",
|
||||
},
|
||||
] + prow_env,
|
||||
volumeMounts: [
|
||||
{
|
||||
name: dataVolume,
|
||||
mountPath: mountPath,
|
||||
},
|
||||
{
|
||||
name: "github-token",
|
||||
mountPath: "/secret/github-token",
|
||||
},
|
||||
{
|
||||
name: "gcp-credentials",
|
||||
mountPath: "/secret/gcp-credentials",
|
||||
},
|
||||
],
|
||||
},
|
||||
}, // buildTemplate
|
||||
|
||||
apiVersion: "argoproj.io/v1alpha1",
|
||||
kind: "Workflow",
|
||||
metadata: {
|
||||
name: name,
|
||||
namespace: namespace,
|
||||
},
|
||||
// TODO(jlewi): Use OnExit to run cleanup steps.
|
||||
spec: {
|
||||
entrypoint: "e2e",
|
||||
volumes: [
|
||||
{
|
||||
name: "github-token",
|
||||
secret: {
|
||||
secretName: "github-token",
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "gcp-credentials",
|
||||
secret: {
|
||||
secretName: "kubeflow-testing-credentials",
|
||||
},
|
||||
},
|
||||
{
|
||||
name: dataVolume,
|
||||
persistentVolumeClaim: {
|
||||
claimName: nfsVolumeClaim,
|
||||
},
|
||||
},
|
||||
], // volumes
|
||||
// onExit specifies the template that should always run when the workflow completes.
|
||||
onExit: "exit-handler",
|
||||
templates: [
|
||||
{
|
||||
name: "e2e",
|
||||
steps: [
|
||||
[{
|
||||
name: "checkout",
|
||||
template: "checkout",
|
||||
}],
|
||||
[
|
||||
{
|
||||
name: "create-pr-symlink",
|
||||
template: "create-pr-symlink",
|
||||
},
|
||||
{
|
||||
name: "py-test",
|
||||
template: "py-test",
|
||||
},
|
||||
{
|
||||
name: "py-lint",
|
||||
template: "py-lint",
|
||||
},
|
||||
],
|
||||
],
|
||||
},
|
||||
{
|
||||
name: "exit-handler",
|
||||
steps: [
|
||||
[{
|
||||
name: "copy-artifacts",
|
||||
template: "copy-artifacts",
|
||||
}],
|
||||
],
|
||||
},
|
||||
{
|
||||
name: "checkout",
|
||||
container: {
|
||||
command: [
|
||||
"/usr/local/bin/checkout.sh",
|
||||
srcRootDir,
|
||||
],
|
||||
env: prow_env + [{
|
||||
name: "EXTRA_REPOS",
|
||||
value: "kubeflow/testing@HEAD",
|
||||
}],
|
||||
image: image,
|
||||
volumeMounts: [
|
||||
{
|
||||
name: dataVolume,
|
||||
mountPath: mountPath,
|
||||
},
|
||||
],
|
||||
},
|
||||
}, // checkout
|
||||
$.parts(namespace, name).e2e(prow_env, bucket).buildTemplate("py-test", [
|
||||
"python",
|
||||
"-m",
|
||||
"kubeflow.testing.py_checks",
|
||||
"test",
|
||||
"--src_dir=" + srcDir,
|
||||
"--project=mlkube-testing",
|
||||
"--junit_path=" + artifactsDir + "/junit_pycheckstest.xml",
|
||||
]), // py test
|
||||
$.parts(namespace, name).e2e(prow_env, bucket).buildTemplate("py-lint", [
|
||||
"python",
|
||||
"-m",
|
||||
"kubeflow.testing.py_checks",
|
||||
"lint",
|
||||
"--src_dir=" + srcDir,
|
||||
"--project=mlkube-testing",
|
||||
"--junit_path=" + artifactsDir + "/junit_pycheckslint.xml",
|
||||
]), // py lint
|
||||
$.parts(namespace, name).e2e(prow_env, bucket).buildTemplate("create-pr-symlink", [
|
||||
"python",
|
||||
"-m",
|
||||
"kubeflow.testing.prow_artifacts",
|
||||
"--artifacts_dir=" + outputDir,
|
||||
"create_pr_symlink",
|
||||
"--bucket=" + bucket,
|
||||
]), // create-pr-symlink
|
||||
$.parts(namespace, name).e2e(prow_env, bucket).buildTemplate("copy-artifacts", [
|
||||
"python",
|
||||
"-m",
|
||||
"kubeflow.testing.prow_artifacts",
|
||||
"--artifacts_dir=" + outputDir,
|
||||
"copy_artifacts",
|
||||
"--bucket=" + bucket,
|
||||
]), // copy-artifacts
|
||||
], // templates
|
||||
},
|
||||
}, // e2e
|
||||
}, // parts
|
||||
}
|
||||
|
|
@ -0,0 +1,4 @@
|
|||
local components = std.extVar("__ksonnet/components");
|
||||
components + {
|
||||
// Insert user-specified overrides here.
|
||||
}
|
||||
|
|
@ -0,0 +1,80 @@
|
|||
local k8s = import "k8s.libsonnet";
|
||||
|
||||
local apps = k8s.apps;
|
||||
local core = k8s.core;
|
||||
local extensions = k8s.extensions;
|
||||
|
||||
local hidden = {
|
||||
mapContainers(f):: {
|
||||
local podContainers = super.spec.template.spec.containers,
|
||||
spec+: {
|
||||
template+: {
|
||||
spec+: {
|
||||
// IMPORTANT: This overwrites the 'containers' field
|
||||
// for this deployment.
|
||||
containers: std.map(f, podContainers),
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
|
||||
mapContainersWithName(names, f) ::
|
||||
local nameSet =
|
||||
if std.type(names) == "array"
|
||||
then std.set(names)
|
||||
else std.set([names]);
|
||||
local inNameSet(name) = std.length(std.setInter(nameSet, std.set([name]))) > 0;
|
||||
self.mapContainers(
|
||||
function(c)
|
||||
if std.objectHas(c, "name") && inNameSet(c.name)
|
||||
then f(c)
|
||||
else c
|
||||
),
|
||||
};
|
||||
|
||||
k8s + {
|
||||
apps:: apps + {
|
||||
v1beta1:: apps.v1beta1 + {
|
||||
local v1beta1 = apps.v1beta1,
|
||||
|
||||
daemonSet:: v1beta1.daemonSet + {
|
||||
mapContainers(f):: hidden.mapContainers(f),
|
||||
mapContainersWithName(names, f):: hidden.mapContainersWithName(names, f),
|
||||
},
|
||||
|
||||
deployment:: v1beta1.deployment + {
|
||||
mapContainers(f):: hidden.mapContainers(f),
|
||||
mapContainersWithName(names, f):: hidden.mapContainersWithName(names, f),
|
||||
},
|
||||
},
|
||||
},
|
||||
|
||||
core:: core + {
|
||||
v1:: core.v1 + {
|
||||
list:: {
|
||||
new(items)::
|
||||
{apiVersion: "v1"} +
|
||||
{kind: "List"} +
|
||||
self.items(items),
|
||||
|
||||
items(items):: if std.type(items) == "array" then {items+: items} else {items+: [items]},
|
||||
},
|
||||
},
|
||||
},
|
||||
|
||||
extensions:: extensions + {
|
||||
v1beta1:: extensions.v1beta1 + {
|
||||
local v1beta1 = extensions.v1beta1,
|
||||
|
||||
daemonSet:: v1beta1.daemonSet + {
|
||||
mapContainers(f):: hidden.mapContainers(f),
|
||||
mapContainersWithName(names, f):: hidden.mapContainersWithName(names, f),
|
||||
},
|
||||
|
||||
deployment:: v1beta1.deployment + {
|
||||
mapContainers(f):: hidden.mapContainers(f),
|
||||
mapContainersWithName(names, f):: hidden.mapContainersWithName(names, f),
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
File diff suppressed because it is too large
File diff suppressed because it is too large
|
|
@ -0,0 +1,7 @@
|
|||
local base = import "../base.libsonnet";
|
||||
local k = import "k.libsonnet";
|
||||
|
||||
base + {
|
||||
// Insert user-specified overrides here. For example if a component is named "nginx-deployment", you might have something like:
|
||||
// "nginx-deployment"+: k.deployment.mixin.metadata.labels({foo: "bar"})
|
||||
}
|
||||
|
|
@ -0,0 +1,10 @@
|
|||
local params = import "../../components/params.libsonnet";
|
||||
params + {
|
||||
components +: {
|
||||
// Insert component parameter overrides here. Ex:
|
||||
// guestbook +: {
|
||||
// name: "guestbook-dev",
|
||||
// replicas: params.global.replicas,
|
||||
// },
|
||||
},
|
||||
}
|
||||
|
|
@ -0,0 +1,4 @@
|
|||
{
|
||||
"server": "https://35.196.185.88",
|
||||
"namespace": "kubeflow-test-infra"
|
||||
}
|
||||