{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import logging\n",
    "import glob\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "pd.set_option('display.max_colwidth', 500)\n",
    "logger = logging.getLogger()\n",
    "logger.setLevel(logging.WARNING)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Download Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Ensure that the github-issues-data volume is mounted in /mnt\n",
    "!ls -la /mnt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Set the path for the data directory\n",
    "%env DATA_DIR=/mnt/github-issues-data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# Download the github-issues.zip training data to /mnt/github-issues-data\n",
    "!wget --directory-prefix=${DATA_DIR} https://storage.googleapis.com/kubeflow-examples/github-issue-summarization-data/github-issues.zip\n",
    "\n",
    "# Unzip the file into the /mnt/github-issues-data directory\n",
    "!unzip ${DATA_DIR}/github-issues.zip -d ${DATA_DIR}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Create a symlink from <current_directory>/github-issues-data to /mnt/github-issues-data\n",
    "!ln -sf ${DATA_DIR} github-issues-data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Make sure that the github-issues-data symlink was created\n",
    "!ls -lh github-issues-data/github_issues.csv"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Process Data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Split the data into training and test sets, and preview the data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "data_file = 'github-issues-data/github_issues.csv'\n",
    "\n",
    "# Read in the data, sampling 2,000 rows for tutorial speed.\n",
    "# Set use_sample_data to False to train on the entire dataset.\n",
    "use_sample_data = True\n",
    "\n",
    "if use_sample_data:\n",
    "    training_data_size = 2000\n",
    "    traindf, testdf = train_test_split(pd.read_csv(data_file).sample(n=training_data_size),\n",
    "                                       test_size=.10)\n",
    "else:\n",
    "    traindf, testdf = train_test_split(pd.read_csv(data_file), test_size=.10)\n",
    "\n",
    "# Print stats about the shape of the data\n",
    "print(f'Train: {traindf.shape[0]:,} rows {traindf.shape[1]:,} columns')\n",
    "print(f'Test: {testdf.shape[0]:,} rows {testdf.shape[1]:,} columns')\n",
    "\n",
    "# Preview the data\n",
    "traindf.head(3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Convert to lists in preparation for modeling**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "train_body_raw = traindf.body.tolist()\n",
    "train_title_raw = traindf.issue_title.tolist()\n",
    "\n",
    "# Preview the first element\n",
    "train_body_raw[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Pre-Process Data For Deep Learning\n",
    "\n",
    "See [this repo](https://github.com/hamelsmu/ktext) for documentation on the ktext package."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "%reload_ext autoreload\n",
    "%autoreload 2\n",
    "from ktext.preprocess import processor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "%%time\n",
    "# Clean, tokenize, and pad / truncate so that each document length = 70.\n",
    "# Also, retain only the top 8,000 words in the vocabulary; the remaining\n",
    "# words are mapped to index 1, a shared index for rare words.\n",
    "body_pp = processor(keep_n=8000, padding_maxlen=70)\n",
    "train_body_vecs = body_pp.fit_transform(train_body_raw)"
   ]
  },
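  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a quick sanity check (an addition to the tutorial, not part of the original flow), `fit_transform` should return a 2-D array with one row per document and 70 columns, matching `padding_maxlen=70`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sanity check: one row per issue body, each padded/truncated to length 70\n",
    "print(type(train_body_vecs))\n",
    "print(train_body_vecs.shape)"
   ]
  },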
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Look at one example of processed issue bodies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "print('\\noriginal string:\\n', train_body_raw[0], '\\n')\n",
    "print('after pre-processing:\\n', train_body_vecs[0], '\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Instantiate a text processor for the titles, with some different parameters:\n",
    "# append_indicators=True appends the tokens '_start_' and '_end_' to each\n",
    "# document.\n",
    "# padding='post' means that zero padding is appended to the end of each\n",
    "# document (as opposed to the default, which is 'pre').\n",
    "title_pp = processor(append_indicators=True, keep_n=4500,\n",
    "                     padding_maxlen=12, padding='post')\n",
    "\n",
    "# Process the title data\n",
    "train_title_vecs = title_pp.fit_transform(train_title_raw)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Look at one example of processed issue titles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "print('\\noriginal string:\\n', train_title_raw[0])\n",
    "print('after pre-processing:\\n', train_title_vecs[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Serialize all of this to disk for later use."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import dill as dpickle\n",
    "import numpy as np\n",
    "\n",
    "# Save the preprocessors\n",
    "with open('body_pp.dpkl', 'wb') as f:\n",
    "    dpickle.dump(body_pp, f)\n",
    "\n",
    "with open('title_pp.dpkl', 'wb') as f:\n",
    "    dpickle.dump(title_pp, f)\n",
    "\n",
    "# Save the processed data\n",
    "np.save('train_title_vecs.npy', train_title_vecs)\n",
    "np.save('train_body_vecs.npy', train_body_vecs)"
   ]
  },
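  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A minimal round-trip check (added for illustration, not part of the original tutorial): reload the serialized artifacts to verify they were written correctly."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Round-trip check: reload the pickled preprocessor and a saved array\n",
    "with open('body_pp.dpkl', 'rb') as f:\n",
    "    body_pp_reloaded = dpickle.load(f)\n",
    "\n",
    "train_body_vecs_reloaded = np.load('train_body_vecs.npy')\n",
    "assert train_body_vecs_reloaded.shape == train_body_vecs.shape"
   ]
  },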
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Define Model Architecture"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load the data from disk into variables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from seq2seq_utils import load_decoder_inputs, load_encoder_inputs, load_text_processor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "encoder_input_data, doc_length = load_encoder_inputs('train_body_vecs.npy')\n",
    "decoder_input_data, decoder_target_data = load_decoder_inputs('train_title_vecs.npy')"
   ]
  },
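  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick look at the loaded arrays (an added check, not part of the original flow). Assuming `load_decoder_inputs` follows the usual teacher-forcing convention, `decoder_input_data` is each title sequence without its final token, and `decoder_target_data` is the same sequence shifted one step to the left."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Inspect the shapes of the arrays that feed the model\n",
    "print('encoder input: ', encoder_input_data.shape, ' doc_length:', doc_length)\n",
    "print('decoder input: ', decoder_input_data.shape)\n",
    "print('decoder target:', decoder_target_data.shape)"
   ]
  },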
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "num_encoder_tokens, body_pp = load_text_processor('body_pp.dpkl')\n",
    "num_decoder_tokens, title_pp = load_text_processor('title_pp.dpkl')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Define Model Architecture"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "from keras.models import Model\n",
    "from keras.layers import Input, LSTM, GRU, Dense, Embedding, Bidirectional, BatchNormalization\n",
    "from keras import optimizers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Arbitrarily set the latent dimension for the embeddings and hidden units\n",
    "latent_dim = 300\n",
    "\n",
    "##### Define Model Architecture #####\n",
    "\n",
    "########################\n",
    "#### Encoder Model ####\n",
    "encoder_inputs = Input(shape=(doc_length,), name='Encoder-Input')\n",
    "\n",
    "# Word embedding for the encoder (ex: issue bodies)\n",
    "x = Embedding(num_encoder_tokens, latent_dim, name='Body-Word-Embedding', mask_zero=False)(encoder_inputs)\n",
    "x = BatchNormalization(name='Encoder-Batchnorm-1')(x)\n",
    "\n",
    "# Intermediate GRU layer (optional)\n",
    "#x = GRU(latent_dim, name='Encoder-Intermediate-GRU', return_sequences=True)(x)\n",
    "#x = BatchNormalization(name='Encoder-Batchnorm-2')(x)\n",
    "\n",
    "# We do not need the `encoder_output`, just the hidden state.\n",
    "_, state_h = GRU(latent_dim, return_state=True, name='Encoder-Last-GRU')(x)\n",
    "\n",
    "# Encapsulate the encoder as a separate entity so we can just\n",
    "# encode without decoding if we want to.\n",
    "encoder_model = Model(inputs=encoder_inputs, outputs=state_h, name='Encoder-Model')\n",
    "\n",
    "seq2seq_encoder_out = encoder_model(encoder_inputs)\n",
    "\n",
    "########################\n",
    "#### Decoder Model ####\n",
    "decoder_inputs = Input(shape=(None,), name='Decoder-Input')  # for teacher forcing\n",
    "\n",
    "# Word embedding for the decoder (ex: issue titles)\n",
    "dec_emb = Embedding(num_decoder_tokens, latent_dim, name='Decoder-Word-Embedding', mask_zero=False)(decoder_inputs)\n",
    "dec_bn = BatchNormalization(name='Decoder-Batchnorm-1')(dec_emb)\n",
    "\n",
    "# Set up the decoder, using the encoder's final hidden state as the initial state.\n",
    "decoder_gru = GRU(latent_dim, return_state=True, return_sequences=True, name='Decoder-GRU')\n",
    "decoder_gru_output, _ = decoder_gru(dec_bn, initial_state=seq2seq_encoder_out)\n",
    "x = BatchNormalization(name='Decoder-Batchnorm-2')(decoder_gru_output)\n",
    "\n",
    "# Dense layer for prediction\n",
    "decoder_dense = Dense(num_decoder_tokens, activation='softmax', name='Final-Output-Dense')\n",
    "decoder_outputs = decoder_dense(x)\n",
    "\n",
    "########################\n",
    "#### Seq2Seq Model ####\n",
    "seq2seq_Model = Model([encoder_inputs, decoder_inputs], decoder_outputs)\n",
    "\n",
    "seq2seq_Model.compile(optimizer=optimizers.Nadam(lr=0.001), loss='sparse_categorical_crossentropy')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Examine Model Architecture Summary**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from seq2seq_utils import viz_model_architecture\n",
    "seq2seq_Model.summary()\n",
    "viz_model_architecture(seq2seq_Model)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Train Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from keras.callbacks import CSVLogger, ModelCheckpoint\n",
    "\n",
    "script_name_base = 'tutorial_seq2seq'\n",
    "csv_logger = CSVLogger('{:}.log'.format(script_name_base))\n",
    "model_checkpoint = ModelCheckpoint('{:}.epoch{{epoch:02d}}-val{{val_loss:.5f}}.hdf5'.format(script_name_base),\n",
    "                                   save_best_only=True)\n",
    "\n",
    "batch_size = 1200\n",
    "epochs = 7\n",
    "history = seq2seq_Model.fit([encoder_input_data, decoder_input_data], np.expand_dims(decoder_target_data, -1),\n",
    "                            batch_size=batch_size,\n",
    "                            epochs=epochs,\n",
    "                            validation_split=0.12, callbacks=[csv_logger, model_checkpoint])"
   ]
  },
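  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "An optional sketch (not in the original tutorial) that plots the training and validation loss recorded in the Keras `History` object, to help judge convergence:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plot training vs. validation loss from the History returned by fit()\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "plt.plot(history.history['loss'], label='training loss')\n",
    "plt.plot(history.history['val_loss'], label='validation loss')\n",
    "plt.xlabel('epoch')\n",
    "plt.ylabel('sparse categorical cross-entropy')\n",
    "plt.legend()\n",
    "plt.show()"
   ]
  },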
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Save the model\n",
    "seq2seq_Model.save('seq2seq_model_tutorial.h5')"
   ]
  },
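  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick check (added for illustration) that the saved file can be loaded back with Keras:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Verify that the saved model can be reloaded from disk\n",
    "from keras.models import load_model\n",
    "\n",
    "reloaded_model = load_model('seq2seq_model_tutorial.h5')\n",
    "reloaded_model.summary()"
   ]
  },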
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# See Example Results On Holdout Set\n",
    "\n",
    "It is useful to see examples of real predictions on a holdout set to get a sense of the model's performance. We will also evaluate the model numerically in the following section."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from seq2seq_utils import Seq2Seq_Inference\n",
    "seq2seq_inf = Seq2Seq_Inference(encoder_preprocessor=body_pp,\n",
    "                                decoder_preprocessor=title_pp,\n",
    "                                seq2seq_model=seq2seq_Model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# This method displays predictions on random rows of the holdout set\n",
    "seq2seq_inf.demo_model_predictions(n=50, issue_df=testdf)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "# Evaluate Model: BLEU Score\n",
    "\n",
    "For sequence-to-sequence tasks such as this one (and especially machine translation), it is common to measure the quality of results using the [BLEU Score](https://en.wikipedia.org/wiki/BLEU). The convenience function illustrated below uses [NLTK's corpus_bleu](https://www.nltk.org/api/nltk.translate.html#nltk.translate.bleu_score.corpus_bleu) and reports the average of BLEU-1, BLEU-2, BLEU-3, and BLEU-4."
   ]
  },
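  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For intuition, here is a minimal sketch (with hypothetical toy data, not drawn from this dataset) of how an average of BLEU-1 through BLEU-4 can be computed directly with NLTK's `corpus_bleu`; the `weights` argument controls which n-gram orders are used. This mirrors what `evaluate_model` reports, assuming it averages the four cumulative scores the same way."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: average of cumulative BLEU-1..BLEU-4 via NLTK's corpus_bleu.\n",
    "# corpus_bleu expects a list of reference lists (each reference is a token\n",
    "# list) and a list of hypothesis token lists. Toy data for illustration.\n",
    "from nltk.translate.bleu_score import corpus_bleu\n",
    "\n",
    "references = [[['fix', 'login', 'bug', 'on', 'mobile']]]\n",
    "hypotheses = [['fix', 'login', 'bug', 'on', 'android']]\n",
    "\n",
    "weights_per_order = [(1, 0, 0, 0),\n",
    "                     (0.5, 0.5, 0, 0),\n",
    "                     (1/3, 1/3, 1/3, 0),\n",
    "                     (0.25, 0.25, 0.25, 0.25)]\n",
    "scores = [corpus_bleu(references, hypotheses, weights=w) for w in weights_per_order]\n",
    "print('average BLEU-1..4:', sum(scores) / len(scores))"
   ]
  },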
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Convenience function that generates predictions on the holdout set and calculates the BLEU score\n",
    "bleu_score = seq2seq_inf.evaluate_model(holdout_bodies=testdf.body.tolist(),\n",
    "                                        holdout_titles=testdf.issue_title.tolist(),\n",
    "                                        max_len_title=12)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(f'BLEU Score (avg of BLEU 1-4) on Holdout Set: {bleu_score * 100}')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  },
  "toc": {
   "nav_menu": {
    "height": "263px",
    "width": "352px"
   },
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": true,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}