{
 "cells": [
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd\n", "import logging\n", "import glob\n", "from sklearn.model_selection import train_test_split\n", "pd.set_option('display.max_colwidth', 500)\n", "logger = logging.getLogger()\n", "logger.setLevel(logging.WARNING)" ] },
  { "cell_type": "markdown", "metadata": {}, "source": [ "# Download Data" ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Ensure that the github-issues-data volume is mounted in /mnt\n", "!ls -la /mnt" ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Set the path for the data directory\n", "%env DATA_DIR=/mnt/github-issues-data" ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true, "scrolled": false }, "outputs": [], "source": [ "# Download the github-issues.zip training data to /mnt/github-issues-data\n", "!wget --directory-prefix=${DATA_DIR} https://storage.googleapis.com/kubeflow-examples/github-issue-summarization-data/github-issues.zip\n", "\n", "# Unzip the file into the /mnt/github-issues-data directory\n", "!unzip ${DATA_DIR}/github-issues.zip -d ${DATA_DIR}" ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Create a symlink from github-issues-data to /mnt/github-issues-data\n", "!ln -sf ${DATA_DIR} github-issues-data" ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Make sure that the github-issues-data symlink was created\n", "!ls -lh github-issues-data/github_issues.csv" ] },
  { "cell_type": "markdown", "metadata": {}, "source": [ "# Process Data" ] },
  { "cell_type": "markdown", "metadata": {}, "source": [ "Split the data into training and test sets, and preview the data." ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "data_file = 'github-issues-data/github_issues.csv'\n", "\n", "# Read in a sample of 2,000 rows (for tutorial speed).\n", "# Set this to False to train on the entire dataset.\n", "use_sample_data = True\n", "\n", "if use_sample_data:\n", "    training_data_size = 2000\n", "    traindf, testdf = train_test_split(pd.read_csv(data_file).sample(n=training_data_size),\n", "                                       test_size=.10)\n", "else:\n", "    traindf, testdf = train_test_split(pd.read_csv(data_file), test_size=.10)\n", "\n", "# Print stats about the shape of the data\n", "print(f'Train: {traindf.shape[0]:,} rows {traindf.shape[1]:,} columns')\n", "print(f'Test: {testdf.shape[0]:,} rows {testdf.shape[1]:,} columns')\n", "\n", "# Preview the data\n", "traindf.head(3)" ] },
  { "cell_type": "markdown", "metadata": {}, "source": [ "**Convert to lists in preparation for modeling**" ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "train_body_raw = traindf.body.tolist()\n", "train_title_raw = traindf.issue_title.tolist()\n", "# Preview the first element\n", "train_body_raw[0]" ] },
  { "cell_type": "markdown", "metadata": {}, "source": [ "# Pre-Process Data For Deep Learning\n", "\n", "See [this repo](https://github.com/hamelsmu/ktext) for documentation on the ktext package." ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "%reload_ext autoreload\n", "%autoreload 2\n", "from ktext.preprocess import processor" ] },
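  { "cell_type": "markdown", "metadata": {}, "source": [ "ktext will clean, tokenize, and index the text, then pad or truncate each document to a fixed length. For intuition only, the toy sketch below (plain Keras utilities, not ktext itself) illustrates the two ideas involved: mapping words to integer ids with out-of-vocabulary words collapsed to a shared rare-word index (1), and padding / truncating every document to a fixed length with 0. The cells that follow use ktext's `processor` for the real preprocessing." ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Toy illustration only -- this is NOT ktext, just the same idea with Keras utilities\n", "from keras.preprocessing.sequence import pad_sequences\n", "\n", "toy_docs = [['fix', 'the', 'build', 'error'], ['the', 'obscureword']]\n", "toy_vocab = {'the': 2, 'fix': 3, 'build': 4, 'error': 5}  # 0 is reserved for padding, 1 for rare words\n", "RARE_WORD_INDEX = 1\n", "\n", "toy_ids = [[toy_vocab.get(w, RARE_WORD_INDEX) for w in doc] for doc in toy_docs]\n", "# Pad (or truncate) every toy document to length 6, filling with 0\n", "print(pad_sequences(toy_ids, maxlen=6, padding='pre', truncating='pre', value=0))" ] },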
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "%%time\n", "# Clean, tokenize, and pad / truncate so that each document has length = 70.\n", "# Also, retain only the top 8,000 words in the vocabulary; the remaining words\n", "# are set to 1, which becomes the common index for rare words.\n", "body_pp = processor(keep_n=8000, padding_maxlen=70)\n", "train_body_vecs = body_pp.fit_transform(train_body_raw)" ] },
  { "cell_type": "markdown", "metadata": {}, "source": [ "#### Look at one example of processed issue bodies" ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "print('\\noriginal string:\\n', train_body_raw[0], '\\n')\n", "print('after pre-processing:\\n', train_body_vecs[0], '\\n')" ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Instantiate a text processor for the titles, with some different parameters:\n", "# append_indicators=True appends the tokens '_start_' and '_end_' to each document\n", "# padding='post' means that zero padding is appended to the end of the document\n", "# (as opposed to the default, which is 'pre')\n", "title_pp = processor(append_indicators=True, keep_n=4500,\n", "                     padding_maxlen=12, padding='post')\n", "\n", "# Process the title data\n", "train_title_vecs = title_pp.fit_transform(train_title_raw)" ] },
  { "cell_type": "markdown", "metadata": {}, "source": [ "#### Look at one example of processed issue titles" ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "print('\\noriginal string:\\n', train_title_raw[0])\n", "print('after pre-processing:\\n', train_title_vecs[0])" ] },
  { "cell_type": "markdown", "metadata": {}, "source": [ "Serialize the preprocessors and the processed data to disk for later use." ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import dill as dpickle\n", "import numpy as np\n", "\n", "# Save the preprocessors\n", "with open('body_pp.dpkl', 'wb') as f:\n", "    dpickle.dump(body_pp, f)\n", "\n", "with open('title_pp.dpkl', 'wb') as f:\n", "    dpickle.dump(title_pp, f)\n", "\n", "# Save the processed data\n", "np.save('train_title_vecs.npy', train_title_vecs)\n", "np.save('train_body_vecs.npy', train_body_vecs)" ] },
  { "cell_type": "markdown", "metadata": {}, "source": [ "# Define Model Architecture" ] },
  { "cell_type": "markdown", "metadata": {}, "source": [ "### Load the data from disk into variables" ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from seq2seq_utils import load_decoder_inputs, load_encoder_inputs, load_text_processor" ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "encoder_input_data, doc_length = load_encoder_inputs('train_body_vecs.npy')\n", "decoder_input_data, decoder_target_data = load_decoder_inputs('train_title_vecs.npy')" ] },
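  { "cell_type": "markdown", "metadata": {}, "source": [ "`decoder_input_data` and `decoder_target_data` both come from the same title vectors: with teacher forcing, the decoder is fed the title one step behind the token it is asked to predict. The minimal NumPy sketch below shows that convention on a toy sequence; it assumes `load_decoder_inputs` splits the sequences this way, which you can verify against the arrays loaded above." ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Sketch of the usual teacher-forcing split (assumed convention; verify against\n", "# decoder_input_data / decoder_target_data loaded above).\n", "import numpy as np\n", "\n", "toy_titles = np.array([[2, 7, 9, 3, 0, 0]])  # e.g. _start_, w1, w2, _end_, pad, pad\n", "toy_decoder_input = toy_titles[:, :-1]   # everything except the last position\n", "toy_decoder_target = toy_titles[:, 1:]   # the same sequence shifted left by one\n", "print(toy_decoder_input)\n", "print(toy_decoder_target)" ] },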
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "num_encoder_tokens, body_pp = load_text_processor('body_pp.dpkl')\n", "num_decoder_tokens, title_pp = load_text_processor('title_pp.dpkl')" ] },
  { "cell_type": "markdown", "metadata": {}, "source": [ "### Define Model Architecture" ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "%matplotlib inline\n", "from keras.models import Model\n", "from keras.layers import Input, LSTM, GRU, Dense, Embedding, Bidirectional, BatchNormalization\n", "from keras import optimizers" ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Arbitrarily set the latent dimension for the embeddings and hidden units\n", "latent_dim = 300\n", "\n", "##### Define Model Architecture ######\n", "\n", "########################\n", "#### Encoder Model ####\n", "encoder_inputs = Input(shape=(doc_length,), name='Encoder-Input')\n", "\n", "# Word embedding for the encoder (ex: Issue Body)\n", "x = Embedding(num_encoder_tokens, latent_dim, name='Body-Word-Embedding', mask_zero=False)(encoder_inputs)\n", "x = BatchNormalization(name='Encoder-Batchnorm-1')(x)\n", "\n", "# Intermediate GRU layer (optional)\n", "#x = GRU(latent_dim, name='Encoder-Intermediate-GRU', return_sequences=True)(x)\n", "#x = BatchNormalization(name='Encoder-Batchnorm-2')(x)\n", "\n", "# We do not need the encoder outputs, just the final hidden state.\n", "_, state_h = GRU(latent_dim, return_state=True, name='Encoder-Last-GRU')(x)\n", "\n", "# Encapsulate the encoder as a separate entity so we can just\n", "# encode without decoding if we want to.\n", "encoder_model = Model(inputs=encoder_inputs, outputs=state_h, name='Encoder-Model')\n", "\n", "seq2seq_encoder_out = encoder_model(encoder_inputs)\n", "\n", "########################\n", "#### Decoder Model ####\n", "decoder_inputs = Input(shape=(None,), name='Decoder-Input')  # for teacher forcing\n", "\n", "# Word embedding for the decoder (ex: Issue Titles)\n", "dec_emb = Embedding(num_decoder_tokens, latent_dim, name='Decoder-Word-Embedding', mask_zero=False)(decoder_inputs)\n", "dec_bn = BatchNormalization(name='Decoder-Batchnorm-1')(dec_emb)\n", "\n", "# Set up the decoder GRU, using the encoder's final hidden state as its initial state.\n", "decoder_gru = GRU(latent_dim, return_state=True, return_sequences=True, name='Decoder-GRU')\n", "decoder_gru_output, _ = decoder_gru(dec_bn, initial_state=seq2seq_encoder_out)\n", "x = BatchNormalization(name='Decoder-Batchnorm-2')(decoder_gru_output)\n", "\n", "# Dense layer for prediction\n", "decoder_dense = Dense(num_decoder_tokens, activation='softmax', name='Final-Output-Dense')\n", "decoder_outputs = decoder_dense(x)\n", "\n", "########################\n", "#### Seq2Seq Model ####\n", "\n", "seq2seq_Model = Model([encoder_inputs, decoder_inputs], decoder_outputs)\n", "\n", "seq2seq_Model.compile(optimizer=optimizers.Nadam(lr=0.001), loss='sparse_categorical_crossentropy')" ] },
  { "cell_type": "markdown", "metadata": {}, "source": [ "**Examine Model Architecture Summary**" ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from seq2seq_utils import viz_model_architecture\n", "seq2seq_Model.summary()\n", "viz_model_architecture(seq2seq_Model)" ] },
  { "cell_type": "markdown", "metadata": {}, "source": [ "# Train Model" ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from keras.callbacks import CSVLogger, ModelCheckpoint\n", "\n", "script_name_base = 'tutorial_seq2seq'\n", "csv_logger = CSVLogger('{:}.log'.format(script_name_base))\n", "model_checkpoint = ModelCheckpoint('{:}.epoch{{epoch:02d}}-val{{val_loss:.5f}}.hdf5'.format(script_name_base),\n", "                                   save_best_only=True)\n", "\n", "batch_size = 1200\n", "epochs = 7\n", "history = seq2seq_Model.fit([encoder_input_data, decoder_input_data], np.expand_dims(decoder_target_data, -1),\n", "                            batch_size=batch_size,\n", "                            epochs=epochs,\n", "                            validation_split=0.12, callbacks=[csv_logger, model_checkpoint])" ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Save the model\n", "seq2seq_Model.save('seq2seq_model_tutorial.h5')" ] },
  { "cell_type": "markdown", "metadata": {}, "source": [ "# See Example Results On Holdout Set\n", "\n", "It is useful to look at real predictions on a holdout set to get a qualitative sense of the model's performance. The model is also evaluated numerically in the following section." ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from seq2seq_utils import Seq2Seq_Inference\n", "seq2seq_inf = Seq2Seq_Inference(encoder_preprocessor=body_pp,\n", "                                decoder_preprocessor=title_pp,\n", "                                seq2seq_model=seq2seq_Model)" ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true, "scrolled": false }, "outputs": [], "source": [ "# This method displays predictions on random rows of the holdout set\n", "seq2seq_inf.demo_model_predictions(n=50, issue_df=testdf)" ] },
  { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "# Evaluate Model: BLEU Score\n", "\n", "For machine-translation tasks such as this one, it is common to measure the quality of results using the [BLEU Score](https://en.wikipedia.org/wiki/BLEU). The convenience function used below relies on [NLTK's corpus_bleu](https://www.nltk.org/api/nltk.translate.html#nltk.translate.bleu_score.corpus_bleu), and its output is the average of BLEU-1, BLEU-2, BLEU-3 and BLEU-4." ] },
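  { "cell_type": "markdown", "metadata": {}, "source": [ "As a rough illustration of what that average means, the sketch below computes BLEU-1 through BLEU-4 with NLTK's `corpus_bleu` on a single toy reference/hypothesis pair and averages them. The real computation happens inside `evaluate_model`, which may differ in details such as tokenization and smoothing." ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Toy illustration of averaging BLEU-1 through BLEU-4 with NLTK. The actual scoring\n", "# is done by seq2seq_inf.evaluate_model, which may differ in details.\n", "from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction\n", "\n", "references = [[['add', 'tests', 'for', 'the', 'api', 'server']]]  # one list of references per hypothesis\n", "hypotheses = [['add', 'tests', 'for', 'api', 'server']]\n", "smooth = SmoothingFunction().method1  # avoids zero scores on short toy sentences\n", "\n", "bleu_weights = [(1, 0, 0, 0), (0.5, 0.5, 0, 0), (1/3, 1/3, 1/3, 0), (0.25, 0.25, 0.25, 0.25)]\n", "scores = [corpus_bleu(references, hypotheses, weights=w, smoothing_function=smooth) for w in bleu_weights]\n", "print('BLEU 1-4:', scores)\n", "print('average:', sum(scores) / len(scores))" ] },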
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Convenience function that generates predictions on the holdout set and calculates the BLEU score\n", "bleu_score = seq2seq_inf.evaluate_model(holdout_bodies=testdf.body.tolist(),\n", "                                        holdout_titles=testdf.issue_title.tolist(),\n", "                                        max_len_title=12)" ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(f'BLEU Score (avg of BLEU 1-4) on Holdout Set: {bleu_score * 100}')" ] }
 ],
 "metadata": {
  "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" },
  "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.3" },
  "toc": { "nav_menu": { "height": "263px", "width": "352px" }, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": true, "toc_position": {}, "toc_section_display": true, "toc_window_display": false }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}