# Mirror of https://github.com/kubeflow/examples.git
import logging

import dill as dpickle
import numpy as np
from matplotlib import pyplot as plt
import tensorflow as tf
from IPython.display import SVG, display
from keras import backend as K
from keras.layers import Input
from keras.models import Model
from keras.utils.vis_utils import model_to_dot
from annoy import AnnoyIndex
from tqdm import tqdm, tqdm_notebook
from nltk.translate.bleu_score import corpus_bleu

def load_text_processor(fname='title_pp.dpkl'):
    """
    Load preprocessors from disk.

    Parameters
    ----------
    fname: str
        file name of ktext.processor object

    Returns
    -------
    num_tokens : int
        size of vocabulary loaded into ktext.processor
    pp : ktext.processor
        the processor you are trying to load

    Typical Usage:
    -------------

    num_decoder_tokens, title_pp = load_text_processor(fname='title_pp.dpkl')
    num_encoder_tokens, body_pp = load_text_processor(fname='body_pp.dpkl')
    """
    # Load files from disk
    with open(fname, 'rb') as f:
        pp = dpickle.load(f)

    num_tokens = max(pp.id2token.keys()) + 1
    print('Size of vocabulary for {}: {}'.format(fname, num_tokens))
    return num_tokens, pp

def load_decoder_inputs(decoder_np_vecs='train_title_vecs.npy'):
    """
    Load decoder inputs.

    Parameters
    ----------
    decoder_np_vecs : str
        filename of serialized numpy.array of decoder input (issue title)

    Returns
    -------
    decoder_input_data : numpy.array
        The data fed to the decoder as input during training for teacher forcing.
        This is the same as `decoder_np_vecs` except the last position.
    decoder_target_data : numpy.array
        The data that the decoder is trained to generate (issue title).
        Calculated by sliding `decoder_np_vecs` one position forward.
    """
    vectorized_title = np.load(decoder_np_vecs)
    # For the decoder input, you don't need the last word, as that is only a
    # prediction target when training with teacher forcing.
    decoder_input_data = vectorized_title[:, :-1]

    # The decoder target data is ahead of the decoder input data by one time
    # step (teacher forcing).
    decoder_target_data = vectorized_title[:, 1:]

    print('Shape of decoder input: {}'.format(decoder_input_data.shape))
    print('Shape of decoder target: {}'.format(decoder_target_data.shape))
    return decoder_input_data, decoder_target_data

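# Illustrative sketch (a hypothetical helper, not part of the original module):
# shows the one-position shift performed above, using a toy vectorized title.
def _demo_teacher_forcing_shift():
    toy_title = np.array([[2, 15, 27, 3, 0]])  # _start_, w1, w2, _end_, pad
    print(toy_title[:, :-1])  # decoder input:  [[ 2 15 27  3]]
    print(toy_title[:, 1:])   # decoder target: [[15 27  3  0]]
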
def load_encoder_inputs(encoder_np_vecs='train_body_vecs.npy'):
    """
    Load variables & data that are inputs to encoder.

    Parameters
    ----------
    encoder_np_vecs : str
        filename of serialized numpy.array of encoder input (issue body)

    Returns
    -------
    encoder_input_data : numpy.array
        The issue body. The shape of this array is (num_examples, doc_length).
    doc_length : int
        The standard document length of the input for the encoder after padding.
    """
    vectorized_body = np.load(encoder_np_vecs)
    # Encoder input is simply the body of the issue text
    encoder_input_data = vectorized_body
    doc_length = encoder_input_data.shape[1]
    print('Shape of encoder input: {}'.format(encoder_input_data.shape))
    return encoder_input_data, doc_length

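# Typical wiring of the three loaders above when training the seq2seq model
# (a sketch; the file names are the defaults above, and `seq2seq_Model` is
# assumed to be built and compiled elsewhere with sparse categorical
# cross-entropy):
#
#   encoder_input_data, doc_length = load_encoder_inputs('train_body_vecs.npy')
#   decoder_input_data, decoder_target_data = load_decoder_inputs('train_title_vecs.npy')
#   num_encoder_tokens, body_pp = load_text_processor('body_pp.dpkl')
#   num_decoder_tokens, title_pp = load_text_processor('title_pp.dpkl')
#   seq2seq_Model.fit([encoder_input_data, decoder_input_data],
#                     np.expand_dims(decoder_target_data, -1))
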
def viz_model_architecture(model):
    """Visualize model architecture in Jupyter notebook."""
    display(SVG(model_to_dot(model).create(prog='dot', format='svg')))

def free_gpu_mem():
    """Attempt to free gpu memory."""
    K.get_session().close()
    cfg = K.tf.ConfigProto()
    cfg.gpu_options.allow_growth = True
    K.set_session(K.tf.Session(config=cfg))

def test_gpu():
    """Run a toy computation task in tensorflow to test GPU."""
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    session = tf.Session(config=config)
    hello = tf.constant('Hello, TensorFlow!')
    print(session.run(hello))

def plot_model_training_history(history_object):
    """Plots model train vs. validation loss."""
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.plot(history_object.history['loss'])
    plt.plot(history_object.history['val_loss'])
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()

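# Usage sketch: pass the History object returned by Keras Model.fit, e.g.
#
#   history = seq2seq_Model.fit(..., validation_split=0.12)
#   plot_model_training_history(history)
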
def extract_encoder_model(model):
    """
    Extract the encoder from the original Sequence to Sequence Model.

    Returns a keras model object that has one input (body of issue) and one
    output (encoding of issue, which is the last hidden state).

    Input:
    -----
    model: keras model object

    Returns:
    -----
    keras model object
    """
    encoder_model = model.get_layer('Encoder-Model')
    return encoder_model

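# Usage sketch (assumes the trained model contains a nested layer named
# 'Encoder-Model', as the function above requires):
#
#   encoder_model = extract_encoder_model(seq2seq_Model)
#   body_encoding = encoder_model.predict(encoder_input_data)
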
def extract_decoder_model(model):
    """
    Extract the decoder from the original model.

    Inputs:
    ------
    model: keras model object

    Returns:
    -------
    A Keras model object with the following inputs and outputs:

    Inputs of Keras Model That Is Returned:
    1: the embedding index for the last predicted word or the <Start> indicator
    2: the last hidden state, or in the case of the first word the hidden state from the encoder

    Outputs of Keras Model That Is Returned:
    1. Prediction (class probabilities) for the next word
    2. The hidden state of the decoder, to be fed back into the decoder at the next time step

    Implementation Notes:
    ----------------------
    We must extract the relevant layers and reconstruct part of the
    computation graph to allow for different inputs, because we are not going
    to use teacher forcing at inference time.
    """
    # The latent dimension is the same throughout the architecture, so we
    # cheat and grab it from the embedding layer, since that matches what is
    # output from the decoder.
    latent_dim = model.get_layer('Decoder-Word-Embedding').output_shape[-1]

    # Reconstruct the input into the decoder
    decoder_inputs = model.get_layer('Decoder-Input').input
    dec_emb = model.get_layer('Decoder-Word-Embedding')(decoder_inputs)
    dec_bn = model.get_layer('Decoder-Batchnorm-1')(dec_emb)

    # Instead of setting the initial state from the encoder and forgetting
    # about it, at inference time we are not doing teacher forcing, so we need
    # a feedback loop from predictions back into the GRU. We define this input
    # layer for the state to add that capability.
    gru_inference_state_input = Input(shape=(latent_dim,), name='hidden_state_input')

    # We fetch the trained GRU layer so that we can reuse its weights.
    # If you inspect the decoder GRU that we created for training, it takes
    # 2 tensors as input -> (1) the embedding layer output for teacher forcing
    #                           (which will now be the last step's prediction,
    #                           and will be _start_ on the first time step),
    #                       (2) the state, which we initialize with the
    #                           encoder output on the first time step, but
    #                           then grab after each prediction and feed back
    #                           in again.
    gru_out, gru_state_out = model.get_layer('Decoder-GRU')([dec_bn, gru_inference_state_input])

    # Reconstruct dense layers
    dec_bn2 = model.get_layer('Decoder-Batchnorm-2')(gru_out)
    dense_out = model.get_layer('Final-Output-Dense')(dec_bn2)
    decoder_model = Model([decoder_inputs, gru_inference_state_input],
                          [dense_out, gru_state_out])
    return decoder_model

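# One greedy decoding step with the extracted decoder (a sketch; `title_pp`
# and `seq2seq_Model` are assumed to come from training, and `body_encoding`
# from the extracted encoder):
#
#   decoder_model = extract_decoder_model(seq2seq_Model)
#   start_id = np.array(title_pp.token2id['_start_']).reshape(1, 1)
#   preds, state = decoder_model.predict([start_id, body_encoding])
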
class Seq2Seq_Inference(object):
    # pylint: disable=too-many-instance-attributes

    def __init__(self,
                 encoder_preprocessor,
                 decoder_preprocessor,
                 seq2seq_model):

        self.pp_body = encoder_preprocessor
        self.pp_title = decoder_preprocessor
        self.seq2seq_model = seq2seq_model
        self.encoder_model = extract_encoder_model(seq2seq_model)
        self.decoder_model = extract_decoder_model(seq2seq_model)
        self.default_max_len_title = self.pp_title.padding_maxlen
        self.nn = None
        self.rec_df = None

    def generate_issue_title(self,
                             raw_input_text,
                             max_len_title=None):
        """
        Use the seq2seq model to generate a title given the body of an issue.

        Inputs
        ------
        raw_input_text: str
            The body of the issue text as an input string

        max_len_title: int (optional)
            The maximum length of the title the model will generate
        """
        if max_len_title is None:
            max_len_title = self.default_max_len_title
        # get the encoder's features for the decoder
        raw_tokenized = self.pp_body.transform([raw_input_text])
        body_encoding = self.encoder_model.predict(raw_tokenized)
        # We want to save the encoder's embedding before it is updated by the
        # decoder, because we can use it as an embedding for other tasks.
        original_body_encoding = body_encoding
        state_value = np.array(self.pp_title.token2id['_start_']).reshape(1, 1)

        decoded_sentence = []
        stop_condition = False
        while not stop_condition:
            preds, st = self.decoder_model.predict([state_value, body_encoding])

            # We ignore index 0 (padding) and index 1 (unknown).
            # Argmax over the remaining columns returns the integer index of
            # the prediction + 2, because we chopped off the first two.
            pred_idx = np.argmax(preds[:, :, 2:]) + 2

            # retrieve word from index prediction
            pred_word_str = self.pp_title.id2token[pred_idx]

            if pred_word_str == '_end_' or len(decoded_sentence) >= max_len_title:
                stop_condition = True
                break
            decoded_sentence.append(pred_word_str)

            # update the decoder for the next word
            body_encoding = st
            state_value = np.array(pred_idx).reshape(1, 1)

        return original_body_encoding, ' '.join(decoded_sentence)

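    # Usage sketch (assumes preprocessors and a trained model as produced by
    # this repo's training notebook):
    #
    #   seq2seq_inf = Seq2Seq_Inference(encoder_preprocessor=body_pp,
    #                                   decoder_preprocessor=title_pp,
    #                                   seq2seq_model=seq2seq_Model)
    #   emb, title = seq2seq_inf.generate_issue_title('Error when training the model ...')
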
    def print_example(self,
                      i,
                      body_text,
                      title_text,
                      url,
                      threshold):
        """
        Prints an example of the model's prediction for manual inspection.
        """
        if i:
            print('\n\n==============================================')
            print('============== Example # {} =================\n'.format(i))

        if url:
            print(url)

        print("Issue Body:\n {} \n".format(body_text))

        if title_text:
            print("Original Title:\n {}".format(title_text))

        emb, gen_title = self.generate_issue_title(body_text)
        print("\n****** Machine Generated Title (Prediction) ******:\n {}".format(gen_title))

        if self.nn:
            # return neighbors and distances
            n, d = self.nn.get_nns_by_vector(emb.flatten(), n=4,
                                             include_distances=True)
            neighbors = n[1:]
            dist = d[1:]

            if min(dist) <= threshold:
                cols = ['issue_url', 'issue_title', 'body']
                dfcopy = self.rec_df.iloc[neighbors][cols].copy(deep=True)
                dfcopy['dist'] = dist
                similar_issues_df = dfcopy.query('dist <= {}'.format(threshold))

                print("\n**** Similar Issues (using encoder embedding) ****:\n")
                display(similar_issues_df)

    def demo_model_predictions(self,
                               n,
                               issue_df,
                               threshold=1):
        """
        Pick n random Issues and display predictions.

        Input:
        ------
        n : int
            Number of issues to display from issue_df
        issue_df : pandas DataFrame
            DataFrame that contains two columns: `body` and `issue_title`.
        threshold : float
            Distance threshold for recommendation of similar issues.

        Returns:
        --------
        None
            Prints the original issue body and the model's prediction.
        """
        # Extract body and title from DF
        body_text = issue_df.body.tolist()
        title_text = issue_df.issue_title.tolist()
        url = issue_df.issue_url.tolist()

        demo_list = np.random.randint(low=1, high=len(body_text), size=n)
        for i in demo_list:
            self.print_example(i,
                               body_text=body_text[i],
                               title_text=title_text[i],
                               url=url[i],
                               threshold=threshold)

    def prepare_recommender(self, vectorized_array, original_df):
        """
        Use the annoy library to build a recommender.

        Parameters
        ----------
        vectorized_array : List[List[int]]
            The list of lists of integers that represents your corpus, as fed
            into the seq2seq model for training.
        original_df : pandas.DataFrame
            The original dataframe that has the columns
            ['issue_url', 'issue_title', 'body']

        Returns
        -------
        annoy.AnnoyIndex object (see https://github.com/spotify/annoy)
        """
        self.rec_df = original_df
        emb = self.encoder_model.predict(x=vectorized_array,
                                         batch_size=vectorized_array.shape[0]//200)

        f = emb.shape[1]
        self.nn = AnnoyIndex(f)
        logging.warning('Adding embeddings')
        for i in tqdm(range(len(emb))):
            self.nn.add_item(i, emb[i])
        logging.warning('Building trees for similarity lookup.')
        self.nn.build(50)
        return self.nn

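    # Usage sketch: build the index once over the training corpus, after
    # which print_example / demo_model_predictions will surface similar
    # issues (`encoder_input_data` and `issues_df` are assumed to come from
    # the data-preparation step):
    #
    #   seq2seq_inf.prepare_recommender(encoder_input_data, issues_df)
    #   seq2seq_inf.demo_model_predictions(n=5, issue_df=test_df, threshold=1)
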
    def set_recsys_data(self, original_df):
        self.rec_df = original_df

    def set_recsys_annoyobj(self, annoyobj):
        self.nn = annoyobj

    def evaluate_model(self, holdout_bodies, holdout_titles):
        """
        Method for calculating BLEU Score.

        Parameters
        ----------
        holdout_bodies : List[str]
            These are the issue bodies that we want to summarize
        holdout_titles : List[str]
            This is the ground truth we are trying to predict --> issue titles

        Returns
        -------
        bleu : float
            The BLEU Score
        """
        actual, predicted = list(), list()
        assert len(holdout_bodies) == len(holdout_titles)
        num_examples = len(holdout_bodies)

        logging.warning('Generating predictions.')
        # step over the whole set  TODO: parallelize this
        for i in tqdm_notebook(range(num_examples)):
            _, yhat = self.generate_issue_title(holdout_bodies[i])

            actual.append(self.pp_title.process_text([holdout_titles[i]])[0])
            predicted.append(self.pp_title.process_text([yhat])[0])

        # Calculate BLEU score. Be careful with the nltk api for corpus_bleu:
        # it expects List[List[List[str]]] for the ground truth; passing
        # List[List[str]] will give you erroneous results.
        logging.warning('Calculating BLEU.')
        bleu = corpus_bleu([[a] for a in actual], predicted)
        return bleu
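    # Usage sketch for evaluation on a holdout set (lists of raw strings;
    # `holdout_df` is assumed to have `body` and `issue_title` columns):
    #
    #   bleu = seq2seq_inf.evaluate_model(holdout_df.body.tolist(),
    #                                     holdout_df.issue_title.tolist())
    #   print('BLEU: {}'.format(bleu))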