Add .pylintrc (#61)

* Add .pylintrc

* Resolve lint complaints in agents/trainer/task.py

* Resolve lint complaints in the Flask app.py

* Resolve linting issues

Remove duplicate seq2seq_utils.py from workflow/workspace/src

* Use Python 3.5.2 with pylint to match Prow

Put the pybullet import back into agents/trainer/task.py with a pylint ignore statement.
Use main(_) to ensure it works with tf.app.run.
Commit 41372c9314 (parent 1d6946ead8)
Michelle Casbon, 2018-03-29 08:25:02 -07:00, committed by k8s-ci-robot
12 changed files with 841 additions and 821 deletions
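
A note on the last commit-message point: with tf.app.run, TensorFlow parses flags and then invokes main(argv), so main must accept one argument; naming it _ marks it as deliberately unused and keeps pylint quiet. A minimal sketch of the pattern (TF 1.x API; the flag below is illustrative, not one of the trainer's real flags):

import tensorflow as tf

FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string('logdir', '/tmp/logs', 'Where to write logs.')  # illustrative flag

def main(_):  # tf.app.run passes argv here; "_" signals it is intentionally unused
  tf.logging.set_verbosity(tf.logging.INFO)
  tf.logging.info('Logging to %s', FLAGS.logdir)

if __name__ == '__main__':
  tf.app.run()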

.pylintrc (new file, 399 lines)
@@ -0,0 +1,399 @@
[MASTER]
# Specify a configuration file.
#rcfile=
# Python code to execute, usually for sys.path manipulation such as
# pygtk.require().
#init-hook=
# Add files or directories to the blacklist. They should be base names, not
# paths.
ignore=third_party
# Add files or directories matching the regex patterns to the blacklist. The
# regex matches against base names, not paths.
ignore-patterns=
# Pickle collected data for later comparisons.
persistent=no
# List of plugins (as comma separated values of python modules names) to load,
# usually to register additional checkers.
load-plugins=
# Use multiple processes to speed up Pylint.
jobs=4
# Allow loading of arbitrary C extensions. Extensions are imported into the
# active Python interpreter and may run arbitrary code.
unsafe-load-any-extension=no
# A comma-separated list of package or module names from where C extensions may
# be loaded. Extensions are loaded into the active Python interpreter and may
# run arbitrary code.
extension-pkg-whitelist=
[MESSAGES CONTROL]
# Only show warnings with the listed confidence levels. Leave empty to show
# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED
confidence=
# Enable the message, report, category or checker with the given id(s). You can
# either give multiple identifiers separated by comma (,) or put this option
# multiple times (only on the command line, not in the configuration file where
# it should appear only once). See also the "--disable" option for examples.
#enable=
# Disable the message, report, category or checker with the given id(s). You
# can either give multiple identifiers separated by comma (,) or put this
# option multiple times (only on the command line, not in the configuration
# file where it should appear only once). You can also use "--disable=all" to
# disable everything first and then reenable specific checks. For example, if
# you want to run only the similarities checker, you can use "--disable=all
# --enable=similarities". If you want to run only the classes checker, but have
# no Warning level messages displayed, use "--disable=all --enable=classes
# --disable=W"
disable=import-star-module-level,old-octal-literal,oct-method,print-statement,unpacking-in-except,parameter-unpacking,backtick,old-raise-syntax,old-ne-operator,long-suffix,dict-view-method,dict-iter-method,metaclass-assignment,next-method-called,raising-string,indexing-exception,raw_input-builtin,long-builtin,file-builtin,execfile-builtin,coerce-builtin,cmp-builtin,buffer-builtin,basestring-builtin,apply-builtin,filter-builtin-not-iterating,using-cmp-argument,useless-suppression,range-builtin-not-iterating,suppressed-message,missing-docstring,no-absolute-import,old-division,cmp-method,reload-builtin,zip-builtin-not-iterating,intern-builtin,unichr-builtin,reduce-builtin,standarderror-builtin,unicode-builtin,xrange-builtin,coerce-method,delslice-method,getslice-method,setslice-method,input-builtin,round-builtin,hex-method,nonzero-method,map-builtin-not-iterating,relative-import,invalid-name,bad-continuation,no-member,locally-disabled,fixme,import-error,too-many-locals
[REPORTS]
# Set the output format. Available formats are text, parseable, colorized, msvs
# (visual studio) and html. You can also give a reporter class, e.g.
# mypackage.mymodule.MyReporterClass.
output-format=text
# Put messages in a separate file for each module / package specified on the
# command line instead of printing them on stdout. Reports (if any) will be
# written in a file name "pylint_global.[txt|html]". This option is deprecated
# and it will be removed in Pylint 2.0.
files-output=no
# Tells whether to display a full report or only the messages
reports=no
# Python expression which should return a note less than 10 (10 is the highest
# note). You have access to the variables error, warning, refactor, convention,
# and statement, which respectively contain the number of messages in each
# category and the total number of statements analyzed. This is used by the
# global evaluation report (RP0004).
evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
# Template used to display messages. This is a python new-style format string
# used to format the message information. See doc for all details
#msg-template=
[BASIC]
# Good variable names which should always be accepted, separated by a comma
good-names=i,j,k,ex,Run,_
# Bad variable names which should always be refused, separated by a comma
bad-names=foo,bar,baz,toto,tutu,tata
# Colon-delimited sets of names that determine each other's naming style when
# the name regexes allow several styles.
name-group=
# Include a hint for the correct naming format with invalid-name
include-naming-hint=no
# List of decorators that produce properties, such as abc.abstractproperty. Add
# to this list to register other decorators that produce valid properties.
property-classes=abc.abstractproperty
# Regular expression matching correct function names
function-rgx=[a-z_][a-z0-9_]{2,30}$
# Naming hint for function names
function-name-hint=[a-z_][a-z0-9_]{2,30}$
# Regular expression matching correct variable names
variable-rgx=[a-z_][a-z0-9_]{2,30}$
# Naming hint for variable names
variable-name-hint=[a-z_][a-z0-9_]{2,30}$
# Regular expression matching correct constant names
const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$
# Naming hint for constant names
const-name-hint=(([A-Z_][A-Z0-9_]*)|(__.*__))$
# Regular expression matching correct attribute names
attr-rgx=[a-z_][a-z0-9_]{2,30}$
# Naming hint for attribute names
attr-name-hint=[a-z_][a-z0-9_]{2,30}$
# Regular expression matching correct argument names
argument-rgx=[a-z_][a-z0-9_]{2,30}$
# Naming hint for argument names
argument-name-hint=[a-z_][a-z0-9_]{2,30}$
# Regular expression matching correct class attribute names
class-attribute-rgx=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$
# Naming hint for class attribute names
class-attribute-name-hint=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$
# Regular expression matching correct inline iteration names
inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$
# Naming hint for inline iteration names
inlinevar-name-hint=[A-Za-z_][A-Za-z0-9_]*$
# Regular expression matching correct class names
class-rgx=[A-Z_][a-zA-Z0-9]+$
# Naming hint for class names
class-name-hint=[A-Z_][a-zA-Z0-9]+$
# Regular expression matching correct module names
module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$
# Naming hint for module names
module-name-hint=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$
# Regular expression matching correct method names
method-rgx=[a-z_][a-z0-9_]{2,30}$
# Naming hint for method names
method-name-hint=[a-z_][a-z0-9_]{2,30}$
# Regular expression which should only match function or class names that do
# not require a docstring.
no-docstring-rgx=^_
# Minimum line length for functions/classes that require docstrings, shorter
# ones are exempt.
docstring-min-length=-1
[ELIF]
# Maximum number of nested blocks for function / method body
max-nested-blocks=5
[TYPECHECK]
# Tells whether missing members accessed in mixin class should be ignored. A
# mixin class is detected if its name ends with "mixin" (case insensitive).
ignore-mixin-members=yes
# List of module names for which member attributes should not be checked
# (useful for modules/projects where namespaces are manipulated during runtime
# and thus existing member attributes cannot be deduced by static analysis).
# It supports qualified module names, as well as Unix pattern matching.
ignored-modules=
# List of class names for which member attributes should not be checked (useful
# for classes with dynamically set attributes). This supports the use of
# qualified names.
ignored-classes=optparse.Values,thread._local,_thread._local
# List of members which are set dynamically and missed by pylint inference
# system, and so shouldn't trigger E1101 when accessed. Python regular
# expressions are accepted.
generated-members=
# List of decorators that produce context managers, such as
# contextlib.contextmanager. Add to this list to register other decorators that
# produce valid context managers.
contextmanager-decorators=contextlib.contextmanager
[FORMAT]
# Maximum number of characters on a single line.
max-line-length=100
# Regexp for a line that is allowed to be longer than the limit.
ignore-long-lines=^\s*(# )?<?https?://\S+>?$
# Allow the body of an if to be on the same line as the test if there is no
# else.
single-line-if-stmt=no
# List of optional constructs for which whitespace checking is disabled. `dict-
# separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}.
# `trailing-comma` allows a space between comma and closing bracket: (a, ).
# `empty-line` allows space-only lines.
no-space-check=trailing-comma,dict-separator
# Maximum number of lines in a module
max-module-lines=1000
# String used as indentation unit. This is usually "    " (4 spaces) or "\t" (1
# tab).
# Use 2 spaces, consistent with TensorFlow style.
indent-string='  '
# Number of spaces of indent required inside a hanging or continued line.
indent-after-paren=4
# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
expected-line-ending-format=
[MISCELLANEOUS]
# List of note tags to take in consideration, separated by a comma.
notes=FIXME,XXX,TODO
[VARIABLES]
# Tells whether we should check for unused import in __init__ files.
init-import=no
# A regular expression matching the name of dummy variables (i.e. expectedly
# not used).
dummy-variables-rgx=(_+[a-zA-Z0-9]*?$)|dummy
# List of additional names supposed to be defined in builtins. Remember that
# you should avoid defining new builtins when possible.
additional-builtins=
# List of strings which can identify a callback function by name. A callback
# name must start or end with one of those strings.
callbacks=cb_,_cb
# List of qualified module names which can have objects that can redefine
# builtins.
redefining-builtins-modules=six.moves,future.builtins
[LOGGING]
# Logging modules to check that the string format arguments are in logging
# function parameter format
logging-modules=logging
[SIMILARITIES]
# Minimum lines number of a similarity.
min-similarity-lines=4
# Ignore comments when computing similarities.
ignore-comments=yes
# Ignore docstrings when computing similarities.
ignore-docstrings=yes
# Ignore imports when computing similarities.
ignore-imports=no
[SPELLING]
# Spelling dictionary name. Available dictionaries: none. To make it work,
# install the python-enchant package.
spelling-dict=
# List of comma separated words that should not be checked.
spelling-ignore-words=
# A path to a file that contains private dictionary; one word per line.
spelling-private-dict-file=
# Tells whether to store unknown words to indicated private dictionary in
# --spelling-private-dict-file option instead of raising a message.
spelling-store-unknown-words=no
[IMPORTS]
# Deprecated modules which should not be used, separated by a comma
deprecated-modules=regsub,TERMIOS,Bastion,rexec
# Create a graph of all (i.e. internal and external) dependencies in the
# given file (report RP0402 must not be disabled)
import-graph=
# Create a graph of external dependencies in the given file (report RP0402 must
# not be disabled)
ext-import-graph=
# Create a graph of internal dependencies in the given file (report RP0402 must
# not be disabled)
int-import-graph=
# Force import order to recognize a module as part of the standard
# compatibility libraries.
known-standard-library=
# Force import order to recognize a module as part of a third party library.
known-third-party=enchant
# Analyse import fallback blocks. This can be used to support both Python 2 and
# 3 compatible code, which means that the block might have code that exists
# only in one or another interpreter, leading to false positives when analysed.
analyse-fallback-blocks=no
[DESIGN]
# Maximum number of arguments for function / method
max-args=7
# Argument names that match this expression will be ignored. Default to name
# with leading underscore
ignored-argument-names=_.*
# Maximum number of locals for function / method body
max-locals=15
# Maximum number of return / yield for function / method body
max-returns=6
# Maximum number of branch for function / method body
max-branches=12
# Maximum number of statements in function / method body
max-statements=50
# Maximum number of parents for a class (see R0901).
max-parents=7
# Maximum number of attributes for a class (see R0902).
max-attributes=7
# Minimum number of public methods for a class (see R0903).
min-public-methods=0
# Maximum number of public methods for a class (see R0904).
max-public-methods=20
# Maximum number of boolean expressions in a if statement
max-bool-expr=5
[CLASSES]
# List of method names used to declare (i.e. assign) instance attributes.
defining-attr-methods=__init__,__new__,setUp
# List of valid names for the first argument in a class method.
valid-classmethod-first-arg=cls
# List of valid names for the first argument in a metaclass class method.
valid-metaclass-classmethod-first-arg=mcs
# List of member names, which should be excluded from the protected access
# warning.
exclude-protected=_asdict,_fields,_replace,_source,_make
[EXCEPTIONS]
# Exceptions that will emit a warning when being caught. Defaults to
# "Exception"
overgeneral-exceptions=Exception
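
The commit message above ties this rcfile to the Prow lint job; as a sketch, the same checks can be run locally against one of the files in this commit. The CLI form is `pylint --rcfile=.pylintrc <files>`; Run() is pylint's equivalent Python entry point and, by default, exits with pylint's status code:

from pylint.lint import Run

Run(['--rcfile=.pylintrc', 'agents/trainer/task.py'])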

agents/trainer/task.py

@@ -12,18 +12,18 @@
 """Provides an entrypoint for the training task."""
+#pylint: disable=unused-import
 from __future__ import absolute_import, division, print_function
+import argparse
 import datetime
 import logging
 import os
 import pprint
 import uuid
-import pip
-import tensorflow as tf
 from google.cloud import storage
+import tensorflow as tf
 import agents
 import pybullet_envs  # To make AntBulletEnv-v0 available.
@@ -113,39 +113,39 @@ def hparams_base():
   """Base hparams tf/Agents PPO """
   # General
-  algorithm = agents.ppo.PPOAlgorithm
-  num_agents = 30
-  eval_episodes = 30
-  use_gpu = False
+  # algorithm = agents.ppo.PPOAlgorithm
+  # num_agents = 30
+  # eval_episodes = 30
+  # use_gpu = False
   # Environment
-  env = 'KukaBulletEnv-v0'
-  normalize_ranges = True
-  max_length = 1000
+  # env = 'KukaBulletEnv-v0'
+  # normalize_ranges = True
+  # max_length = 1000
   # Network
-  network = agents.scripts.networks.feed_forward_gaussian
-  weight_summaries = dict(
-      all=r'.*', policy=r'.*/policy/.*', value=r'.*/value/.*')
-  policy_layers = 200, 100
-  value_layers = 200, 100
-  init_output_factor = 0.1
-  init_logstd = -1
-  init_std = 0.35
+  # network = agents.scripts.networks.feed_forward_gaussian
+  # weight_summaries = dict(
+  #     all=r'.*', policy=r'.*/policy/.*', value=r'.*/value/.*')
+  # policy_layers = 200, 100
+  # value_layers = 200, 100
+  # init_output_factor = 0.1
+  # init_logstd = -1
+  # init_std = 0.35
   # Optimization
-  update_every = 60
-  update_epochs = 25
-  optimizer = tf.train.AdamOptimizer
-  learning_rate = 1e-4
-  steps = 3e7  # 30M
+  # update_every = 60
+  # update_epochs = 25
+  # optimizer = tf.train.AdamOptimizer
+  # learning_rate = 1e-4
+  # steps = 3e7  # 30M
   # Losses
-  discount = 0.995
-  kl_target = 1e-2
-  kl_cutoff_factor = 2
-  kl_cutoff_coef = 1000
-  kl_init_penalty = 1
+  # discount = 0.995
+  # kl_target = 1e-2
+  # kl_cutoff_factor = 2
+  # kl_cutoff_coef = 1000
+  # kl_init_penalty = 1
   return locals()
@@ -158,9 +158,9 @@ def _object_import_from_string(name):
   return mod

-def _realize_import_attrs(d, filter):
+def _realize_import_attrs(d, hparam_filter):
   for k, v in d.items():
-    if k in filter:
+    if k in hparam_filter:
       imported = _object_import_from_string(v)
       # TODO: Provide an appropriately informative error if the import fails
       # except ImportError as e:
@@ -170,7 +170,7 @@ def _realize_import_attrs(d, filter):
   return d

-def _get_agents_configuration(hparam_set_name, log_dir=None, is_chief=False):
+def _get_agents_configuration(log_dir=None):
   """Load hyperparameter config."""
   try:
     # Try to resume training.
@@ -243,23 +243,20 @@ def gcs_upload(local_dir, gcs_out_dir):
       blob.upload_from_filename(local_file_path)

-def main(unused_argv):
+def main(_):
   """Run training."""
   tf.logging.set_verbosity(tf.logging.INFO)
   if FLAGS.debug:
     tf.logging.set_verbosity(tf.logging.DEBUG)
-  run_config = tf.contrib.learn.RunConfig()
   log_dir = FLAGS.logdir
-  agents_config = _get_agents_configuration(
-      FLAGS.hparam_set_id, log_dir, run_config.is_chief)
+  agents_config = _get_agents_configuration(log_dir)
   if FLAGS.run_mode == 'train':
     for score in agents.scripts.train.train(agents_config, env_processes=True):
-      logging.info('Score {}.'.format(score))
+      logging.info('Score %s.', score)
   if FLAGS.run_mode == 'render':
     now = datetime.datetime.now()
     subdir = now.strftime("%m%d-%H%M") + "-" + uuid.uuid4().hex[0:4]
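
The logging change in the last hunk is the fix pylint's logging checks push for (the rcfile above sets logging-modules=logging): pass arguments to the logging call instead of pre-formatting the string, so interpolation only happens if the record is actually emitted. A minimal before/after:

import logging

score = 0.97
logging.info('Score {}.'.format(score))  # flagged: the string is formatted eagerly
logging.info('Score %s.', score)         # preferred: lazy %-style interpolation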

app.py (Flask frontend)

@@ -2,38 +2,47 @@
 Simple app that parses predictions from a trained model and displays them.
 """
-from flask import Flask, json, render_template, request
 import requests
+from flask import Flask, json, render_template, request

-app = Flask(__name__)
+APP = Flask(__name__)

-@app.route("/")
+@APP.route("/")
 def index():
+  """Default route.
+
+  Placeholder, does nothing.
+  """
   return render_template("index.html")

-@app.route("/summary", methods=['GET', 'POST'])
+@APP.route("/summary", methods=['GET', 'POST'])
 def summary():
+  """Main prediction route.
+
+  Provides a machine-generated summary of the given text. Sends a request to a live
+  model trained on GitHub issues.
+  """
   if request.method == 'POST':
     issue_text = request.form["issue_text"]
     url = "http://ambassador:80/seldon/issue-summarization/api/v0.1/predictions"
-    headers = { 'content-type': 'application/json' }
+    headers = {'content-type': 'application/json'}
     json_data = {
       "data" : {
-        "ndarray" : [[ issue_text ]]
+        "ndarray" : [[issue_text]]
       }
     }
-    r = requests.post(url = url,
-                      headers = headers,
-                      data = json.dumps(json_data))
-    rjs = json.loads(r.text)
-    summary = rjs["data"]["ndarray"][0][0]
-    return render_template("summary.html",
-                           issue_text = issue_text,
-                           summary = summary)
+    response = requests.post(url=url,
+                             headers=headers,
+                             data=json.dumps(json_data))
+    response_json = json.loads(response.text)
+    issue_summary = response_json["data"]["ndarray"][0][0]
+    return render_template("issue_summary.html",
+                           issue_text=issue_text,
+                           issue_summary=issue_summary)
+  return ('', 204)

 if __name__ == '__main__':
-  app.run(debug = True, host = '0.0.0.0', port = 80)
+  APP.run(debug=True, host='0.0.0.0', port=80)
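
A quick, hypothetical way to exercise both routes above without a live Seldon endpoint, using Flask's built-in test client (assumes the module is importable as app and the templates are present):

from app import APP

client = APP.test_client()
print(client.get('/').status_code)         # 200: renders index.html
print(client.get('/summary').status_code)  # 204: the new empty response for non-POST requests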

IssueSummarization.py (old version, deleted)

@@ -1,22 +0,0 @@
from __future__ import print_function

import dill as dpickle
import numpy as np
from keras.models import load_model
from seq2seq_utils import Seq2Seq_Inference

class IssueSummarization(object):

  def __init__(self):
    with open('body_pp.dpkl', 'rb') as f:
      body_pp = dpickle.load(f)
    with open('title_pp.dpkl', 'rb') as f:
      title_pp = dpickle.load(f)
    self.model = Seq2Seq_Inference(encoder_preprocessor=body_pp,
                                   decoder_preprocessor=title_pp,
                                   seq2seq_model=load_model('seq2seq_model_tutorial.h5'))

  def predict(self, X, feature_names):
    return np.asarray([[self.model.generate_issue_title(body[0])[1]] for body in X])

IssueSummarization.py (new version)

@@ -0,0 +1,25 @@
"""Generates predictions using a stored model.

Uses trained model files to generate a prediction.
"""
from __future__ import print_function

import numpy as np
import dill as dpickle
from keras.models import load_model
from seq2seq_utils import Seq2Seq_Inference

class IssueSummarization(object):

  def __init__(self):
    with open('body_pp.dpkl', 'rb') as body_file:
      body_pp = dpickle.load(body_file)
    with open('title_pp.dpkl', 'rb') as title_file:
      title_pp = dpickle.load(title_file)
    self.model = Seq2Seq_Inference(encoder_preprocessor=body_pp,
                                   decoder_preprocessor=title_pp,
                                   seq2seq_model=load_model('seq2seq_model_tutorial.h5'))

  def predict(self, input_text):
    return np.asarray([[self.model.generate_issue_title(body[0])[1]] for body in input_text])
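
A hypothetical smoke test for the rewritten class, assuming the module is saved as IssueSummarization.py and that body_pp.dpkl, title_pp.dpkl, and seq2seq_model_tutorial.h5 sit in the working directory:

from IssueSummarization import IssueSummarization

summarizer = IssueSummarization()
# predict takes a nested array of issue bodies and returns nested titles
print(summarizer.predict([['app crashes when clicking the save button']]))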

seq2seq_utils.py

@@ -1,429 +1,432 @@
+import logging
+import dill as dpickle
+import numpy as np
 from matplotlib import pyplot as plt
 import tensorflow as tf
+from IPython.display import SVG, display
 from keras import backend as K
 from keras.layers import Input
 from keras.models import Model
-from IPython.display import SVG, display
 from keras.utils.vis_utils import model_to_dot
-import logging
-import numpy as np
-import dill as dpickle
 from annoy import AnnoyIndex
 from tqdm import tqdm, tqdm_notebook
-from random import random
 from nltk.translate.bleu_score import corpus_bleu

 def load_text_processor(fname='title_pp.dpkl'):
   """
   Load preprocessors from disk.
   Parameters
   ----------
   fname: str
     file name of ktext.proccessor object
   Returns
   -------
   num_tokens : int
     size of vocabulary loaded into ktext.processor
   pp : ktext.processor
     the processor you are trying to load
   Typical Usage:
   -------------
   num_decoder_tokens, title_pp = load_text_processor(fname='title_pp.dpkl')
   num_encoder_tokens, body_pp = load_text_processor(fname='body_pp.dpkl')
   """
   # Load files from disk
   with open(fname, 'rb') as f:
     pp = dpickle.load(f)
   num_tokens = max(pp.id2token.keys()) + 1
-  print(f'Size of vocabulary for {fname}: {num_tokens:,}')
+  print('Size of vocabulary for {}: {}'.format(fname, num_tokens))
   return num_tokens, pp

 def load_decoder_inputs(decoder_np_vecs='train_title_vecs.npy'):
   """
   Load decoder inputs.
   Parameters
   ----------
   decoder_np_vecs : str
     filename of serialized numpy.array of decoder input (issue title)
   Returns
   -------
   decoder_input_data : numpy.array
     The data fed to the decoder as input during training for teacher forcing.
     This is the same as `decoder_np_vecs` except the last position.
   decoder_target_data : numpy.array
     The data that the decoder data is trained to generate (issue title).
     Calculated by sliding `decoder_np_vecs` one position forward.
   """
   vectorized_title = np.load(decoder_np_vecs)
   # For Decoder Input, you don't need the last word as that is only for prediction
   # when we are training using Teacher Forcing.
   decoder_input_data = vectorized_title[:, :-1]
   # Decoder Target Data Is Ahead By 1 Time Step From Decoder Input Data (Teacher Forcing)
   decoder_target_data = vectorized_title[:, 1:]
-  print(f'Shape of decoder input: {decoder_input_data.shape}')
-  print(f'Shape of decoder target: {decoder_target_data.shape}')
+  print('Shape of decoder input: {}'.format(decoder_input_data.shape))
+  print('Shape of decoder target: {}'.format(decoder_target_data.shape))
   return decoder_input_data, decoder_target_data

 def load_encoder_inputs(encoder_np_vecs='train_body_vecs.npy'):
   """
   Load variables & data that are inputs to encoder.
   Parameters
   ----------
   encoder_np_vecs : str
     filename of serialized numpy.array of encoder input (issue title)
   Returns
   -------
   encoder_input_data : numpy.array
     The issue body
   doc_length : int
     The standard document length of the input for the encoder after padding
     the shape of this array will be (num_examples, doc_length)
   """
   vectorized_body = np.load(encoder_np_vecs)
   # Encoder input is simply the body of the issue text
   encoder_input_data = vectorized_body
   doc_length = encoder_input_data.shape[1]
-  print(f'Shape of encoder input: {encoder_input_data.shape}')
+  print('Shape of encoder input: {}'.format(encoder_input_data.shape))
   return encoder_input_data, doc_length

 def viz_model_architecture(model):
   """Visualize model architecture in Jupyter notebook."""
   display(SVG(model_to_dot(model).create(prog='dot', format='svg')))

 def free_gpu_mem():
   """Attempt to free gpu memory."""
   K.get_session().close()
   cfg = K.tf.ConfigProto()
   cfg.gpu_options.allow_growth = True
   K.set_session(K.tf.Session(config=cfg))

 def test_gpu():
   """Run a toy computation task in tensorflow to test GPU."""
   config = tf.ConfigProto()
   config.gpu_options.allow_growth = True
   session = tf.Session(config=config)
   hello = tf.constant('Hello, TensorFlow!')
   print(session.run(hello))

 def plot_model_training_history(history_object):
   """Plots model train vs. validation loss."""
   plt.title('model accuracy')
   plt.ylabel('accuracy')
   plt.xlabel('epoch')
   plt.plot(history_object.history['loss'])
   plt.plot(history_object.history['val_loss'])
   plt.legend(['train', 'test'], loc='upper left')
   plt.show()

 def extract_encoder_model(model):
   """
   Extract the encoder from the original Sequence to Sequence Model.
   Returns a keras model object that has one input (body of issue) and one
   output (encoding of issue, which is the last hidden state).
   Input:
   -----
   model: keras model object
   Returns:
   -----
   keras model object
   """
   encoder_model = model.get_layer('Encoder-Model')
   return encoder_model

 def extract_decoder_model(model):
   """
   Extract the decoder from the original model.
   Inputs:
   ------
   model: keras model object
   Returns:
   -------
   A Keras model object with the following inputs and outputs:
   Inputs of Keras Model That Is Returned:
   1: the embedding index for the last predicted word or the <Start> indicator
   2: the last hidden state, or in the case of the first word the hidden state from the encoder
   Outputs of Keras Model That Is Returned:
   1. Prediction (class probabilities) for the next word
   2. The hidden state of the decoder, to be fed back into the decoder at the next time step
   Implementation Notes:
   ----------------------
   Must extract relevant layers and reconstruct part of the computation graph
   to allow for different inputs as we are not going to use teacher forcing at
   inference time.
   """
   # the latent dimension is the same throughout the architecture so we are going to
   # cheat and grab the latent dimension of the embedding because that is the same as what is
   # output from the decoder
   latent_dim = model.get_layer('Decoder-Word-Embedding').output_shape[-1]
   # Reconstruct the input into the decoder
   decoder_inputs = model.get_layer('Decoder-Input').input
   dec_emb = model.get_layer('Decoder-Word-Embedding')(decoder_inputs)
   dec_bn = model.get_layer('Decoder-Batchnorm-1')(dec_emb)
   # Instead of setting the intial state from the encoder and forgetting about it, during inference
-  # we are not doing teacher forcing, so we will have to have a feedback loop from predictions back into
-  # the GRU, thus we define this input layer for the state so we can add this capability
+  # we are not doing teacher forcing, so we will have to have a feedback loop from predictions back
+  # into the GRU, thus we define this input layer for the state so we can add this capability
   gru_inference_state_input = Input(shape=(latent_dim,), name='hidden_state_input')
   # we need to reuse the weights that is why we are getting this
   # If you inspect the decoder GRU that we created for training, it will take as input
   # 2 tensors -> (1) is the embedding layer output for the teacher forcing
-  #  (which will now be the last step's prediction, and will be _start_ on the first time step)
-  #  (2) is the state, which we will initialize with the encoder on the first time step, but then
-  #  grab the state after the first prediction and feed that back in again.
+  #  (which will now be the last step's prediction, and will be _start_ on the
+  #  first time step)
+  #  (2) is the state, which we will initialize with the encoder on the first time step
+  #  but then grab the state after the first prediction and feed that back in again.
   gru_out, gru_state_out = model.get_layer('Decoder-GRU')([dec_bn, gru_inference_state_input])
   # Reconstruct dense layers
   dec_bn2 = model.get_layer('Decoder-Batchnorm-2')(gru_out)
   dense_out = model.get_layer('Final-Output-Dense')(dec_bn2)
   decoder_model = Model([decoder_inputs, gru_inference_state_input],
                         [dense_out, gru_state_out])
   return decoder_model

 class Seq2Seq_Inference(object):
+  # pylint: disable=too-many-instance-attributes

   def __init__(self,
                encoder_preprocessor,
                decoder_preprocessor,
                seq2seq_model):
+
     self.pp_body = encoder_preprocessor
     self.pp_title = decoder_preprocessor
     self.seq2seq_model = seq2seq_model
     self.encoder_model = extract_encoder_model(seq2seq_model)
     self.decoder_model = extract_decoder_model(seq2seq_model)
     self.default_max_len_title = self.pp_title.padding_maxlen
     self.nn = None
     self.rec_df = None

   def generate_issue_title(self,
                            raw_input_text,
                            max_len_title=None):
     """
     Use the seq2seq model to generate a title given the body of an issue.
     Inputs
     ------
     raw_input: str
       The body of the issue text as an input string
     max_len_title: int (optional)
       The maximum length of the title the model will generate
     """
     if max_len_title is None:
       max_len_title = self.default_max_len_title
     # get the encoder's features for the decoder
     raw_tokenized = self.pp_body.transform([raw_input_text])
     body_encoding = self.encoder_model.predict(raw_tokenized)
     # we want to save the encoder's embedding before its updated by decoder
     # because we can use that as an embedding for other tasks.
     original_body_encoding = body_encoding
     state_value = np.array(self.pp_title.token2id['_start_']).reshape(1, 1)
     decoded_sentence = []
     stop_condition = False
     while not stop_condition:
       preds, st = self.decoder_model.predict([state_value, body_encoding])
       # We are going to ignore indices 0 (padding) and indices 1 (unknown)
       # Argmax will return the integer index corresponding to the
       # prediction + 2 b/c we chopped off first two
       pred_idx = np.argmax(preds[:, :, 2:]) + 2
       # retrieve word from index prediction
       pred_word_str = self.pp_title.id2token[pred_idx]
       if pred_word_str == '_end_' or len(decoded_sentence) >= max_len_title:
         stop_condition = True
         break
       decoded_sentence.append(pred_word_str)
       # update the decoder for the next word
       body_encoding = st
       state_value = np.array(pred_idx).reshape(1, 1)
     return original_body_encoding, ' '.join(decoded_sentence)

   def print_example(self,
                     i,
                     body_text,
                     title_text,
                     url,
                     threshold):
     """
     Prints an example of the model's prediction for manual inspection.
     """
     if i:
       print('\n\n==============================================')
-      print(f'============== Example # {i} =================\n')
+      print('============== Example # {} =================\n'.format(i))
     if url:
       print(url)
-    print(f"Issue Body:\n {body_text} \n")
+    print("Issue Body:\n {} \n".format(body_text))
     if title_text:
-      print(f"Original Title:\n {title_text}")
+      print("Original Title:\n {}".format(title_text))
     emb, gen_title = self.generate_issue_title(body_text)
-    print(f"\n****** Machine Generated Title (Prediction) ******:\n {gen_title}")
+    print("\n****** Machine Generated Title (Prediction) ******:\n {}".format(gen_title))
     if self.nn:
       # return neighbors and distances
       n, d = self.nn.get_nns_by_vector(emb.flatten(), n=4,
                                        include_distances=True)
       neighbors = n[1:]
       dist = d[1:]
       if min(dist) <= threshold:
         cols = ['issue_url', 'issue_title', 'body']
         dfcopy = self.rec_df.iloc[neighbors][cols].copy(deep=True)
         dfcopy['dist'] = dist
-        similar_issues_df = dfcopy.query(f'dist <= {threshold}')
+        similar_issues_df = dfcopy.query('dist <= {}'.format(threshold))
         print("\n**** Similar Issues (using encoder embedding) ****:\n")
         display(similar_issues_df)

   def demo_model_predictions(self,
                              n,
                              issue_df,
                              threshold=1):
     """
     Pick n random Issues and display predictions.
     Input:
     ------
     n : int
       Number of issues to display from issue_df
     issue_df : pandas DataFrame
       DataFrame that contains two columns: `body` and `issue_title`.
     threshold : float
       distance threshold for recommendation of similar issues.
     Returns:
     --------
     None
       Prints the original issue body and the model's prediction.
     """
     # Extract body and title from DF
     body_text = issue_df.body.tolist()
     title_text = issue_df.issue_title.tolist()
     url = issue_df.issue_url.tolist()
     demo_list = np.random.randint(low=1, high=len(body_text), size=n)
     for i in demo_list:
       self.print_example(i,
                          body_text=body_text[i],
                          title_text=title_text[i],
                          url=url[i],
                          threshold=threshold)

   def prepare_recommender(self, vectorized_array, original_df):
     """
     Use the annoy library to build recommender
     Parameters
     ----------
     vectorized_array : List[List[int]]
       This is the list of list of integers that represents your corpus
       that is fed into the seq2seq model for training.
     original_df : pandas.DataFrame
       This is the original dataframe that has the columns
       ['issue_url', 'issue_title', 'body']
     Returns
     -------
     annoy.AnnoyIndex object (see https://github.com/spotify/annoy)
     """
     self.rec_df = original_df
     emb = self.encoder_model.predict(x=vectorized_array,
                                      batch_size=vectorized_array.shape[0]//200)
     f = emb.shape[1]
     self.nn = AnnoyIndex(f)
     logging.warning('Adding embeddings')
     for i in tqdm(range(len(emb))):
       self.nn.add_item(i, emb[i])
     logging.warning('Building trees for similarity lookup.')
     self.nn.build(50)
     return self.nn

   def set_recsys_data(self, original_df):
     self.rec_df = original_df

   def set_recsys_annoyobj(self, annoyobj):
     self.nn = annoyobj

   def evaluate_model(self, holdout_bodies, holdout_titles):
     """
     Method for calculating BLEU Score.
     Parameters
     ----------
     holdout_bodies : List[str]
       These are the issue bodies that we want to summarize
     holdout_titles : List[str]
       This is the ground truth we are trying to predict --> issue titles
     Returns
     -------
     bleu : float
       The BLEU Score
     """
     actual, predicted = list(), list()
     assert len(holdout_bodies) == len(holdout_titles)
     num_examples = len(holdout_bodies)
     logging.warning('Generating predictions.')
     # step over the whole set TODO: parallelize this
     for i in tqdm_notebook(range(num_examples)):
       _, yhat = self.generate_issue_title(holdout_bodies[i])
       actual.append(self.pp_title.process_text([holdout_titles[i]])[0])
       predicted.append(self.pp_title.process_text([yhat])[0])
     # calculate BLEU score
     logging.warning('Calculating BLEU.')
     #must be careful with nltk api for corpus_bleu!,
     # expects List[List[List[str]]] for ground truth, using List[List[str]] will give you
     # erroneous results.
     bleu = corpus_bleu([[a] for a in actual], predicted)
     return bleu
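
The f-string to str.format rewrites above are the Python 3.5.2 compatibility piece of this commit: f-strings are a 3.6+ feature, so the old lines would not even parse under the interpreter Prow uses. For example:

fname, num_tokens = 'title_pp.dpkl', 4500
# print(f'Size of vocabulary for {fname}: {num_tokens:,}')  # SyntaxError on Python 3.5
print('Size of vocabulary for {}: {}'.format(fname, num_tokens))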

(prediction script importing seq2seq_utils)

@@ -1,8 +1,6 @@
 import argparse
 import keras
 import pandas as pd
-from seq2seq_utils import load_decoder_inputs
-from seq2seq_utils import load_encoder_inputs
 from seq2seq_utils import load_text_processor
 from seq2seq_utils import Seq2Seq_Inference
@@ -29,5 +27,5 @@ seq2seq_inf = Seq2Seq_Inference(encoder_preprocessor=body_pp,
                                 decoder_preprocessor=title_pp,
                                 seq2seq_model=seq2seq_Model)

 # Output predictions for n random rows in the test set.
 seq2seq_inf.demo_model_predictions(n=args.input_prediction_count, issue_df=testdf)

(preprocessing script)

@@ -1,7 +1,7 @@
 import argparse
 import dill as dpickle
-from ktext.preprocess import processor
 import numpy as np
+from ktext.preprocess import processor
 import pandas as pd

 # Parsing flags.
@@ -30,7 +30,7 @@ print('Example body after pre-processing:', train_body_vecs[0])
 # Instantiate a text processor for the titles, with some different parameters.
 title_pp = processor(append_indicators=True, keep_n=4500,
-                     padding_maxlen=12, padding ='post')
+                     padding_maxlen=12, padding='post')

 # process the title data
 train_title_vecs = title_pp.fit_transform(train_title_raw)
@@ -40,10 +40,10 @@ print('Example title after pre-processing:', train_title_vecs[0])
 # Save the preprocessor.
 with open(args.output_body_preprocessor_dpkl, 'wb') as f:
   dpickle.dump(body_pp, f)

 with open(args.output_title_preprocessor_dpkl, 'wb') as f:
   dpickle.dump(title_pp, f)

 # Save the processed data.
 np.save(args.output_train_title_vecs_npy, train_title_vecs)
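
For context, a sketch of what the title preprocessor above produces, with a hypothetical input title (ktext fits a vocabulary, converts each title to token ids padded/truncated to padding_maxlen, and with append_indicators=True adds _start_/_end_ markers):

from ktext.preprocess import processor

title_pp = processor(append_indicators=True, keep_n=4500,
                     padding_maxlen=12, padding='post')
train_title_vecs = title_pp.fit_transform(['fix crash when saving a file'])
print(train_title_vecs.shape)  # expected: (1, 12) - one row of token ids, post-padded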

(train/test split script)

@@ -1,6 +1,4 @@
 import argparse
-import glob
-import logging
 import pandas as pd
 from sklearn.model_selection import train_test_split
@@ -20,8 +18,8 @@ traindf, testdf = train_test_split(pd.read_csv(args.input_csv).sample(n=args.sam
                                    test_size=.10)

 # Print stats about the shape of the data.
-print(f'Train: {traindf.shape[0]:,} rows {traindf.shape[1]:,} columns')
-print(f'Test: {testdf.shape[0]:,} rows {testdf.shape[1]:,} columns')
+print('Train: {:,} rows {:,} columns'.format(traindf.shape[0], traindf.shape[1]))
+print('Test: {:,} rows {:,} columns'.format(testdf.shape[0], testdf.shape[1]))

 # Store output as CSV.
 traindf.to_csv(args.output_traindf_csv)

(another script importing seq2seq_utils)

@@ -1,8 +1,6 @@
 import argparse
 import keras
 import pandas as pd
-from seq2seq_utils import load_decoder_inputs
-from seq2seq_utils import load_encoder_inputs
 from seq2seq_utils import load_text_processor
 from seq2seq_utils import Seq2Seq_Inference

seq2seq_utils.py (duplicate removed from workflow/workspace/src)

@ -1,393 +0,0 @@
from matplotlib import pyplot as plt
import tensorflow as tf
from keras import backend as K
from keras.layers import Input
from keras.models import Model
from IPython.display import SVG, display
from keras.utils.vis_utils import model_to_dot
import logging
import numpy as np
import dill as dpickle
from annoy import AnnoyIndex
from tqdm import tqdm, tqdm_notebook
from random import random
from nltk.translate.bleu_score import corpus_bleu
def load_text_processor(fname='title_pp.dpkl'):
"""
Load preprocessors from disk.
Parameters
----------
fname: str
file name of ktext.proccessor object
Returns
-------
num_tokens : int
size of vocabulary loaded into ktext.processor
pp : ktext.processor
the processor you are trying to load
Typical Usage:
-------------
num_decoder_tokens, title_pp = load_text_processor(fname='title_pp.dpkl')
num_encoder_tokens, body_pp = load_text_processor(fname='body_pp.dpkl')
"""
# Load files from disk
with open(fname, 'rb') as f:
pp = dpickle.load(f)
num_tokens = max(pp.id2token.keys()) + 1
print(f'Size of vocabulary for {fname}: {num_tokens:,}')
return num_tokens, pp
def load_decoder_inputs(decoder_np_vecs='train_title_vecs.npy'):
"""
Load decoder inputs.
Parameters
----------
decoder_np_vecs : str
filename of serialized numpy.array of decoder input (issue title)
Returns
-------
decoder_input_data : numpy.array
The data fed to the decoder as input during training for teacher forcing.
This is the same as `decoder_np_vecs` except the last position.
decoder_target_data : numpy.array
The data that the decoder data is trained to generate (issue title).
Calculated by sliding `decoder_np_vecs` one position forward.
"""
vectorized_title = np.load(decoder_np_vecs)
# For Decoder Input, you don't need the last word as that is only for prediction
# when we are training using Teacher Forcing.
decoder_input_data = vectorized_title[:, :-1]
# Decoder Target Data Is Ahead By 1 Time Step From Decoder Input Data (Teacher Forcing)
decoder_target_data = vectorized_title[:, 1:]
print(f'Shape of decoder input: {decoder_input_data.shape}')
print(f'Shape of decoder target: {decoder_target_data.shape}')
return decoder_input_data, decoder_target_data
def load_encoder_inputs(encoder_np_vecs='train_body_vecs.npy'):
"""
Load variables & data that are inputs to encoder.
Parameters
----------
encoder_np_vecs : str
filename of serialized numpy.array of encoder input (issue title)
Returns
-------
encoder_input_data : numpy.array
The issue body
doc_length : int
The standard document length of the input for the encoder after padding
the shape of this array will be (num_examples, doc_length)
"""
vectorized_body = np.load(encoder_np_vecs)
# Encoder input is simply the body of the issue text
encoder_input_data = vectorized_body
doc_length = encoder_input_data.shape[1]
print(f'Shape of encoder input: {encoder_input_data.shape}')
return encoder_input_data, doc_length
def viz_model_architecture(model):
"""Visualize model architecture in Jupyter notebook."""
display(SVG(model_to_dot(model).create(prog='dot', format='svg')))
def free_gpu_mem():
"""Attempt to free gpu memory."""
K.get_session().close()
cfg = K.tf.ConfigProto()
cfg.gpu_options.allow_growth = True
K.set_session(K.tf.Session(config=cfg))
def test_gpu():
"""Run a toy computation task in tensorflow to test GPU."""
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.Session(config=config)
hello = tf.constant('Hello, TensorFlow!')
print(session.run(hello))
def plot_model_training_history(history_object):
"""Plots model train vs. validation loss."""
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.plot(history_object.history['loss'])
plt.plot(history_object.history['val_loss'])
plt.legend(['train', 'test'], loc='upper left')
plt.show()
def extract_encoder_model(model):
"""
Extract the encoder from the original Sequence to Sequence Model.
Returns a keras model object that has one input (body of issue) and one
output (encoding of issue, which is the last hidden state).
Input:
-----
model: keras model object
Returns:
-----
keras model object
"""
encoder_model = model.get_layer('Encoder-Model')
return encoder_model
def extract_decoder_model(model):
"""
Extract the decoder from the original model.
Inputs:
------
model: keras model object
Returns:
-------
A Keras model object with the following inputs and outputs:
Inputs of Keras Model That Is Returned:
1: the embedding index for the last predicted word or the <Start> indicator
2: the last hidden state, or in the case of the first word the hidden state from the encoder
Outputs of Keras Model That Is Returned:
1. Prediction (class probabilities) for the next word
2. The hidden state of the decoder, to be fed back into the decoder at the next time step
Implementation Notes:
----------------------
Must extract relevant layers and reconstruct part of the computation graph
to allow for different inputs as we are not going to use teacher forcing at
inference time.
"""
# the latent dimension is the same throughout the architecture so we are going to
# cheat and grab the latent dimension of the embedding because that is the same as what is
# output from the decoder
latent_dim = model.get_layer('Decoder-Word-Embedding').output_shape[-1]
# Reconstruct the input into the decoder
decoder_inputs = model.get_layer('Decoder-Input').input
dec_emb = model.get_layer('Decoder-Word-Embedding')(decoder_inputs)
dec_bn = model.get_layer('Decoder-Batchnorm-1')(dec_emb)
# Instead of setting the intial state from the encoder and forgetting about it, during inference
# we are not doing teacher forcing, so we will have to have a feedback loop from predictions back into
# the GRU, thus we define this input layer for the state so we can add this capability
gru_inference_state_input = Input(shape=(latent_dim,), name='hidden_state_input')
# we need to reuse the weights that is why we are getting this
# If you inspect the decoder GRU that we created for training, it will take as input
# 2 tensors -> (1) is the embedding layer output for the teacher forcing
# (which will now be the last step's prediction, and will be _start_ on the first time step)
# (2) is the state, which we will initialize with the encoder on the first time step, but then
# grab the state after the first prediction and feed that back in again.
gru_out, gru_state_out = model.get_layer('Decoder-GRU')([dec_bn, gru_inference_state_input])
# Reconstruct dense layers
dec_bn2 = model.get_layer('Decoder-Batchnorm-2')(gru_out)
dense_out = model.get_layer('Final-Output-Dense')(dec_bn2)
decoder_model = Model([decoder_inputs, gru_inference_state_input],
[dense_out, gru_state_out])
return decoder_model

class Seq2Seq_Inference(object):
    """Generates issue titles and recommends similar issues with a trained
    seq2seq model."""

    def __init__(self,
                 encoder_preprocessor,
                 decoder_preprocessor,
                 seq2seq_model):
        self.pp_body = encoder_preprocessor
        self.pp_title = decoder_preprocessor
        self.seq2seq_model = seq2seq_model
        self.encoder_model = extract_encoder_model(seq2seq_model)
        self.decoder_model = extract_decoder_model(seq2seq_model)
        self.default_max_len_title = self.pp_title.padding_maxlen
        self.nn = None
        self.rec_df = None

    def generate_issue_title(self,
                             raw_input_text,
                             max_len_title=None):
        """
        Use the seq2seq model to generate a title given the body of an issue.

        Inputs
        ------
        raw_input_text: str
            The body of the issue text as an input string
        max_len_title: int (optional)
            The maximum length of the title the model will generate

        Returns
        -------
        The encoder's embedding of the issue body and the generated title.
        """
        if max_len_title is None:
            max_len_title = self.default_max_len_title
        # Get the encoder's features for the decoder.
        raw_tokenized = self.pp_body.transform([raw_input_text])
        body_encoding = self.encoder_model.predict(raw_tokenized)
        # We want to save the encoder's embedding before it is updated by the
        # decoder, because we can use that as an embedding for other tasks.
        original_body_encoding = body_encoding
        state_value = np.array(self.pp_title.token2id['_start_']).reshape(1, 1)

        decoded_sentence = []
        stop_condition = False
        while not stop_condition:
            preds, st = self.decoder_model.predict([state_value, body_encoding])

            # We are going to ignore indices 0 (padding) and 1 (unknown).
            # Argmax will return the integer index corresponding to the
            # prediction + 2 because we chopped off the first two elements.
            pred_idx = np.argmax(preds[:, :, 2:]) + 2

            # Retrieve the word from the index prediction.
            pred_word_str = self.pp_title.id2token[pred_idx]

            if pred_word_str == '_end_' or len(decoded_sentence) >= max_len_title:
                stop_condition = True
                break
            decoded_sentence.append(pred_word_str)

            # Update the decoder state for the next word.
            body_encoding = st
            state_value = np.array(pred_idx).reshape(1, 1)

        return original_body_encoding, ' '.join(decoded_sentence)
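
    # Example call (sketch; `seq2seq_inf` is an illustrative instance of this
    # class): the method returns the encoder embedding of the body along with
    # the generated title, so the embedding can be reused for similarity search:
    #
    #   emb, title = seq2seq_inf.generate_issue_title('app crashes on startup')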

    def print_example(self,
                      i,
                      body_text,
                      title_text,
                      url,
                      threshold):
        """
        Prints an example of the model's prediction for manual inspection.
        """
        if i:
            print('\n\n==============================================')
            print(f'============== Example # {i} =================\n')

        if url:
            print(url)

        print(f"Issue Body:\n {body_text} \n")

        if title_text:
            print(f"Original Title:\n {title_text}")

        emb, gen_title = self.generate_issue_title(body_text)
        print(f"\n****** Machine Generated Title (Prediction) ******:\n {gen_title}")

        if self.nn:
            # Return neighbors and distances.
            n, d = self.nn.get_nns_by_vector(emb.flatten(), n=4,
                                             include_distances=True)
            neighbors = n[1:]
            dist = d[1:]
            if min(dist) <= threshold:
                cols = ['issue_url', 'issue_title', 'body']
                dfcopy = self.rec_df.iloc[neighbors][cols].copy(deep=True)
                dfcopy['dist'] = dist
                similar_issues_df = dfcopy.query(f'dist <= {threshold}')
                print("\n**** Similar Issues (using encoder embedding) ****:\n")
                display(similar_issues_df)

    def demo_model_predictions(self,
                               n,
                               issue_df,
                               threshold=1):
        """
        Pick n random issues and display predictions.

        Input:
        ------
        n : int
            Number of issues to display from issue_df
        issue_df : pandas DataFrame
            DataFrame that contains the columns `body`, `issue_title`, and
            `issue_url`.
        threshold : float
            Distance threshold for recommendation of similar issues.

        Returns:
        --------
        None
            Prints the original issue body and the model's prediction.
        """
        # Extract body, title, and url from the DataFrame.
        body_text = issue_df.body.tolist()
        title_text = issue_df.issue_title.tolist()
        url = issue_df.issue_url.tolist()

        demo_list = np.random.randint(low=1, high=len(body_text), size=n)
        for i in demo_list:
            self.print_example(i,
                               body_text=body_text[i],
                               title_text=title_text[i],
                               url=url[i],
                               threshold=threshold)

    def prepare_recommender(self, vectorized_array, original_df):
        """
        Use the annoy library to build a recommender.

        Parameters
        ----------
        vectorized_array : List[List[int]]
            The list of lists of integers that represents your corpus, as fed
            into the seq2seq model for training.
        original_df : pandas.DataFrame
            The original dataframe that has the columns
            ['issue_url', 'issue_title', 'body'].

        Returns
        -------
        annoy.AnnoyIndex object (see https://github.com/spotify/annoy)
        """
        self.rec_df = original_df
        # Note: this batch size assumes the corpus has at least 200 rows.
        emb = self.encoder_model.predict(x=vectorized_array,
                                         batch_size=vectorized_array.shape[0]//200)

        f = emb.shape[1]
        self.nn = AnnoyIndex(f)
        logging.warning('Adding embeddings')
        for i in tqdm(range(len(emb))):
            self.nn.add_item(i, emb[i])

        logging.warning('Building trees for similarity lookup.')
        self.nn.build(50)
        return self.nn
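
    # Sketch of querying the index once built (get_nns_by_vector is part of
    # the annoy API; variable names are illustrative):
    #
    #   nn = seq2seq_inf.prepare_recommender(train_body_vecs, issues_df)
    #   ids, dists = nn.get_nns_by_vector(query_emb, 5, include_distances=True)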

    def set_recsys_data(self, original_df):
        self.rec_df = original_df

    def set_recsys_annoyobj(self, annoyobj):
        self.nn = annoyobj

    def evaluate_model(self, holdout_bodies, holdout_titles):
        """
        Method for calculating BLEU Score.

        Parameters
        ----------
        holdout_bodies : List[str]
            The issue bodies that we want to summarize
        holdout_titles : List[str]
            The ground truth (issue titles) we are trying to predict

        Returns
        -------
        bleu : float
            The BLEU Score
        """
        actual, predicted = list(), list()
        assert len(holdout_bodies) == len(holdout_titles)
        num_examples = len(holdout_bodies)

        logging.warning('Generating predictions.')
        # Step over the whole holdout set. TODO: parallelize this.
        for i in tqdm_notebook(range(num_examples)):
            _, yhat = self.generate_issue_title(holdout_bodies[i])

            actual.append(self.pp_title.process_text([holdout_titles[i]])[0])
            predicted.append(self.pp_title.process_text([yhat])[0])

        # Calculate BLEU score. Assuming corpus_bleu here is
        # nltk.translate.bleu_score.corpus_bleu, it expects a list of
        # reference token lists per hypothesis, so wrap each title.
        logging.warning('Calculating BLEU.')
        bleu = corpus_bleu([[ref] for ref in actual], predicted)
        return bleu
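
# End-to-end usage sketch, assuming preprocessors and a trained model produced
# by the accompanying training code (names below are illustrative):
#
#   seq2seq_inf = Seq2Seq_Inference(encoder_preprocessor=body_pp,
#                                   decoder_preprocessor=title_pp,
#                                   seq2seq_model=seq2seq_Model)
#   bleu = seq2seq_inf.evaluate_model(holdout_bodies, holdout_titles)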

View File

@@ -1,11 +1,10 @@
 import argparse
+import numpy as np
 from keras.callbacks import CSVLogger, ModelCheckpoint
-from keras.layers import Input, LSTM, GRU, Dense, Embedding, Bidirectional, BatchNormalization
+from keras.layers import Input, GRU, Dense, Embedding, BatchNormalization
 from keras.models import Model
 from keras import optimizers
-import numpy as np
 from seq2seq_utils import load_decoder_inputs, load_encoder_inputs, load_text_processor
-from seq2seq_utils import viz_model_architecture
 
 # Parsing flags.
 parser = argparse.ArgumentParser()
@@ -18,7 +17,7 @@ parser.add_argument("--learning_rate", default="0.001")
 args = parser.parse_args()
 print(args)
 
-learning_rate=float(args.learning_rate)
+learning_rate = float(args.learning_rate)
 
 encoder_input_data, doc_length = load_encoder_inputs(args.input_train_body_vecs_npy)
 decoder_input_data, decoder_target_data = load_decoder_inputs(args.input_train_title_vecs_npy)
@@ -35,7 +34,10 @@ latent_dim = 300
 encoder_inputs = Input(shape=(doc_length,), name='Encoder-Input')
 
 # Word embeding for encoder (ex: Issue Body)
-x = Embedding(num_encoder_tokens, latent_dim, name='Body-Word-Embedding', mask_zero=False)(encoder_inputs)
+x = Embedding(num_encoder_tokens,
+              latent_dim,
+              name='Body-Word-Embedding',
+              mask_zero=False)(encoder_inputs)
 x = BatchNormalization(name='Encoder-Batchnorm-1')(x)
 
 # We do not need the `encoder_output` just the hidden state.
@@ -53,7 +55,10 @@ seq2seq_encoder_out = encoder_model(encoder_inputs)
 decoder_inputs = Input(shape=(None,), name='Decoder-Input')  # for teacher forcing
 
 # Word Embedding For Decoder (ex: Issue Titles)
-dec_emb = Embedding(num_decoder_tokens, latent_dim, name='Decoder-Word-Embedding', mask_zero=False)(decoder_inputs)
+dec_emb = Embedding(num_decoder_tokens,
+                    latent_dim,
+                    name='Decoder-Word-Embedding',
+                    mask_zero=False)(decoder_inputs)
 dec_bn = BatchNormalization(name='Decoder-Batchnorm-1')(dec_emb)
 
 # Set up the decoder, using `decoder_state_input` as initial state.
@@ -71,21 +76,24 @@ decoder_outputs = decoder_dense(x)
 seq2seq_Model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
 
-seq2seq_Model.compile(optimizer=optimizers.Nadam(lr=learning_rate), loss='sparse_categorical_crossentropy')
+seq2seq_Model.compile(optimizer=optimizers.Nadam(lr=learning_rate),
+                      loss='sparse_categorical_crossentropy')
 seq2seq_Model.summary()
 
 script_name_base = 'tutorial_seq2seq'
 csv_logger = CSVLogger('{:}.log'.format(script_name_base))
-model_checkpoint = ModelCheckpoint('{:}.epoch{{epoch:02d}}-val{{val_loss:.5f}}.hdf5'.format(script_name_base),
-                                   save_best_only=True)
+model_checkpoint = ModelCheckpoint(
+    '{:}.epoch{{epoch:02d}}-val{{val_loss:.5f}}.hdf5'.format(script_name_base), save_best_only=True)
 
 batch_size = 1200
 epochs = 7
-history = seq2seq_Model.fit([encoder_input_data, decoder_input_data], np.expand_dims(decoder_target_data, -1),
-                            batch_size=batch_size,
-                            epochs=epochs,
-                            validation_split=0.12, callbacks=[csv_logger, model_checkpoint])
+history = seq2seq_Model.fit([encoder_input_data, decoder_input_data],
+                            np.expand_dims(decoder_target_data, -1),
+                            batch_size=batch_size,
+                            epochs=epochs,
+                            validation_split=0.12,
+                            callbacks=[csv_logger, model_checkpoint])
 
 #############
 # Save model.