diff --git a/.pylintrc b/.pylintrc new file mode 100644 index 00000000..697316db --- /dev/null +++ b/.pylintrc @@ -0,0 +1,399 @@ +[MASTER] + +# Specify a configuration file. +#rcfile= + +# Python code to execute, usually for sys.path manipulation such as +# pygtk.require(). +#init-hook= + +# Add files or directories to the blacklist. They should be base names, not +# paths. +ignore=third_party + +# Add files or directories matching the regex patterns to the blacklist. The +# regex matches against base names, not paths. +ignore-patterns= + +# Pickle collected data for later comparisons. +persistent=no + +# List of plugins (as comma separated values of python modules names) to load, +# usually to register additional checkers. +load-plugins= + +# Use multiple processes to speed up Pylint. +jobs=4 + +# Allow loading of arbitrary C extensions. Extensions are imported into the +# active Python interpreter and may run arbitrary code. +unsafe-load-any-extension=no + +# A comma-separated list of package or module names from where C extensions may +# be loaded. Extensions are loading into the active Python interpreter and may +# run arbitrary code +extension-pkg-whitelist= + + +[MESSAGES CONTROL] + +# Only show warnings with the listed confidence levels. Leave empty to show +# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED +confidence= + +# Enable the message, report, category or checker with the given id(s). You can +# either give multiple identifier separated by comma (,) or put this option +# multiple time (only on the command line, not in the configuration file where +# it should appear only once). See also the "--disable" option for examples. +#enable= + +# Disable the message, report, category or checker with the given id(s). You +# can either give multiple identifiers separated by comma (,) or put this +# option multiple times (only on the command line, not in the configuration +# file where it should appear only once).You can also use "--disable=all" to +# disable everything first and then reenable specific checks. For example, if +# you want to run only the similarities checker, you can use "--disable=all +# --enable=similarities". If you want to run only the classes checker, but have +# no Warning level messages displayed, use"--disable=all --enable=classes +# --disable=W" +disable=import-star-module-level,old-octal-literal,oct-method,print-statement,unpacking-in-except,parameter-unpacking,backtick,old-raise-syntax,old-ne-operator,long-suffix,dict-view-method,dict-iter-method,metaclass-assignment,next-method-called,raising-string,indexing-exception,raw_input-builtin,long-builtin,file-builtin,execfile-builtin,coerce-builtin,cmp-builtin,buffer-builtin,basestring-builtin,apply-builtin,filter-builtin-not-iterating,using-cmp-argument,useless-suppression,range-builtin-not-iterating,suppressed-message,missing-docstring,no-absolute-import,old-division,cmp-method,reload-builtin,zip-builtin-not-iterating,intern-builtin,unichr-builtin,reduce-builtin,standarderror-builtin,unicode-builtin,xrange-builtin,coerce-method,delslice-method,getslice-method,setslice-method,input-builtin,round-builtin,hex-method,nonzero-method,map-builtin-not-iterating,relative-import,invalid-name,bad-continuation,no-member,locally-disabled,fixme,import-error,too-many-locals + + +[REPORTS] + +# Set the output format. Available formats are text, parseable, colorized, msvs +# (visual studio) and html. You can also give a reporter class, eg +# mypackage.mymodule.MyReporterClass. 
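+# Example invocation (from the repository root, where this file lives, pylint
+# picks it up automatically; it can also be passed explicitly):
+#   pylint --rcfile=.pylintrc agents/trainer/task.py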
+output-format=text + +# Put messages in a separate file for each module / package specified on the +# command line instead of printing them on stdout. Reports (if any) will be +# written in a file name "pylint_global.[txt|html]". This option is deprecated +# and it will be removed in Pylint 2.0. +files-output=no + +# Tells whether to display a full report or only the messages +reports=no + +# Python expression which should return a note less than 10 (10 is the highest +# note). You have access to the variables errors warning, statement which +# respectively contain the number of errors / warnings messages and the total +# number of statements analyzed. This is used by the global evaluation report +# (RP0004). +evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) + +# Template used to display messages. This is a python new-style format string +# used to format the message information. See doc for all details +#msg-template= + + +[BASIC] + +# Good variable names which should always be accepted, separated by a comma +good-names=i,j,k,ex,Run,_ + +# Bad variable names which should always be refused, separated by a comma +bad-names=foo,bar,baz,toto,tutu,tata + +# Colon-delimited sets of names that determine each other's naming style when +# the name regexes allow several styles. +name-group= + +# Include a hint for the correct naming format with invalid-name +include-naming-hint=no + +# List of decorators that produce properties, such as abc.abstractproperty. Add +# to this list to register other decorators that produce valid properties. +property-classes=abc.abstractproperty + +# Regular expression matching correct function names +function-rgx=[a-z_][a-z0-9_]{2,30}$ + +# Naming hint for function names +function-name-hint=[a-z_][a-z0-9_]{2,30}$ + +# Regular expression matching correct variable names +variable-rgx=[a-z_][a-z0-9_]{2,30}$ + +# Naming hint for variable names +variable-name-hint=[a-z_][a-z0-9_]{2,30}$ + +# Regular expression matching correct constant names +const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$ + +# Naming hint for constant names +const-name-hint=(([A-Z_][A-Z0-9_]*)|(__.*__))$ + +# Regular expression matching correct attribute names +attr-rgx=[a-z_][a-z0-9_]{2,30}$ + +# Naming hint for attribute names +attr-name-hint=[a-z_][a-z0-9_]{2,30}$ + +# Regular expression matching correct argument names +argument-rgx=[a-z_][a-z0-9_]{2,30}$ + +# Naming hint for argument names +argument-name-hint=[a-z_][a-z0-9_]{2,30}$ + +# Regular expression matching correct class attribute names +class-attribute-rgx=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$ + +# Naming hint for class attribute names +class-attribute-name-hint=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$ + +# Regular expression matching correct inline iteration names +inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$ + +# Naming hint for inline iteration names +inlinevar-name-hint=[A-Za-z_][A-Za-z0-9_]*$ + +# Regular expression matching correct class names +class-rgx=[A-Z_][a-zA-Z0-9]+$ + +# Naming hint for class names +class-name-hint=[A-Z_][a-zA-Z0-9]+$ + +# Regular expression matching correct module names +module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ + +# Naming hint for module names +module-name-hint=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ + +# Regular expression matching correct method names +method-rgx=[a-z_][a-z0-9_]{2,30}$ + +# Naming hint for method names +method-name-hint=[a-z_][a-z0-9_]{2,30}$ + +# Regular expression which should only match function or class names that do +# not require a docstring. 
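+# For example, "^_" below exempts underscore-prefixed helpers (a hypothetical
+# _load_config, say) from the docstring requirement; note that the
+# missing-docstring message is also disabled globally in MESSAGES CONTROL above.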
+no-docstring-rgx=^_ + +# Minimum line length for functions/classes that require docstrings, shorter +# ones are exempt. +docstring-min-length=-1 + + +[ELIF] + +# Maximum number of nested blocks for function / method body +max-nested-blocks=5 + + +[TYPECHECK] + +# Tells whether missing members accessed in mixin class should be ignored. A +# mixin class is detected if its name ends with "mixin" (case insensitive). +ignore-mixin-members=yes + +# List of module names for which member attributes should not be checked +# (useful for modules/projects where namespaces are manipulated during runtime +# and thus existing member attributes cannot be deduced by static analysis. It +# supports qualified module names, as well as Unix pattern matching. +ignored-modules= + +# List of class names for which member attributes should not be checked (useful +# for classes with dynamically set attributes). This supports the use of +# qualified names. +ignored-classes=optparse.Values,thread._local,_thread._local + +# List of members which are set dynamically and missed by pylint inference +# system, and so shouldn't trigger E1101 when accessed. Python regular +# expressions are accepted. +generated-members= + +# List of decorators that produce context managers, such as +# contextlib.contextmanager. Add to this list to register other decorators that +# produce valid context managers. +contextmanager-decorators=contextlib.contextmanager + + +[FORMAT] + +# Maximum number of characters on a single line. +max-line-length=100 + +# Regexp for a line that is allowed to be longer than the limit. +ignore-long-lines=^\s*(# )??$ + +# Allow the body of an if to be on the same line as the test if there is no +# else. +single-line-if-stmt=no + +# List of optional constructs for which whitespace checking is disabled. `dict- +# separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}. +# `trailing-comma` allows a space between comma and closing bracket: (a, ). +# `empty-line` allows space-only lines. +no-space-check=trailing-comma,dict-separator + +# Maximum number of lines in a module +max-module-lines=1000 + +# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 +# tab). +# Use 2 spaces consistent with TensorFlow style. +indent-string=' ' + +# Number of spaces of indent required inside a hanging or continued line. +indent-after-paren=4 + +# Expected format of line ending, e.g. empty (any line ending), LF or CRLF. +expected-line-ending-format= + + +[MISCELLANEOUS] + +# List of note tags to take in consideration, separated by a comma. +notes=FIXME,XXX,TODO + + +[VARIABLES] + +# Tells whether we should check for unused import in __init__ files. +init-import=no + +# A regular expression matching the name of dummy variables (i.e. expectedly +# not used). +dummy-variables-rgx=(_+[a-zA-Z0-9]*?$)|dummy + +# List of additional names supposed to be defined in builtins. Remember that +# you should avoid to define new builtins when possible. +additional-builtins= + +# List of strings which can identify a callback function by name. A callback +# name must start or end with one of those strings. +callbacks=cb_,_cb + +# List of qualified module names which can have objects that can redefine +# builtins. +redefining-builtins-modules=six.moves,future.builtins + + +[LOGGING] + +# Logging modules to check that the string format arguments are in logging +# function parameter format +logging-modules=logging + + +[SIMILARITIES] + +# Minimum lines number of a similarity. 
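+# For example, with the value 4 below, two functions must share at least four
+# similar lines before pylint reports them as duplicate-code (R0801).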
+min-similarity-lines=4 + +# Ignore comments when computing similarities. +ignore-comments=yes + +# Ignore docstrings when computing similarities. +ignore-docstrings=yes + +# Ignore imports when computing similarities. +ignore-imports=no + + +[SPELLING] + +# Spelling dictionary name. Available dictionaries: none. To make it working +# install python-enchant package. +spelling-dict= + +# List of comma separated words that should not be checked. +spelling-ignore-words= + +# A path to a file that contains private dictionary; one word per line. +spelling-private-dict-file= + +# Tells whether to store unknown words to indicated private dictionary in +# --spelling-private-dict-file option instead of raising a message. +spelling-store-unknown-words=no + + +[IMPORTS] + +# Deprecated modules which should not be used, separated by a comma +deprecated-modules=regsub,TERMIOS,Bastion,rexec + +# Create a graph of every (i.e. internal and external) dependencies in the +# given file (report RP0402 must not be disabled) +import-graph= + +# Create a graph of external dependencies in the given file (report RP0402 must +# not be disabled) +ext-import-graph= + +# Create a graph of internal dependencies in the given file (report RP0402 must +# not be disabled) +int-import-graph= + +# Force import order to recognize a module as part of the standard +# compatibility libraries. +known-standard-library= + +# Force import order to recognize a module as part of a third party library. +known-third-party=enchant + +# Analyse import fallback blocks. This can be used to support both Python 2 and +# 3 compatible code, which means that the block might have code that exists +# only in one or another interpreter, leading to false positives when analysed. +analyse-fallback-blocks=no + + +[DESIGN] + +# Maximum number of arguments for function / method +max-args=7 + +# Argument names that match this expression will be ignored. Default to name +# with leading underscore +ignored-argument-names=_.* + +# Maximum number of locals for function / method body +max-locals=15 + +# Maximum number of return / yield for function / method body +max-returns=6 + +# Maximum number of branch for function / method body +max-branches=12 + +# Maximum number of statements in function / method body +max-statements=50 + +# Maximum number of parents for a class (see R0901). +max-parents=7 + +# Maximum number of attributes for a class (see R0902). +max-attributes=7 + +# Minimum number of public methods for a class (see R0903). +min-public-methods=0 + +# Maximum number of public methods for a class (see R0904). +max-public-methods=20 + +# Maximum number of boolean expressions in a if statement +max-bool-expr=5 + + +[CLASSES] + +# List of method names used to declare (i.e. assign) instance attributes. +defining-attr-methods=__init__,__new__,setUp + +# List of valid names for the first argument in a class method. +valid-classmethod-first-arg=cls + +# List of valid names for the first argument in a metaclass class method. +valid-metaclass-classmethod-first-arg=mcs + +# List of member names, which should be excluded from the protected access +# warning. +exclude-protected=_asdict,_fields,_replace,_source,_make + + +[EXCEPTIONS] + +# Exceptions that will emit a warning when being caught. 
Defaults to +# "Exception" +overgeneral-exceptions=Exception \ No newline at end of file diff --git a/agents/trainer/task.py b/agents/trainer/task.py index 4c82954e..fee5beb4 100644 --- a/agents/trainer/task.py +++ b/agents/trainer/task.py @@ -12,18 +12,18 @@ """Provides an entrypoint for the training task.""" +#pylint: disable=unused-import + from __future__ import absolute_import, division, print_function -import argparse import datetime import logging import os import pprint import uuid -import pip -import tensorflow as tf from google.cloud import storage +import tensorflow as tf import agents import pybullet_envs # To make AntBulletEnv-v0 available. @@ -113,39 +113,39 @@ def hparams_base(): """Base hparams tf/Agents PPO """ # General - algorithm = agents.ppo.PPOAlgorithm - num_agents = 30 - eval_episodes = 30 - use_gpu = False +# algorithm = agents.ppo.PPOAlgorithm +# num_agents = 30 +# eval_episodes = 30 +# use_gpu = False # Environment - env = 'KukaBulletEnv-v0' - normalize_ranges = True - max_length = 1000 +# env = 'KukaBulletEnv-v0' +# normalize_ranges = True +# max_length = 1000 # Network - network = agents.scripts.networks.feed_forward_gaussian - weight_summaries = dict( - all=r'.*', policy=r'.*/policy/.*', value=r'.*/value/.*') - policy_layers = 200, 100 - value_layers = 200, 100 - init_output_factor = 0.1 - init_logstd = -1 - init_std = 0.35 +# network = agents.scripts.networks.feed_forward_gaussian +# weight_summaries = dict( +# all=r'.*', policy=r'.*/policy/.*', value=r'.*/value/.*') +# policy_layers = 200, 100 +# value_layers = 200, 100 +# init_output_factor = 0.1 +# init_logstd = -1 +# init_std = 0.35 # Optimization - update_every = 60 - update_epochs = 25 - optimizer = tf.train.AdamOptimizer - learning_rate = 1e-4 - steps = 3e7 # 30M +# update_every = 60 +# update_epochs = 25 +# optimizer = tf.train.AdamOptimizer +# learning_rate = 1e-4 +# steps = 3e7 # 30M # Losses - discount = 0.995 - kl_target = 1e-2 - kl_cutoff_factor = 2 - kl_cutoff_coef = 1000 - kl_init_penalty = 1 +# discount = 0.995 +# kl_target = 1e-2 +# kl_cutoff_factor = 2 +# kl_cutoff_coef = 1000 +# kl_init_penalty = 1 return locals() @@ -158,9 +158,9 @@ def _object_import_from_string(name): return mod -def _realize_import_attrs(d, filter): +def _realize_import_attrs(d, hparam_filter): for k, v in d.items(): - if k in filter: + if k in hparam_filter: imported = _object_import_from_string(v) # TODO: Provide an appropriately informative error if the import fails # except ImportError as e: @@ -170,7 +170,7 @@ def _realize_import_attrs(d, filter): return d -def _get_agents_configuration(hparam_set_name, log_dir=None, is_chief=False): +def _get_agents_configuration(log_dir=None): """Load hyperparameter config.""" try: # Try to resume training. 
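+    # Illustrative sketch only (hypothetical call): resuming means re-reading
+    # the config that a previous run saved under log_dir, along the lines of
+    #   config = agents.scripts.utility.load_config(log_dir)
+    # with a fallback to the hparams defaults when nothing has been saved yet.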
@@ -243,23 +243,20 @@ def gcs_upload(local_dir, gcs_out_dir):
     blob.upload_from_filename(local_file_path)


-def main(unused_argv):
+def main(_):
   """Run training."""
   tf.logging.set_verbosity(tf.logging.INFO)
   if FLAGS.debug:
     tf.logging.set_verbosity(tf.logging.DEBUG)

-  run_config = tf.contrib.learn.RunConfig()
-  log_dir = FLAGS.logdir
-  agents_config = _get_agents_configuration(
-      FLAGS.hparam_set_id, log_dir, run_config.is_chief)
+  agents_config = _get_agents_configuration(FLAGS.logdir)

   if FLAGS.run_mode == 'train':
     for score in agents.scripts.train.train(agents_config, env_processes=True):
-      logging.info('Score {}.'.format(score))
+      logging.info('Score %s.', score)
   if FLAGS.run_mode == 'render':
     now = datetime.datetime.now()
     subdir = now.strftime("%m%d-%H%M") + "-" + uuid.uuid4().hex[0:4]
diff --git a/github_issue_summarization/docker/flask_web/app.py b/github_issue_summarization/docker/flask_web/app.py
index 993ece2c..4296dad6 100644
--- a/github_issue_summarization/docker/flask_web/app.py
+++ b/github_issue_summarization/docker/flask_web/app.py
@@ -2,38 +2,47 @@
 Simple app that parses predictions from a trained model and displays them.
 """
-from flask import Flask, json, render_template, request
 import requests

-app = Flask(__name__)
+from flask import Flask, json, render_template, request

+APP = Flask(__name__)

-@app.route("/")
+@APP.route("/")
 def index():
+  """Default route.
+
+  Serves the landing page template.
+  """
   return render_template("index.html")

-@app.route("/summary", methods=['GET', 'POST'])
+@APP.route("/summary", methods=['GET', 'POST'])
 def summary():
+  """Main prediction route.
+
+  Provides a machine-generated summary of the given text. Sends a request to a live
+  model trained on GitHub issues.
+  """
   if request.method == 'POST':
     issue_text = request.form["issue_text"]
     url = "http://ambassador:80/seldon/issue-summarization/api/v0.1/predictions"
-    headers = { 'content-type': 'application/json' }
+    headers = {'content-type': 'application/json'}
     json_data = {
-      "data" : {
-        "ndarray" : [[ issue_text ]]
-      }
+        "data" : {
+            "ndarray" : [[issue_text]]
+        }
     }
-    r = requests.post(url = url,
-                      headers = headers,
-                      data = json.dumps(json_data))
+    response = requests.post(url=url,
+                             headers=headers,
+                             data=json.dumps(json_data))

-    rjs = json.loads(r.text)
-    summary = rjs["data"]["ndarray"][0][0]
+    response_json = json.loads(response.text)
+    issue_summary = response_json["data"]["ndarray"][0][0]

-    return render_template("summary.html",
-                           issue_text = issue_text,
-                           summary = summary)
+    return render_template("issue_summary.html",
+                           issue_text=issue_text,
+                           issue_summary=issue_summary)
+  return ('', 204)

 if __name__ == '__main__':
-  app.run(debug = True, host = '0.0.0.0', port = 80)
-
+  APP.run(debug=True, host='0.0.0.0', port=80)
diff --git a/github_issue_summarization/notebooks/IssueSummarization.py b/github_issue_summarization/notebooks/IssueSummarization.py
deleted file mode 100644
index 4dc9bc04..00000000
--- a/github_issue_summarization/notebooks/IssueSummarization.py
+++ /dev/null
@@ -1,22 +0,0 @@
-from __future__ import print_function
-
-import dill as dpickle
-import numpy as np
-from keras.models import load_model
-
-from seq2seq_utils import Seq2Seq_Inference
-
-
-class IssueSummarization(object):
-
-  def __init__(self):
-    with open('body_pp.dpkl', 'rb') as f:
-      body_pp = dpickle.load(f)
-    with open('title_pp.dpkl', 'rb') as f:
-      title_pp = dpickle.load(f)
-    self.model = Seq2Seq_Inference(encoder_preprocessor=body_pp,
-                                   decoder_preprocessor=title_pp,
-                                   seq2seq_model=load_model('seq2seq_model_tutorial.h5'))
-
-  def predict(self, X, feature_names):
-    return np.asarray([[self.model.generate_issue_title(body[0])[1]] for body in X])
diff --git a/github_issue_summarization/notebooks/issue_summarization.py b/github_issue_summarization/notebooks/issue_summarization.py
new file mode 100644
index 00000000..2286d444
--- /dev/null
+++ b/github_issue_summarization/notebooks/issue_summarization.py
@@ -0,0 +1,25 @@
+"""Generates predictions using a stored model.
+
+Uses trained model files to generate a prediction.
+"""
+
+from __future__ import print_function
+
+import numpy as np
+import dill as dpickle
+from keras.models import load_model
+from seq2seq_utils import Seq2Seq_Inference
+
+class IssueSummarization(object):
+
+  def __init__(self):
+    with open('body_pp.dpkl', 'rb') as body_file:
+      body_pp = dpickle.load(body_file)
+    with open('title_pp.dpkl', 'rb') as title_file:
+      title_pp = dpickle.load(title_file)
+    self.model = Seq2Seq_Inference(encoder_preprocessor=body_pp,
+                                   decoder_preprocessor=title_pp,
+                                   seq2seq_model=load_model('seq2seq_model_tutorial.h5'))
+
+  def predict(self, input_text):
+    return np.asarray([[self.model.generate_issue_title(body[0])[1]] for body in input_text])
diff --git a/github_issue_summarization/notebooks/seq2seq_utils.py b/github_issue_summarization/notebooks/seq2seq_utils.py
index 122ecc30..0ddaebfa 100644
--- a/github_issue_summarization/notebooks/seq2seq_utils.py
+++ b/github_issue_summarization/notebooks/seq2seq_utils.py
@@ -1,429 +1,432 @@
+import logging
+import dill as dpickle
+import numpy as np
 from matplotlib import pyplot as plt
 import tensorflow as tf
+from IPython.display import SVG, display
 from keras import backend as K
 from keras.layers import Input
 from keras.models import Model
-from IPython.display import SVG, display
 from keras.utils.vis_utils import model_to_dot
-import logging
-import numpy as np
-import dill as dpickle
 from annoy import AnnoyIndex
 from tqdm import tqdm, tqdm_notebook
-from random import random
 from nltk.translate.bleu_score import corpus_bleu

 def load_text_processor(fname='title_pp.dpkl'):
-    """
-    Load preprocessors from disk.
+  """
+  Load preprocessors from disk.
- Parameters - ---------- - fname: str - file name of ktext.proccessor object + Parameters + ---------- + fname: str + file name of ktext.proccessor object - Returns - ------- - num_tokens : int - size of vocabulary loaded into ktext.processor - pp : ktext.processor - the processor you are trying to load + Returns + ------- + num_tokens : int + size of vocabulary loaded into ktext.processor + pp : ktext.processor + the processor you are trying to load - Typical Usage: - ------------- + Typical Usage: + ------------- - num_decoder_tokens, title_pp = load_text_processor(fname='title_pp.dpkl') - num_encoder_tokens, body_pp = load_text_processor(fname='body_pp.dpkl') + num_decoder_tokens, title_pp = load_text_processor(fname='title_pp.dpkl') + num_encoder_tokens, body_pp = load_text_processor(fname='body_pp.dpkl') - """ - # Load files from disk - with open(fname, 'rb') as f: - pp = dpickle.load(f) + """ + # Load files from disk + with open(fname, 'rb') as f: + pp = dpickle.load(f) - num_tokens = max(pp.id2token.keys()) + 1 - print(f'Size of vocabulary for {fname}: {num_tokens:,}') - return num_tokens, pp + num_tokens = max(pp.id2token.keys()) + 1 + print('Size of vocabulary for {}: {}'.format(fname, num_tokens)) + return num_tokens, pp def load_decoder_inputs(decoder_np_vecs='train_title_vecs.npy'): - """ - Load decoder inputs. + """ + Load decoder inputs. - Parameters - ---------- - decoder_np_vecs : str - filename of serialized numpy.array of decoder input (issue title) + Parameters + ---------- + decoder_np_vecs : str + filename of serialized numpy.array of decoder input (issue title) - Returns - ------- - decoder_input_data : numpy.array - The data fed to the decoder as input during training for teacher forcing. - This is the same as `decoder_np_vecs` except the last position. - decoder_target_data : numpy.array - The data that the decoder data is trained to generate (issue title). - Calculated by sliding `decoder_np_vecs` one position forward. + Returns + ------- + decoder_input_data : numpy.array + The data fed to the decoder as input during training for teacher forcing. + This is the same as `decoder_np_vecs` except the last position. + decoder_target_data : numpy.array + The data that the decoder data is trained to generate (issue title). + Calculated by sliding `decoder_np_vecs` one position forward. - """ - vectorized_title = np.load(decoder_np_vecs) - # For Decoder Input, you don't need the last word as that is only for prediction - # when we are training using Teacher Forcing. - decoder_input_data = vectorized_title[:, :-1] + """ + vectorized_title = np.load(decoder_np_vecs) + # For Decoder Input, you don't need the last word as that is only for prediction + # when we are training using Teacher Forcing. 
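+  # Illustrative example (hypothetical 4-token row): if a row of
+  # vectorized_title is [_start_, w1, w2, _end_], decoder_input_data below is
+  # [_start_, w1, w2] and decoder_target_data is [w1, w2, _end_].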
+ decoder_input_data = vectorized_title[:, :-1] - # Decoder Target Data Is Ahead By 1 Time Step From Decoder Input Data (Teacher Forcing) - decoder_target_data = vectorized_title[:, 1:] + # Decoder Target Data Is Ahead By 1 Time Step From Decoder Input Data (Teacher Forcing) + decoder_target_data = vectorized_title[:, 1:] - print(f'Shape of decoder input: {decoder_input_data.shape}') - print(f'Shape of decoder target: {decoder_target_data.shape}') - return decoder_input_data, decoder_target_data + print('Shape of decoder input: {}'.format(decoder_input_data.shape)) + print('Shape of decoder target: {}'.format(decoder_target_data.shape)) + return decoder_input_data, decoder_target_data def load_encoder_inputs(encoder_np_vecs='train_body_vecs.npy'): - """ - Load variables & data that are inputs to encoder. + """ + Load variables & data that are inputs to encoder. - Parameters - ---------- - encoder_np_vecs : str - filename of serialized numpy.array of encoder input (issue title) + Parameters + ---------- + encoder_np_vecs : str + filename of serialized numpy.array of encoder input (issue title) - Returns - ------- - encoder_input_data : numpy.array - The issue body - doc_length : int - The standard document length of the input for the encoder after padding - the shape of this array will be (num_examples, doc_length) + Returns + ------- + encoder_input_data : numpy.array + The issue body + doc_length : int + The standard document length of the input for the encoder after padding + the shape of this array will be (num_examples, doc_length) - """ - vectorized_body = np.load(encoder_np_vecs) - # Encoder input is simply the body of the issue text - encoder_input_data = vectorized_body - doc_length = encoder_input_data.shape[1] - print(f'Shape of encoder input: {encoder_input_data.shape}') - return encoder_input_data, doc_length + """ + vectorized_body = np.load(encoder_np_vecs) + # Encoder input is simply the body of the issue text + encoder_input_data = vectorized_body + doc_length = encoder_input_data.shape[1] + print('Shape of encoder input: {}'.format(encoder_input_data.shape)) + return encoder_input_data, doc_length def viz_model_architecture(model): - """Visualize model architecture in Jupyter notebook.""" - display(SVG(model_to_dot(model).create(prog='dot', format='svg'))) + """Visualize model architecture in Jupyter notebook.""" + display(SVG(model_to_dot(model).create(prog='dot', format='svg'))) def free_gpu_mem(): - """Attempt to free gpu memory.""" - K.get_session().close() - cfg = K.tf.ConfigProto() - cfg.gpu_options.allow_growth = True - K.set_session(K.tf.Session(config=cfg)) + """Attempt to free gpu memory.""" + K.get_session().close() + cfg = K.tf.ConfigProto() + cfg.gpu_options.allow_growth = True + K.set_session(K.tf.Session(config=cfg)) def test_gpu(): - """Run a toy computation task in tensorflow to test GPU.""" - config = tf.ConfigProto() - config.gpu_options.allow_growth = True - session = tf.Session(config=config) - hello = tf.constant('Hello, TensorFlow!') - print(session.run(hello)) + """Run a toy computation task in tensorflow to test GPU.""" + config = tf.ConfigProto() + config.gpu_options.allow_growth = True + session = tf.Session(config=config) + hello = tf.constant('Hello, TensorFlow!') + print(session.run(hello)) def plot_model_training_history(history_object): - """Plots model train vs. 
validation loss.""" - plt.title('model accuracy') - plt.ylabel('accuracy') - plt.xlabel('epoch') - plt.plot(history_object.history['loss']) - plt.plot(history_object.history['val_loss']) - plt.legend(['train', 'test'], loc='upper left') - plt.show() + """Plots model train vs. validation loss.""" + plt.title('model accuracy') + plt.ylabel('accuracy') + plt.xlabel('epoch') + plt.plot(history_object.history['loss']) + plt.plot(history_object.history['val_loss']) + plt.legend(['train', 'test'], loc='upper left') + plt.show() def extract_encoder_model(model): - """ - Extract the encoder from the original Sequence to Sequence Model. + """ + Extract the encoder from the original Sequence to Sequence Model. - Returns a keras model object that has one input (body of issue) and one - output (encoding of issue, which is the last hidden state). + Returns a keras model object that has one input (body of issue) and one + output (encoding of issue, which is the last hidden state). - Input: - ----- - model: keras model object + Input: + ----- + model: keras model object - Returns: - ----- - keras model object + Returns: + ----- + keras model object - """ - encoder_model = model.get_layer('Encoder-Model') - return encoder_model + """ + encoder_model = model.get_layer('Encoder-Model') + return encoder_model def extract_decoder_model(model): - """ - Extract the decoder from the original model. + """ + Extract the decoder from the original model. - Inputs: - ------ - model: keras model object + Inputs: + ------ + model: keras model object - Returns: - ------- - A Keras model object with the following inputs and outputs: + Returns: + ------- + A Keras model object with the following inputs and outputs: - Inputs of Keras Model That Is Returned: - 1: the embedding index for the last predicted word or the indicator - 2: the last hidden state, or in the case of the first word the hidden state from the encoder + Inputs of Keras Model That Is Returned: + 1: the embedding index for the last predicted word or the indicator + 2: the last hidden state, or in the case of the first word the hidden state from the encoder - Outputs of Keras Model That Is Returned: - 1. Prediction (class probabilities) for the next word - 2. The hidden state of the decoder, to be fed back into the decoder at the next time step + Outputs of Keras Model That Is Returned: + 1. Prediction (class probabilities) for the next word + 2. The hidden state of the decoder, to be fed back into the decoder at the next time step - Implementation Notes: - ---------------------- - Must extract relevant layers and reconstruct part of the computation graph - to allow for different inputs as we are not going to use teacher forcing at - inference time. + Implementation Notes: + ---------------------- + Must extract relevant layers and reconstruct part of the computation graph + to allow for different inputs as we are not going to use teacher forcing at + inference time. 
-    """
-    # the latent dimension is the same throughout the architecture so we are going to
-    # cheat and grab the latent dimension of the embedding because that is the same as what is
-    # output from the decoder
-    latent_dim = model.get_layer('Decoder-Word-Embedding').output_shape[-1]
+  """
+  # the latent dimension is the same throughout the architecture so we are going to
+  # cheat and grab the latent dimension of the embedding because that is the same as what is
+  # output from the decoder
+  latent_dim = model.get_layer('Decoder-Word-Embedding').output_shape[-1]

-    # Reconstruct the input into the decoder
-    decoder_inputs = model.get_layer('Decoder-Input').input
-    dec_emb = model.get_layer('Decoder-Word-Embedding')(decoder_inputs)
-    dec_bn = model.get_layer('Decoder-Batchnorm-1')(dec_emb)
+  # Reconstruct the input into the decoder
+  decoder_inputs = model.get_layer('Decoder-Input').input
+  dec_emb = model.get_layer('Decoder-Word-Embedding')(decoder_inputs)
+  dec_bn = model.get_layer('Decoder-Batchnorm-1')(dec_emb)

-    # Instead of setting the intial state from the encoder and forgetting about it, during inference
-    # we are not doing teacher forcing, so we will have to have a feedback loop from predictions back into
-    # the GRU, thus we define this input layer for the state so we can add this capability
-    gru_inference_state_input = Input(shape=(latent_dim,), name='hidden_state_input')
+  # Instead of setting the initial state from the encoder and forgetting about it, during inference
+  # we are not doing teacher forcing, so we will have to have a feedback loop from predictions back
+  # into the GRU, thus we define this input layer for the state so we can add this capability
+  gru_inference_state_input = Input(shape=(latent_dim,), name='hidden_state_input')

-    # we need to reuse the weights that is why we are getting this
-    # If you inspect the decoder GRU that we created for training, it will take as input
-    # 2 tensors -> (1) is the embedding layer output for the teacher forcing
-    #              (which will now be the last step's prediction, and will be _start_ on the first time step)
-    #              (2) is the state, which we will initialize with the encoder on the first time step, but then
-    #              grab the state after the first prediction and feed that back in again.
-    gru_out, gru_state_out = model.get_layer('Decoder-GRU')([dec_bn, gru_inference_state_input])
+  # we need to reuse the trained weights; that is why we fetch this layer
+  # If you inspect the decoder GRU that we created for training, it will take as input
+  # 2 tensors -> (1) is the embedding layer output for the teacher forcing
+  #              (which will now be the last step's prediction, and will be _start_ on the
+  #              first time step)
+  #              (2) is the state, which we will initialize with the encoder on the first time step
+  #              but then grab the state after the first prediction and feed that back in again.
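+  # Resulting call pattern at inference time (sketch; variable names as used in
+  # generate_issue_title below):
+  #   step 1: preds, st = decoder_model.predict([start_token_id, body_encoding])
+  #   step t: preds, st = decoder_model.predict([last_pred_id, st])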
+ gru_out, gru_state_out = model.get_layer('Decoder-GRU')([dec_bn, gru_inference_state_input]) - # Reconstruct dense layers - dec_bn2 = model.get_layer('Decoder-Batchnorm-2')(gru_out) - dense_out = model.get_layer('Final-Output-Dense')(dec_bn2) - decoder_model = Model([decoder_inputs, gru_inference_state_input], - [dense_out, gru_state_out]) - return decoder_model + # Reconstruct dense layers + dec_bn2 = model.get_layer('Decoder-Batchnorm-2')(gru_out) + dense_out = model.get_layer('Final-Output-Dense')(dec_bn2) + decoder_model = Model([decoder_inputs, gru_inference_state_input], + [dense_out, gru_state_out]) + return decoder_model class Seq2Seq_Inference(object): - def __init__(self, - encoder_preprocessor, - decoder_preprocessor, - seq2seq_model): - self.pp_body = encoder_preprocessor - self.pp_title = decoder_preprocessor - self.seq2seq_model = seq2seq_model - self.encoder_model = extract_encoder_model(seq2seq_model) - self.decoder_model = extract_decoder_model(seq2seq_model) - self.default_max_len_title = self.pp_title.padding_maxlen - self.nn = None - self.rec_df = None + # pylint: disable=too-many-instance-attributes - def generate_issue_title(self, - raw_input_text, - max_len_title=None): - """ - Use the seq2seq model to generate a title given the body of an issue. + def __init__(self, + encoder_preprocessor, + decoder_preprocessor, + seq2seq_model): - Inputs - ------ - raw_input: str - The body of the issue text as an input string + self.pp_body = encoder_preprocessor + self.pp_title = decoder_preprocessor + self.seq2seq_model = seq2seq_model + self.encoder_model = extract_encoder_model(seq2seq_model) + self.decoder_model = extract_decoder_model(seq2seq_model) + self.default_max_len_title = self.pp_title.padding_maxlen + self.nn = None + self.rec_df = None - max_len_title: int (optional) - The maximum length of the title the model will generate + def generate_issue_title(self, + raw_input_text, + max_len_title=None): + """ + Use the seq2seq model to generate a title given the body of an issue. - """ - if max_len_title is None: - max_len_title = self.default_max_len_title - # get the encoder's features for the decoder - raw_tokenized = self.pp_body.transform([raw_input_text]) - body_encoding = self.encoder_model.predict(raw_tokenized) - # we want to save the encoder's embedding before its updated by decoder - # because we can use that as an embedding for other tasks. - original_body_encoding = body_encoding - state_value = np.array(self.pp_title.token2id['_start_']).reshape(1, 1) + Inputs + ------ + raw_input: str + The body of the issue text as an input string - decoded_sentence = [] - stop_condition = False - while not stop_condition: - preds, st = self.decoder_model.predict([state_value, body_encoding]) + max_len_title: int (optional) + The maximum length of the title the model will generate - # We are going to ignore indices 0 (padding) and indices 1 (unknown) - # Argmax will return the integer index corresponding to the - # prediction + 2 b/c we chopped off first two - pred_idx = np.argmax(preds[:, :, 2:]) + 2 + """ + if max_len_title is None: + max_len_title = self.default_max_len_title + # get the encoder's features for the decoder + raw_tokenized = self.pp_body.transform([raw_input_text]) + body_encoding = self.encoder_model.predict(raw_tokenized) + # we want to save the encoder's embedding before its updated by decoder + # because we can use that as an embedding for other tasks. 
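+    # e.g. print_example() below hands this saved encoding to the annoy index
+    # (nn.get_nns_by_vector) to look up similar issues.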
+ original_body_encoding = body_encoding + state_value = np.array(self.pp_title.token2id['_start_']).reshape(1, 1) - # retrieve word from index prediction - pred_word_str = self.pp_title.id2token[pred_idx] + decoded_sentence = [] + stop_condition = False + while not stop_condition: + preds, st = self.decoder_model.predict([state_value, body_encoding]) - if pred_word_str == '_end_' or len(decoded_sentence) >= max_len_title: - stop_condition = True - break - decoded_sentence.append(pred_word_str) + # We are going to ignore indices 0 (padding) and indices 1 (unknown) + # Argmax will return the integer index corresponding to the + # prediction + 2 b/c we chopped off first two + pred_idx = np.argmax(preds[:, :, 2:]) + 2 - # update the decoder for the next word - body_encoding = st - state_value = np.array(pred_idx).reshape(1, 1) + # retrieve word from index prediction + pred_word_str = self.pp_title.id2token[pred_idx] - return original_body_encoding, ' '.join(decoded_sentence) + if pred_word_str == '_end_' or len(decoded_sentence) >= max_len_title: + stop_condition = True + break + decoded_sentence.append(pred_word_str) + + # update the decoder for the next word + body_encoding = st + state_value = np.array(pred_idx).reshape(1, 1) + + return original_body_encoding, ' '.join(decoded_sentence) - def print_example(self, - i, - body_text, - title_text, - url, - threshold): - """ - Prints an example of the model's prediction for manual inspection. - """ - if i: - print('\n\n==============================================') - print(f'============== Example # {i} =================\n') + def print_example(self, + i, + body_text, + title_text, + url, + threshold): + """ + Prints an example of the model's prediction for manual inspection. + """ + if i: + print('\n\n==============================================') + print('============== Example # {} =================\n'.format(i)) - if url: - print(url) + if url: + print(url) - print(f"Issue Body:\n {body_text} \n") + print("Issue Body:\n {} \n".format(body_text)) - if title_text: - print(f"Original Title:\n {title_text}") + if title_text: + print("Original Title:\n {}".format(title_text)) - emb, gen_title = self.generate_issue_title(body_text) - print(f"\n****** Machine Generated Title (Prediction) ******:\n {gen_title}") + emb, gen_title = self.generate_issue_title(body_text) + print("\n****** Machine Generated Title (Prediction) ******:\n {}".format(gen_title)) - if self.nn: - # return neighbors and distances - n, d = self.nn.get_nns_by_vector(emb.flatten(), n=4, - include_distances=True) - neighbors = n[1:] - dist = d[1:] + if self.nn: + # return neighbors and distances + n, d = self.nn.get_nns_by_vector(emb.flatten(), n=4, + include_distances=True) + neighbors = n[1:] + dist = d[1:] - if min(dist) <= threshold: - cols = ['issue_url', 'issue_title', 'body'] - dfcopy = self.rec_df.iloc[neighbors][cols].copy(deep=True) - dfcopy['dist'] = dist - similar_issues_df = dfcopy.query(f'dist <= {threshold}') + if min(dist) <= threshold: + cols = ['issue_url', 'issue_title', 'body'] + dfcopy = self.rec_df.iloc[neighbors][cols].copy(deep=True) + dfcopy['dist'] = dist + similar_issues_df = dfcopy.query('dist <= {}'.format(threshold)) - print("\n**** Similar Issues (using encoder embedding) ****:\n") - display(similar_issues_df) + print("\n**** Similar Issues (using encoder embedding) ****:\n") + display(similar_issues_df) - def demo_model_predictions(self, - n, - issue_df, - threshold=1): - """ - Pick n random Issues and display predictions. 
+ def demo_model_predictions(self, + n, + issue_df, + threshold=1): + """ + Pick n random Issues and display predictions. - Input: - ------ - n : int - Number of issues to display from issue_df - issue_df : pandas DataFrame - DataFrame that contains two columns: `body` and `issue_title`. - threshold : float - distance threshold for recommendation of similar issues. + Input: + ------ + n : int + Number of issues to display from issue_df + issue_df : pandas DataFrame + DataFrame that contains two columns: `body` and `issue_title`. + threshold : float + distance threshold for recommendation of similar issues. - Returns: - -------- - None - Prints the original issue body and the model's prediction. - """ - # Extract body and title from DF - body_text = issue_df.body.tolist() - title_text = issue_df.issue_title.tolist() - url = issue_df.issue_url.tolist() + Returns: + -------- + None + Prints the original issue body and the model's prediction. + """ + # Extract body and title from DF + body_text = issue_df.body.tolist() + title_text = issue_df.issue_title.tolist() + url = issue_df.issue_url.tolist() - demo_list = np.random.randint(low=1, high=len(body_text), size=n) - for i in demo_list: - self.print_example(i, - body_text=body_text[i], - title_text=title_text[i], - url=url[i], - threshold=threshold) + demo_list = np.random.randint(low=1, high=len(body_text), size=n) + for i in demo_list: + self.print_example(i, + body_text=body_text[i], + title_text=title_text[i], + url=url[i], + threshold=threshold) - def prepare_recommender(self, vectorized_array, original_df): - """ - Use the annoy library to build recommender + def prepare_recommender(self, vectorized_array, original_df): + """ + Use the annoy library to build recommender - Parameters - ---------- - vectorized_array : List[List[int]] - This is the list of list of integers that represents your corpus - that is fed into the seq2seq model for training. - original_df : pandas.DataFrame - This is the original dataframe that has the columns - ['issue_url', 'issue_title', 'body'] + Parameters + ---------- + vectorized_array : List[List[int]] + This is the list of list of integers that represents your corpus + that is fed into the seq2seq model for training. 
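+      (for example, the padded integer array produced by body_pp.fit_transform
+      during preprocessing; name used for illustration only)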
+ original_df : pandas.DataFrame + This is the original dataframe that has the columns + ['issue_url', 'issue_title', 'body'] - Returns - ------- - annoy.AnnoyIndex object (see https://github.com/spotify/annoy) - """ - self.rec_df = original_df - emb = self.encoder_model.predict(x=vectorized_array, - batch_size=vectorized_array.shape[0]//200) + Returns + ------- + annoy.AnnoyIndex object (see https://github.com/spotify/annoy) + """ + self.rec_df = original_df + emb = self.encoder_model.predict(x=vectorized_array, + batch_size=vectorized_array.shape[0]//200) - f = emb.shape[1] - self.nn = AnnoyIndex(f) - logging.warning('Adding embeddings') - for i in tqdm(range(len(emb))): - self.nn.add_item(i, emb[i]) - logging.warning('Building trees for similarity lookup.') - self.nn.build(50) - return self.nn + f = emb.shape[1] + self.nn = AnnoyIndex(f) + logging.warning('Adding embeddings') + for i in tqdm(range(len(emb))): + self.nn.add_item(i, emb[i]) + logging.warning('Building trees for similarity lookup.') + self.nn.build(50) + return self.nn - def set_recsys_data(self, original_df): - self.rec_df = original_df + def set_recsys_data(self, original_df): + self.rec_df = original_df - def set_recsys_annoyobj(self, annoyobj): - self.nn = annoyobj + def set_recsys_annoyobj(self, annoyobj): + self.nn = annoyobj - def evaluate_model(self, holdout_bodies, holdout_titles): - """ - Method for calculating BLEU Score. + def evaluate_model(self, holdout_bodies, holdout_titles): + """ + Method for calculating BLEU Score. - Parameters - ---------- - holdout_bodies : List[str] - These are the issue bodies that we want to summarize - holdout_titles : List[str] - This is the ground truth we are trying to predict --> issue titles + Parameters + ---------- + holdout_bodies : List[str] + These are the issue bodies that we want to summarize + holdout_titles : List[str] + This is the ground truth we are trying to predict --> issue titles - Returns - ------- - bleu : float - The BLEU Score + Returns + ------- + bleu : float + The BLEU Score - """ - actual, predicted = list(), list() - assert len(holdout_bodies) == len(holdout_titles) - num_examples = len(holdout_bodies) + """ + actual, predicted = list(), list() + assert len(holdout_bodies) == len(holdout_titles) + num_examples = len(holdout_bodies) - logging.warning('Generating predictions.') - # step over the whole set TODO: parallelize this - for i in tqdm_notebook(range(num_examples)): - _, yhat = self.generate_issue_title(holdout_bodies[i]) + logging.warning('Generating predictions.') + # step over the whole set TODO: parallelize this + for i in tqdm_notebook(range(num_examples)): + _, yhat = self.generate_issue_title(holdout_bodies[i]) - actual.append(self.pp_title.process_text([holdout_titles[i]])[0]) - predicted.append(self.pp_title.process_text([yhat])[0]) - - # calculate BLEU score - logging.warning('Calculating BLEU.') - #must be careful with nltk api for corpus_bleu!, - # expects List[List[List[str]]] for ground truth, using List[List[str]] will give you - # erroneous results. - bleu = corpus_bleu([[a] for a in actual], predicted) - return bleu + actual.append(self.pp_title.process_text([holdout_titles[i]])[0]) + predicted.append(self.pp_title.process_text([yhat])[0]) + + # calculate BLEU score + logging.warning('Calculating BLEU.') + #must be careful with nltk api for corpus_bleu!, + # expects List[List[List[str]]] for ground truth, using List[List[str]] will give you + # erroneous results. 
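+    # Illustrative toy call (hypothetical tokens), one perfectly predicted title:
+    #   corpus_bleu([[['fix', 'the', 'broken', 'build']]],
+    #               [['fix', 'the', 'broken', 'build']])  # -> ~1.0
+    # Each reference is wrapped in its own list: one List[List[str]] per prediction.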
+ bleu = corpus_bleu([[a] for a in actual], predicted) + return bleu diff --git a/github_issue_summarization/workflow/workspace/src/prediction.py b/github_issue_summarization/workflow/workspace/src/prediction.py index 5539f6c0..3f11813c 100644 --- a/github_issue_summarization/workflow/workspace/src/prediction.py +++ b/github_issue_summarization/workflow/workspace/src/prediction.py @@ -1,8 +1,6 @@ import argparse import keras import pandas as pd -from seq2seq_utils import load_decoder_inputs -from seq2seq_utils import load_encoder_inputs from seq2seq_utils import load_text_processor from seq2seq_utils import Seq2Seq_Inference @@ -29,5 +27,5 @@ seq2seq_inf = Seq2Seq_Inference(encoder_preprocessor=body_pp, decoder_preprocessor=title_pp, seq2seq_model=seq2seq_Model) -# Output predictions for n random rows in the test set. +# Output predictions for n random rows in the test set. seq2seq_inf.demo_model_predictions(n=args.input_prediction_count, issue_df=testdf) diff --git a/github_issue_summarization/workflow/workspace/src/preprocess_data_for_deep_learning.py b/github_issue_summarization/workflow/workspace/src/preprocess_data_for_deep_learning.py index 125717e0..4928d4a6 100644 --- a/github_issue_summarization/workflow/workspace/src/preprocess_data_for_deep_learning.py +++ b/github_issue_summarization/workflow/workspace/src/preprocess_data_for_deep_learning.py @@ -1,7 +1,7 @@ import argparse import dill as dpickle -from ktext.preprocess import processor import numpy as np +from ktext.preprocess import processor import pandas as pd # Parsing flags. @@ -30,7 +30,7 @@ print('Example body after pre-processing:', train_body_vecs[0]) # Instantiate a text processor for the titles, with some different parameters. title_pp = processor(append_indicators=True, keep_n=4500, - padding_maxlen=12, padding ='post') + padding_maxlen=12, padding='post') # process the title data train_title_vecs = title_pp.fit_transform(train_title_raw) @@ -40,10 +40,10 @@ print('Example title after pre-processing:', train_title_vecs[0]) # Save the preprocessor. with open(args.output_body_preprocessor_dpkl, 'wb') as f: - dpickle.dump(body_pp, f) + dpickle.dump(body_pp, f) with open(args.output_title_preprocessor_dpkl, 'wb') as f: - dpickle.dump(title_pp, f) + dpickle.dump(title_pp, f) # Save the processed data. np.save(args.output_train_title_vecs_npy, train_title_vecs) diff --git a/github_issue_summarization/workflow/workspace/src/process_data.py b/github_issue_summarization/workflow/workspace/src/process_data.py index d6b27cf4..6b258a1f 100644 --- a/github_issue_summarization/workflow/workspace/src/process_data.py +++ b/github_issue_summarization/workflow/workspace/src/process_data.py @@ -1,6 +1,4 @@ import argparse -import glob -import logging import pandas as pd from sklearn.model_selection import train_test_split @@ -20,8 +18,8 @@ traindf, testdf = train_test_split(pd.read_csv(args.input_csv).sample(n=args.sam test_size=.10) # Print stats about the shape of the data. -print(f'Train: {traindf.shape[0]:,} rows {traindf.shape[1]:,} columns') -print(f'Test: {testdf.shape[0]:,} rows {testdf.shape[1]:,} columns') +print('Train: {:,} rows {:,} columns'.format(traindf.shape[0], traindf.shape[1])) +print('Test: {:,} rows {:,} columns'.format(testdf.shape[0], testdf.shape[1])) # Store output as CSV. 
traindf.to_csv(args.output_traindf_csv) diff --git a/github_issue_summarization/workflow/workspace/src/recommend.py b/github_issue_summarization/workflow/workspace/src/recommend.py index f755bb4f..03de81ae 100644 --- a/github_issue_summarization/workflow/workspace/src/recommend.py +++ b/github_issue_summarization/workflow/workspace/src/recommend.py @@ -1,8 +1,6 @@ import argparse import keras import pandas as pd -from seq2seq_utils import load_decoder_inputs -from seq2seq_utils import load_encoder_inputs from seq2seq_utils import load_text_processor from seq2seq_utils import Seq2Seq_Inference diff --git a/github_issue_summarization/workflow/workspace/src/seq2seq_utils.py b/github_issue_summarization/workflow/workspace/src/seq2seq_utils.py deleted file mode 100644 index c278dfdb..00000000 --- a/github_issue_summarization/workflow/workspace/src/seq2seq_utils.py +++ /dev/null @@ -1,393 +0,0 @@ -from matplotlib import pyplot as plt -import tensorflow as tf -from keras import backend as K -from keras.layers import Input -from keras.models import Model -from IPython.display import SVG, display -from keras.utils.vis_utils import model_to_dot -import logging -import numpy as np -import dill as dpickle -from annoy import AnnoyIndex -from tqdm import tqdm, tqdm_notebook -from random import random -from nltk.translate.bleu_score import corpus_bleu - -def load_text_processor(fname='title_pp.dpkl'): - """ - Load preprocessors from disk. - Parameters - ---------- - fname: str - file name of ktext.proccessor object - Returns - ------- - num_tokens : int - size of vocabulary loaded into ktext.processor - pp : ktext.processor - the processor you are trying to load - Typical Usage: - ------------- - num_decoder_tokens, title_pp = load_text_processor(fname='title_pp.dpkl') - num_encoder_tokens, body_pp = load_text_processor(fname='body_pp.dpkl') - """ - # Load files from disk - with open(fname, 'rb') as f: - pp = dpickle.load(f) - - num_tokens = max(pp.id2token.keys()) + 1 - print(f'Size of vocabulary for {fname}: {num_tokens:,}') - return num_tokens, pp - - -def load_decoder_inputs(decoder_np_vecs='train_title_vecs.npy'): - """ - Load decoder inputs. - Parameters - ---------- - decoder_np_vecs : str - filename of serialized numpy.array of decoder input (issue title) - Returns - ------- - decoder_input_data : numpy.array - The data fed to the decoder as input during training for teacher forcing. - This is the same as `decoder_np_vecs` except the last position. - decoder_target_data : numpy.array - The data that the decoder data is trained to generate (issue title). - Calculated by sliding `decoder_np_vecs` one position forward. - """ - vectorized_title = np.load(decoder_np_vecs) - # For Decoder Input, you don't need the last word as that is only for prediction - # when we are training using Teacher Forcing. - decoder_input_data = vectorized_title[:, :-1] - - # Decoder Target Data Is Ahead By 1 Time Step From Decoder Input Data (Teacher Forcing) - decoder_target_data = vectorized_title[:, 1:] - - print(f'Shape of decoder input: {decoder_input_data.shape}') - print(f'Shape of decoder target: {decoder_target_data.shape}') - return decoder_input_data, decoder_target_data - - -def load_encoder_inputs(encoder_np_vecs='train_body_vecs.npy'): - """ - Load variables & data that are inputs to encoder. 
- Parameters - ---------- - encoder_np_vecs : str - filename of serialized numpy.array of encoder input (issue title) - Returns - ------- - encoder_input_data : numpy.array - The issue body - doc_length : int - The standard document length of the input for the encoder after padding - the shape of this array will be (num_examples, doc_length) - """ - vectorized_body = np.load(encoder_np_vecs) - # Encoder input is simply the body of the issue text - encoder_input_data = vectorized_body - doc_length = encoder_input_data.shape[1] - print(f'Shape of encoder input: {encoder_input_data.shape}') - return encoder_input_data, doc_length - - -def viz_model_architecture(model): - """Visualize model architecture in Jupyter notebook.""" - display(SVG(model_to_dot(model).create(prog='dot', format='svg'))) - - -def free_gpu_mem(): - """Attempt to free gpu memory.""" - K.get_session().close() - cfg = K.tf.ConfigProto() - cfg.gpu_options.allow_growth = True - K.set_session(K.tf.Session(config=cfg)) - - -def test_gpu(): - """Run a toy computation task in tensorflow to test GPU.""" - config = tf.ConfigProto() - config.gpu_options.allow_growth = True - session = tf.Session(config=config) - hello = tf.constant('Hello, TensorFlow!') - print(session.run(hello)) - - -def plot_model_training_history(history_object): - """Plots model train vs. validation loss.""" - plt.title('model accuracy') - plt.ylabel('accuracy') - plt.xlabel('epoch') - plt.plot(history_object.history['loss']) - plt.plot(history_object.history['val_loss']) - plt.legend(['train', 'test'], loc='upper left') - plt.show() - - -def extract_encoder_model(model): - """ - Extract the encoder from the original Sequence to Sequence Model. - Returns a keras model object that has one input (body of issue) and one - output (encoding of issue, which is the last hidden state). - Input: - ----- - model: keras model object - Returns: - ----- - keras model object - """ - encoder_model = model.get_layer('Encoder-Model') - return encoder_model - - -def extract_decoder_model(model): - """ - Extract the decoder from the original model. - Inputs: - ------ - model: keras model object - Returns: - ------- - A Keras model object with the following inputs and outputs: - Inputs of Keras Model That Is Returned: - 1: the embedding index for the last predicted word or the indicator - 2: the last hidden state, or in the case of the first word the hidden state from the encoder - Outputs of Keras Model That Is Returned: - 1. Prediction (class probabilities) for the next word - 2. The hidden state of the decoder, to be fed back into the decoder at the next time step - Implementation Notes: - ---------------------- - Must extract relevant layers and reconstruct part of the computation graph - to allow for different inputs as we are not going to use teacher forcing at - inference time. 
- """ - # the latent dimension is the same throughout the architecture so we are going to - # cheat and grab the latent dimension of the embedding because that is the same as what is - # output from the decoder - latent_dim = model.get_layer('Decoder-Word-Embedding').output_shape[-1] - - # Reconstruct the input into the decoder - decoder_inputs = model.get_layer('Decoder-Input').input - dec_emb = model.get_layer('Decoder-Word-Embedding')(decoder_inputs) - dec_bn = model.get_layer('Decoder-Batchnorm-1')(dec_emb) - - # Instead of setting the intial state from the encoder and forgetting about it, during inference - # we are not doing teacher forcing, so we will have to have a feedback loop from predictions back into - # the GRU, thus we define this input layer for the state so we can add this capability - gru_inference_state_input = Input(shape=(latent_dim,), name='hidden_state_input') - - # we need to reuse the weights that is why we are getting this - # If you inspect the decoder GRU that we created for training, it will take as input - # 2 tensors -> (1) is the embedding layer output for the teacher forcing - # (which will now be the last step's prediction, and will be _start_ on the first time step) - # (2) is the state, which we will initialize with the encoder on the first time step, but then - # grab the state after the first prediction and feed that back in again. - gru_out, gru_state_out = model.get_layer('Decoder-GRU')([dec_bn, gru_inference_state_input]) - - # Reconstruct dense layers - dec_bn2 = model.get_layer('Decoder-Batchnorm-2')(gru_out) - dense_out = model.get_layer('Final-Output-Dense')(dec_bn2) - decoder_model = Model([decoder_inputs, gru_inference_state_input], - [dense_out, gru_state_out]) - return decoder_model - - -class Seq2Seq_Inference(object): - def __init__(self, - encoder_preprocessor, - decoder_preprocessor, - seq2seq_model): - - self.pp_body = encoder_preprocessor - self.pp_title = decoder_preprocessor - self.seq2seq_model = seq2seq_model - self.encoder_model = extract_encoder_model(seq2seq_model) - self.decoder_model = extract_decoder_model(seq2seq_model) - self.default_max_len_title = self.pp_title.padding_maxlen - self.nn = None - self.rec_df = None - - def generate_issue_title(self, - raw_input_text, - max_len_title=None): - """ - Use the seq2seq model to generate a title given the body of an issue. - Inputs - ------ - raw_input: str - The body of the issue text as an input string - max_len_title: int (optional) - The maximum length of the title the model will generate - """ - if max_len_title is None: - max_len_title = self.default_max_len_title - # get the encoder's features for the decoder - raw_tokenized = self.pp_body.transform([raw_input_text]) - body_encoding = self.encoder_model.predict(raw_tokenized) - # we want to save the encoder's embedding before its updated by decoder - # because we can use that as an embedding for other tasks. 
-class Seq2Seq_Inference(object):
-    """Performs inference with a trained sequence to sequence model."""
-    def __init__(self,
-                 encoder_preprocessor,
-                 decoder_preprocessor,
-                 seq2seq_model):
-
-        self.pp_body = encoder_preprocessor
-        self.pp_title = decoder_preprocessor
-        self.seq2seq_model = seq2seq_model
-        self.encoder_model = extract_encoder_model(seq2seq_model)
-        self.decoder_model = extract_decoder_model(seq2seq_model)
-        self.default_max_len_title = self.pp_title.padding_maxlen
-        self.nn = None
-        self.rec_df = None
-
-    def generate_issue_title(self,
-                             raw_input_text,
-                             max_len_title=None):
-        """
-        Use the seq2seq model to generate a title given the body of an issue.
-        Inputs
-        ------
-        raw_input_text: str
-            The body of the issue text as an input string
-        max_len_title: int (optional)
-            The maximum length of the title the model will generate
-        """
-        if max_len_title is None:
-            max_len_title = self.default_max_len_title
-        # Get the encoder's features for the decoder.
-        raw_tokenized = self.pp_body.transform([raw_input_text])
-        body_encoding = self.encoder_model.predict(raw_tokenized)
-        # Save the encoder's embedding before it is updated by the decoder,
-        # because we can reuse it as an embedding for other tasks.
-        original_body_encoding = body_encoding
-        state_value = np.array(self.pp_title.token2id['_start_']).reshape(1, 1)
-
-        decoded_sentence = []
-        stop_condition = False
-        while not stop_condition:
-            preds, st = self.decoder_model.predict([state_value, body_encoding])
-
-            # Ignore index 0 (padding) and index 1 (unknown). Because argmax is
-            # taken over the array sliced from index 2, we add 2 back to
-            # recover the original token index.
-            pred_idx = np.argmax(preds[:, :, 2:]) + 2
-
-            # Retrieve the word for the predicted index.
-            pred_word_str = self.pp_title.id2token[pred_idx]
-
-            if pred_word_str == '_end_' or len(decoded_sentence) >= max_len_title:
-                stop_condition = True
-                break
-            decoded_sentence.append(pred_word_str)
-
-            # Update the decoder inputs for the next word.
-            body_encoding = st
-            state_value = np.array(pred_idx).reshape(1, 1)
-
-        return original_body_encoding, ' '.join(decoded_sentence)
-
-    def print_example(self,
-                      i,
-                      body_text,
-                      title_text,
-                      url,
-                      threshold):
-        """Print an example of the model's prediction for manual inspection."""
-        if i:
-            print('\n\n==============================================')
-            print(f'============== Example # {i} =================\n')
-
-        if url:
-            print(url)
-
-        print(f"Issue Body:\n {body_text} \n")
-
-        if title_text:
-            print(f"Original Title:\n {title_text}")
-
-        emb, gen_title = self.generate_issue_title(body_text)
-        print(f"\n****** Machine Generated Title (Prediction) ******:\n {gen_title}")
-
-        if self.nn:
-            # Return nearest neighbors and distances; the first hit is the
-            # query issue itself, so it is dropped.
-            n, d = self.nn.get_nns_by_vector(emb.flatten(), n=4,
-                                             include_distances=True)
-            neighbors = n[1:]
-            dist = d[1:]
-
-            if min(dist) <= threshold:
-                cols = ['issue_url', 'issue_title', 'body']
-                dfcopy = self.rec_df.iloc[neighbors][cols].copy(deep=True)
-                dfcopy['dist'] = dist
-                similar_issues_df = dfcopy.query(f'dist <= {threshold}')
-
-                print("\n**** Similar Issues (using encoder embedding) ****:\n")
-                display(similar_issues_df)
-
-    def demo_model_predictions(self,
-                               n,
-                               issue_df,
-                               threshold=1):
-        """
-        Pick n random issues and display predictions.
-        Input:
-        ------
-        n : int
-            Number of issues to display from issue_df
-        issue_df : pandas DataFrame
-            DataFrame that contains the columns `body`, `issue_title` and
-            `issue_url`.
-        threshold : float
-            Distance threshold for recommendation of similar issues.
-        Returns:
-        --------
-        None
-            Prints the original issue body and the model's prediction.
-        """
-        # Extract the body, title and url from the DataFrame.
-        body_text = issue_df.body.tolist()
-        title_text = issue_df.issue_title.tolist()
-        url = issue_df.issue_url.tolist()
-
-        demo_list = np.random.randint(low=1, high=len(body_text), size=n)
-        for i in demo_list:
-            self.print_example(i,
-                               body_text=body_text[i],
-                               title_text=title_text[i],
-                               url=url[i],
-                               threshold=threshold)
-
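# --- Editor's note: a hedged sketch (not part of the original diff) of wiring
# up the inference class. `body_pp` and `title_pp` are hypothetical fitted text
# processors (e.g. ktext objects loaded with dill), and `seq2seq_Model` is the
# trained Keras model produced by train.py.
seq2seq_inf = Seq2Seq_Inference(encoder_preprocessor=body_pp,
                                decoder_preprocessor=title_pp,
                                seq2seq_model=seq2seq_Model)
_, generated_title = seq2seq_inf.generate_issue_title(
    'the app crashes with a stack trace whenever I click the save button')
print(generated_title)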
-    def prepare_recommender(self, vectorized_array, original_df):
-        """
-        Use the annoy library to build a recommender.
-        Parameters
-        ----------
-        vectorized_array : numpy.ndarray
-            The vectorized corpus, shape (num_examples, doc_length), that is
-            fed into the seq2seq model for training.
-        original_df : pandas.DataFrame
-            The original DataFrame that has the columns
-            ['issue_url', 'issue_title', 'body'].
-        Returns
-        -------
-        annoy.AnnoyIndex object (see https://github.com/spotify/annoy)
-        """
-        self.rec_df = original_df
-        # Predict in roughly 200 batches.
-        emb = self.encoder_model.predict(x=vectorized_array,
-                                         batch_size=vectorized_array.shape[0]//200)
-
-        f = emb.shape[1]
-        self.nn = AnnoyIndex(f)
-        logging.warning('Adding embeddings')
-        for i in tqdm(range(len(emb))):
-            self.nn.add_item(i, emb[i])
-        logging.warning('Building trees for similarity lookup.')
-        self.nn.build(50)
-        return self.nn
-
-    def set_recsys_data(self, original_df):
-        self.rec_df = original_df
-
-    def set_recsys_annoyobj(self, annoyobj):
-        self.nn = annoyobj
-
-    def evaluate_model(self, holdout_bodies, holdout_titles):
-        """
-        Calculate the BLEU score on a holdout set.
-        Parameters
-        ----------
-        holdout_bodies : List[str]
-            The issue bodies that we want to summarize.
-        holdout_titles : List[str]
-            The ground truth we are trying to predict (issue titles).
-        Returns
-        -------
-        bleu : float
-            The BLEU score.
-        """
-        actual, predicted = list(), list()
-        assert len(holdout_bodies) == len(holdout_titles)
-        num_examples = len(holdout_bodies)
-
-        logging.warning('Generating predictions.')
-        # Step over the whole set. TODO: parallelize this.
-        for i in tqdm_notebook(range(num_examples)):
-            _, yhat = self.generate_issue_title(holdout_bodies[i])
-
-            # corpus_bleu expects a list of reference token-lists for each
-            # hypothesis, so the single reference is wrapped in a list.
-            actual.append([self.pp_title.process_text([holdout_titles[i]])[0]])
-            predicted.append(self.pp_title.process_text([yhat])[0])
-        # Calculate the BLEU score.
-        logging.warning('Calculating BLEU.')
-        bleu = corpus_bleu(actual, predicted)
-        return bleu
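# --- Editor's note: a hedged sketch (not part of the original diff) of building
# the recommender and scoring a holdout set. `train_body_vecs` (a numpy array),
# `train_df`, `holdout_bodies` and `holdout_titles` are hypothetical and match
# the signatures above.
seq2seq_inf.prepare_recommender(train_body_vecs, train_df)
seq2seq_inf.demo_model_predictions(n=5, issue_df=train_df, threshold=1)
bleu = seq2seq_inf.evaluate_model(holdout_bodies, holdout_titles)
print(f'Corpus BLEU: {bleu:.4f}')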
diff --git a/github_issue_summarization/workflow/workspace/src/train.py b/github_issue_summarization/workflow/workspace/src/train.py
index 0969019a..87dc89ca 100644
--- a/github_issue_summarization/workflow/workspace/src/train.py
+++ b/github_issue_summarization/workflow/workspace/src/train.py
@@ -1,11 +1,10 @@
 import argparse
+import numpy as np
 from keras.callbacks import CSVLogger, ModelCheckpoint
-from keras.layers import Input, LSTM, GRU, Dense, Embedding, Bidirectional, BatchNormalization
+from keras.layers import Input, GRU, Dense, Embedding, BatchNormalization
 from keras.models import Model
 from keras import optimizers
-import numpy as np
 from seq2seq_utils import load_decoder_inputs, load_encoder_inputs, load_text_processor
-from seq2seq_utils import viz_model_architecture
 
 # Parsing flags.
 parser = argparse.ArgumentParser()
@@ -18,7 +17,7 @@ parser.add_argument("--learning_rate", default="0.001")
 args = parser.parse_args()
 print(args)
 
-learning_rate=float(args.learning_rate)
+learning_rate = float(args.learning_rate)
 
 encoder_input_data, doc_length = load_encoder_inputs(args.input_train_body_vecs_npy)
 decoder_input_data, decoder_target_data = load_decoder_inputs(args.input_train_title_vecs_npy)
@@ -35,7 +34,10 @@ latent_dim = 300
 encoder_inputs = Input(shape=(doc_length,), name='Encoder-Input')
 
 # Word embedding for encoder (ex: Issue Body)
-x = Embedding(num_encoder_tokens, latent_dim, name='Body-Word-Embedding', mask_zero=False)(encoder_inputs)
+x = Embedding(num_encoder_tokens,
+              latent_dim,
+              name='Body-Word-Embedding',
+              mask_zero=False)(encoder_inputs)
 x = BatchNormalization(name='Encoder-Batchnorm-1')(x)
 
 # We do not need the `encoder_output`, just the hidden state.
@@ -53,7 +55,10 @@ seq2seq_encoder_out = encoder_model(encoder_inputs)
 decoder_inputs = Input(shape=(None,), name='Decoder-Input')  # for teacher forcing
 
 # Word Embedding For Decoder (ex: Issue Titles)
-dec_emb = Embedding(num_decoder_tokens, latent_dim, name='Decoder-Word-Embedding', mask_zero=False)(decoder_inputs)
+dec_emb = Embedding(num_decoder_tokens,
+                    latent_dim,
+                    name='Decoder-Word-Embedding',
+                    mask_zero=False)(decoder_inputs)
 dec_bn = BatchNormalization(name='Decoder-Batchnorm-1')(dec_emb)
 
 # Set up the decoder, using `decoder_state_input` as initial state.
@@ -71,21 +76,24 @@ decoder_outputs = decoder_dense(x)
 
 seq2seq_Model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
 
-seq2seq_Model.compile(optimizer=optimizers.Nadam(lr=learning_rate), loss='sparse_categorical_crossentropy')
+seq2seq_Model.compile(optimizer=optimizers.Nadam(lr=learning_rate),
+                      loss='sparse_categorical_crossentropy')
 
 seq2seq_Model.summary()
 
 script_name_base = 'tutorial_seq2seq'
 csv_logger = CSVLogger('{:}.log'.format(script_name_base))
 
-model_checkpoint = ModelCheckpoint('{:}.epoch{{epoch:02d}}-val{{val_loss:.5f}}.hdf5'.format(script_name_base),
-                                   save_best_only=True)
+model_checkpoint = ModelCheckpoint(
+    '{:}.epoch{{epoch:02d}}-val{{val_loss:.5f}}.hdf5'.format(script_name_base), save_best_only=True)
 
 batch_size = 1200
 epochs = 7
-history = seq2seq_Model.fit([encoder_input_data, decoder_input_data], np.expand_dims(decoder_target_data, -1),
-                            batch_size=batch_size,
-                            epochs=epochs,
-                            validation_split=0.12, callbacks=[csv_logger, model_checkpoint])
+history = seq2seq_Model.fit([encoder_input_data, decoder_input_data],
+                            np.expand_dims(decoder_target_data, -1),
+                            batch_size=batch_size,
+                            epochs=epochs,
+                            validation_split=0.12,
+                            callbacks=[csv_logger, model_checkpoint])
 
 #############
 # Save model.
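# --- Editor's note: the hunk above ends where the "Save model" section begins.
# A plausible continuation (hypothetical; the actual file may differ) is simply:
seq2seq_Model.save('seq2seq_model_tutorial.h5')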