Add .pylintrc (#61)

* Add .pylintrc

* Resolve lint complaints in agents/trainer/task.py

* Resolve lint complaints in the Flask app.py

* Resolve linting issues

Remove duplicate seq2seq_utils.py from workflow/workspace/src

* Use Python 3.5.2 with pylint to match Prow

Put the pybullet import back into agents/trainer/task.py with a pylint ignore statement.
Use main(_) to ensure it works with tf.app.run.
Commit 41372c9314 (parent 1d6946ead8)
Michelle Casbon, 2018-03-29 08:25:02 -07:00, committed by k8s-ci-robot
12 changed files with 841 additions and 821 deletions
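
A note on the last commit-message point: with tf.app.run, TensorFlow parses flags and then invokes main(argv), so main must accept one argument; naming it _ marks it as deliberately unused and keeps pylint quiet. A minimal sketch of the pattern (TF 1.x API; the flag below is illustrative, not one of the trainer's real flags):

import tensorflow as tf

FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string('logdir', '/tmp/logs', 'Where to write logs.')  # illustrative flag

def main(_):  # tf.app.run passes argv here; "_" signals it is intentionally unused
  tf.logging.set_verbosity(tf.logging.INFO)
  tf.logging.info('Logging to %s', FLAGS.logdir)

if __name__ == '__main__':
  tf.app.run()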

.pylintrc (new file, 399 lines)
@@ -0,0 +1,399 @@
[MASTER]
# Specify a configuration file.
#rcfile=
# Python code to execute, usually for sys.path manipulation such as
# pygtk.require().
#init-hook=
# Add files or directories to the blacklist. They should be base names, not
# paths.
ignore=third_party
# Add files or directories matching the regex patterns to the blacklist. The
# regex matches against base names, not paths.
ignore-patterns=
# Pickle collected data for later comparisons.
persistent=no
# List of plugins (as comma separated values of python modules names) to load,
# usually to register additional checkers.
load-plugins=
# Use multiple processes to speed up Pylint.
jobs=4
# Allow loading of arbitrary C extensions. Extensions are imported into the
# active Python interpreter and may run arbitrary code.
unsafe-load-any-extension=no
# A comma-separated list of package or module names from where C extensions may
# be loaded. Extensions are loaded into the active Python interpreter and may
# run arbitrary code.
extension-pkg-whitelist=
[MESSAGES CONTROL]
# Only show warnings with the listed confidence levels. Leave empty to show
# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED
confidence=
# Enable the message, report, category or checker with the given id(s). You can
# either give multiple identifiers separated by comma (,) or put this option
# multiple times (only on the command line, not in the configuration file where
# it should appear only once). See also the "--disable" option for examples.
#enable=
# Disable the message, report, category or checker with the given id(s). You
# can either give multiple identifiers separated by comma (,) or put this
# option multiple times (only on the command line, not in the configuration
# file where it should appear only once). You can also use "--disable=all" to
# disable everything first and then reenable specific checks. For example, if
# you want to run only the similarities checker, you can use "--disable=all
# --enable=similarities". If you want to run only the classes checker, but have
# no Warning level messages displayed, use "--disable=all --enable=classes
# --disable=W"
disable=import-star-module-level,old-octal-literal,oct-method,print-statement,unpacking-in-except,parameter-unpacking,backtick,old-raise-syntax,old-ne-operator,long-suffix,dict-view-method,dict-iter-method,metaclass-assignment,next-method-called,raising-string,indexing-exception,raw_input-builtin,long-builtin,file-builtin,execfile-builtin,coerce-builtin,cmp-builtin,buffer-builtin,basestring-builtin,apply-builtin,filter-builtin-not-iterating,using-cmp-argument,useless-suppression,range-builtin-not-iterating,suppressed-message,missing-docstring,no-absolute-import,old-division,cmp-method,reload-builtin,zip-builtin-not-iterating,intern-builtin,unichr-builtin,reduce-builtin,standarderror-builtin,unicode-builtin,xrange-builtin,coerce-method,delslice-method,getslice-method,setslice-method,input-builtin,round-builtin,hex-method,nonzero-method,map-builtin-not-iterating,relative-import,invalid-name,bad-continuation,no-member,locally-disabled,fixme,import-error,too-many-locals
[REPORTS]
# Set the output format. Available formats are text, parseable, colorized, msvs
# (visual studio) and html. You can also give a reporter class, e.g.
# mypackage.mymodule.MyReporterClass.
output-format=text
# Put messages in a separate file for each module / package specified on the
# command line instead of printing them on stdout. Reports (if any) will be
# written in a file name "pylint_global.[txt|html]". This option is deprecated
# and it will be removed in Pylint 2.0.
files-output=no
# Tells whether to display a full report or only the messages
reports=no
# Python expression which should return a note less than 10 (10 is the highest
# note). You have access to the variables error, warning, refactor, convention,
# and statement, which respectively contain the number of messages in each
# category and the total number of statements analyzed. This is used by the
# global evaluation report (RP0004).
evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
# Template used to display messages. This is a python new-style format string
# used to format the message information. See doc for all details
#msg-template=
[BASIC]
# Good variable names which should always be accepted, separated by a comma
good-names=i,j,k,ex,Run,_
# Bad variable names which should always be refused, separated by a comma
bad-names=foo,bar,baz,toto,tutu,tata
# Colon-delimited sets of names that determine each other's naming style when
# the name regexes allow several styles.
name-group=
# Include a hint for the correct naming format with invalid-name
include-naming-hint=no
# List of decorators that produce properties, such as abc.abstractproperty. Add
# to this list to register other decorators that produce valid properties.
property-classes=abc.abstractproperty
# Regular expression matching correct function names
function-rgx=[a-z_][a-z0-9_]{2,30}$
# Naming hint for function names
function-name-hint=[a-z_][a-z0-9_]{2,30}$
# Regular expression matching correct variable names
variable-rgx=[a-z_][a-z0-9_]{2,30}$
# Naming hint for variable names
variable-name-hint=[a-z_][a-z0-9_]{2,30}$
# Regular expression matching correct constant names
const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$
# Naming hint for constant names
const-name-hint=(([A-Z_][A-Z0-9_]*)|(__.*__))$
# Regular expression matching correct attribute names
attr-rgx=[a-z_][a-z0-9_]{2,30}$
# Naming hint for attribute names
attr-name-hint=[a-z_][a-z0-9_]{2,30}$
# Regular expression matching correct argument names
argument-rgx=[a-z_][a-z0-9_]{2,30}$
# Naming hint for argument names
argument-name-hint=[a-z_][a-z0-9_]{2,30}$
# Regular expression matching correct class attribute names
class-attribute-rgx=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$
# Naming hint for class attribute names
class-attribute-name-hint=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$
# Regular expression matching correct inline iteration names
inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$
# Naming hint for inline iteration names
inlinevar-name-hint=[A-Za-z_][A-Za-z0-9_]*$
# Regular expression matching correct class names
class-rgx=[A-Z_][a-zA-Z0-9]+$
# Naming hint for class names
class-name-hint=[A-Z_][a-zA-Z0-9]+$
# Regular expression matching correct module names
module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$
# Naming hint for module names
module-name-hint=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$
# Regular expression matching correct method names
method-rgx=[a-z_][a-z0-9_]{2,30}$
# Naming hint for method names
method-name-hint=[a-z_][a-z0-9_]{2,30}$
# Regular expression which should only match function or class names that do
# not require a docstring.
no-docstring-rgx=^_
# Minimum line length for functions/classes that require docstrings, shorter
# ones are exempt.
docstring-min-length=-1
[ELIF]
# Maximum number of nested blocks for function / method body
max-nested-blocks=5
[TYPECHECK]
# Tells whether missing members accessed in mixin class should be ignored. A
# mixin class is detected if its name ends with "mixin" (case insensitive).
ignore-mixin-members=yes
# List of module names for which member attributes should not be checked
# (useful for modules/projects where namespaces are manipulated during runtime
# and thus existing member attributes cannot be deduced by static analysis).
# It supports qualified module names, as well as Unix pattern matching.
ignored-modules=
# List of class names for which member attributes should not be checked (useful
# for classes with dynamically set attributes). This supports the use of
# qualified names.
ignored-classes=optparse.Values,thread._local,_thread._local
# List of members which are set dynamically and missed by pylint inference
# system, and so shouldn't trigger E1101 when accessed. Python regular
# expressions are accepted.
generated-members=
# List of decorators that produce context managers, such as
# contextlib.contextmanager. Add to this list to register other decorators that
# produce valid context managers.
contextmanager-decorators=contextlib.contextmanager
[FORMAT]
# Maximum number of characters on a single line.
max-line-length=100
# Regexp for a line that is allowed to be longer than the limit.
ignore-long-lines=^\s*(# )?<?https?://\S+>?$
# Allow the body of an if to be on the same line as the test if there is no
# else.
single-line-if-stmt=no
# List of optional constructs for which whitespace checking is disabled. `dict-
# separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}.
# `trailing-comma` allows a space between comma and closing bracket: (a, ).
# `empty-line` allows space-only lines.
no-space-check=trailing-comma,dict-separator
# Maximum number of lines in a module
max-module-lines=1000
# String used as indentation unit. This is usually "    " (4 spaces) or "\t" (1
# tab).
# Use 2 spaces, consistent with TensorFlow style.
indent-string='  '
# Number of spaces of indent required inside a hanging or continued line.
indent-after-paren=4
# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
expected-line-ending-format=
[MISCELLANEOUS]
# List of note tags to take in consideration, separated by a comma.
notes=FIXME,XXX,TODO
[VARIABLES]
# Tells whether we should check for unused import in __init__ files.
init-import=no
# A regular expression matching the name of dummy variables (i.e. expectedly
# not used).
dummy-variables-rgx=(_+[a-zA-Z0-9]*?$)|dummy
# List of additional names supposed to be defined in builtins. Remember that
# you should avoid defining new builtins when possible.
additional-builtins=
# List of strings which can identify a callback function by name. A callback
# name must start or end with one of those strings.
callbacks=cb_,_cb
# List of qualified module names which can have objects that can redefine
# builtins.
redefining-builtins-modules=six.moves,future.builtins
[LOGGING]
# Logging modules to check that the string format arguments are in logging
# function parameter format
logging-modules=logging
[SIMILARITIES]
# Minimum lines number of a similarity.
min-similarity-lines=4
# Ignore comments when computing similarities.
ignore-comments=yes
# Ignore docstrings when computing similarities.
ignore-docstrings=yes
# Ignore imports when computing similarities.
ignore-imports=no
[SPELLING]
# Spelling dictionary name. Available dictionaries: none. To make it work,
# install the python-enchant package.
spelling-dict=
# List of comma separated words that should not be checked.
spelling-ignore-words=
# A path to a file that contains private dictionary; one word per line.
spelling-private-dict-file=
# Tells whether to store unknown words to indicated private dictionary in
# --spelling-private-dict-file option instead of raising a message.
spelling-store-unknown-words=no
[IMPORTS]
# Deprecated modules which should not be used, separated by a comma
deprecated-modules=regsub,TERMIOS,Bastion,rexec
# Create a graph of all (i.e. internal and external) dependencies in the
# given file (report RP0402 must not be disabled)
import-graph=
# Create a graph of external dependencies in the given file (report RP0402 must
# not be disabled)
ext-import-graph=
# Create a graph of internal dependencies in the given file (report RP0402 must
# not be disabled)
int-import-graph=
# Force import order to recognize a module as part of the standard
# compatibility libraries.
known-standard-library=
# Force import order to recognize a module as part of a third party library.
known-third-party=enchant
# Analyse import fallback blocks. This can be used to support both Python 2 and
# 3 compatible code, which means that the block might have code that exists
# only in one or another interpreter, leading to false positives when analysed.
analyse-fallback-blocks=no
[DESIGN]
# Maximum number of arguments for function / method
max-args=7
# Argument names that match this expression will be ignored. Default to name
# with leading underscore
ignored-argument-names=_.*
# Maximum number of locals for function / method body
max-locals=15
# Maximum number of return / yield for function / method body
max-returns=6
# Maximum number of branch for function / method body
max-branches=12
# Maximum number of statements in function / method body
max-statements=50
# Maximum number of parents for a class (see R0901).
max-parents=7
# Maximum number of attributes for a class (see R0902).
max-attributes=7
# Minimum number of public methods for a class (see R0903).
min-public-methods=0
# Maximum number of public methods for a class (see R0904).
max-public-methods=20
# Maximum number of boolean expressions in a if statement
max-bool-expr=5
[CLASSES]
# List of method names used to declare (i.e. assign) instance attributes.
defining-attr-methods=__init__,__new__,setUp
# List of valid names for the first argument in a class method.
valid-classmethod-first-arg=cls
# List of valid names for the first argument in a metaclass class method.
valid-metaclass-classmethod-first-arg=mcs
# List of member names, which should be excluded from the protected access
# warning.
exclude-protected=_asdict,_fields,_replace,_source,_make
[EXCEPTIONS]
# Exceptions that will emit a warning when being caught. Defaults to
# "Exception"
overgeneral-exceptions=Exception
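
The commit message above ties this rcfile to the Prow lint job; as a sketch, the same checks can be run locally against one of the files in this commit. The CLI form is `pylint --rcfile=.pylintrc <files>`; Run() is pylint's equivalent Python entry point and, by default, exits with pylint's status code:

from pylint.lint import Run

Run(['--rcfile=.pylintrc', 'agents/trainer/task.py'])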

agents/trainer/task.py

@@ -12,18 +12,18 @@
 """Provides an entrypoint for the training task."""
+#pylint: disable=unused-import
 from __future__ import absolute_import, division, print_function
+import argparse
 import datetime
 import logging
 import os
 import pprint
 import uuid
-import pip
-import tensorflow as tf
 from google.cloud import storage
+import tensorflow as tf
 import agents
 import pybullet_envs  # To make AntBulletEnv-v0 available.
@@ -113,39 +113,39 @@ def hparams_base():
   """Base hparams tf/Agents PPO """
   # General
-  algorithm = agents.ppo.PPOAlgorithm
-  num_agents = 30
-  eval_episodes = 30
-  use_gpu = False
+  # algorithm = agents.ppo.PPOAlgorithm
+  # num_agents = 30
+  # eval_episodes = 30
+  # use_gpu = False
   # Environment
-  env = 'KukaBulletEnv-v0'
-  normalize_ranges = True
-  max_length = 1000
+  # env = 'KukaBulletEnv-v0'
+  # normalize_ranges = True
+  # max_length = 1000
   # Network
-  network = agents.scripts.networks.feed_forward_gaussian
-  weight_summaries = dict(
-      all=r'.*', policy=r'.*/policy/.*', value=r'.*/value/.*')
-  policy_layers = 200, 100
-  value_layers = 200, 100
-  init_output_factor = 0.1
-  init_logstd = -1
-  init_std = 0.35
+  # network = agents.scripts.networks.feed_forward_gaussian
+  # weight_summaries = dict(
+  #     all=r'.*', policy=r'.*/policy/.*', value=r'.*/value/.*')
+  # policy_layers = 200, 100
+  # value_layers = 200, 100
+  # init_output_factor = 0.1
+  # init_logstd = -1
+  # init_std = 0.35
   # Optimization
-  update_every = 60
-  update_epochs = 25
-  optimizer = tf.train.AdamOptimizer
-  learning_rate = 1e-4
-  steps = 3e7  # 30M
+  # update_every = 60
+  # update_epochs = 25
+  # optimizer = tf.train.AdamOptimizer
+  # learning_rate = 1e-4
+  # steps = 3e7  # 30M
   # Losses
-  discount = 0.995
-  kl_target = 1e-2
-  kl_cutoff_factor = 2
-  kl_cutoff_coef = 1000
-  kl_init_penalty = 1
+  # discount = 0.995
+  # kl_target = 1e-2
+  # kl_cutoff_factor = 2
+  # kl_cutoff_coef = 1000
+  # kl_init_penalty = 1
   return locals()
@@ -158,9 +158,9 @@ def _object_import_from_string(name):
   return mod

-def _realize_import_attrs(d, filter):
+def _realize_import_attrs(d, hparam_filter):
   for k, v in d.items():
-    if k in filter:
+    if k in hparam_filter:
       imported = _object_import_from_string(v)
       # TODO: Provide an appropriately informative error if the import fails
       # except ImportError as e:
@@ -170,7 +170,7 @@ def _realize_import_attrs(d, filter):
   return d

-def _get_agents_configuration(hparam_set_name, log_dir=None, is_chief=False):
+def _get_agents_configuration(log_dir=None):
   """Load hyperparameter config."""
   try:
     # Try to resume training.
@@ -243,23 +243,20 @@ def gcs_upload(local_dir, gcs_out_dir):
       blob.upload_from_filename(local_file_path)

-def main(unused_argv):
+def main(_):
   """Run training."""
   tf.logging.set_verbosity(tf.logging.INFO)
   if FLAGS.debug:
     tf.logging.set_verbosity(tf.logging.DEBUG)
-  run_config = tf.contrib.learn.RunConfig()
   log_dir = FLAGS.logdir
-  agents_config = _get_agents_configuration(
-      FLAGS.hparam_set_id, log_dir, run_config.is_chief)
+  agents_config = _get_agents_configuration(log_dir)
   if FLAGS.run_mode == 'train':
     for score in agents.scripts.train.train(agents_config, env_processes=True):
-      logging.info('Score {}.'.format(score))
+      logging.info('Score %s.', score)
   if FLAGS.run_mode == 'render':
     now = datetime.datetime.now()
     subdir = now.strftime("%m%d-%H%M") + "-" + uuid.uuid4().hex[0:4]
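
The logging change in the last hunk is the fix pylint's logging checks push for (the rcfile above sets logging-modules=logging): pass arguments to the logging call instead of pre-formatting the string, so interpolation only happens if the record is actually emitted. A minimal before/after:

import logging

score = 0.97
logging.info('Score {}.'.format(score))  # flagged: the string is formatted eagerly
logging.info('Score %s.', score)         # preferred: lazy %-style interpolation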

app.py (Flask frontend)

@@ -2,38 +2,47 @@
 Simple app that parses predictions from a trained model and displays them.
 """
-from flask import Flask, json, render_template, request
 import requests
+from flask import Flask, json, render_template, request

-app = Flask(__name__)
+APP = Flask(__name__)

-@app.route("/")
+@APP.route("/")
 def index():
+  """Default route.
+
+  Placeholder, does nothing.
+  """
   return render_template("index.html")

-@app.route("/summary", methods=['GET', 'POST'])
+@APP.route("/summary", methods=['GET', 'POST'])
 def summary():
+  """Main prediction route.
+
+  Provides a machine-generated summary of the given text. Sends a request to a live
+  model trained on GitHub issues.
+  """
   if request.method == 'POST':
     issue_text = request.form["issue_text"]
     url = "http://ambassador:80/seldon/issue-summarization/api/v0.1/predictions"
-    headers = { 'content-type': 'application/json' }
+    headers = {'content-type': 'application/json'}
     json_data = {
       "data" : {
-        "ndarray" : [[ issue_text ]]
+        "ndarray" : [[issue_text]]
       }
     }
-    r = requests.post(url = url,
-                      headers = headers,
-                      data = json.dumps(json_data))
-    rjs = json.loads(r.text)
-    summary = rjs["data"]["ndarray"][0][0]
-    return render_template("summary.html",
-                           issue_text = issue_text,
-                           summary = summary)
+    response = requests.post(url=url,
+                             headers=headers,
+                             data=json.dumps(json_data))
+    response_json = json.loads(response.text)
+    issue_summary = response_json["data"]["ndarray"][0][0]
+    return render_template("issue_summary.html",
+                           issue_text=issue_text,
+                           issue_summary=issue_summary)
+  return ('', 204)

 if __name__ == '__main__':
-  app.run(debug = True, host = '0.0.0.0', port = 80)
+  APP.run(debug=True, host='0.0.0.0', port=80)
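
A quick, hypothetical way to exercise both routes above without a live Seldon endpoint, using Flask's built-in test client (assumes the module is importable as app and the templates are present):

from app import APP

client = APP.test_client()
print(client.get('/').status_code)         # 200: renders index.html
print(client.get('/summary').status_code)  # 204: the new empty response for non-POST requests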

IssueSummarization.py (old version, deleted)

@@ -1,22 +0,0 @@
from __future__ import print_function

import dill as dpickle
import numpy as np
from keras.models import load_model
from seq2seq_utils import Seq2Seq_Inference

class IssueSummarization(object):

  def __init__(self):
    with open('body_pp.dpkl', 'rb') as f:
      body_pp = dpickle.load(f)
    with open('title_pp.dpkl', 'rb') as f:
      title_pp = dpickle.load(f)
    self.model = Seq2Seq_Inference(encoder_preprocessor=body_pp,
                                   decoder_preprocessor=title_pp,
                                   seq2seq_model=load_model('seq2seq_model_tutorial.h5'))

  def predict(self, X, feature_names):
    return np.asarray([[self.model.generate_issue_title(body[0])[1]] for body in X])

IssueSummarization.py (new version)

@@ -0,0 +1,25 @@
"""Generates predictions using a stored model.

Uses trained model files to generate a prediction.
"""
from __future__ import print_function

import numpy as np
import dill as dpickle
from keras.models import load_model
from seq2seq_utils import Seq2Seq_Inference

class IssueSummarization(object):

  def __init__(self):
    with open('body_pp.dpkl', 'rb') as body_file:
      body_pp = dpickle.load(body_file)
    with open('title_pp.dpkl', 'rb') as title_file:
      title_pp = dpickle.load(title_file)
    self.model = Seq2Seq_Inference(encoder_preprocessor=body_pp,
                                   decoder_preprocessor=title_pp,
                                   seq2seq_model=load_model('seq2seq_model_tutorial.h5'))

  def predict(self, input_text):
    return np.asarray([[self.model.generate_issue_title(body[0])[1]] for body in input_text])
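
A hypothetical smoke test for the rewritten class, assuming the module is saved as IssueSummarization.py and that body_pp.dpkl, title_pp.dpkl, and seq2seq_model_tutorial.h5 sit in the working directory:

from IssueSummarization import IssueSummarization

summarizer = IssueSummarization()
# predict takes a nested array of issue bodies and returns nested titles
print(summarizer.predict([['app crashes when clicking the save button']]))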

seq2seq_utils.py

@@ -1,429 +1,432 @@
+import logging
+import dill as dpickle
+import numpy as np
 from matplotlib import pyplot as plt
 import tensorflow as tf
+from IPython.display import SVG, display
 from keras import backend as K
 from keras.layers import Input
 from keras.models import Model
-from IPython.display import SVG, display
 from keras.utils.vis_utils import model_to_dot
-import logging
-import numpy as np
-import dill as dpickle
 from annoy import AnnoyIndex
 from tqdm import tqdm, tqdm_notebook
-from random import random
 from nltk.translate.bleu_score import corpus_bleu

 def load_text_processor(fname='title_pp.dpkl'):
   """
   Load preprocessors from disk.
   Parameters
   ----------
   fname: str
     file name of ktext.proccessor object
   Returns
   -------
   num_tokens : int
     size of vocabulary loaded into ktext.processor
   pp : ktext.processor
     the processor you are trying to load
   Typical Usage:
   -------------
   num_decoder_tokens, title_pp = load_text_processor(fname='title_pp.dpkl')
   num_encoder_tokens, body_pp = load_text_processor(fname='body_pp.dpkl')
   """
   # Load files from disk
   with open(fname, 'rb') as f:
     pp = dpickle.load(f)
   num_tokens = max(pp.id2token.keys()) + 1
-  print(f'Size of vocabulary for {fname}: {num_tokens:,}')
+  print('Size of vocabulary for {}: {}'.format(fname, num_tokens))
   return num_tokens, pp

 def load_decoder_inputs(decoder_np_vecs='train_title_vecs.npy'):
   """
   Load decoder inputs.
   Parameters
   ----------
   decoder_np_vecs : str
     filename of serialized numpy.array of decoder input (issue title)
   Returns
   -------
   decoder_input_data : numpy.array
     The data fed to the decoder as input during training for teacher forcing.
     This is the same as `decoder_np_vecs` except the last position.
   decoder_target_data : numpy.array
     The data that the decoder data is trained to generate (issue title).
     Calculated by sliding `decoder_np_vecs` one position forward.
   """
   vectorized_title = np.load(decoder_np_vecs)
   # For Decoder Input, you don't need the last word as that is only for prediction
   # when we are training using Teacher Forcing.
   decoder_input_data = vectorized_title[:, :-1]
   # Decoder Target Data Is Ahead By 1 Time Step From Decoder Input Data (Teacher Forcing)
   decoder_target_data = vectorized_title[:, 1:]
-  print(f'Shape of decoder input: {decoder_input_data.shape}')
-  print(f'Shape of decoder target: {decoder_target_data.shape}')
+  print('Shape of decoder input: {}'.format(decoder_input_data.shape))
+  print('Shape of decoder target: {}'.format(decoder_target_data.shape))
   return decoder_input_data, decoder_target_data

 def load_encoder_inputs(encoder_np_vecs='train_body_vecs.npy'):
   """
   Load variables & data that are inputs to encoder.
   Parameters
   ----------
   encoder_np_vecs : str
     filename of serialized numpy.array of encoder input (issue title)
   Returns
   -------
   encoder_input_data : numpy.array
     The issue body
   doc_length : int
     The standard document length of the input for the encoder after padding
     the shape of this array will be (num_examples, doc_length)
   """
   vectorized_body = np.load(encoder_np_vecs)
   # Encoder input is simply the body of the issue text
   encoder_input_data = vectorized_body
   doc_length = encoder_input_data.shape[1]
-  print(f'Shape of encoder input: {encoder_input_data.shape}')
+  print('Shape of encoder input: {}'.format(encoder_input_data.shape))
   return encoder_input_data, doc_length

 def viz_model_architecture(model):
   """Visualize model architecture in Jupyter notebook."""
   display(SVG(model_to_dot(model).create(prog='dot', format='svg')))

 def free_gpu_mem():
   """Attempt to free gpu memory."""
   K.get_session().close()
   cfg = K.tf.ConfigProto()
   cfg.gpu_options.allow_growth = True
   K.set_session(K.tf.Session(config=cfg))

 def test_gpu():
   """Run a toy computation task in tensorflow to test GPU."""
   config = tf.ConfigProto()
   config.gpu_options.allow_growth = True
   session = tf.Session(config=config)
   hello = tf.constant('Hello, TensorFlow!')
   print(session.run(hello))

 def plot_model_training_history(history_object):
   """Plots model train vs. validation loss."""
   plt.title('model accuracy')
   plt.ylabel('accuracy')
   plt.xlabel('epoch')
   plt.plot(history_object.history['loss'])
   plt.plot(history_object.history['val_loss'])
   plt.legend(['train', 'test'], loc='upper left')
   plt.show()

 def extract_encoder_model(model):
   """
   Extract the encoder from the original Sequence to Sequence Model.
   Returns a keras model object that has one input (body of issue) and one
   output (encoding of issue, which is the last hidden state).
   Input:
   -----
   model: keras model object
   Returns:
   -----
   keras model object
   """
   encoder_model = model.get_layer('Encoder-Model')
   return encoder_model

 def extract_decoder_model(model):
   """
   Extract the decoder from the original model.
   Inputs:
   ------
   model: keras model object
   Returns:
   -------
   A Keras model object with the following inputs and outputs:
   Inputs of Keras Model That Is Returned:
   1: the embedding index for the last predicted word or the <Start> indicator
   2: the last hidden state, or in the case of the first word the hidden state from the encoder
   Outputs of Keras Model That Is Returned:
   1. Prediction (class probabilities) for the next word
   2. The hidden state of the decoder, to be fed back into the decoder at the next time step
   Implementation Notes:
   ----------------------
   Must extract relevant layers and reconstruct part of the computation graph
   to allow for different inputs as we are not going to use teacher forcing at
   inference time.
   """
   # the latent dimension is the same throughout the architecture so we are going to
   # cheat and grab the latent dimension of the embedding because that is the same as what is
   # output from the decoder
   latent_dim = model.get_layer('Decoder-Word-Embedding').output_shape[-1]
   # Reconstruct the input into the decoder
   decoder_inputs = model.get_layer('Decoder-Input').input
   dec_emb = model.get_layer('Decoder-Word-Embedding')(decoder_inputs)
   dec_bn = model.get_layer('Decoder-Batchnorm-1')(dec_emb)
   # Instead of setting the intial state from the encoder and forgetting about it, during inference
-  # we are not doing teacher forcing, so we will have to have a feedback loop from predictions back into
-  # the GRU, thus we define this input layer for the state so we can add this capability
+  # we are not doing teacher forcing, so we will have to have a feedback loop from predictions back
+  # into the GRU, thus we define this input layer for the state so we can add this capability
   gru_inference_state_input = Input(shape=(latent_dim,), name='hidden_state_input')
   # we need to reuse the weights that is why we are getting this
   # If you inspect the decoder GRU that we created for training, it will take as input
   # 2 tensors -> (1) is the embedding layer output for the teacher forcing
-  #  (which will now be the last step's prediction, and will be _start_ on the first time step)
-  #  (2) is the state, which we will initialize with the encoder on the first time step, but then
-  #  grab the state after the first prediction and feed that back in again.
+  #  (which will now be the last step's prediction, and will be _start_ on the
+  #  first time step)
+  #  (2) is the state, which we will initialize with the encoder on the first time step
+  #  but then grab the state after the first prediction and feed that back in again.
   gru_out, gru_state_out = model.get_layer('Decoder-GRU')([dec_bn, gru_inference_state_input])
   # Reconstruct dense layers
   dec_bn2 = model.get_layer('Decoder-Batchnorm-2')(gru_out)
   dense_out = model.get_layer('Final-Output-Dense')(dec_bn2)
   decoder_model = Model([decoder_inputs, gru_inference_state_input],
                         [dense_out, gru_state_out])
   return decoder_model

 class Seq2Seq_Inference(object):
+  # pylint: disable=too-many-instance-attributes

   def __init__(self,
                encoder_preprocessor,
                decoder_preprocessor,
                seq2seq_model):
+
     self.pp_body = encoder_preprocessor
     self.pp_title = decoder_preprocessor
     self.seq2seq_model = seq2seq_model
     self.encoder_model = extract_encoder_model(seq2seq_model)
     self.decoder_model = extract_decoder_model(seq2seq_model)
     self.default_max_len_title = self.pp_title.padding_maxlen
     self.nn = None
     self.rec_df = None

   def generate_issue_title(self,
                            raw_input_text,
                            max_len_title=None):
     """
     Use the seq2seq model to generate a title given the body of an issue.
     Inputs
     ------
     raw_input: str
       The body of the issue text as an input string
     max_len_title: int (optional)
       The maximum length of the title the model will generate
     """
     if max_len_title is None:
       max_len_title = self.default_max_len_title
     # get the encoder's features for the decoder
     raw_tokenized = self.pp_body.transform([raw_input_text])
     body_encoding = self.encoder_model.predict(raw_tokenized)
     # we want to save the encoder's embedding before its updated by decoder
     # because we can use that as an embedding for other tasks.
     original_body_encoding = body_encoding
     state_value = np.array(self.pp_title.token2id['_start_']).reshape(1, 1)
     decoded_sentence = []
     stop_condition = False
     while not stop_condition:
       preds, st = self.decoder_model.predict([state_value, body_encoding])
       # We are going to ignore indices 0 (padding) and indices 1 (unknown)
       # Argmax will return the integer index corresponding to the
       # prediction + 2 b/c we chopped off first two
       pred_idx = np.argmax(preds[:, :, 2:]) + 2
       # retrieve word from index prediction
       pred_word_str = self.pp_title.id2token[pred_idx]
       if pred_word_str == '_end_' or len(decoded_sentence) >= max_len_title:
         stop_condition = True
         break
       decoded_sentence.append(pred_word_str)
       # update the decoder for the next word
       body_encoding = st
       state_value = np.array(pred_idx).reshape(1, 1)
     return original_body_encoding, ' '.join(decoded_sentence)

   def print_example(self,
                     i,
                     body_text,
                     title_text,
                     url,
                     threshold):
     """
     Prints an example of the model's prediction for manual inspection.
     """
     if i:
       print('\n\n==============================================')
-      print(f'============== Example # {i} =================\n')
+      print('============== Example # {} =================\n'.format(i))
     if url:
       print(url)
-    print(f"Issue Body:\n {body_text} \n")
+    print("Issue Body:\n {} \n".format(body_text))
     if title_text:
-      print(f"Original Title:\n {title_text}")
+      print("Original Title:\n {}".format(title_text))
     emb, gen_title = self.generate_issue_title(body_text)
-    print(f"\n****** Machine Generated Title (Prediction) ******:\n {gen_title}")
+    print("\n****** Machine Generated Title (Prediction) ******:\n {}".format(gen_title))
     if self.nn:
       # return neighbors and distances
       n, d = self.nn.get_nns_by_vector(emb.flatten(), n=4,
                                        include_distances=True)
       neighbors = n[1:]
       dist = d[1:]
       if min(dist) <= threshold:
         cols = ['issue_url', 'issue_title', 'body']
         dfcopy = self.rec_df.iloc[neighbors][cols].copy(deep=True)
         dfcopy['dist'] = dist
-        similar_issues_df = dfcopy.query(f'dist <= {threshold}')
+        similar_issues_df = dfcopy.query('dist <= {}'.format(threshold))
         print("\n**** Similar Issues (using encoder embedding) ****:\n")
         display(similar_issues_df)

   def demo_model_predictions(self,
                              n,
                              issue_df,
                              threshold=1):
     """
     Pick n random Issues and display predictions.
     Input:
     ------
     n : int
       Number of issues to display from issue_df
     issue_df : pandas DataFrame
       DataFrame that contains two columns: `body` and `issue_title`.
     threshold : float
       distance threshold for recommendation of similar issues.
     Returns:
     --------
     None
       Prints the original issue body and the model's prediction.
     """
     # Extract body and title from DF
     body_text = issue_df.body.tolist()
     title_text = issue_df.issue_title.tolist()
     url = issue_df.issue_url.tolist()
     demo_list = np.random.randint(low=1, high=len(body_text), size=n)
     for i in demo_list:
       self.print_example(i,
                          body_text=body_text[i],
                          title_text=title_text[i],
                          url=url[i],
                          threshold=threshold)

   def prepare_recommender(self, vectorized_array, original_df):
     """
     Use the annoy library to build recommender
     Parameters
     ----------
     vectorized_array : List[List[int]]
       This is the list of list of integers that represents your corpus
       that is fed into the seq2seq model for training.
     original_df : pandas.DataFrame
       This is the original dataframe that has the columns
       ['issue_url', 'issue_title', 'body']
     Returns
     -------
     annoy.AnnoyIndex object (see https://github.com/spotify/annoy)
     """
     self.rec_df = original_df
     emb = self.encoder_model.predict(x=vectorized_array,
                                      batch_size=vectorized_array.shape[0]//200)
     f = emb.shape[1]
     self.nn = AnnoyIndex(f)
     logging.warning('Adding embeddings')
     for i in tqdm(range(len(emb))):
       self.nn.add_item(i, emb[i])
     logging.warning('Building trees for similarity lookup.')
     self.nn.build(50)
     return self.nn

   def set_recsys_data(self, original_df):
     self.rec_df = original_df

   def set_recsys_annoyobj(self, annoyobj):
     self.nn = annoyobj

   def evaluate_model(self, holdout_bodies, holdout_titles):
     """
     Method for calculating BLEU Score.
     Parameters
     ----------
     holdout_bodies : List[str]
       These are the issue bodies that we want to summarize
     holdout_titles : List[str]
       This is the ground truth we are trying to predict --> issue titles
     Returns
     -------
     bleu : float
       The BLEU Score
     """
     actual, predicted = list(), list()
     assert len(holdout_bodies) == len(holdout_titles)
     num_examples = len(holdout_bodies)
     logging.warning('Generating predictions.')
     # step over the whole set TODO: parallelize this
     for i in tqdm_notebook(range(num_examples)):
       _, yhat = self.generate_issue_title(holdout_bodies[i])
       actual.append(self.pp_title.process_text([holdout_titles[i]])[0])
       predicted.append(self.pp_title.process_text([yhat])[0])
     # calculate BLEU score
     logging.warning('Calculating BLEU.')
     #must be careful with nltk api for corpus_bleu!,
     # expects List[List[List[str]]] for ground truth, using List[List[str]] will give you
     # erroneous results.
     bleu = corpus_bleu([[a] for a in actual], predicted)
     return bleu
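
The f-string to str.format rewrites above are the Python 3.5.2 compatibility piece of this commit: f-strings are a 3.6+ feature, so the old lines would not even parse under the interpreter Prow uses. For example:

fname, num_tokens = 'title_pp.dpkl', 4500
# print(f'Size of vocabulary for {fname}: {num_tokens:,}')  # SyntaxError on Python 3.5
print('Size of vocabulary for {}: {}'.format(fname, num_tokens))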

(prediction script importing seq2seq_utils)

@@ -1,8 +1,6 @@
 import argparse
 import keras
 import pandas as pd
-from seq2seq_utils import load_decoder_inputs
-from seq2seq_utils import load_encoder_inputs
 from seq2seq_utils import load_text_processor
 from seq2seq_utils import Seq2Seq_Inference
@@ -29,5 +27,5 @@ seq2seq_inf = Seq2Seq_Inference(encoder_preprocessor=body_pp,
                                 decoder_preprocessor=title_pp,
                                 seq2seq_model=seq2seq_Model)

 # Output predictions for n random rows in the test set.
 seq2seq_inf.demo_model_predictions(n=args.input_prediction_count, issue_df=testdf)

(preprocessing script)

@@ -1,7 +1,7 @@
 import argparse
 import dill as dpickle
-from ktext.preprocess import processor
 import numpy as np
+from ktext.preprocess import processor
 import pandas as pd

 # Parsing flags.
@@ -30,7 +30,7 @@ print('Example body after pre-processing:', train_body_vecs[0])
 # Instantiate a text processor for the titles, with some different parameters.
 title_pp = processor(append_indicators=True, keep_n=4500,
-                     padding_maxlen=12, padding ='post')
+                     padding_maxlen=12, padding='post')

 # process the title data
 train_title_vecs = title_pp.fit_transform(train_title_raw)
@@ -40,10 +40,10 @@ print('Example title after pre-processing:', train_title_vecs[0])
 # Save the preprocessor.
 with open(args.output_body_preprocessor_dpkl, 'wb') as f:
   dpickle.dump(body_pp, f)

 with open(args.output_title_preprocessor_dpkl, 'wb') as f:
   dpickle.dump(title_pp, f)

 # Save the processed data.
 np.save(args.output_train_title_vecs_npy, train_title_vecs)
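
For context, a sketch of what the title preprocessor above produces, with a hypothetical input title (ktext fits a vocabulary, converts each title to token ids padded/truncated to padding_maxlen, and with append_indicators=True adds _start_/_end_ markers):

from ktext.preprocess import processor

title_pp = processor(append_indicators=True, keep_n=4500,
                     padding_maxlen=12, padding='post')
train_title_vecs = title_pp.fit_transform(['fix crash when saving a file'])
print(train_title_vecs.shape)  # expected: (1, 12) - one row of token ids, post-padded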

(train/test split script)

@@ -1,6 +1,4 @@
 import argparse
-import glob
-import logging
 import pandas as pd
 from sklearn.model_selection import train_test_split
@@ -20,8 +18,8 @@ traindf, testdf = train_test_split(pd.read_csv(args.input_csv).sample(n=args.sam
                                    test_size=.10)

 # Print stats about the shape of the data.
-print(f'Train: {traindf.shape[0]:,} rows {traindf.shape[1]:,} columns')
-print(f'Test: {testdf.shape[0]:,} rows {testdf.shape[1]:,} columns')
+print('Train: {:,} rows {:,} columns'.format(traindf.shape[0], traindf.shape[1]))
+print('Test: {:,} rows {:,} columns'.format(testdf.shape[0], testdf.shape[1]))

 # Store output as CSV.
 traindf.to_csv(args.output_traindf_csv)

(another script importing seq2seq_utils)

@@ -1,8 +1,6 @@
 import argparse
 import keras
 import pandas as pd
-from seq2seq_utils import load_decoder_inputs
-from seq2seq_utils import load_encoder_inputs
 from seq2seq_utils import load_text_processor
 from seq2seq_utils import Seq2Seq_Inference

seq2seq_utils.py (duplicate removed from workflow/workspace/src)

@ -1,393 +0,0 @@
from matplotlib import pyplot as plt
import tensorflow as tf
from keras import backend as K
from keras.layers import Input
from keras.models import Model
from IPython.display import SVG, display
from keras.utils.vis_utils import model_to_dot
import logging
import numpy as np
import dill as dpickle
from annoy import AnnoyIndex
from tqdm import tqdm, tqdm_notebook
from random import random
from nltk.translate.bleu_score import corpus_bleu
def load_text_processor(fname='title_pp.dpkl'):
"""
Load preprocessors from disk.
Parameters
----------
fname: str
file name of ktext.proccessor object
Returns
-------
num_tokens : int
size of vocabulary loaded into ktext.processor
pp : ktext.processor
the processor you are trying to load
Typical Usage:
-------------
num_decoder_tokens, title_pp = load_text_processor(fname='title_pp.dpkl')
num_encoder_tokens, body_pp = load_text_processor(fname='body_pp.dpkl')
"""
# Load files from disk
with open(fname, 'rb') as f:
pp = dpickle.load(f)
num_tokens = max(pp.id2token.keys()) + 1
print(f'Size of vocabulary for {fname}: {num_tokens:,}')
return num_tokens, pp
def load_decoder_inputs(decoder_np_vecs='train_title_vecs.npy'):
"""
Load decoder inputs.
Parameters
----------
decoder_np_vecs : str
filename of serialized numpy.array of decoder input (issue title)
Returns
-------
decoder_input_data : numpy.array
The data fed to the decoder as input during training for teacher forcing.
This is the same as `decoder_np_vecs` except the last position.
decoder_target_data : numpy.array
The data that the decoder data is trained to generate (issue title).
Calculated by sliding `decoder_np_vecs` one position forward.
"""
vectorized_title = np.load(decoder_np_vecs)
# For Decoder Input, you don't need the last word as that is only for prediction
# when we are training using Teacher Forcing.
decoder_input_data = vectorized_title[:, :-1]
# Decoder Target Data Is Ahead By 1 Time Step From Decoder Input Data (Teacher Forcing)
decoder_target_data = vectorized_title[:, 1:]
print(f'Shape of decoder input: {decoder_input_data.shape}')
print(f'Shape of decoder target: {decoder_target_data.shape}')
return decoder_input_data, decoder_target_data
def load_encoder_inputs(encoder_np_vecs='train_body_vecs.npy'):
"""
Load variables & data that are inputs to encoder.
Parameters
----------
encoder_np_vecs : str
filename of serialized numpy.array of encoder input (issue title)
Returns
-------
encoder_input_data : numpy.array
The issue body
doc_length : int
The standard document length of the input for the encoder after padding
the shape of this array will be (num_examples, doc_length)
"""
vectorized_body = np.load(encoder_np_vecs)
# Encoder input is simply the body of the issue text
encoder_input_data = vectorized_body
doc_length = encoder_input_data.shape[1]
print(f'Shape of encoder input: {encoder_input_data.shape}')
return encoder_input_data, doc_length
def viz_model_architecture(model):
"""Visualize model architecture in Jupyter notebook."""
display(SVG(model_to_dot(model).create(prog='dot', format='svg')))
def free_gpu_mem():
"""Attempt to free gpu memory."""
K.get_session().close()
cfg = K.tf.ConfigProto()
cfg.gpu_options.allow_growth = True
K.set_session(K.tf.Session(config=cfg))
def test_gpu():
"""Run a toy computation task in tensorflow to test GPU."""
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.Session(config=config)
hello = tf.constant('Hello, TensorFlow!')
print(session.run(hello))
def plot_model_training_history(history_object):
"""Plots model train vs. validation loss."""
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.plot(history_object.history['loss'])
plt.plot(history_object.history['val_loss'])
plt.legend(['train', 'test'], loc='upper left')
plt.show()
def extract_encoder_model(model):
"""
Extract the encoder from the original Sequence to Sequence Model.
Returns a keras model object that has one input (body of issue) and one
output (encoding of issue, which is the last hidden state).
Input:
-----
model: keras model object
Returns:
-----
keras model object
"""
encoder_model = model.get_layer('Encoder-Model')
return encoder_model
def extract_decoder_model(model):
"""
Extract the decoder from the original model.
Inputs:
------
model: keras model object
Returns:
-------
A Keras model object with the following inputs and outputs:
Inputs of Keras Model That Is Returned:
1: the embedding index for the last predicted word or the <Start> indicator
2: the last hidden state, or in the case of the first word the hidden state from the encoder
Outputs of Keras Model That Is Returned:
1. Prediction (class probabilities) for the next word
2. The hidden state of the decoder, to be fed back into the decoder at the next time step
Implementation Notes:
----------------------
Must extract relevant layers and reconstruct part of the computation graph
to allow for different inputs as we are not going to use teacher forcing at
inference time.
"""
# the latent dimension is the same throughout the architecture so we are going to
# cheat and grab the latent dimension of the embedding because that is the same as what is
# output from the decoder
latent_dim = model.get_layer('Decoder-Word-Embedding').output_shape[-1]
# Reconstruct the input into the decoder
decoder_inputs = model.get_layer('Decoder-Input').input
dec_emb = model.get_layer('Decoder-Word-Embedding')(decoder_inputs)
dec_bn = model.get_layer('Decoder-Batchnorm-1')(dec_emb)
# Instead of setting the intial state from the encoder and forgetting about it, during inference
# we are not doing teacher forcing, so we will have to have a feedback loop from predictions back into
# the GRU, thus we define this input layer for the state so we can add this capability
gru_inference_state_input = Input(shape=(latent_dim,), name='hidden_state_input')
# we need to reuse the weights that is why we are getting this
# If you inspect the decoder GRU that we created for training, it will take as input
# 2 tensors -> (1) is the embedding layer output for the teacher forcing
# (which will now be the last step's prediction, and will be _start_ on the first time step)
# (2) is the state, which we will initialize with the encoder on the first time step, but then
# grab the state after the first prediction and feed that back in again.
gru_out, gru_state_out = model.get_layer('Decoder-GRU')([dec_bn, gru_inference_state_input])
# Reconstruct dense layers
dec_bn2 = model.get_layer('Decoder-Batchnorm-2')(gru_out)
dense_out = model.get_layer('Final-Output-Dense')(dec_bn2)
decoder_model = Model([decoder_inputs, gru_inference_state_input],
[dense_out, gru_state_out])
return decoder_model

class Seq2Seq_Inference(object):
    """Generates issue titles and recommends similar issues with a trained
    seq2seq model."""

    def __init__(self,
                 encoder_preprocessor,
                 decoder_preprocessor,
                 seq2seq_model):
        self.pp_body = encoder_preprocessor
        self.pp_title = decoder_preprocessor
        self.seq2seq_model = seq2seq_model
        self.encoder_model = extract_encoder_model(seq2seq_model)
        self.decoder_model = extract_decoder_model(seq2seq_model)
        self.default_max_len_title = self.pp_title.padding_maxlen
        self.nn = None
        self.rec_df = None

    def generate_issue_title(self,
                             raw_input_text,
                             max_len_title=None):
        """
        Use the seq2seq model to generate a title given the body of an issue.

        Inputs
        ------
        raw_input_text: str
            The body of the issue text as an input string
        max_len_title: int (optional)
            The maximum length of the title the model will generate

        Returns
        -------
        The encoder's embedding of the issue body and the generated title.
        """
        if max_len_title is None:
            max_len_title = self.default_max_len_title
        # Get the encoder's features for the decoder.
        raw_tokenized = self.pp_body.transform([raw_input_text])
        body_encoding = self.encoder_model.predict(raw_tokenized)
        # We want to save the encoder's embedding before it is updated by the
        # decoder, because we can use that as an embedding for other tasks.
        original_body_encoding = body_encoding
        state_value = np.array(self.pp_title.token2id['_start_']).reshape(1, 1)

        decoded_sentence = []
        stop_condition = False
        while not stop_condition:
            preds, st = self.decoder_model.predict([state_value, body_encoding])

            # We are going to ignore indices 0 (padding) and 1 (unknown).
            # Argmax will return the integer index corresponding to the
            # prediction + 2 because we chopped off the first two elements.
            pred_idx = np.argmax(preds[:, :, 2:]) + 2

            # Retrieve the word from the index prediction.
            pred_word_str = self.pp_title.id2token[pred_idx]

            if pred_word_str == '_end_' or len(decoded_sentence) >= max_len_title:
                stop_condition = True
                break
            decoded_sentence.append(pred_word_str)

            # Update the decoder state for the next word.
            body_encoding = st
            state_value = np.array(pred_idx).reshape(1, 1)

        return original_body_encoding, ' '.join(decoded_sentence)
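
    # Example call (sketch; `seq2seq_inf` is an illustrative instance of this
    # class): the method returns the encoder embedding of the body along with
    # the generated title, so the embedding can be reused for similarity search:
    #
    #   emb, title = seq2seq_inf.generate_issue_title('app crashes on startup')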

    def print_example(self,
                      i,
                      body_text,
                      title_text,
                      url,
                      threshold):
        """
        Prints an example of the model's prediction for manual inspection.
        """
        if i:
            print('\n\n==============================================')
            print(f'============== Example # {i} =================\n')

        if url:
            print(url)

        print(f"Issue Body:\n {body_text} \n")

        if title_text:
            print(f"Original Title:\n {title_text}")

        emb, gen_title = self.generate_issue_title(body_text)
        print(f"\n****** Machine Generated Title (Prediction) ******:\n {gen_title}")

        if self.nn:
            # Return neighbors and distances.
            n, d = self.nn.get_nns_by_vector(emb.flatten(), n=4,
                                             include_distances=True)
            neighbors = n[1:]
            dist = d[1:]
            if min(dist) <= threshold:
                cols = ['issue_url', 'issue_title', 'body']
                dfcopy = self.rec_df.iloc[neighbors][cols].copy(deep=True)
                dfcopy['dist'] = dist
                similar_issues_df = dfcopy.query(f'dist <= {threshold}')
                print("\n**** Similar Issues (using encoder embedding) ****:\n")
                display(similar_issues_df)

    def demo_model_predictions(self,
                               n,
                               issue_df,
                               threshold=1):
        """
        Pick n random issues and display predictions.

        Input:
        ------
        n : int
            Number of issues to display from issue_df
        issue_df : pandas DataFrame
            DataFrame that contains the columns `body`, `issue_title`, and
            `issue_url`.
        threshold : float
            Distance threshold for recommendation of similar issues.

        Returns:
        --------
        None
            Prints the original issue body and the model's prediction.
        """
        # Extract body, title, and url from the DataFrame.
        body_text = issue_df.body.tolist()
        title_text = issue_df.issue_title.tolist()
        url = issue_df.issue_url.tolist()

        demo_list = np.random.randint(low=1, high=len(body_text), size=n)
        for i in demo_list:
            self.print_example(i,
                               body_text=body_text[i],
                               title_text=title_text[i],
                               url=url[i],
                               threshold=threshold)

    def prepare_recommender(self, vectorized_array, original_df):
        """
        Use the annoy library to build a recommender.

        Parameters
        ----------
        vectorized_array : List[List[int]]
            The list of lists of integers that represents your corpus, as fed
            into the seq2seq model for training.
        original_df : pandas.DataFrame
            The original dataframe that has the columns
            ['issue_url', 'issue_title', 'body'].

        Returns
        -------
        annoy.AnnoyIndex object (see https://github.com/spotify/annoy)
        """
        self.rec_df = original_df
        # Note: this batch size assumes the corpus has at least 200 rows.
        emb = self.encoder_model.predict(x=vectorized_array,
                                         batch_size=vectorized_array.shape[0]//200)

        f = emb.shape[1]
        self.nn = AnnoyIndex(f)
        logging.warning('Adding embeddings')
        for i in tqdm(range(len(emb))):
            self.nn.add_item(i, emb[i])

        logging.warning('Building trees for similarity lookup.')
        self.nn.build(50)
        return self.nn
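
    # Sketch of querying the index once built (get_nns_by_vector is part of
    # the annoy API; variable names are illustrative):
    #
    #   nn = seq2seq_inf.prepare_recommender(train_body_vecs, issues_df)
    #   ids, dists = nn.get_nns_by_vector(query_emb, 5, include_distances=True)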

    def set_recsys_data(self, original_df):
        self.rec_df = original_df

    def set_recsys_annoyobj(self, annoyobj):
        self.nn = annoyobj

    def evaluate_model(self, holdout_bodies, holdout_titles):
        """
        Method for calculating BLEU Score.

        Parameters
        ----------
        holdout_bodies : List[str]
            The issue bodies that we want to summarize
        holdout_titles : List[str]
            The ground truth (issue titles) we are trying to predict

        Returns
        -------
        bleu : float
            The BLEU Score
        """
        actual, predicted = list(), list()
        assert len(holdout_bodies) == len(holdout_titles)
        num_examples = len(holdout_bodies)

        logging.warning('Generating predictions.')
        # Step over the whole holdout set. TODO: parallelize this.
        for i in tqdm_notebook(range(num_examples)):
            _, yhat = self.generate_issue_title(holdout_bodies[i])

            actual.append(self.pp_title.process_text([holdout_titles[i]])[0])
            predicted.append(self.pp_title.process_text([yhat])[0])

        # Calculate BLEU score. Assuming corpus_bleu here is
        # nltk.translate.bleu_score.corpus_bleu, it expects a list of
        # reference token lists per hypothesis, so wrap each title.
        logging.warning('Calculating BLEU.')
        bleu = corpus_bleu([[ref] for ref in actual], predicted)
        return bleu
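
# End-to-end usage sketch, assuming preprocessors and a trained model produced
# by the accompanying training code (names below are illustrative):
#
#   seq2seq_inf = Seq2Seq_Inference(encoder_preprocessor=body_pp,
#                                   decoder_preprocessor=title_pp,
#                                   seq2seq_model=seq2seq_Model)
#   bleu = seq2seq_inf.evaluate_model(holdout_bodies, holdout_titles)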

View File

@@ -1,11 +1,10 @@
 import argparse
+import numpy as np
 from keras.callbacks import CSVLogger, ModelCheckpoint
-from keras.layers import Input, LSTM, GRU, Dense, Embedding, Bidirectional, BatchNormalization
+from keras.layers import Input, GRU, Dense, Embedding, BatchNormalization
 from keras.models import Model
 from keras import optimizers
-import numpy as np
 from seq2seq_utils import load_decoder_inputs, load_encoder_inputs, load_text_processor
-from seq2seq_utils import viz_model_architecture
 
 # Parsing flags.
 parser = argparse.ArgumentParser()
@@ -18,7 +17,7 @@ parser.add_argument("--learning_rate", default="0.001")
 args = parser.parse_args()
 print(args)
 
-learning_rate=float(args.learning_rate)
+learning_rate = float(args.learning_rate)
 
 encoder_input_data, doc_length = load_encoder_inputs(args.input_train_body_vecs_npy)
 decoder_input_data, decoder_target_data = load_decoder_inputs(args.input_train_title_vecs_npy)
@@ -35,7 +34,10 @@ latent_dim = 300
 encoder_inputs = Input(shape=(doc_length,), name='Encoder-Input')
 
 # Word embeding for encoder (ex: Issue Body)
-x = Embedding(num_encoder_tokens, latent_dim, name='Body-Word-Embedding', mask_zero=False)(encoder_inputs)
+x = Embedding(num_encoder_tokens,
+              latent_dim,
+              name='Body-Word-Embedding',
+              mask_zero=False)(encoder_inputs)
 x = BatchNormalization(name='Encoder-Batchnorm-1')(x)
 
 # We do not need the `encoder_output` just the hidden state.
@@ -53,7 +55,10 @@ seq2seq_encoder_out = encoder_model(encoder_inputs)
 decoder_inputs = Input(shape=(None,), name='Decoder-Input')  # for teacher forcing
 
 # Word Embedding For Decoder (ex: Issue Titles)
-dec_emb = Embedding(num_decoder_tokens, latent_dim, name='Decoder-Word-Embedding', mask_zero=False)(decoder_inputs)
+dec_emb = Embedding(num_decoder_tokens,
+                    latent_dim,
+                    name='Decoder-Word-Embedding',
+                    mask_zero=False)(decoder_inputs)
 dec_bn = BatchNormalization(name='Decoder-Batchnorm-1')(dec_emb)
 
 # Set up the decoder, using `decoder_state_input` as initial state.
@@ -71,21 +76,24 @@ decoder_outputs = decoder_dense(x)
 seq2seq_Model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
 
-seq2seq_Model.compile(optimizer=optimizers.Nadam(lr=learning_rate), loss='sparse_categorical_crossentropy')
+seq2seq_Model.compile(optimizer=optimizers.Nadam(lr=learning_rate),
+                      loss='sparse_categorical_crossentropy')
 seq2seq_Model.summary()
 
 script_name_base = 'tutorial_seq2seq'
 csv_logger = CSVLogger('{:}.log'.format(script_name_base))
-model_checkpoint = ModelCheckpoint('{:}.epoch{{epoch:02d}}-val{{val_loss:.5f}}.hdf5'.format(script_name_base),
-                                   save_best_only=True)
+model_checkpoint = ModelCheckpoint(
+    '{:}.epoch{{epoch:02d}}-val{{val_loss:.5f}}.hdf5'.format(script_name_base), save_best_only=True)
 
 batch_size = 1200
 epochs = 7
-history = seq2seq_Model.fit([encoder_input_data, decoder_input_data], np.expand_dims(decoder_target_data, -1),
-                            batch_size=batch_size,
-                            epochs=epochs,
-                            validation_split=0.12, callbacks=[csv_logger, model_checkpoint])
+history = seq2seq_Model.fit([encoder_input_data, decoder_input_data],
+                            np.expand_dims(decoder_target_data, -1),
+                            batch_size=batch_size,
+                            epochs=epochs,
+                            validation_split=0.12,
+                            callbacks=[csv_logger, model_checkpoint])
 
 #############
 # Save model.