mirror of https://github.com/kubeflow/examples.git
Add .pylintrc (#61)
* Add .pylintrc
* Resolve lint complaints in agents/trainer/task.py
* Resolve lint complaints with flask app.py
* Resolve linting issues
  Remove duplicate seq2seq_utils.py from workflow/workspace/src
* Use python 3.5.2 with pylint to match prow
  Put pybullet import back into agents/trainer/task.py with a pylint ignore statement
  Use main(_) to ensure it works with tf.app.run
This commit is contained in:
parent
1d6946ead8
commit
41372c9314
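For context on the commit message above, here is a minimal sketch of the two patterns it mentions: a module-level pylint ignore for an import that is needed only for its side effects, and a main(_) signature that works with tf.app.run. This is an illustrative snippet, not the literal contents of agents/trainer/task.py; the real diff follows below.

import tensorflow as tf

# Module-level suppression, mirroring the change described above: the pybullet
# import is needed only for its side effect of registering the Bullet
# environments, so the unused-import check is silenced.
# pylint: disable=unused-import
import pybullet_envs  # To make AntBulletEnv-v0 available.


def main(_):
  # tf.app.run parses command-line flags and then calls main(argv); naming the
  # argument "_" satisfies that signature while marking it as intentionally
  # unused for pylint.
  tf.logging.set_verbosity(tf.logging.INFO)
  # ... training logic elided ...


if __name__ == '__main__':
  tf.app.run()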
.pylintrc
@@ -0,0 +1,399 @@
[MASTER]
|
||||
|
||||
# Specify a configuration file.
|
||||
#rcfile=
|
||||
|
||||
# Python code to execute, usually for sys.path manipulation such as
|
||||
# pygtk.require().
|
||||
#init-hook=
|
||||
|
||||
# Add files or directories to the blacklist. They should be base names, not
|
||||
# paths.
|
||||
ignore=third_party
|
||||
|
||||
# Add files or directories matching the regex patterns to the blacklist. The
|
||||
# regex matches against base names, not paths.
|
||||
ignore-patterns=
|
||||
|
||||
# Pickle collected data for later comparisons.
|
||||
persistent=no
|
||||
|
||||
# List of plugins (as comma separated values of python modules names) to load,
|
||||
# usually to register additional checkers.
|
||||
load-plugins=
|
||||
|
||||
# Use multiple processes to speed up Pylint.
|
||||
jobs=4
|
||||
|
||||
# Allow loading of arbitrary C extensions. Extensions are imported into the
|
||||
# active Python interpreter and may run arbitrary code.
|
||||
unsafe-load-any-extension=no
|
||||
|
||||
# A comma-separated list of package or module names from where C extensions may
|
||||
# be loaded. Extensions are loading into the active Python interpreter and may
|
||||
# run arbitrary code
|
||||
extension-pkg-whitelist=
|
||||
|
||||
|
||||
[MESSAGES CONTROL]
|
||||
|
||||
# Only show warnings with the listed confidence levels. Leave empty to show
|
||||
# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED
|
||||
confidence=
|
||||
|
||||
# Enable the message, report, category or checker with the given id(s). You can
|
||||
# either give multiple identifier separated by comma (,) or put this option
|
||||
# multiple time (only on the command line, not in the configuration file where
|
||||
# it should appear only once). See also the "--disable" option for examples.
|
||||
#enable=
|
||||
|
||||
# Disable the message, report, category or checker with the given id(s). You
|
||||
# can either give multiple identifiers separated by comma (,) or put this
|
||||
# option multiple times (only on the command line, not in the configuration
|
||||
# file where it should appear only once).You can also use "--disable=all" to
|
||||
# disable everything first and then reenable specific checks. For example, if
|
||||
# you want to run only the similarities checker, you can use "--disable=all
|
||||
# --enable=similarities". If you want to run only the classes checker, but have
|
||||
# no Warning level messages displayed, use"--disable=all --enable=classes
|
||||
# --disable=W"
|
||||
disable=import-star-module-level,old-octal-literal,oct-method,print-statement,unpacking-in-except,parameter-unpacking,backtick,old-raise-syntax,old-ne-operator,long-suffix,dict-view-method,dict-iter-method,metaclass-assignment,next-method-called,raising-string,indexing-exception,raw_input-builtin,long-builtin,file-builtin,execfile-builtin,coerce-builtin,cmp-builtin,buffer-builtin,basestring-builtin,apply-builtin,filter-builtin-not-iterating,using-cmp-argument,useless-suppression,range-builtin-not-iterating,suppressed-message,missing-docstring,no-absolute-import,old-division,cmp-method,reload-builtin,zip-builtin-not-iterating,intern-builtin,unichr-builtin,reduce-builtin,standarderror-builtin,unicode-builtin,xrange-builtin,coerce-method,delslice-method,getslice-method,setslice-method,input-builtin,round-builtin,hex-method,nonzero-method,map-builtin-not-iterating,relative-import,invalid-name,bad-continuation,no-member,locally-disabled,fixme,import-error,too-many-locals
|
||||
|
||||
|
||||
[REPORTS]
|
||||
|
||||
# Set the output format. Available formats are text, parseable, colorized, msvs
|
||||
# (visual studio) and html. You can also give a reporter class, eg
|
||||
# mypackage.mymodule.MyReporterClass.
|
||||
output-format=text
|
||||
|
||||
# Put messages in a separate file for each module / package specified on the
|
||||
# command line instead of printing them on stdout. Reports (if any) will be
|
||||
# written in a file name "pylint_global.[txt|html]". This option is deprecated
|
||||
# and it will be removed in Pylint 2.0.
|
||||
files-output=no
|
||||
|
||||
# Tells whether to display a full report or only the messages
|
||||
reports=no
|
||||
|
||||
# Python expression which should return a note less than 10 (10 is the highest
|
||||
# note). You have access to the variables errors warning, statement which
|
||||
# respectively contain the number of errors / warnings messages and the total
|
||||
# number of statements analyzed. This is used by the global evaluation report
|
||||
# (RP0004).
|
||||
evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
|
||||
|
||||
# Template used to display messages. This is a python new-style format string
|
||||
# used to format the message information. See doc for all details
|
||||
#msg-template=
|
||||
|
||||
|
||||
[BASIC]
|
||||
|
||||
# Good variable names which should always be accepted, separated by a comma
|
||||
good-names=i,j,k,ex,Run,_
|
||||
|
||||
# Bad variable names which should always be refused, separated by a comma
|
||||
bad-names=foo,bar,baz,toto,tutu,tata
|
||||
|
||||
# Colon-delimited sets of names that determine each other's naming style when
|
||||
# the name regexes allow several styles.
|
||||
name-group=
|
||||
|
||||
# Include a hint for the correct naming format with invalid-name
|
||||
include-naming-hint=no
|
||||
|
||||
# List of decorators that produce properties, such as abc.abstractproperty. Add
|
||||
# to this list to register other decorators that produce valid properties.
|
||||
property-classes=abc.abstractproperty
|
||||
|
||||
# Regular expression matching correct function names
|
||||
function-rgx=[a-z_][a-z0-9_]{2,30}$
|
||||
|
||||
# Naming hint for function names
|
||||
function-name-hint=[a-z_][a-z0-9_]{2,30}$
|
||||
|
||||
# Regular expression matching correct variable names
|
||||
variable-rgx=[a-z_][a-z0-9_]{2,30}$
|
||||
|
||||
# Naming hint for variable names
|
||||
variable-name-hint=[a-z_][a-z0-9_]{2,30}$
|
||||
|
||||
# Regular expression matching correct constant names
|
||||
const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$
|
||||
|
||||
# Naming hint for constant names
|
||||
const-name-hint=(([A-Z_][A-Z0-9_]*)|(__.*__))$
|
||||
|
||||
# Regular expression matching correct attribute names
|
||||
attr-rgx=[a-z_][a-z0-9_]{2,30}$
|
||||
|
||||
# Naming hint for attribute names
|
||||
attr-name-hint=[a-z_][a-z0-9_]{2,30}$
|
||||
|
||||
# Regular expression matching correct argument names
|
||||
argument-rgx=[a-z_][a-z0-9_]{2,30}$
|
||||
|
||||
# Naming hint for argument names
|
||||
argument-name-hint=[a-z_][a-z0-9_]{2,30}$
|
||||
|
||||
# Regular expression matching correct class attribute names
|
||||
class-attribute-rgx=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$
|
||||
|
||||
# Naming hint for class attribute names
|
||||
class-attribute-name-hint=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$
|
||||
|
||||
# Regular expression matching correct inline iteration names
|
||||
inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$
|
||||
|
||||
# Naming hint for inline iteration names
|
||||
inlinevar-name-hint=[A-Za-z_][A-Za-z0-9_]*$
|
||||
|
||||
# Regular expression matching correct class names
|
||||
class-rgx=[A-Z_][a-zA-Z0-9]+$
|
||||
|
||||
# Naming hint for class names
|
||||
class-name-hint=[A-Z_][a-zA-Z0-9]+$
|
||||
|
||||
# Regular expression matching correct module names
|
||||
module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$
|
||||
|
||||
# Naming hint for module names
|
||||
module-name-hint=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$
|
||||
|
||||
# Regular expression matching correct method names
|
||||
method-rgx=[a-z_][a-z0-9_]{2,30}$
|
||||
|
||||
# Naming hint for method names
|
||||
method-name-hint=[a-z_][a-z0-9_]{2,30}$
|
||||
|
||||
# Regular expression which should only match function or class names that do
|
||||
# not require a docstring.
|
||||
no-docstring-rgx=^_
|
||||
|
||||
# Minimum line length for functions/classes that require docstrings, shorter
|
||||
# ones are exempt.
|
||||
docstring-min-length=-1
|
||||
|
||||
|
||||
[ELIF]
|
||||
|
||||
# Maximum number of nested blocks for function / method body
|
||||
max-nested-blocks=5
|
||||
|
||||
|
||||
[TYPECHECK]
|
||||
|
||||
# Tells whether missing members accessed in mixin class should be ignored. A
|
||||
# mixin class is detected if its name ends with "mixin" (case insensitive).
|
||||
ignore-mixin-members=yes
|
||||
|
||||
# List of module names for which member attributes should not be checked
|
||||
# (useful for modules/projects where namespaces are manipulated during runtime
|
||||
# and thus existing member attributes cannot be deduced by static analysis. It
|
||||
# supports qualified module names, as well as Unix pattern matching.
|
||||
ignored-modules=
|
||||
|
||||
# List of class names for which member attributes should not be checked (useful
|
||||
# for classes with dynamically set attributes). This supports the use of
|
||||
# qualified names.
|
||||
ignored-classes=optparse.Values,thread._local,_thread._local
|
||||
|
||||
# List of members which are set dynamically and missed by pylint inference
|
||||
# system, and so shouldn't trigger E1101 when accessed. Python regular
|
||||
# expressions are accepted.
|
||||
generated-members=
|
||||
|
||||
# List of decorators that produce context managers, such as
|
||||
# contextlib.contextmanager. Add to this list to register other decorators that
|
||||
# produce valid context managers.
|
||||
contextmanager-decorators=contextlib.contextmanager
|
||||
|
||||
|
||||
[FORMAT]
|
||||
|
||||
# Maximum number of characters on a single line.
|
||||
max-line-length=100
|
||||
|
||||
# Regexp for a line that is allowed to be longer than the limit.
|
||||
ignore-long-lines=^\s*(# )?<?https?://\S+>?$
|
||||
|
||||
# Allow the body of an if to be on the same line as the test if there is no
|
||||
# else.
|
||||
single-line-if-stmt=no
|
||||
|
||||
# List of optional constructs for which whitespace checking is disabled. `dict-
|
||||
# separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}.
|
||||
# `trailing-comma` allows a space between comma and closing bracket: (a, ).
|
||||
# `empty-line` allows space-only lines.
|
||||
no-space-check=trailing-comma,dict-separator
|
||||
|
||||
# Maximum number of lines in a module
|
||||
max-module-lines=1000
|
||||
|
||||
# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1
|
||||
# tab).
|
||||
# Use 2 spaces consistent with TensorFlow style.
|
||||
indent-string='  '
|
||||
|
||||
# Number of spaces of indent required inside a hanging or continued line.
|
||||
indent-after-paren=4
|
||||
|
||||
# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
|
||||
expected-line-ending-format=
|
||||
|
||||
|
||||
[MISCELLANEOUS]
|
||||
|
||||
# List of note tags to take in consideration, separated by a comma.
|
||||
notes=FIXME,XXX,TODO
|
||||
|
||||
|
||||
[VARIABLES]
|
||||
|
||||
# Tells whether we should check for unused import in __init__ files.
|
||||
init-import=no
|
||||
|
||||
# A regular expression matching the name of dummy variables (i.e. expectedly
|
||||
# not used).
|
||||
dummy-variables-rgx=(_+[a-zA-Z0-9]*?$)|dummy
|
||||
|
||||
# List of additional names supposed to be defined in builtins. Remember that
|
||||
# you should avoid to define new builtins when possible.
|
||||
additional-builtins=
|
||||
|
||||
# List of strings which can identify a callback function by name. A callback
|
||||
# name must start or end with one of those strings.
|
||||
callbacks=cb_,_cb
|
||||
|
||||
# List of qualified module names which can have objects that can redefine
|
||||
# builtins.
|
||||
redefining-builtins-modules=six.moves,future.builtins
|
||||
|
||||
|
||||
[LOGGING]
|
||||
|
||||
# Logging modules to check that the string format arguments are in logging
|
||||
# function parameter format
|
||||
logging-modules=logging
|
||||
|
||||
|
||||
[SIMILARITIES]
|
||||
|
||||
# Minimum lines number of a similarity.
|
||||
min-similarity-lines=4
|
||||
|
||||
# Ignore comments when computing similarities.
|
||||
ignore-comments=yes
|
||||
|
||||
# Ignore docstrings when computing similarities.
|
||||
ignore-docstrings=yes
|
||||
|
||||
# Ignore imports when computing similarities.
|
||||
ignore-imports=no
|
||||
|
||||
|
||||
[SPELLING]
|
||||
|
||||
# Spelling dictionary name. Available dictionaries: none. To make it working
|
||||
# install python-enchant package.
|
||||
spelling-dict=
|
||||
|
||||
# List of comma separated words that should not be checked.
|
||||
spelling-ignore-words=
|
||||
|
||||
# A path to a file that contains private dictionary; one word per line.
|
||||
spelling-private-dict-file=
|
||||
|
||||
# Tells whether to store unknown words to indicated private dictionary in
|
||||
# --spelling-private-dict-file option instead of raising a message.
|
||||
spelling-store-unknown-words=no
|
||||
|
||||
|
||||
[IMPORTS]
|
||||
|
||||
# Deprecated modules which should not be used, separated by a comma
|
||||
deprecated-modules=regsub,TERMIOS,Bastion,rexec
|
||||
|
||||
# Create a graph of every (i.e. internal and external) dependencies in the
|
||||
# given file (report RP0402 must not be disabled)
|
||||
import-graph=
|
||||
|
||||
# Create a graph of external dependencies in the given file (report RP0402 must
|
||||
# not be disabled)
|
||||
ext-import-graph=
|
||||
|
||||
# Create a graph of internal dependencies in the given file (report RP0402 must
|
||||
# not be disabled)
|
||||
int-import-graph=
|
||||
|
||||
# Force import order to recognize a module as part of the standard
|
||||
# compatibility libraries.
|
||||
known-standard-library=
|
||||
|
||||
# Force import order to recognize a module as part of a third party library.
|
||||
known-third-party=enchant
|
||||
|
||||
# Analyse import fallback blocks. This can be used to support both Python 2 and
|
||||
# 3 compatible code, which means that the block might have code that exists
|
||||
# only in one or another interpreter, leading to false positives when analysed.
|
||||
analyse-fallback-blocks=no
|
||||
|
||||
|
||||
[DESIGN]
|
||||
|
||||
# Maximum number of arguments for function / method
|
||||
max-args=7
|
||||
|
||||
# Argument names that match this expression will be ignored. Default to name
|
||||
# with leading underscore
|
||||
ignored-argument-names=_.*
|
||||
|
||||
# Maximum number of locals for function / method body
|
||||
max-locals=15
|
||||
|
||||
# Maximum number of return / yield for function / method body
|
||||
max-returns=6
|
||||
|
||||
# Maximum number of branch for function / method body
|
||||
max-branches=12
|
||||
|
||||
# Maximum number of statements in function / method body
|
||||
max-statements=50
|
||||
|
||||
# Maximum number of parents for a class (see R0901).
|
||||
max-parents=7
|
||||
|
||||
# Maximum number of attributes for a class (see R0902).
|
||||
max-attributes=7
|
||||
|
||||
# Minimum number of public methods for a class (see R0903).
|
||||
min-public-methods=0
|
||||
|
||||
# Maximum number of public methods for a class (see R0904).
|
||||
max-public-methods=20
|
||||
|
||||
# Maximum number of boolean expressions in a if statement
|
||||
max-bool-expr=5
|
||||
|
||||
|
||||
[CLASSES]
|
||||
|
||||
# List of method names used to declare (i.e. assign) instance attributes.
|
||||
defining-attr-methods=__init__,__new__,setUp
|
||||
|
||||
# List of valid names for the first argument in a class method.
|
||||
valid-classmethod-first-arg=cls
|
||||
|
||||
# List of valid names for the first argument in a metaclass class method.
|
||||
valid-metaclass-classmethod-first-arg=mcs
|
||||
|
||||
# List of member names, which should be excluded from the protected access
|
||||
# warning.
|
||||
exclude-protected=_asdict,_fields,_replace,_source,_make
|
||||
|
||||
|
||||
[EXCEPTIONS]
|
||||
|
||||
# Exceptions that will emit a warning when being caught. Defaults to
|
||||
# "Exception"
|
||||
overgeneral-exceptions=Exception
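The configuration above is picked up automatically when pylint is run from the repository root; it can also be pointed at explicitly. A small sketch of driving it from Python (the target path is only an example, and Run() exits with pylint's status code by default):

# Sketch only, not part of the commit: run pylint against the new rcfile.
from pylint.lint import Run

Run(['--rcfile=.pylintrc', 'agents/trainer/task.py'])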
agents/trainer/task.py
@@ -12,18 +12,18 @@
"""Provides an entrypoint for the training task."""
|
||||
|
||||
#pylint: disable=unused-import
|
||||
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import argparse
|
||||
import datetime
|
||||
import logging
|
||||
import os
|
||||
import pprint
|
||||
import uuid
|
||||
|
||||
import pip
|
||||
import tensorflow as tf
|
||||
from google.cloud import storage
|
||||
import tensorflow as tf
|
||||
|
||||
import agents
|
||||
import pybullet_envs # To make AntBulletEnv-v0 available.
@@ -113,39 +113,39 @@ def hparams_base():
"""Base hparams tf/Agents PPO """
|
||||
|
||||
# General
|
||||
algorithm = agents.ppo.PPOAlgorithm
|
||||
num_agents = 30
|
||||
eval_episodes = 30
|
||||
use_gpu = False
|
||||
# algorithm = agents.ppo.PPOAlgorithm
|
||||
# num_agents = 30
|
||||
# eval_episodes = 30
|
||||
# use_gpu = False
|
||||
|
||||
# Environment
|
||||
env = 'KukaBulletEnv-v0'
|
||||
normalize_ranges = True
|
||||
max_length = 1000
|
||||
# env = 'KukaBulletEnv-v0'
|
||||
# normalize_ranges = True
|
||||
# max_length = 1000
|
||||
|
||||
# Network
|
||||
network = agents.scripts.networks.feed_forward_gaussian
|
||||
weight_summaries = dict(
|
||||
all=r'.*', policy=r'.*/policy/.*', value=r'.*/value/.*')
|
||||
policy_layers = 200, 100
|
||||
value_layers = 200, 100
|
||||
init_output_factor = 0.1
|
||||
init_logstd = -1
|
||||
init_std = 0.35
|
||||
# network = agents.scripts.networks.feed_forward_gaussian
|
||||
# weight_summaries = dict(
|
||||
# all=r'.*', policy=r'.*/policy/.*', value=r'.*/value/.*')
|
||||
# policy_layers = 200, 100
|
||||
# value_layers = 200, 100
|
||||
# init_output_factor = 0.1
|
||||
# init_logstd = -1
|
||||
# init_std = 0.35
|
||||
|
||||
# Optimization
|
||||
update_every = 60
|
||||
update_epochs = 25
|
||||
optimizer = tf.train.AdamOptimizer
|
||||
learning_rate = 1e-4
|
||||
steps = 3e7 # 30M
|
||||
# update_every = 60
|
||||
# update_epochs = 25
|
||||
# optimizer = tf.train.AdamOptimizer
|
||||
# learning_rate = 1e-4
|
||||
# steps = 3e7 # 30M
|
||||
|
||||
# Losses
|
||||
discount = 0.995
|
||||
kl_target = 1e-2
|
||||
kl_cutoff_factor = 2
|
||||
kl_cutoff_coef = 1000
|
||||
kl_init_penalty = 1
|
||||
# discount = 0.995
|
||||
# kl_target = 1e-2
|
||||
# kl_cutoff_factor = 2
|
||||
# kl_cutoff_coef = 1000
|
||||
# kl_init_penalty = 1
|
||||
|
||||
return locals()
@@ -158,9 +158,9 @@ def _object_import_from_string(name):
return mod
|
||||
|
||||
|
||||
def _realize_import_attrs(d, filter):
|
||||
def _realize_import_attrs(d, hparam_filter):
|
||||
for k, v in d.items():
|
||||
if k in filter:
|
||||
if k in hparam_filter:
|
||||
imported = _object_import_from_string(v)
|
||||
# TODO: Provide an appropriately informative error if the import fails
|
||||
# except ImportError as e:
@@ -170,7 +170,7 @@ def _realize_import_attrs(d, filter):
return d
|
||||
|
||||
|
||||
def _get_agents_configuration(hparam_set_name, log_dir=None, is_chief=False):
|
||||
def _get_agents_configuration(log_dir=None):
|
||||
"""Load hyperparameter config."""
|
||||
try:
|
||||
# Try to resume training.
@@ -243,23 +243,20 @@ def gcs_upload(local_dir, gcs_out_dir):
blob.upload_from_filename(local_file_path)
|
||||
|
||||
|
||||
def main(unused_argv):
|
||||
def main(_):
|
||||
"""Run training."""
|
||||
tf.logging.set_verbosity(tf.logging.INFO)
|
||||
|
||||
if FLAGS.debug:
|
||||
tf.logging.set_verbosity(tf.logging.DEBUG)
|
||||
|
||||
run_config = tf.contrib.learn.RunConfig()
|
||||
|
||||
log_dir = FLAGS.logdir
|
||||
|
||||
agents_config = _get_agents_configuration(
|
||||
FLAGS.hparam_set_id, log_dir, run_config.is_chief)
|
||||
agents_config = _get_agents_configuration(log_dir)
|
||||
|
||||
if FLAGS.run_mode == 'train':
|
||||
for score in agents.scripts.train.train(agents_config, env_processes=True):
|
||||
logging.info('Score {}.'.format(score))
|
||||
logging.info('Score %s.', score)
|
||||
if FLAGS.run_mode == 'render':
|
||||
now = datetime.datetime.now()
|
||||
subdir = now.strftime("%m%d-%H%M") + "-" + uuid.uuid4().hex[0:4]
@@ -2,16 +2,25 @@
Simple app that parses predictions from a trained model and displays them.
|
||||
"""
|
||||
|
||||
from flask import Flask, json, render_template, request
|
||||
import requests
|
||||
app = Flask(__name__)
|
||||
from flask import Flask, json, render_template, request
|
||||
APP = Flask(__name__)
|
||||
|
||||
@app.route("/")
|
||||
@APP.route("/")
|
||||
def index():
|
||||
"""Default route.
|
||||
|
||||
Placeholder, does nothing.
|
||||
"""
|
||||
return render_template("index.html")
|
||||
|
||||
@app.route("/summary", methods=['GET', 'POST'])
|
||||
@APP.route("/summary", methods=['GET', 'POST'])
|
||||
def summary():
|
||||
"""Main prediction route.
|
||||
|
||||
Provides a machine-generated summary of the given text. Sends a request to a live
|
||||
model trained on GitHub issues.
|
||||
"""
|
||||
if request.method == 'POST':
|
||||
issue_text = request.form["issue_text"]
@@ -23,17 +32,17 @@ def summary():
}
|
||||
}
|
||||
|
||||
r = requests.post(url = url,
|
||||
response = requests.post(url=url,
|
||||
headers=headers,
|
||||
data=json.dumps(json_data))
|
||||
|
||||
rjs = json.loads(r.text)
|
||||
summary = rjs["data"]["ndarray"][0][0]
|
||||
response_json = json.loads(response.text)
|
||||
issue_summary = response_json["data"]["ndarray"][0][0]
|
||||
|
||||
return render_template("summary.html",
|
||||
return render_template("issue_summary.html",
|
||||
issue_text=issue_text,
|
||||
summary = summary)
|
||||
issue_summary=issue_summary)
|
||||
return ('', 204)
|
||||
|
||||
if __name__ == '__main__':
|
||||
app.run(debug = True, host = '0.0.0.0', port = 80)
|
||||
|
||||
APP.run(debug=True, host='0.0.0.0', port=80)
@@ -1,22 +0,0 @@
from __future__ import print_function
|
||||
|
||||
import dill as dpickle
|
||||
import numpy as np
|
||||
from keras.models import load_model
|
||||
|
||||
from seq2seq_utils import Seq2Seq_Inference
|
||||
|
||||
|
||||
class IssueSummarization(object):
|
||||
|
||||
def __init__(self):
|
||||
with open('body_pp.dpkl', 'rb') as f:
|
||||
body_pp = dpickle.load(f)
|
||||
with open('title_pp.dpkl', 'rb') as f:
|
||||
title_pp = dpickle.load(f)
|
||||
self.model = Seq2Seq_Inference(encoder_preprocessor=body_pp,
|
||||
decoder_preprocessor=title_pp,
|
||||
seq2seq_model=load_model('seq2seq_model_tutorial.h5'))
|
||||
|
||||
def predict(self, X, feature_names):
|
||||
return np.asarray([[self.model.generate_issue_title(body[0])[1]] for body in X])
@@ -0,0 +1,25 @@
"""Generates predictions using a stored model.
|
||||
|
||||
Uses trained model files to generate a prediction.
|
||||
"""
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
import numpy as np
|
||||
import dill as dpickle
|
||||
from keras.models import load_model
|
||||
from seq2seq_utils import Seq2Seq_Inference
|
||||
|
||||
class IssueSummarization(object):
|
||||
|
||||
def __init__(self):
|
||||
with open('body_pp.dpkl', 'rb') as body_file:
|
||||
body_pp = dpickle.load(body_file)
|
||||
with open('title_pp.dpkl', 'rb') as title_file:
|
||||
title_pp = dpickle.load(title_file)
|
||||
self.model = Seq2Seq_Inference(encoder_preprocessor=body_pp,
|
||||
decoder_preprocessor=title_pp,
|
||||
seq2seq_model=load_model('seq2seq_model_tutorial.h5'))
|
||||
|
||||
def predict(self, input_text):
|
||||
return np.asarray([[self.model.generate_issue_title(body[0])[1]] for body in input_text])
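A hypothetical usage sketch for the class added above, assuming the file is named IssueSummarization.py and that body_pp.dpkl, title_pp.dpkl and seq2seq_model_tutorial.h5 are present in the working directory (this driver is not part of the commit):

from IssueSummarization import IssueSummarization

# predict() expects a list of single-element lists, one issue body per inner
# list, and returns an array of generated titles.
model = IssueSummarization()
print(model.predict([["tf-serving pod crashes when the gpu image is used"]]))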
@@ -1,16 +1,15 @@
import logging
|
||||
import dill as dpickle
|
||||
import numpy as np
|
||||
from matplotlib import pyplot as plt
|
||||
import tensorflow as tf
|
||||
from IPython.display import SVG, display
|
||||
from keras import backend as K
|
||||
from keras.layers import Input
|
||||
from keras.models import Model
|
||||
from IPython.display import SVG, display
|
||||
from keras.utils.vis_utils import model_to_dot
|
||||
import logging
|
||||
import numpy as np
|
||||
import dill as dpickle
|
||||
from annoy import AnnoyIndex
|
||||
from tqdm import tqdm, tqdm_notebook
|
||||
from random import random
|
||||
from nltk.translate.bleu_score import corpus_bleu
@@ -42,7 +41,7 @@ def load_text_processor(fname='title_pp.dpkl'):
pp = dpickle.load(f)
|
||||
|
||||
num_tokens = max(pp.id2token.keys()) + 1
|
||||
print(f'Size of vocabulary for {fname}: {num_tokens:,}')
|
||||
print('Size of vocabulary for {}: {}'.format(fname, num_tokens))
|
||||
return num_tokens, pp
@@ -73,8 +72,8 @@ def load_decoder_inputs(decoder_np_vecs='train_title_vecs.npy'):
# Decoder Target Data Is Ahead By 1 Time Step From Decoder Input Data (Teacher Forcing)
|
||||
decoder_target_data = vectorized_title[:, 1:]
|
||||
|
||||
print(f'Shape of decoder input: {decoder_input_data.shape}')
|
||||
print(f'Shape of decoder target: {decoder_target_data.shape}')
|
||||
print('Shape of decoder input: {}'.format(decoder_input_data.shape))
|
||||
print('Shape of decoder target: {}'.format(decoder_target_data.shape))
|
||||
return decoder_input_data, decoder_target_data
@@ -100,7 +99,7 @@ def load_encoder_inputs(encoder_np_vecs='train_body_vecs.npy'):
# Encoder input is simply the body of the issue text
|
||||
encoder_input_data = vectorized_body
|
||||
doc_length = encoder_input_data.shape[1]
|
||||
print(f'Shape of encoder input: {encoder_input_data.shape}')
|
||||
print('Shape of encoder input: {}'.format(encoder_input_data.shape))
|
||||
return encoder_input_data, doc_length
@@ -195,16 +194,17 @@ def extract_decoder_model(model):
dec_bn = model.get_layer('Decoder-Batchnorm-1')(dec_emb)
|
||||
|
||||
# Instead of setting the intial state from the encoder and forgetting about it, during inference
|
||||
# we are not doing teacher forcing, so we will have to have a feedback loop from predictions back into
|
||||
# the GRU, thus we define this input layer for the state so we can add this capability
|
||||
# we are not doing teacher forcing, so we will have to have a feedback loop from predictions back
|
||||
# into the GRU, thus we define this input layer for the state so we can add this capability
|
||||
gru_inference_state_input = Input(shape=(latent_dim,), name='hidden_state_input')
|
||||
|
||||
# we need to reuse the weights that is why we are getting this
|
||||
# If you inspect the decoder GRU that we created for training, it will take as input
|
||||
# 2 tensors -> (1) is the embedding layer output for the teacher forcing
|
||||
# (which will now be the last step's prediction, and will be _start_ on the first time step)
|
||||
# (2) is the state, which we will initialize with the encoder on the first time step, but then
|
||||
# grab the state after the first prediction and feed that back in again.
|
||||
# (which will now be the last step's prediction, and will be _start_ on the
|
||||
# first time step)
|
||||
# (2) is the state, which we will initialize with the encoder on the first time step
|
||||
# but then grab the state after the first prediction and feed that back in again.
|
||||
gru_out, gru_state_out = model.get_layer('Decoder-GRU')([dec_bn, gru_inference_state_input])
|
||||
|
||||
# Reconstruct dense layers
@@ -216,6 +216,9 @@ def extract_decoder_model(model):
class Seq2Seq_Inference(object):
|
||||
|
||||
# pylint: disable=too-many-instance-attributes
|
||||
|
||||
def __init__(self,
|
||||
encoder_preprocessor,
|
||||
decoder_preprocessor,
@@ -291,18 +294,18 @@ class Seq2Seq_Inference(object):
"""
|
||||
if i:
|
||||
print('\n\n==============================================')
|
||||
print(f'============== Example # {i} =================\n')
|
||||
print('============== Example # {} =================\n'.format(i))
|
||||
|
||||
if url:
|
||||
print(url)
|
||||
|
||||
print(f"Issue Body:\n {body_text} \n")
|
||||
print("Issue Body:\n {} \n".format(body_text))
|
||||
|
||||
if title_text:
|
||||
print(f"Original Title:\n {title_text}")
|
||||
print("Original Title:\n {}".format(title_text))
|
||||
|
||||
emb, gen_title = self.generate_issue_title(body_text)
|
||||
print(f"\n****** Machine Generated Title (Prediction) ******:\n {gen_title}")
|
||||
print("\n****** Machine Generated Title (Prediction) ******:\n {}".format(gen_title))
|
||||
|
||||
if self.nn:
|
||||
# return neighbors and distances
@@ -315,7 +318,7 @@ class Seq2Seq_Inference(object):
cols = ['issue_url', 'issue_title', 'body']
|
||||
dfcopy = self.rec_df.iloc[neighbors][cols].copy(deep=True)
|
||||
dfcopy['dist'] = dist
|
||||
similar_issues_df = dfcopy.query(f'dist <= {threshold}')
|
||||
similar_issues_df = dfcopy.query('dist <= {}'.format(threshold))
|
||||
|
||||
print("\n**** Similar Issues (using encoder embedding) ****:\n")
|
||||
display(similar_issues_df)
@@ -1,8 +1,6 @@
import argparse
|
||||
import keras
|
||||
import pandas as pd
|
||||
from seq2seq_utils import load_decoder_inputs
|
||||
from seq2seq_utils import load_encoder_inputs
|
||||
from seq2seq_utils import load_text_processor
|
||||
from seq2seq_utils import Seq2Seq_Inference
|
@@ -1,7 +1,7 @@
import argparse
|
||||
import dill as dpickle
|
||||
from ktext.preprocess import processor
|
||||
import numpy as np
|
||||
from ktext.preprocess import processor
|
||||
import pandas as pd
|
||||
|
||||
# Parsing flags.
@@ -1,6 +1,4 @@
import argparse
|
||||
import glob
|
||||
import logging
|
||||
import pandas as pd
|
||||
from sklearn.model_selection import train_test_split
@@ -20,8 +18,8 @@ traindf, testdf = train_test_split(pd.read_csv(args.input_csv).sample(n=args.sam
test_size=.10)
|
||||
|
||||
# Print stats about the shape of the data.
|
||||
print(f'Train: {traindf.shape[0]:,} rows {traindf.shape[1]:,} columns')
|
||||
print(f'Test: {testdf.shape[0]:,} rows {testdf.shape[1]:,} columns')
|
||||
print('Train: {:,} rows {:,} columns'.format(traindf.shape[0], traindf.shape[1]))
|
||||
print('Test: {:,} rows {:,} columns'.format(testdf.shape[0], testdf.shape[1]))
|
||||
|
||||
# Store output as CSV.
|
||||
traindf.to_csv(args.output_traindf_csv)
@@ -1,8 +1,6 @@
import argparse
|
||||
import keras
|
||||
import pandas as pd
|
||||
from seq2seq_utils import load_decoder_inputs
|
||||
from seq2seq_utils import load_encoder_inputs
|
||||
from seq2seq_utils import load_text_processor
|
||||
from seq2seq_utils import Seq2Seq_Inference
|
||||
workflow/workspace/src/seq2seq_utils.py
@@ -1,393 +0,0 @@
from matplotlib import pyplot as plt
|
||||
import tensorflow as tf
|
||||
from keras import backend as K
|
||||
from keras.layers import Input
|
||||
from keras.models import Model
|
||||
from IPython.display import SVG, display
|
||||
from keras.utils.vis_utils import model_to_dot
|
||||
import logging
|
||||
import numpy as np
|
||||
import dill as dpickle
|
||||
from annoy import AnnoyIndex
|
||||
from tqdm import tqdm, tqdm_notebook
|
||||
from random import random
|
||||
from nltk.translate.bleu_score import corpus_bleu
|
||||
|
||||
def load_text_processor(fname='title_pp.dpkl'):
|
||||
"""
|
||||
Load preprocessors from disk.
|
||||
Parameters
|
||||
----------
|
||||
fname: str
|
||||
file name of ktext.proccessor object
|
||||
Returns
|
||||
-------
|
||||
num_tokens : int
|
||||
size of vocabulary loaded into ktext.processor
|
||||
pp : ktext.processor
|
||||
the processor you are trying to load
|
||||
Typical Usage:
|
||||
-------------
|
||||
num_decoder_tokens, title_pp = load_text_processor(fname='title_pp.dpkl')
|
||||
num_encoder_tokens, body_pp = load_text_processor(fname='body_pp.dpkl')
|
||||
"""
|
||||
# Load files from disk
|
||||
with open(fname, 'rb') as f:
|
||||
pp = dpickle.load(f)
|
||||
|
||||
num_tokens = max(pp.id2token.keys()) + 1
|
||||
print(f'Size of vocabulary for {fname}: {num_tokens:,}')
|
||||
return num_tokens, pp
|
||||
|
||||
|
||||
def load_decoder_inputs(decoder_np_vecs='train_title_vecs.npy'):
|
||||
"""
|
||||
Load decoder inputs.
|
||||
Parameters
|
||||
----------
|
||||
decoder_np_vecs : str
|
||||
filename of serialized numpy.array of decoder input (issue title)
|
||||
Returns
|
||||
-------
|
||||
decoder_input_data : numpy.array
|
||||
The data fed to the decoder as input during training for teacher forcing.
|
||||
This is the same as `decoder_np_vecs` except the last position.
|
||||
decoder_target_data : numpy.array
|
||||
The data that the decoder data is trained to generate (issue title).
|
||||
Calculated by sliding `decoder_np_vecs` one position forward.
|
||||
"""
|
||||
vectorized_title = np.load(decoder_np_vecs)
|
||||
# For Decoder Input, you don't need the last word as that is only for prediction
|
||||
# when we are training using Teacher Forcing.
|
||||
decoder_input_data = vectorized_title[:, :-1]
|
||||
|
||||
# Decoder Target Data Is Ahead By 1 Time Step From Decoder Input Data (Teacher Forcing)
|
||||
decoder_target_data = vectorized_title[:, 1:]
|
||||
|
||||
print(f'Shape of decoder input: {decoder_input_data.shape}')
|
||||
print(f'Shape of decoder target: {decoder_target_data.shape}')
|
||||
return decoder_input_data, decoder_target_data
|
||||
|
||||
|
||||
def load_encoder_inputs(encoder_np_vecs='train_body_vecs.npy'):
|
||||
"""
|
||||
Load variables & data that are inputs to encoder.
|
||||
Parameters
|
||||
----------
|
||||
encoder_np_vecs : str
|
||||
filename of serialized numpy.array of encoder input (issue title)
|
||||
Returns
|
||||
-------
|
||||
encoder_input_data : numpy.array
|
||||
The issue body
|
||||
doc_length : int
|
||||
The standard document length of the input for the encoder after padding
|
||||
the shape of this array will be (num_examples, doc_length)
|
||||
"""
|
||||
vectorized_body = np.load(encoder_np_vecs)
|
||||
# Encoder input is simply the body of the issue text
|
||||
encoder_input_data = vectorized_body
|
||||
doc_length = encoder_input_data.shape[1]
|
||||
print(f'Shape of encoder input: {encoder_input_data.shape}')
|
||||
return encoder_input_data, doc_length
|
||||
|
||||
|
||||
def viz_model_architecture(model):
|
||||
"""Visualize model architecture in Jupyter notebook."""
|
||||
display(SVG(model_to_dot(model).create(prog='dot', format='svg')))
|
||||
|
||||
|
||||
def free_gpu_mem():
|
||||
"""Attempt to free gpu memory."""
|
||||
K.get_session().close()
|
||||
cfg = K.tf.ConfigProto()
|
||||
cfg.gpu_options.allow_growth = True
|
||||
K.set_session(K.tf.Session(config=cfg))
|
||||
|
||||
|
||||
def test_gpu():
|
||||
"""Run a toy computation task in tensorflow to test GPU."""
|
||||
config = tf.ConfigProto()
|
||||
config.gpu_options.allow_growth = True
|
||||
session = tf.Session(config=config)
|
||||
hello = tf.constant('Hello, TensorFlow!')
|
||||
print(session.run(hello))
|
||||
|
||||
|
||||
def plot_model_training_history(history_object):
|
||||
"""Plots model train vs. validation loss."""
|
||||
plt.title('model accuracy')
|
||||
plt.ylabel('accuracy')
|
||||
plt.xlabel('epoch')
|
||||
plt.plot(history_object.history['loss'])
|
||||
plt.plot(history_object.history['val_loss'])
|
||||
plt.legend(['train', 'test'], loc='upper left')
|
||||
plt.show()
|
||||
|
||||
|
||||
def extract_encoder_model(model):
|
||||
"""
|
||||
Extract the encoder from the original Sequence to Sequence Model.
|
||||
Returns a keras model object that has one input (body of issue) and one
|
||||
output (encoding of issue, which is the last hidden state).
|
||||
Input:
|
||||
-----
|
||||
model: keras model object
|
||||
Returns:
|
||||
-----
|
||||
keras model object
|
||||
"""
|
||||
encoder_model = model.get_layer('Encoder-Model')
|
||||
return encoder_model
|
||||
|
||||
|
||||
def extract_decoder_model(model):
|
||||
"""
|
||||
Extract the decoder from the original model.
|
||||
Inputs:
|
||||
------
|
||||
model: keras model object
|
||||
Returns:
|
||||
-------
|
||||
A Keras model object with the following inputs and outputs:
|
||||
Inputs of Keras Model That Is Returned:
|
||||
1: the embedding index for the last predicted word or the <Start> indicator
|
||||
2: the last hidden state, or in the case of the first word the hidden state from the encoder
|
||||
Outputs of Keras Model That Is Returned:
|
||||
1. Prediction (class probabilities) for the next word
|
||||
2. The hidden state of the decoder, to be fed back into the decoder at the next time step
|
||||
Implementation Notes:
|
||||
----------------------
|
||||
Must extract relevant layers and reconstruct part of the computation graph
|
||||
to allow for different inputs as we are not going to use teacher forcing at
|
||||
inference time.
|
||||
"""
|
||||
# the latent dimension is the same throughout the architecture so we are going to
|
||||
# cheat and grab the latent dimension of the embedding because that is the same as what is
|
||||
# output from the decoder
|
||||
latent_dim = model.get_layer('Decoder-Word-Embedding').output_shape[-1]
|
||||
|
||||
# Reconstruct the input into the decoder
|
||||
decoder_inputs = model.get_layer('Decoder-Input').input
|
||||
dec_emb = model.get_layer('Decoder-Word-Embedding')(decoder_inputs)
|
||||
dec_bn = model.get_layer('Decoder-Batchnorm-1')(dec_emb)
|
||||
|
||||
# Instead of setting the intial state from the encoder and forgetting about it, during inference
|
||||
# we are not doing teacher forcing, so we will have to have a feedback loop from predictions back into
|
||||
# the GRU, thus we define this input layer for the state so we can add this capability
|
||||
gru_inference_state_input = Input(shape=(latent_dim,), name='hidden_state_input')
|
||||
|
||||
# we need to reuse the weights that is why we are getting this
|
||||
# If you inspect the decoder GRU that we created for training, it will take as input
|
||||
# 2 tensors -> (1) is the embedding layer output for the teacher forcing
|
||||
# (which will now be the last step's prediction, and will be _start_ on the first time step)
|
||||
# (2) is the state, which we will initialize with the encoder on the first time step, but then
|
||||
# grab the state after the first prediction and feed that back in again.
|
||||
gru_out, gru_state_out = model.get_layer('Decoder-GRU')([dec_bn, gru_inference_state_input])
|
||||
|
||||
# Reconstruct dense layers
|
||||
dec_bn2 = model.get_layer('Decoder-Batchnorm-2')(gru_out)
|
||||
dense_out = model.get_layer('Final-Output-Dense')(dec_bn2)
|
||||
decoder_model = Model([decoder_inputs, gru_inference_state_input],
|
||||
[dense_out, gru_state_out])
|
||||
return decoder_model
|
||||
|
||||
|
||||
class Seq2Seq_Inference(object):
|
||||
def __init__(self,
|
||||
encoder_preprocessor,
|
||||
decoder_preprocessor,
|
||||
seq2seq_model):
|
||||
|
||||
self.pp_body = encoder_preprocessor
|
||||
self.pp_title = decoder_preprocessor
|
||||
self.seq2seq_model = seq2seq_model
|
||||
self.encoder_model = extract_encoder_model(seq2seq_model)
|
||||
self.decoder_model = extract_decoder_model(seq2seq_model)
|
||||
self.default_max_len_title = self.pp_title.padding_maxlen
|
||||
self.nn = None
|
||||
self.rec_df = None
|
||||
|
||||
def generate_issue_title(self,
|
||||
raw_input_text,
|
||||
max_len_title=None):
|
||||
"""
|
||||
Use the seq2seq model to generate a title given the body of an issue.
|
||||
Inputs
|
||||
------
|
||||
raw_input: str
|
||||
The body of the issue text as an input string
|
||||
max_len_title: int (optional)
|
||||
The maximum length of the title the model will generate
|
||||
"""
|
||||
if max_len_title is None:
|
||||
max_len_title = self.default_max_len_title
|
||||
# get the encoder's features for the decoder
|
||||
raw_tokenized = self.pp_body.transform([raw_input_text])
|
||||
body_encoding = self.encoder_model.predict(raw_tokenized)
|
||||
# we want to save the encoder's embedding before its updated by decoder
|
||||
# because we can use that as an embedding for other tasks.
|
||||
original_body_encoding = body_encoding
|
||||
state_value = np.array(self.pp_title.token2id['_start_']).reshape(1, 1)
|
||||
|
||||
decoded_sentence = []
|
||||
stop_condition = False
|
||||
while not stop_condition:
|
||||
preds, st = self.decoder_model.predict([state_value, body_encoding])
|
||||
|
||||
# We are going to ignore indices 0 (padding) and indices 1 (unknown)
|
||||
# Argmax will return the integer index corresponding to the
|
||||
# prediction + 2 b/c we chopped off first two
|
||||
pred_idx = np.argmax(preds[:, :, 2:]) + 2
|
||||
|
||||
# retrieve word from index prediction
|
||||
pred_word_str = self.pp_title.id2token[pred_idx]
|
||||
|
||||
if pred_word_str == '_end_' or len(decoded_sentence) >= max_len_title:
|
||||
stop_condition = True
|
||||
break
|
||||
decoded_sentence.append(pred_word_str)
|
||||
|
||||
# update the decoder for the next word
|
||||
body_encoding = st
|
||||
state_value = np.array(pred_idx).reshape(1, 1)
|
||||
|
||||
return original_body_encoding, ' '.join(decoded_sentence)
|
||||
|
||||
|
||||
def print_example(self,
|
||||
i,
|
||||
body_text,
|
||||
title_text,
|
||||
url,
|
||||
threshold):
|
||||
"""
|
||||
Prints an example of the model's prediction for manual inspection.
|
||||
"""
|
||||
if i:
|
||||
print('\n\n==============================================')
|
||||
print(f'============== Example # {i} =================\n')
|
||||
|
||||
if url:
|
||||
print(url)
|
||||
|
||||
print(f"Issue Body:\n {body_text} \n")
|
||||
|
||||
if title_text:
|
||||
print(f"Original Title:\n {title_text}")
|
||||
|
||||
emb, gen_title = self.generate_issue_title(body_text)
|
||||
print(f"\n****** Machine Generated Title (Prediction) ******:\n {gen_title}")
|
||||
|
||||
if self.nn:
|
||||
# return neighbors and distances
|
||||
n, d = self.nn.get_nns_by_vector(emb.flatten(), n=4,
|
||||
include_distances=True)
|
||||
neighbors = n[1:]
|
||||
dist = d[1:]
|
||||
|
||||
if min(dist) <= threshold:
|
||||
cols = ['issue_url', 'issue_title', 'body']
|
||||
dfcopy = self.rec_df.iloc[neighbors][cols].copy(deep=True)
|
||||
dfcopy['dist'] = dist
|
||||
similar_issues_df = dfcopy.query(f'dist <= {threshold}')
|
||||
|
||||
print("\n**** Similar Issues (using encoder embedding) ****:\n")
|
||||
display(similar_issues_df)
|
||||
|
||||
|
||||
def demo_model_predictions(self,
|
||||
n,
|
||||
issue_df,
|
||||
threshold=1):
|
||||
"""
|
||||
Pick n random Issues and display predictions.
|
||||
Input:
|
||||
------
|
||||
n : int
|
||||
Number of issues to display from issue_df
|
||||
issue_df : pandas DataFrame
|
||||
DataFrame that contains two columns: `body` and `issue_title`.
|
||||
threshold : float
|
||||
distance threshold for recommendation of similar issues.
|
||||
Returns:
|
||||
--------
|
||||
None
|
||||
Prints the original issue body and the model's prediction.
|
||||
"""
|
||||
# Extract body and title from DF
|
||||
body_text = issue_df.body.tolist()
|
||||
title_text = issue_df.issue_title.tolist()
|
||||
url = issue_df.issue_url.tolist()
|
||||
|
||||
demo_list = np.random.randint(low=1, high=len(body_text), size=n)
|
||||
for i in demo_list:
|
||||
self.print_example(i,
|
||||
body_text=body_text[i],
|
||||
title_text=title_text[i],
|
||||
url=url[i],
|
||||
threshold=threshold)
|
||||
|
||||
def prepare_recommender(self, vectorized_array, original_df):
|
||||
"""
|
||||
Use the annoy library to build recommender
|
||||
Parameters
|
||||
----------
|
||||
vectorized_array : List[List[int]]
|
||||
This is the list of list of integers that represents your corpus
|
||||
that is fed into the seq2seq model for training.
|
||||
original_df : pandas.DataFrame
|
||||
This is the original dataframe that has the columns
|
||||
['issue_url', 'issue_title', 'body']
|
||||
Returns
|
||||
-------
|
||||
annoy.AnnoyIndex object (see https://github.com/spotify/annoy)
|
||||
"""
|
||||
self.rec_df = original_df
|
||||
emb = self.encoder_model.predict(x=vectorized_array,
|
||||
batch_size=vectorized_array.shape[0]//200)
|
||||
|
||||
f = emb.shape[1]
|
||||
self.nn = AnnoyIndex(f)
|
||||
logging.warning('Adding embeddings')
|
||||
for i in tqdm(range(len(emb))):
|
||||
self.nn.add_item(i, emb[i])
|
||||
logging.warning('Building trees for similarity lookup.')
|
||||
self.nn.build(50)
|
||||
return self.nn
|
||||
|
||||
def set_recsys_data(self, original_df):
|
||||
self.rec_df = original_df
|
||||
|
||||
def set_recsys_annoyobj(self, annoyobj):
|
||||
self.nn = annoyobj
|
||||
|
||||
def evaluate_model(self, holdout_bodies, holdout_titles):
|
||||
"""
|
||||
Method for calculating BLEU Score.
|
||||
Parameters
|
||||
----------
|
||||
holdout_bodies : List[str]
|
||||
These are the issue bodies that we want to summarize
|
||||
holdout_titles : List[str]
|
||||
This is the ground truth we are trying to predict --> issue titles
|
||||
Returns
|
||||
-------
|
||||
bleu : float
|
||||
The BLEU Score
|
||||
"""
|
||||
actual, predicted = list(), list()
|
||||
assert len(holdout_bodies) == len(holdout_titles)
|
||||
num_examples = len(holdout_bodies)
|
||||
|
||||
logging.warning('Generating predictions.')
|
||||
# step over the whole set TODO: parallelize this
|
||||
for i in tqdm_notebook(range(num_examples)):
|
||||
_, yhat = self.generate_issue_title(holdout_bodies[i])
|
||||
|
||||
actual.append(self.pp_title.process_text([holdout_titles[i]])[0])
|
||||
predicted.append(self.pp_title.process_text([yhat])[0])
|
||||
# calculate BLEU score
|
||||
logging.warning('Calculating BLEU.')
|
||||
bleu = corpus_bleu(actual, predicted)
|
||||
return bleu
@@ -1,11 +1,10 @@
import argparse
|
||||
import numpy as np
|
||||
from keras.callbacks import CSVLogger, ModelCheckpoint
|
||||
from keras.layers import Input, LSTM, GRU, Dense, Embedding, Bidirectional, BatchNormalization
|
||||
from keras.layers import Input, GRU, Dense, Embedding, BatchNormalization
|
||||
from keras.models import Model
|
||||
from keras import optimizers
|
||||
import numpy as np
|
||||
from seq2seq_utils import load_decoder_inputs, load_encoder_inputs, load_text_processor
|
||||
from seq2seq_utils import viz_model_architecture
|
||||
|
||||
# Parsing flags.
|
||||
parser = argparse.ArgumentParser()
@@ -35,7 +34,10 @@ latent_dim = 300
encoder_inputs = Input(shape=(doc_length,), name='Encoder-Input')
|
||||
|
||||
# Word embeding for encoder (ex: Issue Body)
|
||||
x = Embedding(num_encoder_tokens, latent_dim, name='Body-Word-Embedding', mask_zero=False)(encoder_inputs)
|
||||
x = Embedding(num_encoder_tokens,
|
||||
latent_dim,
|
||||
name='Body-Word-Embedding',
|
||||
mask_zero=False)(encoder_inputs)
|
||||
x = BatchNormalization(name='Encoder-Batchnorm-1')(x)
|
||||
|
||||
# We do not need the `encoder_output` just the hidden state.
@@ -53,7 +55,10 @@ seq2seq_encoder_out = encoder_model(encoder_inputs)
decoder_inputs = Input(shape=(None,), name='Decoder-Input') # for teacher forcing
|
||||
|
||||
# Word Embedding For Decoder (ex: Issue Titles)
|
||||
dec_emb = Embedding(num_decoder_tokens, latent_dim, name='Decoder-Word-Embedding', mask_zero=False)(decoder_inputs)
|
||||
dec_emb = Embedding(num_decoder_tokens,
|
||||
latent_dim,
|
||||
name='Decoder-Word-Embedding',
|
||||
mask_zero=False)(decoder_inputs)
|
||||
dec_bn = BatchNormalization(name='Decoder-Batchnorm-1')(dec_emb)
|
||||
|
||||
# Set up the decoder, using `decoder_state_input` as initial state.
|
@@ -71,21 +76,24 @@ decoder_outputs = decoder_dense(x)
seq2seq_Model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
|
||||
|
||||
seq2seq_Model.compile(optimizer=optimizers.Nadam(lr=learning_rate), loss='sparse_categorical_crossentropy')
|
||||
seq2seq_Model.compile(optimizer=optimizers.Nadam(lr=learning_rate),
|
||||
loss='sparse_categorical_crossentropy')
|
||||
|
||||
seq2seq_Model.summary()
|
||||
|
||||
script_name_base = 'tutorial_seq2seq'
|
||||
csv_logger = CSVLogger('{:}.log'.format(script_name_base))
|
||||
model_checkpoint = ModelCheckpoint('{:}.epoch{{epoch:02d}}-val{{val_loss:.5f}}.hdf5'.format(script_name_base),
|
||||
save_best_only=True)
|
||||
model_checkpoint = ModelCheckpoint(
|
||||
'{:}.epoch{{epoch:02d}}-val{{val_loss:.5f}}.hdf5'.format(script_name_base), save_best_only=True)
|
||||
|
||||
batch_size = 1200
|
||||
epochs = 7
|
||||
history = seq2seq_Model.fit([encoder_input_data, decoder_input_data], np.expand_dims(decoder_target_data, -1),
|
||||
history = seq2seq_Model.fit([encoder_input_data, decoder_input_data],
|
||||
np.expand_dims(decoder_target_data, -1),
|
||||
batch_size=batch_size,
|
||||
epochs=epochs,
|
||||
validation_split=0.12, callbacks=[csv_logger, model_checkpoint])
|
||||
validation_split=0.12,
|
||||
callbacks=[csv_logger, model_checkpoint])
|
||||
|
||||
#############
|
||||
# Save model.