mirror of https://github.com/kubeflow/examples.git
Add .pylintrc (#61)
* Add .pylintrc
* Resolve lint complaints in agents/trainer/task.py
* Resolve lint complaints with flask app.py
* Resolve linting issues
* Remove duplicate seq2seq_utils.py from workflow/workspace/src
* Use python 3.5.2 with pylint to match prow
* Put pybullet import back into agents/trainer/task.py with a pylint ignore statement
* Use main(_) to ensure it works with tf.app.run
This commit is contained in:
parent 1d6946ead8
commit 41372c9314
@@ -0,0 +1,399 @@
[MASTER]

# Specify a configuration file.
#rcfile=

# Python code to execute, usually for sys.path manipulation such as
# pygtk.require().
#init-hook=

# Add files or directories to the blacklist. They should be base names, not
# paths.
ignore=third_party

# Add files or directories matching the regex patterns to the blacklist. The
# regex matches against base names, not paths.
ignore-patterns=

# Pickle collected data for later comparisons.
persistent=no

# List of plugins (as comma separated values of python modules names) to load,
# usually to register additional checkers.
load-plugins=

# Use multiple processes to speed up Pylint.
jobs=4

# Allow loading of arbitrary C extensions. Extensions are imported into the
# active Python interpreter and may run arbitrary code.
unsafe-load-any-extension=no

# A comma-separated list of package or module names from where C extensions may
# be loaded. Extensions are loaded into the active Python interpreter and may
# run arbitrary code.
extension-pkg-whitelist=


[MESSAGES CONTROL]

# Only show warnings with the listed confidence levels. Leave empty to show
# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED
confidence=

# Enable the message, report, category or checker with the given id(s). You can
# either give multiple identifiers separated by comma (,) or put this option
# multiple times (only on the command line, not in the configuration file where
# it should appear only once). See also the "--disable" option for examples.
#enable=

# Disable the message, report, category or checker with the given id(s). You
# can either give multiple identifiers separated by comma (,) or put this
# option multiple times (only on the command line, not in the configuration
# file where it should appear only once). You can also use "--disable=all" to
# disable everything first and then reenable specific checks. For example, if
# you want to run only the similarities checker, you can use "--disable=all
# --enable=similarities". If you want to run only the classes checker, but have
# no Warning level messages displayed, use "--disable=all --enable=classes
# --disable=W"
disable=import-star-module-level,old-octal-literal,oct-method,print-statement,unpacking-in-except,parameter-unpacking,backtick,old-raise-syntax,old-ne-operator,long-suffix,dict-view-method,dict-iter-method,metaclass-assignment,next-method-called,raising-string,indexing-exception,raw_input-builtin,long-builtin,file-builtin,execfile-builtin,coerce-builtin,cmp-builtin,buffer-builtin,basestring-builtin,apply-builtin,filter-builtin-not-iterating,using-cmp-argument,useless-suppression,range-builtin-not-iterating,suppressed-message,missing-docstring,no-absolute-import,old-division,cmp-method,reload-builtin,zip-builtin-not-iterating,intern-builtin,unichr-builtin,reduce-builtin,standarderror-builtin,unicode-builtin,xrange-builtin,coerce-method,delslice-method,getslice-method,setslice-method,input-builtin,round-builtin,hex-method,nonzero-method,map-builtin-not-iterating,relative-import,invalid-name,bad-continuation,no-member,locally-disabled,fixme,import-error,too-many-locals


[REPORTS]

# Set the output format. Available formats are text, parseable, colorized, msvs
# (visual studio) and html. You can also give a reporter class, eg
# mypackage.mymodule.MyReporterClass.
output-format=text

# Put messages in a separate file for each module / package specified on the
# command line instead of printing them on stdout. Reports (if any) will be
# written in a file name "pylint_global.[txt|html]". This option is deprecated
# and it will be removed in Pylint 2.0.
files-output=no

# Tells whether to display a full report or only the messages
reports=no

# Python expression which should return a note less than 10 (10 is the highest
# note). You have access to the variables error, warning, refactor, convention
# and statement, which respectively contain the number of messages in each
# category and the total number of statements analyzed. This is used by the
# global evaluation report (RP0004).
evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)

# Template used to display messages. This is a python new-style format string
# used to format the message information. See doc for all details
#msg-template=


[BASIC]

# Good variable names which should always be accepted, separated by a comma
good-names=i,j,k,ex,Run,_

# Bad variable names which should always be refused, separated by a comma
bad-names=foo,bar,baz,toto,tutu,tata

# Colon-delimited sets of names that determine each other's naming style when
# the name regexes allow several styles.
name-group=

# Include a hint for the correct naming format with invalid-name
include-naming-hint=no

# List of decorators that produce properties, such as abc.abstractproperty. Add
# to this list to register other decorators that produce valid properties.
property-classes=abc.abstractproperty

# Regular expression matching correct function names
function-rgx=[a-z_][a-z0-9_]{2,30}$

# Naming hint for function names
function-name-hint=[a-z_][a-z0-9_]{2,30}$

# Regular expression matching correct variable names
variable-rgx=[a-z_][a-z0-9_]{2,30}$

# Naming hint for variable names
variable-name-hint=[a-z_][a-z0-9_]{2,30}$

# Regular expression matching correct constant names
const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$

# Naming hint for constant names
const-name-hint=(([A-Z_][A-Z0-9_]*)|(__.*__))$

# Regular expression matching correct attribute names
attr-rgx=[a-z_][a-z0-9_]{2,30}$

# Naming hint for attribute names
attr-name-hint=[a-z_][a-z0-9_]{2,30}$

# Regular expression matching correct argument names
argument-rgx=[a-z_][a-z0-9_]{2,30}$

# Naming hint for argument names
argument-name-hint=[a-z_][a-z0-9_]{2,30}$

# Regular expression matching correct class attribute names
class-attribute-rgx=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$

# Naming hint for class attribute names
class-attribute-name-hint=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$

# Regular expression matching correct inline iteration names
inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$

# Naming hint for inline iteration names
inlinevar-name-hint=[A-Za-z_][A-Za-z0-9_]*$

# Regular expression matching correct class names
class-rgx=[A-Z_][a-zA-Z0-9]+$

# Naming hint for class names
class-name-hint=[A-Z_][a-zA-Z0-9]+$

# Regular expression matching correct module names
module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$

# Naming hint for module names
module-name-hint=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$

# Regular expression matching correct method names
method-rgx=[a-z_][a-z0-9_]{2,30}$

# Naming hint for method names
method-name-hint=[a-z_][a-z0-9_]{2,30}$

# Regular expression which should only match function or class names that do
# not require a docstring.
no-docstring-rgx=^_

# Minimum line length for functions/classes that require docstrings, shorter
# ones are exempt.
docstring-min-length=-1


[ELIF]

# Maximum number of nested blocks for function / method body
max-nested-blocks=5


[TYPECHECK]

# Tells whether missing members accessed in mixin class should be ignored. A
# mixin class is detected if its name ends with "mixin" (case insensitive).
ignore-mixin-members=yes

# List of module names for which member attributes should not be checked
# (useful for modules/projects where namespaces are manipulated during runtime
# and thus existing member attributes cannot be deduced by static analysis. It
# supports qualified module names, as well as Unix pattern matching.
ignored-modules=

# List of class names for which member attributes should not be checked (useful
# for classes with dynamically set attributes). This supports the use of
# qualified names.
ignored-classes=optparse.Values,thread._local,_thread._local

# List of members which are set dynamically and missed by pylint inference
# system, and so shouldn't trigger E1101 when accessed. Python regular
# expressions are accepted.
generated-members=

# List of decorators that produce context managers, such as
# contextlib.contextmanager. Add to this list to register other decorators that
# produce valid context managers.
contextmanager-decorators=contextlib.contextmanager


[FORMAT]

# Maximum number of characters on a single line.
max-line-length=100

# Regexp for a line that is allowed to be longer than the limit.
ignore-long-lines=^\s*(# )?<?https?://\S+>?$

# Allow the body of an if to be on the same line as the test if there is no
# else.
single-line-if-stmt=no

# List of optional constructs for which whitespace checking is disabled. `dict-
# separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}.
# `trailing-comma` allows a space between comma and closing bracket: (a, ).
# `empty-line` allows space-only lines.
no-space-check=trailing-comma,dict-separator

# Maximum number of lines in a module
max-module-lines=1000

# String used as indentation unit. This is usually "    " (4 spaces) or "\t"
# (1 tab).
# Use 2 spaces consistent with TensorFlow style.
indent-string='  '

# Number of spaces of indent required inside a hanging or continued line.
indent-after-paren=4

# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
expected-line-ending-format=


[MISCELLANEOUS]

# List of note tags to take in consideration, separated by a comma.
notes=FIXME,XXX,TODO


[VARIABLES]

# Tells whether we should check for unused import in __init__ files.
init-import=no

# A regular expression matching the name of dummy variables (i.e. expectedly
# not used).
dummy-variables-rgx=(_+[a-zA-Z0-9]*?$)|dummy

# List of additional names supposed to be defined in builtins. Remember that
# you should avoid to define new builtins when possible.
additional-builtins=

# List of strings which can identify a callback function by name. A callback
# name must start or end with one of those strings.
callbacks=cb_,_cb

# List of qualified module names which can have objects that can redefine
# builtins.
redefining-builtins-modules=six.moves,future.builtins


[LOGGING]

# Logging modules to check that the string format arguments are in logging
# function parameter format
logging-modules=logging


[SIMILARITIES]

# Minimum lines number of a similarity.
min-similarity-lines=4

# Ignore comments when computing similarities.
ignore-comments=yes

# Ignore docstrings when computing similarities.
ignore-docstrings=yes

# Ignore imports when computing similarities.
ignore-imports=no


[SPELLING]

# Spelling dictionary name. Available dictionaries: none. To make it work,
# install the python-enchant package.
spelling-dict=

# List of comma separated words that should not be checked.
spelling-ignore-words=

# A path to a file that contains private dictionary; one word per line.
spelling-private-dict-file=

# Tells whether to store unknown words to indicated private dictionary in
# --spelling-private-dict-file option instead of raising a message.
spelling-store-unknown-words=no


[IMPORTS]

# Deprecated modules which should not be used, separated by a comma
deprecated-modules=regsub,TERMIOS,Bastion,rexec

# Create a graph of every (i.e. internal and external) dependencies in the
# given file (report RP0402 must not be disabled)
import-graph=

# Create a graph of external dependencies in the given file (report RP0402 must
# not be disabled)
ext-import-graph=

# Create a graph of internal dependencies in the given file (report RP0402 must
# not be disabled)
int-import-graph=

# Force import order to recognize a module as part of the standard
# compatibility libraries.
known-standard-library=

# Force import order to recognize a module as part of a third party library.
known-third-party=enchant

# Analyse import fallback blocks. This can be used to support both Python 2 and
# 3 compatible code, which means that the block might have code that exists
# only in one or another interpreter, leading to false positives when analysed.
analyse-fallback-blocks=no


[DESIGN]

# Maximum number of arguments for function / method
max-args=7

# Argument names that match this expression will be ignored. Default to name
# with leading underscore
ignored-argument-names=_.*

# Maximum number of locals for function / method body
max-locals=15

# Maximum number of return / yield for function / method body
max-returns=6

# Maximum number of branches for function / method body
max-branches=12

# Maximum number of statements in function / method body
max-statements=50

# Maximum number of parents for a class (see R0901).
max-parents=7

# Maximum number of attributes for a class (see R0902).
max-attributes=7

# Minimum number of public methods for a class (see R0903).
min-public-methods=0

# Maximum number of public methods for a class (see R0904).
max-public-methods=20

# Maximum number of boolean expressions in an if statement
max-bool-expr=5


[CLASSES]

# List of method names used to declare (i.e. assign) instance attributes.
defining-attr-methods=__init__,__new__,setUp

# List of valid names for the first argument in a class method.
valid-classmethod-first-arg=cls

# List of valid names for the first argument in a metaclass class method.
valid-metaclass-classmethod-first-arg=mcs

# List of member names, which should be excluded from the protected access
# warning.
exclude-protected=_asdict,_fields,_replace,_source,_make


[EXCEPTIONS]

# Exceptions that will emit a warning when being caught. Defaults to
# "Exception"
overgeneral-exceptions=Exception
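A quick way to exercise the new configuration locally is to drive the pylint CLI from Python, the way CI would. This is a sketch, not part of the commit: it assumes pylint is installed and that .pylintrc sits at the repository root, and it targets one of the files this commit fixes.

    import subprocess

    # Lint one of the files fixed in this commit against the new .pylintrc.
    result = subprocess.run(
        ['pylint', '--rcfile=.pylintrc', 'agents/trainer/task.py'],
        stdout=subprocess.PIPE, stderr=subprocess.PIPE,
        universal_newlines=True)
    print(result.stdout)
    # pylint exits non-zero whenever it emits messages, so a zero return
    # code means the file is clean under this configuration.
    print('clean' if result.returncode == 0 else 'lint messages found')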
@@ -12,18 +12,18 @@

 """Provides an entrypoint for the training task."""

+#pylint: disable=unused-import
+
 from __future__ import absolute_import, division, print_function

-import argparse
 import datetime
 import logging
 import os
 import pprint
 import uuid

-import pip
-import tensorflow as tf
 from google.cloud import storage
+import tensorflow as tf

 import agents
 import pybullet_envs  # To make AntBulletEnv-v0 available.
@@ -113,39 +113,39 @@ def hparams_base():
   """Base hparams tf/Agents PPO """

   # General
-  algorithm = agents.ppo.PPOAlgorithm
-  num_agents = 30
-  eval_episodes = 30
-  use_gpu = False
+  # algorithm = agents.ppo.PPOAlgorithm
+  # num_agents = 30
+  # eval_episodes = 30
+  # use_gpu = False

   # Environment
-  env = 'KukaBulletEnv-v0'
-  normalize_ranges = True
-  max_length = 1000
+  # env = 'KukaBulletEnv-v0'
+  # normalize_ranges = True
+  # max_length = 1000

   # Network
-  network = agents.scripts.networks.feed_forward_gaussian
-  weight_summaries = dict(
-      all=r'.*', policy=r'.*/policy/.*', value=r'.*/value/.*')
-  policy_layers = 200, 100
-  value_layers = 200, 100
-  init_output_factor = 0.1
-  init_logstd = -1
-  init_std = 0.35
+  # network = agents.scripts.networks.feed_forward_gaussian
+  # weight_summaries = dict(
+  #     all=r'.*', policy=r'.*/policy/.*', value=r'.*/value/.*')
+  # policy_layers = 200, 100
+  # value_layers = 200, 100
+  # init_output_factor = 0.1
+  # init_logstd = -1
+  # init_std = 0.35

   # Optimization
-  update_every = 60
-  update_epochs = 25
-  optimizer = tf.train.AdamOptimizer
-  learning_rate = 1e-4
-  steps = 3e7 # 30M
+  # update_every = 60
+  # update_epochs = 25
+  # optimizer = tf.train.AdamOptimizer
+  # learning_rate = 1e-4
+  # steps = 3e7 # 30M

   # Losses
-  discount = 0.995
-  kl_target = 1e-2
-  kl_cutoff_factor = 2
-  kl_cutoff_coef = 1000
-  kl_init_penalty = 1
+  # discount = 0.995
+  # kl_target = 1e-2
+  # kl_cutoff_factor = 2
+  # kl_cutoff_coef = 1000
+  # kl_init_penalty = 1

   return locals()
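The `return locals()` idiom used by hparams_base is what turns the function body into a hyperparameter dict: every name assigned inside the function becomes a key. A minimal sketch with hypothetical values:

    def hparams_demo():
      # each local assignment becomes one hyperparameter entry
      learning_rate = 1e-4
      steps = 3e7
      return locals()

    print(hparams_demo())  # {'learning_rate': 0.0001, 'steps': 30000000.0}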
@@ -158,9 +158,9 @@ def _object_import_from_string(name):
   return mod


-def _realize_import_attrs(d, filter):
+def _realize_import_attrs(d, hparam_filter):
   for k, v in d.items():
-    if k in filter:
+    if k in hparam_filter:
       imported = _object_import_from_string(v)
       # TODO: Provide an appropriately informative error if the import fails
       # except ImportError as e:
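The rename from `filter` to `hparam_filter` clears pylint's redefined-builtin warning (W0622), which fires when a parameter shadows a Python builtin. A hypothetical minimal reproduction of the pattern:

    def keep_keys(d, hparam_filter):
      # naming this parameter `filter` would shadow the builtin and
      # trigger redefined-builtin under the new .pylintrc
      return {k: v for k, v in d.items() if k in hparam_filter}

    print(keep_keys({'lr': 1e-4, 'steps': 100}, {'lr'}))  # {'lr': 0.0001}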
@@ -170,7 +170,7 @@ def _realize_import_attrs(d, filter):
   return d


-def _get_agents_configuration(hparam_set_name, log_dir=None, is_chief=False):
+def _get_agents_configuration(log_dir=None):
   """Load hyperparameter config."""
   try:
     # Try to resume training.
@@ -243,23 +243,20 @@ def gcs_upload(local_dir, gcs_out_dir):
       blob.upload_from_filename(local_file_path)


-def main(unused_argv):
+def main(_):
   """Run training."""
   tf.logging.set_verbosity(tf.logging.INFO)

   if FLAGS.debug:
     tf.logging.set_verbosity(tf.logging.DEBUG)

-  run_config = tf.contrib.learn.RunConfig()
-
   log_dir = FLAGS.logdir

-  agents_config = _get_agents_configuration(
-      FLAGS.hparam_set_id, log_dir, run_config.is_chief)
+  agents_config = _get_agents_configuration(log_dir)

   if FLAGS.run_mode == 'train':
     for score in agents.scripts.train.train(agents_config, env_processes=True):
-      logging.info('Score {}.'.format(score))
+      logging.info('Score %s.', score)
   if FLAGS.run_mode == 'render':
     now = datetime.datetime.now()
     subdir = now.strftime("%m%d-%H%M") + "-" + uuid.uuid4().hex[0:4]
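The `main(unused_argv)` to `main(_)` change works because of how tf.app.run dispatches: it parses command-line flags and then calls main with a single positional argument (the leftover argv). Naming that argument `_` keeps the required arity while matching the `ignored-argument-names=_.*` rule in the new .pylintrc. A sketch against the TF 1.x API this code targets:

    import tensorflow as tf

    def main(_):
      # FLAGS are already parsed by the time we get here.
      tf.logging.info('training would start here')

    if __name__ == '__main__':
      tf.app.run()  # parses flags, then invokes main(argv)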
@@ -2,38 +2,47 @@
 Simple app that parses predictions from a trained model and displays them.
 """

-from flask import Flask, json, render_template, request
 import requests
-app = Flask(__name__)
+from flask import Flask, json, render_template, request
+
+APP = Flask(__name__)


-@app.route("/")
+@APP.route("/")
 def index():
+  """Default route.
+
+  Placeholder, does nothing.
+  """
   return render_template("index.html")


-@app.route("/summary", methods=['GET', 'POST'])
+@APP.route("/summary", methods=['GET', 'POST'])
 def summary():
+  """Main prediction route.
+
+  Provides a machine-generated summary of the given text. Sends a request to a live
+  model trained on GitHub issues.
+  """
   if request.method == 'POST':
     issue_text = request.form["issue_text"]

     url = "http://ambassador:80/seldon/issue-summarization/api/v0.1/predictions"
-    headers = { 'content-type': 'application/json' }
+    headers = {'content-type': 'application/json'}
     json_data = {
       "data" : {
-        "ndarray" : [[ issue_text ]]
+        "ndarray" : [[issue_text]]
       }
     }

-    r = requests.post(url = url,
-                      headers = headers,
-                      data = json.dumps(json_data))
+    response = requests.post(url=url,
+                             headers=headers,
+                             data=json.dumps(json_data))

-    rjs = json.loads(r.text)
-    summary = rjs["data"]["ndarray"][0][0]
+    response_json = json.loads(response.text)
+    issue_summary = response_json["data"]["ndarray"][0][0]

-    return render_template("summary.html",
-                           issue_text = issue_text,
-                           summary = summary)
+    return render_template("issue_summary.html",
+                           issue_text=issue_text,
+                           issue_summary=issue_summary)
+  return ('', 204)

 if __name__ == '__main__':
-  app.run(debug = True, host = '0.0.0.0', port = 80)
+  APP.run(debug=True, host='0.0.0.0', port=80)
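For a quick end-to-end check of the /summary route above, a client can post the same form field the template submits. A sketch assuming the app is reachable on localhost port 80 (the address APP.run binds in the diff); the sample issue text is made up:

    import requests

    resp = requests.post(
        'http://localhost:80/summary',
        data={'issue_text': 'app crashes when the config file is missing'})
    # A POST renders issue_summary.html; any other method now gets an
    # empty 204 response instead of falling off the end of the function.
    print(resp.status_code)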
@ -1,22 +0,0 @@
|
||||||
from __future__ import print_function
|
|
||||||
|
|
||||||
import dill as dpickle
|
|
||||||
import numpy as np
|
|
||||||
from keras.models import load_model
|
|
||||||
|
|
||||||
from seq2seq_utils import Seq2Seq_Inference
|
|
||||||
|
|
||||||
|
|
||||||
class IssueSummarization(object):
|
|
||||||
|
|
||||||
def __init__(self):
|
|
||||||
with open('body_pp.dpkl', 'rb') as f:
|
|
||||||
body_pp = dpickle.load(f)
|
|
||||||
with open('title_pp.dpkl', 'rb') as f:
|
|
||||||
title_pp = dpickle.load(f)
|
|
||||||
self.model = Seq2Seq_Inference(encoder_preprocessor=body_pp,
|
|
||||||
decoder_preprocessor=title_pp,
|
|
||||||
seq2seq_model=load_model('seq2seq_model_tutorial.h5'))
|
|
||||||
|
|
||||||
def predict(self, X, feature_names):
|
|
||||||
return np.asarray([[self.model.generate_issue_title(body[0])[1]] for body in X])
|
|
|
@@ -0,0 +1,25 @@
+"""Generates predictions using a stored model.
+
+Uses trained model files to generate a prediction.
+"""
+
+from __future__ import print_function
+
+import numpy as np
+import dill as dpickle
+from keras.models import load_model
+from seq2seq_utils import Seq2Seq_Inference
+
+class IssueSummarization(object):
+
+    def __init__(self):
+        with open('body_pp.dpkl', 'rb') as body_file:
+            body_pp = dpickle.load(body_file)
+        with open('title_pp.dpkl', 'rb') as title_file:
+            title_pp = dpickle.load(title_file)
+        self.model = Seq2Seq_Inference(encoder_preprocessor=body_pp,
+                                       decoder_preprocessor=title_pp,
+                                       seq2seq_model=load_model('seq2seq_model_tutorial.h5'))
+
+    def predict(self, input_text):
+        return np.asarray([[self.model.generate_issue_title(body[0])[1]] for body in input_text])
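The rewritten predict takes the batch directly. A sketch of how the class is exercised, assuming the two .dpkl preprocessors and the .h5 model file sit in the working directory and the class lives in IssueSummarization.py:

    import numpy as np
    from IssueSummarization import IssueSummarization

    model = IssueSummarization()
    # predict expects a 2-D array-like: one issue body per row.
    bodies = np.array([['the installer crashes when the config file is missing']])
    print(model.predict(bodies))  # array of machine-generated titles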
@@ -1,429 +1,432 @@
+import logging
+import dill as dpickle
+import numpy as np
 from matplotlib import pyplot as plt
 import tensorflow as tf
+
+from IPython.display import SVG, display
 from keras import backend as K
 from keras.layers import Input
 from keras.models import Model
-from IPython.display import SVG, display
 from keras.utils.vis_utils import model_to_dot
-import logging
-import numpy as np
-import dill as dpickle
 from annoy import AnnoyIndex
 from tqdm import tqdm, tqdm_notebook
-from random import random
 from nltk.translate.bleu_score import corpus_bleu


 def load_text_processor(fname='title_pp.dpkl'):
     """
     Load preprocessors from disk.

     Parameters
     ----------
     fname: str
         file name of ktext.proccessor object

     Returns
     -------
     num_tokens : int
         size of vocabulary loaded into ktext.processor
     pp : ktext.processor
         the processor you are trying to load

     Typical Usage:
     -------------

     num_decoder_tokens, title_pp = load_text_processor(fname='title_pp.dpkl')
     num_encoder_tokens, body_pp = load_text_processor(fname='body_pp.dpkl')

     """
     # Load files from disk
     with open(fname, 'rb') as f:
         pp = dpickle.load(f)

     num_tokens = max(pp.id2token.keys()) + 1
-    print(f'Size of vocabulary for {fname}: {num_tokens:,}')
+    print('Size of vocabulary for {}: {}'.format(fname, num_tokens))
     return num_tokens, pp


 def load_decoder_inputs(decoder_np_vecs='train_title_vecs.npy'):
     """
     Load decoder inputs.

     Parameters
     ----------
     decoder_np_vecs : str
         filename of serialized numpy.array of decoder input (issue title)

     Returns
     -------
     decoder_input_data : numpy.array
         The data fed to the decoder as input during training for teacher forcing.
         This is the same as `decoder_np_vecs` except the last position.
     decoder_target_data : numpy.array
         The data that the decoder data is trained to generate (issue title).
         Calculated by sliding `decoder_np_vecs` one position forward.

     """
     vectorized_title = np.load(decoder_np_vecs)
     # For Decoder Input, you don't need the last word as that is only for prediction
     # when we are training using Teacher Forcing.
     decoder_input_data = vectorized_title[:, :-1]

     # Decoder Target Data Is Ahead By 1 Time Step From Decoder Input Data (Teacher Forcing)
     decoder_target_data = vectorized_title[:, 1:]

-    print(f'Shape of decoder input: {decoder_input_data.shape}')
-    print(f'Shape of decoder target: {decoder_target_data.shape}')
+    print('Shape of decoder input: {}'.format(decoder_input_data.shape))
+    print('Shape of decoder target: {}'.format(decoder_target_data.shape))
     return decoder_input_data, decoder_target_data


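The slide-by-one trick in load_decoder_inputs, in miniature (a sketch; the toy token ids are made up, with 2 and 3 standing in for _start_ and _end_):

    import numpy as np

    vectorized_title = np.array([[2, 5, 9, 3],
                                 [2, 7, 8, 3]])
    decoder_input_data = vectorized_title[:, :-1]   # [[2 5 9] [2 7 8]]
    decoder_target_data = vectorized_title[:, 1:]   # [[5 9 3] [7 8 3]]
    # At every position the decoder sees token t and is trained to emit t+1.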
 def load_encoder_inputs(encoder_np_vecs='train_body_vecs.npy'):
     """
     Load variables & data that are inputs to encoder.

     Parameters
     ----------
     encoder_np_vecs : str
         filename of serialized numpy.array of encoder input (issue title)

     Returns
     -------
     encoder_input_data : numpy.array
         The issue body
     doc_length : int
         The standard document length of the input for the encoder after padding
         the shape of this array will be (num_examples, doc_length)

     """
     vectorized_body = np.load(encoder_np_vecs)
     # Encoder input is simply the body of the issue text
     encoder_input_data = vectorized_body
     doc_length = encoder_input_data.shape[1]
-    print(f'Shape of encoder input: {encoder_input_data.shape}')
+    print('Shape of encoder input: {}'.format(encoder_input_data.shape))
     return encoder_input_data, doc_length


 def viz_model_architecture(model):
     """Visualize model architecture in Jupyter notebook."""
     display(SVG(model_to_dot(model).create(prog='dot', format='svg')))


 def free_gpu_mem():
     """Attempt to free gpu memory."""
     K.get_session().close()
     cfg = K.tf.ConfigProto()
     cfg.gpu_options.allow_growth = True
     K.set_session(K.tf.Session(config=cfg))


 def test_gpu():
     """Run a toy computation task in tensorflow to test GPU."""
     config = tf.ConfigProto()
     config.gpu_options.allow_growth = True
     session = tf.Session(config=config)
     hello = tf.constant('Hello, TensorFlow!')
     print(session.run(hello))


 def plot_model_training_history(history_object):
     """Plots model train vs. validation loss."""
     plt.title('model accuracy')
     plt.ylabel('accuracy')
     plt.xlabel('epoch')
     plt.plot(history_object.history['loss'])
     plt.plot(history_object.history['val_loss'])
     plt.legend(['train', 'test'], loc='upper left')
     plt.show()


 def extract_encoder_model(model):
     """
     Extract the encoder from the original Sequence to Sequence Model.

     Returns a keras model object that has one input (body of issue) and one
     output (encoding of issue, which is the last hidden state).

     Input:
     -----
     model: keras model object

     Returns:
     -----
     keras model object

     """
     encoder_model = model.get_layer('Encoder-Model')
     return encoder_model


 def extract_decoder_model(model):
     """
     Extract the decoder from the original model.

     Inputs:
     ------
     model: keras model object

     Returns:
     -------
     A Keras model object with the following inputs and outputs:

     Inputs of Keras Model That Is Returned:
     1: the embedding index for the last predicted word or the <Start> indicator
     2: the last hidden state, or in the case of the first word the hidden state from the encoder

     Outputs of Keras Model That Is Returned:
     1. Prediction (class probabilities) for the next word
     2. The hidden state of the decoder, to be fed back into the decoder at the next time step

     Implementation Notes:
     ----------------------
     Must extract relevant layers and reconstruct part of the computation graph
     to allow for different inputs as we are not going to use teacher forcing at
     inference time.

     """
     # the latent dimension is the same throughout the architecture so we are going to
     # cheat and grab the latent dimension of the embedding because that is the same as what is
     # output from the decoder
     latent_dim = model.get_layer('Decoder-Word-Embedding').output_shape[-1]

     # Reconstruct the input into the decoder
     decoder_inputs = model.get_layer('Decoder-Input').input
     dec_emb = model.get_layer('Decoder-Word-Embedding')(decoder_inputs)
     dec_bn = model.get_layer('Decoder-Batchnorm-1')(dec_emb)

     # Instead of setting the intial state from the encoder and forgetting about it, during inference
-    # we are not doing teacher forcing, so we will have to have a feedback loop from predictions back into
-    # the GRU, thus we define this input layer for the state so we can add this capability
+    # we are not doing teacher forcing, so we will have to have a feedback loop from predictions back
+    # into the GRU, thus we define this input layer for the state so we can add this capability
     gru_inference_state_input = Input(shape=(latent_dim,), name='hidden_state_input')

     # we need to reuse the weights that is why we are getting this
     # If you inspect the decoder GRU that we created for training, it will take as input
     # 2 tensors -> (1) is the embedding layer output for the teacher forcing
-    # (which will now be the last step's prediction, and will be _start_ on the first time step)
-    # (2) is the state, which we will initialize with the encoder on the first time step, but then
-    # grab the state after the first prediction and feed that back in again.
+    # (which will now be the last step's prediction, and will be _start_ on the
+    # first time step)
+    # (2) is the state, which we will initialize with the encoder on the first time step
+    # but then grab the state after the first prediction and feed that back in again.
     gru_out, gru_state_out = model.get_layer('Decoder-GRU')([dec_bn, gru_inference_state_input])

     # Reconstruct dense layers
     dec_bn2 = model.get_layer('Decoder-Batchnorm-2')(gru_out)
     dense_out = model.get_layer('Final-Output-Dense')(dec_bn2)
     decoder_model = Model([decoder_inputs, gru_inference_state_input],
                           [dense_out, gru_state_out])
     return decoder_model


 class Seq2Seq_Inference(object):
+    # pylint: disable=too-many-instance-attributes
     def __init__(self,
                  encoder_preprocessor,
                  decoder_preprocessor,
                  seq2seq_model):

         self.pp_body = encoder_preprocessor
         self.pp_title = decoder_preprocessor
         self.seq2seq_model = seq2seq_model
         self.encoder_model = extract_encoder_model(seq2seq_model)
         self.decoder_model = extract_decoder_model(seq2seq_model)
         self.default_max_len_title = self.pp_title.padding_maxlen
         self.nn = None
         self.rec_df = None

     def generate_issue_title(self,
                              raw_input_text,
                              max_len_title=None):
         """
         Use the seq2seq model to generate a title given the body of an issue.

         Inputs
         ------
         raw_input: str
             The body of the issue text as an input string

         max_len_title: int (optional)
             The maximum length of the title the model will generate

         """
         if max_len_title is None:
             max_len_title = self.default_max_len_title
         # get the encoder's features for the decoder
         raw_tokenized = self.pp_body.transform([raw_input_text])
         body_encoding = self.encoder_model.predict(raw_tokenized)
         # we want to save the encoder's embedding before its updated by decoder
         # because we can use that as an embedding for other tasks.
         original_body_encoding = body_encoding
         state_value = np.array(self.pp_title.token2id['_start_']).reshape(1, 1)

         decoded_sentence = []
         stop_condition = False
         while not stop_condition:
             preds, st = self.decoder_model.predict([state_value, body_encoding])

             # We are going to ignore indices 0 (padding) and indices 1 (unknown)
             # Argmax will return the integer index corresponding to the
             # prediction + 2 b/c we chopped off first two
             pred_idx = np.argmax(preds[:, :, 2:]) + 2

             # retrieve word from index prediction
             pred_word_str = self.pp_title.id2token[pred_idx]

             if pred_word_str == '_end_' or len(decoded_sentence) >= max_len_title:
                 stop_condition = True
                 break
             decoded_sentence.append(pred_word_str)

             # update the decoder for the next word
             body_encoding = st
             state_value = np.array(pred_idx).reshape(1, 1)

         return original_body_encoding, ' '.join(decoded_sentence)

     def print_example(self,
                       i,
                       body_text,
                       title_text,
                       url,
                       threshold):
         """
         Prints an example of the model's prediction for manual inspection.
         """
         if i:
             print('\n\n==============================================')
-            print(f'============== Example # {i} =================\n')
+            print('============== Example # {} =================\n'.format(i))

         if url:
             print(url)

-        print(f"Issue Body:\n {body_text} \n")
+        print("Issue Body:\n {} \n".format(body_text))

         if title_text:
-            print(f"Original Title:\n {title_text}")
+            print("Original Title:\n {}".format(title_text))

         emb, gen_title = self.generate_issue_title(body_text)
-        print(f"\n****** Machine Generated Title (Prediction) ******:\n {gen_title}")
+        print("\n****** Machine Generated Title (Prediction) ******:\n {}".format(gen_title))

         if self.nn:
             # return neighbors and distances
             n, d = self.nn.get_nns_by_vector(emb.flatten(), n=4,
                                              include_distances=True)
             neighbors = n[1:]
             dist = d[1:]

             if min(dist) <= threshold:
                 cols = ['issue_url', 'issue_title', 'body']
                 dfcopy = self.rec_df.iloc[neighbors][cols].copy(deep=True)
                 dfcopy['dist'] = dist
-                similar_issues_df = dfcopy.query(f'dist <= {threshold}')
+                similar_issues_df = dfcopy.query('dist <= {}'.format(threshold))

                 print("\n**** Similar Issues (using encoder embedding) ****:\n")
                 display(similar_issues_df)

     def demo_model_predictions(self,
                                n,
                                issue_df,
                                threshold=1):
         """
         Pick n random Issues and display predictions.

         Input:
         ------
         n : int
             Number of issues to display from issue_df
         issue_df : pandas DataFrame
             DataFrame that contains two columns: `body` and `issue_title`.
         threshold : float
             distance threshold for recommendation of similar issues.

         Returns:
         --------
         None
             Prints the original issue body and the model's prediction.
         """
         # Extract body and title from DF
         body_text = issue_df.body.tolist()
         title_text = issue_df.issue_title.tolist()
         url = issue_df.issue_url.tolist()

         demo_list = np.random.randint(low=1, high=len(body_text), size=n)
         for i in demo_list:
             self.print_example(i,
                                body_text=body_text[i],
                                title_text=title_text[i],
                                url=url[i],
                                threshold=threshold)

     def prepare_recommender(self, vectorized_array, original_df):
         """
         Use the annoy library to build recommender

         Parameters
         ----------
         vectorized_array : List[List[int]]
             This is the list of list of integers that represents your corpus
             that is fed into the seq2seq model for training.
         original_df : pandas.DataFrame
             This is the original dataframe that has the columns
             ['issue_url', 'issue_title', 'body']

         Returns
         -------
         annoy.AnnoyIndex object (see https://github.com/spotify/annoy)
         """
         self.rec_df = original_df
         emb = self.encoder_model.predict(x=vectorized_array,
                                          batch_size=vectorized_array.shape[0]//200)

         f = emb.shape[1]
         self.nn = AnnoyIndex(f)
         logging.warning('Adding embeddings')
         for i in tqdm(range(len(emb))):
             self.nn.add_item(i, emb[i])
         logging.warning('Building trees for similarity lookup.')
         self.nn.build(50)
         return self.nn

     def set_recsys_data(self, original_df):
         self.rec_df = original_df

     def set_recsys_annoyobj(self, annoyobj):
         self.nn = annoyobj

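prepare_recommender in miniature: index a handful of encoder embeddings with annoy and query the nearest neighbours. A sketch, not from the commit; the vectors are random placeholders, and annoy versions of this era default to the angular metric when none is given:

    import numpy as np
    from annoy import AnnoyIndex

    emb = np.random.rand(10, 8).astype('float32')  # 10 issues, 8-dim encodings
    index = AnnoyIndex(8)
    for i in range(len(emb)):
        index.add_item(i, emb[i].tolist())
    index.build(50)  # 50 trees, matching self.nn.build(50) above
    print(index.get_nns_by_vector(emb[0].tolist(), 4, include_distances=True))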
def evaluate_model(self, holdout_bodies, holdout_titles):
|
def evaluate_model(self, holdout_bodies, holdout_titles):
|
||||||
"""
|
"""
|
||||||
Method for calculating BLEU Score.
|
Method for calculating BLEU Score.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
holdout_bodies : List[str]
|
holdout_bodies : List[str]
|
||||||
These are the issue bodies that we want to summarize
|
These are the issue bodies that we want to summarize
|
||||||
holdout_titles : List[str]
|
holdout_titles : List[str]
|
||||||
This is the ground truth we are trying to predict --> issue titles
|
This is the ground truth we are trying to predict --> issue titles
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
bleu : float
|
bleu : float
|
||||||
The BLEU Score
|
The BLEU Score
|
||||||
|
|
||||||
"""
|
"""
|
||||||
actual, predicted = list(), list()
|
actual, predicted = list(), list()
|
||||||
assert len(holdout_bodies) == len(holdout_titles)
|
assert len(holdout_bodies) == len(holdout_titles)
|
||||||
num_examples = len(holdout_bodies)
|
num_examples = len(holdout_bodies)
|
||||||
|
|
||||||
logging.warning('Generating predictions.')
|
logging.warning('Generating predictions.')
|
||||||
# step over the whole set TODO: parallelize this
|
# step over the whole set TODO: parallelize this
|
||||||
for i in tqdm_notebook(range(num_examples)):
|
for i in tqdm_notebook(range(num_examples)):
|
||||||
_, yhat = self.generate_issue_title(holdout_bodies[i])
|
_, yhat = self.generate_issue_title(holdout_bodies[i])
|
||||||
|
|
||||||
actual.append(self.pp_title.process_text([holdout_titles[i]])[0])
|
actual.append(self.pp_title.process_text([holdout_titles[i]])[0])
|
||||||
predicted.append(self.pp_title.process_text([yhat])[0])
|
predicted.append(self.pp_title.process_text([yhat])[0])
|
||||||
|
|
||||||
# calculate BLEU score
|
# calculate BLEU score
|
||||||
logging.warning('Calculating BLEU.')
|
logging.warning('Calculating BLEU.')
|
||||||
#must be careful with nltk api for corpus_bleu!,
|
#must be careful with nltk api for corpus_bleu!,
|
||||||
# expects List[List[List[str]]] for ground truth, using List[List[str]] will give you
|
# expects List[List[List[str]]] for ground truth, using List[List[str]] will give you
|
||||||
# erroneous results.
|
# erroneous results.
|
||||||
bleu = corpus_bleu([[a] for a in actual], predicted)
|
bleu = corpus_bleu([[a] for a in actual], predicted)
|
||||||
return bleu
|
return bleu
|
||||||
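The comment above deserves an illustration: nltk's corpus_bleu pairs every hypothesis with a *list* of references, so the ground truth must be shaped List[List[List[str]]] while the hypotheses stay List[List[str]]. A minimal sketch with made-up tokenized titles:

from nltk.translate.bleu_score import corpus_bleu

actual = [['fix', 'gpu', 'build'], ['add', 'unit', 'tests']]   # tokenized ground-truth titles
predicted = [['fix', 'gpu', 'build'], ['add', 'tests']]        # tokenized predictions

# Wrap each reference in its own list: a hypothesis may have several references.
score = corpus_bleu([[a] for a in actual], predicted)
print(score)

Passing `actual` without the extra level of nesting makes nltk treat each token string as a separate reference, producing exactly the erroneous results the comment warns about.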

@@ -1,8 +1,6 @@
import argparse
import keras
import pandas as pd
-from seq2seq_utils import load_decoder_inputs
-from seq2seq_utils import load_encoder_inputs
from seq2seq_utils import load_text_processor
from seq2seq_utils import Seq2Seq_Inference

@@ -29,5 +27,5 @@ seq2seq_inf = Seq2Seq_Inference(encoder_preprocessor=body_pp,
                                decoder_preprocessor=title_pp,
                                seq2seq_model=seq2seq_Model)

# Output predictions for n random rows in the test set.
seq2seq_inf.demo_model_predictions(n=args.input_prediction_count, issue_df=testdf)

@@ -1,7 +1,7 @@
import argparse
import dill as dpickle
-from ktext.preprocess import processor
import numpy as np
+from ktext.preprocess import processor
import pandas as pd

# Parsing flags.

@@ -30,7 +30,7 @@ print('Example body after pre-processing:', train_body_vecs[0])

# Instantiate a text processor for the titles, with some different parameters.
title_pp = processor(append_indicators=True, keep_n=4500,
-                     padding_maxlen=12, padding ='post')
+                     padding_maxlen=12, padding='post')

# process the title data
train_title_vecs = title_pp.fit_transform(train_title_raw)

@@ -40,10 +40,10 @@ print('Example title after pre-processing:', train_title_vecs[0])

# Save the preprocessor.
with open(args.output_body_preprocessor_dpkl, 'wb') as f:
    dpickle.dump(body_pp, f)

with open(args.output_title_preprocessor_dpkl, 'wb') as f:
    dpickle.dump(title_pp, f)

# Save the processed data.
np.save(args.output_train_title_vecs_npy, train_title_vecs)
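Because the preprocessors are serialized with dill, the fitted ktext objects can be restored in a later step; a sketch of the round trip, assuming the default file name 'title_pp.dpkl' and the load_text_processor helper from seq2seq_utils.py:

import dill as dpickle
from seq2seq_utils import load_text_processor

# Restore the fitted title processor directly...
with open('title_pp.dpkl', 'rb') as f:
    title_pp = dpickle.load(f)

# ...or via the repo helper, which also reports the vocabulary size.
num_decoder_tokens, title_pp = load_text_processor(fname='title_pp.dpkl')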
@@ -1,6 +1,4 @@
import argparse
-import glob
-import logging
import pandas as pd
from sklearn.model_selection import train_test_split


@@ -20,8 +18,8 @@ traindf, testdf = train_test_split(pd.read_csv(args.input_csv).sample(n=args.sam
                                   test_size=.10)

# Print stats about the shape of the data.
-print(f'Train: {traindf.shape[0]:,} rows {traindf.shape[1]:,} columns')
-print(f'Test: {testdf.shape[0]:,} rows {testdf.shape[1]:,} columns')
+print('Train: {:,} rows {:,} columns'.format(traindf.shape[0], traindf.shape[1]))
+print('Test: {:,} rows {:,} columns'.format(testdf.shape[0], testdf.shape[1]))

# Store output as CSV.
traindf.to_csv(args.output_traindf_csv)

@@ -1,8 +1,6 @@
import argparse
import keras
import pandas as pd
-from seq2seq_utils import load_decoder_inputs
-from seq2seq_utils import load_encoder_inputs
from seq2seq_utils import load_text_processor
from seq2seq_utils import Seq2Seq_Inference

@@ -1,393 +0,0 @@
from matplotlib import pyplot as plt
import tensorflow as tf
from keras import backend as K
from keras.layers import Input
from keras.models import Model
from IPython.display import SVG, display
from keras.utils.vis_utils import model_to_dot
import logging
import numpy as np
import dill as dpickle
from annoy import AnnoyIndex
from tqdm import tqdm, tqdm_notebook
from random import random
from nltk.translate.bleu_score import corpus_bleu

def load_text_processor(fname='title_pp.dpkl'):
    """
    Load preprocessors from disk.
    Parameters
    ----------
    fname: str
        file name of ktext.processor object
    Returns
    -------
    num_tokens : int
        size of vocabulary loaded into ktext.processor
    pp : ktext.processor
        the processor you are trying to load
    Typical Usage:
    -------------
    num_decoder_tokens, title_pp = load_text_processor(fname='title_pp.dpkl')
    num_encoder_tokens, body_pp = load_text_processor(fname='body_pp.dpkl')
    """
    # Load files from disk
    with open(fname, 'rb') as f:
        pp = dpickle.load(f)

    num_tokens = max(pp.id2token.keys()) + 1
    print(f'Size of vocabulary for {fname}: {num_tokens:,}')
    return num_tokens, pp


def load_decoder_inputs(decoder_np_vecs='train_title_vecs.npy'):
    """
    Load decoder inputs.
    Parameters
    ----------
    decoder_np_vecs : str
        filename of serialized numpy.array of decoder input (issue title)
    Returns
    -------
    decoder_input_data : numpy.array
        The data fed to the decoder as input during training for teacher forcing.
        This is the same as `decoder_np_vecs` except the last position.
    decoder_target_data : numpy.array
        The data that the decoder data is trained to generate (issue title).
        Calculated by sliding `decoder_np_vecs` one position forward.
    """
    vectorized_title = np.load(decoder_np_vecs)
    # For Decoder Input, you don't need the last word as that is only for prediction
    # when we are training using Teacher Forcing.
    decoder_input_data = vectorized_title[:, :-1]

    # Decoder Target Data Is Ahead By 1 Time Step From Decoder Input Data (Teacher Forcing)
    decoder_target_data = vectorized_title[:, 1:]

    print(f'Shape of decoder input: {decoder_input_data.shape}')
    print(f'Shape of decoder target: {decoder_target_data.shape}')
    return decoder_input_data, decoder_target_data
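The two slices above are the entire teacher-forcing setup; a toy example with a hypothetical padded batch makes the one-step offset visible:

import numpy as np

# Two hypothetical padded title sequences (0 = padding).
vectorized_title = np.array([[2, 5, 9, 3, 0],
                             [2, 7, 3, 0, 0]])
decoder_input_data = vectorized_title[:, :-1]    # [[2, 5, 9, 3], [2, 7, 3, 0]]
decoder_target_data = vectorized_title[:, 1:]    # [[5, 9, 3, 0], [7, 3, 0, 0]]
# At step t the decoder is fed the true token input[:, t] and is trained
# to predict target[:, t], the next token of the original sequence.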

def load_encoder_inputs(encoder_np_vecs='train_body_vecs.npy'):
    """
    Load variables & data that are inputs to encoder.
    Parameters
    ----------
    encoder_np_vecs : str
        filename of serialized numpy.array of encoder input (issue body)
    Returns
    -------
    encoder_input_data : numpy.array
        The issue body
    doc_length : int
        The standard document length of the input for the encoder after padding
        the shape of this array will be (num_examples, doc_length)
    """
    vectorized_body = np.load(encoder_np_vecs)
    # Encoder input is simply the body of the issue text
    encoder_input_data = vectorized_body
    doc_length = encoder_input_data.shape[1]
    print(f'Shape of encoder input: {encoder_input_data.shape}')
    return encoder_input_data, doc_length


def viz_model_architecture(model):
    """Visualize model architecture in Jupyter notebook."""
    display(SVG(model_to_dot(model).create(prog='dot', format='svg')))


def free_gpu_mem():
    """Attempt to free gpu memory."""
    K.get_session().close()
    cfg = K.tf.ConfigProto()
    cfg.gpu_options.allow_growth = True
    K.set_session(K.tf.Session(config=cfg))


def test_gpu():
    """Run a toy computation task in tensorflow to test GPU."""
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    session = tf.Session(config=config)
    hello = tf.constant('Hello, TensorFlow!')
    print(session.run(hello))


def plot_model_training_history(history_object):
    """Plots model train vs. validation loss."""
    plt.title('model accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.plot(history_object.history['loss'])
    plt.plot(history_object.history['val_loss'])
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()


def extract_encoder_model(model):
    """
    Extract the encoder from the original Sequence to Sequence Model.
    Returns a keras model object that has one input (body of issue) and one
    output (encoding of issue, which is the last hidden state).
    Input:
    -----
    model: keras model object
    Returns:
    -----
    keras model object
    """
    encoder_model = model.get_layer('Encoder-Model')
    return encoder_model
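Since the encoder was registered as a nested model named 'Encoder-Model', extracting it is just a layer lookup; a hypothetical use of the extracted encoder to embed issue bodies:

encoder = extract_encoder_model(seq2seq_Model)          # the nested Keras model
body_embeddings = encoder.predict(encoder_input_data)   # shape: (num_examples, latent_dim)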

def extract_decoder_model(model):
    """
    Extract the decoder from the original model.
    Inputs:
    ------
    model: keras model object
    Returns:
    -------
    A Keras model object with the following inputs and outputs:
    Inputs of Keras Model That Is Returned:
    1: the embedding index for the last predicted word or the <Start> indicator
    2: the last hidden state, or in the case of the first word the hidden state from the encoder
    Outputs of Keras Model That Is Returned:
    1. Prediction (class probabilities) for the next word
    2. The hidden state of the decoder, to be fed back into the decoder at the next time step
    Implementation Notes:
    ----------------------
    Must extract relevant layers and reconstruct part of the computation graph
    to allow for different inputs as we are not going to use teacher forcing at
    inference time.
    """
    # the latent dimension is the same throughout the architecture so we are going to
    # cheat and grab the latent dimension of the embedding because that is the same as what is
    # output from the decoder
    latent_dim = model.get_layer('Decoder-Word-Embedding').output_shape[-1]

    # Reconstruct the input into the decoder
    decoder_inputs = model.get_layer('Decoder-Input').input
    dec_emb = model.get_layer('Decoder-Word-Embedding')(decoder_inputs)
    dec_bn = model.get_layer('Decoder-Batchnorm-1')(dec_emb)

    # Instead of setting the initial state from the encoder and forgetting about it, during inference
    # we are not doing teacher forcing, so we will have to have a feedback loop from predictions back into
    # the GRU, thus we define this input layer for the state so we can add this capability
    gru_inference_state_input = Input(shape=(latent_dim,), name='hidden_state_input')

    # we need to reuse the weights that is why we are getting this
    # If you inspect the decoder GRU that we created for training, it will take as input
    # 2 tensors -> (1) is the embedding layer output for the teacher forcing
    # (which will now be the last step's prediction, and will be _start_ on the first time step)
    # (2) is the state, which we will initialize with the encoder on the first time step, but then
    # grab the state after the first prediction and feed that back in again.
    gru_out, gru_state_out = model.get_layer('Decoder-GRU')([dec_bn, gru_inference_state_input])

    # Reconstruct dense layers
    dec_bn2 = model.get_layer('Decoder-Batchnorm-2')(gru_out)
    dense_out = model.get_layer('Final-Output-Dense')(dec_bn2)
    decoder_model = Model([decoder_inputs, gru_inference_state_input],
                          [dense_out, gru_state_out])
    return decoder_model


class Seq2Seq_Inference(object):
    def __init__(self,
                 encoder_preprocessor,
                 decoder_preprocessor,
                 seq2seq_model):

        self.pp_body = encoder_preprocessor
        self.pp_title = decoder_preprocessor
        self.seq2seq_model = seq2seq_model
        self.encoder_model = extract_encoder_model(seq2seq_model)
        self.decoder_model = extract_decoder_model(seq2seq_model)
        self.default_max_len_title = self.pp_title.padding_maxlen
        self.nn = None
        self.rec_df = None

    def generate_issue_title(self,
                             raw_input_text,
                             max_len_title=None):
        """
        Use the seq2seq model to generate a title given the body of an issue.
        Inputs
        ------
        raw_input_text: str
            The body of the issue text as an input string
        max_len_title: int (optional)
            The maximum length of the title the model will generate
        """
        if max_len_title is None:
            max_len_title = self.default_max_len_title
        # get the encoder's features for the decoder
        raw_tokenized = self.pp_body.transform([raw_input_text])
        body_encoding = self.encoder_model.predict(raw_tokenized)
        # we want to save the encoder's embedding before it's updated by the decoder
        # because we can use that as an embedding for other tasks.
        original_body_encoding = body_encoding
        state_value = np.array(self.pp_title.token2id['_start_']).reshape(1, 1)

        decoded_sentence = []
        stop_condition = False
        while not stop_condition:
            preds, st = self.decoder_model.predict([state_value, body_encoding])

            # We are going to ignore indices 0 (padding) and indices 1 (unknown)
            # Argmax will return the integer index corresponding to the
            # prediction + 2 b/c we chopped off first two
            pred_idx = np.argmax(preds[:, :, 2:]) + 2

            # retrieve word from index prediction
            pred_word_str = self.pp_title.id2token[pred_idx]

            if pred_word_str == '_end_' or len(decoded_sentence) >= max_len_title:
                stop_condition = True
                break
            decoded_sentence.append(pred_word_str)

            # update the decoder for the next word
            body_encoding = st
            state_value = np.array(pred_idx).reshape(1, 1)

        return original_body_encoding, ' '.join(decoded_sentence)
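A hypothetical call, assuming `inf` is a Seq2Seq_Inference instance wrapping a trained model; the first return value is the raw body encoding, which print_example below reuses for nearest-neighbor lookups:

# The issue body is arbitrary input; the generated title is illustrative only.
emb, title = inf.generate_issue_title('Training crashes with an out of memory error on GPU nodes.')
print(title)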

    def print_example(self,
                      i,
                      body_text,
                      title_text,
                      url,
                      threshold):
        """
        Prints an example of the model's prediction for manual inspection.
        """
        if i:
            print('\n\n==============================================')
            print(f'============== Example # {i} =================\n')

        if url:
            print(url)

        print(f"Issue Body:\n {body_text} \n")

        if title_text:
            print(f"Original Title:\n {title_text}")

        emb, gen_title = self.generate_issue_title(body_text)
        print(f"\n****** Machine Generated Title (Prediction) ******:\n {gen_title}")

        if self.nn:
            # return neighbors and distances
            n, d = self.nn.get_nns_by_vector(emb.flatten(), n=4,
                                             include_distances=True)
            neighbors = n[1:]
            dist = d[1:]

            if min(dist) <= threshold:
                cols = ['issue_url', 'issue_title', 'body']
                dfcopy = self.rec_df.iloc[neighbors][cols].copy(deep=True)
                dfcopy['dist'] = dist
                similar_issues_df = dfcopy.query(f'dist <= {threshold}')

                print("\n**** Similar Issues (using encoder embedding) ****:\n")
                display(similar_issues_df)

    def demo_model_predictions(self,
                               n,
                               issue_df,
                               threshold=1):
        """
        Pick n random Issues and display predictions.
        Input:
        ------
        n : int
            Number of issues to display from issue_df
        issue_df : pandas DataFrame
            DataFrame that contains two columns: `body` and `issue_title`.
        threshold : float
            distance threshold for recommendation of similar issues.
        Returns:
        --------
        None
            Prints the original issue body and the model's prediction.
        """
        # Extract body and title from DF
        body_text = issue_df.body.tolist()
        title_text = issue_df.issue_title.tolist()
        url = issue_df.issue_url.tolist()

        demo_list = np.random.randint(low=1, high=len(body_text), size=n)
        for i in demo_list:
            self.print_example(i,
                               body_text=body_text[i],
                               title_text=title_text[i],
                               url=url[i],
                               threshold=threshold)

    def prepare_recommender(self, vectorized_array, original_df):
        """
        Use the annoy library to build recommender
        Parameters
        ----------
        vectorized_array : List[List[int]]
            This is the list of list of integers that represents your corpus
            that is fed into the seq2seq model for training.
        original_df : pandas.DataFrame
            This is the original dataframe that has the columns
            ['issue_url', 'issue_title', 'body']
        Returns
        -------
        annoy.AnnoyIndex object (see https://github.com/spotify/annoy)
        """
        self.rec_df = original_df
        emb = self.encoder_model.predict(x=vectorized_array,
                                         batch_size=vectorized_array.shape[0]//200)

        f = emb.shape[1]
        self.nn = AnnoyIndex(f)
        logging.warning('Adding embeddings')
        for i in tqdm(range(len(emb))):
            self.nn.add_item(i, emb[i])
        logging.warning('Building trees for similarity lookup.')
        self.nn.build(50)
        return self.nn
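For readers new to annoy, a minimal sketch of the index lifecycle used above, with made-up sizes:

import numpy as np
from annoy import AnnoyIndex

emb = np.random.rand(100, 300)    # hypothetical: 100 embeddings of width 300
index = AnnoyIndex(300)           # dimensionality must match the vectors
for i in range(len(emb)):
    index.add_item(i, emb[i])
index.build(50)                   # more trees -> better recall, larger index
# Query the 4 nearest neighbors of the first vector, with distances.
ids, dists = index.get_nns_by_vector(emb[0], 4, include_distances=True)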

    def set_recsys_data(self, original_df):
        self.rec_df = original_df

    def set_recsys_annoyobj(self, annoyobj):
        self.nn = annoyobj

    def evaluate_model(self, holdout_bodies, holdout_titles):
        """
        Method for calculating BLEU Score.
        Parameters
        ----------
        holdout_bodies : List[str]
            These are the issue bodies that we want to summarize
        holdout_titles : List[str]
            This is the ground truth we are trying to predict --> issue titles
        Returns
        -------
        bleu : float
            The BLEU Score
        """
        actual, predicted = list(), list()
        assert len(holdout_bodies) == len(holdout_titles)
        num_examples = len(holdout_bodies)

        logging.warning('Generating predictions.')
        # step over the whole set TODO: parallelize this
        for i in tqdm_notebook(range(num_examples)):
            _, yhat = self.generate_issue_title(holdout_bodies[i])

            actual.append(self.pp_title.process_text([holdout_titles[i]])[0])
            predicted.append(self.pp_title.process_text([yhat])[0])
        # calculate BLEU score
        logging.warning('Calculating BLEU.')
        bleu = corpus_bleu(actual, predicted)
        return bleu

@@ -1,11 +1,10 @@
import argparse
+import numpy as np
from keras.callbacks import CSVLogger, ModelCheckpoint
-from keras.layers import Input, LSTM, GRU, Dense, Embedding, Bidirectional, BatchNormalization
+from keras.layers import Input, GRU, Dense, Embedding, BatchNormalization
from keras.models import Model
from keras import optimizers
-import numpy as np
from seq2seq_utils import load_decoder_inputs, load_encoder_inputs, load_text_processor
-from seq2seq_utils import viz_model_architecture

# Parsing flags.
parser = argparse.ArgumentParser()

@@ -18,7 +17,7 @@ parser.add_argument("--learning_rate", default="0.001")
args = parser.parse_args()
print(args)

-learning_rate=float(args.learning_rate)
+learning_rate = float(args.learning_rate)

encoder_input_data, doc_length = load_encoder_inputs(args.input_train_body_vecs_npy)
decoder_input_data, decoder_target_data = load_decoder_inputs(args.input_train_title_vecs_npy)

@@ -35,7 +34,10 @@ latent_dim = 300
encoder_inputs = Input(shape=(doc_length,), name='Encoder-Input')

# Word embedding for encoder (ex: Issue Body)
-x = Embedding(num_encoder_tokens, latent_dim, name='Body-Word-Embedding', mask_zero=False)(encoder_inputs)
+x = Embedding(num_encoder_tokens,
+              latent_dim,
+              name='Body-Word-Embedding',
+              mask_zero=False)(encoder_inputs)
x = BatchNormalization(name='Encoder-Batchnorm-1')(x)

# We do not need the `encoder_output` just the hidden state.

@@ -53,7 +55,10 @@ seq2seq_encoder_out = encoder_model(encoder_inputs)
decoder_inputs = Input(shape=(None,), name='Decoder-Input')  # for teacher forcing

# Word Embedding For Decoder (ex: Issue Titles)
-dec_emb = Embedding(num_decoder_tokens, latent_dim, name='Decoder-Word-Embedding', mask_zero=False)(decoder_inputs)
+dec_emb = Embedding(num_decoder_tokens,
+                    latent_dim,
+                    name='Decoder-Word-Embedding',
+                    mask_zero=False)(decoder_inputs)
dec_bn = BatchNormalization(name='Decoder-Batchnorm-1')(dec_emb)

# Set up the decoder, using `decoder_state_input` as initial state.

@@ -71,21 +76,24 @@ decoder_outputs = decoder_dense(x)

seq2seq_Model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

-seq2seq_Model.compile(optimizer=optimizers.Nadam(lr=learning_rate), loss='sparse_categorical_crossentropy')
+seq2seq_Model.compile(optimizer=optimizers.Nadam(lr=learning_rate),
+                      loss='sparse_categorical_crossentropy')

seq2seq_Model.summary()

script_name_base = 'tutorial_seq2seq'
csv_logger = CSVLogger('{:}.log'.format(script_name_base))
-model_checkpoint = ModelCheckpoint('{:}.epoch{{epoch:02d}}-val{{val_loss:.5f}}.hdf5'.format(script_name_base),
-                                   save_best_only=True)
+model_checkpoint = ModelCheckpoint(
+    '{:}.epoch{{epoch:02d}}-val{{val_loss:.5f}}.hdf5'.format(script_name_base), save_best_only=True)

batch_size = 1200
epochs = 7
-history = seq2seq_Model.fit([encoder_input_data, decoder_input_data], np.expand_dims(decoder_target_data, -1),
-                            batch_size=batch_size,
-                            epochs=epochs,
-                            validation_split=0.12, callbacks=[csv_logger, model_checkpoint])
+history = seq2seq_Model.fit([encoder_input_data, decoder_input_data],
+                            np.expand_dims(decoder_target_data, -1),
+                            batch_size=batch_size,
+                            epochs=epochs,
+                            validation_split=0.12,
+                            callbacks=[csv_logger, model_checkpoint])

#############
# Save model.
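One detail of the fit call worth spelling out: with sparse_categorical_crossentropy, Keras of this vintage expects the integer targets to carry a trailing axis of length one so their rank matches the decoder's softmax output, which is why the targets pass through np.expand_dims. A toy shape check with assumed sizes:

import numpy as np

decoder_target_data = np.zeros((1000, 11), dtype='int32')  # hypothetical (num_examples, seq_len)
targets = np.expand_dims(decoder_target_data, -1)
print(targets.shape)  # (1000, 11, 1): one integer class id per time step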