mirror of https://github.com/kubeflow/examples.git
Add .pylintrc (#61)
* Add .pylintrc
* Resolve lint complaints in agents/trainer/task.py
* Resolve lint complaints with flask app.py
* Resolve linting issues
  Remove duplicate seq2seq_utils.py from workflow/workspace/src
* Use python 3.5.2 with pylint to match prow
  Put pybullet import back into agents/trainer/task.py with a pylint ignore statement
  Use main(_) to ensure it works with tf.app.run
This commit is contained in:
parent
1d6946ead8
commit
41372c9314
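For context on the commit message above, here is a minimal sketch of the two patterns it mentions: a module-level pylint ignore for an import that is needed only for its side effects, and a main(_) signature that works with tf.app.run. This is an illustrative snippet, not the literal contents of agents/trainer/task.py; the real diff follows below.

import tensorflow as tf

# Module-level suppression, mirroring the change described above: the pybullet
# import is needed only for its side effect of registering the Bullet
# environments, so the unused-import check is silenced.
# pylint: disable=unused-import
import pybullet_envs  # To make AntBulletEnv-v0 available.


def main(_):
  # tf.app.run parses command-line flags and then calls main(argv); naming the
  # argument "_" satisfies that signature while marking it as intentionally
  # unused for pylint.
  tf.logging.set_verbosity(tf.logging.INFO)
  # ... training logic elided ...


if __name__ == '__main__':
  tf.app.run()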
.pylintrc
@@ -0,0 +1,399 @@
[MASTER]
|
||||
|
||||
# Specify a configuration file.
|
||||
#rcfile=
|
||||
|
||||
# Python code to execute, usually for sys.path manipulation such as
|
||||
# pygtk.require().
|
||||
#init-hook=
|
||||
|
||||
# Add files or directories to the blacklist. They should be base names, not
|
||||
# paths.
|
||||
ignore=third_party
|
||||
|
||||
# Add files or directories matching the regex patterns to the blacklist. The
|
||||
# regex matches against base names, not paths.
|
||||
ignore-patterns=
|
||||
|
||||
# Pickle collected data for later comparisons.
|
||||
persistent=no
|
||||
|
||||
# List of plugins (as comma separated values of python modules names) to load,
|
||||
# usually to register additional checkers.
|
||||
load-plugins=
|
||||
|
||||
# Use multiple processes to speed up Pylint.
|
||||
jobs=4
|
||||
|
||||
# Allow loading of arbitrary C extensions. Extensions are imported into the
|
||||
# active Python interpreter and may run arbitrary code.
|
||||
unsafe-load-any-extension=no
|
||||
|
||||
# A comma-separated list of package or module names from where C extensions may
|
||||
# be loaded. Extensions are loading into the active Python interpreter and may
|
||||
# run arbitrary code
|
||||
extension-pkg-whitelist=
|
||||
|
||||
|
||||
[MESSAGES CONTROL]
|
||||
|
||||
# Only show warnings with the listed confidence levels. Leave empty to show
|
||||
# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED
|
||||
confidence=
|
||||
|
||||
# Enable the message, report, category or checker with the given id(s). You can
|
||||
# either give multiple identifier separated by comma (,) or put this option
|
||||
# multiple time (only on the command line, not in the configuration file where
|
||||
# it should appear only once). See also the "--disable" option for examples.
|
||||
#enable=
|
||||
|
||||
# Disable the message, report, category or checker with the given id(s). You
|
||||
# can either give multiple identifiers separated by comma (,) or put this
|
||||
# option multiple times (only on the command line, not in the configuration
|
||||
# file where it should appear only once).You can also use "--disable=all" to
|
||||
# disable everything first and then reenable specific checks. For example, if
|
||||
# you want to run only the similarities checker, you can use "--disable=all
|
||||
# --enable=similarities". If you want to run only the classes checker, but have
|
||||
# no Warning level messages displayed, use"--disable=all --enable=classes
|
||||
# --disable=W"
|
||||
disable=import-star-module-level,old-octal-literal,oct-method,print-statement,unpacking-in-except,parameter-unpacking,backtick,old-raise-syntax,old-ne-operator,long-suffix,dict-view-method,dict-iter-method,metaclass-assignment,next-method-called,raising-string,indexing-exception,raw_input-builtin,long-builtin,file-builtin,execfile-builtin,coerce-builtin,cmp-builtin,buffer-builtin,basestring-builtin,apply-builtin,filter-builtin-not-iterating,using-cmp-argument,useless-suppression,range-builtin-not-iterating,suppressed-message,missing-docstring,no-absolute-import,old-division,cmp-method,reload-builtin,zip-builtin-not-iterating,intern-builtin,unichr-builtin,reduce-builtin,standarderror-builtin,unicode-builtin,xrange-builtin,coerce-method,delslice-method,getslice-method,setslice-method,input-builtin,round-builtin,hex-method,nonzero-method,map-builtin-not-iterating,relative-import,invalid-name,bad-continuation,no-member,locally-disabled,fixme,import-error,too-many-locals
|
||||
|
||||
|
||||
[REPORTS]
|
||||
|
||||
# Set the output format. Available formats are text, parseable, colorized, msvs
|
||||
# (visual studio) and html. You can also give a reporter class, eg
|
||||
# mypackage.mymodule.MyReporterClass.
|
||||
output-format=text
|
||||
|
||||
# Put messages in a separate file for each module / package specified on the
|
||||
# command line instead of printing them on stdout. Reports (if any) will be
|
||||
# written in a file name "pylint_global.[txt|html]". This option is deprecated
|
||||
# and it will be removed in Pylint 2.0.
|
||||
files-output=no
|
||||
|
||||
# Tells whether to display a full report or only the messages
|
||||
reports=no
|
||||
|
||||
# Python expression which should return a note less than 10 (10 is the highest
|
||||
# note). You have access to the variables errors warning, statement which
|
||||
# respectively contain the number of errors / warnings messages and the total
|
||||
# number of statements analyzed. This is used by the global evaluation report
|
||||
# (RP0004).
|
||||
evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
|
||||
|
||||
# Template used to display messages. This is a python new-style format string
|
||||
# used to format the message information. See doc for all details
|
||||
#msg-template=
|
||||
|
||||
|
||||
[BASIC]
|
||||
|
||||
# Good variable names which should always be accepted, separated by a comma
|
||||
good-names=i,j,k,ex,Run,_
|
||||
|
||||
# Bad variable names which should always be refused, separated by a comma
|
||||
bad-names=foo,bar,baz,toto,tutu,tata
|
||||
|
||||
# Colon-delimited sets of names that determine each other's naming style when
|
||||
# the name regexes allow several styles.
|
||||
name-group=
|
||||
|
||||
# Include a hint for the correct naming format with invalid-name
|
||||
include-naming-hint=no
|
||||
|
||||
# List of decorators that produce properties, such as abc.abstractproperty. Add
|
||||
# to this list to register other decorators that produce valid properties.
|
||||
property-classes=abc.abstractproperty
|
||||
|
||||
# Regular expression matching correct function names
|
||||
function-rgx=[a-z_][a-z0-9_]{2,30}$
|
||||
|
||||
# Naming hint for function names
|
||||
function-name-hint=[a-z_][a-z0-9_]{2,30}$
|
||||
|
||||
# Regular expression matching correct variable names
|
||||
variable-rgx=[a-z_][a-z0-9_]{2,30}$
|
||||
|
||||
# Naming hint for variable names
|
||||
variable-name-hint=[a-z_][a-z0-9_]{2,30}$
|
||||
|
||||
# Regular expression matching correct constant names
|
||||
const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$
|
||||
|
||||
# Naming hint for constant names
|
||||
const-name-hint=(([A-Z_][A-Z0-9_]*)|(__.*__))$
|
||||
|
||||
# Regular expression matching correct attribute names
|
||||
attr-rgx=[a-z_][a-z0-9_]{2,30}$
|
||||
|
||||
# Naming hint for attribute names
|
||||
attr-name-hint=[a-z_][a-z0-9_]{2,30}$
|
||||
|
||||
# Regular expression matching correct argument names
|
||||
argument-rgx=[a-z_][a-z0-9_]{2,30}$
|
||||
|
||||
# Naming hint for argument names
|
||||
argument-name-hint=[a-z_][a-z0-9_]{2,30}$
|
||||
|
||||
# Regular expression matching correct class attribute names
|
||||
class-attribute-rgx=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$
|
||||
|
||||
# Naming hint for class attribute names
|
||||
class-attribute-name-hint=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$
|
||||
|
||||
# Regular expression matching correct inline iteration names
|
||||
inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$
|
||||
|
||||
# Naming hint for inline iteration names
|
||||
inlinevar-name-hint=[A-Za-z_][A-Za-z0-9_]*$
|
||||
|
||||
# Regular expression matching correct class names
|
||||
class-rgx=[A-Z_][a-zA-Z0-9]+$
|
||||
|
||||
# Naming hint for class names
|
||||
class-name-hint=[A-Z_][a-zA-Z0-9]+$
|
||||
|
||||
# Regular expression matching correct module names
|
||||
module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$
|
||||
|
||||
# Naming hint for module names
|
||||
module-name-hint=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$
|
||||
|
||||
# Regular expression matching correct method names
|
||||
method-rgx=[a-z_][a-z0-9_]{2,30}$
|
||||
|
||||
# Naming hint for method names
|
||||
method-name-hint=[a-z_][a-z0-9_]{2,30}$
|
||||
|
||||
# Regular expression which should only match function or class names that do
|
||||
# not require a docstring.
|
||||
no-docstring-rgx=^_
|
||||
|
||||
# Minimum line length for functions/classes that require docstrings, shorter
|
||||
# ones are exempt.
|
||||
docstring-min-length=-1
|
||||
|
||||
|
||||
[ELIF]
|
||||
|
||||
# Maximum number of nested blocks for function / method body
|
||||
max-nested-blocks=5
|
||||
|
||||
|
||||
[TYPECHECK]
|
||||
|
||||
# Tells whether missing members accessed in mixin class should be ignored. A
|
||||
# mixin class is detected if its name ends with "mixin" (case insensitive).
|
||||
ignore-mixin-members=yes
|
||||
|
||||
# List of module names for which member attributes should not be checked
|
||||
# (useful for modules/projects where namespaces are manipulated during runtime
|
||||
# and thus existing member attributes cannot be deduced by static analysis. It
|
||||
# supports qualified module names, as well as Unix pattern matching.
|
||||
ignored-modules=
|
||||
|
||||
# List of class names for which member attributes should not be checked (useful
|
||||
# for classes with dynamically set attributes). This supports the use of
|
||||
# qualified names.
|
||||
ignored-classes=optparse.Values,thread._local,_thread._local
|
||||
|
||||
# List of members which are set dynamically and missed by pylint inference
|
||||
# system, and so shouldn't trigger E1101 when accessed. Python regular
|
||||
# expressions are accepted.
|
||||
generated-members=
|
||||
|
||||
# List of decorators that produce context managers, such as
|
||||
# contextlib.contextmanager. Add to this list to register other decorators that
|
||||
# produce valid context managers.
|
||||
contextmanager-decorators=contextlib.contextmanager
|
||||
|
||||
|
||||
[FORMAT]
|
||||
|
||||
# Maximum number of characters on a single line.
|
||||
max-line-length=100
|
||||
|
||||
# Regexp for a line that is allowed to be longer than the limit.
|
||||
ignore-long-lines=^\s*(# )?<?https?://\S+>?$
|
||||
|
||||
# Allow the body of an if to be on the same line as the test if there is no
|
||||
# else.
|
||||
single-line-if-stmt=no
|
||||
|
||||
# List of optional constructs for which whitespace checking is disabled. `dict-
|
||||
# separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}.
|
||||
# `trailing-comma` allows a space between comma and closing bracket: (a, ).
|
||||
# `empty-line` allows space-only lines.
|
||||
no-space-check=trailing-comma,dict-separator
|
||||
|
||||
# Maximum number of lines in a module
|
||||
max-module-lines=1000
|
||||
|
||||
# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1
|
||||
# tab).
|
||||
# Use 2 spaces consistent with TensorFlow style.
|
||||
indent-string='  '
|
||||
|
||||
# Number of spaces of indent required inside a hanging or continued line.
|
||||
indent-after-paren=4
|
||||
|
||||
# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
|
||||
expected-line-ending-format=
|
||||
|
||||
|
||||
[MISCELLANEOUS]
|
||||
|
||||
# List of note tags to take in consideration, separated by a comma.
|
||||
notes=FIXME,XXX,TODO
|
||||
|
||||
|
||||
[VARIABLES]
|
||||
|
||||
# Tells whether we should check for unused import in __init__ files.
|
||||
init-import=no
|
||||
|
||||
# A regular expression matching the name of dummy variables (i.e. expectedly
|
||||
# not used).
|
||||
dummy-variables-rgx=(_+[a-zA-Z0-9]*?$)|dummy
|
||||
|
||||
# List of additional names supposed to be defined in builtins. Remember that
|
||||
# you should avoid to define new builtins when possible.
|
||||
additional-builtins=
|
||||
|
||||
# List of strings which can identify a callback function by name. A callback
|
||||
# name must start or end with one of those strings.
|
||||
callbacks=cb_,_cb
|
||||
|
||||
# List of qualified module names which can have objects that can redefine
|
||||
# builtins.
|
||||
redefining-builtins-modules=six.moves,future.builtins
|
||||
|
||||
|
||||
[LOGGING]
|
||||
|
||||
# Logging modules to check that the string format arguments are in logging
|
||||
# function parameter format
|
||||
logging-modules=logging
|
||||
|
||||
|
||||
[SIMILARITIES]
|
||||
|
||||
# Minimum lines number of a similarity.
|
||||
min-similarity-lines=4
|
||||
|
||||
# Ignore comments when computing similarities.
|
||||
ignore-comments=yes
|
||||
|
||||
# Ignore docstrings when computing similarities.
|
||||
ignore-docstrings=yes
|
||||
|
||||
# Ignore imports when computing similarities.
|
||||
ignore-imports=no
|
||||
|
||||
|
||||
[SPELLING]
|
||||
|
||||
# Spelling dictionary name. Available dictionaries: none. To make it working
|
||||
# install python-enchant package.
|
||||
spelling-dict=
|
||||
|
||||
# List of comma separated words that should not be checked.
|
||||
spelling-ignore-words=
|
||||
|
||||
# A path to a file that contains private dictionary; one word per line.
|
||||
spelling-private-dict-file=
|
||||
|
||||
# Tells whether to store unknown words to indicated private dictionary in
|
||||
# --spelling-private-dict-file option instead of raising a message.
|
||||
spelling-store-unknown-words=no
|
||||
|
||||
|
||||
[IMPORTS]
|
||||
|
||||
# Deprecated modules which should not be used, separated by a comma
|
||||
deprecated-modules=regsub,TERMIOS,Bastion,rexec
|
||||
|
||||
# Create a graph of every (i.e. internal and external) dependencies in the
|
||||
# given file (report RP0402 must not be disabled)
|
||||
import-graph=
|
||||
|
||||
# Create a graph of external dependencies in the given file (report RP0402 must
|
||||
# not be disabled)
|
||||
ext-import-graph=
|
||||
|
||||
# Create a graph of internal dependencies in the given file (report RP0402 must
|
||||
# not be disabled)
|
||||
int-import-graph=
|
||||
|
||||
# Force import order to recognize a module as part of the standard
|
||||
# compatibility libraries.
|
||||
known-standard-library=
|
||||
|
||||
# Force import order to recognize a module as part of a third party library.
|
||||
known-third-party=enchant
|
||||
|
||||
# Analyse import fallback blocks. This can be used to support both Python 2 and
|
||||
# 3 compatible code, which means that the block might have code that exists
|
||||
# only in one or another interpreter, leading to false positives when analysed.
|
||||
analyse-fallback-blocks=no
|
||||
|
||||
|
||||
[DESIGN]
|
||||
|
||||
# Maximum number of arguments for function / method
|
||||
max-args=7
|
||||
|
||||
# Argument names that match this expression will be ignored. Default to name
|
||||
# with leading underscore
|
||||
ignored-argument-names=_.*
|
||||
|
||||
# Maximum number of locals for function / method body
|
||||
max-locals=15
|
||||
|
||||
# Maximum number of return / yield for function / method body
|
||||
max-returns=6
|
||||
|
||||
# Maximum number of branch for function / method body
|
||||
max-branches=12
|
||||
|
||||
# Maximum number of statements in function / method body
|
||||
max-statements=50
|
||||
|
||||
# Maximum number of parents for a class (see R0901).
|
||||
max-parents=7
|
||||
|
||||
# Maximum number of attributes for a class (see R0902).
|
||||
max-attributes=7
|
||||
|
||||
# Minimum number of public methods for a class (see R0903).
|
||||
min-public-methods=0
|
||||
|
||||
# Maximum number of public methods for a class (see R0904).
|
||||
max-public-methods=20
|
||||
|
||||
# Maximum number of boolean expressions in a if statement
|
||||
max-bool-expr=5
|
||||
|
||||
|
||||
[CLASSES]
|
||||
|
||||
# List of method names used to declare (i.e. assign) instance attributes.
|
||||
defining-attr-methods=__init__,__new__,setUp
|
||||
|
||||
# List of valid names for the first argument in a class method.
|
||||
valid-classmethod-first-arg=cls
|
||||
|
||||
# List of valid names for the first argument in a metaclass class method.
|
||||
valid-metaclass-classmethod-first-arg=mcs
|
||||
|
||||
# List of member names, which should be excluded from the protected access
|
||||
# warning.
|
||||
exclude-protected=_asdict,_fields,_replace,_source,_make
|
||||
|
||||
|
||||
[EXCEPTIONS]
|
||||
|
||||
# Exceptions that will emit a warning when being caught. Defaults to
|
||||
# "Exception"
|
||||
overgeneral-exceptions=Exception
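The configuration above is picked up automatically when pylint is run from the repository root; it can also be pointed at explicitly. A small sketch of driving it from Python (the target path is only an example, and Run() exits with pylint's status code by default):

# Sketch only, not part of the commit: run pylint against the new rcfile.
from pylint.lint import Run

Run(['--rcfile=.pylintrc', 'agents/trainer/task.py'])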
agents/trainer/task.py
@@ -12,18 +12,18 @@
"""Provides an entrypoint for the training task."""
|
||||
|
||||
#pylint: disable=unused-import
|
||||
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import argparse
|
||||
import datetime
|
||||
import logging
|
||||
import os
|
||||
import pprint
|
||||
import uuid
|
||||
|
||||
import pip
|
||||
import tensorflow as tf
|
||||
from google.cloud import storage
|
||||
import tensorflow as tf
|
||||
|
||||
import agents
|
||||
import pybullet_envs # To make AntBulletEnv-v0 available.
@@ -113,39 +113,39 @@ def hparams_base():
"""Base hparams tf/Agents PPO """
|
||||
|
||||
# General
|
||||
algorithm = agents.ppo.PPOAlgorithm
|
||||
num_agents = 30
|
||||
eval_episodes = 30
|
||||
use_gpu = False
|
||||
# algorithm = agents.ppo.PPOAlgorithm
|
||||
# num_agents = 30
|
||||
# eval_episodes = 30
|
||||
# use_gpu = False
|
||||
|
||||
# Environment
|
||||
env = 'KukaBulletEnv-v0'
|
||||
normalize_ranges = True
|
||||
max_length = 1000
|
||||
# env = 'KukaBulletEnv-v0'
|
||||
# normalize_ranges = True
|
||||
# max_length = 1000
|
||||
|
||||
# Network
|
||||
network = agents.scripts.networks.feed_forward_gaussian
|
||||
weight_summaries = dict(
|
||||
all=r'.*', policy=r'.*/policy/.*', value=r'.*/value/.*')
|
||||
policy_layers = 200, 100
|
||||
value_layers = 200, 100
|
||||
init_output_factor = 0.1
|
||||
init_logstd = -1
|
||||
init_std = 0.35
|
||||
# network = agents.scripts.networks.feed_forward_gaussian
|
||||
# weight_summaries = dict(
|
||||
# all=r'.*', policy=r'.*/policy/.*', value=r'.*/value/.*')
|
||||
# policy_layers = 200, 100
|
||||
# value_layers = 200, 100
|
||||
# init_output_factor = 0.1
|
||||
# init_logstd = -1
|
||||
# init_std = 0.35
|
||||
|
||||
# Optimization
|
||||
update_every = 60
|
||||
update_epochs = 25
|
||||
optimizer = tf.train.AdamOptimizer
|
||||
learning_rate = 1e-4
|
||||
steps = 3e7 # 30M
|
||||
# update_every = 60
|
||||
# update_epochs = 25
|
||||
# optimizer = tf.train.AdamOptimizer
|
||||
# learning_rate = 1e-4
|
||||
# steps = 3e7 # 30M
|
||||
|
||||
# Losses
|
||||
discount = 0.995
|
||||
kl_target = 1e-2
|
||||
kl_cutoff_factor = 2
|
||||
kl_cutoff_coef = 1000
|
||||
kl_init_penalty = 1
|
||||
# discount = 0.995
|
||||
# kl_target = 1e-2
|
||||
# kl_cutoff_factor = 2
|
||||
# kl_cutoff_coef = 1000
|
||||
# kl_init_penalty = 1
|
||||
|
||||
return locals()
@@ -158,9 +158,9 @@ def _object_import_from_string(name):
return mod
|
||||
|
||||
|
||||
def _realize_import_attrs(d, filter):
|
||||
def _realize_import_attrs(d, hparam_filter):
|
||||
for k, v in d.items():
|
||||
if k in filter:
|
||||
if k in hparam_filter:
|
||||
imported = _object_import_from_string(v)
|
||||
# TODO: Provide an appropriately informative error if the import fails
|
||||
# except ImportError as e:
@@ -170,7 +170,7 @@ def _realize_import_attrs(d, filter):
return d
|
||||
|
||||
|
||||
def _get_agents_configuration(hparam_set_name, log_dir=None, is_chief=False):
|
||||
def _get_agents_configuration(log_dir=None):
|
||||
"""Load hyperparameter config."""
|
||||
try:
|
||||
# Try to resume training.
@@ -243,23 +243,20 @@ def gcs_upload(local_dir, gcs_out_dir):
blob.upload_from_filename(local_file_path)
|
||||
|
||||
|
||||
def main(unused_argv):
|
||||
def main(_):
|
||||
"""Run training."""
|
||||
tf.logging.set_verbosity(tf.logging.INFO)
|
||||
|
||||
if FLAGS.debug:
|
||||
tf.logging.set_verbosity(tf.logging.DEBUG)
|
||||
|
||||
run_config = tf.contrib.learn.RunConfig()
|
||||
|
||||
log_dir = FLAGS.logdir
|
||||
|
||||
agents_config = _get_agents_configuration(
|
||||
FLAGS.hparam_set_id, log_dir, run_config.is_chief)
|
||||
agents_config = _get_agents_configuration(log_dir)
|
||||
|
||||
if FLAGS.run_mode == 'train':
|
||||
for score in agents.scripts.train.train(agents_config, env_processes=True):
|
||||
logging.info('Score {}.'.format(score))
|
||||
logging.info('Score %s.', score)
|
||||
if FLAGS.run_mode == 'render':
|
||||
now = datetime.datetime.now()
|
||||
subdir = now.strftime("%m%d-%H%M") + "-" + uuid.uuid4().hex[0:4]
@@ -2,16 +2,25 @@
Simple app that parses predictions from a trained model and displays them.
|
||||
"""
|
||||
|
||||
from flask import Flask, json, render_template, request
|
||||
import requests
|
||||
app = Flask(__name__)
|
||||
from flask import Flask, json, render_template, request
|
||||
APP = Flask(__name__)
|
||||
|
||||
@app.route("/")
|
||||
@APP.route("/")
|
||||
def index():
|
||||
"""Default route.
|
||||
|
||||
Placeholder, does nothing.
|
||||
"""
|
||||
return render_template("index.html")
|
||||
|
||||
@app.route("/summary", methods=['GET', 'POST'])
|
||||
@APP.route("/summary", methods=['GET', 'POST'])
|
||||
def summary():
|
||||
"""Main prediction route.
|
||||
|
||||
Provides a machine-generated summary of the given text. Sends a request to a live
|
||||
model trained on GitHub issues.
|
||||
"""
|
||||
if request.method == 'POST':
|
||||
issue_text = request.form["issue_text"]
@@ -23,17 +32,17 @@ def summary():
}
|
||||
}
|
||||
|
||||
r = requests.post(url = url,
|
||||
response = requests.post(url=url,
|
||||
headers=headers,
|
||||
data=json.dumps(json_data))
|
||||
|
||||
rjs = json.loads(r.text)
|
||||
summary = rjs["data"]["ndarray"][0][0]
|
||||
response_json = json.loads(response.text)
|
||||
issue_summary = response_json["data"]["ndarray"][0][0]
|
||||
|
||||
return render_template("summary.html",
|
||||
return render_template("issue_summary.html",
|
||||
issue_text=issue_text,
|
||||
summary = summary)
|
||||
issue_summary=issue_summary)
|
||||
return ('', 204)
|
||||
|
||||
if __name__ == '__main__':
|
||||
app.run(debug = True, host = '0.0.0.0', port = 80)
|
||||
|
||||
APP.run(debug=True, host='0.0.0.0', port=80)
@@ -1,22 +0,0 @@
from __future__ import print_function
|
||||
|
||||
import dill as dpickle
|
||||
import numpy as np
|
||||
from keras.models import load_model
|
||||
|
||||
from seq2seq_utils import Seq2Seq_Inference
|
||||
|
||||
|
||||
class IssueSummarization(object):
|
||||
|
||||
def __init__(self):
|
||||
with open('body_pp.dpkl', 'rb') as f:
|
||||
body_pp = dpickle.load(f)
|
||||
with open('title_pp.dpkl', 'rb') as f:
|
||||
title_pp = dpickle.load(f)
|
||||
self.model = Seq2Seq_Inference(encoder_preprocessor=body_pp,
|
||||
decoder_preprocessor=title_pp,
|
||||
seq2seq_model=load_model('seq2seq_model_tutorial.h5'))
|
||||
|
||||
def predict(self, X, feature_names):
|
||||
return np.asarray([[self.model.generate_issue_title(body[0])[1]] for body in X])
@@ -0,0 +1,25 @@
"""Generates predictions using a stored model.
|
||||
|
||||
Uses trained model files to generate a prediction.
|
||||
"""
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
import numpy as np
|
||||
import dill as dpickle
|
||||
from keras.models import load_model
|
||||
from seq2seq_utils import Seq2Seq_Inference
|
||||
|
||||
class IssueSummarization(object):
|
||||
|
||||
def __init__(self):
|
||||
with open('body_pp.dpkl', 'rb') as body_file:
|
||||
body_pp = dpickle.load(body_file)
|
||||
with open('title_pp.dpkl', 'rb') as title_file:
|
||||
title_pp = dpickle.load(title_file)
|
||||
self.model = Seq2Seq_Inference(encoder_preprocessor=body_pp,
|
||||
decoder_preprocessor=title_pp,
|
||||
seq2seq_model=load_model('seq2seq_model_tutorial.h5'))
|
||||
|
||||
def predict(self, input_text):
|
||||
return np.asarray([[self.model.generate_issue_title(body[0])[1]] for body in input_text])
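A hypothetical usage sketch for the class added above, assuming the file is named IssueSummarization.py and that body_pp.dpkl, title_pp.dpkl and seq2seq_model_tutorial.h5 are present in the working directory (this driver is not part of the commit):

from IssueSummarization import IssueSummarization

# predict() expects a list of single-element lists, one issue body per inner
# list, and returns an array of generated titles.
model = IssueSummarization()
print(model.predict([["tf-serving pod crashes when the gpu image is used"]]))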
@@ -1,16 +1,15 @@
import logging
|
||||
import dill as dpickle
|
||||
import numpy as np
|
||||
from matplotlib import pyplot as plt
|
||||
import tensorflow as tf
|
||||
from IPython.display import SVG, display
|
||||
from keras import backend as K
|
||||
from keras.layers import Input
|
||||
from keras.models import Model
|
||||
from IPython.display import SVG, display
|
||||
from keras.utils.vis_utils import model_to_dot
|
||||
import logging
|
||||
import numpy as np
|
||||
import dill as dpickle
|
||||
from annoy import AnnoyIndex
|
||||
from tqdm import tqdm, tqdm_notebook
|
||||
from random import random
|
||||
from nltk.translate.bleu_score import corpus_bleu
@@ -42,7 +41,7 @@ def load_text_processor(fname='title_pp.dpkl'):
pp = dpickle.load(f)
|
||||
|
||||
num_tokens = max(pp.id2token.keys()) + 1
|
||||
print(f'Size of vocabulary for {fname}: {num_tokens:,}')
|
||||
print('Size of vocabulary for {}: {}'.format(fname, num_tokens))
|
||||
return num_tokens, pp
@@ -73,8 +72,8 @@ def load_decoder_inputs(decoder_np_vecs='train_title_vecs.npy'):
# Decoder Target Data Is Ahead By 1 Time Step From Decoder Input Data (Teacher Forcing)
|
||||
decoder_target_data = vectorized_title[:, 1:]
|
||||
|
||||
print(f'Shape of decoder input: {decoder_input_data.shape}')
|
||||
print(f'Shape of decoder target: {decoder_target_data.shape}')
|
||||
print('Shape of decoder input: {}'.format(decoder_input_data.shape))
|
||||
print('Shape of decoder target: {}'.format(decoder_target_data.shape))
|
||||
return decoder_input_data, decoder_target_data
@@ -100,7 +99,7 @@ def load_encoder_inputs(encoder_np_vecs='train_body_vecs.npy'):
# Encoder input is simply the body of the issue text
|
||||
encoder_input_data = vectorized_body
|
||||
doc_length = encoder_input_data.shape[1]
|
||||
print(f'Shape of encoder input: {encoder_input_data.shape}')
|
||||
print('Shape of encoder input: {}'.format(encoder_input_data.shape))
|
||||
return encoder_input_data, doc_length
@@ -195,16 +194,17 @@ def extract_decoder_model(model):
dec_bn = model.get_layer('Decoder-Batchnorm-1')(dec_emb)
|
||||
|
||||
# Instead of setting the intial state from the encoder and forgetting about it, during inference
|
||||
# we are not doing teacher forcing, so we will have to have a feedback loop from predictions back into
|
||||
# the GRU, thus we define this input layer for the state so we can add this capability
|
||||
# we are not doing teacher forcing, so we will have to have a feedback loop from predictions back
|
||||
# into the GRU, thus we define this input layer for the state so we can add this capability
|
||||
gru_inference_state_input = Input(shape=(latent_dim,), name='hidden_state_input')
|
||||
|
||||
# we need to reuse the weights that is why we are getting this
|
||||
# If you inspect the decoder GRU that we created for training, it will take as input
|
||||
# 2 tensors -> (1) is the embedding layer output for the teacher forcing
|
||||
# (which will now be the last step's prediction, and will be _start_ on the first time step)
|
||||
# (2) is the state, which we will initialize with the encoder on the first time step, but then
|
||||
# grab the state after the first prediction and feed that back in again.
|
||||
# (which will now be the last step's prediction, and will be _start_ on the
|
||||
# first time step)
|
||||
# (2) is the state, which we will initialize with the encoder on the first time step
|
||||
# but then grab the state after the first prediction and feed that back in again.
|
||||
gru_out, gru_state_out = model.get_layer('Decoder-GRU')([dec_bn, gru_inference_state_input])
|
||||
|
||||
# Reconstruct dense layers
@@ -216,6 +216,9 @@ def extract_decoder_model(model):
class Seq2Seq_Inference(object):
|
||||
|
||||
# pylint: disable=too-many-instance-attributes
|
||||
|
||||
def __init__(self,
|
||||
encoder_preprocessor,
|
||||
decoder_preprocessor,
@@ -291,18 +294,18 @@ class Seq2Seq_Inference(object):
"""
|
||||
if i:
|
||||
print('\n\n==============================================')
|
||||
print(f'============== Example # {i} =================\n')
|
||||
print('============== Example # {} =================\n'.format(i))
|
||||
|
||||
if url:
|
||||
print(url)
|
||||
|
||||
print(f"Issue Body:\n {body_text} \n")
|
||||
print("Issue Body:\n {} \n".format(body_text))
|
||||
|
||||
if title_text:
|
||||
print(f"Original Title:\n {title_text}")
|
||||
print("Original Title:\n {}".format(title_text))
|
||||
|
||||
emb, gen_title = self.generate_issue_title(body_text)
|
||||
print(f"\n****** Machine Generated Title (Prediction) ******:\n {gen_title}")
|
||||
print("\n****** Machine Generated Title (Prediction) ******:\n {}".format(gen_title))
|
||||
|
||||
if self.nn:
|
||||
# return neighbors and distances
@@ -315,7 +318,7 @@ class Seq2Seq_Inference(object):
cols = ['issue_url', 'issue_title', 'body']
|
||||
dfcopy = self.rec_df.iloc[neighbors][cols].copy(deep=True)
|
||||
dfcopy['dist'] = dist
|
||||
similar_issues_df = dfcopy.query(f'dist <= {threshold}')
|
||||
similar_issues_df = dfcopy.query('dist <= {}'.format(threshold))
|
||||
|
||||
print("\n**** Similar Issues (using encoder embedding) ****:\n")
|
||||
display(similar_issues_df)
@@ -1,8 +1,6 @@
import argparse
|
||||
import keras
|
||||
import pandas as pd
|
||||
from seq2seq_utils import load_decoder_inputs
|
||||
from seq2seq_utils import load_encoder_inputs
|
||||
from seq2seq_utils import load_text_processor
|
||||
from seq2seq_utils import Seq2Seq_Inference
|
@@ -1,7 +1,7 @@
import argparse
|
||||
import dill as dpickle
|
||||
from ktext.preprocess import processor
|
||||
import numpy as np
|
||||
from ktext.preprocess import processor
|
||||
import pandas as pd
|
||||
|
||||
# Parsing flags.
@@ -1,6 +1,4 @@
import argparse
|
||||
import glob
|
||||
import logging
|
||||
import pandas as pd
|
||||
from sklearn.model_selection import train_test_split
@@ -20,8 +18,8 @@ traindf, testdf = train_test_split(pd.read_csv(args.input_csv).sample(n=args.sam
test_size=.10)
|
||||
|
||||
# Print stats about the shape of the data.
|
||||
print(f'Train: {traindf.shape[0]:,} rows {traindf.shape[1]:,} columns')
|
||||
print(f'Test: {testdf.shape[0]:,} rows {testdf.shape[1]:,} columns')
|
||||
print('Train: {:,} rows {:,} columns'.format(traindf.shape[0], traindf.shape[1]))
|
||||
print('Test: {:,} rows {:,} columns'.format(testdf.shape[0], testdf.shape[1]))
|
||||
|
||||
# Store output as CSV.
|
||||
traindf.to_csv(args.output_traindf_csv)
@@ -1,8 +1,6 @@
import argparse
|
||||
import keras
|
||||
import pandas as pd
|
||||
from seq2seq_utils import load_decoder_inputs
|
||||
from seq2seq_utils import load_encoder_inputs
|
||||
from seq2seq_utils import load_text_processor
|
||||
from seq2seq_utils import Seq2Seq_Inference
|
||||
workflow/workspace/src/seq2seq_utils.py
@@ -1,393 +0,0 @@
from matplotlib import pyplot as plt
|
||||
import tensorflow as tf
|
||||
from keras import backend as K
|
||||
from keras.layers import Input
|
||||
from keras.models import Model
|
||||
from IPython.display import SVG, display
|
||||
from keras.utils.vis_utils import model_to_dot
|
||||
import logging
|
||||
import numpy as np
|
||||
import dill as dpickle
|
||||
from annoy import AnnoyIndex
|
||||
from tqdm import tqdm, tqdm_notebook
|
||||
from random import random
|
||||
from nltk.translate.bleu_score import corpus_bleu
|
||||
|
||||
def load_text_processor(fname='title_pp.dpkl'):
|
||||
"""
|
||||
Load preprocessors from disk.
|
||||
Parameters
|
||||
----------
|
||||
fname: str
|
||||
file name of ktext.proccessor object
|
||||
Returns
|
||||
-------
|
||||
num_tokens : int
|
||||
size of vocabulary loaded into ktext.processor
|
||||
pp : ktext.processor
|
||||
the processor you are trying to load
|
||||
Typical Usage:
|
||||
-------------
|
||||
num_decoder_tokens, title_pp = load_text_processor(fname='title_pp.dpkl')
|
||||
num_encoder_tokens, body_pp = load_text_processor(fname='body_pp.dpkl')
|
||||
"""
|
||||
# Load files from disk
|
||||
with open(fname, 'rb') as f:
|
||||
pp = dpickle.load(f)
|
||||
|
||||
num_tokens = max(pp.id2token.keys()) + 1
|
||||
print(f'Size of vocabulary for {fname}: {num_tokens:,}')
|
||||
return num_tokens, pp
|
||||
|
||||
|
||||
def load_decoder_inputs(decoder_np_vecs='train_title_vecs.npy'):
|
||||
"""
|
||||
Load decoder inputs.
|
||||
Parameters
|
||||
----------
|
||||
decoder_np_vecs : str
|
||||
filename of serialized numpy.array of decoder input (issue title)
|
||||
Returns
|
||||
-------
|
||||
decoder_input_data : numpy.array
|
||||
The data fed to the decoder as input during training for teacher forcing.
|
||||
This is the same as `decoder_np_vecs` except the last position.
|
||||
decoder_target_data : numpy.array
|
||||
The data that the decoder data is trained to generate (issue title).
|
||||
Calculated by sliding `decoder_np_vecs` one position forward.
|
||||
"""
|
||||
vectorized_title = np.load(decoder_np_vecs)
|
||||
# For Decoder Input, you don't need the last word as that is only for prediction
|
||||
# when we are training using Teacher Forcing.
|
||||
decoder_input_data = vectorized_title[:, :-1]
|
||||
|
||||
# Decoder Target Data Is Ahead By 1 Time Step From Decoder Input Data (Teacher Forcing)
|
||||
decoder_target_data = vectorized_title[:, 1:]
|
||||
|
||||
print(f'Shape of decoder input: {decoder_input_data.shape}')
|
||||
print(f'Shape of decoder target: {decoder_target_data.shape}')
|
||||
return decoder_input_data, decoder_target_data
|
||||
|
||||
|
||||
def load_encoder_inputs(encoder_np_vecs='train_body_vecs.npy'):
|
||||
"""
|
||||
Load variables & data that are inputs to encoder.
|
||||
Parameters
|
||||
----------
|
||||
encoder_np_vecs : str
|
||||
filename of serialized numpy.array of encoder input (issue title)
|
||||
Returns
|
||||
-------
|
||||
encoder_input_data : numpy.array
|
||||
The issue body
|
||||
doc_length : int
|
||||
The standard document length of the input for the encoder after padding
|
||||
the shape of this array will be (num_examples, doc_length)
|
||||
"""
|
||||
vectorized_body = np.load(encoder_np_vecs)
|
||||
# Encoder input is simply the body of the issue text
|
||||
encoder_input_data = vectorized_body
|
||||
doc_length = encoder_input_data.shape[1]
|
||||
print(f'Shape of encoder input: {encoder_input_data.shape}')
|
||||
return encoder_input_data, doc_length
|
||||
|
||||
|
||||
def viz_model_architecture(model):
|
||||
"""Visualize model architecture in Jupyter notebook."""
|
||||
display(SVG(model_to_dot(model).create(prog='dot', format='svg')))
|
||||
|
||||
|
||||
def free_gpu_mem():
|
||||
"""Attempt to free gpu memory."""
|
||||
K.get_session().close()
|
||||
cfg = K.tf.ConfigProto()
|
||||
cfg.gpu_options.allow_growth = True
|
||||
K.set_session(K.tf.Session(config=cfg))
|
||||
|
||||
|
||||
def test_gpu():
|
||||
"""Run a toy computation task in tensorflow to test GPU."""
|
||||
config = tf.ConfigProto()
|
||||
config.gpu_options.allow_growth = True
|
||||
session = tf.Session(config=config)
|
||||
hello = tf.constant('Hello, TensorFlow!')
|
||||
print(session.run(hello))
|
||||
|
||||
|
||||
def plot_model_training_history(history_object):
|
||||
"""Plots model train vs. validation loss."""
|
||||
plt.title('model accuracy')
|
||||
plt.ylabel('accuracy')
|
||||
plt.xlabel('epoch')
|
||||
plt.plot(history_object.history['loss'])
|
||||
plt.plot(history_object.history['val_loss'])
|
||||
plt.legend(['train', 'test'], loc='upper left')
|
||||
plt.show()
|
||||
|
||||
|
||||
def extract_encoder_model(model):
|
||||
"""
|
||||
Extract the encoder from the original Sequence to Sequence Model.
|
||||
Returns a keras model object that has one input (body of issue) and one
|
||||
output (encoding of issue, which is the last hidden state).
|
||||
Input:
|
||||
-----
|
||||
model: keras model object
|
||||
Returns:
|
||||
-----
|
||||
keras model object
|
||||
"""
|
||||
encoder_model = model.get_layer('Encoder-Model')
|
||||
return encoder_model
|
||||
|
||||
|
||||
def extract_decoder_model(model):
|
||||
"""
|
||||
Extract the decoder from the original model.
|
||||
Inputs:
|
||||
------
|
||||
model: keras model object
|
||||
Returns:
|
||||
-------
|
||||
A Keras model object with the following inputs and outputs:
|
||||
Inputs of Keras Model That Is Returned:
|
||||
1: the embedding index for the last predicted word or the <Start> indicator
|
||||
2: the last hidden state, or in the case of the first word the hidden state from the encoder
|
||||
Outputs of Keras Model That Is Returned:
|
||||
1. Prediction (class probabilities) for the next word
|
||||
2. The hidden state of the decoder, to be fed back into the decoder at the next time step
|
||||
Implementation Notes:
|
||||
----------------------
|
||||
Must extract relevant layers and reconstruct part of the computation graph
|
||||
to allow for different inputs as we are not going to use teacher forcing at
|
||||
inference time.
|
||||
"""
|
||||
# the latent dimension is the same throughout the architecture so we are going to
|
||||
# cheat and grab the latent dimension of the embedding because that is the same as what is
|
||||
# output from the decoder
|
||||
latent_dim = model.get_layer('Decoder-Word-Embedding').output_shape[-1]
|
||||
|
||||
# Reconstruct the input into the decoder
|
||||
decoder_inputs = model.get_layer('Decoder-Input').input
|
||||
dec_emb = model.get_layer('Decoder-Word-Embedding')(decoder_inputs)
|
||||
dec_bn = model.get_layer('Decoder-Batchnorm-1')(dec_emb)
|
||||
|
||||
# Instead of setting the intial state from the encoder and forgetting about it, during inference
|
||||
# we are not doing teacher forcing, so we will have to have a feedback loop from predictions back into
|
||||
# the GRU, thus we define this input layer for the state so we can add this capability
|
||||
gru_inference_state_input = Input(shape=(latent_dim,), name='hidden_state_input')
|
||||
|
||||
# we need to reuse the weights that is why we are getting this
|
||||
# If you inspect the decoder GRU that we created for training, it will take as input
|
||||
# 2 tensors -> (1) is the embedding layer output for the teacher forcing
|
||||
# (which will now be the last step's prediction, and will be _start_ on the first time step)
|
||||
# (2) is the state, which we will initialize with the encoder on the first time step, but then
|
||||
# grab the state after the first prediction and feed that back in again.
|
||||
gru_out, gru_state_out = model.get_layer('Decoder-GRU')([dec_bn, gru_inference_state_input])
|
||||
|
||||
# Reconstruct dense layers
|
||||
dec_bn2 = model.get_layer('Decoder-Batchnorm-2')(gru_out)
|
||||
dense_out = model.get_layer('Final-Output-Dense')(dec_bn2)
|
||||
decoder_model = Model([decoder_inputs, gru_inference_state_input],
|
||||
[dense_out, gru_state_out])
|
||||
return decoder_model
|
||||
|
||||
|
||||
class Seq2Seq_Inference(object):
|
||||
def __init__(self,
|
||||
encoder_preprocessor,
|
||||
decoder_preprocessor,
|
||||
seq2seq_model):
|
||||
|
||||
self.pp_body = encoder_preprocessor
|
||||
self.pp_title = decoder_preprocessor
|
||||
self.seq2seq_model = seq2seq_model
|
||||
self.encoder_model = extract_encoder_model(seq2seq_model)
|
||||
self.decoder_model = extract_decoder_model(seq2seq_model)
|
||||
self.default_max_len_title = self.pp_title.padding_maxlen
|
||||
self.nn = None
|
||||
self.rec_df = None
|
||||
|
||||
def generate_issue_title(self,
|
||||
raw_input_text,
|
||||
max_len_title=None):
|
||||
"""
|
||||
Use the seq2seq model to generate a title given the body of an issue.
|
||||
Inputs
|
||||
------
|
||||
raw_input: str
|
||||
The body of the issue text as an input string
|
||||
max_len_title: int (optional)
|
||||
The maximum length of the title the model will generate
|
||||
"""
|
||||
if max_len_title is None:
|
||||
max_len_title = self.default_max_len_title
|
||||
# get the encoder's features for the decoder
|
||||
raw_tokenized = self.pp_body.transform([raw_input_text])
|
||||
body_encoding = self.encoder_model.predict(raw_tokenized)
|
||||
# we want to save the encoder's embedding before its updated by decoder
|
||||
# because we can use that as an embedding for other tasks.
|
||||
original_body_encoding = body_encoding
|
||||
state_value = np.array(self.pp_title.token2id['_start_']).reshape(1, 1)
|
||||
|
||||
decoded_sentence = []
|
||||
stop_condition = False
|
||||
while not stop_condition:
|
||||
preds, st = self.decoder_model.predict([state_value, body_encoding])
|
||||
|
||||
# We are going to ignore indices 0 (padding) and indices 1 (unknown)
|
||||
# Argmax will return the integer index corresponding to the
|
||||
# prediction + 2 b/c we chopped off first two
|
||||
pred_idx = np.argmax(preds[:, :, 2:]) + 2
|
||||
|
||||
# retrieve word from index prediction
|
||||
pred_word_str = self.pp_title.id2token[pred_idx]
|
||||
|
||||
if pred_word_str == '_end_' or len(decoded_sentence) >= max_len_title:
|
||||
stop_condition = True
|
||||
break
|
||||
decoded_sentence.append(pred_word_str)
|
||||
|
||||
# update the decoder for the next word
|
||||
body_encoding = st
|
||||
state_value = np.array(pred_idx).reshape(1, 1)
|
||||
|
||||
return original_body_encoding, ' '.join(decoded_sentence)
|
||||
|
||||
|
||||
def print_example(self,
|
||||
i,
|
||||
body_text,
|
||||
title_text,
|
||||
url,
|
||||
threshold):
|
||||
"""
|
||||
Prints an example of the model's prediction for manual inspection.
|
||||
"""
|
||||
if i:
|
||||
print('\n\n==============================================')
|
||||
print(f'============== Example # {i} =================\n')
|
||||
|
||||
if url:
|
||||
print(url)
|
||||
|
||||
print(f"Issue Body:\n {body_text} \n")
|
||||
|
||||
if title_text:
|
||||
print(f"Original Title:\n {title_text}")
|
||||
|
||||
emb, gen_title = self.generate_issue_title(body_text)
|
||||
print(f"\n****** Machine Generated Title (Prediction) ******:\n {gen_title}")
|
||||
|
||||
if self.nn:
|
||||
# return neighbors and distances
|
||||
n, d = self.nn.get_nns_by_vector(emb.flatten(), n=4,
|
||||
include_distances=True)
|
||||
neighbors = n[1:]
|
||||
dist = d[1:]
|
||||
|
||||
if min(dist) <= threshold:
|
||||
cols = ['issue_url', 'issue_title', 'body']
|
||||
dfcopy = self.rec_df.iloc[neighbors][cols].copy(deep=True)
|
||||
dfcopy['dist'] = dist
|
||||
similar_issues_df = dfcopy.query(f'dist <= {threshold}')
|
||||
|
||||
print("\n**** Similar Issues (using encoder embedding) ****:\n")
|
||||
display(similar_issues_df)
|
||||
|
||||
|
||||
def demo_model_predictions(self,
|
||||
n,
|
||||
issue_df,
|
||||
threshold=1):
|
||||
"""
|
||||
Pick n random Issues and display predictions.
|
||||
Input:
|
||||
------
|
||||
n : int
|
||||
Number of issues to display from issue_df
|
||||
issue_df : pandas DataFrame
|
||||
DataFrame that contains two columns: `body` and `issue_title`.
|
||||
threshold : float
|
||||
distance threshold for recommendation of similar issues.
|
||||
Returns:
|
||||
--------
|
||||
None
|
||||
Prints the original issue body and the model's prediction.
|
||||
"""
|
||||
# Extract body and title from DF
|
||||
body_text = issue_df.body.tolist()
|
||||
title_text = issue_df.issue_title.tolist()
|
||||
url = issue_df.issue_url.tolist()
|
||||
|
||||
demo_list = np.random.randint(low=1, high=len(body_text), size=n)
|
||||
for i in demo_list:
|
||||
self.print_example(i,
|
||||
body_text=body_text[i],
|
||||
title_text=title_text[i],
|
||||
url=url[i],
|
||||
threshold=threshold)
|
||||
|
||||
def prepare_recommender(self, vectorized_array, original_df):
|
||||
"""
|
||||
Use the annoy library to build recommender
|
||||
Parameters
|
||||
----------
|
||||
vectorized_array : List[List[int]]
|
||||
This is the list of list of integers that represents your corpus
|
||||
that is fed into the seq2seq model for training.
|
||||
original_df : pandas.DataFrame
|
||||
This is the original dataframe that has the columns
|
||||
['issue_url', 'issue_title', 'body']
|
||||
Returns
|
||||
-------
|
||||
annoy.AnnoyIndex object (see https://github.com/spotify/annoy)
|
||||
"""
|
||||
self.rec_df = original_df
|
||||
emb = self.encoder_model.predict(x=vectorized_array,
|
||||
batch_size=vectorized_array.shape[0]//200)
|
||||
|
||||
f = emb.shape[1]
|
||||
self.nn = AnnoyIndex(f)
|
||||
logging.warning('Adding embeddings')
|
||||
for i in tqdm(range(len(emb))):
|
||||
self.nn.add_item(i, emb[i])
|
||||
logging.warning('Building trees for similarity lookup.')
|
||||
self.nn.build(50)
|
||||
return self.nn
|
||||
|
||||
def set_recsys_data(self, original_df):
|
||||
self.rec_df = original_df
|
||||
|
||||
def set_recsys_annoyobj(self, annoyobj):
|
||||
self.nn = annoyobj
|
||||
|
||||
def evaluate_model(self, holdout_bodies, holdout_titles):
|
||||
"""
|
||||
Method for calculating BLEU Score.
|
||||
Parameters
|
||||
----------
|
||||
holdout_bodies : List[str]
|
||||
These are the issue bodies that we want to summarize
|
||||
holdout_titles : List[str]
|
||||
This is the ground truth we are trying to predict --> issue titles
|
||||
Returns
|
||||
-------
|
||||
bleu : float
|
||||
The BLEU Score
|
||||
"""
|
||||
actual, predicted = list(), list()
|
||||
assert len(holdout_bodies) == len(holdout_titles)
|
||||
num_examples = len(holdout_bodies)
|
||||
|
||||
logging.warning('Generating predictions.')
|
||||
# step over the whole set TODO: parallelize this
|
||||
for i in tqdm_notebook(range(num_examples)):
|
||||
_, yhat = self.generate_issue_title(holdout_bodies[i])
|
||||
|
||||
actual.append(self.pp_title.process_text([holdout_titles[i]])[0])
|
||||
predicted.append(self.pp_title.process_text([yhat])[0])
|
||||
# calculate BLEU score
|
||||
logging.warning('Calculating BLEU.')
|
||||
bleu = corpus_bleu(actual, predicted)
|
||||
return bleu
@@ -1,11 +1,10 @@
import argparse
|
||||
import numpy as np
|
||||
from keras.callbacks import CSVLogger, ModelCheckpoint
|
||||
from keras.layers import Input, LSTM, GRU, Dense, Embedding, Bidirectional, BatchNormalization
|
||||
from keras.layers import Input, GRU, Dense, Embedding, BatchNormalization
|
||||
from keras.models import Model
|
||||
from keras import optimizers
|
||||
import numpy as np
|
||||
from seq2seq_utils import load_decoder_inputs, load_encoder_inputs, load_text_processor
|
||||
from seq2seq_utils import viz_model_architecture
|
||||
|
||||
# Parsing flags.
|
||||
parser = argparse.ArgumentParser()
@@ -35,7 +34,10 @@ latent_dim = 300
encoder_inputs = Input(shape=(doc_length,), name='Encoder-Input')
|
||||
|
||||
# Word embeding for encoder (ex: Issue Body)
|
||||
x = Embedding(num_encoder_tokens, latent_dim, name='Body-Word-Embedding', mask_zero=False)(encoder_inputs)
|
||||
x = Embedding(num_encoder_tokens,
|
||||
latent_dim,
|
||||
name='Body-Word-Embedding',
|
||||
mask_zero=False)(encoder_inputs)
|
||||
x = BatchNormalization(name='Encoder-Batchnorm-1')(x)
|
||||
|
||||
# We do not need the `encoder_output` just the hidden state.
@@ -53,7 +55,10 @@ seq2seq_encoder_out = encoder_model(encoder_inputs)
decoder_inputs = Input(shape=(None,), name='Decoder-Input') # for teacher forcing
|
||||
|
||||
# Word Embedding For Decoder (ex: Issue Titles)
|
||||
dec_emb = Embedding(num_decoder_tokens, latent_dim, name='Decoder-Word-Embedding', mask_zero=False)(decoder_inputs)
|
||||
dec_emb = Embedding(num_decoder_tokens,
|
||||
latent_dim,
|
||||
name='Decoder-Word-Embedding',
|
||||
mask_zero=False)(decoder_inputs)
|
||||
dec_bn = BatchNormalization(name='Decoder-Batchnorm-1')(dec_emb)
|
||||
|
||||
# Set up the decoder, using `decoder_state_input` as initial state.
|
@@ -71,21 +76,24 @@ decoder_outputs = decoder_dense(x)
seq2seq_Model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
|
||||
|
||||
seq2seq_Model.compile(optimizer=optimizers.Nadam(lr=learning_rate), loss='sparse_categorical_crossentropy')
|
||||
seq2seq_Model.compile(optimizer=optimizers.Nadam(lr=learning_rate),
|
||||
loss='sparse_categorical_crossentropy')
|
||||
|
||||
seq2seq_Model.summary()
|
||||
|
||||
script_name_base = 'tutorial_seq2seq'
|
||||
csv_logger = CSVLogger('{:}.log'.format(script_name_base))
|
||||
model_checkpoint = ModelCheckpoint('{:}.epoch{{epoch:02d}}-val{{val_loss:.5f}}.hdf5'.format(script_name_base),
|
||||
save_best_only=True)
|
||||
model_checkpoint = ModelCheckpoint(
|
||||
'{:}.epoch{{epoch:02d}}-val{{val_loss:.5f}}.hdf5'.format(script_name_base), save_best_only=True)
|
||||
|
||||
batch_size = 1200
|
||||
epochs = 7
|
||||
history = seq2seq_Model.fit([encoder_input_data, decoder_input_data], np.expand_dims(decoder_target_data, -1),
|
||||
history = seq2seq_Model.fit([encoder_input_data, decoder_input_data],
|
||||
np.expand_dims(decoder_target_data, -1),
|
||||
batch_size=batch_size,
|
||||
epochs=epochs,
|
||||
validation_split=0.12, callbacks=[csv_logger, model_checkpoint])
|
||||
validation_split=0.12,
|
||||
callbacks=[csv_logger, model_checkpoint])
|
||||
|
||||
#############
|
||||
# Save model.