Merge UI and Search Server (#209)

* Use the nicer tf.gfile interface for search index creation

* Update documentation and more maintainable interface to search server

* Add ability to control number of outputs

* Serve React UI from the Flask server

* Update Dockerfile for the unified server and ui
This commit is contained in:
Sanyam Kapoor 2018-08-03 15:56:09 -07:00 committed by k8s-ci-robot
parent b6a4d06f00
commit f2151f66fc
28 changed files with 229 additions and 2696 deletions

View File

@ -1,13 +1,23 @@
FROM node:10.6
ARG BASE_IMAGE_TAG=1.8.0
ADD ui/ /ui
FROM tensorflow/tensorflow:$BASE_IMAGE_TAG
WORKDIR /ui
RUN curl -sL https://deb.nodesource.com/setup_10.x | bash - &&\
apt-get install -y nodejs &&\
pip install Flask~=1.0.0 \
nmslib~=1.7.0 \
numpy~=1.14.0 \
oauth2client~=4.1.0 \
requests~=2.18.0 \
tensor2tensor~=1.6.0 &&\
rm -rf /var/lib/apt/lists/*
RUN npm i && npm run build && npm i -g serve
ADD src/ /src
EXPOSE 5000
WORKDIR /src
ENTRYPOINT ["serve"]
RUN cd ui && npm i && npm run build && cd ..
CMD ["-l", "5000", "-n", "/ui/build"]
EXPOSE 8008
ENTRYPOINT ["python"]

View File

@ -1,105 +0,0 @@
"""
This module serves as the entrypoint to either create an nmslib index or
start a Flask server to serve the index via a simple REST interface. It
internally talks to TF Serving for inference related tasks. The
two entrypoints `server` and `creator` are exposed as `nmslib-create`
and `nmslib-serve` binaries (see `setup.py`). Use `-h` to get a list
of input CLI arguments to both.
"""
import os
import argparse
import csv
import numpy as np
from code_search.nmslib.gcs import maybe_download_gcs_file, maybe_upload_gcs_file
from code_search.nmslib.search_engine import CodeSearchEngine
from code_search.nmslib.search_server import CodeSearchServer
def parse_server_args(args):
    """Parse CLI arguments for the nmslib Flask server.

    Args:
      args: A list of argument strings (or None for sys.argv).

    Returns:
      The parsed argparse.Namespace with all path arguments user-expanded.
    """
    parser = argparse.ArgumentParser(prog='nmslib Flask Server')
    parser.add_argument('--tmp-dir', type=str, metavar='', default='/tmp/nmslib',
                        help='Path to temporary data directory')
    parser.add_argument('--data-file', type=str, required=True,
                        help='Path to CSV file containing human-readable data')
    parser.add_argument('--index-file', type=str, required=True,
                        help='Path to index file created by nmslib')
    parser.add_argument('--problem', type=str, required=True,
                        help='Name of the T2T problem')
    parser.add_argument('--data-dir', type=str, required=True,
                        help='Path to working data directory')
    parser.add_argument('--serving-url', type=str, required=True,
                        help='Complete URL to TF Serving Inference server')
    parser.add_argument('--host', type=str, metavar='', default='0.0.0.0',
                        help='Host to start server on')
    parser.add_argument('--port', type=int, metavar='', default=8008,
                        help='Port to bind server to')

    parsed = parser.parse_args(args)

    # Normalize every path-like argument so "~" is usable on the CLI.
    for attr in ('tmp_dir', 'data_file', 'index_file', 'data_dir'):
        setattr(parsed, attr, os.path.expanduser(getattr(parsed, attr)))

    return parsed
def parse_creator_args(args):
    """Parse CLI arguments for the nmslib index creator.

    Args:
      args: A list of argument strings (or None for sys.argv).

    Returns:
      The parsed argparse.Namespace with all path arguments user-expanded.
    """
    parser = argparse.ArgumentParser(prog='nmslib Index Creator')
    parser.add_argument('--data-file', type=str, required=True,
                        help='Path to csv data file for human-readable data')
    parser.add_argument('--index-file', type=str, metavar='', default='/tmp/index.nmslib',
                        help='Path to output index file')
    parser.add_argument('--tmp-dir', type=str, metavar='', default='/tmp/nmslib',
                        help='Path to temporary data directory')

    parsed = parser.parse_args(args)

    # Expand "~" in every path argument in one pass.
    parsed.data_file, parsed.index_file, parsed.tmp_dir = (
        os.path.expanduser(path)
        for path in (parsed.data_file, parsed.index_file, parsed.tmp_dir))

    return parsed
def server(argv=None):
    """Entry point: start the Flask search server.

    Stages the index and data files locally (downloading from GCS when
    the arguments are GCS paths), builds a CodeSearchEngine and serves
    it over HTTP until interrupted.

    Args:
      argv: Optional list of CLI argument strings (defaults to sys.argv).
    """
    args = parse_server_args(argv)

    # exist_ok makes the previous isdir() guard redundant.
    os.makedirs(args.tmp_dir, exist_ok=True)

    local_index = maybe_download_gcs_file(args.index_file, args.tmp_dir)
    local_data = maybe_download_gcs_file(args.data_file, args.tmp_dir)

    engine = CodeSearchEngine(args.problem, args.data_dir, args.serving_url,
                              local_index, local_data)
    CodeSearchServer(engine=engine, host=args.host, port=args.port).run()
def creator(argv=None):
    """Entry point: build an nmslib index from a CSV of embeddings.

    Reads the embedding vectors (last CSV column, a comma-separated
    string of floats), builds the index in the temporary directory and
    uploads it to the target location (GCS or local).

    Args:
      argv: Optional list of CLI argument strings (defaults to sys.argv).
    """
    args = parse_creator_args(argv)

    os.makedirs(args.tmp_dir, exist_ok=True)

    data_file = maybe_download_gcs_file(args.data_file, args.tmp_dir)

    # PERF FIX: accumulate rows in a Python list and convert once at the
    # end. np.append copied the whole array on every iteration (O(n^2)).
    # This also removes the hard-coded 128 dimension: any embedding size
    # found in the file now works.
    embeddings = []
    with open(data_file, 'r') as csv_file:
        data_reader = csv.reader(csv_file)
        next(data_reader, None)  # Skip the header
        for row in data_reader:
            # The embedding string is the last column of each row.
            embeddings.append([float(value) for value in row[-1].split(',')])
    data = np.array(embeddings, dtype=np.float32)

    tmp_index_file = os.path.join(args.tmp_dir, os.path.basename(args.index_file))
    CodeSearchEngine.create_index(data, tmp_index_file)
    maybe_upload_gcs_file(tmp_index_file, args.index_file)

View File

@ -0,0 +1,45 @@
import argparse
import os
def add_common_arguments(parser):
    """Register CLI options shared by the index creator and the server.

    Args:
      parser: An argparse.ArgumentParser (or argument group) to extend.
    """
    # (flag, default, help) triples; every option is a string path.
    options = [
        ('--data_dir', None,
         'Path to directory with CSV files containing function embeddings'),
        ('--lookup_file', None,
         'Path to CSV file for reverse index lookup.'),
        ('--index_file', None,
         'Path to output index file'),
        ('--tmp_dir', '/tmp/code_search',
         'Path to temporary data directory'),
    ]
    for flag, default, help_text in options:
        parser.add_argument(flag, type=str, metavar='', default=default,
                            help=help_text)
def add_server_arguments(parser):
    """Register CLI options used only by the search server.

    Args:
      parser: An argparse.ArgumentParser (or argument group) to extend.
    """
    parser.add_argument('--problem', type=str, metavar='',
                        help='Name of the T2T problem')
    parser.add_argument('--serving_url', type=str, metavar='',
                        help='Complete URL to TF Serving Inference server')
    parser.add_argument('--host', type=str, metavar='', default='0.0.0.0',
                        help='Host to start server on')
    parser.add_argument('--port', type=int, metavar='', default=8008,
                        help='Port to bind server to')
    # BUG FIX: `--ui_dir` is a filesystem path, not a number. It was
    # declared with type=int, so any explicit value failed to parse.
    parser.add_argument('--ui_dir', type=str, metavar='',
                        help='Path to static assets for the UI')
def parse_arguments(argv=None):
    """Parse and normalize all CLI arguments.

    Args:
      argv: Optional list of argument strings (defaults to sys.argv).

    Returns:
      The parsed argparse.Namespace with all paths user-expanded and
      `ui_dir` resolved — either the user-supplied value or the bundled
      React build directory shipped alongside this package.
    """
    parser = argparse.ArgumentParser(prog='Code Search Index Server')

    add_common_arguments(parser)

    server_args_parser = parser.add_argument_group('Server Arguments')
    add_server_arguments(server_args_parser)

    args = parser.parse_args(argv)

    args.data_dir = os.path.expanduser(args.data_dir)
    args.lookup_file = os.path.expanduser(args.lookup_file)
    args.index_file = os.path.expanduser(args.index_file)
    args.tmp_dir = os.path.expanduser(args.tmp_dir)

    # BUG FIX: `--ui_dir` used to be unconditionally overwritten, so the
    # flag was silently ignored. Only fall back to the bundled UI build
    # when the user did not provide a value.
    if args.ui_dir:
        args.ui_dir = os.path.expanduser(args.ui_dir)
    else:
        args.ui_dir = os.path.abspath(
            os.path.join(__file__, '../../../../ui/build'))

    return args

View File

@ -0,0 +1,57 @@
import csv
import os
import numpy as np
import tensorflow as tf
import code_search.nmslib.cli.arguments as arguments
import code_search.nmslib.search_engine as search_engine
def create_search_index(argv=None):
    """Build the NMSLib index plus its reverse-lookup CSV.

    Walks every CSV file under `--data_dir`, writes all non-embedding
    columns into a single lookup CSV (row order matches the index ids)
    and feeds the embedding column — the last one, a comma-separated
    string of floats — into a fresh NMSLib index. Both artifacts are
    staged in `--tmp_dir` and then copied to their destinations, which
    may be GCS paths.

    Args:
      argv: A list of strings representing command line arguments.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    args = arguments.parse_arguments(argv)

    if not os.path.isdir(args.tmp_dir):
        os.makedirs(args.tmp_dir)

    tmp_index_file = os.path.join(args.tmp_dir, os.path.basename(args.index_file))
    tmp_lookup_file = os.path.join(args.tmp_dir, os.path.basename(args.lookup_file))

    vectors = []
    with open(tmp_lookup_file, 'w') as lookup_file:
        lookup_writer = csv.writer(lookup_file)
        for csv_file_path in tf.gfile.Glob('{}/*.csv'.format(args.data_dir)):
            tf.logging.debug('Reading {}'.format(csv_file_path))
            with tf.gfile.Open(csv_file_path) as csv_file:
                for row in csv.reader(csv_file):
                    # Last column holds the embedding; the rest is the
                    # human-readable metadata kept for reverse lookup.
                    *metadata, embedding_string = row
                    vectors.append(
                        [float(value) for value in embedding_string.split(',')])
                    lookup_writer.writerow(metadata)

    search_engine.CodeSearchEngine.create_index(np.array(vectors), tmp_index_file)

    tf.gfile.Copy(tmp_lookup_file, args.lookup_file)
    tf.gfile.Copy(tmp_index_file, args.index_file)


if __name__ == '__main__':
    create_search_index()

View File

@ -0,0 +1,66 @@
import csv
import json
import os
import functools
import requests
import tensorflow as tf
import code_search.nmslib.cli.arguments as arguments
import code_search.t2t.query as query
from code_search.nmslib.search_engine import CodeSearchEngine
from code_search.nmslib.search_server import CodeSearchServer
def embed_query(encoder, serving_url, query_str):
    """POST an encoded query to TF Serving and return its embedding.

    Args:
      encoder: Callable turning a query string into a base64 payload.
      serving_url: Full URL of the TF Serving prediction endpoint.
      query_str: The raw search query string.

    Returns:
      The `outputs` value of the first prediction in the JSON response.
    """
    payload = json.dumps(
        {"instances": [{"input": {"b64": encoder(query_str)}}]})
    response = requests.post(url=serving_url,
                             headers={'content-type': 'application/json'},
                             data=payload)
    return response.json()['predictions'][0]['outputs']
def start_search_server(argv=None):
    """Serve the code-search REST API and UI over Flask.

    Loads the reverse-lookup CSV into memory, stages the NMSLib index
    locally (copying from GCS when necessary), wires the query-embedding
    pipeline to TF Serving and starts the HTTP server.

    Args:
      argv: A list of strings representing command line arguments.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    args = arguments.parse_arguments(argv)

    if not os.path.isdir(args.tmp_dir):
        os.makedirs(args.tmp_dir)

    tf.logging.debug('Reading {}'.format(args.lookup_file))
    with tf.gfile.Open(args.lookup_file) as lookup_file:
        lookup_data = list(csv.reader(lookup_file))

    tf.logging.debug('Reading {}'.format(args.index_file))
    local_index = os.path.join(args.tmp_dir, os.path.basename(args.index_file))
    if not os.path.isfile(local_index):
        tf.gfile.Copy(args.index_file, local_index)

    # Compose: raw query string -> T2T-encoded payload -> served embedding.
    encoder = query.get_encoder(args.problem, args.data_dir)
    embedding_fn = functools.partial(
        embed_query,
        functools.partial(query.encode_query, encoder),
        args.serving_url)

    engine = CodeSearchEngine(local_index, lookup_data, embedding_fn)
    CodeSearchServer(engine, args.ui_dir, host=args.host, port=args.port).run()


if __name__ == '__main__':
    start_search_server()

View File

@ -1,68 +0,0 @@
import os
import re
import shutil

from google.cloud import storage
def is_gcs_path(gcs_path_string):
    """Return True when the string looks like a GCS path.

    Accepts "gs://bucket_name" and "gs://bucket_name/file/path"
    shaped strings; anything else is rejected.
    """
    gcs_pattern = re.compile(r'gs://([^/]+)(/.+)?')
    return gcs_pattern.match(gcs_path_string) is not None
def parse_gcs_path(gcs_path_string):
    """Split a GCS path string into (bucket_name, bucket_path).

    Args:
      gcs_path_string: A "gs://bucket[/object/path]" string.

    Returns:
      A (bucket_name, bucket_path) tuple; bucket_path is '' for
      bucket-only paths.

    Raises:
      ValueError: If the string is not a valid GCS path.
    """
    # Same pattern as is_gcs_path, inlined so this function is
    # self-contained and independently testable.
    if not re.match(r'gs://([^/]+)(/.+)?', gcs_path_string):
        raise ValueError("{} is not a valid GCS path".format(gcs_path_string))
    # BUG FIX: the old split('//') / split('/', 1) pair raised an unpack
    # ValueError for bucket-only paths like "gs://bucket" (which
    # is_gcs_path accepts) and for object names containing "//".
    full_path = gcs_path_string[len('gs://'):]
    bucket_name, _, bucket_path = full_path.partition('/')
    return bucket_name, bucket_path
def download_gcs_file(src_file, target_file):
    """Fetch a GCS blob into a local file and return the local path.

    Args:
      src_file: A "gs://bucket/path" source string.
      target_file: Local filesystem destination path.

    Returns:
      The target file path.
    """
    storage_client = storage.Client()
    bucket_name, bucket_path = parse_gcs_path(src_file)
    source_blob = storage_client.get_bucket(bucket_name).blob(bucket_path)
    source_blob.download_to_filename(target_file)
    return target_file
def maybe_download_gcs_file(src_file, target_dir):
    """Download `src_file` into `target_dir` when it is a GCS path.

    Local paths are returned unchanged; GCS paths are fetched and the
    path of the local copy is returned instead.
    """
    if is_gcs_path(src_file):
        local_copy = os.path.join(target_dir, os.path.basename(src_file))
        return download_gcs_file(src_file, local_copy)
    return src_file
def upload_gcs_file(src_file, target_file):
    """Push a local file to a GCS destination and return its path.

    Args:
      src_file: Local filesystem path to upload.
      target_file: A "gs://bucket/path" destination string.

    Returns:
      The target file path.
    """
    storage_client = storage.Client()
    bucket_name, bucket_path = parse_gcs_path(target_file)
    destination_blob = storage_client.get_bucket(bucket_name).blob(bucket_path)
    destination_blob.upload_from_filename(src_file)
    return target_file
def maybe_upload_gcs_file(src_file, target_file):
    """Upload `src_file` to `target_file` when the target is a GCS path.

    For local targets the file is moved on disk instead; in both cases
    the target file path is returned.
    """
    if not is_gcs_path(target_file):
        # BUG FIX: os.rename raises OSError when source and target live
        # on different filesystems (e.g. /tmp on tmpfs); shutil.move
        # falls back to copy+delete and works across mounts.
        shutil.move(src_file, target_file)
        return target_file
    return upload_gcs_file(src_file, target_file)

View File

@ -1,71 +1,46 @@
import json
import csv
import requests
import nmslib
from code_search.t2t.query import get_encoder, encode_query
class CodeSearchEngine:
"""This is a utility class which takes an nmslib
index file and a data file to return data from"""
def __init__(self, problem, data_dir, serving_url, index_file, data_file):
self._serving_url = serving_url
self._problem = problem
self._data_dir = data_dir
self._index_file = index_file
self._data_file = data_file
"""Instantiate the Search Index instance.
self._data_index = self.read_lookup_data_file(data_file)
This is a utility class which takes an nmslib
index file and a data file to return data from.
Args:
index_file: Path string to the nmslib index file.
lookup_data: A list representing the data in the same order as in index.
embedding_fn: A function which takes a string and returns a high-dimensional
embedding.
"""
DICT_LABELS = ['nwo', 'path', 'function_name', 'lineno', 'original_function']
def __init__(self, index_file, lookup_data, embedding_fn):
self.index = CodeSearchEngine.nmslib_init()
self.index.loadIndex(index_file)
def embed(self, query_str):
"""Get query embedding from TFServing
This involves encoding the input query
for the TF Serving service
"""
encoder = get_encoder(self._problem, self._data_dir)
encoded_query = encode_query(encoder, query_str)
data = {"instances": [{"input": {"b64": encoded_query}}]}
response = requests.post(url=self._serving_url,
headers={'content-type': 'application/json'},
data=json.dumps(data))
result = response.json()
result['predictions'] = [preds['outputs'] for preds in result['predictions']]
return result
self.lookup_data = lookup_data
self.embedding_fn = embedding_fn
def query(self, query_str, k=2):
embedding = self.embed(query_str)
idxs, dists = self.index.knnQuery(embedding['predictions'][0], k=k)
embedding = self.embedding_fn(query_str)
idxs, dists = self.index.knnQuery(embedding, k=k)
result = [self._data_index[id] for id in idxs]
result = [dict(zip(self.DICT_LABELS, self.lookup_data[id])) for id in idxs]
for i, dist in enumerate(dists):
result[i]['score'] = str(dist)
return result
@staticmethod
def read_lookup_data_file(data_file):
data_list = []
with open(data_file, 'r') as csv_file:
dict_reader = csv.DictReader(csv_file)
for row in dict_reader:
row.pop('function_embedding')
data_list.append(row)
return data_list
@staticmethod
def nmslib_init():
"""Initializes an nmslib index object"""
"""Initializes an nmslib index object."""
index = nmslib.init(method='hnsw', space='cosinesimil')
return index
@staticmethod
def create_index(data, save_path):
"""Add numpy data to the index and save to path"""
"""Add numpy data to the index and save to path."""
index = CodeSearchEngine.nmslib_init()
index.addDataPointBatch(data)
index.createIndex({'post': 2}, print_progress=True)

View File

@ -1,12 +1,23 @@
from flask import Flask, request, abort, jsonify, make_response
from flask_cors import CORS
from flask import Flask, request, abort, jsonify, make_response, redirect
class CodeSearchServer:
"""This utility class wraps the search engine into
an HTTP server based on Flask"""
def __init__(self, engine, host='0.0.0.0', port=8008):
self.app = Flask(__name__)
"""Flask server wrapping the Search Engine.
This utility class simply wraps the search engine
into an HTTP server based on Flask. The root path
is redirected to `index.html` as Flask does not
do that automatically.
Args:
engine: An instance of CodeSearchEngine.
ui_dir: Path to directory containing index.html and
other static assets for the web application.
host: A string host in IPv4 format.
port: An integer for port binding.
"""
def __init__(self, engine, ui_dir, host='0.0.0.0', port=8008):
self.app = Flask(__name__, static_folder=ui_dir, static_url_path='')
self.host = host
self.port = port
self.engine = engine
@ -16,6 +27,10 @@ class CodeSearchServer:
def init_routes(self):
# pylint: disable=unused-variable
@self.app.route('/')
def index():
return redirect('/index.html', code=302)
@self.app.route('/ping')
def ping():
return make_response(jsonify(status=200), 200)
@ -27,19 +42,9 @@ class CodeSearchServer:
abort(make_response(
jsonify(status=400, error="empty query"), 400))
result = self.engine.query(query_str)
return make_response(jsonify(result=result))
@self.app.route('/embed')
def embed():
query_str = request.args.get('q')
if not query_str:
abort(make_response(
jsonify(status=400, error="empty query"), 400))
result = self.engine.embed(query_str)
num_results = int(request.args.get('n', 2))
result = self.engine.query(query_str, k=num_results)
return make_response(jsonify(result=result))
def run(self):
CORS(self.app)
self.app.run(host=self.host, port=self.port)

View File

@ -1,8 +1,6 @@
astor~=0.6.0
apache-beam[gcp]~=2.5.0
Flask~=1.0.0
flask-cors~=3.0.0
google-cloud-storage~=1.10.0
nltk~=3.3.0
nmslib~=1.7.0
numpy~=1.14.0

View File

@ -57,10 +57,4 @@ setup(name='code-search',
cmdclass={
'build': Build,
'CustomCommands': CustomCommands,
},
entry_points={
'console_scripts': [
'nmslib-serve=code_search.nmslib.cli:server',
'nmslib-create=code_search.nmslib.cli:creator',
]
})

View File

Before

Width:  |  Height:  |  Size: 4.2 KiB

After

Width:  |  Height:  |  Size: 4.2 KiB

View File

@ -1,6 +1,6 @@
import request from 'superagent';
const SEARCH_URL='//localhost:8008/query'
const SEARCH_URL='/query';
function code_search_api(str) {
return request.get(SEARCH_URL).query({'q': str});

View File

Before

Width:  |  Height:  |  Size: 2.2 KiB

After

Width:  |  Height:  |  Size: 2.2 KiB

File diff suppressed because it is too large Load Diff