Merge UI and Search Server (#209)

* Use the nicer tf.gfile interface for search index creation

* Update documentation and more maintainable interface to search server

* Add ability to control number of outputs

* Serve React UI from the Flask server

* Update Dockerfile for the unified server and ui
This commit is contained in:
Sanyam Kapoor 2018-08-03 15:56:09 -07:00 committed by k8s-ci-robot
parent b6a4d06f00
commit f2151f66fc
28 changed files with 229 additions and 2696 deletions

View File

@ -1,13 +1,23 @@
FROM node:10.6
ARG BASE_IMAGE_TAG=1.8.0
ADD ui/ /ui
FROM tensorflow/tensorflow:$BASE_IMAGE_TAG
WORKDIR /ui
RUN curl -sL https://deb.nodesource.com/setup_10.x | bash - &&\
apt-get install -y nodejs &&\
pip install Flask~=1.0.0 \
nmslib~=1.7.0 \
numpy~=1.14.0 \
oauth2client~=4.1.0 \
requests~=2.18.0 \
tensor2tensor~=1.6.0 &&\
rm -rf /var/lib/apt/lists/*
RUN npm i && npm run build && npm i -g serve
ADD src/ /src
EXPOSE 5000
WORKDIR /src
ENTRYPOINT ["serve"]
RUN cd ui && npm i && npm run build && cd ..
CMD ["-l", "5000", "-n", "/ui/build"]
EXPOSE 8008
ENTRYPOINT ["python"]

View File

@ -1,105 +0,0 @@
"""
This module serves as the entrypoint to either create an nmslib index or
start a Flask server to serve the index via a simple REST interface. It
internally talks to TF Serving for inference related tasks. The
two entrypoints `server` and `creator` are exposed as `nmslib-create`
and `nmslib-serve` binaries (see `setup.py`). Use `-h` to get a list
of input CLI arguments to both.
"""
import os
import argparse
import csv
import numpy as np
from code_search.nmslib.gcs import maybe_download_gcs_file, maybe_upload_gcs_file
from code_search.nmslib.search_engine import CodeSearchEngine
from code_search.nmslib.search_server import CodeSearchServer
def parse_server_args(args):
    """Parse CLI arguments for the nmslib Flask server.

    Args:
      args: A list of argument strings (or None for sys.argv).

    Returns:
      The parsed argparse.Namespace with all path arguments user-expanded.
    """
    parser = argparse.ArgumentParser(prog='nmslib Flask Server')
    parser.add_argument('--tmp-dir', type=str, metavar='', default='/tmp/nmslib',
                        help='Path to temporary data directory')
    parser.add_argument('--data-file', type=str, required=True,
                        help='Path to CSV file containing human-readable data')
    parser.add_argument('--index-file', type=str, required=True,
                        help='Path to index file created by nmslib')
    parser.add_argument('--problem', type=str, required=True,
                        help='Name of the T2T problem')
    parser.add_argument('--data-dir', type=str, required=True,
                        help='Path to working data directory')
    parser.add_argument('--serving-url', type=str, required=True,
                        help='Complete URL to TF Serving Inference server')
    parser.add_argument('--host', type=str, metavar='', default='0.0.0.0',
                        help='Host to start server on')
    parser.add_argument('--port', type=int, metavar='', default=8008,
                        help='Port to bind server to')

    parsed = parser.parse_args(args)

    # Normalize every path-like argument so "~" is usable on the CLI.
    for attr in ('tmp_dir', 'data_file', 'index_file', 'data_dir'):
        setattr(parsed, attr, os.path.expanduser(getattr(parsed, attr)))

    return parsed
def parse_creator_args(args):
    """Parse CLI arguments for the nmslib index creator.

    Args:
      args: A list of argument strings (or None for sys.argv).

    Returns:
      The parsed argparse.Namespace with all path arguments user-expanded.
    """
    parser = argparse.ArgumentParser(prog='nmslib Index Creator')
    parser.add_argument('--data-file', type=str, required=True,
                        help='Path to csv data file for human-readable data')
    parser.add_argument('--index-file', type=str, metavar='', default='/tmp/index.nmslib',
                        help='Path to output index file')
    parser.add_argument('--tmp-dir', type=str, metavar='', default='/tmp/nmslib',
                        help='Path to temporary data directory')

    parsed = parser.parse_args(args)

    # Expand "~" in every path argument in one pass.
    parsed.data_file, parsed.index_file, parsed.tmp_dir = (
        os.path.expanduser(path)
        for path in (parsed.data_file, parsed.index_file, parsed.tmp_dir))

    return parsed
def server(argv=None):
    """Entry point: start the Flask search server.

    Stages the index and data files locally (downloading from GCS when
    the arguments are GCS paths), builds a CodeSearchEngine and serves
    it over HTTP until interrupted.

    Args:
      argv: Optional list of CLI argument strings (defaults to sys.argv).
    """
    args = parse_server_args(argv)

    # exist_ok makes the previous isdir() guard redundant.
    os.makedirs(args.tmp_dir, exist_ok=True)

    local_index = maybe_download_gcs_file(args.index_file, args.tmp_dir)
    local_data = maybe_download_gcs_file(args.data_file, args.tmp_dir)

    engine = CodeSearchEngine(args.problem, args.data_dir, args.serving_url,
                              local_index, local_data)
    CodeSearchServer(engine=engine, host=args.host, port=args.port).run()
def creator(argv=None):
    """Entry point: build an nmslib index from a CSV of embeddings.

    Reads the embedding vectors (last CSV column, a comma-separated
    string of floats), builds the index in the temporary directory and
    uploads it to the target location (GCS or local).

    Args:
      argv: Optional list of CLI argument strings (defaults to sys.argv).
    """
    args = parse_creator_args(argv)

    os.makedirs(args.tmp_dir, exist_ok=True)

    data_file = maybe_download_gcs_file(args.data_file, args.tmp_dir)

    # PERF FIX: accumulate rows in a Python list and convert once at the
    # end. np.append copied the whole array on every iteration (O(n^2)).
    # This also removes the hard-coded 128 dimension: any embedding size
    # found in the file now works.
    embeddings = []
    with open(data_file, 'r') as csv_file:
        data_reader = csv.reader(csv_file)
        next(data_reader, None)  # Skip the header
        for row in data_reader:
            # The embedding string is the last column of each row.
            embeddings.append([float(value) for value in row[-1].split(',')])
    data = np.array(embeddings, dtype=np.float32)

    tmp_index_file = os.path.join(args.tmp_dir, os.path.basename(args.index_file))
    CodeSearchEngine.create_index(data, tmp_index_file)
    maybe_upload_gcs_file(tmp_index_file, args.index_file)

View File

@ -0,0 +1,45 @@
import argparse
import os
def add_common_arguments(parser):
    """Register CLI options shared by the index creator and the server.

    Args:
      parser: An argparse.ArgumentParser (or argument group) to extend.
    """
    # (flag, default, help) triples; every option is a string path.
    options = [
        ('--data_dir', None,
         'Path to directory with CSV files containing function embeddings'),
        ('--lookup_file', None,
         'Path to CSV file for reverse index lookup.'),
        ('--index_file', None,
         'Path to output index file'),
        ('--tmp_dir', '/tmp/code_search',
         'Path to temporary data directory'),
    ]
    for flag, default, help_text in options:
        parser.add_argument(flag, type=str, metavar='', default=default,
                            help=help_text)
def add_server_arguments(parser):
    """Register CLI options used only by the search server.

    Args:
      parser: An argparse.ArgumentParser (or argument group) to extend.
    """
    parser.add_argument('--problem', type=str, metavar='',
                        help='Name of the T2T problem')
    parser.add_argument('--serving_url', type=str, metavar='',
                        help='Complete URL to TF Serving Inference server')
    parser.add_argument('--host', type=str, metavar='', default='0.0.0.0',
                        help='Host to start server on')
    parser.add_argument('--port', type=int, metavar='', default=8008,
                        help='Port to bind server to')
    # BUG FIX: `--ui_dir` is a filesystem path, not a number. It was
    # declared with type=int, so any explicit value failed to parse.
    parser.add_argument('--ui_dir', type=str, metavar='',
                        help='Path to static assets for the UI')
def parse_arguments(argv=None):
    """Parse and normalize all CLI arguments.

    Args:
      argv: Optional list of argument strings (defaults to sys.argv).

    Returns:
      The parsed argparse.Namespace with all paths user-expanded and
      `ui_dir` resolved — either the user-supplied value or the bundled
      React build directory shipped alongside this package.
    """
    parser = argparse.ArgumentParser(prog='Code Search Index Server')

    add_common_arguments(parser)

    server_args_parser = parser.add_argument_group('Server Arguments')
    add_server_arguments(server_args_parser)

    args = parser.parse_args(argv)

    args.data_dir = os.path.expanduser(args.data_dir)
    args.lookup_file = os.path.expanduser(args.lookup_file)
    args.index_file = os.path.expanduser(args.index_file)
    args.tmp_dir = os.path.expanduser(args.tmp_dir)

    # BUG FIX: `--ui_dir` used to be unconditionally overwritten, so the
    # flag was silently ignored. Only fall back to the bundled UI build
    # when the user did not provide a value.
    if args.ui_dir:
        args.ui_dir = os.path.expanduser(args.ui_dir)
    else:
        args.ui_dir = os.path.abspath(
            os.path.join(__file__, '../../../../ui/build'))

    return args

View File

@ -0,0 +1,57 @@
import csv
import os
import numpy as np
import tensorflow as tf
import code_search.nmslib.cli.arguments as arguments
import code_search.nmslib.search_engine as search_engine
def create_search_index(argv=None):
    """Build the NMSLib index plus its reverse-lookup CSV.

    Walks every CSV file under `--data_dir`, writes all non-embedding
    columns into a single lookup CSV (row order matches the index ids)
    and feeds the embedding column — the last one, a comma-separated
    string of floats — into a fresh NMSLib index. Both artifacts are
    staged in `--tmp_dir` and then copied to their destinations, which
    may be GCS paths.

    Args:
      argv: A list of strings representing command line arguments.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    args = arguments.parse_arguments(argv)

    if not os.path.isdir(args.tmp_dir):
        os.makedirs(args.tmp_dir)

    tmp_index_file = os.path.join(args.tmp_dir, os.path.basename(args.index_file))
    tmp_lookup_file = os.path.join(args.tmp_dir, os.path.basename(args.lookup_file))

    vectors = []
    with open(tmp_lookup_file, 'w') as lookup_file:
        lookup_writer = csv.writer(lookup_file)
        for csv_file_path in tf.gfile.Glob('{}/*.csv'.format(args.data_dir)):
            tf.logging.debug('Reading {}'.format(csv_file_path))
            with tf.gfile.Open(csv_file_path) as csv_file:
                for row in csv.reader(csv_file):
                    # Last column holds the embedding; the rest is the
                    # human-readable metadata kept for reverse lookup.
                    *metadata, embedding_string = row
                    vectors.append(
                        [float(value) for value in embedding_string.split(',')])
                    lookup_writer.writerow(metadata)

    search_engine.CodeSearchEngine.create_index(np.array(vectors), tmp_index_file)

    tf.gfile.Copy(tmp_lookup_file, args.lookup_file)
    tf.gfile.Copy(tmp_index_file, args.index_file)


if __name__ == '__main__':
    create_search_index()

View File

@ -0,0 +1,66 @@
import csv
import json
import os
import functools
import requests
import tensorflow as tf
import code_search.nmslib.cli.arguments as arguments
import code_search.t2t.query as query
from code_search.nmslib.search_engine import CodeSearchEngine
from code_search.nmslib.search_server import CodeSearchServer
def embed_query(encoder, serving_url, query_str):
    """POST an encoded query to TF Serving and return its embedding.

    Args:
      encoder: Callable turning a query string into a base64 payload.
      serving_url: Full URL of the TF Serving prediction endpoint.
      query_str: The raw search query string.

    Returns:
      The `outputs` value of the first prediction in the JSON response.
    """
    payload = json.dumps(
        {"instances": [{"input": {"b64": encoder(query_str)}}]})
    response = requests.post(url=serving_url,
                             headers={'content-type': 'application/json'},
                             data=payload)
    return response.json()['predictions'][0]['outputs']
def start_search_server(argv=None):
    """Serve the code-search REST API and UI over Flask.

    Loads the reverse-lookup CSV into memory, stages the NMSLib index
    locally (copying from GCS when necessary), wires the query-embedding
    pipeline to TF Serving and starts the HTTP server.

    Args:
      argv: A list of strings representing command line arguments.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    args = arguments.parse_arguments(argv)

    if not os.path.isdir(args.tmp_dir):
        os.makedirs(args.tmp_dir)

    tf.logging.debug('Reading {}'.format(args.lookup_file))
    with tf.gfile.Open(args.lookup_file) as lookup_file:
        lookup_data = list(csv.reader(lookup_file))

    tf.logging.debug('Reading {}'.format(args.index_file))
    local_index = os.path.join(args.tmp_dir, os.path.basename(args.index_file))
    if not os.path.isfile(local_index):
        tf.gfile.Copy(args.index_file, local_index)

    # Compose: raw query string -> T2T-encoded payload -> served embedding.
    encoder = query.get_encoder(args.problem, args.data_dir)
    embedding_fn = functools.partial(
        embed_query,
        functools.partial(query.encode_query, encoder),
        args.serving_url)

    engine = CodeSearchEngine(local_index, lookup_data, embedding_fn)
    CodeSearchServer(engine, args.ui_dir, host=args.host, port=args.port).run()


if __name__ == '__main__':
    start_search_server()

View File

@ -1,68 +0,0 @@
import os
import re
import shutil

from google.cloud import storage
def is_gcs_path(gcs_path_string):
    """Return True when the string looks like a GCS path.

    Accepts "gs://bucket_name" and "gs://bucket_name/file/path"
    shaped strings; anything else is rejected.
    """
    gcs_pattern = re.compile(r'gs://([^/]+)(/.+)?')
    return gcs_pattern.match(gcs_path_string) is not None
def parse_gcs_path(gcs_path_string):
    """Split a GCS path string into (bucket_name, bucket_path).

    Args:
      gcs_path_string: A "gs://bucket[/object/path]" string.

    Returns:
      A (bucket_name, bucket_path) tuple; bucket_path is '' for
      bucket-only paths.

    Raises:
      ValueError: If the string is not a valid GCS path.
    """
    # Same pattern as is_gcs_path, inlined so this function is
    # self-contained and independently testable.
    if not re.match(r'gs://([^/]+)(/.+)?', gcs_path_string):
        raise ValueError("{} is not a valid GCS path".format(gcs_path_string))
    # BUG FIX: the old split('//') / split('/', 1) pair raised an unpack
    # ValueError for bucket-only paths like "gs://bucket" (which
    # is_gcs_path accepts) and for object names containing "//".
    full_path = gcs_path_string[len('gs://'):]
    bucket_name, _, bucket_path = full_path.partition('/')
    return bucket_name, bucket_path
def download_gcs_file(src_file, target_file):
    """Fetch a GCS blob into a local file and return the local path.

    Args:
      src_file: A "gs://bucket/path" source string.
      target_file: Local filesystem destination path.

    Returns:
      The target file path.
    """
    storage_client = storage.Client()
    bucket_name, bucket_path = parse_gcs_path(src_file)
    source_blob = storage_client.get_bucket(bucket_name).blob(bucket_path)
    source_blob.download_to_filename(target_file)
    return target_file
def maybe_download_gcs_file(src_file, target_dir):
    """Download `src_file` into `target_dir` when it is a GCS path.

    Local paths are returned unchanged; GCS paths are fetched and the
    path of the local copy is returned instead.
    """
    if is_gcs_path(src_file):
        local_copy = os.path.join(target_dir, os.path.basename(src_file))
        return download_gcs_file(src_file, local_copy)
    return src_file
def upload_gcs_file(src_file, target_file):
    """Push a local file to a GCS destination and return its path.

    Args:
      src_file: Local filesystem path to upload.
      target_file: A "gs://bucket/path" destination string.

    Returns:
      The target file path.
    """
    storage_client = storage.Client()
    bucket_name, bucket_path = parse_gcs_path(target_file)
    destination_blob = storage_client.get_bucket(bucket_name).blob(bucket_path)
    destination_blob.upload_from_filename(src_file)
    return target_file
def maybe_upload_gcs_file(src_file, target_file):
    """Upload `src_file` to `target_file` when the target is a GCS path.

    For local targets the file is moved on disk instead; in both cases
    the target file path is returned.
    """
    if not is_gcs_path(target_file):
        # BUG FIX: os.rename raises OSError when source and target live
        # on different filesystems (e.g. /tmp on tmpfs); shutil.move
        # falls back to copy+delete and works across mounts.
        shutil.move(src_file, target_file)
        return target_file
    return upload_gcs_file(src_file, target_file)

View File

@ -1,71 +1,46 @@
import json
import csv
import requests
import nmslib
from code_search.t2t.query import get_encoder, encode_query
class CodeSearchEngine:
"""This is a utility class which takes an nmslib
index file and a data file to return data from"""
def __init__(self, problem, data_dir, serving_url, index_file, data_file):
self._serving_url = serving_url
self._problem = problem
self._data_dir = data_dir
self._index_file = index_file
self._data_file = data_file
"""Instantiate the Search Index instance.
self._data_index = self.read_lookup_data_file(data_file)
This is a utility class which takes an nmslib
index file and a data file to return data from.
Args:
index_file: Path string to the nmslib index file.
lookup_data: A list representing the data in the same order as in index.
embedding_fn: A function which takes a string and returns a high-dimensional
embedding.
"""
DICT_LABELS = ['nwo', 'path', 'function_name', 'lineno', 'original_function']
def __init__(self, index_file, lookup_data, embedding_fn):
self.index = CodeSearchEngine.nmslib_init()
self.index.loadIndex(index_file)
def embed(self, query_str):
"""Get query embedding from TFServing
This involves encoding the input query
for the TF Serving service
"""
encoder = get_encoder(self._problem, self._data_dir)
encoded_query = encode_query(encoder, query_str)
data = {"instances": [{"input": {"b64": encoded_query}}]}
response = requests.post(url=self._serving_url,
headers={'content-type': 'application/json'},
data=json.dumps(data))
result = response.json()
result['predictions'] = [preds['outputs'] for preds in result['predictions']]
return result
self.lookup_data = lookup_data
self.embedding_fn = embedding_fn
def query(self, query_str, k=2):
embedding = self.embed(query_str)
idxs, dists = self.index.knnQuery(embedding['predictions'][0], k=k)
embedding = self.embedding_fn(query_str)
idxs, dists = self.index.knnQuery(embedding, k=k)
result = [self._data_index[id] for id in idxs]
result = [dict(zip(self.DICT_LABELS, self.lookup_data[id])) for id in idxs]
for i, dist in enumerate(dists):
result[i]['score'] = str(dist)
return result
@staticmethod
def read_lookup_data_file(data_file):
data_list = []
with open(data_file, 'r') as csv_file:
dict_reader = csv.DictReader(csv_file)
for row in dict_reader:
row.pop('function_embedding')
data_list.append(row)
return data_list
@staticmethod
def nmslib_init():
"""Initializes an nmslib index object"""
"""Initializes an nmslib index object."""
index = nmslib.init(method='hnsw', space='cosinesimil')
return index
@staticmethod
def create_index(data, save_path):
"""Add numpy data to the index and save to path"""
"""Add numpy data to the index and save to path."""
index = CodeSearchEngine.nmslib_init()
index.addDataPointBatch(data)
index.createIndex({'post': 2}, print_progress=True)

View File

@ -1,12 +1,23 @@
from flask import Flask, request, abort, jsonify, make_response
from flask_cors import CORS
from flask import Flask, request, abort, jsonify, make_response, redirect
class CodeSearchServer:
"""This utility class wraps the search engine into
an HTTP server based on Flask"""
def __init__(self, engine, host='0.0.0.0', port=8008):
self.app = Flask(__name__)
"""Flask server wrapping the Search Engine.
This utility class simply wraps the search engine
into an HTTP server based on Flask. The root path
is redirected to `index.html` as Flask does not
do that automatically.
Args:
engine: An instance of CodeSearchEngine.
ui_dir: Path to directory containing index.html and
other static assets for the web application.
host: A string host in IPv4 format.
port: An integer for port binding.
"""
def __init__(self, engine, ui_dir, host='0.0.0.0', port=8008):
self.app = Flask(__name__, static_folder=ui_dir, static_url_path='')
self.host = host
self.port = port
self.engine = engine
@ -16,6 +27,10 @@ class CodeSearchServer:
def init_routes(self):
# pylint: disable=unused-variable
@self.app.route('/')
def index():
return redirect('/index.html', code=302)
@self.app.route('/ping')
def ping():
return make_response(jsonify(status=200), 200)
@ -27,19 +42,9 @@ class CodeSearchServer:
abort(make_response(
jsonify(status=400, error="empty query"), 400))
result = self.engine.query(query_str)
return make_response(jsonify(result=result))
@self.app.route('/embed')
def embed():
query_str = request.args.get('q')
if not query_str:
abort(make_response(
jsonify(status=400, error="empty query"), 400))
result = self.engine.embed(query_str)
num_results = int(request.args.get('n', 2))
result = self.engine.query(query_str, k=num_results)
return make_response(jsonify(result=result))
def run(self):
CORS(self.app)
self.app.run(host=self.host, port=self.port)

View File

@ -1,8 +1,6 @@
astor~=0.6.0
apache-beam[gcp]~=2.5.0
Flask~=1.0.0
flask-cors~=3.0.0
google-cloud-storage~=1.10.0
nltk~=3.3.0
nmslib~=1.7.0
numpy~=1.14.0

View File

@ -57,10 +57,4 @@ setup(name='code-search',
cmdclass={
'build': Build,
'CustomCommands': CustomCommands,
},
entry_points={
'console_scripts': [
'nmslib-serve=code_search.nmslib.cli:server',
'nmslib-create=code_search.nmslib.cli:creator',
]
})

View File

Before

Width:  |  Height:  |  Size: 4.2 KiB

After

Width:  |  Height:  |  Size: 4.2 KiB

View File

@ -1,6 +1,6 @@
import request from 'superagent';
const SEARCH_URL='//localhost:8008/query'
const SEARCH_URL='/query';
function code_search_api(str) {
return request.get(SEARCH_URL).query({'q': str});

View File

Before

Width:  |  Height:  |  Size: 2.2 KiB

After

Width:  |  Height:  |  Size: 2.2 KiB

File diff suppressed because it is too large Load Diff