diff --git a/code_search/indexing_server/Dockerfile b/code_search/indexing_server/Dockerfile
new file mode 100644
index 00000000..218d5707
--- /dev/null
+++ b/code_search/indexing_server/Dockerfile
@@ -0,0 +1,9 @@
+FROM python:3.6
+
+ADD . /app
+
+WORKDIR /app
+
+RUN pip install .
+
+ENTRYPOINT ["sh"]
diff --git a/code_search/indexing_server/build_image.sh b/code_search/indexing_server/build_image.sh
new file mode 100755
index 00000000..3f7ec05b
--- /dev/null
+++ b/code_search/indexing_server/build_image.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+
+set -e
+
+PROJECT=${PROJECT:-}
+BUILD_IMAGE_TAG=${BUILD_IMAGE_TAG:-nmslib:devel}
+
+# Directory of this script used as docker context
+_SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+
+pushd "$_SCRIPT_DIR"
+
+docker build -t ${BUILD_IMAGE_TAG} .
+
+# Push image to GCR if PROJECT available
+if [[ ! -z "${PROJECT}" ]]; then
+  docker tag ${BUILD_IMAGE_TAG} gcr.io/${PROJECT}/${BUILD_IMAGE_TAG}
+  docker push gcr.io/${PROJECT}/${BUILD_IMAGE_TAG}
+fi
+
+popd
diff --git a/code_search/indexing_server/nmslib_flask/__init__.py b/code_search/indexing_server/nmslib_flask/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/code_search/indexing_server/nmslib_flask/cli.py b/code_search/indexing_server/nmslib_flask/cli.py
new file mode 100644
index 00000000..4003dc2f
--- /dev/null
+++ b/code_search/indexing_server/nmslib_flask/cli.py
@@ -0,0 +1,71 @@
+import sys
+import os
+import argparse
+import numpy as np
+from nmslib_flask.gcs import maybe_download_gcs_file, maybe_upload_gcs_file
+from nmslib_flask.search_engine import CodeSearchEngine
+from nmslib_flask.search_server import CodeSearchServer
+
+def parse_server_args(args):
+    parser = argparse.ArgumentParser(prog='nmslib Flask Server')
+
+    parser.add_argument('--index-file', type=str, required=True,
+                        help='Path to index file created by nmslib')
+    parser.add_argument('--data-file', type=str, required=True,
+                        help='Path to csv file for human-readable data')
+    parser.add_argument('--data-dir', type=str, metavar='', default='/tmp',
+                        help='Path to working data directory')
+    parser.add_argument('--host', type=str, metavar='', default='0.0.0.0',
+                        help='Host to start server on')
+    parser.add_argument('--port', type=int, metavar='', default=8008,
+                        help='Port to bind server to')
+
+    return parser.parse_args(args)
+
+
+def parse_creator_args(args):
+    parser = argparse.ArgumentParser(prog='nmslib Index Creator')
+
+    parser.add_argument('--data-file', type=str, required=True,
+                        help='Path to csv data file for human-readable data')
+    parser.add_argument('--output-file', type=str, metavar='', default='/tmp/index.nmslib',
+                        help='Path to output index file')
+    parser.add_argument('--data-dir', type=str, metavar='', default='/tmp',
+                        help='Path to working data directory')
+
+    return parser.parse_args(args)
+
+def server():
+    args = parse_server_args(sys.argv[1:])
+
+    if not os.path.isdir(args.data_dir):
+        os.makedirs(args.data_dir, exist_ok=True)
+
+    # Download relevant files if needed
+    index_file = maybe_download_gcs_file(args.index_file, args.data_dir)
+    data_file = maybe_download_gcs_file(args.data_file, args.data_dir)
+
+    search_engine = CodeSearchEngine(index_file, data_file)
+
+    search_server = CodeSearchServer(engine=search_engine,
+                                     host=args.host, port=args.port)
+    search_server.run()
+
+
+def creator():
+    args = parse_creator_args(sys.argv[1:])
+
+    if not os.path.isdir(args.data_dir):
+        os.makedirs(args.data_dir, exist_ok=True)
+
+    data_file = maybe_download_gcs_file(args.data_file, args.data_dir)
+
+    # TODO(sanyamkapoor): parse data file into a numpy array
+
+    data = np.load(data_file)
+
+    tmp_output_file = os.path.join(args.data_dir, os.path.basename(args.output_file))
+
+    CodeSearchEngine.create_index(data, tmp_output_file)
+
+    maybe_upload_gcs_file(tmp_output_file, args.output_file)
diff --git a/code_search/indexing_server/nmslib_flask/gcs.py b/code_search/indexing_server/nmslib_flask/gcs.py
new file mode 100644
index 00000000..b05960d1
--- /dev/null
+++ b/code_search/indexing_server/nmslib_flask/gcs.py
@@ -0,0 +1,70 @@
+import re
+import os
+from google.cloud import storage
+
+
+def is_gcs_path(gcs_path_string):
+    """
+    Checks if strings are of the format
+    "gs://bucket_name" or "gs://bucket_name/file/path"
+    """
+    return bool(re.match(r'gs://([^/]+)(/.+)?', gcs_path_string))
+
+def parse_gcs_path(gcs_path_string):
+    """
+    Get the bucket name and file path from a valid GCS path
+    string (see `is_gcs_path`). The file path is an empty
+    string for bucket-only paths such as "gs://bucket_name".
+    """
+    if not is_gcs_path(gcs_path_string):
+        raise ValueError("{} is not a valid GCS path".format(gcs_path_string))
+
+    # Split on the first separator only, so that bucket-only paths
+    # and object names containing "//" do not break the unpacking
+    _, full_path = gcs_path_string.split('//', 1)
+    bucket_name, _, bucket_path = full_path.partition('/')
+    return bucket_name, bucket_path
+
+
+def download_gcs_file(src_file, target_file):
+    """
+    Download a source file to the target file from GCS
+    and return the target file path
+    """
+    storage_client = storage.Client()
+    bucket_name, bucket_path = parse_gcs_path(src_file)
+    bucket = storage_client.get_bucket(bucket_name)
+    blob = bucket.blob(bucket_path)
+    blob.download_to_filename(target_file)
+    return target_file
+
+
+def maybe_download_gcs_file(src_file, target_dir):
+    """Wraps `download_gcs_file` with checks"""
+    if not is_gcs_path(src_file):
+        return src_file
+
+    target_file = os.path.join(target_dir, os.path.basename(src_file))
+
+    return download_gcs_file(src_file, target_file)
+
+
+def upload_gcs_file(src_file, target_file):
+    """
+    Upload a source file to the target file in GCS
+    and return the target file path
+    """
+    storage_client = storage.Client()
+    bucket_name, bucket_path = parse_gcs_path(target_file)
+    bucket = storage_client.get_bucket(bucket_name)
+    blob = bucket.blob(bucket_path)
+    blob.upload_from_filename(src_file)
+    return target_file
+
+
+def maybe_upload_gcs_file(src_file, target_file):
+    """Wraps `upload_gcs_file` with checks"""
+    if not is_gcs_path(target_file):
+        return target_file
+
+    return upload_gcs_file(src_file, target_file)
diff --git a/code_search/indexing_server/nmslib_flask/search_engine.py b/code_search/indexing_server/nmslib_flask/search_engine.py
new file mode 100644
index 00000000..4dfca1a5
--- /dev/null
+++ b/code_search/indexing_server/nmslib_flask/search_engine.py
@@ -0,0 +1,46 @@
+import nmslib
+import numpy as np
+
+
+class CodeSearchEngine:
+    """This is a utility class which takes an nmslib
+    index file and a data file to return data from"""
+    def __init__(self, index_file: str, data_file: str):
+        self._index_file = index_file
+        self._data_file = data_file
+
+        self.index = CodeSearchEngine.nmslib_init()
+        self.index.loadIndex(index_file)
+
+        # TODO: load the reverse-index map for actual code data
+        # self.data_map =
+
+    def embed(self, query_str):
+        # TODO load trained model and embed input strings
+        raise NotImplementedError
+
+    def query(self, query_str: str, k=2):
+        embedding = self.embed(query_str)
+        idxs, dists = self.index.knnQuery(embedding, k=k)
+
+        # TODO(sanyamkapoor): initialize data map and return
+        # list of dicts
+        # [
+        #     {'src': self.data_map[idx], 'dist': dist}
+        #     for idx, dist in zip(idxs, dists)
+        # ]
+        return idxs, dists
+
+    @staticmethod
+    def nmslib_init():
+        """Initializes an nmslib index object"""
+        index = nmslib.init(method='hnsw', space='cosinesimil')
+        return index
+
+    @staticmethod
+    def create_index(data: np.array, save_path: str):
+        """Add numpy data to the index and save to path"""
+        index = CodeSearchEngine.nmslib_init()
+        index.addDataPointBatch(data)
+        index.createIndex({'post': 2}, print_progress=True)
+        index.saveIndex(save_path)
diff --git a/code_search/indexing_server/nmslib_flask/search_server.py b/code_search/indexing_server/nmslib_flask/search_server.py
new file mode 100644
index 00000000..8daac87e
--- /dev/null
+++ b/code_search/indexing_server/nmslib_flask/search_server.py
@@ -0,0 +1,38 @@
+from flask import Flask, request, abort, jsonify, make_response
+
+
+class CodeSearchServer:
+    """This utility class wraps the search engine into
+    an HTTP server based on Flask"""
+    def __init__(self, engine, host='0.0.0.0', port=8008):
+        self.app = Flask(__name__)
+        self.host = host
+        self.port = port
+        self.engine = engine
+
+        # Routes must be registered before the app starts serving,
+        # otherwise every request is answered with a 404
+        self.init_routes()
+
+    def init_routes(self):
+        """Registers the HTTP endpoints on the Flask app"""
+        # pylint: disable=unused-variable
+
+        @self.app.route('/ping')
+        def ping():
+            return make_response(jsonify(status=200), 200)
+
+        @self.app.route('/query')
+        def query():
+            query_str = request.args.get('query')
+            if not query_str:
+                abort(make_response(
+                    jsonify(status=400, error="empty query"), 400))
+
+            # CodeSearchEngine exposes `query`; it has no `search` method
+            result = self.engine.query(query_str)
+            return make_response(jsonify(result=result))
+
+    def run(self):
+        """Starts the Flask server on the configured host and port"""
+        self.app.run(host=self.host, port=self.port)
diff --git a/code_search/indexing_server/requirements.txt b/code_search/indexing_server/requirements.txt
new file mode 100644
index 00000000..d69a7c43
--- /dev/null
+++ b/code_search/indexing_server/requirements.txt
@@ -0,0 +1,4 @@
+Flask~=1.0.0
+nmslib~=1.7.0
+numpy~=1.14.0
+google-cloud-storage~=1.10.0
diff --git a/code_search/indexing_server/setup.py b/code_search/indexing_server/setup.py
new file mode 100644
index 00000000..2455f51c
--- /dev/null
+++ b/code_search/indexing_server/setup.py
@@ -0,0 +1,23 @@
+from setuptools import setup, find_packages
+
+with open('requirements.txt', 'r') as f:
+    install_requires = f.readlines()
+
+VERSION = '0.1.0'
+
+setup(name='code-search-index-server',
+      description='Kubeflow Code Search Demo - Index Server',
+      url='https://www.github.com/kubeflow/examples',
+      author='Sanyam Kapoor',
+      author_email='sanyamkapoor@google.com',
+      version=VERSION,
+      license='MIT',
+      packages=find_packages(),
+      install_requires=install_requires,
+      extras_require={},
+      entry_points={
+          'console_scripts': [
+              'nmslib-serve=nmslib_flask.cli:server',
+              'nmslib-create=nmslib_flask.cli:creator',
+          ]
+      })