Python package for indexing and serving the index (#150)

* Add a utility python package for indexing and serving the index

* Add CLI arguments, conditional GCS download

* Complete skeleton CLIs for serving and index creation

* Fix lint issues
This commit is contained in:
Sanyam Kapoor 2018-06-20 15:34:05 -07:00 committed by k8s-ci-robot
parent 4bd30a1e68
commit 21506ffc51
9 changed files with 272 additions and 0 deletions

View File

@ -0,0 +1,9 @@
# Image that installs the Python package found in the build context.
FROM python:3.6

# COPY is the recommended instruction for plain local files; none of ADD's
# extra behaviour (remote URLs, archive extraction) is needed here.
COPY . /app

WORKDIR /app

# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir .

# Containers start a shell; the command to run is supplied by the caller.
ENTRYPOINT ["sh"]

View File

@ -0,0 +1,21 @@
#!/usr/bin/env bash
#
# Build the Docker image from the directory containing this script and,
# when PROJECT is set, tag and push it to gcr.io/${PROJECT}.
#
# Environment:
#   PROJECT          Optional GCP project id; empty (default) skips the push.
#   BUILD_IMAGE_TAG  Image name:tag to build (default: nmslib:devel).

set -e

PROJECT=${PROJECT:-}
BUILD_IMAGE_TAG=${BUILD_IMAGE_TAG:-nmslib:devel}

# Directory of this script used as docker context
_SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

pushd "$_SCRIPT_DIR"

# Expansions are quoted so unusual values are not word-split or globbed.
docker build -t "${BUILD_IMAGE_TAG}" .

# Push image to GCR if PROJECT available
if [[ -n "${PROJECT}" ]]; then
  docker tag "${BUILD_IMAGE_TAG}" "gcr.io/${PROJECT}/${BUILD_IMAGE_TAG}"
  docker push "gcr.io/${PROJECT}/${BUILD_IMAGE_TAG}"
fi

popd

View File

@ -0,0 +1,71 @@
import sys
import os
import argparse
import numpy as np
from nmslib_flask.gcs import maybe_download_gcs_file, maybe_upload_gcs_file
from nmslib_flask.search_engine import CodeSearchEngine
from nmslib_flask.search_server import CodeSearchServer
def parse_server_args(args):
    """Parse the command-line options of the index server.

    Args:
        args: Sequence of argument strings (typically ``sys.argv[1:]``).

    Returns:
        An ``argparse.Namespace`` exposing ``index_file``, ``data_file``,
        ``data_dir``, ``host`` and ``port``.
    """
    server_parser = argparse.ArgumentParser(prog='nmslib Flask Server')
    add_option = server_parser.add_argument

    # Mandatory inputs.
    add_option('--index-file', type=str, required=True,
               help='Path to index file created by nmslib')
    add_option('--data-file', type=str, required=True,
               help='Path to csv file for human-readable data')

    # Optional settings; the empty metavar keeps the usage line compact.
    add_option('--data-dir', type=str, metavar='', default='/tmp',
               help='Path to working data directory')
    add_option('--host', type=str, metavar='', default='0.0.0.0',
               help='Host to start server on')
    add_option('--port', type=int, metavar='', default=8008,
               help='Port to bind server to')

    parsed = server_parser.parse_args(args)
    return parsed
def parse_creator_args(args):
    """Parse the command-line options of the index creator.

    Args:
        args: Sequence of argument strings (typically ``sys.argv[1:]``).

    Returns:
        An ``argparse.Namespace`` exposing ``data_file``, ``output_file``
        and ``data_dir``.
    """
    creator_parser = argparse.ArgumentParser(prog='nmslib Index Creator')
    add_option = creator_parser.add_argument

    add_option('--data-file', type=str, required=True,
               help='Path to csv data file for human-readable data')
    add_option('--output-file', type=str, metavar='',
               default='/tmp/index.nmslib',
               help='Path to output index file')
    add_option('--data-dir', type=str, metavar='', default='/tmp',
               help='Path to working data directory')

    parsed = creator_parser.parse_args(args)
    return parsed
def server():
    """Console entry point: load the index and start the HTTP server.

    Options are read from ``sys.argv``. The working data directory is
    created when missing, the index and data files are passed through
    ``maybe_download_gcs_file``, and the resulting paths are used to build
    a ``CodeSearchEngine`` that is served by a ``CodeSearchServer``.
    """
    cli_args = parse_server_args(sys.argv[1:])

    work_dir = cli_args.data_dir
    if not os.path.isdir(work_dir):
        os.makedirs(work_dir, exist_ok=True)

    # Download relevant files if needed
    local_index = maybe_download_gcs_file(cli_args.index_file, work_dir)
    local_data = maybe_download_gcs_file(cli_args.data_file, work_dir)

    engine = CodeSearchEngine(local_index, local_data)
    http_server = CodeSearchServer(engine=engine,
                                   host=cli_args.host,
                                   port=cli_args.port)
    http_server.run()
def creator():
    """Console entry point: build an nmslib index from a data file.

    Options are read from ``sys.argv``. The data file is passed through
    ``maybe_download_gcs_file``, loaded with ``np.load``, indexed into a
    file inside the working data directory, and that file is then handed
    to ``maybe_upload_gcs_file`` together with the requested output path.
    """
    cli_args = parse_creator_args(sys.argv[1:])

    work_dir = cli_args.data_dir
    if not os.path.isdir(work_dir):
        os.makedirs(work_dir, exist_ok=True)

    local_data = maybe_download_gcs_file(cli_args.data_file, work_dir)

    # TODO(sanyamkapoor): parse data file into a numpy array
    vectors = np.load(local_data)

    # The index is first written under the working directory, keeping only
    # the base name of the requested output path.
    index_name = os.path.basename(cli_args.output_file)
    staged_index = os.path.join(work_dir, index_name)
    CodeSearchEngine.create_index(vectors, staged_index)

    # NOTE(review): if `maybe_upload_gcs_file` does nothing for non-GCS
    # targets (as its name suggests), a local --output-file located outside
    # --data-dir is never actually written -- confirm.
    maybe_upload_gcs_file(staged_index, cli_args.output_file)

View File

@ -0,0 +1,67 @@
import re
import os
from google.cloud import storage
def is_gcs_path(gcs_path_string):
    """Tell whether a string looks like a GCS location.

    Accepted shapes are ``gs://bucket_name`` and
    ``gs://bucket_name/file/path``. Only the start of the string is
    matched.

    Returns:
        ``True`` when the string starts with a GCS-style prefix, else
        ``False``.
    """
    matched = re.match(r'gs://([^/]+)(/.+)?', gcs_path_string)
    return matched is not None
def parse_gcs_path(gcs_path_string):
    """Split a GCS path into its bucket name and object path.

    Args:
        gcs_path_string: A string accepted by `is_gcs_path`, e.g.
            ``gs://bucket_name`` or ``gs://bucket_name/file/path``.

    Returns:
        A ``(bucket_name, bucket_path)`` tuple. ``bucket_path`` is the empty
        string for a bucket-only path such as ``gs://bucket_name``.

    Raises:
        ValueError: If the string is not a valid GCS path.
    """
    if not is_gcs_path(gcs_path_string):
        raise ValueError("{} is not a valid GCS path".format(gcs_path_string))

    # Split off the scheme only once, so a later "//" inside the object path
    # does not produce extra pieces and break the unpacking.
    _, full_path = gcs_path_string.split('//', 1)

    # partition() always yields three parts, so a bucket-only path (no "/"
    # after the bucket name) gives an empty object path instead of raising.
    bucket_name, _, bucket_path = full_path.partition('/')
    return bucket_name, bucket_path
def download_gcs_file(src_file, target_file):
    """Fetch a GCS object into a local file.

    Args:
        src_file: GCS path of the object to download.
        target_file: Local path the object is written to.

    Returns:
        ``target_file``, unchanged.
    """
    client = storage.Client()
    bucket_name, object_path = parse_gcs_path(src_file)
    source_blob = client.get_bucket(bucket_name).blob(object_path)
    source_blob.download_to_filename(target_file)
    return target_file
def maybe_download_gcs_file(src_file, target_dir):
    """Download `src_file` into `target_dir` only if it is a GCS path.

    Returns:
        The local path of the downloaded copy (``target_dir`` joined with
        the base name of ``src_file``) for GCS paths; otherwise ``src_file``
        itself, untouched.
    """
    if is_gcs_path(src_file):
        local_name = os.path.basename(src_file)
        local_copy = os.path.join(target_dir, local_name)
        return download_gcs_file(src_file, local_copy)
    return src_file
def upload_gcs_file(src_file, target_file):
    """Send a local file to a GCS object.

    Args:
        src_file: Local path of the file to upload.
        target_file: GCS path of the destination object.

    Returns:
        ``target_file``, unchanged.
    """
    client = storage.Client()
    bucket_name, object_path = parse_gcs_path(target_file)
    destination_blob = client.get_bucket(bucket_name).blob(object_path)
    destination_blob.upload_from_filename(src_file)
    return target_file
def maybe_upload_gcs_file(src_file, target_file):
    """Upload `src_file` to `target_file` only if the target is a GCS path.

    For any other target nothing is uploaded or copied.

    Returns:
        ``target_file`` in both cases.
    """
    if is_gcs_path(target_file):
        return upload_gcs_file(src_file, target_file)
    return target_file

View File

@ -0,0 +1,46 @@
import nmslib
import numpy as np
class CodeSearchEngine:
    """This is a utility class which takes an nmslib
    index file and a data file to return data from"""

    def __init__(self, index_file: str, data_file: str):
        """Create an nmslib index object and load `index_file` into it.

        Args:
            index_file: Path of the saved index passed to ``loadIndex``.
            data_file: Path of the data file; it is only stored for now.
        """
        self._index_file = index_file
        self._data_file = data_file

        self.index = CodeSearchEngine.nmslib_init()
        self.index.loadIndex(index_file)

        # TODO: load the reverse-index map for actual code data
        # self.data_map =

    def embed(self, query_str):
        """Turn a query string into an embedding (not implemented yet).

        Raises:
            NotImplementedError: Always, until a model is wired in.
        """
        # TODO load trained model and embed input strings
        raise NotImplementedError

    def query(self, query_str: str, k=2):
        """Look up the `k` nearest neighbours of an embedded query.

        Args:
            query_str: Query text, embedded through `embed`.
            k: Number of neighbours requested from the index.

        Returns:
            The ``(idxs, dists)`` pair produced by the index's ``knnQuery``.

        Raises:
            NotImplementedError: Propagated from `embed`.
        """
        embedding = self.embed(query_str)
        idxs, dists = self.index.knnQuery(embedding, k=k)

        # TODO(sanyamkapoor): initialize data map and return
        # list of dicts
        # [
        #     {'src': self.data_map[idx], 'dist': dist}
        #     for idx, dist in zip(idxs, dists)
        # ]
        return idxs, dists

    @staticmethod
    def nmslib_init():
        """Initializes an nmslib index object"""
        index = nmslib.init(method='hnsw', space='cosinesimil')
        return index

    @staticmethod
    def create_index(data: np.ndarray, save_path: str):
        """Add numpy data to the index and save to path

        Args:
            data: Array of data points handed to ``addDataPointBatch``.
            save_path: Path the built index is saved to.
        """
        index = CodeSearchEngine.nmslib_init()
        index.addDataPointBatch(data)
        index.createIndex({'post': 2}, print_progress=True)
        index.saveIndex(save_path)

View File

@ -0,0 +1,31 @@
from flask import Flask, request, abort, jsonify, make_response
class CodeSearchServer:
    """This utility class wraps the search engine into
    an HTTP server based on Flask"""

    def __init__(self, engine, host='0.0.0.0', port=8008):
        """Create the Flask app and register its routes.

        Args:
            engine: Search engine object; its ``query(query_str)`` method
                is called for every ``/query`` request.
            host: Host passed to ``Flask.run``.
            port: Port passed to ``Flask.run``.
        """
        self.app = Flask(__name__)
        self.host = host
        self.port = port
        self.engine = engine

        # Routes must exist before `run()`; nothing else registers them.
        self._routes_registered = False
        self.init_routes()

    def init_routes(self):
        """Register the ``/ping`` and ``/query`` endpoints on the app.

        Safe to call more than once: repeated calls are no-ops, because
        Flask refuses to register the same endpoint twice.
        """
        if getattr(self, '_routes_registered', False):
            return

        # pylint: disable=unused-variable

        @self.app.route('/ping')
        def ping():
            return make_response(jsonify(status=200), 200)

        @self.app.route('/query')
        def query():
            query_str = request.args.get('query')
            if not query_str:
                abort(make_response(
                    jsonify(status=400, error="empty query"), 400))

            # NOTE(review): the result must be JSON-serializable for
            # `jsonify`; confirm what the engine's `query` returns.
            result = self.engine.query(query_str)
            return make_response(jsonify(result=result))

        self._routes_registered = True

    def run(self):
        """Start the Flask server on the configured host and port."""
        self.app.run(host=self.host, port=self.port)

View File

@ -0,0 +1,4 @@
Flask~=1.0.0
nmslib~=1.7.0
numpy~=1.14.0
google-cloud-storage~=1.10.0

View File

@ -0,0 +1,23 @@
"""Packaging script for the code search index server."""
from setuptools import setup, find_packages

# requirements.txt is the single source of runtime dependencies. Lines are
# stripped, and blank or comment lines dropped, so that only requirement
# specifiers (without trailing newlines) reach `install_requires`.
with open('requirements.txt', 'r') as f:
    install_requires = [
        line.strip() for line in f
        if line.strip() and not line.lstrip().startswith('#')
    ]

VERSION = '0.1.0'

setup(name='code-search-index-server',
      description='Kubeflow Code Search Demo - Index Server',
      url='https://www.github.com/kubeflow/examples',
      author='Sanyam Kapoor',
      author_email='sanyamkapoor@google.com',
      version=VERSION,
      license='MIT',
      packages=find_packages(),
      install_requires=install_requires,
      extras_require={},
      # Command-line entry points installed with the package.
      entry_points={
          'console_scripts': [
              'nmslib-serve=nmslib_flask.cli:server',
              'nmslib-create=nmslib_flask.cli:creator',
          ]
      })