mirror of https://github.com/kubeflow/examples.git
Python package for indexing and serving the index (#150)
* Add a utility python package for indexing and serving the index
* Add CLI arguments, conditional GCS download
* Complete skeleton CLIs for serving and index creation
* Fix lint issues
This commit is contained in:
parent
4bd30a1e68
commit
21506ffc51
|
|
@ -0,0 +1,9 @@
|
||||||
|
# Image that installs the Python package found in the build context.
FROM python:3.6

# COPY instead of ADD: the context is plain local files, and ADD's extra
# behaviour (archive extraction, URL fetching) is not wanted here.
COPY . /app

WORKDIR /app

# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir .

ENTRYPOINT ["sh"]
|
||||||
|
|
@ -0,0 +1,21 @@
|
||||||
|
#!/usr/bin/env bash

# Builds the Docker image using this script's directory as the build
# context and, when PROJECT is set, tags and pushes it to gcr.io.
#
# Environment:
#   PROJECT          Project segment of the gcr.io image path; when empty
#                    (the default) the tag/push step is skipped.
#   BUILD_IMAGE_TAG  Image tag to build (default: nmslib:devel).

set -e

PROJECT=${PROJECT:-}
BUILD_IMAGE_TAG=${BUILD_IMAGE_TAG:-nmslib:devel}

# Directory of this script used as docker context
_SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

pushd "$_SCRIPT_DIR"

# Expansions are quoted so the values are never word-split or globbed.
docker build -t "${BUILD_IMAGE_TAG}" .

# Push image to GCR if PROJECT available
if [[ -n "${PROJECT}" ]]; then
  docker tag "${BUILD_IMAGE_TAG}" "gcr.io/${PROJECT}/${BUILD_IMAGE_TAG}"
  docker push "gcr.io/${PROJECT}/${BUILD_IMAGE_TAG}"
fi

popd
|
||||||
|
|
@ -0,0 +1,71 @@
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import argparse
|
||||||
|
import numpy as np
|
||||||
|
from nmslib_flask.gcs import maybe_download_gcs_file, maybe_upload_gcs_file
|
||||||
|
from nmslib_flask.search_engine import CodeSearchEngine
|
||||||
|
from nmslib_flask.search_server import CodeSearchServer
|
||||||
|
|
||||||
|
def parse_server_args(args):
    """Parse the command-line options of the search server.

    Args:
        args: Argument strings, without the program name.

    Returns:
        The ``argparse.Namespace`` holding ``index_file``, ``data_file``,
        ``data_dir``, ``host`` and ``port``.
    """
    server_parser = argparse.ArgumentParser(prog='nmslib Flask Server')

    # Options the caller must always supply.
    mandatory = (
        ('--index-file', 'Path to index file created by nmslib'),
        ('--data-file', 'Path to csv file for human-readable data'),
    )
    for flag, description in mandatory:
        server_parser.add_argument(flag, type=str, required=True,
                                   help=description)

    # Options that fall back to a default value.
    defaulted = (
        ('--data-dir', str, '/tmp', 'Path to working data directory'),
        ('--host', str, '0.0.0.0', 'Host to start server on'),
        ('--port', int, 8008, 'Port to bind server to'),
    )
    for flag, value_type, fallback, description in defaulted:
        server_parser.add_argument(flag, type=value_type, metavar='',
                                   default=fallback, help=description)

    return server_parser.parse_args(args)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_creator_args(args):
    """Parse the command-line options of the index creator.

    Args:
        args: Argument strings, without the program name.

    Returns:
        The ``argparse.Namespace`` holding ``data_file``, ``output_file``
        and ``data_dir``.
    """
    creator_parser = argparse.ArgumentParser(prog='nmslib Index Creator')

    # The input data file is the only mandatory option.
    creator_parser.add_argument(
        '--data-file', type=str, required=True,
        help='Path to csv data file for human-readable data')

    # Remaining options fall back to locations under /tmp.
    defaulted = (
        ('--output-file', '/tmp/index.nmslib', 'Path to output index file'),
        ('--data-dir', '/tmp', 'Path to working data directory'),
    )
    for flag, fallback, description in defaulted:
        creator_parser.add_argument(flag, type=str, metavar='',
                                    default=fallback, help=description)

    return creator_parser.parse_args(args)
|
||||||
|
|
||||||
|
def server():
    """Console entry point: load an index and serve it over HTTP.

    Reads its options from ``sys.argv``, makes sure the working directory
    exists, resolves the index and data files through
    ``maybe_download_gcs_file``, then builds a ``CodeSearchEngine`` and
    runs a ``CodeSearchServer`` on the requested host and port.
    """
    options = parse_server_args(sys.argv[1:])

    work_dir = options.data_dir
    if not os.path.isdir(work_dir):
        os.makedirs(work_dir, exist_ok=True)

    # Download relevant files if needed (index first, then data)
    local_index, local_data = [
        maybe_download_gcs_file(location, work_dir)
        for location in (options.index_file, options.data_file)
    ]

    engine = CodeSearchEngine(local_index, local_data)

    http_server = CodeSearchServer(engine=engine,
                                   host=options.host, port=options.port)
    http_server.run()
|
||||||
|
|
||||||
|
|
||||||
|
def creator():
    """Console entry point: build an nmslib index from a data file.

    Reads its options from ``sys.argv``, makes sure the working directory
    exists, resolves the data file through ``maybe_download_gcs_file`` and
    builds the index inside the working directory. The index is then
    delivered to ``--output-file``: handed to ``maybe_upload_gcs_file``
    and, when the output is a local path different from the working copy,
    copied there.
    """
    import shutil  # function-scope import: only needed for the local copy

    args = parse_creator_args(sys.argv[1:])

    if not os.path.isdir(args.data_dir):
        os.makedirs(args.data_dir, exist_ok=True)

    data_file = maybe_download_gcs_file(args.data_file, args.data_dir)

    # TODO(sanyamkapoor): parse data file into a numpy array
    data = np.load(data_file)

    # The index is always built under the working directory first.
    tmp_output_file = os.path.join(args.data_dir,
                                   os.path.basename(args.output_file))

    CodeSearchEngine.create_index(data, tmp_output_file)

    maybe_upload_gcs_file(tmp_output_file, args.output_file)

    # maybe_upload_gcs_file does nothing for non-GCS targets, so without this
    # copy a local --output-file outside the working directory would never
    # be written. The working copy is kept in place.
    is_local_target = not args.output_file.startswith('gs://')
    same_file = (os.path.abspath(tmp_output_file)
                 == os.path.abspath(args.output_file))
    if is_local_target and not same_file:
        shutil.copyfile(tmp_output_file, args.output_file)
|
||||||
|
|
@ -0,0 +1,67 @@
|
||||||
|
import re
|
||||||
|
import os
|
||||||
|
from google.cloud import storage
|
||||||
|
|
||||||
|
|
||||||
|
def is_gcs_path(gcs_path_string):
    """
    Tell whether a string begins like a GCS location, i.e.
    "gs://bucket_name" or "gs://bucket_name/file/path".

    The pattern is matched from the start of the string only.
    """
    gcs_match = re.match(r'gs://([^/]+)(/.+)?', gcs_path_string)
    return gcs_match is not None
|
||||||
|
|
||||||
|
def parse_gcs_path(gcs_path_string):
    """
    Get the bucket name and file path from a valid GCS path
    string (see `is_gcs_path`).

    Returns a `(bucket_name, bucket_path)` tuple. `bucket_path` is the
    empty string for a bucket-only path such as "gs://bucket_name".

    Raises `ValueError` when the string is not a GCS path.
    """
    # Same leading pattern that `is_gcs_path` accepts; capturing the bucket
    # here avoids splitting on "//", which breaks for object paths that
    # themselves contain "//".
    bucket_match = re.match(r'gs://([^/]+)', gcs_path_string)
    if bucket_match is None:
        raise ValueError("{} is not a valid GCS path".format(gcs_path_string))

    bucket_name = bucket_match.group(1)
    # Whatever follows the bucket is either empty or starts with the "/"
    # separator, so skipping one character yields the object path.
    bucket_path = gcs_path_string[bucket_match.end() + 1:]
    return bucket_name, bucket_path
|
||||||
|
|
||||||
|
|
||||||
|
def download_gcs_file(src_file, target_file):
    """
    Fetch the GCS object named by `src_file` into the local
    path `target_file`, then hand back `target_file`.
    """
    # The client is created before the path is parsed, as in the original.
    gcs_client = storage.Client()
    source_bucket, source_object = parse_gcs_path(src_file)

    remote_blob = gcs_client.get_bucket(source_bucket).blob(source_object)
    remote_blob.download_to_filename(target_file)
    return target_file
|
||||||
|
|
||||||
|
|
||||||
|
def maybe_download_gcs_file(src_file, target_dir):
    """Download `src_file` into `target_dir` only when it is a GCS path.

    Non-GCS inputs are returned unchanged; otherwise the result of
    `download_gcs_file` is returned.
    """
    if is_gcs_path(src_file):
        # Keep the object's base name for the local copy.
        local_copy = os.path.join(target_dir, os.path.basename(src_file))
        return download_gcs_file(src_file, local_copy)

    return src_file
|
||||||
|
|
||||||
|
|
||||||
|
def upload_gcs_file(src_file, target_file):
    """
    Send the local file `src_file` to the GCS location named by
    `target_file`, then hand back `target_file`.
    """
    # The client is created before the path is parsed, as in the original.
    gcs_client = storage.Client()
    dest_bucket, dest_object = parse_gcs_path(target_file)

    remote_blob = gcs_client.get_bucket(dest_bucket).blob(dest_object)
    remote_blob.upload_from_filename(src_file)
    return target_file
|
||||||
|
|
||||||
|
|
||||||
|
def maybe_upload_gcs_file(src_file, target_file):
    """Upload `src_file` to `target_file` only when the target is a GCS path.

    For any other target nothing is transferred and `target_file` is
    returned as given.
    """
    if is_gcs_path(target_file):
        return upload_gcs_file(src_file, target_file)

    return target_file
|
||||||
|
|
@ -0,0 +1,46 @@
|
||||||
|
import nmslib
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
class CodeSearchEngine:
    """This is a utility class which takes an nmslib
    index file and a data file to return data from"""

    def __init__(self, index_file: str, data_file: str):
        """Create an index object and load `index_file` into it.

        `data_file` is only stored for now; nothing in this class reads it.
        """
        self._index_file = index_file
        self._data_file = data_file

        self.index = CodeSearchEngine.nmslib_init()
        self.index.loadIndex(index_file)

        # TODO: load the reverse-index map for actual code data
        # self.data_map =

    def embed(self, query_str):
        """Turn a query string into an embedding.

        Not implemented yet: always raises `NotImplementedError`.
        """
        # TODO load trained model and embed input strings
        raise NotImplementedError

    def query(self, query_str: str, k=2):
        """Embed `query_str` and look it up in the index.

        The embedding is passed to the index's `knnQuery` with `k`; the
        resulting `(idxs, dists)` pair is returned as-is.
        """
        embedding = self.embed(query_str)
        idxs, dists = self.index.knnQuery(embedding, k=k)

        # TODO(sanyamkapoor): initialize data map and return
        # list of dicts
        # [
        #     {'src': self.data_map[idx], 'dist': dist}
        #     for idx, dist in zip(idxs, dists)
        # ]
        return idxs, dists

    @staticmethod
    def nmslib_init():
        """Initializes an nmslib index object"""
        index = nmslib.init(method='hnsw', space='cosinesimil')
        return index

    @staticmethod
    def create_index(data: np.ndarray, save_path: str):
        """Add numpy data to the index and save to path"""
        # Annotated with np.ndarray (the array type); np.array is only the
        # factory function and is not meaningful as a type.
        index = CodeSearchEngine.nmslib_init()
        index.addDataPointBatch(data)
        index.createIndex({'post': 2}, print_progress=True)
        index.saveIndex(save_path)
|
||||||
|
|
@ -0,0 +1,31 @@
|
||||||
|
from flask import Flask, request, abort, jsonify, make_response
|
||||||
|
|
||||||
|
|
||||||
|
class CodeSearchServer:
    """This utility class wraps the search engine into
    an HTTP server based on Flask"""

    def __init__(self, engine, host='0.0.0.0', port=8008):
        """Create the Flask app around `engine` and register its routes.

        `host` and `port` are stored and used by `run`.
        """
        self.app = Flask(__name__)
        self.host = host
        self.port = port
        self.engine = engine

        # Routes were previously never registered unless the caller invoked
        # init_routes() by hand, so the server answered nothing.
        self._routes_registered = False
        self.init_routes()

    def init_routes(self):
        """Register the `/ping` and `/query` routes on the app.

        Safe to call more than once: later calls are no-ops, so callers
        that still invoke it explicitly do not re-register the endpoints.
        """
        # pylint: disable=unused-variable
        if getattr(self, '_routes_registered', False):
            return
        self._routes_registered = True

        @self.app.route('/ping')
        def ping():
            return make_response(jsonify(status=200), 200)

        @self.app.route('/query')
        def query():
            query_str = request.args.get('query')
            if not query_str:
                abort(make_response(
                    jsonify(status=400, error="empty query"), 400))

            # The engine exposes `query`, not `search`.
            # NOTE(review): the result is handed to jsonify unchanged;
            # confirm the engine returns JSON-serialisable values.
            result = self.engine.query(query_str)
            return make_response(jsonify(result=result))

    def run(self):
        """Start the Flask app on the configured host and port."""
        self.app.run(host=self.host, port=self.port)
|
||||||
|
|
@ -0,0 +1,4 @@
|
||||||
|
Flask~=1.0.0
|
||||||
|
nmslib~=1.7.0
|
||||||
|
numpy~=1.14.0
|
||||||
|
google-cloud-storage~=1.10.0
|
||||||
|
|
@ -0,0 +1,23 @@
|
||||||
|
from setuptools import setup, find_packages
|
||||||
|
|
||||||
|
# Read one requirement specifier per line. Lines are stripped so no
# trailing newlines reach setuptools, and blank or comment lines are skipped.
with open('requirements.txt', 'r') as f:
    install_requires = [
        line.strip() for line in f
        if line.strip() and not line.lstrip().startswith('#')
    ]

VERSION = '0.1.0'

setup(name='code-search-index-server',
      description='Kubeflow Code Search Demo - Index Server',
      url='https://www.github.com/kubeflow/examples',
      author='Sanyam Kapoor',
      author_email='sanyamkapoor@google.com',
      version=VERSION,
      license='MIT',
      packages=find_packages(),
      install_requires=install_requires,
      extras_require={},
      # Command-line entry points exposed by the package.
      entry_points={
          'console_scripts': [
              'nmslib-serve=nmslib_flask.cli:server',
              'nmslib-create=nmslib_flask.cli:creator',
          ]
      })
|
||||||
Loading…
Reference in New Issue