mirror of https://github.com/kubeflow/examples.git
				
				
				
			Python package for indexing and serving the index (#150)
* Add a utility Python package for indexing and serving the index
* Add CLI arguments, conditional GCS download
* Complete skeleton CLIs for serving and index creation
* Fix lint issues
This commit is contained in:
		
							parent
							
								
									4bd30a1e68
								
							
						
					
					
						commit
						21506ffc51
					
				|  | @ -0,0 +1,9 @@ | |||
# Image that installs the Python package found in the build context.
FROM python:3.6

# COPY rather than ADD: only a plain copy of the build context is needed, and
# ADD's implicit archive extraction / URL fetching is not wanted here.
COPY . /app

WORKDIR /app

# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir .

ENTRYPOINT ["sh"]
|  | @ -0,0 +1,21 @@ | |||
#!/usr/bin/env bash

# Build the docker image from this script's directory and, when PROJECT is
# set, tag and push it to gcr.io/${PROJECT}.
#
# Environment:
#   PROJECT          GCP project to push to (optional; no push when empty)
#   BUILD_IMAGE_TAG  image name:tag to build (default: nmslib:devel)

set -e

PROJECT=${PROJECT:-}
BUILD_IMAGE_TAG=${BUILD_IMAGE_TAG:-nmslib:devel}

# Directory of this script used as docker context
_SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

pushd "$_SCRIPT_DIR"

# Expansions are quoted so a tag or project value containing whitespace or
# glob characters is passed to docker as a single, literal argument.
docker build -t "${BUILD_IMAGE_TAG}" .

# Push image to GCR if PROJECT available
if [[ -n "${PROJECT}" ]]; then
    docker tag "${BUILD_IMAGE_TAG}" "gcr.io/${PROJECT}/${BUILD_IMAGE_TAG}"
    docker push "gcr.io/${PROJECT}/${BUILD_IMAGE_TAG}"
fi

popd
|  | @ -0,0 +1,71 @@ | |||
| import sys | ||||
| import os | ||||
| import argparse | ||||
| import numpy as np | ||||
| from nmslib_flask.gcs import maybe_download_gcs_file, maybe_upload_gcs_file | ||||
| from nmslib_flask.search_engine import CodeSearchEngine | ||||
| from nmslib_flask.search_server import CodeSearchServer | ||||
| 
 | ||||
def parse_server_args(args):
  """Parse the command-line options of the nmslib Flask server.

  Args:
    args: list of argument strings (typically `sys.argv[1:]`).

  Returns:
    The `argparse.Namespace` with `index_file`, `data_file`, `data_dir`,
    `host` and `port` attributes.
  """
  # Each entry is (flag, keyword options for `add_argument`), in the order
  # they should appear in the generated help text.
  option_table = (
    ('--index-file', dict(type=str, required=True,
                          help='Path to index file created by nmslib')),
    ('--data-file', dict(type=str, required=True,
                         help='Path to csv file for human-readable data')),
    ('--data-dir', dict(type=str, metavar='', default='/tmp',
                        help='Path to working data directory')),
    ('--host', dict(type=str, metavar='', default='0.0.0.0',
                    help='Host to start server on')),
    ('--port', dict(type=int, metavar='', default=8008,
                    help='Port to bind server to')),
  )

  arg_parser = argparse.ArgumentParser(prog='nmslib Flask Server')
  for flag, options in option_table:
    arg_parser.add_argument(flag, **options)

  return arg_parser.parse_args(args)
| 
 | ||||
| 
 | ||||
def parse_creator_args(args):
  """Parse the command-line options of the nmslib index creator.

  Args:
    args: list of argument strings (typically `sys.argv[1:]`).

  Returns:
    The `argparse.Namespace` with `data_file`, `output_file` and `data_dir`
    attributes.
  """
  # Each entry is (flag, keyword options for `add_argument`), in the order
  # they should appear in the generated help text.
  option_table = (
    ('--data-file', dict(type=str, required=True,
                         help='Path to csv data file for human-readable data')),
    ('--output-file', dict(type=str, metavar='', default='/tmp/index.nmslib',
                           help='Path to output index file')),
    ('--data-dir', dict(type=str, metavar='', default='/tmp',
                        help='Path to working data directory')),
  )

  arg_parser = argparse.ArgumentParser(prog='nmslib Index Creator')
  for flag, options in option_table:
    arg_parser.add_argument(flag, **options)

  return arg_parser.parse_args(args)
| 
 | ||||
def server():
  """Console entry point: start the Flask search server.

  Reads options from `sys.argv`, makes sure the working directory exists,
  resolves the index and data files (fetching them into the working
  directory when they are GCS paths) and runs the HTTP server.
  """
  cli_args = parse_server_args(sys.argv[1:])
  workdir = cli_args.data_dir

  # `exist_ok=True` makes this a no-op when the directory is already there.
  os.makedirs(workdir, exist_ok=True)

  # Download relevant files if needed (index first, then data).
  local_index, local_data = (
    maybe_download_gcs_file(remote_or_local, workdir)
    for remote_or_local in (cli_args.index_file, cli_args.data_file)
  )

  engine = CodeSearchEngine(local_index, local_data)
  CodeSearchServer(engine=engine,
                   host=cli_args.host, port=cli_args.port).run()
| 
 | ||||
| 
 | ||||
def creator():
  """Console entry point: build an nmslib index from a data file.

  Reads options from `sys.argv`, makes sure the working directory exists,
  resolves the data file (fetching it into the working directory when it is
  a GCS path), builds the index and stores it at `--output-file`.

  When `--output-file` is a GCS path the index is first written inside the
  working directory and then uploaded. When it is a local path the index is
  written straight to that path; previously it was always written to
  `<data-dir>/<basename>` and never reached a local output path located
  outside the working directory.
  """
  # Local import: only the download/upload helpers are imported at module
  # level.
  from nmslib_flask.gcs import is_gcs_path

  args = parse_creator_args(sys.argv[1:])

  if not os.path.isdir(args.data_dir):
    os.makedirs(args.data_dir, exist_ok=True)

  data_file = maybe_download_gcs_file(args.data_file, args.data_dir)

  # TODO(sanyamkapoor): parse data file into a numpy array

  data = np.load(data_file)

  if is_gcs_path(args.output_file):
    # Remote target: stage the index locally before uploading it.
    index_file = os.path.join(args.data_dir,
                              os.path.basename(args.output_file))
  else:
    # Local target: no staging copy is needed.
    index_file = args.output_file

  CodeSearchEngine.create_index(data, index_file)

  # No-op for local targets; uploads the staged file for GCS targets.
  maybe_upload_gcs_file(index_file, args.output_file)
|  | @ -0,0 +1,67 @@ | |||
| import re | ||||
| import os | ||||
| from google.cloud import storage | ||||
| 
 | ||||
| 
 | ||||
def is_gcs_path(gcs_path_string):
  """
  Checks if strings are of the format
  "gs://bucket_name" or "gs://bucket_name/file/path"
  """
  return bool(re.match(r'gs://([^/]+)(/.+)?', gcs_path_string))


def parse_gcs_path(gcs_path_string):
  """
  Get the bucket name and file path from a valid GCS path
  string (see `is_gcs_path`)

  Returns a `(bucket_name, bucket_path)` tuple; `bucket_path` is the empty
  string for a bucket-only path such as "gs://bucket_name".

  Raises `ValueError` if the string is not a valid GCS path.
  """
  if not is_gcs_path(gcs_path_string):
    raise ValueError("{} is not a valid GCS path".format(gcs_path_string))

  # `is_gcs_path` guarantees the "gs://" prefix. Strip only that prefix so a
  # "//" inside the object path is preserved instead of breaking the split.
  full_path = gcs_path_string[len('gs://'):]

  # `partition` yields an empty path when there is no "/" after the bucket,
  # where unpacking `split('/', 1)` would raise for "gs://bucket_name".
  bucket_name, _, bucket_path = full_path.partition('/')
  return bucket_name, bucket_path
| 
 | ||||
| 
 | ||||
def download_gcs_file(src_file, target_file):
  """
  Fetch the GCS object named by `src_file` (a "gs://..." path) into the
  local path `target_file`, and return `target_file`
  """
  client = storage.Client()
  bucket_name, object_name = parse_gcs_path(src_file)

  # Look up the bucket, address the object inside it and write it to disk.
  remote_object = client.get_bucket(bucket_name).blob(object_name)
  remote_object.download_to_filename(target_file)

  return target_file
| 
 | ||||
| 
 | ||||
def maybe_download_gcs_file(src_file, target_dir):
  """
  Return a local path for `src_file`: a GCS path is downloaded into
  `target_dir` (keeping its base name), anything else is returned as-is
  """
  if is_gcs_path(src_file):
    local_copy = os.path.join(target_dir, os.path.basename(src_file))
    return download_gcs_file(src_file, local_copy)

  return src_file
| 
 | ||||
| 
 | ||||
def upload_gcs_file(src_file, target_file):
  """
  Send the local file `src_file` to the GCS object named by `target_file`
  (a "gs://..." path), and return `target_file`
  """
  client = storage.Client()
  bucket_name, object_name = parse_gcs_path(target_file)

  # Look up the bucket, address the object inside it and push the file.
  remote_object = client.get_bucket(bucket_name).blob(object_name)
  remote_object.upload_from_filename(src_file)

  return target_file
| 
 | ||||
| 
 | ||||
def maybe_upload_gcs_file(src_file, target_file):
  """
  Upload `src_file` to `target_file` only when the target is a GCS path;
  for any other target nothing is transferred. Returns `target_file`
  """
  if is_gcs_path(target_file):
    return upload_gcs_file(src_file, target_file)

  return target_file
|  | @ -0,0 +1,46 @@ | |||
| import nmslib | ||||
| import numpy as np | ||||
| 
 | ||||
| 
 | ||||
class CodeSearchEngine:
  """This is a utility class which takes an nmslib
  index file and a data file to return data from"""
  def __init__(self, index_file: str, data_file: str):
    """Load the nmslib index stored at `index_file`.

    `data_file` is only recorded for now; the reverse-index map it is meant
    to feed is not implemented yet (see TODO below).
    """
    self._index_file = index_file
    self._data_file = data_file

    self.index = CodeSearchEngine.nmslib_init()
    self.index.loadIndex(index_file)

    # TODO: load the reverse-index map for actual code data
    # self.data_map =

  def embed(self, query_str):
    """Turn a query string into an embedding (not implemented yet)."""
    # TODO load trained model and embed input strings
    raise NotImplementedError

  def query(self, query_str: str, k=2):
    """Return the `k` nearest neighbours of `query_str` in the index.

    Returns the `(idxs, dists)` pair produced by the index's `knnQuery`.
    """
    embedding = self.embed(query_str)
    idxs, dists = self.index.knnQuery(embedding, k=k)

    # TODO(sanyamkapoor): initialize data map and return
    # list of dicts
    # [
    #     {'src': self.data_map[idx], 'dist': dist}
    #     for idx, dist in zip(idxs, dists)
    # ]
    return idxs, dists

  @staticmethod
  def nmslib_init():
    """Initializes an nmslib index object"""
    index = nmslib.init(method='hnsw', space='cosinesimil')
    return index

  @staticmethod
  def create_index(data: np.ndarray, save_path: str):
    """Add numpy data to the index and save to path"""
    # Annotated as `np.ndarray` (the array type); `np.array` is the factory
    # function and is not a type.
    index = CodeSearchEngine.nmslib_init()
    index.addDataPointBatch(data)
    index.createIndex({'post': 2}, print_progress=True)
    index.saveIndex(save_path)
|  | @ -0,0 +1,31 @@ | |||
| from flask import Flask, request, abort, jsonify, make_response | ||||
| 
 | ||||
| 
 | ||||
class CodeSearchServer:
  """This utility class wraps the search engine into
  an HTTP server based on Flask"""
  def __init__(self, engine, host='0.0.0.0', port=8008):
    """Create the Flask app around `engine` and register its routes.

    Args:
      engine: search engine object; its `query(query_str)` method is called
        by the `/query` route.
      host: host the server listens on.
      port: port the server binds to.
    """
    self.app = Flask(__name__)
    self.host = host
    self.port = port
    self.engine = engine

    # Routes must exist before `run()`; previously nothing ever called
    # `init_routes`, so the server started with no endpoints.
    self._routes_initialized = False
    self.init_routes()

  def init_routes(self):
    """Register the `/ping` and `/query` routes on the Flask app.

    Safe to call more than once: later calls are no-ops, so callers that
    invoke it explicitly after construction do not re-register the views.
    """
    # pylint: disable=unused-variable
    if getattr(self, '_routes_initialized', False):
      return
    self._routes_initialized = True

    @self.app.route('/ping')
    def ping():
      return make_response(jsonify(status=200), 200)

    @self.app.route('/query')
    def query():
      query_str = request.args.get('query')
      if not query_str:
        abort(make_response(
          jsonify(status=400, error="empty query"), 400))

      # The engine exposes `query`, not `search`.
      # NOTE(review): `CodeSearchEngine.query` currently returns the raw
      # `(idxs, dists)` pair from the index; confirm it is JSON-serializable
      # (or convert it) once `embed` is implemented.
      result = self.engine.query(query_str)
      return make_response(jsonify(result=result))

  def run(self):
    """Start the Flask server on the configured host and port."""
    self.app.run(host=self.host, port=self.port)
|  | @ -0,0 +1,4 @@ | |||
| Flask~=1.0.0 | ||||
| nmslib~=1.7.0 | ||||
| numpy~=1.14.0 | ||||
| google-cloud-storage~=1.10.0 | ||||
|  | @ -0,0 +1,23 @@ | |||
| from setuptools import setup, find_packages | ||||
| 
 | ||||
# Runtime dependencies are read from requirements.txt. Each entry is stripped
# and blank or comment lines are dropped, so `install_requires` receives clean
# requirement strings rather than raw lines with trailing newlines.
with open('requirements.txt', 'r') as f:
  install_requires = [
    line.strip() for line in f
    if line.strip() and not line.strip().startswith('#')
  ]

VERSION = '0.1.0'

setup(name='code-search-index-server',
      description='Kubeflow Code Search Demo - Index Server',
      url='https://www.github.com/kubeflow/examples',
      author='Sanyam Kapoor',
      author_email='sanyamkapoor@google.com',
      version=VERSION,
      license='MIT',
      packages=find_packages(),
      install_requires=install_requires,
      extras_require={},
      # Console commands exposed by the package.
      entry_points={
        'console_scripts': [
          'nmslib-serve=nmslib_flask.cli:server',
          'nmslib-create=nmslib_flask.cli:creator',
        ]
      })
		Loading…
	
		Reference in New Issue