Create a script to update the index and lookup file used to serve predictions. (#352)

* This script will be the last step in a pipeline to continuously update
  the index for serving.

* The script updates the parameters of the search index server to point
  to the supplied index files. It then commits them and creates a PR
  to push those commits.

* Restructure the parameters for the search index server so that we can use
  ks param set to override the indexFile and lookupFile.

* We do this because we want to be able to push a new index by doing
  `ks param set` in a continuously running pipeline.
* Remove default parameters from search-index-server

* Create a dockerfile suitable for running this script.
This commit is contained in:
Jeremy Lewi 2018-11-26 06:35:27 -08:00 committed by k8s-ci-robot
parent 4f95e85e63
commit 5d6a4e9d71
8 changed files with 158 additions and 14 deletions

View File

@ -72,6 +72,17 @@ build-ui-gcb:
gcloud builds submit --machine-type=n1-highcpu-32 --project=kubeflow-ci --config=./build/build.ui.json \
--timeout=3600 ./build
# Build the index_updater image using Google Cloud Build (GCB).
build-index-updater-gcb:
mkdir -p build
# Render the Cloud Build config from the jsonnet template, injecting the
# git version and image tag.
jsonnet ./docker/index_updater/build.jsonnet --ext-str gitVersion=$(GIT_VERSION) --ext-str tag=$(TAG) \
> ./build/build.index_updater.json
# Stage the Docker build context under ./build.
cp -r ./docker ./build/
cp -r ./src ./build/
# Drop test data so the uploaded build context stays small.
rm -rf ./build/src/code_search/dataflow/cli/test_data
rm -rf ./build/src/code_search/t2t/test_data
gcloud builds submit --machine-type=n1-highcpu-32 --project=kubeflow-ci --config=./build/build.index_updater.json \
--timeout=3600 ./build
# Build but don't attach the latest tag. This allows manual testing/inspection of the image
# first.
push-cpu: build-cpu

View File

@ -0,0 +1,9 @@
# Image for running the index-update script; provides the hub GitHub CLI
# used to open pull requests.
FROM ubuntu:xenial

# Install wget (needed to fetch the hub release) and clean the apt lists
# in the same layer to keep the image small.
RUN apt-get update && apt-get install -y wget &&\
rm -rf /var/lib/apt/lists/*

# Install the hub GitHub CLI. Delete the downloaded tarball in the same
# RUN so it is not baked into the image layer.
RUN wget -O /tmp/hub-linux-amd64-2.6.0.tgz https://github.com/github/hub/releases/download/v2.6.0/hub-linux-amd64-2.6.0.tgz && \
cd /usr/local && \
tar -xvf /tmp/hub-linux-amd64-2.6.0.tgz && \
ln -sf /usr/local/hub-linux-amd64-2.6.0/bin/hub /usr/local/bin/hub && \
rm /tmp/hub-linux-amd64-2.6.0.tgz

View File

@ -0,0 +1,3 @@
# Index Updater
A Docker image and script for updating the index used to serve predictions.

View File

@ -0,0 +1,26 @@
// TODO(jlewi): We should tag the image latest and then
// use latest as a cache so that rebuilds are fast
// https://cloud.google.com/cloud-build/docs/speeding-up-builds#using_a_cached_docker_image
{
"steps": [
{
"id": "build",
"name": "gcr.io/cloud-builders/docker",
"args": ["build", "-t", "gcr.io/kubeflow-examples/code-search/index_updater:" + std.extVar("tag"),
"--label=git-versions=" + std.extVar("gitVersion"),
"--file=docker/index_updater/Dockerfile",
"."],
},
{
"id": "tag",
"name": "gcr.io/cloud-builders/docker",
"args": ["tag", "gcr.io/kubeflow-examples/code-search/index_updater:" + std.extVar("tag"),
"gcr.io/kubeflow-examples/code-search/index_updater:latest",],
"waitFor": ["build"],
},
],
"images": ["gcr.io/kubeflow-examples/code-search/index_updater:" + std.extVar("tag"),
"gcr.io/kubeflow-examples/code-search/index_updater:latest",
],
}

View File

@ -0,0 +1,95 @@
#!/bin/bash
#
# This script creates a PR updating the nmslib index used by search-index-server.
# It uses the ks CLI to update the parameters, commits the change, and then
# uses the hub GitHub CLI to create a PR.
#
# The argument --base can be used to change the owner/org of the repo the PR is opened on.
# To use the main kubeflow/examples repo use
#   --base=kubeflow:master
#
# To use user alex's fork use
#   --base=alex:master
set -ex

# Directory containing this script (kept for convenience; currently unused).
DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" > /dev/null && pwd)"

# Parse flags of the form --{name}={value} (or bare --{name}, treated as
# true) into shell variables of the same name.
parseArgs() {
  while [[ $# -gt 0 ]]; do
    echo parsing "$1"
    if [[ $1 =~ ^--(.*)=(.*)$ ]]; then
      name=${BASH_REMATCH[1]}
      value=${BASH_REMATCH[2]}
      # NOTE(review): eval trusts the flag name/value; only invoke this
      # script with trusted arguments.
      eval ${name}="${value}"
    elif [[ $1 =~ ^--(.*)$ ]]; then
      name=${BASH_REMATCH[1]}
      value=true
      eval ${name}="${value}"
    else
      echo "Argument $1 did not match the pattern --{name}={value} or --{name}"
    fi
    shift
  done
}

usage() {
  echo "Usage: update_index.sh --base=OWNER:branch --appDir=<ksonnet app dir> --env=<ksonnet environment> --indexFile=<index file> --lookupFile=<lookup file>"
}

# Use "$@" (not $*) so arguments containing spaces survive word splitting.
parseArgs "$@"

if [ -n "${help}" ]; then
  usage
  # Stop after printing help instead of falling through to the
  # required-parameter checks.
  exit 0
fi

if [ -z "${dryrun}" ]; then
  dryrun=false
fi

# Verify that all required parameters were supplied.
names=(appDir env lookupFile indexFile base)
missingParam=false
for i in "${names[@]}"; do
  if [ -z "${!i}" ]; then
    echo "--${i} not set"
    missingParam=true
  fi
done

if ${missingParam}; then
  usage
  exit 1
fi

cd "${appDir}"

# Point the serving component at the new index and lookup files.
ks param set --env=${env} search-index-server indexFile ${indexFile}
ks param set --env=${env} search-index-server lookupFile ${lookupFile}

git add .

if (! ${dryrun}); then
  git commit -m "Update the lookup and index file."
  git push
else
  echo "dryrun; not committing to git."
fi

# Write the PR description to a temp file for hub's -F flag.
FILE=$(mktemp tmp.create_pull_request.XXXX)
cat <<EOF >$FILE
Update the lookup and index file.
This PR is automatically generated by update_index.sh.
This PR updates the index and lookup file used to serve
predictions.
EOF

# Create a pull request
if (! ${dryrun}); then
  hub pull-request --base=${base} -F ${FILE}
fi

# Clean up the temp file so it doesn't pollute the ksonnet app dir
# (especially on dry runs).
rm -f "${FILE}"

View File

@ -97,11 +97,9 @@
indexFile: $.components["t2t-code-search"].workingDir + "/code_search_index.nmslib",
},
"search-index-server": {
// Most defaults should be defined in experiments.libsonnet.
// Parameters will be used to override those values.
name: "search-index-server",
problem: $.components["t2t-code-search"].problem,
dataDir: $.components["t2t-code-search"].workingDir + "/data",
lookupFile: $.components["t2t-code-search"].workingDir + "/code_search_index.csv",
indexFile: $.components["t2t-code-search"].workingDir + "/code_search_index.nmslib",
servingUrl: "http://t2t-code-search.kubeflow:8500/v1/models/t2t-code-search:predict",
// 1 replica is convenient for debugging but we should bump after debugging.
replicas: 1,

View File

@ -7,9 +7,10 @@ local experiments = import "experiments.libsonnet";
local experimentName = baseParams.experiment;
local experimentParams = experiments[experimentName];
local params = baseParams + experimentParams + {
name: "search-index-server",
};
// baseParams override experiment parameters because we want to be able to set a new
// index and csv file by doing ks param set.
local params = experimentParams + baseParams;
local deploymentSpec = {
apiVersion: "extensions/v1beta1",

View File

@ -1,14 +1,15 @@
local params = std.extVar("__ksonnet/params");
local globals = import "globals.libsonnet";
local params = std.extVar('__ksonnet/params');
local globals = import 'globals.libsonnet';
local envParams = params {
components+: {
"t2t-code-search"+: {
},
"t2t-code-search"+: {},
"t2t-code-search-datagen"+: {
githubTable: "",
githubTable: '',
},
"submit-preprocess-job"+: {
githubTable: "",
githubTable: '',
},
"search-index-server"+: {
},
},
};
@ -18,4 +19,4 @@ local envParams = params {
[x]: envParams.components[x] + globals
for x in std.objectFields(envParams.components)
},
}
}