mirror of https://github.com/kubeflow/examples.git
Add update search index pipeline (#361)
* add search index creator container
* add pipeline
* update op name
* update readme
* update scripts
* typo fix
* Update Makefile
* Update Makefile
* address comments
* fix ks
* update pipeline
* restructure the images
* remove echo
* update image
* format
* format
* address comments
This commit is contained in:
parent 15007fdeea
commit 31390d39a0
@@ -72,15 +72,16 @@ build-ui-gcb:
 	gcloud builds submit --machine-type=n1-highcpu-32 --project=kubeflow-ci --config=./build/build.ui.json \
 		--timeout=3600 ./build
 
-build-index-updater-gcb:
+build-ks-gcb:
 	mkdir -p build
-	jsonnet ./docker/index_updater/build.jsonnet --ext-str gitVersion=$(GIT_VERSION) --ext-str tag=$(TAG) \
-		> ./build/build.index_updater.json
+	jsonnet ./docker/ks/build.jsonnet --ext-str gitVersion=$(GIT_VERSION) --ext-str tag=$(TAG) \
+		> ./build/build.ks.json
 	cp -r ./docker ./build/
+	cp -r ./kubeflow ./build/
 	cp -r ./src ./build/
 	rm -rf ./build/src/code_search/dataflow/cli/test_data
 	rm -rf ./build/src/code_search/t2t/test_data
-	gcloud builds submit --machine-type=n1-highcpu-32 --project=kubeflow-ci --config=./build/build.index_updater.json \
+	gcloud builds submit --machine-type=n1-highcpu-32 --project=kubeflow-ci --config=./build/build.ks.json \
 		--timeout=3600 ./build
 
 # Build but don't attach the latest tag. This allows manual testing/inspection of the image
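For reference, a minimal sketch of invoking the renamed target locally, assuming gcloud is authenticated against the kubeflow-ci project; the TAG and GIT_VERSION overrides shown are hypothetical, since the Makefile normally derives them itself:

```bash
# Submit the ks image build to Google Cloud Build via the new target.
# The TAG and GIT_VERSION values here are illustrative overrides only.
make build-ks-gcb GIT_VERSION=$(git describe --always --dirty) TAG=v20181126
```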
@@ -1,9 +0,0 @@
-FROM ubuntu:xenial
-
-RUN apt-get update && apt-get install -y wget &&\
-    rm -rf /var/lib/apt/lists/*
-
-RUN wget -O /tmp/hub-linux-amd64-2.6.0.tgz https://github.com/github/hub/releases/download/v2.6.0/hub-linux-amd64-2.6.0.tgz && \
-    cd /usr/local && \
-    tar -xvf /tmp/hub-linux-amd64-2.6.0.tgz && \
-    ln -sf /usr/local/hub-linux-amd64-2.6.0/bin/hub /usr/local/bin/hub
@@ -1,3 +0,0 @@
-# Index Updater
-
-A Docker image and script suitable for updating the index served.
@@ -0,0 +1,23 @@
+FROM ubuntu:xenial
+
+RUN apt-get update && apt-get install -y wget &&\
+    rm -rf /var/lib/apt/lists/*
+
+RUN wget -O /tmp/hub-linux-amd64-2.6.0.tgz https://github.com/github/hub/releases/download/v2.6.0/hub-linux-amd64-2.6.0.tgz && \
+    cd /usr/local && \
+    tar -xvf /tmp/hub-linux-amd64-2.6.0.tgz && \
+    ln -sf /usr/local/hub-linux-amd64-2.6.0/bin/hub /usr/local/bin/hub
+
+RUN wget -O /opt/ks_0.12.0_linux_amd64.tar.gz \
+    https://github.com/ksonnet/ksonnet/releases/download/v0.12.0/ks_0.12.0_linux_amd64.tar.gz && \
+    tar -C /opt -xzf /opt/ks_0.12.0_linux_amd64.tar.gz && \
+    cp /opt/ks_0.12.0_linux_amd64/ks /bin/. && \
+    rm -f /opt/ks_0.12.0_linux_amd64.tar.gz && \
+    wget -O /bin/kubectl \
+    https://storage.googleapis.com/kubernetes-release/release/v1.11.2/bin/linux/amd64/kubectl && \
+    chmod u+x /bin/kubectl
+
+ADD kubeflow /usr/local/src
+ADD docker/ks/*.sh /usr/local/src/
+
+WORKDIR /usr/local/src
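As a side note, a local-build sketch of this ks image; the repo actually builds it through docker/ks/build.jsonnet and Cloud Build, so the Dockerfile path and the image tag below are assumptions:

```bash
# Build from the repo root so the `kubeflow` and `docker/ks/*.sh` paths in
# the ADD instructions resolve against the build context.
# The image name/tag is illustrative only.
docker build -t code-search-ks:dev -f docker/ks/Dockerfile .
```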
@@ -0,0 +1,49 @@
+#!/bin/bash
+# This script is a wrapper around the search-index-creator ksonnet component;
+# it creates a Kubernetes job that computes the search index.
+# For more details about the search-index-creator ksonnet component, see
+# https://github.com/kubeflow/examples/blob/master/code_search/kubeflow/components/search-index-creator.jsonnet
+
+set -ex
+
+DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" > /dev/null && pwd)"
+
+# Passing a negative value to kubectl wait makes it wait for up to a week.
+timeout="-1s"
+# Ksonnet environment name. Always use "pipeline".
+ksEnvName="pipeline"
+# Search index creator ksonnet component name.
+component="search-index-creator"
+
+usage() {
+  echo "Usage: launch_search_index_creator_job.sh --workingDir=<working dir> --workflowId=<workflow id invoking the container>
+  --dataDir=<data dir> --timeout=<timeout> --namespace=<kubernetes namespace> --cluster=<cluster to deploy job to>"
+}
+
+# List of required parameters
+names=(workingDir workflowId dataDir namespace cluster)
+
+source "${DIR}/parse_arguments.sh"
+
+# Configure kubectl to use the underlying cluster
+kubectl config set-cluster "${cluster}" --server=https://kubernetes.default --certificate-authority=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+kubectl config set-credentials pipeline --token "$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)"
+kubectl config set-context kubeflow --cluster "${cluster}" --user pipeline
+kubectl config use-context kubeflow
+ks env set "${ksEnvName}" --namespace="${namespace}"
+
+# Apply parameters
+ks param set ${component} dataDir ${dataDir} --env ${ksEnvName}
+ks param set ${component} jobNameSuffix ${workflowId} --env ${ksEnvName}
+ks param set ${component} lookupFile ${workingDir}/code-embeddings-index/${workflowId}/embedding-to-info.csv --env ${ksEnvName}
+ks param set ${component} indexFile ${workingDir}/code-embeddings-index/${workflowId}/embeddings.index --env ${ksEnvName}
+
+ks apply ${ksEnvName} -c "${component}"
+
+JOB_NAME="pipeline-create-search-index-${workflowId}"
+echo "wait for ${JOB_NAME} to finish"
+
+kubectl wait --timeout="${timeout}" --for=condition=complete job/${JOB_NAME} -n "${namespace}"
+# If the wait above failed, the script fails fast and the following command won't run.
+# TODO: complete doesn't mean it's successful. Check that the job succeeded.
+echo "${JOB_NAME} succeeded"
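An illustrative invocation, using the demo values from the README further down; the workflowId and namespace are hypothetical, since inside the pipeline they come from the Argo workflow name and the ContainerOp arguments:

```bash
./launch_search_index_creator_job.sh \
  --workingDir=gs://code-search-demo/pipeline \
  --dataDir=gs://code-search-demo/20181104/data \
  --workflowId=my-workflow-abc123 \
  --cluster=cs-demo-1103 \
  --namespace=kubeflow
```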
@@ -0,0 +1,39 @@
+#!/bin/bash
+# Common logic for parsing command-line arguments.
+# To use it, define a usage() function for the help message and a `names`
+# array of required argument names in the parent script.
+
+parseArgs() {
+  # Parse all command line options
+  while [[ $# -gt 0 ]]; do
+    # Parameters should be of the form
+    # --{name}=${value}
+    echo parsing "$1"
+    if [[ $1 =~ ^--(.*)=(.*)$ ]]; then
+      name=${BASH_REMATCH[1]}
+      value=${BASH_REMATCH[2]}
+      eval ${name}="${value}"
+    elif [[ $1 =~ ^--(.*)$ ]]; then
+      name=${BASH_REMATCH[1]}
+      value=true
+      eval ${name}="${value}"
+    else
+      echo "Argument $1 did not match the pattern --{name}={value} or --{name}"
+    fi
+    shift
+  done
+}
+
+parseArgs $*
+
+missingParam=false
+for i in ${names[@]}; do
+  if [ -z ${!i} ]; then
+    echo "--${i} not set"
+    missingParam=true
+  fi
+done
+if ${missingParam}; then
+  usage
+  exit 1
+fi
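A minimal sketch of the calling convention this helper expects; example.sh and its flags are hypothetical:

```bash
#!/bin/bash
# example.sh -- hypothetical parent script using parse_arguments.sh
usage() {
  echo "Usage: example.sh --foo=<value> --bar=<value>"
}
names=(foo bar)  # required parameters checked by parse_arguments.sh
source "$(dirname "$0")/parse_arguments.sh"
# Each --{name}={value} flag lands in a like-named shell variable.
echo "foo=${foo} bar=${bar}"
```

Running `./example.sh --foo=1 --bar=2` prints `foo=1 bar=2`; omitting a required flag prints the usage message and exits with status 1.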
@@ -14,33 +14,14 @@ set -ex
 
 DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" > /dev/null && pwd)"
 
-parseArgs() {
-  # Parse all command line options
-  while [[ $# -gt 0 ]]; do
-    # Parameters should be of the form
-    # --{name}=${value}
-    echo parsing "$1"
-    if [[ $1 =~ ^--(.*)=(.*)$ ]]; then
-      name=${BASH_REMATCH[1]}
-      value=${BASH_REMATCH[2]}
-
-      eval ${name}="${value}"
-    elif [[ $1 =~ ^--(.*)$ ]]; then
-      name=${BASH_REMATCH[1]}
-      value=true
-      eval ${name}="${value}"
-    else
-      echo "Argument $1 did not match the pattern --{name}={value} or --{name}"
-    fi
-    shift
-  done
-}
-
 usage() {
   echo "Usage: update_index.sh --base=OWNER:branch --appDir=<ksonnet app dir> --env=<ksonnet environment> --indexFile=<index file> --lookupFile=<lookup file>"
 }
 
-parseArgs $*
+# List of required parameters
+names=(appDir env lookupFile indexFile base)
+
+source "${DIR}/parse_arguments.sh"
 
 if [ ! -z ${help} ]; then
   usage
@@ -50,22 +31,6 @@ if [ -z ${dryrun} ]; then
 	dryrun=false
 fi
 
-# List of required parameters
-names=(appDir env lookupFile indexFile base)
-
-
-missingParam=false
-for i in ${names[@]}; do
-  if [ -z ${!i} ]; then
-    echo "--${i} not set"
-    missingParam=true
-  fi
-done
-
-if ${missingParam}; then
-  usage
-  exit 1
-fi
 cd ${appDir}
 ks param set --env=${env} search-index-server indexFile ${indexFile}
 ks param set --env=${env} search-index-server lookupFile ${lookupFile}
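An illustrative call of the updated script; the --base and --env values are hypothetical, and the index/lookup paths follow the layout used by launch_search_index_creator_job.sh above, with `<workflow-id>` left as a placeholder:

```bash
./update_index.sh \
  --base=kubeflow:master \
  --appDir=./kubeflow \
  --env=pipeline \
  --indexFile=gs://code-search-demo/pipeline/code-embeddings-index/<workflow-id>/embeddings.index \
  --lookupFile=gs://code-search-demo/pipeline/code-embeddings-index/<workflow-id>/embedding-to-info.csv
```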
@@ -25,5 +25,8 @@
     problem: "kf_github_function_docstring",
     // modelBasePath shouldn't have integer in it.
     modelBasePath: "gs://code-search-demo/models/20181107-dist-sync-gpu/export/",
+
+    lookupFile: "null",
+    indexFile: "null",
   },
 }
@@ -79,11 +79,9 @@
     },
     "search-index-creator": {
       name: "search-index-creator",
-      jobNameSuffix: "",
+      jobNameSuffix: "null",
       image: $.components["t2t-job"].dataflowImage,
       dataDir: $.components["t2t-code-search"].workingDir + "/data",
-      lookupFile: $.components["t2t-code-search"].workingDir + "/code_search_index.csv",
-      indexFile: $.components["t2t-code-search"].workingDir + "/code_search_index.nmslib",
     },
     "submit-preprocess-job": {
       name: "submit-preprocess-job",
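These "null" defaults are placeholders; launch_search_index_creator_job.sh overrides them per run with `ks param set`. To inspect the effective values, a sketch assuming a checked-out ksonnet app with the pipeline environment:

```bash
# List the component's parameters as resolved for the pipeline environment.
ks param list search-index-creator --env pipeline
```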
@@ -0,0 +1,12 @@
+To run the pipeline, follow the Kubeflow Pipelines instructions: compile
+index_update_pipeline.py and upload it to the pipelines page.
+
+Provide the parameters, e.g.
+
+```
+PROJECT='code-search-demo'
+CLUSTER_NAME='cs-demo-1103'
+WORKING_DIR='gs://code-search-demo/pipeline'
+SAVED_MODEL_DIR='gs://code-search-demo/models/20181107-dist-sync-gpu/export/1541712907/'
+DATA_DIR='gs://code-search-demo/20181104/data'
+```
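Compiling is a one-liner; a sketch, with the output name following the compiler call at the bottom of index_update_pipeline.py:

```bash
# Produces index_update_pipeline.py.tar.gz next to the source file;
# upload that archive through the Kubeflow Pipelines UI.
python index_update_pipeline.py
```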
@@ -0,0 +1,142 @@
+from typing import Dict
+from kubernetes import client as k8s_client
+import kfp.dsl as dsl
+
+# disable max arg lint check
+#pylint: disable=R0913
+
+
+def default_gcp_op(name: str, image: str, command: str = None,
+                   arguments: str = None, file_inputs: Dict[dsl.PipelineParam, str] = None,
+                   file_outputs: Dict[str, str] = None, is_exit_handler=False):
+  """An operator that mounts the default GCP service account to the container.
+
+  The user-gcp-sa secret is created as part of the kubeflow deployment and
+  stores the access token for the kubeflow user service account.
+
+  With this service account, the container has access to a range of GCP
+  APIs. The service account is created automatically as part of the
+  kubeflow deployment.
+
+  For the list of GCP APIs this service account can access, check
+  https://github.com/kubeflow/kubeflow/blob/7b0db0d92d65c0746ac52b000cbc290dac7c62b1/deployment/gke/deployment_manager_configs/iam_bindings_template.yaml#L18
+
+  If you want to call GCP APIs in a different project, grant the kf-user
+  service account the required access permissions.
+  """
+  return (
+    dsl.ContainerOp(
+      name,
+      image,
+      command,
+      arguments,
+      file_inputs,
+      file_outputs,
+      is_exit_handler,
+    )
+    .add_volume(
+      k8s_client.V1Volume(
+        name='gcp-credentials',
+        secret=k8s_client.V1SecretVolumeSource(
+          secret_name='user-gcp-sa'
+        )
+      )
+    )
+    .add_volume_mount(
+      k8s_client.V1VolumeMount(
+        mount_path='/secret/gcp-credentials',
+        name='gcp-credentials',
+      )
+    )
+    .add_env_variable(
+      k8s_client.V1EnvVar(
+        name='GOOGLE_APPLICATION_CREDENTIALS',
+        value='/secret/gcp-credentials/user-gcp-sa.json'
+      )
+    )
+  )
+
+
+def dataflow_function_embedding_op(
+    project: 'GcpProject', runner: str, target_dataset: str, problem: str,
+    data_dir: 'GcsUri', saved_model_dir: 'GcsUri', temp_location: 'GcsUri',
+    staging_location: 'GcsUri',
+    job_name: str, worker_machine_type: str,
+    num_workers: int, step_name='dataflow_function_embedding'):
+  return default_gcp_op(
+    name=step_name,
+    image='gcr.io/kubeflow-examples/code-search-dataflow:latest',
+    command=[
+      'python2',
+      '-m',
+      'code_search.dataflow.cli.create_function_embeddings',
+    ],
+    arguments=[
+      '--project', project,
+      '--runner', runner,
+      '--target_dataset', target_dataset,
+      '--problem', problem,
+      '--data_dir', data_dir,
+      '--saved_model_dir', saved_model_dir,
+      '--job_name', job_name,
+      '--temp_location', temp_location,
+      '--staging_location', staging_location,
+      '--worker_machine_type', worker_machine_type,
+      '--num_workers', num_workers,
+      '--requirements_file', 'requirements.dataflow.txt',
+      '--wait_until_finished',
+    ]
+  )
+
+
+def search_index_creator_op(
+    working_dir: str, data_dir: str, workflow_id: str, cluster_name: str, namespace: str):
+  return dsl.ContainerOp(
+    # use component name as step name
+    name='search_index_creator',
+    image='gcr.io/kubeflow-examples/code-search-ks:v20181126-e62ebca-dirty-4103da',
+    command=['/usr/local/src/launch_search_index_creator_job.sh'],
+    arguments=[
+      '--workingDir=%s' % working_dir,
+      '--dataDir=%s' % data_dir,
+      '--workflowId=%s' % workflow_id,
+      '--cluster=%s' % cluster_name,
+      '--namespace=%s' % namespace,
+    ]
+  )
+
+
+# The pipeline definition
+@dsl.pipeline(
+  name='function_embedding',
+  description='Example function embedding pipeline'
+)
+def function_embedding_update(
+    project,
+    working_dir,
+    data_dir,
+    saved_model_dir,
+    cluster_name,
+    namespace,
+    problem=dsl.PipelineParam(
+      name='problem', value='kf_github_function_docstring'),
+    runner=dsl.PipelineParam(name='runner', value='DataflowRunner'),
+    target_dataset=dsl.PipelineParam(
+      name='target-dataset', value='code_search'),
+    worker_machine_type=dsl.PipelineParam(
+      name='worker-machine-type', value='n1-highcpu-32'),
+    num_workers=dsl.PipelineParam(name='num-workers', value=5)):
+  workflow_name = '{{workflow.name}}'
+  temp_location = '%s/dataflow/%s/temp' % (working_dir, workflow_name)
+  staging_location = '%s/dataflow/%s/staging' % (working_dir, workflow_name)
+  function_embedding = dataflow_function_embedding_op(
+    project, runner, target_dataset, problem, data_dir,
+    saved_model_dir, temp_location, staging_location, workflow_name,
+    worker_machine_type, num_workers)
+  search_index_creator_op(
+    working_dir, data_dir, workflow_name, cluster_name, namespace).after(function_embedding)
+
+
+if __name__ == '__main__':
+  import kfp.compiler as compiler
+
+  compiler.Compiler().compile(function_embedding_update, __file__ + '.tar.gz')
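default_gcp_op assumes the user-gcp-sa secret exists in the namespace where the pipeline pods run; a quick sanity check, with the kubeflow namespace being an assumption:

```bash
# Verify the secret mounted by default_gcp_op exists (namespace is illustrative).
kubectl get secret user-gcp-sa -n kubeflow
```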