mirror of https://github.com/kubeflow/examples.git
Add update search index pipeline (#361)
* add search index creator container
* add pipeline
* update op name
* update readme
* update scripts
* typo fix
* Update Makefile
* Update Makefile
* address comments
* fix ks
* update pipeline
* restructure the images
* remove echo
* update image
* format
* format
* address comments
This commit is contained in:
parent 15007fdeea
commit 31390d39a0
@@ -72,15 +72,16 @@ build-ui-gcb:
 	gcloud builds submit --machine-type=n1-highcpu-32 --project=kubeflow-ci --config=./build/build.ui.json \
 		--timeout=3600 ./build
 
-build-index-updater-gcb:
+build-ks-gcb:
 	mkdir -p build
-	jsonnet ./docker/index_updater/build.jsonnet --ext-str gitVersion=$(GIT_VERSION) --ext-str tag=$(TAG) \
-		> ./build/build.index_updater.json
+	jsonnet ./docker/ks/build.jsonnet --ext-str gitVersion=$(GIT_VERSION) --ext-str tag=$(TAG) \
+		> ./build/build.ks.json
 	cp -r ./docker ./build/
+	cp -r ./kubeflow ./build/
 	cp -r ./src ./build/
 	rm -rf ./build/src/code_search/dataflow/cli/test_data
 	rm -rf ./build/src/code_search/t2t/test_data
-	gcloud builds submit --machine-type=n1-highcpu-32 --project=kubeflow-ci --config=./build/build.index_updater.json \
+	gcloud builds submit --machine-type=n1-highcpu-32 --project=kubeflow-ci --config=./build/build.ks.json \
 		--timeout=3600 ./build
 
 # Build but don't attach the latest tag. This allows manual testing/inspection of the image
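For reference, a minimal sketch of invoking the renamed target locally, assuming gcloud is authenticated against the kubeflow-ci project; the TAG and GIT_VERSION overrides shown are hypothetical, since the Makefile normally derives them itself:

```bash
# Submit the ks image build to Google Cloud Build via the new target.
# The TAG and GIT_VERSION values here are illustrative overrides only.
make build-ks-gcb GIT_VERSION=$(git describe --always --dirty) TAG=v20181126
```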
@@ -1,9 +0,0 @@
-FROM ubuntu:xenial
-
-RUN apt-get update && apt-get install -y wget &&\
-    rm -rf /var/lib/apt/lists/*
-
-RUN wget -O /tmp/hub-linux-amd64-2.6.0.tgz https://github.com/github/hub/releases/download/v2.6.0/hub-linux-amd64-2.6.0.tgz && \
-    cd /usr/local && \
-    tar -xvf /tmp/hub-linux-amd64-2.6.0.tgz && \
-    ln -sf /usr/local/hub-linux-amd64-2.6.0/bin/hub /usr/local/bin/hub
@@ -1,3 +0,0 @@
-# Index Updater
-
-A Docker image and script suitable for updating the index served.
@@ -0,0 +1,23 @@
+FROM ubuntu:xenial
+
+RUN apt-get update && apt-get install -y wget &&\
+    rm -rf /var/lib/apt/lists/*
+
+RUN wget -O /tmp/hub-linux-amd64-2.6.0.tgz https://github.com/github/hub/releases/download/v2.6.0/hub-linux-amd64-2.6.0.tgz && \
+    cd /usr/local && \
+    tar -xvf /tmp/hub-linux-amd64-2.6.0.tgz && \
+    ln -sf /usr/local/hub-linux-amd64-2.6.0/bin/hub /usr/local/bin/hub
+
+RUN wget -O /opt/ks_0.12.0_linux_amd64.tar.gz \
+    https://github.com/ksonnet/ksonnet/releases/download/v0.12.0/ks_0.12.0_linux_amd64.tar.gz && \
+    tar -C /opt -xzf /opt/ks_0.12.0_linux_amd64.tar.gz && \
+    cp /opt/ks_0.12.0_linux_amd64/ks /bin/. && \
+    rm -f /opt/ks_0.12.0_linux_amd64.tar.gz && \
+    wget -O /bin/kubectl \
+    https://storage.googleapis.com/kubernetes-release/release/v1.11.2/bin/linux/amd64/kubectl && \
+    chmod u+x /bin/kubectl
+
+ADD kubeflow /usr/local/src
+ADD docker/ks/*.sh /usr/local/src/
+
+WORKDIR /usr/local/src
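As a side note, a local-build sketch of this ks image; the repo actually builds it through docker/ks/build.jsonnet and Cloud Build, so the Dockerfile path and the image tag below are assumptions:

```bash
# Build from the repo root so the `kubeflow` and `docker/ks/*.sh` paths in
# the ADD instructions resolve against the build context.
# The image name/tag is illustrative only.
docker build -t code-search-ks:dev -f docker/ks/Dockerfile .
```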
@@ -0,0 +1,49 @@
+#!/bin/bash
+# This script is a wrapper around the search-index-creator ksonnet component;
+# it creates a Kubernetes job that computes the search index.
+# For more details about the search-index-creator ksonnet component, see
+# https://github.com/kubeflow/examples/blob/master/code_search/kubeflow/components/search-index-creator.jsonnet
+
+set -ex
+
+DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" > /dev/null && pwd)"
+
+# Passing a negative value to kubectl wait makes it wait for up to a week.
+timeout="-1s"
+# Ksonnet environment name. Always use "pipeline".
+ksEnvName="pipeline"
+# Search index creator ksonnet component name.
+component="search-index-creator"
+
+usage() {
+  echo "Usage: launch_search_index_creator_job.sh --workingDir=<working dir> --workflowId=<workflow id invoking the container>
+  --dataDir=<data dir> --timeout=<timeout> --namespace=<kubernetes namespace> --cluster=<cluster to deploy job to>"
+}
+
+# List of required parameters
+names=(workingDir workflowId dataDir namespace cluster)
+
+source "${DIR}/parse_arguments.sh"
+
+# Configure kubectl to use the underlying cluster
+kubectl config set-cluster "${cluster}" --server=https://kubernetes.default --certificate-authority=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+kubectl config set-credentials pipeline --token "$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)"
+kubectl config set-context kubeflow --cluster "${cluster}" --user pipeline
+kubectl config use-context kubeflow
+ks env set "${ksEnvName}" --namespace="${namespace}"
+
+# Apply parameters
+ks param set ${component} dataDir ${dataDir} --env ${ksEnvName}
+ks param set ${component} jobNameSuffix ${workflowId} --env ${ksEnvName}
+ks param set ${component} lookupFile ${workingDir}/code-embeddings-index/${workflowId}/embedding-to-info.csv --env ${ksEnvName}
+ks param set ${component} indexFile ${workingDir}/code-embeddings-index/${workflowId}/embeddings.index --env ${ksEnvName}
+
+ks apply ${ksEnvName} -c "${component}"
+
+JOB_NAME="pipeline-create-search-index-${workflowId}"
+echo "wait for ${JOB_NAME} to finish"
+
+kubectl wait --timeout="${timeout}" --for=condition=complete job/${JOB_NAME} -n "${namespace}"
+# If the wait above failed, the script fails fast and the following command won't run.
+# TODO: complete doesn't mean it's successful. Check that the job succeeded.
+echo "${JOB_NAME} succeeded"
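An illustrative invocation, using the demo values from the README further down; the workflowId and namespace are hypothetical, since inside the pipeline they come from the Argo workflow name and the ContainerOp arguments:

```bash
./launch_search_index_creator_job.sh \
  --workingDir=gs://code-search-demo/pipeline \
  --dataDir=gs://code-search-demo/20181104/data \
  --workflowId=my-workflow-abc123 \
  --cluster=cs-demo-1103 \
  --namespace=kubeflow
```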
@@ -0,0 +1,39 @@
+#!/bin/bash
+# Common logic for parsing command-line arguments.
+# To use it, define a usage() function for the help message and a `names`
+# array of required argument names in the parent script.
+
+parseArgs() {
+  # Parse all command line options
+  while [[ $# -gt 0 ]]; do
+    # Parameters should be of the form
+    # --{name}=${value}
+    echo parsing "$1"
+    if [[ $1 =~ ^--(.*)=(.*)$ ]]; then
+      name=${BASH_REMATCH[1]}
+      value=${BASH_REMATCH[2]}
+      eval ${name}="${value}"
+    elif [[ $1 =~ ^--(.*)$ ]]; then
+      name=${BASH_REMATCH[1]}
+      value=true
+      eval ${name}="${value}"
+    else
+      echo "Argument $1 did not match the pattern --{name}={value} or --{name}"
+    fi
+    shift
+  done
+}
+
+parseArgs $*
+
+missingParam=false
+for i in ${names[@]}; do
+  if [ -z ${!i} ]; then
+    echo "--${i} not set"
+    missingParam=true
+  fi
+done
+if ${missingParam}; then
+  usage
+  exit 1
+fi
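A minimal sketch of the calling convention this helper expects; example.sh and its flags are hypothetical:

```bash
#!/bin/bash
# example.sh -- hypothetical parent script using parse_arguments.sh
usage() {
  echo "Usage: example.sh --foo=<value> --bar=<value>"
}
names=(foo bar)  # required parameters checked by parse_arguments.sh
source "$(dirname "$0")/parse_arguments.sh"
# Each --{name}={value} flag lands in a like-named shell variable.
echo "foo=${foo} bar=${bar}"
```

Running `./example.sh --foo=1 --bar=2` prints `foo=1 bar=2`; omitting a required flag prints the usage message and exits with status 1.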
@@ -14,33 +14,14 @@ set -ex
 
 DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" > /dev/null && pwd)"
 
-parseArgs() {
-  # Parse all command line options
-  while [[ $# -gt 0 ]]; do
-    # Parameters should be of the form
-    # --{name}=${value}
-    echo parsing "$1"
-    if [[ $1 =~ ^--(.*)=(.*)$ ]]; then
-      name=${BASH_REMATCH[1]}
-      value=${BASH_REMATCH[2]}
-
-      eval ${name}="${value}"
-    elif [[ $1 =~ ^--(.*)$ ]]; then
-      name=${BASH_REMATCH[1]}
-      value=true
-      eval ${name}="${value}"
-    else
-      echo "Argument $1 did not match the pattern --{name}={value} or --{name}"
-    fi
-    shift
-  done
-}
-
 usage() {
   echo "Usage: update_index.sh --base=OWNER:branch --appDir=<ksonnet app dir> --env=<ksonnet environment> --indexFile=<index file> --lookupFile=<lookup file>"
 }
 
-parseArgs $*
+# List of required parameters
+names=(appDir env lookupFile indexFile base)
+
+source "${DIR}/parse_arguments.sh"
 
 if [ ! -z ${help} ]; then
   usage
@@ -50,22 +31,6 @@ if [ -z ${dryrun} ]; then
 	dryrun=false
 fi
 
-# List of required parameters
-names=(appDir env lookupFile indexFile base)
-
-
-missingParam=false
-for i in ${names[@]}; do
-  if [ -z ${!i} ]; then
-    echo "--${i} not set"
-    missingParam=true
-  fi
-done
-
-if ${missingParam}; then
-  usage
-  exit 1
-fi
 cd ${appDir}
 ks param set --env=${env} search-index-server indexFile ${indexFile}
 ks param set --env=${env} search-index-server lookupFile ${lookupFile}
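An illustrative call of the updated script; the --base and --env values are hypothetical, and the index/lookup paths follow the layout used by launch_search_index_creator_job.sh above, with `<workflow-id>` left as a placeholder:

```bash
./update_index.sh \
  --base=kubeflow:master \
  --appDir=./kubeflow \
  --env=pipeline \
  --indexFile=gs://code-search-demo/pipeline/code-embeddings-index/<workflow-id>/embeddings.index \
  --lookupFile=gs://code-search-demo/pipeline/code-embeddings-index/<workflow-id>/embedding-to-info.csv
```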
@@ -25,5 +25,8 @@
     problem: "kf_github_function_docstring",
     // modelBasePath shouldn't have integer in it.
     modelBasePath: "gs://code-search-demo/models/20181107-dist-sync-gpu/export/",
+
+    lookupFile: "null",
+    indexFile: "null",
   },
 }
@@ -79,11 +79,9 @@
     },
     "search-index-creator": {
       name: "search-index-creator",
-      jobNameSuffix: "",
+      jobNameSuffix: "null",
       image: $.components["t2t-job"].dataflowImage,
       dataDir: $.components["t2t-code-search"].workingDir + "/data",
-      lookupFile: $.components["t2t-code-search"].workingDir + "/code_search_index.csv",
-      indexFile: $.components["t2t-code-search"].workingDir + "/code_search_index.nmslib",
     },
     "submit-preprocess-job": {
       name: "submit-preprocess-job",
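These "null" defaults are placeholders; launch_search_index_creator_job.sh overrides them per run with `ks param set`. To inspect the effective values, a sketch assuming a checked-out ksonnet app with the pipeline environment:

```bash
# List the component's parameters as resolved for the pipeline environment.
ks param list search-index-creator --env pipeline
```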
@@ -0,0 +1,12 @@
+To run the pipeline, follow the Kubeflow Pipelines instructions: compile
+index_update_pipeline.py and upload it to the pipelines page.
+
+Provide the parameters, e.g.
+
+```
+PROJECT='code-search-demo'
+CLUSTER_NAME='cs-demo-1103'
+WORKING_DIR='gs://code-search-demo/pipeline'
+SAVED_MODEL_DIR='gs://code-search-demo/models/20181107-dist-sync-gpu/export/1541712907/'
+DATA_DIR='gs://code-search-demo/20181104/data'
+```
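Compiling is a one-liner; a sketch, with the output name following the compiler call at the bottom of index_update_pipeline.py:

```bash
# Produces index_update_pipeline.py.tar.gz next to the source file;
# upload that archive through the Kubeflow Pipelines UI.
python index_update_pipeline.py
```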
@@ -0,0 +1,142 @@
+from typing import Dict
+from kubernetes import client as k8s_client
+import kfp.dsl as dsl
+
+# disable max arg lint check
+#pylint: disable=R0913
+
+
+def default_gcp_op(name: str, image: str, command: str = None,
+                   arguments: str = None, file_inputs: Dict[dsl.PipelineParam, str] = None,
+                   file_outputs: Dict[str, str] = None, is_exit_handler=False):
+  """An operator that mounts the default GCP service account to the container.
+
+  The user-gcp-sa secret is created as part of the kubeflow deployment and
+  stores the access token for the kubeflow user service account.
+
+  With this service account, the container has access to a range of GCP
+  APIs. The service account is created automatically as part of the
+  kubeflow deployment.
+
+  For the list of GCP APIs this service account can access, check
+  https://github.com/kubeflow/kubeflow/blob/7b0db0d92d65c0746ac52b000cbc290dac7c62b1/deployment/gke/deployment_manager_configs/iam_bindings_template.yaml#L18
+
+  If you want to call GCP APIs in a different project, grant the kf-user
+  service account the required access permissions.
+  """
+  return (
+    dsl.ContainerOp(
+      name,
+      image,
+      command,
+      arguments,
+      file_inputs,
+      file_outputs,
+      is_exit_handler,
+    )
+    .add_volume(
+      k8s_client.V1Volume(
+        name='gcp-credentials',
+        secret=k8s_client.V1SecretVolumeSource(
+          secret_name='user-gcp-sa'
+        )
+      )
+    )
+    .add_volume_mount(
+      k8s_client.V1VolumeMount(
+        mount_path='/secret/gcp-credentials',
+        name='gcp-credentials',
+      )
+    )
+    .add_env_variable(
+      k8s_client.V1EnvVar(
+        name='GOOGLE_APPLICATION_CREDENTIALS',
+        value='/secret/gcp-credentials/user-gcp-sa.json'
+      )
+    )
+  )
+
+
+def dataflow_function_embedding_op(
+    project: 'GcpProject', runner: str, target_dataset: str, problem: str,
+    data_dir: 'GcsUri', saved_model_dir: 'GcsUri', temp_location: 'GcsUri',
+    staging_location: 'GcsUri',
+    job_name: str, worker_machine_type: str,
+    num_workers: int, step_name='dataflow_function_embedding'):
+  return default_gcp_op(
+    name=step_name,
+    image='gcr.io/kubeflow-examples/code-search-dataflow:latest',
+    command=[
+      'python2',
+      '-m',
+      'code_search.dataflow.cli.create_function_embeddings',
+    ],
+    arguments=[
+      '--project', project,
+      '--runner', runner,
+      '--target_dataset', target_dataset,
+      '--problem', problem,
+      '--data_dir', data_dir,
+      '--saved_model_dir', saved_model_dir,
+      '--job_name', job_name,
+      '--temp_location', temp_location,
+      '--staging_location', staging_location,
+      '--worker_machine_type', worker_machine_type,
+      '--num_workers', num_workers,
+      '--requirements_file', 'requirements.dataflow.txt',
+      '--wait_until_finished',
+    ]
+  )
+
+
+def search_index_creator_op(
+    working_dir: str, data_dir: str, workflow_id: str, cluster_name: str, namespace: str):
+  return dsl.ContainerOp(
+    # use component name as step name
+    name='search_index_creator',
+    image='gcr.io/kubeflow-examples/code-search-ks:v20181126-e62ebca-dirty-4103da',
+    command=['/usr/local/src/launch_search_index_creator_job.sh'],
+    arguments=[
+      '--workingDir=%s' % working_dir,
+      '--dataDir=%s' % data_dir,
+      '--workflowId=%s' % workflow_id,
+      '--cluster=%s' % cluster_name,
+      '--namespace=%s' % namespace,
+    ]
+  )
+
+
+# The pipeline definition
+@dsl.pipeline(
+  name='function_embedding',
+  description='Example function embedding pipeline'
+)
+def function_embedding_update(
+    project,
+    working_dir,
+    data_dir,
+    saved_model_dir,
+    cluster_name,
+    namespace,
+    problem=dsl.PipelineParam(
+      name='problem', value='kf_github_function_docstring'),
+    runner=dsl.PipelineParam(name='runner', value='DataflowRunner'),
+    target_dataset=dsl.PipelineParam(
+      name='target-dataset', value='code_search'),
+    worker_machine_type=dsl.PipelineParam(
+      name='worker-machine-type', value='n1-highcpu-32'),
+    num_workers=dsl.PipelineParam(name='num-workers', value=5)):
+  workflow_name = '{{workflow.name}}'
+  temp_location = '%s/dataflow/%s/temp' % (working_dir, workflow_name)
+  staging_location = '%s/dataflow/%s/staging' % (working_dir, workflow_name)
+  function_embedding = dataflow_function_embedding_op(
+    project, runner, target_dataset, problem, data_dir,
+    saved_model_dir, temp_location, staging_location, workflow_name,
+    worker_machine_type, num_workers)
+  search_index_creator_op(
+    working_dir, data_dir, workflow_name, cluster_name, namespace).after(function_embedding)
+
+
+if __name__ == '__main__':
+  import kfp.compiler as compiler
+
+  compiler.Compiler().compile(function_embedding_update, __file__ + '.tar.gz')
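default_gcp_op assumes the user-gcp-sa secret exists in the namespace where the pipeline pods run; a quick sanity check, with the kubeflow namespace being an assumption:

```bash
# Verify the secret mounted by default_gcp_op exists (namespace is illustrative).
kubectl get secret user-gcp-sa -n kubeflow
```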