diff --git a/code_search/Makefile b/code_search/Makefile
index 0c4cac25..55949def 100644
--- a/code_search/Makefile
+++ b/code_search/Makefile
@@ -72,15 +72,16 @@ build-ui-gcb:
 	gcloud builds submit --machine-type=n1-highcpu-32 --project=kubeflow-ci --config=./build/build.ui.json \
 		--timeout=3600 ./build
 
-build-index-updater-gcb:
+build-ks-gcb:
 	mkdir -p build
-	jsonnet ./docker/index_updater/build.jsonnet --ext-str gitVersion=$(GIT_VERSION) --ext-str tag=$(TAG) \
-		> ./build/build.index_updater.json
+	jsonnet ./docker/ks/build.jsonnet --ext-str gitVersion=$(GIT_VERSION) --ext-str tag=$(TAG) \
+		> ./build/build.ks.json
 	cp -r ./docker ./build/
+	cp -r ./kubeflow ./build/
 	cp -r ./src ./build/
 	rm -rf ./build/src/code_search/dataflow/cli/test_data
 	rm -rf ./build/src/code_search/t2t/test_data
-	gcloud builds submit --machine-type=n1-highcpu-32 --project=kubeflow-ci --config=./build/build.index_updater.json \
+	gcloud builds submit --machine-type=n1-highcpu-32 --project=kubeflow-ci --config=./build/build.ks.json \
 		--timeout=3600 ./build
 
 # Build but don't attach the latest tag. This allows manual testing/inspection of the image
diff --git a/code_search/docker/index_updater/Dockerfile b/code_search/docker/index_updater/Dockerfile
deleted file mode 100644
index 1222d285..00000000
--- a/code_search/docker/index_updater/Dockerfile
+++ /dev/null
@@ -1,9 +0,0 @@
-FROM ubuntu:xenial
-
-RUN apt-get update && apt-get install -y wget &&\
-    rm -rf /var/lib/apt/lists/*
-
-RUN wget -O /tmp/hub-linux-amd64-2.6.0.tgz https://github.com/github/hub/releases/download/v2.6.0/hub-linux-amd64-2.6.0.tgz && \
-    cd /usr/local && \
-    tar -xvf /tmp/hub-linux-amd64-2.6.0.tgz && \
-    ln -sf /usr/local/hub-linux-amd64-2.6.0/bin/hub /usr/local/bin/hub
diff --git a/code_search/docker/index_updater/README.md b/code_search/docker/index_updater/README.md
deleted file mode 100644
index 354b5e55..00000000
--- a/code_search/docker/index_updater/README.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# Index Updater
-
-A Docker image and script suitable for updating the index served.
\ No newline at end of file
diff --git a/code_search/docker/ks/Dockerfile b/code_search/docker/ks/Dockerfile
new file mode 100644
index 00000000..cc2cba3e
--- /dev/null
+++ b/code_search/docker/ks/Dockerfile
@@ -0,0 +1,23 @@
+FROM ubuntu:xenial
+
+RUN apt-get update && apt-get install -y wget &&\
+    rm -rf /var/lib/apt/lists/*
+
+RUN wget -O /tmp/hub-linux-amd64-2.6.0.tgz https://github.com/github/hub/releases/download/v2.6.0/hub-linux-amd64-2.6.0.tgz && \
+    cd /usr/local && \
+    tar -xvf /tmp/hub-linux-amd64-2.6.0.tgz && \
+    ln -sf /usr/local/hub-linux-amd64-2.6.0/bin/hub /usr/local/bin/hub
+
+RUN wget -O /opt/ks_0.12.0_linux_amd64.tar.gz \
+    https://github.com/ksonnet/ksonnet/releases/download/v0.12.0/ks_0.12.0_linux_amd64.tar.gz && \
+    tar -C /opt -xzf /opt/ks_0.12.0_linux_amd64.tar.gz && \
+    cp /opt/ks_0.12.0_linux_amd64/ks /bin/. && \
+    rm -f /opt/ks_0.12.0_linux_amd64.tar.gz && \
+    wget -O /bin/kubectl \
+    https://storage.googleapis.com/kubernetes-release/release/v1.11.2/bin/linux/amd64/kubectl && \
+    chmod u+x /bin/kubectl
+
+ADD kubeflow /usr/local/src
+ADD docker/ks/*.sh /usr/local/src/
+
+WORKDIR /usr/local/src
diff --git a/code_search/docker/index_updater/build.jsonnet b/code_search/docker/ks/build.jsonnet
similarity index 81%
rename from code_search/docker/index_updater/build.jsonnet
rename to code_search/docker/ks/build.jsonnet
index 9c568bde..b8825856 100644
--- a/code_search/docker/index_updater/build.jsonnet
+++ b/code_search/docker/ks/build.jsonnet
@@ -7,20 +7,20 @@
     {
       "id": "build",
      "name": "gcr.io/cloud-builders/docker",
-      "args": ["build", "-t", "gcr.io/kubeflow-examples/code-search/index_updater:" + std.extVar("tag"),
-               "--label=git-versions=" + std.extVar("gitVersion"),
-               "--file=docker/index_updater/Dockerfile",
+      "args": ["build", "-t", "gcr.io/kubeflow-examples/code-search-ks:" + std.extVar("tag"),
+               "--label=git-versions=" + std.extVar("gitVersion"),
+               "--file=docker/ks/Dockerfile",
                "."],
     },
     {
       "id": "tag",
       "name": "gcr.io/cloud-builders/docker",
-      "args": ["tag", "gcr.io/kubeflow-examples/code-search/index_updater:" + std.extVar("tag"),
-               "gcr.io/kubeflow-examples/code-search/index_updater:latest",],
+      "args": ["tag", "gcr.io/kubeflow-examples/code-search-ks:" + std.extVar("tag"),
+               "gcr.io/kubeflow-examples/code-search-ks:latest",],
       "waitFor": ["build"],
     },
   ],
-  "images": ["gcr.io/kubeflow-examples/code-search/index_updater:" + std.extVar("tag"),
-             "gcr.io/kubeflow-examples/code-search/index_updater:latest",
+  "images": ["gcr.io/kubeflow-examples/code-search-ks:" + std.extVar("tag"),
+             "gcr.io/kubeflow-examples/code-search-ks:latest",
   ],
 }
\ No newline at end of file
diff --git a/code_search/docker/ks/launch_search_index_creator_job.sh b/code_search/docker/ks/launch_search_index_creator_job.sh
new file mode 100755
index 00000000..3cf763e0
--- /dev/null
+++ b/code_search/docker/ks/launch_search_index_creator_job.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+# This script is a wrapper around the search-index-creator ksonnet component;
+# it creates a Kubernetes job that computes the search index.
+# For more details about the search-index-creator ksonnet component, see
+# https://github.com/kubeflow/examples/blob/master/code_search/kubeflow/components/search-index-creator.jsonnet
+
+set -ex
+
+DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" > /dev/null && pwd)"
+
+# Passing a negative value to kubectl wait makes it wait for up to a week.
+timeout="-1s"
+# Ksonnet environment name. Always use "pipeline".
+ksEnvName="pipeline"
+# Search index creator ksonnet component name
+component="search-index-creator"
+
+usage() {
+  echo "Usage: launch_search_index_creator_job.sh --workingDir= --workflowId=
+  --dataDir= --timeout= --namespace= --cluster= "
+}
+
+# List of required parameters
+names=(workingDir workflowId dataDir namespace cluster)
+
+source "${DIR}/parse_arguments.sh"
+
+# Configure kubectl to use the underlying cluster
+kubectl config set-cluster "${cluster}" --server=https://kubernetes.default --certificate-authority=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+kubectl config set-credentials pipeline --token "$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)"
+kubectl config set-context kubeflow --cluster "${cluster}" --user pipeline
+kubectl config use-context kubeflow
+ks env set "${ksEnvName}" --namespace="${namespace}"
+
+# Apply parameters
+ks param set ${component} dataDir ${dataDir} --env ${ksEnvName}
+ks param set ${component} jobNameSuffix ${workflowId} --env ${ksEnvName}
+ks param set ${component} lookupFile ${workingDir}/code-embeddings-index/${workflowId}/embedding-to-info.csv --env ${ksEnvName}
+ks param set ${component} indexFile ${workingDir}/code-embeddings-index/${workflowId}/embeddings.index --env ${ksEnvName}
+
+ks apply ${ksEnvName} -c "${component}"
+
+JOB_NAME="pipeline-create-search-index-${workflowId}"
+echo "wait for ${JOB_NAME} to finish"
+
+kubectl wait --timeout="${timeout}" --for=condition=complete job/${JOB_NAME} -n "${namespace}"
+# If the wait above fails, the script fails fast and the following command won't run.
+# TODO: condition=complete doesn't guarantee success; check that the job actually succeeded.
+echo "${JOB_NAME} succeeded"
diff --git a/code_search/docker/ks/parse_arguments.sh b/code_search/docker/ks/parse_arguments.sh
new file mode 100644
index 00000000..303b077a
--- /dev/null
+++ b/code_search/docker/ks/parse_arguments.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+# Common logic to parse command line arguments.
+# To use it, define a usage() function for the help message and a `names` array listing the
+# required arguments in the parent script, then source this file.
+
+parseArgs() {
+  # Parse all command line options
+  while [[ $# -gt 0 ]]; do
+    # Parameters should be of the form
+    # --{name}=${value}
+    echo parsing "$1"
+    if [[ $1 =~ ^--(.*)=(.*)$ ]]; then
+      name=${BASH_REMATCH[1]}
+      value=${BASH_REMATCH[2]}
+      eval ${name}="${value}"
+    elif [[ $1 =~ ^--(.*)$ ]]; then
+      name=${BASH_REMATCH[1]}
+      value=true
+      eval ${name}="${value}"
+    else
+      echo "Argument $1 did not match the pattern --{name}={value} or --{name}"
+    fi
+    shift
+  done
+}
+
+parseArgs "$@"
+
+missingParam=false
+for i in ${names[@]}; do
+  if [ -z ${!i} ]; then
+    echo "--${i} not set"
+    missingParam=true
+  fi
+done
+if ${missingParam}; then
+  usage
+  exit 1
+fi
diff --git a/code_search/docker/index_updater/update_index.sh b/code_search/docker/ks/update_index.sh
similarity index 67%
rename from code_search/docker/index_updater/update_index.sh
rename to code_search/docker/ks/update_index.sh
index 5cb4d976..66231fb2 100755
--- a/code_search/docker/index_updater/update_index.sh
+++ b/code_search/docker/ks/update_index.sh
@@ -14,33 +14,14 @@ set -ex
 
 DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" > /dev/null && pwd)"
 
-parseArgs() {
-  # Parse all command line options
-  while [[ $# -gt 0 ]]; do
-    # Parameters should be of the form
-    # --{name}=${value}
-    echo parsing "$1"
-    if [[ $1 =~ ^--(.*)=(.*)$ ]]; then
-      name=${BASH_REMATCH[1]}
-      value=${BASH_REMATCH[2]}
-
-      eval ${name}="${value}"
-    elif [[ $1 =~ ^--(.*)$ ]]; then
-      name=${BASH_REMATCH[1]}
-      value=true
-      eval ${name}="${value}"
-    else
-      echo "Argument $1 did not match the pattern --{name}={value} or --{name}"
-    fi
-    shift
-  done
-}
-
 usage() {
   echo "Usage: update_index.sh --base=OWNER:branch --appDir= --env= --indexFile=
   --lookupFile="
 }
 
-parseArgs $*
+# List of required parameters
+names=(appDir env lookupFile indexFile base)
+
+source "${DIR}/parse_arguments.sh"
 
 if [ ! -z ${help} ]; then
   usage
@@ -50,22 +31,6 @@ if [ -z ${dryrun} ]; then
   dryrun=false
 fi
 
-# List of required parameters
-names=(appDir env lookupFile indexFile base)
-
-
-missingParam=false
-for i in ${names[@]}; do
-  if [ -z ${!i} ]; then
-    echo "--${i} not set"
-    missingParam=true
-  fi
-done
-
-if ${missingParam}; then
-  usage
-  exit 1
-fi
 cd ${appDir}
 ks param set --env=${env} search-index-server indexFile ${indexFile}
 ks param set --env=${env} search-index-server lookupFile ${lookupFile}
diff --git a/code_search/kubeflow/components/experiments.libsonnet b/code_search/kubeflow/components/experiments.libsonnet
index 17c8a096..653567c0 100644
--- a/code_search/kubeflow/components/experiments.libsonnet
+++ b/code_search/kubeflow/components/experiments.libsonnet
@@ -25,5 +25,8 @@
     problem: "kf_github_function_docstring",
     // modelBasePath shouldn't have integer in it.
     modelBasePath: "gs://code-search-demo/models/20181107-dist-sync-gpu/export/",
+
+    lookupFile: "null",
+    indexFile: "null",
   },
 }
diff --git a/code_search/kubeflow/components/params.libsonnet b/code_search/kubeflow/components/params.libsonnet
index 8b9738cd..ae58e115 100644
--- a/code_search/kubeflow/components/params.libsonnet
+++ b/code_search/kubeflow/components/params.libsonnet
@@ -79,11 +79,9 @@
     },
     "search-index-creator": {
       name: "search-index-creator",
-      jobNameSuffix: "",
+      jobNameSuffix: "null",
       image: $.components["t2t-job"].dataflowImage,
       dataDir: $.components["t2t-code-search"].workingDir + "/data",
-      lookupFile: $.components["t2t-code-search"].workingDir + "/code_search_index.csv",
-      indexFile: $.components["t2t-code-search"].workingDir + "/code_search_index.nmslib",
     },
     "submit-preprocess-job": {
       name: "submit-preprocess-job",
diff --git a/code_search/pipeline/README.md b/code_search/pipeline/README.md
new file mode 100644
index 00000000..e9b8bae9
--- /dev/null
+++ b/code_search/pipeline/README.md
@@ -0,0 +1,12 @@
+To run the pipeline, follow the Kubeflow Pipelines instructions: compile
+index_update_pipeline.py and upload the resulting package on the pipelines page.
+
+Provide the parameters, e.g.
+
+```
+PROJECT='code-search-demo'
+CLUSTER_NAME='cs-demo-1103'
+WORKING_DIR='gs://code-search-demo/pipeline'
+SAVED_MODEL_DIR='gs://code-search-demo/models/20181107-dist-sync-gpu/export/1541712907/'
+DATA_DIR='gs://code-search-demo/20181104/data'
+```
\ No newline at end of file
diff --git a/code_search/pipeline/index_update_pipeline.py b/code_search/pipeline/index_update_pipeline.py
new file mode 100644
index 00000000..810021a8
--- /dev/null
+++ b/code_search/pipeline/index_update_pipeline.py
@@ -0,0 +1,142 @@
+from typing import Dict
+from kubernetes import client as k8s_client
+import kfp.dsl as dsl
+
+# disable max arg lint check
+#pylint: disable=R0913
+
+def default_gcp_op(name: str, image: str, command: str = None,
+                   arguments: str = None, file_inputs: Dict[dsl.PipelineParam, str] = None,
+                   file_outputs: Dict[str, str] = None, is_exit_handler=False):
+  """An operator that mounts the default GCP service account to the container.
+
+  The user-gcp-sa secret is created as part of the Kubeflow deployment and
+  stores the access token for the Kubeflow user service account.
+
+  With this service account, the container has access to a range of GCP
+  APIs. The service account is created automatically as part of the
+  Kubeflow deployment.
+
+  For the list of GCP APIs this service account can access, check
+  https://github.com/kubeflow/kubeflow/blob/7b0db0d92d65c0746ac52b000cbc290dac7c62b1/deployment/gke/deployment_manager_configs/iam_bindings_template.yaml#L18
+
+  If you want to call GCP APIs in a different project, grant the kf-user
+  service account access permission in that project.
+  """
+
+  return (
+      dsl.ContainerOp(
+          name,
+          image,
+          command,
+          arguments,
+          file_inputs,
+          file_outputs,
+          is_exit_handler,
+      )
+      .add_volume(
+          k8s_client.V1Volume(
+              name='gcp-credentials',
+              secret=k8s_client.V1SecretVolumeSource(
+                  secret_name='user-gcp-sa'
+              )
+          )
+      )
+      .add_volume_mount(
+          k8s_client.V1VolumeMount(
+              mount_path='/secret/gcp-credentials',
+              name='gcp-credentials',
+          )
+      )
+      .add_env_variable(
+          k8s_client.V1EnvVar(
+              name='GOOGLE_APPLICATION_CREDENTIALS',
+              value='/secret/gcp-credentials/user-gcp-sa.json'
+          )
+      )
+  )
+
+def dataflow_function_embedding_op(
+    project: 'GcpProject', runner: str, target_dataset: str, problem: str,
+    data_dir: 'GcsUri', saved_model_dir: 'GcsUri', temp_location: 'GcsUri',
+    staging_location: 'GcsUri',
+    job_name: str, worker_machine_type: str,
+    num_workers: int, step_name='dataflow_function_embedding'):
+  return default_gcp_op(
+      name=step_name,
+      image='gcr.io/kubeflow-examples/code-search-dataflow:latest',
+      command=[
+          'python2',
+          '-m',
+          'code_search.dataflow.cli.create_function_embeddings',
+      ],
+      arguments=[
+          '--project', project,
+          '--runner', runner,
+          '--target_dataset', target_dataset,
+          '--problem', problem,
+          '--data_dir', data_dir,
+          '--saved_model_dir', saved_model_dir,
+          '--job_name', job_name,
+          '--temp_location', temp_location,
+          '--staging_location', staging_location,
+          '--worker_machine_type', worker_machine_type,
+          '--num_workers', num_workers,
+          '--requirements_file', 'requirements.dataflow.txt',
+          '--wait_until_finished',
+      ]
+  )
+
+
+def search_index_creator_op(
+    working_dir: str, data_dir: str, workflow_id: str, cluster_name: str, namespace: str):
+  return dsl.ContainerOp(
+      # use component name as the step name
+      name='search_index_creator',
+      image='gcr.io/kubeflow-examples/code-search-ks:v20181126-e62ebca-dirty-4103da',
+      command=['/usr/local/src/launch_search_index_creator_job.sh'],
+      arguments=[
+          '--workingDir=%s' % working_dir,
+          '--dataDir=%s' % data_dir,
+          '--workflowId=%s' % workflow_id,
+          '--cluster=%s' % cluster_name,
+          '--namespace=%s' % namespace,
+      ]
+  )
+
+
+# The pipeline definition
+@dsl.pipeline(
+    name='function_embedding',
+    description='Example function embedding pipeline'
+)
+def function_embedding_update(
+    project,
+    working_dir,
+    data_dir,
+    saved_model_dir,
+    cluster_name,
+    namespace,
+    problem=dsl.PipelineParam(
+        name='problem', value='kf_github_function_docstring'),
+    runner=dsl.PipelineParam(name='runner', value='DataflowRunner'),
+    target_dataset=dsl.PipelineParam(
+        name='target-dataset', value='code_search'),
+    worker_machine_type=dsl.PipelineParam(
+        name='worker-machine-type', value='n1-highcpu-32'),
+    num_workers=dsl.PipelineParam(name='num-workers', value=5)):
+  workflow_name = '{{workflow.name}}'
+  temp_location = '%s/dataflow/%s/temp' % (working_dir, workflow_name)
+  staging_location = '%s/dataflow/%s/staging' % (working_dir, workflow_name)
+  function_embedding = dataflow_function_embedding_op(
+      project, runner, target_dataset, problem, data_dir,
+      saved_model_dir, temp_location, staging_location, workflow_name,
+      worker_machine_type, num_workers)
+  search_index_creator_op(
+      working_dir, data_dir, workflow_name, cluster_name, namespace).after(function_embedding)
+
+
+if __name__ == '__main__':
+  import kfp.compiler as compiler
+
+  compiler.Compiler().compile(function_embedding_update, __file__ + '.tar.gz')
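
A note on the parse_arguments.sh contract introduced above: the file is meant to be sourced, not executed, and it expects the parent script to define usage() and a `names` array before sourcing it (bash preserves the caller's positional parameters when `source` is invoked without arguments, which is why `parseArgs "$@"` inside it sees the parent's flags). A minimal sketch of a hypothetical consumer; example.sh and its flags are illustrative, not part of this change:

```
#!/bin/bash
# example.sh - hypothetical consumer of parse_arguments.sh (illustration only).
set -ex

DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" > /dev/null && pwd)"

# Help message printed when a required argument is missing.
usage() {
  echo "Usage: example.sh --dataDir= --cluster="
}

# List of required parameters; parse_arguments.sh calls usage() and exits 1
# if any of these was not supplied as --{name}={value}.
names=(dataDir cluster)

source "${DIR}/parse_arguments.sh"

# Each --{name}={value} flag is now available as a shell variable.
echo "dataDir=${dataDir} cluster=${cluster}"
```

Running `./example.sh --dataDir=gs://bucket/data --cluster=demo` sets `${dataDir}` and `${cluster}`; dropping either flag prints the usage message and exits with status 1.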
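
For reference, an invocation of the launch script that matches its usage() line. The GCS paths reuse the sample values from pipeline/README.md; the workflowId and namespace values here are made up for illustration:

```
# All values below are illustrative; workflowId is normally supplied by the
# pipeline as {{workflow.name}}, and the namespace is deployment-specific.
/usr/local/src/launch_search_index_creator_job.sh \
  --workingDir=gs://code-search-demo/pipeline \
  --dataDir=gs://code-search-demo/20181104/data \
  --workflowId=my-workflow-abc123 \
  --cluster=cs-demo-1103 \
  --namespace=kubeflow
```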
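
Finally, the "compile and upload" step mentioned in pipeline/README.md can use the script's own `__main__` entry point; a minimal sketch, assuming the kfp SDK and its dependencies are installed locally:

```
# Produces index_update_pipeline.py.tar.gz next to the script; upload that
# archive on the Kubeflow Pipelines page and fill in the parameters listed
# in the README.
cd code_search/pipeline
python index_update_pipeline.py
```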