diff --git a/code_search/.gitignore b/code_search/.gitignore index d8a9972e..89ed79ec 100644 --- a/code_search/.gitignore +++ b/code_search/.gitignore @@ -106,3 +106,4 @@ venv.bak/ # Virtual Environments venv/ +*.key.json diff --git a/code_search/README.md b/code_search/README.md index 9bab5763..7644af3c 100644 --- a/code_search/README.md +++ b/code_search/README.md @@ -84,16 +84,13 @@ This script builds and pushes the docker image to Google Container Registry. $ gcloud auth configure-docker ``` -* Setup environment variables -``` -$ export PROJECT= # (optional) setup project ID. if not set, image is not published to GCR -$ export BUILD_IMAGE_TAG=code-search:devel # (optional) to change built image tag -$ export BASE_IMAGE_TAG=1.8.0-gpu-py3 # (optional) for GPU base image -``` - * Build and push the image ``` -$ ./language_task/build_image.sh +$ PROJECT=my-project ./language_task/build_image.sh +``` +and a GPU image +``` +$ GPU=1 PROJECT=my-project ./language_task/build_image.sh ``` See [GCR Pushing and Pulling Images](https://cloud.google.com/container-registry/docs/pushing-and-pulling) for more. 
@@ -124,29 +121,14 @@ $ docker run --rm -it -v ${MOUNT_DATA_DIR}:/data -v ${MOUNT_OUTPUT_DIR}:/output --model=transformer --hparams_set=transformer_base ``` -#### 2.2.2 Docstrings Language Model +### 2.2 Train on Kubeflow -This part trains a language model based on the docstrings in the dataset and uses `tensor2tensor` - -* Generate `TFRecords` for training -``` -$ export MOUNT_DATA_DIR=/path/to/data/folder -$ docker run --rm -it -v ${MOUNT_DATA_DIR}:/data ${BUILD_IMAGE_TAG} \ - t2t-datagen --problem=github_docstring_language_model --data_dir=/data +* Setup secrets for access permissions Google Cloud Storage and Google Container Registry +```shell +$ PROJECT=my-project ./create_secrets.sh ``` -* Train language model using `Tranformer Networks` and a custom hyper-parameters set -``` -$ export MOUNT_DATA_DIR=/path/to/data/folder -$ export MOUNT_OUTPUT_DIR=/path/to/output/folder -$ docker run --rm -it -v ${MOUNT_DATA_DIR}:/data -v ${MOUNT_OUTPUT_DIR}:/output ${BUILD_IMAGE_TAG} \ - t2t-trainer --problem=github_docstring_language_model --data_dir=/data --output_dir=/output \ - --model=transformer --hparams_set=transformer_gh_lm -``` - -### 2.3 Train on Kubeflow - -TODO +**NOTE**: Use `create_secrets.sh -d` to remove any side-effects of the above step. # Acknowledgements diff --git a/code_search/create_secrets.sh b/code_search/create_secrets.sh new file mode 100755 index 00000000..6ba834ba --- /dev/null +++ b/code_search/create_secrets.sh @@ -0,0 +1,57 @@ +#!/usr/bin/env bash + +## +# This script creates all the necessary service accounts and permissions +# needed for the training jobs to pull private images from +# Google Cloud Registry and access Google Cloud Storage. To +# undo all the changes made, add a "-d" flag while executing the +# script. +# + +set -ex + +export PROJECT=${PROJECT:-} + +if [[ -z "${PROJECT}" ]]; then + echo "PROJECT environment variable missing!" 
+ exit 1 +fi + +export SA_NAME=code-search-access +export SA_EMAIL=${SA_NAME}@${PROJECT}.iam.gserviceaccount.com +export SA_KEY_FILE=${SA_EMAIL}.key.json + + +if [[ "${1}" = "-d" ]]; then + gcloud projects remove-iam-policy-binding ${PROJECT} \ + --member=serviceAccount:${SA_EMAIL} \ + --role=roles/storage.admin + + gcloud iam service-accounts delete ${SA_EMAIL} --quiet + + rm -f ${SA_KEY_FILE} + + kubectl delete secret gcp-credentials gcp-registry-credentials + + exit 0 +fi + + +gcloud iam service-accounts create ${SA_NAME} --display-name ${SA_EMAIL} + +gcloud projects add-iam-policy-binding ${PROJECT} \ + --member=serviceAccount:${SA_EMAIL} \ + --role=roles/storage.admin + +gcloud iam service-accounts keys create ${SA_KEY_FILE} \ + --iam-account=${SA_EMAIL} + +kubectl create secret docker-registry gcp-registry-credentials \ + --docker-server=https://gcr.io \ + --docker-username=_json_key \ + --docker-password="$(cat ${SA_KEY_FILE})" \ + --docker-email=${SA_EMAIL} + +kubectl create secret generic gcp-credentials \ + --from-file=key.json="${SA_KEY_FILE}" + diff --git a/code_search/kubeflow/app.yaml b/code_search/kubeflow/app.yaml index e59f772c..8f7b6a64 100644 --- a/code_search/kubeflow/app.yaml +++ b/code_search/kubeflow/app.yaml @@ -1,11 +1,11 @@ apiVersion: 0.1.0 environments: - default: + code-search: destination: namespace: kubeflow - server: https://130.211.225.204 + server: https://35.193.190.6 k8sVersion: v1.9.6 - path: default + path: code-search kind: ksonnet.io/app libraries: tf-job: diff --git a/code_search/kubeflow/components/params.libsonnet b/code_search/kubeflow/components/params.libsonnet index 324a48cc..53a03ba4 100644 --- a/code_search/kubeflow/components/params.libsonnet +++ b/code_search/kubeflow/components/params.libsonnet @@ -7,27 +7,68 @@ // Component-level parameters, defined initially from 'ks prototype use ...' 
// Each object below should correspond to a component in the components/ directory "t2t-job": { - numWorker: 1, + jobType: "trainer", + numMaster: 1, - numPs: 1, + numWorker: 0, + numPs: 0, numWorkerGpu: 0, numPsGpu: 0, train_steps: 100, eval_steps: 10, - image: "gcr.io/kubeflow-dev/code-search:devel", - imageGpu: "gcr.io/kubeflow-dev/code-search:gpu-devel", + image: "gcr.io/kubeflow-dev/code-search:v20180621-266e689", + imageGpu: "gcr.io/kubeflow-dev/code-search:v20180621-gpu-db4f1ee", imagePullSecrets: [], + + dataDir: "null", + outputDir: "null", + model: "null", + hparams_set: "null", + + // TODO(sanyamkapoor): A workaround for tensorflow/tensor2tensor#879 + gsDataDir: "null", + gsOutputDir: "null", }, - "t2t-gh-summarizer": { - "name": "github_function_summarizer", - "problem": "github_function_summarizer", - "dataDir": "gs://kubeflow-dev/code-search/raw_data", - "outputDir": "gs://kubeflow-dev/code-search/train", - "model": "transformer", - "hparams_set": "transformer_base" + "t2t-translate-datagen": { + jobType: "datagen", + + name: "translate-ende-wmt32k-datagen", + problem: "translate_ende_wmt32k", + + // TODO(sanyamkapoor): A workaround for tensorflow/tensor2tensor#879 + dataDir: "/data", + outputDir: "/data", + gsOutputDir: "gs://kubeflow-examples/t2t-translate/translate_ende_wmt32k/datagen", + }, + + "t2t-translate-exporter": { + jobType: "exporter", + + name: "translate-ende-wmt32k-exporter", + problem: "translate_ende_wmt32k", + dataDir: "gs://kubeflow-examples/t2t-translate/translate_ende_wmt32k/datagen", + outputDir: "gs://kubeflow-examples/t2t-translate/translate_ende_wmt32k/output", + model: "transformer", + hparams_set: "transformer_base_single_gpu", + }, + + "t2t-translate": { + jobType: "trainer", + numMaster: 1, + numWorker: 2, + numPs: 1, + numWorkerGpu: 1, + numPsGpu: 1, + + name: "translate-ende-wmt32k", + problem: "translate_ende_wmt32k", + dataDir: "gs://kubeflow-examples/t2t-translate/translate_ende_wmt32k/datagen", + outputDir: 
"gs://kubeflow-examples/t2t-translate/translate_ende_wmt32k/output", + model: "transformer", + hparams_set: "transformer_base_single_gpu", }, }, } diff --git a/code_search/kubeflow/components/t2t-job.libsonnet b/code_search/kubeflow/components/t2t-job.libsonnet index 23d58f85..529388f9 100644 --- a/code_search/kubeflow/components/t2t-job.libsonnet +++ b/code_search/kubeflow/components/t2t-job.libsonnet @@ -2,17 +2,52 @@ local tfJob = import "kubeflow/tf-job/tf-job.libsonnet"; local baseParams = std.extVar("__ksonnet/params").components["t2t-job"]; { - parts(newParams, env):: { - local params = baseParams + newParams, + getGcloudAuthCmd():: + [ + "/root/google-cloud-sdk/bin/gcloud", + "auth", + "activate-service-account", + "--key-file", + "$GOOGLE_APPLICATION_CREDENTIALS", + ], - local t2tCmd = { - datagen: [ - "t2t-datagen", - "--problem=" + params.problem, - "--data_dir=" + params.dataDir, - ], + getGsUtilCmd(src_dir, dst_dir):: + [ + "/root/google-cloud-sdk/bin/gsutil", + "cp", + "-r", + src_dir, + dst_dir, + ], - trainer: [ + wrapGsUtil(cmd, params):: { + local resultCmd = + (if params.gsDataDir == "null" && params.gsOutputDir == "null" then [] else $.getGcloudAuthCmd() + ["&&"]) + + (if params.gsDataDir == "null" then [] else $.getGsUtilCmd(params.gsDataDir, params.dataDir) + ["&&"]) + + cmd + + (if params.gsOutputDir == "null" then [] else ["&&"] + $.getGsUtilCmd(params.outputDir, params.gsOutputDir)), + result: ["-c", std.join(" ", resultCmd)] + }.result, + + getDatagenCmd(params):: + [ + "t2t-datagen", + "--problem=" + params.problem, + "--data_dir=" + params.dataDir, + ], + + getExporterCmd(params):: + [ + "t2t-exporter", + "--problem=" + params.problem, + "--data_dir=" + params.dataDir, + "--output_dir=" + params.outputDir, + "--model=" + params.model, + "--hparams_set=" + params.hparams_set, + ], + + getTrainerCmd(params):: { + local trainer = [ "t2t-trainer", "--problem=" + params.problem, "--data_dir=" + params.dataDir, @@ -22,7 +57,7 @@ local 
baseParams = std.extVar("__ksonnet/params").components["t2t-job"]; "--train_steps=" + std.toString(params.train_steps), ], - workerBase: self.trainer + [ + local workerBase = trainer + [ "--schedule=train", "--ps_gpu=" + std.toString(params.numPsGpu), "--worker_gpu=" + std.toString(params.numWorkerGpu), @@ -31,36 +66,112 @@ local baseParams = std.extVar("__ksonnet/params").components["t2t-job"]; "--eval_steps=" + std.toString(params.eval_steps), ], - ps: self.trainer + [ + ps: trainer + [ "--schedule=run_std_server", "--ps_job=/job:ps", ], - worker: self.workerBase + [ + worker: workerBase + [ "--worker_job=/job:worker", ], - master: self.workerBase + [ + master: workerBase + [ "--worker_job=/job:master", ], + }, + + tfJobReplica(replicaType, number, args, image, numGpus=0, imagePullSecrets=[], env=[], volumes=[], volumeMounts=[]):: + local containerSpec = { + image: image, + name: "tensorflow", + [if std.length(args) > 0 then "args"]: args, + [if numGpus > 0 then "resources"]: { + limits: { + "nvidia.com/gpu": numGpus, + }, + }, + [if std.length(env) > 0 then "env"]: env, + [if std.length(volumeMounts) > 0 then "volumeMounts"]: volumeMounts, + }; + { + replicas: number, + template: { + spec: { + containers: [ containerSpec ], + [if std.length(imagePullSecrets) > 0 then "imagePullSecrets"]: imagePullSecrets, + [if std.length(volumes) > 0 then "volumes"]: volumes, + restartPolicy: "OnFailure", + }, + }, + tfReplicaType: replicaType, }, + parts(newParams, env):: { + local params = baseParams + newParams, + local terminationPolicy = if params.numMaster == 1 then tfJob.parts.tfJobTerminationPolicy("MASTER", 0) else tfJob.parts.tfJobTerminationPolicy("WORKER", 0), local workerImage = if params.numWorkerGpu > 0 then params.imageGpu else params.image, - local psImage = if params.numPsGpu > 0 then params.imageGpu else params.image, + local workerImagePullSecrets = [ + { name: "gcp-registry-credentials" }, + ], + local workerEnv = [ + { + name: 
"GOOGLE_APPLICATION_CREDENTIALS", + value: "/secret/gcp-credentials/key.json" + }, + ], + local workerVolumes = [ + { + name: "gcp-credentials", + secret: { + secretName: "gcp-credentials", + }, + }, + ], + local workerVolumeMounts = [ + { + mountPath: "/secret/gcp-credentials", + name: "gcp-credentials", + }, + ], + + // TODO(sanyamkapoor): A workaround for tensorflow/tensor2tensor#879 + // once fixed, simply get rid of $.wrapGsUtil method + local cmd = $.getTrainerCmd(params), + local finalCmd = { + master: $.wrapGsUtil(["/usr/local/sbin/t2t-entrypoint"] + cmd.master, params), + worker: $.wrapGsUtil(["/usr/local/sbin/t2t-entrypoint"] + cmd.worker, params), + ps: $.wrapGsUtil(["/usr/local/sbin/t2t-entrypoint"] + cmd.ps, params), + }, + local datagenCmd = $.wrapGsUtil(["/usr/local/sbin/t2t-entrypoint"] + $.getDatagenCmd(params), params), + local exporterCmd = $.wrapGsUtil(["/usr/local/sbin/t2t-entrypoint"] + $.getExporterCmd(params), params), job:: tfJob.parts.tfJob( params.name, env.namespace, - [ - tfJob.parts.tfJobReplica("MASTER", params.numMaster, t2tCmd.master, workerImage, params.imagePullSecrets, params.numWorkerGpu), - tfJob.parts.tfJobReplica("WORKER", params.numWorker, t2tCmd.worker, workerImage, params.imagePullSecrets, params.numWorkerGpu), - tfJob.parts.tfJobReplica("PS", params.numPs, t2tCmd.ps, psImage, params.imagePullSecrets, params.numPsGpu), - ], + if params.jobType == "datagen" then + [ + $.tfJobReplica("MASTER", params.numMaster, datagenCmd, workerImage, params.numWorkerGpu, + workerImagePullSecrets, workerEnv, workerVolumes, workerVolumeMounts), + ] + else if params.jobType == "exporter" then + [ + $.tfJobReplica("MASTER", params.numMaster, exporterCmd, workerImage, params.numWorkerGpu, + workerImagePullSecrets, workerEnv, workerVolumes, workerVolumeMounts), + ] + else + [ + $.tfJobReplica("MASTER", params.numMaster, finalCmd.master, workerImage, params.numWorkerGpu, + workerImagePullSecrets, workerEnv, workerVolumes, workerVolumeMounts), + 
$.tfJobReplica("WORKER", params.numWorker, finalCmd.worker, workerImage, params.numWorkerGpu, + workerImagePullSecrets, workerEnv, workerVolumes, workerVolumeMounts), + $.tfJobReplica("PS", params.numPs, finalCmd.ps, workerImage, params.numPsGpu, + workerImagePullSecrets, workerEnv, workerVolumes, workerVolumeMounts), + ], terminationPolicy ), }, diff --git a/code_search/kubeflow/components/t2t-translate-datagen.jsonnet b/code_search/kubeflow/components/t2t-translate-datagen.jsonnet new file mode 100644 index 00000000..9bffd8e9 --- /dev/null +++ b/code_search/kubeflow/components/t2t-translate-datagen.jsonnet @@ -0,0 +1,7 @@ +local k = import "k.libsonnet"; +local t2tJob = import "t2t-job.libsonnet"; + +local env = std.extVar("__ksonnet/environments"); +local params = std.extVar("__ksonnet/params").components["t2t-translate-datagen"]; + +std.prune(k.core.v1.list.new([t2tJob.parts(params, env).job])) diff --git a/code_search/kubeflow/components/t2t-translate-exporter.jsonnet b/code_search/kubeflow/components/t2t-translate-exporter.jsonnet new file mode 100644 index 00000000..a47329b8 --- /dev/null +++ b/code_search/kubeflow/components/t2t-translate-exporter.jsonnet @@ -0,0 +1,7 @@ +local k = import "k.libsonnet"; +local t2tJob = import "t2t-job.libsonnet"; + +local env = std.extVar("__ksonnet/environments"); +local params = std.extVar("__ksonnet/params").components["t2t-translate-exporter"]; + +std.prune(k.core.v1.list.new([t2tJob.parts(params, env).job])) diff --git a/code_search/kubeflow/components/t2t-gh-summarizer.jsonnet b/code_search/kubeflow/components/t2t-translate.jsonnet similarity index 70% rename from code_search/kubeflow/components/t2t-gh-summarizer.jsonnet rename to code_search/kubeflow/components/t2t-translate.jsonnet index bcbc8132..fc07eb9f 100644 --- a/code_search/kubeflow/components/t2t-gh-summarizer.jsonnet +++ b/code_search/kubeflow/components/t2t-translate.jsonnet @@ -2,6 +2,6 @@ local k = import "k.libsonnet"; local t2tJob = import 
"t2t-job.libsonnet"; local env = std.extVar("__ksonnet/environments"); -local params = std.extVar("__ksonnet/params").components["t2t-gh-summarizer"]; +local params = std.extVar("__ksonnet/params").components["t2t-translate"]; std.prune(k.core.v1.list.new([t2tJob.parts(params, env).job])) diff --git a/code_search/language_task/Dockerfile b/code_search/language_task/Dockerfile index a97a0722..1dc1900d 100644 --- a/code_search/language_task/Dockerfile +++ b/code_search/language_task/Dockerfile @@ -17,4 +17,12 @@ ADD t2t-entrypoint.sh /usr/local/sbin/t2t-entrypoint ENV T2T_USR_DIR=/t2t_problems -ENTRYPOINT ["/usr/local/sbin/t2t-entrypoint"] +WORKDIR /t2t_problems + +#ENTRYPOINT ["/usr/local/sbin/t2t-entrypoint"] + +# TODO(sanyamkapoor): A workaround for tensorflow/tensor2tensor#879 +RUN apt-get update && apt-get install -y curl python &&\ + curl https://sdk.cloud.google.com | bash &&\ + rm -rf /var/lib/apt/lists/* +ENTRYPOINT ["bash"] diff --git a/code_search/language_task/build_image.sh b/code_search/language_task/build_image.sh index d7a29ced..714b9caf 100755 --- a/code_search/language_task/build_image.sh +++ b/code_search/language_task/build_image.sh @@ -1,10 +1,18 @@ #!/usr/bin/env bash -set -e +set -ex PROJECT=${PROJECT:-} -BASE_IMAGE_TAG=${BASE_IMAGE_TAG:-1.8.0-py3} # 1.8.0-gpu-py3 for GPU-based image -BUILD_IMAGE_TAG=${BUILD_IMAGE_TAG:-code-search:devel} + +if [[ -z "${PROJECT}" ]]; then + echo "PROJECT environment variable missing!" + exit 1 +fi + +GPU=${GPU:-0} + +BASE_IMAGE_TAG=$([[ "${GPU}" = "1" ]] && echo "1.8.0-gpu-py3" || echo "1.8.0-py3") +BUILD_IMAGE_TAG="code-search:v$(date +%Y%m%d)$([[ ${GPU} = "1" ]] && echo '-gpu' || echo '')-$(python3 -c 'import uuid; print(uuid.uuid4().hex[:7]);')" # Directory of this script used as docker context _SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" @@ -13,10 +21,8 @@ pushd "$_SCRIPT_DIR" docker build -t ${BUILD_IMAGE_TAG} --build-arg BASE_IMAGE_TAG=${BASE_IMAGE_TAG} . 
-# Push image to GCR if PROJECT available -if [[ ! -z "${PROJECT}" ]]; then - docker tag ${BUILD_IMAGE_TAG} gcr.io/${PROJECT}/${BUILD_IMAGE_TAG} - docker push gcr.io/${PROJECT}/${BUILD_IMAGE_TAG} -fi +# Push image to GCR; PROJECT is validated as non-empty above +docker tag ${BUILD_IMAGE_TAG} gcr.io/${PROJECT}/${BUILD_IMAGE_TAG} +docker push gcr.io/${PROJECT}/${BUILD_IMAGE_TAG} popd