Extension of T2T Ksonnet component (#149)

* Add jobs derived from t2t component, GCP credentials assumed

* Add script to create IAM role bindings for Docker container to use

* Fix names to use hyphens

* Add t2t-exporter wrapper

* Fix typos

* A temporary workaround for tensorflow/tensor2tensor#879

* Complete working pipeline of datagen, trainer and exporter

* Add docstring to create_secrets.sh
Sanyam Kapoor, 2018-06-25 15:09:22 -07:00, committed by k8s-ci-robot
parent 21506ffc51, commit 656e1e3e7c
11 changed files with 291 additions and 71 deletions
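Taken together, these changes wire up a complete datagen → trainer → exporter pipeline. A minimal sketch of driving it with the ksonnet CLI, using the component and environment names introduced in this commit:

```
# Run each stage in sequence; wait for each TFJob to complete
# before applying the next
$ ks apply code-search -c t2t-translate-datagen
$ ks apply code-search -c t2t-translate
$ ks apply code-search -c t2t-translate-exporter
```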


@ -106,3 +106,4 @@ venv.bak/
# Virtual Environments
venv/
*.key.json


@ -84,16 +84,13 @@ This script builds and pushes the docker image to Google Container Registry.
$ gcloud auth configure-docker
```
* Setup environment variables
```
$ export PROJECT=<your_project> # (optional) set the project ID; if unset, the image is not published to GCR
$ export BUILD_IMAGE_TAG=code-search:devel # (optional) to change built image tag
$ export BASE_IMAGE_TAG=1.8.0-gpu-py3 # (optional) for GPU base image
```
* Build and push the image
```
$ ./language_task/build_image.sh
$ PROJECT=my-project ./language_task/build_image.sh
```
and, for a GPU image:
```
$ GPU=1 PROJECT=my-project ./language_task/build_image.sh
```
See [GCR Pushing and Pulling Images](https://cloud.google.com/container-registry/docs/pushing-and-pulling) for more.
@ -124,29 +121,14 @@ $ docker run --rm -it -v ${MOUNT_DATA_DIR}:/data -v ${MOUNT_OUTPUT_DIR}:/output
--model=transformer --hparams_set=transformer_base
```
#### 2.2.2 Docstrings Language Model
### 2.2 Train on Kubeflow
This part trains a language model based on the docstrings in the dataset using `tensor2tensor`.
* Generate `TFRecords` for training
```
$ export MOUNT_DATA_DIR=/path/to/data/folder
$ docker run --rm -it -v ${MOUNT_DATA_DIR}:/data ${BUILD_IMAGE_TAG} \
t2t-datagen --problem=github_docstring_language_model --data_dir=/data
* Set up secrets granting access to Google Cloud Storage and Google Container Registry
```shell
$ PROJECT=my-project ./create_secrets.sh
```
* Train the language model using `Transformer Networks` and a custom hyperparameter set
```
$ export MOUNT_DATA_DIR=/path/to/data/folder
$ export MOUNT_OUTPUT_DIR=/path/to/output/folder
$ docker run --rm -it -v ${MOUNT_DATA_DIR}:/data -v ${MOUNT_OUTPUT_DIR}:/output ${BUILD_IMAGE_TAG} \
t2t-trainer --problem=github_docstring_language_model --data_dir=/data --output_dir=/output \
--model=transformer --hparams_set=transformer_gh_lm
```
### 2.3 Train on Kubeflow
TODO
**NOTE**: Use `create_secrets.sh -d` to remove any side-effects of the above step.
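For reference, the create/undo pair looks like this, assuming a GCP project named `my-project`:

```
# Create the service account, key file, and Kubernetes secrets
$ PROJECT=my-project ./create_secrets.sh

# Undo every side-effect of the step above
$ PROJECT=my-project ./create_secrets.sh -d
```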
# Acknowledgements

code_search/create_secrets.sh (new executable file, 57 lines)

@ -0,0 +1,57 @@
#!/usr/bin/env bash
##
# This script creates the service account and permissions needed for
# the training jobs to pull private images from Google Container
# Registry and access Google Cloud Storage. To undo all the changes
# it makes, pass the "-d" flag when executing the script.
#
set -ex
export PROJECT=${PROJECT:-}
if [[ -z "${PROJECT}" ]]; then
echo "PROJECT environment variable missing!"
exit 1
fi
export SA_NAME=code-search-access
export SA_EMAIL=${SA_NAME}@${PROJECT}.iam.gserviceaccount.com
export SA_KEY_FILE=${SA_EMAIL}.key.json
if [[ "${1}" = "-d" ]]; then
gcloud projects remove-iam-policy-binding ${PROJECT} \
--member=serviceAccount:${SA_EMAIL} \
--role=roles/storage.admin
gcloud iam service-accounts delete ${SA_EMAIL} --quiet
rm -f ${SA_KEY_FILE}
kubectl delete secret gcp-credentials gcp-registry-credentials
exit 0
fi
gcloud iam service-accounts create ${SA_NAME} --display-name ${SA_EMAIL}
gcloud projects add-iam-policy-binding ${PROJECT} \
--member=serviceAccount:${SA_EMAIL} \
--role=roles/storage.admin
gcloud iam service-accounts keys create ${SA_KEY_FILE} \
--iam-account=${SA_EMAIL}
kubectl create secret docker-registry gcp-registry-credentials \
--docker-server=https://gcr.io \
--docker-username=_json_key \
--docker-password="$(cat ${SA_KEY_FILE})" \
--docker-email=${SA_EMAIL}
kubectl create secret generic gcp-credentials \
--from-file=key.json="${SA_KEY_FILE}"
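A quick sanity check that the script produced what the jobs expect, assuming `kubectl` targets the namespace the script ran against:

```
# Both secrets should exist after a successful run
$ kubectl get secrets gcp-credentials gcp-registry-credentials

# The registry secret should have type kubernetes.io/dockerconfigjson
$ kubectl describe secret gcp-registry-credentials
```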


@ -1,11 +1,11 @@
apiVersion: 0.1.0
environments:
default:
code-search:
destination:
namespace: kubeflow
server: https://130.211.225.204
server: https://35.193.190.6
k8sVersion: v1.9.6
path: default
path: code-search
kind: ksonnet.io/app
libraries:
tf-job:


@ -7,27 +7,68 @@
// Component-level parameters, defined initially from 'ks prototype use ...'
// Each object below should correspond to a component in the components/ directory
"t2t-job": {
numWorker: 1,
jobType: "trainer",
numMaster: 1,
numPs: 1,
numWorker: 0,
numPs: 0,
numWorkerGpu: 0,
numPsGpu: 0,
train_steps: 100,
eval_steps: 10,
image: "gcr.io/kubeflow-dev/code-search:devel",
imageGpu: "gcr.io/kubeflow-dev/code-search:gpu-devel",
image: "gcr.io/kubeflow-dev/code-search:v20180621-266e689",
imageGpu: "gcr.io/kubeflow-dev/code-search:v20180621-gpu-db4f1ee",
imagePullSecrets: [],
dataDir: "null",
outputDir: "null",
model: "null",
hparams_set: "null",
// TODO(sanyamkapoor): A workaround for tensorflow/tensor2tensor#879
gsDataDir: "null",
gsOutputDir: "null",
},
"t2t-gh-summarizer": {
"name": "github_function_summarizer",
"problem": "github_function_summarizer",
"dataDir": "gs://kubeflow-dev/code-search/raw_data",
"outputDir": "gs://kubeflow-dev/code-search/train",
"model": "transformer",
"hparams_set": "transformer_base"
"t2t-translate-datagen": {
jobType: "datagen",
name: "translate-ende-wmt32k-datagen",
problem: "translate_ende_wmt32k",
// TODO(sanyamkapoor): A workaround for tensorflow/tensor2tensor#879
dataDir: "/data",
outputDir: "/data",
gsOutputDir: "gs://kubeflow-examples/t2t-translate/translate_ende_wmt32k/datagen",
},
"t2t-translate-exporter": {
jobType: "exporter",
name: "translate-ende-wmt32k-exporter",
problem: "translate_ende_wmt32k",
dataDir: "gs://kubeflow-examples/t2t-translate/translate_ende_wmt32k/datagen",
outputDir: "gs://kubeflow-examples/t2t-translate/translate_ende_wmt32k/output",
model: "transformer",
hparams_set: "transformer_base_single_gpu",
},
"t2t-translate": {
jobType: "trainer",
numMaster: 1,
numWorker: 2,
numPs: 1,
numWorkerGpu: 1,
numPsGpu: 1,
name: "translate-ende-wmt32k",
problem: "translate_ende_wmt32k",
dataDir: "gs://kubeflow-examples/t2t-translate/translate_ende_wmt32k/datagen",
outputDir: "gs://kubeflow-examples/t2t-translate/translate_ende_wmt32k/output",
model: "transformer",
hparams_set: "transformer_base_single_gpu",
},
},
}
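Any of these defaults can be overridden per component without editing this file; a sketch using the ksonnet CLI, with illustrative values:

```
# Bump training steps and worker GPUs for the trainer component
$ ks param set t2t-translate train_steps 1000
$ ks param set t2t-translate numWorkerGpu 2
$ ks apply code-search -c t2t-translate
```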


@ -2,17 +2,52 @@ local tfJob = import "kubeflow/tf-job/tf-job.libsonnet";
local baseParams = std.extVar("__ksonnet/params").components["t2t-job"];
{
parts(newParams, env):: {
local params = baseParams + newParams,
getGcloudAuthCmd()::
[
"/root/google-cloud-sdk/bin/gcloud",
"auth",
"activate-service-account",
"--key-file",
"$GOOGLE_APPLICATION_CREDENTIALS",
],
local t2tCmd = {
datagen: [
"t2t-datagen",
"--problem=" + params.problem,
"--data_dir=" + params.dataDir,
],
getGsUtilCmd(src_dir, dst_dir)::
[
"/root/google-cloud-sdk/bin/gsutil",
"cp",
"-r",
src_dir,
dst_dir,
],
trainer: [
wrapGsUtil(cmd, params):: {
local resultCmd =
(if params.gsDataDir == "null" && params.gsOutputDir == "null" then [] else $.getGcloudAuthCmd() + ["&&"]) +
(if params.gsDataDir == "null" then [] else $.getGsUtilCmd(params.gsDataDir, params.dataDir) + ["&&"]) +
cmd +
(if params.gsOutputDir == "null" then [] else ["&&"] + $.getGsUtilCmd(params.outputDir, params.gsOutputDir)),
result: ["-c", std.join(" ", resultCmd)]
}.result,
getDatagenCmd(params)::
[
"t2t-datagen",
"--problem=" + params.problem,
"--data_dir=" + params.dataDir,
],
getExporterCmd(params)::
[
"t2t-exporter",
"--problem=" + params.problem,
"--data_dir=" + params.dataDir,
"--output_dir=" + params.outputDir,
"--model=" + params.model,
"--hparams_set=" + params.hparams_set,
],
getTrainerCmd(params):: {
local trainer = [
"t2t-trainer",
"--problem=" + params.problem,
"--data_dir=" + params.dataDir,
@ -22,7 +57,7 @@ local baseParams = std.extVar("__ksonnet/params").components["t2t-job"];
"--train_steps=" + std.toString(params.train_steps),
],
workerBase: self.trainer + [
local workerBase = trainer + [
"--schedule=train",
"--ps_gpu=" + std.toString(params.numPsGpu),
"--worker_gpu=" + std.toString(params.numWorkerGpu),
@ -31,36 +66,112 @@ local baseParams = std.extVar("__ksonnet/params").components["t2t-job"];
"--eval_steps=" + std.toString(params.eval_steps),
],
ps: self.trainer + [
ps: trainer + [
"--schedule=run_std_server",
"--ps_job=/job:ps",
],
worker: self.workerBase + [
worker: workerBase + [
"--worker_job=/job:worker",
],
master: self.workerBase + [
master: workerBase + [
"--worker_job=/job:master",
],
},
tfJobReplica(replicaType, number, args, image, numGpus=0, imagePullSecrets=[], env=[], volumes=[], volumeMounts=[])::
local containerSpec = {
image: image,
name: "tensorflow",
[if std.length(args) > 0 then "args"]: args,
[if numGpus > 0 then "resources"]: {
limits: {
"nvidia.com/gpu": numGpus,
},
},
[if std.length(env) > 0 then "env"]: env,
[if std.length(volumeMounts) > 0 then "volumeMounts"]: volumeMounts,
};
{
replicas: number,
template: {
spec: {
containers: [ containerSpec ],
[if std.length(imagePullSecrets) > 0 then "imagePullSecrets"]: imagePullSecrets,
[if std.length(volumes) > 0 then "volumes"]: volumes,
restartPolicy: "OnFailure",
},
},
tfReplicaType: replicaType,
},
parts(newParams, env):: {
local params = baseParams + newParams,
local terminationPolicy = if params.numMaster == 1
then tfJob.parts.tfJobTerminationPolicy("MASTER", 0)
else tfJob.parts.tfJobTerminationPolicy("WORKER", 0),
local workerImage = if params.numWorkerGpu > 0 then params.imageGpu else params.image,
local psImage = if params.numPsGpu > 0 then params.imageGpu else params.image,
local workerImagePullSecrets = [
{ name: "gcp-registry-credentials" },
],
local workerEnv = [
{
name: "GOOGLE_APPLICATION_CREDENTIALS",
value: "/secret/gcp-credentials/key.json"
},
],
local workerVolumes = [
{
name: "gcp-credentials",
secret: {
secretName: "gcp-credentials",
},
},
],
local workerVolumeMounts = [
{
mountPath: "/secret/gcp-credentials",
name: "gcp-credentials",
},
],
// TODO(sanyamkapoor): A workaround for tensorflow/tensor2tensor#879
// once fixed, simply get rid of the $.wrapGsUtil method
local cmd = $.getTrainerCmd(params),
local finalCmd = {
master: $.wrapGsUtil(["/usr/local/sbin/t2t-entrypoint"] + cmd.master, params),
worker: $.wrapGsUtil(["/usr/local/sbin/t2t-entrypoint"] + cmd.worker, params),
ps: $.wrapGsUtil(["/usr/local/sbin/t2t-entrypoint"] + cmd.ps, params),
},
local datagenCmd = $.wrapGsUtil(["/usr/local/sbin/t2t-entrypoint"] + $.getDatagenCmd(params), params),
local exporterCmd = $.wrapGsUtil(["/usr/local/sbin/t2t-entrypoint"] + $.getExporterCmd(params), params),
job::
tfJob.parts.tfJob(
params.name,
env.namespace,
[
tfJob.parts.tfJobReplica("MASTER", params.numMaster, t2tCmd.master, workerImage, params.imagePullSecrets, params.numWorkerGpu),
tfJob.parts.tfJobReplica("WORKER", params.numWorker, t2tCmd.worker, workerImage, params.imagePullSecrets, params.numWorkerGpu),
tfJob.parts.tfJobReplica("PS", params.numPs, t2tCmd.ps, psImage, params.imagePullSecrets, params.numPsGpu),
],
if params.jobType == "datagen" then
[
$.tfJobReplica("MASTER", params.numMaster, datagenCmd, workerImage, params.numWorkerGpu,
workerImagePullSecrets, workerEnv, workerVolumes, workerVolumeMounts),
]
else if params.jobType == "exporter" then
[
$.tfJobReplica("MASTER", params.numMaster, exporterCmd, workerImage, params.numWorkerGpu,
workerImagePullSecrets, workerEnv, workerVolumes, workerVolumeMounts),
]
else
[
$.tfJobReplica("MASTER", params.numMaster, finalCmd.master, workerImage, params.numWorkerGpu,
workerImagePullSecrets, workerEnv, workerVolumes, workerVolumeMounts),
$.tfJobReplica("WORKER", params.numWorker, finalCmd.worker, workerImage, params.numWorkerGpu,
workerImagePullSecrets, workerEnv, workerVolumes, workerVolumeMounts),
$.tfJobReplica("PS", params.numPs, finalCmd.ps, workerImage, params.numPsGpu,
workerImagePullSecrets, workerEnv, workerVolumes, workerVolumeMounts),
],
terminationPolicy
),
},
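To make the `wrapGsUtil` workaround concrete: for the `t2t-translate-datagen` parameters above (`gsOutputDir` set, `gsDataDir` left at "null"), the assembled command is passed as `["-c", ...]` to the image's `bash` entrypoint and expands to roughly:

```
# Single shell string assembled by wrapGsUtil for the datagen job
/root/google-cloud-sdk/bin/gcloud auth activate-service-account \
    --key-file $GOOGLE_APPLICATION_CREDENTIALS && \
/usr/local/sbin/t2t-entrypoint t2t-datagen \
    --problem=translate_ende_wmt32k --data_dir=/data && \
/root/google-cloud-sdk/bin/gsutil cp -r /data \
    gs://kubeflow-examples/t2t-translate/translate_ende_wmt32k/datagen
```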


@ -0,0 +1,7 @@
local k = import "k.libsonnet";
local t2tJob = import "t2t-job.libsonnet";
local env = std.extVar("__ksonnet/environments");
local params = std.extVar("__ksonnet/params").components["t2t-translate-datagen"];
std.prune(k.core.v1.list.new([t2tJob.parts(params, env).job]))


@ -0,0 +1,7 @@
local k = import "k.libsonnet";
local t2tJob = import "t2t-job.libsonnet";
local env = std.extVar("__ksonnet/environments");
local params = std.extVar("__ksonnet/params").components["t2t-translate-exporter"];
std.prune(k.core.v1.list.new([t2tJob.parts(params, env).job]))
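These thin component files just instantiate `t2tJob.parts` with their parameter block; to inspect the manifest a component renders to without applying it, the ksonnet CLI can print it locally:

```
# Render the exporter's TFJob manifest without touching the cluster
$ ks show code-search -c t2t-translate-exporter
```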


@ -2,6 +2,6 @@ local k = import "k.libsonnet";
local t2tJob = import "t2t-job.libsonnet";
local env = std.extVar("__ksonnet/environments");
local params = std.extVar("__ksonnet/params").components["t2t-gh-summarizer"];
local params = std.extVar("__ksonnet/params").components["t2t-translate"];
std.prune(k.core.v1.list.new([t2tJob.parts(params, env).job]))


@ -17,4 +17,12 @@ ADD t2t-entrypoint.sh /usr/local/sbin/t2t-entrypoint
ENV T2T_USR_DIR=/t2t_problems
ENTRYPOINT ["/usr/local/sbin/t2t-entrypoint"]
WORKDIR /t2t_problems
#ENTRYPOINT ["/usr/local/sbin/t2t-entrypoint"]
# TODO(sanyamkapoor): A workaround for tensorflow/tensor2tensor#879
RUN apt-get update && apt-get install -y curl python &&\
curl https://sdk.cloud.google.com | bash &&\
rm -rf /var/lib/apt/lists/*
ENTRYPOINT ["bash"]


@ -1,10 +1,18 @@
#!/usr/bin/env bash
set -e
set -ex
PROJECT=${PROJECT:-}
BASE_IMAGE_TAG=${BASE_IMAGE_TAG:-1.8.0-py3} # 1.8.0-gpu-py3 for GPU-based image
BUILD_IMAGE_TAG=${BUILD_IMAGE_TAG:-code-search:devel}
if [[ -z "${PROJECT}" ]]; then
echo "PROJECT environment variable missing!"
exit 1
fi
GPU=${GPU:-0}
BASE_IMAGE_TAG=$([[ "${GPU}" = "1" ]] && echo "1.8.0-gpu-py3" || echo "1.8.0-py3")
BUILD_IMAGE_TAG="code-search:v$(date +%Y%m%d)$([[ ${GPU} = "1" ]] && echo '-gpu' || echo '')-$(python3 -c 'import uuid; print(uuid.uuid4().hex[:7]);')"
# Directory of this script used as docker context
_SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
@ -13,10 +21,8 @@ pushd "$_SCRIPT_DIR"
docker build -t ${BUILD_IMAGE_TAG} --build-arg BASE_IMAGE_TAG=${BASE_IMAGE_TAG} .
# Push image to GCR if PROJECT available
if [[ ! -z "${PROJECT}" ]]; then
docker tag ${BUILD_IMAGE_TAG} gcr.io/${PROJECT}/${BUILD_IMAGE_TAG}
docker push gcr.io/${PROJECT}/${BUILD_IMAGE_TAG}
fi
# Push image to GCR
docker tag ${BUILD_IMAGE_TAG} gcr.io/${PROJECT}/${BUILD_IMAGE_TAG}
docker push gcr.io/${PROJECT}/${BUILD_IMAGE_TAG}
popd
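The image tag is now derived from the build date and a random hex suffix instead of a fixed `devel` label; for example, a run might produce tags like these (values illustrative):

```
code-search:v20180625-1a2b3c4       # default (CPU) build
code-search:v20180625-gpu-1a2b3c4   # GPU=1 build
```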