mirror of https://github.com/kubeflow/examples.git
Extension of T2T Ksonnet component (#149)
* Add jobs derived from t2t component, GCP credentials assumed * Add script to create IAM role bindings for Docker container to use * Fix names to hyphens * Add t2t-exporter wrapper * Fix typos * A temporary workaround for tensorflow/tensor2tensor#879 * Complete working pipeline of datagen, trainer and exporter * Add docstring to create_secrets.sh
This commit is contained in:
parent
21506ffc51
commit
656e1e3e7c
|
|
@ -106,3 +106,4 @@ venv.bak/
|
|||
# Virtual Environments
|
||||
venv/
|
||||
|
||||
*.key.json
|
||||
|
|
|
|||
|
|
@ -84,16 +84,13 @@ This script builds and pushes the docker image to Google Container Registry.
|
|||
$ gcloud auth configure-docker
|
||||
```
|
||||
|
||||
* Setup environment variables
|
||||
```
|
||||
$ export PROJECT=<your_project> # (optional) setup project ID. if not set, image is not published to GCR
|
||||
$ export BUILD_IMAGE_TAG=code-search:devel # (optional) to change built image tag
|
||||
$ export BASE_IMAGE_TAG=1.8.0-gpu-py3 # (optional) for GPU base image
|
||||
```
|
||||
|
||||
* Build and push the image
|
||||
```
|
||||
$ ./language_task/build_image.sh
|
||||
$ PROJECT=my-project ./language_task/build_image.sh
|
||||
```
|
||||
and a GPU image
|
||||
```
|
||||
$ GPU=1 PROJECT=my-project ./language_task/build_image.sh
|
||||
```
|
||||
|
||||
See [GCR Pushing and Pulling Images](https://cloud.google.com/container-registry/docs/pushing-and-pulling) for more.
|
||||
|
|
@ -124,29 +121,14 @@ $ docker run --rm -it -v ${MOUNT_DATA_DIR}:/data -v ${MOUNT_OUTPUT_DIR}:/output
|
|||
--model=transformer --hparams_set=transformer_base
|
||||
```
|
||||
|
||||
#### 2.2.2 Docstrings Language Model
|
||||
### 2.2 Train on Kubeflow
|
||||
|
||||
This part trains a language model based on the docstrings in the dataset and uses `tensor2tensor`
|
||||
|
||||
* Generate `TFRecords` for training
|
||||
```
|
||||
$ export MOUNT_DATA_DIR=/path/to/data/folder
|
||||
$ docker run --rm -it -v ${MOUNT_DATA_DIR}:/data ${BUILD_IMAGE_TAG} \
|
||||
t2t-datagen --problem=github_docstring_language_model --data_dir=/data
|
||||
* Set up secrets that grant access to Google Cloud Storage and Google Container Registry
|
||||
```shell
|
||||
$ PROJECT=my-project ./create_secrets.sh
|
||||
```
|
||||
|
||||
* Train language model using `Transformer Networks` and a custom hyper-parameters set
|
||||
```
|
||||
$ export MOUNT_DATA_DIR=/path/to/data/folder
|
||||
$ export MOUNT_OUTPUT_DIR=/path/to/output/folder
|
||||
$ docker run --rm -it -v ${MOUNT_DATA_DIR}:/data -v ${MOUNT_OUTPUT_DIR}:/output ${BUILD_IMAGE_TAG} \
|
||||
t2t-trainer --problem=github_docstring_language_model --data_dir=/data --output_dir=/output \
|
||||
--model=transformer --hparams_set=transformer_gh_lm
|
||||
```
|
||||
|
||||
### 2.3 Train on Kubeflow
|
||||
|
||||
TODO
|
||||
**NOTE**: Use `create_secrets.sh -d` to remove any side-effects of the above step.
|
||||
|
||||
# Acknowledgements
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,57 @@
|
|||
#!/usr/bin/env bash

##
# This script creates all the necessary service accounts and permissions
# needed for the training jobs to pull private images from
# Google Container Registry and access Google Cloud Storage. To
# undo all the changes made, add a "-d" flag while executing the
# script.
#
# Required environment:
#   PROJECT - ID of the GCP project that owns the service account.
#
# Usage:
#   PROJECT=my-project ./create_secrets.sh      # create account, key, secrets
#   PROJECT=my-project ./create_secrets.sh -d   # remove everything again
#
# Side effects (create mode): writes the service account key to
# ${SA_EMAIL}.key.json in the current directory and creates the
# "gcp-registry-credentials" and "gcp-credentials" Kubernetes secrets.
#

set -ex

export PROJECT=${PROJECT:-}

if [[ -z "${PROJECT}" ]]; then
  echo "PROJECT environment variable missing!" >&2
  exit 1
fi

export SA_NAME=code-search-access
export SA_EMAIL="${SA_NAME}@${PROJECT}.iam.gserviceaccount.com"
export SA_KEY_FILE="${SA_EMAIL}.key.json"

# "-d" undoes everything the create path below sets up, then exits.
if [[ "${1:-}" = "-d" ]]; then
  gcloud projects remove-iam-policy-binding "${PROJECT}" \
    --member="serviceAccount:${SA_EMAIL}" \
    --role=roles/storage.admin

  gcloud iam service-accounts delete "${SA_EMAIL}" --quiet

  rm -f -- "${SA_KEY_FILE}"

  kubectl delete secret gcp-credentials gcp-registry-credentials

  exit 0
fi

gcloud iam service-accounts create "${SA_NAME}" --display-name "${SA_EMAIL}"

gcloud projects add-iam-policy-binding "${PROJECT}" \
  --member="serviceAccount:${SA_EMAIL}" \
  --role=roles/storage.admin

gcloud iam service-accounts keys create "${SA_KEY_FILE}" \
  --iam-account="${SA_EMAIL}"

# The private key is passed as a command-line argument below. Turn command
# tracing off first, otherwise "set -x" would print the key to stderr.
{ set +x; } 2>/dev/null

# Read the key in a plain assignment so that a read failure aborts the script
# (set -e) instead of silently creating a secret with an empty password.
sa_key_json=$(<"${SA_KEY_FILE}")

kubectl create secret docker-registry gcp-registry-credentials \
  --docker-server=https://gcr.io \
  --docker-username=_json_key \
  --docker-password="${sa_key_json}" \
  --docker-email="${SA_EMAIL}"

unset sa_key_json
set -x

kubectl create secret generic gcp-credentials \
  --from-file=key.json="${SA_KEY_FILE}"
|
||||
|
||||
|
|
@ -1,11 +1,11 @@
|
|||
apiVersion: 0.1.0
|
||||
environments:
|
||||
default:
|
||||
code-search:
|
||||
destination:
|
||||
namespace: kubeflow
|
||||
server: https://130.211.225.204
|
||||
server: https://35.193.190.6
|
||||
k8sVersion: v1.9.6
|
||||
path: default
|
||||
path: code-search
|
||||
kind: ksonnet.io/app
|
||||
libraries:
|
||||
tf-job:
|
||||
|
|
|
|||
|
|
@ -7,27 +7,68 @@
|
|||
// Component-level parameters, defined initially from 'ks prototype use ...'
|
||||
// Each object below should correspond to a component in the components/ directory
|
||||
"t2t-job": {
|
||||
numWorker: 1,
|
||||
jobType: "trainer",
|
||||
|
||||
numMaster: 1,
|
||||
numPs: 1,
|
||||
numWorker: 0,
|
||||
numPs: 0,
|
||||
numWorkerGpu: 0,
|
||||
numPsGpu: 0,
|
||||
|
||||
train_steps: 100,
|
||||
eval_steps: 10,
|
||||
|
||||
image: "gcr.io/kubeflow-dev/code-search:devel",
|
||||
imageGpu: "gcr.io/kubeflow-dev/code-search:gpu-devel",
|
||||
image: "gcr.io/kubeflow-dev/code-search:v20180621-266e689",
|
||||
imageGpu: "gcr.io/kubeflow-dev/code-search:v20180621-gpu-db4f1ee",
|
||||
imagePullSecrets: [],
|
||||
|
||||
dataDir: "null",
|
||||
outputDir: "null",
|
||||
model: "null",
|
||||
hparams_set: "null",
|
||||
|
||||
// TODO(sanyamkapoor): A workaround for tensorflow/tensor2tensor#879
|
||||
gsDataDir: "null",
|
||||
gsOutputDir: "null",
|
||||
},
|
||||
|
||||
"t2t-gh-summarizer": {
|
||||
"name": "github_function_summarizer",
|
||||
"problem": "github_function_summarizer",
|
||||
"dataDir": "gs://kubeflow-dev/code-search/raw_data",
|
||||
"outputDir": "gs://kubeflow-dev/code-search/train",
|
||||
"model": "transformer",
|
||||
"hparams_set": "transformer_base"
|
||||
"t2t-translate-datagen": {
|
||||
jobType: "datagen",
|
||||
|
||||
name: "translate-ende-wmt32k-datagen",
|
||||
problem: "translate_ende_wmt32k",
|
||||
|
||||
// TODO(sanyamkapoor): A workaround for tensorflow/tensor2tensor#879
|
||||
dataDir: "/data",
|
||||
outputDir: "/data",
|
||||
gsOutputDir: "gs://kubeflow-examples/t2t-translate/translate_ende_wmt32k/datagen",
|
||||
},
|
||||
|
||||
"t2t-translate-exporter": {
|
||||
jobType: "exporter",
|
||||
|
||||
name: "translate-ende-wmt32k-exporter",
|
||||
problem: "translate_ende_wmt32k",
|
||||
dataDir: "gs://kubeflow-examples/t2t-translate/translate_ende_wmt32k/datagen",
|
||||
outputDir: "gs://kubeflow-examples/t2t-translate/translate_ende_wmt32k/output",
|
||||
model: "transformer",
|
||||
hparams_set: "transformer_base_single_gpu",
|
||||
},
|
||||
|
||||
"t2t-translate": {
|
||||
jobType: "trainer",
|
||||
numMaster: 1,
|
||||
numWorker: 2,
|
||||
numPs: 1,
|
||||
numWorkerGpu: 1,
|
||||
numPsGpu: 1,
|
||||
|
||||
name: "translate-ende-wmt32k",
|
||||
problem: "translate_ende_wmt32k",
|
||||
dataDir: "gs://kubeflow-examples/t2t-translate/translate_ende_wmt32k/datagen",
|
||||
outputDir: "gs://kubeflow-examples/t2t-translate/translate_ende_wmt32k/output",
|
||||
model: "transformer",
|
||||
hparams_set: "transformer_base_single_gpu",
|
||||
},
|
||||
},
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,17 +2,52 @@ local tfJob = import "kubeflow/tf-job/tf-job.libsonnet";
|
|||
local baseParams = std.extVar("__ksonnet/params").components["t2t-job"];
|
||||
|
||||
{
|
||||
parts(newParams, env):: {
|
||||
local params = baseParams + newParams,
|
||||
getGcloudAuthCmd()::
|
||||
[
|
||||
"/root/google-cloud-sdk/bin/gcloud",
|
||||
"auth",
|
||||
"activate-service-account",
|
||||
"--key-file",
|
||||
"$GOOGLE_APPLICATION_CREDENTIALS",
|
||||
],
|
||||
|
||||
local t2tCmd = {
|
||||
datagen: [
|
||||
"t2t-datagen",
|
||||
"--problem=" + params.problem,
|
||||
"--data_dir=" + params.dataDir,
|
||||
],
|
||||
getGsUtilCmd(src_dir, dst_dir)::
|
||||
[
|
||||
"/root/google-cloud-sdk/bin/gsutil",
|
||||
"cp",
|
||||
"-r",
|
||||
src_dir,
|
||||
dst_dir,
|
||||
],
|
||||
|
||||
trainer: [
|
||||
wrapGsUtil(cmd, params):: {
|
||||
local resultCmd =
|
||||
(if params.gsDataDir == "null" && params.gsOutputDir == "null" then [] else $.getGcloudAuthCmd() + ["&&"]) +
|
||||
(if params.gsDataDir == "null" then [] else $.getGsUtilCmd(params.gsDataDir, params.dataDir) + ["&&"]) +
|
||||
cmd +
|
||||
(if params.gsOutputDir == "null" then [] else ["&&"] + $.getGsUtilCmd(params.outputDir, params.gsOutputDir)),
|
||||
result: ["-c", std.join(" ", resultCmd)]
|
||||
}.result,
|
||||
|
||||
getDatagenCmd(params)::
|
||||
[
|
||||
"t2t-datagen",
|
||||
"--problem=" + params.problem,
|
||||
"--data_dir=" + params.dataDir,
|
||||
],
|
||||
|
||||
getExporterCmd(params)::
|
||||
[
|
||||
"t2t-exporter",
|
||||
"--problem=" + params.problem,
|
||||
"--data_dir=" + params.dataDir,
|
||||
"--output_dir=" + params.outputDir,
|
||||
"--model=" + params.model,
|
||||
"--hparams_set=" + params.hparams_set,
|
||||
],
|
||||
|
||||
getTrainerCmd(params):: {
|
||||
local trainer = [
|
||||
"t2t-trainer",
|
||||
"--problem=" + params.problem,
|
||||
"--data_dir=" + params.dataDir,
|
||||
|
|
@ -22,7 +57,7 @@ local baseParams = std.extVar("__ksonnet/params").components["t2t-job"];
|
|||
"--train_steps=" + std.toString(params.train_steps),
|
||||
],
|
||||
|
||||
workerBase: self.trainer + [
|
||||
local workerBase = trainer + [
|
||||
"--schedule=train",
|
||||
"--ps_gpu=" + std.toString(params.numPsGpu),
|
||||
"--worker_gpu=" + std.toString(params.numWorkerGpu),
|
||||
|
|
@ -31,36 +66,112 @@ local baseParams = std.extVar("__ksonnet/params").components["t2t-job"];
|
|||
"--eval_steps=" + std.toString(params.eval_steps),
|
||||
],
|
||||
|
||||
ps: self.trainer + [
|
||||
ps: trainer + [
|
||||
"--schedule=run_std_server",
|
||||
"--ps_job=/job:ps",
|
||||
],
|
||||
|
||||
worker: self.workerBase + [
|
||||
worker: workerBase + [
|
||||
"--worker_job=/job:worker",
|
||||
],
|
||||
|
||||
master: self.workerBase + [
|
||||
master: workerBase + [
|
||||
"--worker_job=/job:master",
|
||||
],
|
||||
},
|
||||
|
||||
tfJobReplica(replicaType, number, args, image, numGpus=0, imagePullSecrets=[], env=[], volumes=[], volumeMounts=[])::
|
||||
local containerSpec = {
|
||||
image: image,
|
||||
name: "tensorflow",
|
||||
[if std.length(args) > 0 then "args"]: args,
|
||||
[if numGpus > 0 then "resources"]: {
|
||||
limits: {
|
||||
"nvidia.com/gpu": numGpus,
|
||||
},
|
||||
},
|
||||
[if std.length(env) > 0 then "env"]: env,
|
||||
[if std.length(volumeMounts) > 0 then "volumeMounts"]: volumeMounts,
|
||||
};
|
||||
{
|
||||
replicas: number,
|
||||
template: {
|
||||
spec: {
|
||||
containers: [ containerSpec ],
|
||||
[if std.length(imagePullSecrets) > 0 then "imagePullSecrets"]: imagePullSecrets,
|
||||
[if std.length(volumes) > 0 then "volumes"]: volumes,
|
||||
restartPolicy: "OnFailure",
|
||||
},
|
||||
},
|
||||
tfReplicaType: replicaType,
|
||||
},
|
||||
|
||||
parts(newParams, env):: {
|
||||
local params = baseParams + newParams,
|
||||
|
||||
local terminationPolicy = if params.numMaster == 1
|
||||
then tfJob.parts.tfJobTerminationPolicy("MASTER", 0)
|
||||
else tfJob.parts.tfJobTerminationPolicy("WORKER", 0),
|
||||
|
||||
local workerImage = if params.numWorkerGpu > 0 then params.imageGpu else params.image,
|
||||
local psImage = if params.numPsGpu > 0 then params.imageGpu else params.image,
|
||||
local workerImagePullSecrets = [
|
||||
{ name: "gcp-registry-credentials" },
|
||||
],
|
||||
local workerEnv = [
|
||||
{
|
||||
name: "GOOGLE_APPLICATION_CREDENTIALS",
|
||||
value: "/secret/gcp-credentials/key.json"
|
||||
},
|
||||
],
|
||||
local workerVolumes = [
|
||||
{
|
||||
name: "gcp-credentials",
|
||||
secret: {
|
||||
secretName: "gcp-credentials",
|
||||
},
|
||||
},
|
||||
],
|
||||
local workerVolumeMounts = [
|
||||
{
|
||||
mountPath: "/secret/gcp-credentials",
|
||||
name: "gcp-credentials",
|
||||
},
|
||||
],
|
||||
|
||||
// TODO(sanyamkapoor): A workaround for tensorflow/tensor2tensor#879
|
||||
// once fixed, simply get rid of $.wrapGsUtil method
|
||||
local cmd = $.getTrainerCmd(params),
|
||||
local finalCmd = {
|
||||
master: $.wrapGsUtil(["/usr/local/sbin/t2t-entrypoint"] + cmd.master, params),
|
||||
worker: $.wrapGsUtil(["/usr/local/sbin/t2t-entrypoint"] + cmd.worker, params),
|
||||
ps: $.wrapGsUtil(["/usr/local/sbin/t2t-entrypoint"] + cmd.ps, params),
|
||||
},
|
||||
local datagenCmd = $.wrapGsUtil(["/usr/local/sbin/t2t-entrypoint"] + $.getDatagenCmd(params), params),
|
||||
local exporterCmd = $.wrapGsUtil(["/usr/local/sbin/t2t-entrypoint"] + $.getExporterCmd(params), params),
|
||||
|
||||
job::
|
||||
tfJob.parts.tfJob(
|
||||
params.name,
|
||||
env.namespace,
|
||||
[
|
||||
tfJob.parts.tfJobReplica("MASTER", params.numMaster, t2tCmd.master, workerImage, params.imagePullSecrets, params.numWorkerGpu),
|
||||
tfJob.parts.tfJobReplica("WORKER", params.numWorker, t2tCmd.worker, workerImage, params.imagePullSecrets, params.numWorkerGpu),
|
||||
tfJob.parts.tfJobReplica("PS", params.numPs, t2tCmd.ps, psImage, params.imagePullSecrets, params.numPsGpu),
|
||||
],
|
||||
if params.jobType == "datagen" then
|
||||
[
|
||||
$.tfJobReplica("MASTER", params.numMaster, datagenCmd, workerImage, params.numWorkerGpu,
|
||||
workerImagePullSecrets, workerEnv, workerVolumes, workerVolumeMounts),
|
||||
]
|
||||
else if params.jobType == "exporter" then
|
||||
[
|
||||
$.tfJobReplica("MASTER", params.numMaster, exporterCmd, workerImage, params.numWorkerGpu,
|
||||
workerImagePullSecrets, workerEnv, workerVolumes, workerVolumeMounts),
|
||||
]
|
||||
else
|
||||
[
|
||||
$.tfJobReplica("MASTER", params.numMaster, finalCmd.master, workerImage, params.numWorkerGpu,
|
||||
workerImagePullSecrets, workerEnv, workerVolumes, workerVolumeMounts),
|
||||
$.tfJobReplica("WORKER", params.numWorker, finalCmd.worker, workerImage, params.numWorkerGpu,
|
||||
workerImagePullSecrets, workerEnv, workerVolumes, workerVolumeMounts),
|
||||
$.tfJobReplica("PS", params.numPs, finalCmd.ps, workerImage, params.numPsGpu,
|
||||
workerImagePullSecrets, workerEnv, workerVolumes, workerVolumeMounts),
|
||||
],
|
||||
terminationPolicy
|
||||
),
|
||||
},
|
||||
|
|
|
|||
|
|
@ -0,0 +1,7 @@
|
|||
local k = import "k.libsonnet";
|
||||
local t2tJob = import "t2t-job.libsonnet";
|
||||
|
||||
local env = std.extVar("__ksonnet/environments");
|
||||
local params = std.extVar("__ksonnet/params").components["t2t-translate-datagen"];
|
||||
|
||||
std.prune(k.core.v1.list.new([t2tJob.parts(params, env).job]))
|
||||
|
|
@ -0,0 +1,7 @@
|
|||
local k = import "k.libsonnet";
|
||||
local t2tJob = import "t2t-job.libsonnet";
|
||||
|
||||
local env = std.extVar("__ksonnet/environments");
|
||||
local params = std.extVar("__ksonnet/params").components["t2t-translate-exporter"];
|
||||
|
||||
std.prune(k.core.v1.list.new([t2tJob.parts(params, env).job]))
|
||||
|
|
@ -2,6 +2,6 @@ local k = import "k.libsonnet";
|
|||
local t2tJob = import "t2t-job.libsonnet";
|
||||
|
||||
local env = std.extVar("__ksonnet/environments");
|
||||
local params = std.extVar("__ksonnet/params").components["t2t-gh-summarizer"];
|
||||
local params = std.extVar("__ksonnet/params").components["t2t-translate"];
|
||||
|
||||
std.prune(k.core.v1.list.new([t2tJob.parts(params, env).job]))
|
||||
|
|
@ -17,4 +17,12 @@ ADD t2t-entrypoint.sh /usr/local/sbin/t2t-entrypoint
|
|||
|
||||
ENV T2T_USR_DIR=/t2t_problems
|
||||
|
||||
ENTRYPOINT ["/usr/local/sbin/t2t-entrypoint"]
|
||||
WORKDIR /t2t_problems
|
||||
|
||||
#ENTRYPOINT ["/usr/local/sbin/t2t-entrypoint"]
|
||||
|
||||
# TODO(sanyamkapoor): A workaround for tensorflow/tensor2tensor#879
|
||||
RUN apt-get update && apt-get install -y curl python &&\
|
||||
curl https://sdk.cloud.google.com | bash &&\
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
ENTRYPOINT ["bash"]
|
||||
|
|
|
|||
|
|
@ -1,10 +1,18 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
set -e
|
||||
set -ex
|
||||
|
||||
PROJECT=${PROJECT:-}
|
||||
BASE_IMAGE_TAG=${BASE_IMAGE_TAG:-1.8.0-py3} # 1.8.0-gpu-py3 for GPU-based image
|
||||
BUILD_IMAGE_TAG=${BUILD_IMAGE_TAG:-code-search:devel}
|
||||
|
||||
if [[ -z "${PROJECT}" ]]; then
|
||||
echo "PROJECT environment variable missing!"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
GPU=${GPU:-0}
|
||||
|
||||
BASE_IMAGE_TAG=$([[ "${GPU}" = "1" ]] && echo "1.8.0-gpu-py3" || echo "1.8.0-py3")
|
||||
BUILD_IMAGE_TAG="code-search:v$(date +%Y%m%d)$([[ ${GPU} = "1" ]] && echo '-gpu' || echo '')-$(python3 -c 'import uuid; print(uuid.uuid4().hex[:7]);')"
|
||||
|
||||
# Directory of this script used as docker context
|
||||
_SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
|
||||
|
|
@ -13,10 +21,8 @@ pushd "$_SCRIPT_DIR"
|
|||
|
||||
docker build -t ${BUILD_IMAGE_TAG} --build-arg BASE_IMAGE_TAG=${BASE_IMAGE_TAG} .
|
||||
|
||||
# Push image to GCR if PROJECT available
|
||||
if [[ ! -z "${PROJECT}" ]]; then
|
||||
docker tag ${BUILD_IMAGE_TAG} gcr.io/${PROJECT}/${BUILD_IMAGE_TAG}
|
||||
docker push gcr.io/${PROJECT}/${BUILD_IMAGE_TAG}
|
||||
fi
|
||||
# Push image to GCR under the given PROJECT
|
||||
docker tag ${BUILD_IMAGE_TAG} gcr.io/${PROJECT}/${BUILD_IMAGE_TAG}
|
||||
docker push gcr.io/${PROJECT}/${BUILD_IMAGE_TAG}
|
||||
|
||||
popd
|
||||
|
|
|
|||
Loading…
Reference in New Issue