Improvements to the tensor2tensor trainer for the GitHub summarization example. (#109)

* Improvements to the tensor2tensor trainer for the GitHub summarization example. * Simplify the launcher; we can just pass through most command line arguments and not use environment variables and command line arguments. * This makes it easier to control the job just by setting the parameters in the template rather than having to rebuild the images. * Add a Makefile to build the image. * Replace the tensor2tensor jsonnet with a newer version of the jsonnet used with T2T. * Address reviewer comments. * Install pip packages as user Jovyan * Rely on implicit string conversion with concatenation in template file.
2018-04-29 20:39:16 -07:00 · 2018-04-29 20:39:16 -07:00 · 79aa2074cd
parent afdd4c544e
commit 79aa2074cd
8 changed files with 230 additions and 157 deletions
--- a/github_issue_summarization/ks-kubeflow/components/params.libsonnet
+++ b/github_issue_summarization/ks-kubeflow/components/params.libsonnet
@ -45,6 +45,7 @@
      replicas: 2,
    },
    tensor2tensor: {
      cpuImage: "gcr.io/kubeflow-examples/issue-summarization-t2t-trainer-cpu:v20180428-9da5cb7-dirty-4e1f35",
      namespace: "null",
    },    
    tensorboard: {
--- a/github_issue_summarization/ks-kubeflow/components/tensor2tensor.jsonnet
+++ b/github_issue_summarization/ks-kubeflow/components/tensor2tensor.jsonnet
@ -4,4 +4,4 @@ local k = import "k.libsonnet";
 local tensor2tensor = import "tensor2tensor.libsonnet";
-std.prune(k.core.v1.list.new([tensor2tensor.parts(params)]))
+std.prune(k.core.v1.list.new([tensor2tensor.parts(params, env).job]))
--- a/github_issue_summarization/ks-kubeflow/components/tensor2tensor.libsonnet
+++ b/github_issue_summarization/ks-kubeflow/components/tensor2tensor.libsonnet
@ -1,95 +1,150 @@
 {
-  parts(params):: {
+  parts(params, env):: {
-    apiVersion: "kubeflow.org/v1alpha1",
+    // Define some defaults.
-    kind: "TFJob",
+    local updatedParams = {
-    metadata: {
+      sync: "0",
-      name: "tensor2tensor",
+      
-      namespace: params.namespace,
+      dataDir: "gs://kubeflow-examples-data/gh_issue_summarization/data",
-    },
+      usrDir: "./github",
-    spec: {
+      problem: "github_issue_summarization_problem",
-      replicaSpecs: [
+
-        {
+      model: "transformer_encoder",
-          replicas: 1,
+      hparams: "transformer_github_issues",
-          template: {
+      hparamsSet: "transformer_github_issues",      
-            spec: {
+      outputGCSPath: "gs://kubecon-gh-demo/gh-t2t-out/temp",
-              containers: [
+
-                {
+      gpuImage: null,
-                  image: params.image,
+      cpuImage: null,
-                  name: "tensorflow",
+
-                  command: [
+      trainSteps: 20000,
-                    "bash",
+      evalSteps: 10,
-                  ],
+
-                  args: [
+      psGpu: 0,
-                    "/home/jovyan/train_dist_launcher.sh",
+      workerGpu: 0,
-                    "1",
+
-                    params.workers,
+      workers: 3,
-                    "0",
+      masters: 1,
-                    params.train_steps,
+      ps: 1,
-                    "/job:master",
+
-                    "False",
+      jobName: "tensor2tensor",
-                  ],
+    } + params,
-                },
+
-              ],
+    local containerEnv = [
-              restartPolicy: "OnFailure",
+      {
        name: "PYTHONPATH",
        value: "/home/jovyan",
      }
    ],
    local baseCommand = [      
      "/home/jovyan/github/t2t_launcher.sh",
      "--train_steps=" + updatedParams.trainSteps,
      "--hparams_set=" + updatedParams.hparams,
      "--model=" + updatedParams.model,
      "--problems=" + updatedParams.problem,
      "--t2t_usr_dir=" + updatedParams.usrDir,
      "--data_dir=" + updatedParams.dataDir,
      "--output_dir=" + updatedParams.outputGCSPath,
    ],
    local psCommand = baseCommand + [
      "--schedule=run_std_server",
    ],
    local totalWorkerReplicas = updatedParams.workers + updatedParams.masters,
    local workerBaseCommand = baseCommand + [
      "--schedule=train",
      "--sync=" + updatedParams.sync,
      "--ps_gpu=" + updatedParams.psGpu,
      "--worker_gpu=" + updatedParams.workerGpu,
      // We explicitly want to add worker and master replicas together to get the total worker count.
      "--worker_replicas=" + totalWorkerReplicas,
      "--ps_replicas=" + updatedParams.ps,
      "--eval_steps=" + updatedParams.evalSteps,
    ],
    local workerCommand = workerBaseCommand + [
      "--worker_job=/job:worker",
    ],
    local masterCommand = workerBaseCommand + [
      "--worker_job=/job:master",
    ],
    local namespace = env.namespace,
    job:: {
      apiVersion: "kubeflow.org/v1alpha1",
      kind: "TFJob",
      metadata: {
        name: updatedParams.jobName,
        namespace: env.namespace,
      },
      spec: {
        replicaSpecs: [
          {
            replicas: 1,
            template: {
              spec: {
                containers: [
                  {
                    image: if updatedParams.workerGpu > 0 then updatedParams.gpuImage else updatedParams.cpuImage,
                    name: "tensorflow",
                    command: masterCommand,
                    env: containerEnv,
                    [if updatedParams.workerGpu > 0 then "resources"]: {
                      limits: {
                        "nvidia.com/gpu": updatedParams.workerGpu,
                      },
                    },
                  },
                ],
                restartPolicy: "OnFailure",
              },
            },
            tfReplicaType: "MASTER",
          },
-          tfReplicaType: "MASTER",
+          {
-        },
+            replicas: updatedParams.workers,
-        {
+            template: {
-          replicas: params.workers,
+              spec: {
-          template: {
+                containers: [
-            spec: {
+                  {
-              containers: [
+                    image: if updatedParams.workerGpu > 0 then updatedParams.gpuImage else updatedParams.cpuImage,
-                {
+                    name: "tensorflow",
-                  image: params.image,
+                    command: workerCommand,
-                  name: "tensorflow",
+                    env: containerEnv,
-                  command: [
+                    [if updatedParams.workerGpu > 0 then "resources"]: {
-                    "bash",
+                      limits: {
-                  ],
+                        "nvidia.com/gpu": updatedParams.workerGpu,
-                  args: [
+                      },
-                    "/home/jovyan/train_dist_launcher.sh",
+                    },
-                    "1",
+                  },
-                    params.workers,
+                ],
-                    "0",
+                restartPolicy: "OnFailure",
-                    params.train_steps,
+              },
                    "/job:master",
                    "False",
                  ],
                },
              ],
              restartPolicy: "OnFailure",
            },
            tfReplicaType: "WORKER",
          },
-          tfReplicaType: "WORKER",
+          {
-        },
+            replicas: updatedParams.ps,
-        {
+            template: {
-          replicas: 1,
+              spec: {
-          template: {
+                containers: [
-            spec: {
+                  {
-              containers: [
+                    image: updatedParams.cpuImage,
-                {
+                    name: "tensorflow",
-                  image: params.image,
+                    command: psCommand,
-                  name: "tensorflow",
+                    env: containerEnv,
-                  command: [
+                  },
-                    "bash",
+                ],
-                  ],
+                restartPolicy: "OnFailure",
-                  args: [
+              },
                    "/home/jovyan/ps_dist_launcher.sh",
                  ],
                },
              ],
              restartPolicy: "OnFailure",
            },
            tfReplicaType: "PS",
          },
        ],
        terminationPolicy: {
          chief: {
            replicaIndex: 0,
            replicaName: "MASTER",
          },
          tfReplicaType: "PS",
        },
      ],
      terminationPolicy: {
        chief: {
          replicaIndex: 0,
          replicaName: "MASTER",
        },
      },
-    },
+    },  // job
-  },
+  },  //parts
-}
+}
--- a/github_issue_summarization/tensor2tensor/github/Dockerfile
+++ b/github_issue_summarization/tensor2tensor/github/Dockerfile
@ -1,14 +1,21 @@
-FROM gcr.io/kubeflow-images-public/tensorflow-1.7.0-notebook-gpu:latest
+# Docker image to train a model using T2T
 #
 # For GPU use gcr.io/kubeflow-images-public/tensorflow-1.7.0-notebook-gpu:latest
 # For CPU use gcr.io/kubeflow-images-public/tensorflow-1.7.0-notebook-cpu:latest
 ARG BASE_IMAGE=gcr.io/kubeflow-images-public/tensorflow-1.7.0-notebook-gpu:latest
 FROM $BASE_IMAGE
 # Install pip packages as user jovyan
 RUN pip install tensor2tensor h5py
 USER root
-RUN pip install tensor2tensor && \
+RUN apt-get install -y jq
      apt-get install -y jq
 COPY __init__.py github/__init__.py
 COPY github_problem.py github/github_problem.py
-COPY ps_dist_launcher.sh github/ps_dist_launcher.sh
+COPY t2t_launcher.sh github/t2t_launcher.sh
-COPY train_dist_launcher.sh github/train_dist_launcher.sh
+RUN chmod a+rx github/t2t_launcher.sh
 RUN chown -R jovyan:users /home/jovyan/github
--- a/github_issue_summarization/tensor2tensor/github/Makefile
+++ b/github_issue_summarization/tensor2tensor/github/Makefile
@ -0,0 +1,60 @@
 # Copyright 2017 The Kubernetes Authors.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
 # Requirements:
 #   https://github.com/mattrobenolt/jinja2-cli
 #   pip install jinja2-cli
 # Update the Airflow deployment
 # List any changed files. We only include files in the github_issue_summarization/tensor2tensor/github directory
 # because that is the code in the docker image.
 # In particular we exclude changes to the ksonnet configs.
 CHANGED_FILES := $(shell git diff-files --relative=github_issue_summarization/tensor2tensor/github)
 ifeq ($(strip $(CHANGED_FILES)),)
 # Changed files is empty; not dirty
 # Don't include --dirty because it could be dirty if files outside the ones we care
 # about changed.
 TAG := $(shell date +v%Y%m%d)-$(shell git describe --tags --always)
 else
 TAG := $(shell date +v%Y%m%d)-$(shell git describe --tags --always --dirty)-$(shell git diff | shasum -a256 | cut -c -6)
 endif
 DIR := $(shell pwd)
 # You can override this on the command line as
 # make PROJECT=kubeflow-examples <target>
 PROJECT := kubeflow-examples
 IMG := gcr.io/$(PROJECT)/issue-summarization-t2t-trainer
 CPU_IMG := $(IMG)-cpu
 CPU_BASE_IMG := gcr.io/kubeflow-images-public/tensorflow-1.7.0-notebook-cpu:v20180419-0ad94c4e
 echo:
 	@echo changed files $(CHANGED_FILES)
 	@echo tag $(TAG)		
 push-cpu: build-cpu
 	gcloud docker -- push $(CPU_IMG):$(TAG)
 set-image: push-cpu
 	# Set the image to use
 	cd ../../ks-kubeflow && ks param set tensor2tensor cpuImage $(CPU_IMG):$(TAG)
 # To build without the cache set the environment variable
 # export DOCKER_BUILD_OPTS=--no-cache
 build-cpu:
 	docker build ${DOCKER_BUILD_OPTS} --build-arg BASE_IMAGE=$(CPU_BASE_IMG) -f Dockerfile -t $(CPU_IMG):$(TAG) ./
 	@echo Built $(CPU_IMG):$(TAG)
--- a/github_issue_summarization/tensor2tensor/github/ps_dist_launcher.sh
+++ b/github_issue_summarization/tensor2tensor/github/ps_dist_launcher.sh
@ -1,26 +0,0 @@
 #!/bin/bash
 # TODO(ankushagarwal): Convert this to a python launcher script
 set -x
 export TF_CONFIG=${TF_CONFIG}
 echo "TF_CONFIG = ${TF_CONFIG}"
 OUTDIR=./out
 DATA_DIR=gs://kubeflow-examples/tensor2tensor/data
 TMP_DIR=./tmp
 PROBLEM=github_issue_summarization_problem
 USR_DIR=./github
 HPARAMS_SET=transformer_github_issues
 WORKER_ID=$(echo ${TF_CONFIG} | jq ".task.index")
 WORKER_TYPE=$(echo ${TF_CONFIG} | jq -r ".task.type")
 MASTER_INSTANCE=$(echo ${TF_CONFIG} | jq -r ".cluster.${WORKER_TYPE}[${WORKER_ID}]")
 rm -rf "${OUTDIR}" "${TMP_DIR}"
 mkdir -p "${OUTDIR}"
 mkdir -p "${TMP_DIR}"
 t2t-trainer \
  --data_dir=${DATA_DIR} \
  --t2t_usr_dir=${USR_DIR} \
  --problems=${PROBLEM} \
  --model=transformer \
  --hparams_set=${HPARAMS_SET} \
  --output_dir=$OUTDIR --job-dir=$OUTDIR --train_steps=1000 \
  --master=grpc://${MASTER_INSTANCE} \
  --schedule=run_std_server
--- a/github_issue_summarization/tensor2tensor/github/t2t_launcher.sh
+++ b/github_issue_summarization/tensor2tensor/github/t2t_launcher.sh
@ -0,0 +1,15 @@
 #!/bin/bash
 set -x
 echo environment
 env | sort
 WORKER_ID=$(echo ${TF_CONFIG} | jq ".task.index")
 WORKER_TYPE=$(echo ${TF_CONFIG} | jq -r ".task.type")
 MASTER_INSTANCE=$(echo ${TF_CONFIG} | jq -r ".cluster.${WORKER_TYPE}[${WORKER_ID}]")
 t2t-trainer \
  --master=grpc://${MASTER_INSTANCE} \
  --worker_id=${WORKER_ID} \
  --tmp_dir=/tmp \
  "$@"
 # Sleep to give fluentd time to capture logs
 sleep 120  
--- a/github_issue_summarization/tensor2tensor/github/train_dist_launcher.sh
+++ b/github_issue_summarization/tensor2tensor/github/train_dist_launcher.sh
@ -1,39 +0,0 @@
 #!/bin/bash
 # TODO(ankushagarwal): Convert this to a python launcher script
 set -x
 PS_REPLICAS="${1}"
 WORKER_REPLICAS="${2}"
 WORKER_GPU="${3}"
 TRAIN_STEPS="${4}"
 WORKER_JOB="${5}"
 SYNC="${6}"
 export TF_CONFIG=$(echo ${TF_CONFIG} | sed 's/"worker"/"master"/g')
 echo "TF_CONFIG = ${TF_CONFIG}"
 OUTDIR=./out
 DATA_DIR=gs://kubeflow-examples/tensor2tensor/data
 TMP_DIR=./tmp
 PROBLEM=github_issue_summarization_problem
 USR_DIR=./github
 HPARAMS_SET=transformer_github_issues
 WORKER_ID=$(echo ${TF_CONFIG} | jq ".task.index")
 WORKER_TYPE=$(echo ${TF_CONFIG} | jq -r ".task.type")
 MASTER_INSTANCE=$(echo ${TF_CONFIG} | jq -r ".cluster.${WORKER_TYPE}[${WORKER_ID}]")
 rm -rf "${OUTDIR}" "${TMP_DIR}"
 mkdir -p "${OUTDIR}"
 mkdir -p "${TMP_DIR}"
 t2t-trainer \
  --data_dir=${DATA_DIR} \
  --t2t_usr_dir=${USR_DIR} \
  --problems=${PROBLEM} \
  --model=transformer \
  --hparams_set=${HPARAMS_SET} \
  --output_dir=$OUTDIR --job-dir=$OUTDIR --train_steps=${TRAIN_STEPS} \
  --master=grpc://${MASTER_INSTANCE} \
  --ps_replicas=${PS_REPLICAS} \
  --worker_replicas=${WORKER_REPLICAS} \
  --worker_gpu=${WORKER_GPU} \
  --worker_id=${WORKER_ID} \
  --worker_job=${WORKER_JOB} \
  --ps_gpu=0 \
  --schedule=train \
  --sync=${SYNC}
`@ -4,4 +4,4 @@ local k = import "k.libsonnet";`

	`local tensor2tensor = import "tensor2tensor.libsonnet";`	`local tensor2tensor = import "tensor2tensor.libsonnet";`

	`std.prune(k.core.v1.list.new([tensor2tensor.parts(params)]))`	`std.prune(k.core.v1.list.new([tensor2tensor.parts(params, env).job]))`