Improvements to the tensor2tensor trainer for the GitHub summarization example. (#109)

* Improvements to the tensor2tensor trainer for the GitHub summarization example. * Simplify the launcher; we can just pass through most command line arguments instead of using a mix of environment variables and command line arguments. * This makes it easier to control the job just by setting the parameters in the template rather than having to rebuild the images. * Add a Makefile to build the image. * Replace the tensor2tensor jsonnet with a newer version of the jsonnet used with T2T. * Address reviewer comments. * Install pip packages as user jovyan. * Rely on implicit string conversion with concatenation in template file.
2018-04-29 20:39:16 -07:00 · 2018-04-29 20:39:16 -07:00 · 79aa2074cd
parent afdd4c544e
commit 79aa2074cd
8 changed files with 230 additions and 157 deletions
--- a/github_issue_summarization/ks-kubeflow/components/params.libsonnet
+++ b/github_issue_summarization/ks-kubeflow/components/params.libsonnet
@ -45,6 +45,7 @@
      replicas: 2,
    },
    tensor2tensor: {
+      cpuImage: "gcr.io/kubeflow-examples/issue-summarization-t2t-trainer-cpu:v20180428-9da5cb7-dirty-4e1f35",
      namespace: "null",
    },    
    tensorboard: {
--- a/github_issue_summarization/ks-kubeflow/components/tensor2tensor.jsonnet
+++ b/github_issue_summarization/ks-kubeflow/components/tensor2tensor.jsonnet
@ -4,4 +4,4 @@ local k = import "k.libsonnet";

 local tensor2tensor = import "tensor2tensor.libsonnet";

-std.prune(k.core.v1.list.new([tensor2tensor.parts(params)]))
+std.prune(k.core.v1.list.new([tensor2tensor.parts(params, env).job]))
--- a/github_issue_summarization/ks-kubeflow/components/tensor2tensor.libsonnet
+++ b/github_issue_summarization/ks-kubeflow/components/tensor2tensor.libsonnet
@ -1,95 +1,150 @@
 {
-  parts(params):: {
-    apiVersion: "kubeflow.org/v1alpha1",
-    kind: "TFJob",
-    metadata: {
-      name: "tensor2tensor",
-      namespace: params.namespace,
-    },
-    spec: {
-      replicaSpecs: [
-        {
-          replicas: 1,
-          template: {
-            spec: {
-              containers: [
-                {
-                  image: params.image,
-                  name: "tensorflow",
-                  command: [
-                    "bash",
-                  ],
-                  args: [
-                    "/home/jovyan/train_dist_launcher.sh",
-                    "1",
-                    params.workers,
-                    "0",
-                    params.train_steps,
-                    "/job:master",
-                    "False",
-                  ],
-                },
-              ],
-              restartPolicy: "OnFailure",
+  parts(params, env):: {
+    // Define some defaults.
+    local updatedParams = {
+      sync: "0",
+      
+      dataDir: "gs://kubeflow-examples-data/gh_issue_summarization/data",
+      usrDir: "./github",
+      problem: "github_issue_summarization_problem",
+
+      model: "transformer_encoder",
+      hparams: "transformer_github_issues",
+      hparamsSet: "transformer_github_issues",      
+      outputGCSPath: "gs://kubecon-gh-demo/gh-t2t-out/temp",
+
+      gpuImage: null,
+      cpuImage: null,
+
+      trainSteps: 20000,
+      evalSteps: 10,
+
+      psGpu: 0,
+      workerGpu: 0,
+
+      workers: 3,
+      masters: 1,
+      ps: 1,
+
+      jobName: "tensor2tensor",
+    } + params,
+
+    local containerEnv = [
+      {
+        name: "PYTHONPATH",
+        value: "/home/jovyan",
+      }
+    ],
+    local baseCommand = [      
+      "/home/jovyan/github/t2t_launcher.sh",
+      "--train_steps=" + updatedParams.trainSteps,
+      "--hparams_set=" + updatedParams.hparams,
+      "--model=" + updatedParams.model,
+      "--problems=" + updatedParams.problem,
+      "--t2t_usr_dir=" + updatedParams.usrDir,
+      "--data_dir=" + updatedParams.dataDir,
+      "--output_dir=" + updatedParams.outputGCSPath,
+    ],
+    local psCommand = baseCommand + [
+      "--schedule=run_std_server",
+    ],
+    local totalWorkerReplicas = updatedParams.workers + updatedParams.masters,
+    local workerBaseCommand = baseCommand + [
+      "--schedule=train",
+      "--sync=" + updatedParams.sync,
+      "--ps_gpu=" + updatedParams.psGpu,
+      "--worker_gpu=" + updatedParams.workerGpu,
+      // We explicitly want to add worker and master replicas together for --worker_replicas.
+      "--worker_replicas=" + totalWorkerReplicas,
+      "--ps_replicas=" + updatedParams.ps,
+      "--eval_steps=" + updatedParams.evalSteps,
+    ],
+    local workerCommand = workerBaseCommand + [
+      "--worker_job=/job:worker",
+    ],
+    local masterCommand = workerBaseCommand + [
+      "--worker_job=/job:master",
+    ],
+    local namespace = env.namespace,
+
+    job:: {
+      apiVersion: "kubeflow.org/v1alpha1",
+      kind: "TFJob",
+      metadata: {
+        name: updatedParams.jobName,
+        namespace: env.namespace,
+      },
+      spec: {
+        replicaSpecs: [
+          {
+            replicas: 1,
+            template: {
+              spec: {
+                containers: [
+                  {
+                    image: if updatedParams.workerGpu > 0 then updatedParams.gpuImage else updatedParams.cpuImage,
+                    name: "tensorflow",
+                    command: masterCommand,
+                    env: containerEnv,
+                    [if updatedParams.workerGpu > 0 then "resources"]: {
+                      limits: {
+                        "nvidia.com/gpu": updatedParams.workerGpu,
+                      },
+                    },
+                  },
+                ],
+                restartPolicy: "OnFailure",
+              },
            },
+            tfReplicaType: "MASTER",
          },
-          tfReplicaType: "MASTER",
-        },
-        {
-          replicas: params.workers,
-          template: {
-            spec: {
-              containers: [
-                {
-                  image: params.image,
-                  name: "tensorflow",
-                  command: [
-                    "bash",
-                  ],
-                  args: [
-                    "/home/jovyan/train_dist_launcher.sh",
-                    "1",
-                    params.workers,
-                    "0",
-                    params.train_steps,
-                    "/job:master",
-                    "False",
-                  ],
-                },
-              ],
-              restartPolicy: "OnFailure",
+          {
+            replicas: updatedParams.workers,
+            template: {
+              spec: {
+                containers: [
+                  {
+                    image: if updatedParams.workerGpu > 0 then updatedParams.gpuImage else updatedParams.cpuImage,
+                    name: "tensorflow",
+                    command: workerCommand,
+                    env: containerEnv,
+                    [if updatedParams.workerGpu > 0 then "resources"]: {
+                      limits: {
+                        "nvidia.com/gpu": updatedParams.workerGpu,
+                      },
+                    },
+                  },
+                ],
+                restartPolicy: "OnFailure",
+              },
            },
+            tfReplicaType: "WORKER",
          },
-          tfReplicaType: "WORKER",
-        },
-        {
-          replicas: 1,
-          template: {
-            spec: {
-              containers: [
-                {
-                  image: params.image,
-                  name: "tensorflow",
-                  command: [
-                    "bash",
-                  ],
-                  args: [
-                    "/home/jovyan/ps_dist_launcher.sh",
-                  ],
-                },
-              ],
-              restartPolicy: "OnFailure",
+          {
+            replicas: updatedParams.ps,
+            template: {
+              spec: {
+                containers: [
+                  {
+                    image: updatedParams.cpuImage,
+                    name: "tensorflow",
+                    command: psCommand,
+                    env: containerEnv,
+                  },
+                ],
+                restartPolicy: "OnFailure",
+              },
            },
+            tfReplicaType: "PS",
+          },
+        ],
+        terminationPolicy: {
+          chief: {
+            replicaIndex: 0,
+            replicaName: "MASTER",
          },
-          tfReplicaType: "PS",
-        },
-      ],
-      terminationPolicy: {
-        chief: {
-          replicaIndex: 0,
-          replicaName: "MASTER",
        },
      },
-    },
-  },
-}
+    },  // job
+  },  //parts
+}
--- a/github_issue_summarization/tensor2tensor/github/Dockerfile
+++ b/github_issue_summarization/tensor2tensor/github/Dockerfile
@ -1,14 +1,21 @@
-FROM gcr.io/kubeflow-images-public/tensorflow-1.7.0-notebook-gpu:latest
+# Docker image to train a model using T2T
+#
+# For GPU use gcr.io/kubeflow-images-public/tensorflow-1.7.0-notebook-gpu:latest
+# For CPU use gcr.io/kubeflow-images-public/tensorflow-1.7.0-notebook-cpu:latest
+ARG BASE_IMAGE=gcr.io/kubeflow-images-public/tensorflow-1.7.0-notebook-gpu:latest
+FROM $BASE_IMAGE
+
+# Install pip packages as user jovyan
+RUN pip install tensor2tensor h5py

 USER root

-RUN pip install tensor2tensor && \
-      apt-get install -y jq
+RUN apt-get install -y jq

 COPY __init__.py github/__init__.py
 COPY github_problem.py github/github_problem.py
-COPY ps_dist_launcher.sh github/ps_dist_launcher.sh
-COPY train_dist_launcher.sh github/train_dist_launcher.sh
+COPY t2t_launcher.sh github/t2t_launcher.sh
+RUN chmod a+rx github/t2t_launcher.sh

 RUN chown -R jovyan:users /home/jovyan/github

--- a/github_issue_summarization/tensor2tensor/github/Makefile
+++ b/github_issue_summarization/tensor2tensor/github/Makefile
@ -0,0 +1,60 @@
+# Copyright 2017 The Kubernetes Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Requirements:
+#   https://github.com/mattrobenolt/jinja2-cli
+#   pip install jinja2-cli
+# Update the Airflow deployment
+
+# List any changed files. We only include files in the tensor2tensor/github directory
+# because that is the code in the docker image.
+# In particular we exclude changes to the ksonnet configs.
+CHANGED_FILES := $(shell git diff-files --relative=github_issue_summarization/tensor2tensor/github)
+
+ifeq ($(strip $(CHANGED_FILES)),)
+# Changed files is empty; not dirty
+# Don't include --dirty because it could be dirty if files outside the ones we care
+# about changed.
+TAG := $(shell date +v%Y%m%d)-$(shell git describe --tags --always)
+else
+TAG := $(shell date +v%Y%m%d)-$(shell git describe --tags --always --dirty)-$(shell git diff | shasum -a256 | cut -c -6)
+endif
+
+DIR := $(shell pwd)
+
+# You can override this on the command line as
+# make PROJECT=kubeflow-examples <target>
+PROJECT := kubeflow-examples
+
+IMG := gcr.io/$(PROJECT)/issue-summarization-t2t-trainer
+CPU_IMG := $(IMG)-cpu
+
+CPU_BASE_IMG := gcr.io/kubeflow-images-public/tensorflow-1.7.0-notebook-cpu:v20180419-0ad94c4e
+
+echo:
+	@echo changed files $(CHANGED_FILES)
+	@echo tag $(TAG)		
+
+push-cpu: build-cpu
+	gcloud docker -- push $(CPU_IMG):$(TAG)
+
+set-image: push-cpu
+	# Set the image to use
+	cd ../../ks-kubeflow && ks param set tensor2tensor cpuImage $(CPU_IMG):$(TAG)
+
+# To build without the cache set the environment variable
+# export DOCKER_BUILD_OPTS=--no-cache
+build-cpu:
+	docker build ${DOCKER_BUILD_OPTS} --build-arg BASE_IMAGE=$(CPU_BASE_IMG) -f Dockerfile -t $(CPU_IMG):$(TAG) ./
+	@echo Built $(CPU_IMG):$(TAG)
--- a/github_issue_summarization/tensor2tensor/github/ps_dist_launcher.sh
+++ b/github_issue_summarization/tensor2tensor/github/ps_dist_launcher.sh
@ -1,26 +0,0 @@
-#!/bin/bash
-# TODO(ankushagarwal): Convert this to a python launcher script
-set -x
-export TF_CONFIG=${TF_CONFIG}
-echo "TF_CONFIG = ${TF_CONFIG}"
-OUTDIR=./out
-DATA_DIR=gs://kubeflow-examples/tensor2tensor/data
-TMP_DIR=./tmp
-PROBLEM=github_issue_summarization_problem
-USR_DIR=./github
-HPARAMS_SET=transformer_github_issues
-WORKER_ID=$(echo ${TF_CONFIG} | jq ".task.index")
-WORKER_TYPE=$(echo ${TF_CONFIG} | jq -r ".task.type")
-MASTER_INSTANCE=$(echo ${TF_CONFIG} | jq -r ".cluster.${WORKER_TYPE}[${WORKER_ID}]")
-rm -rf "${OUTDIR}" "${TMP_DIR}"
-mkdir -p "${OUTDIR}"
-mkdir -p "${TMP_DIR}"
-t2t-trainer \
-  --data_dir=${DATA_DIR} \
-  --t2t_usr_dir=${USR_DIR} \
-  --problems=${PROBLEM} \
-  --model=transformer \
-  --hparams_set=${HPARAMS_SET} \
-  --output_dir=$OUTDIR --job-dir=$OUTDIR --train_steps=1000 \
-  --master=grpc://${MASTER_INSTANCE} \
-  --schedule=run_std_server
--- a/github_issue_summarization/tensor2tensor/github/t2t_launcher.sh
+++ b/github_issue_summarization/tensor2tensor/github/t2t_launcher.sh
@ -0,0 +1,25 @
+#!/bin/bash
+#
+# Launch t2t-trainer for the replica described by TF_CONFIG.
+#
+# The replica's index, type and own address are read from the TF_CONFIG
+# environment variable with jq; every command line argument is passed
+# through to t2t-trainer unchanged.
+set -x
+echo environment
+env | sort
+# Quote TF_CONFIG so the JSON is not word-split or glob-expanded by the shell.
+WORKER_ID=$(echo "${TF_CONFIG}" | jq ".task.index")
+WORKER_TYPE=$(echo "${TF_CONFIG}" | jq -r ".task.type")
+MASTER_INSTANCE=$(echo "${TF_CONFIG}" | jq -r ".cluster.${WORKER_TYPE}[${WORKER_ID}]")
+t2t-trainer \
+  --master="grpc://${MASTER_INSTANCE}" \
+  --worker_id="${WORKER_ID}" \
+  --tmp_dir=/tmp \
+  "$@"
+# Keep the trainer's exit status; otherwise the script would exit with the
+# status of the sleep below and hide training failures.
+STATUS=$?
+
+# Sleep to give fluentd time to capture logs
+sleep 120
+exit "${STATUS}"
--- a/github_issue_summarization/tensor2tensor/github/train_dist_launcher.sh
+++ b/github_issue_summarization/tensor2tensor/github/train_dist_launcher.sh
@ -1,39 +0,0 @@
-#!/bin/bash
-# TODO(ankushagarwal): Convert this to a python launcher script
-set -x
-PS_REPLICAS="${1}"
-WORKER_REPLICAS="${2}"
-WORKER_GPU="${3}"
-TRAIN_STEPS="${4}"
-WORKER_JOB="${5}"
-SYNC="${6}"
-export TF_CONFIG=$(echo ${TF_CONFIG} | sed 's/"worker"/"master"/g')
-echo "TF_CONFIG = ${TF_CONFIG}"
-OUTDIR=./out
-DATA_DIR=gs://kubeflow-examples/tensor2tensor/data
-TMP_DIR=./tmp
-PROBLEM=github_issue_summarization_problem
-USR_DIR=./github
-HPARAMS_SET=transformer_github_issues
-WORKER_ID=$(echo ${TF_CONFIG} | jq ".task.index")
-WORKER_TYPE=$(echo ${TF_CONFIG} | jq -r ".task.type")
-MASTER_INSTANCE=$(echo ${TF_CONFIG} | jq -r ".cluster.${WORKER_TYPE}[${WORKER_ID}]")
-rm -rf "${OUTDIR}" "${TMP_DIR}"
-mkdir -p "${OUTDIR}"
-mkdir -p "${TMP_DIR}"
-t2t-trainer \
-  --data_dir=${DATA_DIR} \
-  --t2t_usr_dir=${USR_DIR} \
-  --problems=${PROBLEM} \
-  --model=transformer \
-  --hparams_set=${HPARAMS_SET} \
-  --output_dir=$OUTDIR --job-dir=$OUTDIR --train_steps=${TRAIN_STEPS} \
-  --master=grpc://${MASTER_INSTANCE} \
-  --ps_replicas=${PS_REPLICAS} \
-  --worker_replicas=${WORKER_REPLICAS} \
-  --worker_gpu=${WORKER_GPU} \
-  --worker_id=${WORKER_ID} \
-  --worker_job=${WORKER_JOB} \
-  --ps_gpu=0 \
-  --schedule=train \
-  --sync=${SYNC}