Improvements to the tensor2tensor trainer for the GitHub summarization example. (#109)

* Improvements to the tensor2tensor trainer for the GitHub summarization example.

* Simplify the launcher; we can just pass most command line arguments through rather than
  using a mix of environment variables and command line arguments.

  * This makes it easier to control the job just by setting the parameters in the template
    rather than having to rebuild the images.

* Add a Makefile to build the image.

* Replace the tensor2tensor jsonnet with a newer version of the jsonnet used with T2T.

* Address reviewer comments.

* Install pip packages as user Jovyan
* Rely on implicit string conversion with concatenation in template file.
This commit is contained in:
Jeremy Lewi 2018-04-29 20:39:16 -07:00 committed by k8s-ci-robot
parent afdd4c544e
commit 79aa2074cd
8 changed files with 230 additions and 157 deletions

View File

@ -45,6 +45,7 @@
replicas: 2,
},
tensor2tensor: {
cpuImage: "gcr.io/kubeflow-examples/issue-summarization-t2t-trainer-cpu:v20180428-9da5cb7-dirty-4e1f35",
namespace: "null",
},
tensorboard: {

View File

@ -4,4 +4,4 @@ local k = import "k.libsonnet";
local tensor2tensor = import "tensor2tensor.libsonnet";
std.prune(k.core.v1.list.new([tensor2tensor.parts(params)]))
std.prune(k.core.v1.list.new([tensor2tensor.parts(params, env).job]))

View File

@ -1,95 +1,150 @@
{
parts(params):: {
apiVersion: "kubeflow.org/v1alpha1",
kind: "TFJob",
metadata: {
name: "tensor2tensor",
namespace: params.namespace,
},
spec: {
replicaSpecs: [
{
replicas: 1,
template: {
spec: {
containers: [
{
image: params.image,
name: "tensorflow",
command: [
"bash",
],
args: [
"/home/jovyan/train_dist_launcher.sh",
"1",
params.workers,
"0",
params.train_steps,
"/job:master",
"False",
],
},
],
restartPolicy: "OnFailure",
parts(params, env):: {
// Define some defaults.
local updatedParams = {
sync: "0",
dataDir: "gs://kubeflow-examples-data/gh_issue_summarization/data",
usrDir: "./github",
problem: "github_issue_summarization_problem",
model: "transformer_encoder",
hparams: "transformer_github_issues",
hparamsSet: "transformer_github_issues",
outputGCSPath: "gs://kubecon-gh-demo/gh-t2t-out/temp",
gpuImage: null,
cpuImage: null,
trainSteps: 20000,
evalSteps: 10,
psGpu: 0,
workerGpu: 0,
workers: 3,
masters: 1,
ps: 1,
jobName: "tensor2tensor",
} + params,
local containerEnv = [
{
name: "PYTHONPATH",
value: "/home/jovyan",
}
],
local baseCommand = [
"/home/jovyan/github/t2t_launcher.sh",
"--train_steps=" + updatedParams.trainSteps,
"--hparams_set=" + updatedParams.hparams,
"--model=" + updatedParams.model,
"--problems=" + updatedParams.problem,
"--t2t_usr_dir=" + updatedParams.usrDir,
"--data_dir=" + updatedParams.dataDir,
"--output_dir=" + updatedParams.outputGCSPath,
],
local psCommand = baseCommand + [
"--schedule=run_std_server",
],
local totalWorkerReplicas = updatedParams.workers + updatedParams.masters,
// Flags shared by the master and worker replicas, appended to baseCommand.
local workerBaseCommand = baseCommand + [
"--schedule=train",
"--sync=" + updatedParams.sync,
"--ps_gpu=" + updatedParams.psGpu,
"--worker_gpu=" + updatedParams.workerGpu,
// worker_replicas counts both the worker and the master replicas
// (see totalWorkerReplicas = workers + masters).
"--worker_replicas=" + totalWorkerReplicas,
"--ps_replicas=" + updatedParams.ps,
"--eval_steps=" + updatedParams.evalSteps,
],
local workerCommand = workerBaseCommand + [
"--worker_job=/job:worker",
],
local masterCommand = workerBaseCommand + [
"--worker_job=/job:master",
],
local namespace = env.namespace,
job:: {
apiVersion: "kubeflow.org/v1alpha1",
kind: "TFJob",
metadata: {
name: updatedParams.jobName,
namespace: env.namespace,
},
spec: {
replicaSpecs: [
{
replicas: 1,
template: {
spec: {
containers: [
{
image: if updatedParams.workerGpu > 0 then updatedParams.gpuImage else updatedParams.cpuImage,
name: "tensorflow",
command: masterCommand,
env: containerEnv,
[if updatedParams.workerGpu > 0 then "resources"]: {
limits: {
"nvidia.com/gpu": updatedParams.workerGpu,
},
},
},
],
restartPolicy: "OnFailure",
},
},
tfReplicaType: "MASTER",
},
tfReplicaType: "MASTER",
},
{
replicas: params.workers,
template: {
spec: {
containers: [
{
image: params.image,
name: "tensorflow",
command: [
"bash",
],
args: [
"/home/jovyan/train_dist_launcher.sh",
"1",
params.workers,
"0",
params.train_steps,
"/job:master",
"False",
],
},
],
restartPolicy: "OnFailure",
{
replicas: updatedParams.workers,
template: {
spec: {
containers: [
{
image: if updatedParams.workerGpu > 0 then updatedParams.gpuImage else updatedParams.cpuImage,
name: "tensorflow",
command: workerCommand,
env: containerEnv,
[if updatedParams.workerGpu > 0 then "resources"]: {
limits: {
"nvidia.com/gpu": updatedParams.workerGpu,
},
},
},
],
restartPolicy: "OnFailure",
},
},
tfReplicaType: "WORKER",
},
tfReplicaType: "WORKER",
},
{
replicas: 1,
template: {
spec: {
containers: [
{
image: params.image,
name: "tensorflow",
command: [
"bash",
],
args: [
"/home/jovyan/ps_dist_launcher.sh",
],
},
],
restartPolicy: "OnFailure",
{
replicas: updatedParams.ps,
template: {
spec: {
containers: [
{
image: updatedParams.cpuImage,
name: "tensorflow",
command: psCommand,
env: containerEnv,
},
],
restartPolicy: "OnFailure",
},
},
tfReplicaType: "PS",
},
],
terminationPolicy: {
chief: {
replicaIndex: 0,
replicaName: "MASTER",
},
tfReplicaType: "PS",
},
],
terminationPolicy: {
chief: {
replicaIndex: 0,
replicaName: "MASTER",
},
},
},
},
}
}, // job
}, //parts
}

View File

@ -1,14 +1,21 @@
FROM gcr.io/kubeflow-images-public/tensorflow-1.7.0-notebook-gpu:latest
# Docker image to train a model using T2T
#
# For GPU use gcr.io/kubeflow-images-public/tensorflow-1.7.0-notebook-gpu:latest
# For CPU use gcr.io/kubeflow-images-public/tensorflow-1.7.0-notebook-cpu:latest
ARG BASE_IMAGE=gcr.io/kubeflow-images-public/tensorflow-1.7.0-notebook-gpu:latest
FROM $BASE_IMAGE
# Install pip packages as user jovyan
RUN pip install tensor2tensor h5py
USER root
RUN pip install tensor2tensor && \
apt-get install -y jq
RUN apt-get install -y jq
COPY __init__.py github/__init__.py
COPY github_problem.py github/github_problem.py
COPY ps_dist_launcher.sh github/ps_dist_launcher.sh
COPY train_dist_launcher.sh github/train_dist_launcher.sh
COPY t2t_launcher.sh github/t2t_launcher.sh
RUN chmod a+rx github/t2t_launcher.sh
RUN chown -R jovyan:users /home/jovyan/github

View File

@ -0,0 +1,60 @@
# Copyright 2017 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Requirements:
# https://github.com/mattrobenolt/jinja2-cli
# pip install jinja2-cli
# Update the Airflow deployment
# List any changed files. We only include files in the notebooks directory
# because that is the code in the docker image.
# In particular we exclude changes to the ksonnet configs.
# Output of `git diff-files` for the trainer source directory; empty when
# nothing there has been modified.
CHANGED_FILES := $(shell git diff-files --relative=github_issue_summarization/tensor2tensor/github)
ifeq ($(strip $(CHANGED_FILES)),)
# Changed files is empty; not dirty
# Don't include --dirty because it could be dirty if files outside the ones we care
# about changed.
TAG := $(shell date +v%Y%m%d)-$(shell git describe --tags --always)
else
# Dirty: tag with --dirty plus the first 6 characters of the SHA-256 of `git diff`.
TAG := $(shell date +v%Y%m%d)-$(shell git describe --tags --always --dirty)-$(shell git diff | shasum -a256 | cut -c -6)
endif
# NOTE(review): DIR is not referenced in the visible part of this Makefile.
DIR := $(shell pwd)
# You can override this on the command line as
# make PROJECT=kubeflow-examples <target>
PROJECT := kubeflow-examples
IMG := gcr.io/$(PROJECT)/issue-summarization-t2t-trainer
CPU_IMG := $(IMG)-cpu
CPU_BASE_IMG := gcr.io/kubeflow-images-public/tensorflow-1.7.0-notebook-cpu:v20180419-0ad94c4e
# Print the computed changed-file list and image tag.
echo:
@echo changed files $(CHANGED_FILES)
@echo tag $(TAG)
# Push the CPU image; depends on build-cpu so the image is built first.
push-cpu: build-cpu
gcloud docker -- push $(CPU_IMG):$(TAG)
# Push the CPU image, then set the cpuImage param of the ksonnet
# tensor2tensor component to the pushed image.
set-image: push-cpu
# Set the image to use
cd ../../ks-kubeflow && ks param set tensor2tensor cpuImage $(CPU_IMG):$(TAG)
# To build without the cache set the environment variable
# export DOCKER_BUILD_OPTS=--no-cache
# Build the CPU image from Dockerfile, passing CPU_BASE_IMG as BASE_IMAGE.
build-cpu:
docker build ${DOCKER_BUILD_OPTS} --build-arg BASE_IMAGE=$(CPU_BASE_IMG) -f Dockerfile -t $(CPU_IMG):$(TAG) ./
@echo Built $(CPU_IMG):$(TAG)

View File

@ -1,26 +0,0 @@
#!/bin/bash
# TODO(ankushagarwal): Convert this to a python launcher script
set -x
export TF_CONFIG=${TF_CONFIG}
echo "TF_CONFIG = ${TF_CONFIG}"
OUTDIR=./out
DATA_DIR=gs://kubeflow-examples/tensor2tensor/data
TMP_DIR=./tmp
PROBLEM=github_issue_summarization_problem
USR_DIR=./github
HPARAMS_SET=transformer_github_issues
WORKER_ID=$(echo ${TF_CONFIG} | jq ".task.index")
WORKER_TYPE=$(echo ${TF_CONFIG} | jq -r ".task.type")
MASTER_INSTANCE=$(echo ${TF_CONFIG} | jq -r ".cluster.${WORKER_TYPE}[${WORKER_ID}]")
rm -rf "${OUTDIR}" "${TMP_DIR}"
mkdir -p "${OUTDIR}"
mkdir -p "${TMP_DIR}"
t2t-trainer \
--data_dir=${DATA_DIR} \
--t2t_usr_dir=${USR_DIR} \
--problems=${PROBLEM} \
--model=transformer \
--hparams_set=${HPARAMS_SET} \
--output_dir=$OUTDIR --job-dir=$OUTDIR --train_steps=1000 \
--master=grpc://${MASTER_INSTANCE} \
--schedule=run_std_server

View File

@ -0,0 +1,15 @@
#!/bin/bash
# Launcher for t2t-trainer.
#
# Derives --master and --worker_id from the TF_CONFIG environment variable
# (a JSON document read with jq) and forwards every command line argument
# unchanged to t2t-trainer. Exits with t2t-trainer's exit status.
set -x
echo environment
env | sort
# TF_CONFIG is quoted so the JSON reaches jq verbatim: unquoted, it would be
# subject to word splitting and glob expansion (e.g. of "[...]").
WORKER_ID=$(echo "${TF_CONFIG}" | jq ".task.index")
WORKER_TYPE=$(echo "${TF_CONFIG}" | jq -r ".task.type")
# Address of this replica: cluster.<task type>[<task index>].
MASTER_INSTANCE=$(echo "${TF_CONFIG}" | jq -r ".cluster.${WORKER_TYPE}[${WORKER_ID}]")
t2t-trainer \
--master="grpc://${MASTER_INSTANCE}" \
--worker_id="${WORKER_ID}" \
--tmp_dir=/tmp \
"$@"
# Remember the trainer's exit status; otherwise the script would exit with the
# status of the trailing sleep and a failed run would be reported as success.
STATUS=$?
# Sleep to give fluentd time to capture logs
sleep 120
exit "${STATUS}"

View File

@ -1,39 +0,0 @@
#!/bin/bash
# TODO(ankushagarwal): Convert this to a python launcher script
set -x
PS_REPLICAS="${1}"
WORKER_REPLICAS="${2}"
WORKER_GPU="${3}"
TRAIN_STEPS="${4}"
WORKER_JOB="${5}"
SYNC="${6}"
export TF_CONFIG=$(echo ${TF_CONFIG} | sed 's/"worker"/"master"/g')
echo "TF_CONFIG = ${TF_CONFIG}"
OUTDIR=./out
DATA_DIR=gs://kubeflow-examples/tensor2tensor/data
TMP_DIR=./tmp
PROBLEM=github_issue_summarization_problem
USR_DIR=./github
HPARAMS_SET=transformer_github_issues
WORKER_ID=$(echo ${TF_CONFIG} | jq ".task.index")
WORKER_TYPE=$(echo ${TF_CONFIG} | jq -r ".task.type")
MASTER_INSTANCE=$(echo ${TF_CONFIG} | jq -r ".cluster.${WORKER_TYPE}[${WORKER_ID}]")
rm -rf "${OUTDIR}" "${TMP_DIR}"
mkdir -p "${OUTDIR}"
mkdir -p "${TMP_DIR}"
t2t-trainer \
--data_dir=${DATA_DIR} \
--t2t_usr_dir=${USR_DIR} \
--problems=${PROBLEM} \
--model=transformer \
--hparams_set=${HPARAMS_SET} \
--output_dir=$OUTDIR --job-dir=$OUTDIR --train_steps=${TRAIN_STEPS} \
--master=grpc://${MASTER_INSTANCE} \
--ps_replicas=${PS_REPLICAS} \
--worker_replicas=${WORKER_REPLICAS} \
--worker_gpu=${WORKER_GPU} \
--worker_id=${WORKER_ID} \
--worker_job=${WORKER_JOB} \
--ps_gpu=0 \
--schedule=train \
--sync=${SYNC}