mirror of https://github.com/kubeflow/examples.git
Improvements to the tensor2tensor trainer for the GitHub summarization example. (#109)
* Improvements to the tensor2tensor trainer for the GitHub summarization example.
* Simplify the launcher; we can just pass through most command line arguments
rather than using a mix of environment variables and command line arguments.
* This makes it easier to control the job just by setting the parameters in the template
rather than having to rebuild the images.
* Add a Makefile to build the image.
* Replace the tensor2tensor jsonnet with a newer version of the jsonnet used with T2T.
* Address reviewer comments.
* Install pip packages as user jovyan.
* Rely on implicit string conversion with concatenation in template file.
This commit is contained in:
parent
afdd4c544e
commit
79aa2074cd
|
|
@ -45,6 +45,7 @@
|
||||||
replicas: 2,
|
replicas: 2,
|
||||||
},
|
},
|
||||||
tensor2tensor: {
|
tensor2tensor: {
|
||||||
|
cpuImage: "gcr.io/kubeflow-examples/issue-summarization-t2t-trainer-cpu:v20180428-9da5cb7-dirty-4e1f35",
|
||||||
namespace: "null",
|
namespace: "null",
|
||||||
},
|
},
|
||||||
tensorboard: {
|
tensorboard: {
|
||||||
|
|
|
||||||
|
|
@ -4,4 +4,4 @@ local k = import "k.libsonnet";
|
||||||
|
|
||||||
local tensor2tensor = import "tensor2tensor.libsonnet";
|
local tensor2tensor = import "tensor2tensor.libsonnet";
|
||||||
|
|
||||||
std.prune(k.core.v1.list.new([tensor2tensor.parts(params)]))
|
std.prune(k.core.v1.list.new([tensor2tensor.parts(params, env).job]))
|
||||||
|
|
|
||||||
|
|
@ -1,95 +1,150 @@
|
||||||
{
|
{
|
||||||
parts(params):: {
|
parts(params, env):: {
|
||||||
apiVersion: "kubeflow.org/v1alpha1",
|
// Define some defaults.
|
||||||
kind: "TFJob",
|
local updatedParams = {
|
||||||
metadata: {
|
sync: "0",
|
||||||
name: "tensor2tensor",
|
|
||||||
namespace: params.namespace,
|
dataDir: "gs://kubeflow-examples-data/gh_issue_summarization/data",
|
||||||
},
|
usrDir: "./github",
|
||||||
spec: {
|
problem: "github_issue_summarization_problem",
|
||||||
replicaSpecs: [
|
|
||||||
{
|
model: "transformer_encoder",
|
||||||
replicas: 1,
|
hparams: "transformer_github_issues",
|
||||||
template: {
|
hparamsSet: "transformer_github_issues",
|
||||||
spec: {
|
outputGCSPath: "gs://kubecon-gh-demo/gh-t2t-out/temp",
|
||||||
containers: [
|
|
||||||
{
|
gpuImage: null,
|
||||||
image: params.image,
|
cpuImage: null,
|
||||||
name: "tensorflow",
|
|
||||||
command: [
|
trainSteps: 20000,
|
||||||
"bash",
|
evalSteps: 10,
|
||||||
],
|
|
||||||
args: [
|
psGpu: 0,
|
||||||
"/home/jovyan/train_dist_launcher.sh",
|
workerGpu: 0,
|
||||||
"1",
|
|
||||||
params.workers,
|
workers: 3,
|
||||||
"0",
|
masters: 1,
|
||||||
params.train_steps,
|
ps: 1,
|
||||||
"/job:master",
|
|
||||||
"False",
|
jobName: "tensor2tensor",
|
||||||
],
|
} + params,
|
||||||
},
|
|
||||||
],
|
local containerEnv = [
|
||||||
restartPolicy: "OnFailure",
|
{
|
||||||
|
name: "PYTHONPATH",
|
||||||
|
value: "/home/jovyan",
|
||||||
|
}
|
||||||
|
],
|
||||||
|
local baseCommand = [
|
||||||
|
"/home/jovyan/github/t2t_launcher.sh",
|
||||||
|
"--train_steps=" + updatedParams.trainSteps,
|
||||||
|
"--hparams_set=" + updatedParams.hparams,
|
||||||
|
"--model=" + updatedParams.model,
|
||||||
|
"--problems=" + updatedParams.problem,
|
||||||
|
"--t2t_usr_dir=" + updatedParams.usrDir,
|
||||||
|
"--data_dir=" + updatedParams.dataDir,
|
||||||
|
"--output_dir=" + updatedParams.outputGCSPath,
|
||||||
|
],
|
||||||
|
local psCommand = baseCommand + [
|
||||||
|
"--schedule=run_std_server",
|
||||||
|
],
|
||||||
|
local totalWorkerReplicas = updatedParams.workers + updatedParams.masters,
|
||||||
|
local workerBaseCommand = baseCommand + [
|
||||||
|
"--schedule=train",
|
||||||
|
"--sync=" + updatedParams.sync,
|
||||||
|
"--ps_gpu=" + updatedParams.psGpu,
|
||||||
|
"--worker_gpu=" + updatedParams.workerGpu,
|
||||||
|
// We explicitly want to add worker and master replicas together to get the total passed as --worker_replicas.
|
||||||
|
"--worker_replicas=" + totalWorkerReplicas,
|
||||||
|
"--ps_replicas=" + updatedParams.ps,
|
||||||
|
"--eval_steps=" + updatedParams.evalSteps,
|
||||||
|
],
|
||||||
|
local workerCommand = workerBaseCommand + [
|
||||||
|
"--worker_job=/job:worker",
|
||||||
|
],
|
||||||
|
local masterCommand = workerBaseCommand + [
|
||||||
|
"--worker_job=/job:master",
|
||||||
|
],
|
||||||
|
local namespace = env.namespace,
|
||||||
|
|
||||||
|
job:: {
|
||||||
|
apiVersion: "kubeflow.org/v1alpha1",
|
||||||
|
kind: "TFJob",
|
||||||
|
metadata: {
|
||||||
|
name: updatedParams.jobName,
|
||||||
|
namespace: env.namespace,
|
||||||
|
},
|
||||||
|
spec: {
|
||||||
|
replicaSpecs: [
|
||||||
|
{
|
||||||
|
replicas: 1,
|
||||||
|
template: {
|
||||||
|
spec: {
|
||||||
|
containers: [
|
||||||
|
{
|
||||||
|
image: if updatedParams.workerGpu > 0 then updatedParams.gpuImage else updatedParams.cpuImage,
|
||||||
|
name: "tensorflow",
|
||||||
|
command: masterCommand,
|
||||||
|
env: containerEnv,
|
||||||
|
[if updatedParams.workerGpu > 0 then "resources"]: {
|
||||||
|
limits: {
|
||||||
|
"nvidia.com/gpu": updatedParams.workerGpu,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
],
|
||||||
|
restartPolicy: "OnFailure",
|
||||||
|
},
|
||||||
},
|
},
|
||||||
|
tfReplicaType: "MASTER",
|
||||||
},
|
},
|
||||||
tfReplicaType: "MASTER",
|
{
|
||||||
},
|
replicas: updatedParams.workers,
|
||||||
{
|
template: {
|
||||||
replicas: params.workers,
|
spec: {
|
||||||
template: {
|
containers: [
|
||||||
spec: {
|
{
|
||||||
containers: [
|
image: if updatedParams.workerGpu > 0 then updatedParams.gpuImage else updatedParams.cpuImage,
|
||||||
{
|
name: "tensorflow",
|
||||||
image: params.image,
|
command: workerCommand,
|
||||||
name: "tensorflow",
|
env: containerEnv,
|
||||||
command: [
|
[if updatedParams.workerGpu > 0 then "resources"]: {
|
||||||
"bash",
|
limits: {
|
||||||
],
|
"nvidia.com/gpu": updatedParams.workerGpu,
|
||||||
args: [
|
},
|
||||||
"/home/jovyan/train_dist_launcher.sh",
|
},
|
||||||
"1",
|
},
|
||||||
params.workers,
|
],
|
||||||
"0",
|
restartPolicy: "OnFailure",
|
||||||
params.train_steps,
|
},
|
||||||
"/job:master",
|
|
||||||
"False",
|
|
||||||
],
|
|
||||||
},
|
|
||||||
],
|
|
||||||
restartPolicy: "OnFailure",
|
|
||||||
},
|
},
|
||||||
|
tfReplicaType: "WORKER",
|
||||||
},
|
},
|
||||||
tfReplicaType: "WORKER",
|
{
|
||||||
},
|
replicas: updatedParams.ps,
|
||||||
{
|
template: {
|
||||||
replicas: 1,
|
spec: {
|
||||||
template: {
|
containers: [
|
||||||
spec: {
|
{
|
||||||
containers: [
|
image: updatedParams.cpuImage,
|
||||||
{
|
name: "tensorflow",
|
||||||
image: params.image,
|
command: psCommand,
|
||||||
name: "tensorflow",
|
env: containerEnv,
|
||||||
command: [
|
},
|
||||||
"bash",
|
],
|
||||||
],
|
restartPolicy: "OnFailure",
|
||||||
args: [
|
},
|
||||||
"/home/jovyan/ps_dist_launcher.sh",
|
|
||||||
],
|
|
||||||
},
|
|
||||||
],
|
|
||||||
restartPolicy: "OnFailure",
|
|
||||||
},
|
},
|
||||||
|
tfReplicaType: "PS",
|
||||||
|
},
|
||||||
|
],
|
||||||
|
terminationPolicy: {
|
||||||
|
chief: {
|
||||||
|
replicaIndex: 0,
|
||||||
|
replicaName: "MASTER",
|
||||||
},
|
},
|
||||||
tfReplicaType: "PS",
|
|
||||||
},
|
|
||||||
],
|
|
||||||
terminationPolicy: {
|
|
||||||
chief: {
|
|
||||||
replicaIndex: 0,
|
|
||||||
replicaName: "MASTER",
|
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
}, // job
|
||||||
},
|
}, //parts
|
||||||
}
|
}
|
||||||
|
|
@ -1,14 +1,21 @@
|
||||||
FROM gcr.io/kubeflow-images-public/tensorflow-1.7.0-notebook-gpu:latest
|
# Docker image to train a model using T2T
|
||||||
|
#
|
||||||
|
# For GPU use gcr.io/kubeflow-images-public/tensorflow-1.7.0-notebook-gpu:latest
|
||||||
|
# For CPU use gcr.io/kubeflow-images-public/tensorflow-1.7.0-notebook-cpu:latest
|
||||||
|
ARG BASE_IMAGE=gcr.io/kubeflow-images-public/tensorflow-1.7.0-notebook-gpu:latest
|
||||||
|
FROM $BASE_IMAGE
|
||||||
|
|
||||||
|
# Install pip packages as user jovyan
|
||||||
|
RUN pip install tensor2tensor h5py
|
||||||
|
|
||||||
USER root
|
USER root
|
||||||
|
|
||||||
RUN pip install tensor2tensor && \
|
RUN apt-get install -y jq
|
||||||
apt-get install -y jq
|
|
||||||
|
|
||||||
COPY __init__.py github/__init__.py
|
COPY __init__.py github/__init__.py
|
||||||
COPY github_problem.py github/github_problem.py
|
COPY github_problem.py github/github_problem.py
|
||||||
COPY ps_dist_launcher.sh github/ps_dist_launcher.sh
|
COPY t2t_launcher.sh github/t2t_launcher.sh
|
||||||
COPY train_dist_launcher.sh github/train_dist_launcher.sh
|
RUN chmod a+rx github/t2t_launcher.sh
|
||||||
|
|
||||||
RUN chown -R jovyan:users /home/jovyan/github
|
RUN chown -R jovyan:users /home/jovyan/github
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,60 @@
|
||||||
|
# Copyright 2017 The Kubernetes Authors.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
#
|
||||||
|
# Requirements:
|
||||||
|
# https://github.com/mattrobenolt/jinja2-cli
|
||||||
|
# pip install jinja2-cli
|
||||||
|
# Build and push the T2T trainer Docker image
|
||||||
|
|
||||||
|
# List any changed files. We only include files in the tensor2tensor/github directory
|
||||||
|
# because that is the code in the docker image.
|
||||||
|
# In particular we exclude changes to the ksonnet configs.
|
||||||
|
CHANGED_FILES := $(shell git diff-files --relative=github_issue_summarization/tensor2tensor/github)
|
||||||
|
|
||||||
|
ifeq ($(strip $(CHANGED_FILES)),)
|
||||||
|
# Changed files is empty; not dirty
|
||||||
|
# Don't include --dirty because it could be dirty if files outside the ones we care
|
||||||
|
# about changed.
|
||||||
|
TAG := $(shell date +v%Y%m%d)-$(shell git describe --tags --always)
|
||||||
|
else
|
||||||
|
TAG := $(shell date +v%Y%m%d)-$(shell git describe --tags --always --dirty)-$(shell git diff | shasum -a256 | cut -c -6)
|
||||||
|
endif
|
||||||
|
|
||||||
|
DIR := $(shell pwd)
|
||||||
|
|
||||||
|
# You can override this on the command line as
|
||||||
|
# make PROJECT=kubeflow-examples <target>
|
||||||
|
PROJECT := kubeflow-examples
|
||||||
|
|
||||||
|
IMG := gcr.io/$(PROJECT)/issue-summarization-t2t-trainer
|
||||||
|
CPU_IMG := $(IMG)-cpu
|
||||||
|
|
||||||
|
CPU_BASE_IMG := gcr.io/kubeflow-images-public/tensorflow-1.7.0-notebook-cpu:v20180419-0ad94c4e
|
||||||
|
|
||||||
|
echo:
|
||||||
|
@echo changed files $(CHANGED_FILES)
|
||||||
|
@echo tag $(TAG)
|
||||||
|
|
||||||
|
push-cpu: build-cpu
|
||||||
|
gcloud docker -- push $(CPU_IMG):$(TAG)
|
||||||
|
|
||||||
|
set-image: push-cpu
|
||||||
|
# Set the image to use
|
||||||
|
cd ../../ks-kubeflow && ks param set tensor2tensor cpuImage $(CPU_IMG):$(TAG)
|
||||||
|
|
||||||
|
# To build without the cache set the environment variable
|
||||||
|
# export DOCKER_BUILD_OPTS=--no-cache
|
||||||
|
build-cpu:
|
||||||
|
docker build ${DOCKER_BUILD_OPTS} --build-arg BASE_IMAGE=$(CPU_BASE_IMG) -f Dockerfile -t $(CPU_IMG):$(TAG) ./
|
||||||
|
@echo Built $(CPU_IMG):$(TAG)
|
||||||
|
|
@ -1,26 +0,0 @@
|
||||||
#!/bin/bash
|
|
||||||
# TODO(ankushagarwal): Convert this to a python launcher script
|
|
||||||
set -x
|
|
||||||
export TF_CONFIG=${TF_CONFIG}
|
|
||||||
echo "TF_CONFIG = ${TF_CONFIG}"
|
|
||||||
OUTDIR=./out
|
|
||||||
DATA_DIR=gs://kubeflow-examples/tensor2tensor/data
|
|
||||||
TMP_DIR=./tmp
|
|
||||||
PROBLEM=github_issue_summarization_problem
|
|
||||||
USR_DIR=./github
|
|
||||||
HPARAMS_SET=transformer_github_issues
|
|
||||||
WORKER_ID=$(echo ${TF_CONFIG} | jq ".task.index")
|
|
||||||
WORKER_TYPE=$(echo ${TF_CONFIG} | jq -r ".task.type")
|
|
||||||
MASTER_INSTANCE=$(echo ${TF_CONFIG} | jq -r ".cluster.${WORKER_TYPE}[${WORKER_ID}]")
|
|
||||||
rm -rf "${OUTDIR}" "${TMP_DIR}"
|
|
||||||
mkdir -p "${OUTDIR}"
|
|
||||||
mkdir -p "${TMP_DIR}"
|
|
||||||
t2t-trainer \
|
|
||||||
--data_dir=${DATA_DIR} \
|
|
||||||
--t2t_usr_dir=${USR_DIR} \
|
|
||||||
--problems=${PROBLEM} \
|
|
||||||
--model=transformer \
|
|
||||||
--hparams_set=${HPARAMS_SET} \
|
|
||||||
--output_dir=$OUTDIR --job-dir=$OUTDIR --train_steps=1000 \
|
|
||||||
--master=grpc://${MASTER_INSTANCE} \
|
|
||||||
--schedule=run_std_server
|
|
||||||
|
|
@ -0,0 +1,15 @@
|
||||||
|
#!/bin/bash
|
||||||
|
set -x
|
||||||
|
echo environment
|
||||||
|
env | sort
|
||||||
|
WORKER_ID=$(echo ${TF_CONFIG} | jq ".task.index")
|
||||||
|
WORKER_TYPE=$(echo ${TF_CONFIG} | jq -r ".task.type")
|
||||||
|
MASTER_INSTANCE=$(echo ${TF_CONFIG} | jq -r ".cluster.${WORKER_TYPE}[${WORKER_ID}]")
|
||||||
|
t2t-trainer \
|
||||||
|
--master=grpc://${MASTER_INSTANCE} \
|
||||||
|
--worker_id=${WORKER_ID} \
|
||||||
|
--tmp_dir=/tmp \
|
||||||
|
"$@"
|
||||||
|
|
||||||
|
# Sleep to give fluentd time to capture logs
|
||||||
|
sleep 120
|
||||||
|
|
@ -1,39 +0,0 @@
|
||||||
#!/bin/bash
|
|
||||||
# TODO(ankushagarwal): Convert this to a python launcher script
|
|
||||||
set -x
|
|
||||||
PS_REPLICAS="${1}"
|
|
||||||
WORKER_REPLICAS="${2}"
|
|
||||||
WORKER_GPU="${3}"
|
|
||||||
TRAIN_STEPS="${4}"
|
|
||||||
WORKER_JOB="${5}"
|
|
||||||
SYNC="${6}"
|
|
||||||
export TF_CONFIG=$(echo ${TF_CONFIG} | sed 's/"worker"/"master"/g')
|
|
||||||
echo "TF_CONFIG = ${TF_CONFIG}"
|
|
||||||
OUTDIR=./out
|
|
||||||
DATA_DIR=gs://kubeflow-examples/tensor2tensor/data
|
|
||||||
TMP_DIR=./tmp
|
|
||||||
PROBLEM=github_issue_summarization_problem
|
|
||||||
USR_DIR=./github
|
|
||||||
HPARAMS_SET=transformer_github_issues
|
|
||||||
WORKER_ID=$(echo ${TF_CONFIG} | jq ".task.index")
|
|
||||||
WORKER_TYPE=$(echo ${TF_CONFIG} | jq -r ".task.type")
|
|
||||||
MASTER_INSTANCE=$(echo ${TF_CONFIG} | jq -r ".cluster.${WORKER_TYPE}[${WORKER_ID}]")
|
|
||||||
rm -rf "${OUTDIR}" "${TMP_DIR}"
|
|
||||||
mkdir -p "${OUTDIR}"
|
|
||||||
mkdir -p "${TMP_DIR}"
|
|
||||||
t2t-trainer \
|
|
||||||
--data_dir=${DATA_DIR} \
|
|
||||||
--t2t_usr_dir=${USR_DIR} \
|
|
||||||
--problems=${PROBLEM} \
|
|
||||||
--model=transformer \
|
|
||||||
--hparams_set=${HPARAMS_SET} \
|
|
||||||
--output_dir=$OUTDIR --job-dir=$OUTDIR --train_steps=${TRAIN_STEPS} \
|
|
||||||
--master=grpc://${MASTER_INSTANCE} \
|
|
||||||
--ps_replicas=${PS_REPLICAS} \
|
|
||||||
--worker_replicas=${WORKER_REPLICAS} \
|
|
||||||
--worker_gpu=${WORKER_GPU} \
|
|
||||||
--worker_id=${WORKER_ID} \
|
|
||||||
--worker_job=${WORKER_JOB} \
|
|
||||||
--ps_gpu=0 \
|
|
||||||
--schedule=train \
|
|
||||||
--sync=${SYNC}
|
|
||||||
Loading…
Reference in New Issue