mirror of https://github.com/kubeflow/examples.git
Improvements to the tensor2tensor trainer for the GitHub summarization example. (#109)
* Improvements to the tensor2tensor trainer for the GitHub summarization example.
* Simplify the launcher; we can just pass most command line arguments straight through
instead of using a mix of environment variables and command line arguments.
* This makes it easier to control the job just by setting the parameters in the template
rather than having to rebuild the images.
* Add a Makefile to build the image.
* Replace the tensor2tensor jsonnet with a newer version of the jsonnet used with T2T.
* Address reviewer comments.
* Install pip packages as user Jovyan
* Rely on implicit string conversion with concatenation in template file.
This commit is contained in:
parent
afdd4c544e
commit
79aa2074cd
|
|
@ -45,6 +45,7 @@
|
|||
replicas: 2,
|
||||
},
|
||||
tensor2tensor: {
|
||||
cpuImage: "gcr.io/kubeflow-examples/issue-summarization-t2t-trainer-cpu:v20180428-9da5cb7-dirty-4e1f35",
|
||||
namespace: "null",
|
||||
},
|
||||
tensorboard: {
|
||||
|
|
|
|||
|
|
@ -4,4 +4,4 @@ local k = import "k.libsonnet";
|
|||
|
||||
local tensor2tensor = import "tensor2tensor.libsonnet";
|
||||
|
||||
std.prune(k.core.v1.list.new([tensor2tensor.parts(params)]))
|
||||
std.prune(k.core.v1.list.new([tensor2tensor.parts(params, env).job]))
|
||||
|
|
|
|||
|
|
@ -1,95 +1,150 @@
|
|||
{
|
||||
parts(params):: {
|
||||
apiVersion: "kubeflow.org/v1alpha1",
|
||||
kind: "TFJob",
|
||||
metadata: {
|
||||
name: "tensor2tensor",
|
||||
namespace: params.namespace,
|
||||
},
|
||||
spec: {
|
||||
replicaSpecs: [
|
||||
{
|
||||
replicas: 1,
|
||||
template: {
|
||||
spec: {
|
||||
containers: [
|
||||
{
|
||||
image: params.image,
|
||||
name: "tensorflow",
|
||||
command: [
|
||||
"bash",
|
||||
],
|
||||
args: [
|
||||
"/home/jovyan/train_dist_launcher.sh",
|
||||
"1",
|
||||
params.workers,
|
||||
"0",
|
||||
params.train_steps,
|
||||
"/job:master",
|
||||
"False",
|
||||
],
|
||||
},
|
||||
],
|
||||
restartPolicy: "OnFailure",
|
||||
parts(params, env):: {
|
||||
// Define some defaults.
|
||||
local updatedParams = {
|
||||
sync: "0",
|
||||
|
||||
dataDir: "gs://kubeflow-examples-data/gh_issue_summarization/data",
|
||||
usrDir: "./github",
|
||||
problem: "github_issue_summarization_problem",
|
||||
|
||||
model: "transformer_encoder",
|
||||
hparams: "transformer_github_issues",
|
||||
hparamsSet: "transformer_github_issues",
|
||||
outputGCSPath: "gs://kubecon-gh-demo/gh-t2t-out/temp",
|
||||
|
||||
gpuImage: null,
|
||||
cpuImage: null,
|
||||
|
||||
trainSteps: 20000,
|
||||
evalSteps: 10,
|
||||
|
||||
psGpu: 0,
|
||||
workerGpu: 0,
|
||||
|
||||
workers: 3,
|
||||
masters: 1,
|
||||
ps: 1,
|
||||
|
||||
jobName: "tensor2tensor",
|
||||
} + params,
|
||||
|
||||
local containerEnv = [
|
||||
{
|
||||
name: "PYTHONPATH",
|
||||
value: "/home/jovyan",
|
||||
}
|
||||
],
|
||||
local baseCommand = [
|
||||
"/home/jovyan/github/t2t_launcher.sh",
|
||||
"--train_steps=" + updatedParams.trainSteps,
|
||||
"--hparams_set=" + updatedParams.hparams,
|
||||
"--model=" + updatedParams.model,
|
||||
"--problems=" + updatedParams.problem,
|
||||
"--t2t_usr_dir=" + updatedParams.usrDir,
|
||||
"--data_dir=" + updatedParams.dataDir,
|
||||
"--output_dir=" + updatedParams.outputGCSPath,
|
||||
],
|
||||
local psCommand = baseCommand + [
|
||||
"--schedule=run_std_server",
|
||||
],
|
||||
local totalWorkerReplicas = updatedParams.workers + updatedParams.masters,
|
||||
local workerBaseCommand = baseCommand + [
|
||||
"--schedule=train",
|
||||
"--sync=" + updatedParams.sync,
|
||||
"--ps_gpu=" + updatedParams.psGpu,
|
||||
"--worker_gpu=" + updatedParams.workerGpu,
|
||||
// The master replica also runs the train schedule, so worker_replicas counts both workers and masters.
|
||||
"--worker_replicas=" + totalWorkerReplicas,
|
||||
"--ps_replicas=" + updatedParams.ps,
|
||||
"--eval_steps=" + updatedParams.evalSteps,
|
||||
],
|
||||
local workerCommand = workerBaseCommand + [
|
||||
"--worker_job=/job:worker",
|
||||
],
|
||||
local masterCommand = workerBaseCommand + [
|
||||
"--worker_job=/job:master",
|
||||
],
|
||||
local namespace = env.namespace,
|
||||
|
||||
job:: {
|
||||
apiVersion: "kubeflow.org/v1alpha1",
|
||||
kind: "TFJob",
|
||||
metadata: {
|
||||
name: updatedParams.jobName,
|
||||
namespace: env.namespace,
|
||||
},
|
||||
spec: {
|
||||
replicaSpecs: [
|
||||
{
|
||||
replicas: 1,
|
||||
template: {
|
||||
spec: {
|
||||
containers: [
|
||||
{
|
||||
image: if updatedParams.workerGpu > 0 then updatedParams.gpuImage else updatedParams.cpuImage,
|
||||
name: "tensorflow",
|
||||
command: masterCommand,
|
||||
env: containerEnv,
|
||||
[if updatedParams.workerGpu > 0 then "resources"]: {
|
||||
limits: {
|
||||
"nvidia.com/gpu": updatedParams.workerGpu,
|
||||
},
|
||||
},
|
||||
},
|
||||
],
|
||||
restartPolicy: "OnFailure",
|
||||
},
|
||||
},
|
||||
tfReplicaType: "MASTER",
|
||||
},
|
||||
tfReplicaType: "MASTER",
|
||||
},
|
||||
{
|
||||
replicas: params.workers,
|
||||
template: {
|
||||
spec: {
|
||||
containers: [
|
||||
{
|
||||
image: params.image,
|
||||
name: "tensorflow",
|
||||
command: [
|
||||
"bash",
|
||||
],
|
||||
args: [
|
||||
"/home/jovyan/train_dist_launcher.sh",
|
||||
"1",
|
||||
params.workers,
|
||||
"0",
|
||||
params.train_steps,
|
||||
"/job:master",
|
||||
"False",
|
||||
],
|
||||
},
|
||||
],
|
||||
restartPolicy: "OnFailure",
|
||||
{
|
||||
replicas: updatedParams.workers,
|
||||
template: {
|
||||
spec: {
|
||||
containers: [
|
||||
{
|
||||
image: if updatedParams.workerGpu > 0 then updatedParams.gpuImage else updatedParams.cpuImage,
|
||||
name: "tensorflow",
|
||||
command: workerCommand,
|
||||
env: containerEnv,
|
||||
[if updatedParams.workerGpu > 0 then "resources"]: {
|
||||
limits: {
|
||||
"nvidia.com/gpu": updatedParams.workerGpu,
|
||||
},
|
||||
},
|
||||
},
|
||||
],
|
||||
restartPolicy: "OnFailure",
|
||||
},
|
||||
},
|
||||
tfReplicaType: "WORKER",
|
||||
},
|
||||
tfReplicaType: "WORKER",
|
||||
},
|
||||
{
|
||||
replicas: 1,
|
||||
template: {
|
||||
spec: {
|
||||
containers: [
|
||||
{
|
||||
image: params.image,
|
||||
name: "tensorflow",
|
||||
command: [
|
||||
"bash",
|
||||
],
|
||||
args: [
|
||||
"/home/jovyan/ps_dist_launcher.sh",
|
||||
],
|
||||
},
|
||||
],
|
||||
restartPolicy: "OnFailure",
|
||||
{
|
||||
replicas: updatedParams.ps,
|
||||
template: {
|
||||
spec: {
|
||||
containers: [
|
||||
{
|
||||
image: updatedParams.cpuImage,
|
||||
name: "tensorflow",
|
||||
command: psCommand,
|
||||
env: containerEnv,
|
||||
},
|
||||
],
|
||||
restartPolicy: "OnFailure",
|
||||
},
|
||||
},
|
||||
tfReplicaType: "PS",
|
||||
},
|
||||
],
|
||||
terminationPolicy: {
|
||||
chief: {
|
||||
replicaIndex: 0,
|
||||
replicaName: "MASTER",
|
||||
},
|
||||
tfReplicaType: "PS",
|
||||
},
|
||||
],
|
||||
terminationPolicy: {
|
||||
chief: {
|
||||
replicaIndex: 0,
|
||||
replicaName: "MASTER",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
}, // job
|
||||
}, //parts
|
||||
}
|
||||
|
|
@ -1,14 +1,21 @@
|
|||
FROM gcr.io/kubeflow-images-public/tensorflow-1.7.0-notebook-gpu:latest
|
||||
# Docker image to train a model using T2T
|
||||
#
|
||||
# For GPU use gcr.io/kubeflow-images-public/tensorflow-1.7.0-notebook-gpu:latest
|
||||
# For CPU use gcr.io/kubeflow-images-public/tensorflow-1.7.0-notebook-cpu:latest
|
||||
ARG BASE_IMAGE=gcr.io/kubeflow-images-public/tensorflow-1.7.0-notebook-gpu:latest
|
||||
FROM $BASE_IMAGE
|
||||
|
||||
# Install pip packages as user jovyan
|
||||
RUN pip install tensor2tensor h5py
|
||||
|
||||
USER root
|
||||
|
||||
RUN pip install tensor2tensor && \
|
||||
apt-get install -y jq
|
||||
RUN apt-get install -y jq
|
||||
|
||||
COPY __init__.py github/__init__.py
|
||||
COPY github_problem.py github/github_problem.py
|
||||
COPY ps_dist_launcher.sh github/ps_dist_launcher.sh
|
||||
COPY train_dist_launcher.sh github/train_dist_launcher.sh
|
||||
COPY t2t_launcher.sh github/t2t_launcher.sh
|
||||
RUN chmod a+rx github/t2t_launcher.sh
|
||||
|
||||
RUN chown -R jovyan:users /home/jovyan/github
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,60 @@
|
|||
# Copyright 2017 The Kubernetes Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
# Requirements:
|
||||
# https://github.com/mattrobenolt/jinja2-cli
|
||||
# pip install jinja2-cli
|
||||
# Build and push the Docker image for the T2T trainer
|
||||
|
||||
# List any changed files. We only include files in the tensor2tensor/github directory
|
||||
# because that is the code in the docker image.
|
||||
# In particular we exclude changes to the ksonnet configs.
|
||||
CHANGED_FILES := $(shell git diff-files --relative=github_issue_summarization/tensor2tensor/github)
|
||||
|
||||
ifeq ($(strip $(CHANGED_FILES)),)
|
||||
# Changed files is empty; not dirty
|
||||
# Don't include --dirty because it could be dirty if files outside the ones we care
|
||||
# about changed.
|
||||
TAG := $(shell date +v%Y%m%d)-$(shell git describe --tags --always)
|
||||
else
|
||||
TAG := $(shell date +v%Y%m%d)-$(shell git describe --tags --always --dirty)-$(shell git diff | shasum -a256 | cut -c -6)
|
||||
endif
|
||||
|
||||
DIR := $(shell pwd)
|
||||
|
||||
# You can override this on the command line as
|
||||
# make PROJECT=kubeflow-examples <target>
|
||||
PROJECT := kubeflow-examples
|
||||
|
||||
IMG := gcr.io/$(PROJECT)/issue-summarization-t2t-trainer
|
||||
CPU_IMG := $(IMG)-cpu
|
||||
|
||||
CPU_BASE_IMG := gcr.io/kubeflow-images-public/tensorflow-1.7.0-notebook-cpu:v20180419-0ad94c4e
|
||||
|
||||
echo:
|
||||
@echo changed files $(CHANGED_FILES)
|
||||
@echo tag $(TAG)
|
||||
|
||||
push-cpu: build-cpu
|
||||
gcloud docker -- push $(CPU_IMG):$(TAG)
|
||||
|
||||
set-image: push-cpu
|
||||
# Set the image to use
|
||||
cd ../../ks-kubeflow && ks param set tensor2tensor cpuImage $(CPU_IMG):$(TAG)
|
||||
|
||||
# To build without the cache set the environment variable
|
||||
# export DOCKER_BUILD_OPTS=--no-cache
|
||||
build-cpu:
|
||||
docker build ${DOCKER_BUILD_OPTS} --build-arg BASE_IMAGE=$(CPU_BASE_IMG) -f Dockerfile -t $(CPU_IMG):$(TAG) ./
|
||||
@echo Built $(CPU_IMG):$(TAG)
|
||||
|
|
@ -1,26 +0,0 @@
|
|||
#!/bin/bash
|
||||
# TODO(ankushagarwal): Convert this to a python launcher script
|
||||
set -x
|
||||
export TF_CONFIG=${TF_CONFIG}
|
||||
echo "TF_CONFIG = ${TF_CONFIG}"
|
||||
OUTDIR=./out
|
||||
DATA_DIR=gs://kubeflow-examples/tensor2tensor/data
|
||||
TMP_DIR=./tmp
|
||||
PROBLEM=github_issue_summarization_problem
|
||||
USR_DIR=./github
|
||||
HPARAMS_SET=transformer_github_issues
|
||||
WORKER_ID=$(echo ${TF_CONFIG} | jq ".task.index")
|
||||
WORKER_TYPE=$(echo ${TF_CONFIG} | jq -r ".task.type")
|
||||
MASTER_INSTANCE=$(echo ${TF_CONFIG} | jq -r ".cluster.${WORKER_TYPE}[${WORKER_ID}]")
|
||||
rm -rf "${OUTDIR}" "${TMP_DIR}"
|
||||
mkdir -p "${OUTDIR}"
|
||||
mkdir -p "${TMP_DIR}"
|
||||
t2t-trainer \
|
||||
--data_dir=${DATA_DIR} \
|
||||
--t2t_usr_dir=${USR_DIR} \
|
||||
--problems=${PROBLEM} \
|
||||
--model=transformer \
|
||||
--hparams_set=${HPARAMS_SET} \
|
||||
--output_dir=$OUTDIR --job-dir=$OUTDIR --train_steps=1000 \
|
||||
--master=grpc://${MASTER_INSTANCE} \
|
||||
--schedule=run_std_server
|
||||
|
|
@ -0,0 +1,15 @@
|
|||
#!/bin/bash
# Launcher for t2t-trainer.
#
# Derives --master and --worker_id from the TF_CONFIG environment variable and
# passes every command line argument through to t2t-trainer unchanged.
set -x
echo environment
env | sort
# TF_CONFIG is quoted so the shell neither word-splits it nor treats the
# brackets in the JSON as glob patterns.
WORKER_ID=$(echo "${TF_CONFIG}" | jq ".task.index")
WORKER_TYPE=$(echo "${TF_CONFIG}" | jq -r ".task.type")
# Entry of the cluster spec selected by this task's type and index.
MASTER_INSTANCE=$(echo "${TF_CONFIG}" | jq -r ".cluster.${WORKER_TYPE}[${WORKER_ID}]")
t2t-trainer \
  --master="grpc://${MASTER_INSTANCE}" \
  --worker_id="${WORKER_ID}" \
  --tmp_dir=/tmp \
  "$@"
# Remember the trainer's exit status: without this the script would finish with
# the status of the sleep below (0) and hide a failed trainer from the caller.
TRAINER_STATUS=$?

# Sleep to give fluentd time to capture logs
sleep 120
exit "${TRAINER_STATUS}"
|
||||
|
|
@ -1,39 +0,0 @@
|
|||
#!/bin/bash
|
||||
# TODO(ankushagarwal): Convert this to a python launcher script
|
||||
set -x
|
||||
PS_REPLICAS="${1}"
|
||||
WORKER_REPLICAS="${2}"
|
||||
WORKER_GPU="${3}"
|
||||
TRAIN_STEPS="${4}"
|
||||
WORKER_JOB="${5}"
|
||||
SYNC="${6}"
|
||||
export TF_CONFIG=$(echo ${TF_CONFIG} | sed 's/"worker"/"master"/g')
|
||||
echo "TF_CONFIG = ${TF_CONFIG}"
|
||||
OUTDIR=./out
|
||||
DATA_DIR=gs://kubeflow-examples/tensor2tensor/data
|
||||
TMP_DIR=./tmp
|
||||
PROBLEM=github_issue_summarization_problem
|
||||
USR_DIR=./github
|
||||
HPARAMS_SET=transformer_github_issues
|
||||
WORKER_ID=$(echo ${TF_CONFIG} | jq ".task.index")
|
||||
WORKER_TYPE=$(echo ${TF_CONFIG} | jq -r ".task.type")
|
||||
MASTER_INSTANCE=$(echo ${TF_CONFIG} | jq -r ".cluster.${WORKER_TYPE}[${WORKER_ID}]")
|
||||
rm -rf "${OUTDIR}" "${TMP_DIR}"
|
||||
mkdir -p "${OUTDIR}"
|
||||
mkdir -p "${TMP_DIR}"
|
||||
t2t-trainer \
|
||||
--data_dir=${DATA_DIR} \
|
||||
--t2t_usr_dir=${USR_DIR} \
|
||||
--problems=${PROBLEM} \
|
||||
--model=transformer \
|
||||
--hparams_set=${HPARAMS_SET} \
|
||||
--output_dir=$OUTDIR --job-dir=$OUTDIR --train_steps=${TRAIN_STEPS} \
|
||||
--master=grpc://${MASTER_INSTANCE} \
|
||||
--ps_replicas=${PS_REPLICAS} \
|
||||
--worker_replicas=${WORKER_REPLICAS} \
|
||||
--worker_gpu=${WORKER_GPU} \
|
||||
--worker_id=${WORKER_ID} \
|
||||
--worker_job=${WORKER_JOB} \
|
||||
--ps_gpu=0 \
|
||||
--schedule=train \
|
||||
--sync=${SYNC}
|
||||
Loading…
Reference in New Issue