diff --git a/github_issue_summarization/ks-kubeflow/components/params.libsonnet b/github_issue_summarization/ks-kubeflow/components/params.libsonnet
index cb9dfd0a..060ce49f 100644
--- a/github_issue_summarization/ks-kubeflow/components/params.libsonnet
+++ b/github_issue_summarization/ks-kubeflow/components/params.libsonnet
@@ -45,6 +45,7 @@
       replicas: 2,
     },
     tensor2tensor: {
+      cpuImage: "gcr.io/kubeflow-examples/issue-summarization-t2t-trainer-cpu:v20180428-9da5cb7-dirty-4e1f35",
       namespace: "null",
     },
     tensorboard: {
diff --git a/github_issue_summarization/ks-kubeflow/components/tensor2tensor.jsonnet b/github_issue_summarization/ks-kubeflow/components/tensor2tensor.jsonnet
index 914e8322..08983c68 100644
--- a/github_issue_summarization/ks-kubeflow/components/tensor2tensor.jsonnet
+++ b/github_issue_summarization/ks-kubeflow/components/tensor2tensor.jsonnet
@@ -4,4 +4,4 @@ local k = import "k.libsonnet";
 
 local tensor2tensor = import "tensor2tensor.libsonnet";
 
-std.prune(k.core.v1.list.new([tensor2tensor.parts(params)]))
+std.prune(k.core.v1.list.new([tensor2tensor.parts(params, env).job]))
diff --git a/github_issue_summarization/ks-kubeflow/components/tensor2tensor.libsonnet b/github_issue_summarization/ks-kubeflow/components/tensor2tensor.libsonnet
index 359293b6..d440b2ee 100644
--- a/github_issue_summarization/ks-kubeflow/components/tensor2tensor.libsonnet
+++ b/github_issue_summarization/ks-kubeflow/components/tensor2tensor.libsonnet
@@ -1,95 +1,150 @@
 {
-  parts(params):: {
-    apiVersion: "kubeflow.org/v1alpha1",
-    kind: "TFJob",
-    metadata: {
-      name: "tensor2tensor",
-      namespace: params.namespace,
-    },
-    spec: {
-      replicaSpecs: [
-        {
-          replicas: 1,
-          template: {
-            spec: {
-              containers: [
-                {
-                  image: params.image,
-                  name: "tensorflow",
-                  command: [
-                    "bash",
-                  ],
-                  args: [
-                    "/home/jovyan/train_dist_launcher.sh",
-                    "1",
-                    params.workers,
-                    "0",
-                    params.train_steps,
-                    "/job:master",
-                    "False",
-                  ],
-                },
-              ],
-              restartPolicy: "OnFailure",
+  parts(params, env):: {
+    // Define some defaults.
+    local updatedParams = {
+      sync: "0",
+
+      dataDir: "gs://kubeflow-examples-data/gh_issue_summarization/data",
+      usrDir: "./github",
+      problem: "github_issue_summarization_problem",
+
+      model: "transformer_encoder",
+      hparams: "transformer_github_issues",
+      hparamsSet: "transformer_github_issues",
+      outputGCSPath: "gs://kubecon-gh-demo/gh-t2t-out/temp",
+
+      gpuImage: null,
+      cpuImage: null,
+
+      trainSteps: 20000,
+      evalSteps: 10,
+
+      psGpu: 0,
+      workerGpu: 0,
+
+      workers: 3,
+      masters: 1,
+      ps: 1,
+
+      jobName: "tensor2tensor",
+    } + params,
+
+    local containerEnv = [
+      {
+        name: "PYTHONPATH",
+        value: "/home/jovyan",
+      }
+    ],
+    local baseCommand = [
+      "/home/jovyan/github/t2t_launcher.sh",
+      "--train_steps=" + updatedParams.trainSteps,
+      "--hparams_set=" + updatedParams.hparams,  // NOTE(review): reads hparams, so hparamsSet above is never used - confirm which is intended.
+      "--model=" + updatedParams.model,
+      "--problems=" + updatedParams.problem,
+      "--t2t_usr_dir=" + updatedParams.usrDir,
+      "--data_dir=" + updatedParams.dataDir,
+      "--output_dir=" + updatedParams.outputGCSPath,
+    ],
+    local psCommand = baseCommand + [
+      "--schedule=run_std_server",
+    ],
+    local totalWorkerReplicas = updatedParams.workers + updatedParams.masters,
+    local workerBaseCommand = baseCommand + [
+      "--schedule=train",
+      "--sync=" + updatedParams.sync,
+      "--ps_gpu=" + updatedParams.psGpu,
+      "--worker_gpu=" + updatedParams.workerGpu,
+      // --worker_replicas is deliberately the sum of the worker and master replica counts.
+      "--worker_replicas=" + totalWorkerReplicas,
+      "--ps_replicas=" + updatedParams.ps,
+      "--eval_steps=" + updatedParams.evalSteps,
+    ],
+    local workerCommand = workerBaseCommand + [
+      "--worker_job=/job:worker",
+    ],
+    local masterCommand = workerBaseCommand + [
+      "--worker_job=/job:master",
+    ],
+    local namespace = env.namespace,
+
+    job:: {
+      apiVersion: "kubeflow.org/v1alpha1",
+      kind: "TFJob",
+      metadata: {
+        name: updatedParams.jobName,
+        namespace: namespace,
+      },
+      spec: {
+        replicaSpecs: [
+          {
+            replicas: 1,
+            template: {
+              spec: {
+                containers: [
+                  {
+                    image: if updatedParams.workerGpu > 0 then updatedParams.gpuImage else updatedParams.cpuImage,
+                    name: "tensorflow",
+                    command: masterCommand,
+                    env: containerEnv,
+                    [if updatedParams.workerGpu > 0 then "resources"]: {
+                      limits: {
+                        "nvidia.com/gpu": updatedParams.workerGpu,
+                      },
+                    },
+                  },
+                ],
+                restartPolicy: "OnFailure",
+              },
             },
+            tfReplicaType: "MASTER",
           },
-          tfReplicaType: "MASTER",
-        },
-        {
-          replicas: params.workers,
-          template: {
-            spec: {
-              containers: [
-                {
-                  image: params.image,
-                  name: "tensorflow",
-                  command: [
-                    "bash",
-                  ],
-                  args: [
-                    "/home/jovyan/train_dist_launcher.sh",
-                    "1",
-                    params.workers,
-                    "0",
-                    params.train_steps,
-                    "/job:master",
-                    "False",
-                  ],
-                },
-              ],
-              restartPolicy: "OnFailure",
+          {
+            replicas: updatedParams.workers,
+            template: {
+              spec: {
+                containers: [
+                  {
+                    image: if updatedParams.workerGpu > 0 then updatedParams.gpuImage else updatedParams.cpuImage,
+                    name: "tensorflow",
+                    command: workerCommand,
+                    env: containerEnv,
+                    [if updatedParams.workerGpu > 0 then "resources"]: {
+                      limits: {
+                        "nvidia.com/gpu": updatedParams.workerGpu,
+                      },
+                    },
+                  },
+                ],
+                restartPolicy: "OnFailure",
+              },
             },
+            tfReplicaType: "WORKER",
           },
-          tfReplicaType: "WORKER",
-        },
-        {
-          replicas: 1,
-          template: {
-            spec: {
-              containers: [
-                {
-                  image: params.image,
-                  name: "tensorflow",
-                  command: [
-                    "bash",
-                  ],
-                  args: [
-                    "/home/jovyan/ps_dist_launcher.sh",
-                  ],
-                },
-              ],
-              restartPolicy: "OnFailure",
+          {
+            replicas: updatedParams.ps,
+            template: {
+              spec: {
+                containers: [
+                  {
+                    image: updatedParams.cpuImage,
+                    name: "tensorflow",
+                    command: psCommand,
+                    env: containerEnv,
+                  },
+                ],
+                restartPolicy: "OnFailure",
+              },
             },
+            tfReplicaType: "PS",
+          },
+        ],
+        terminationPolicy: {
+          chief: {
+            replicaIndex: 0,
+            replicaName: "MASTER",
           },
-          tfReplicaType: "PS",
-        },
-      ],
-      terminationPolicy: {
-        chief: {
-          replicaIndex: 0,
-          replicaName: "MASTER",
         },
       },
-    },
-  },
-}
+    },  // job
+  },  //parts
+}
diff --git a/github_issue_summarization/tensor2tensor/github/Dockerfile b/github_issue_summarization/tensor2tensor/github/Dockerfile
index 6321ecde..4900fc4f 100644
--- a/github_issue_summarization/tensor2tensor/github/Dockerfile
+++ b/github_issue_summarization/tensor2tensor/github/Dockerfile
@@ -1,14 +1,21 @@
-FROM gcr.io/kubeflow-images-public/tensorflow-1.7.0-notebook-gpu:latest
+# Docker image to train a model using T2T
+#
+# For GPU use gcr.io/kubeflow-images-public/tensorflow-1.7.0-notebook-gpu:latest
+# For CPU use gcr.io/kubeflow-images-public/tensorflow-1.7.0-notebook-cpu:latest
+ARG BASE_IMAGE=gcr.io/kubeflow-images-public/tensorflow-1.7.0-notebook-gpu:latest
+FROM $BASE_IMAGE
+
+# Install pip packages as user jovyan
+RUN pip install tensor2tensor h5py
 
 USER root
 
-RUN pip install tensor2tensor && \
-    apt-get install -y jq
+RUN apt-get update && apt-get install -y jq
 
 COPY __init__.py github/__init__.py
 COPY github_problem.py github/github_problem.py
-COPY ps_dist_launcher.sh github/ps_dist_launcher.sh
-COPY train_dist_launcher.sh github/train_dist_launcher.sh
+COPY t2t_launcher.sh github/t2t_launcher.sh
+RUN chmod a+rx github/t2t_launcher.sh
 
 RUN chown -R jovyan:users /home/jovyan/github
 
diff --git a/github_issue_summarization/tensor2tensor/github/Makefile b/github_issue_summarization/tensor2tensor/github/Makefile
new file mode 100644
index 00000000..2c6f6a5f
--- /dev/null
+++ b/github_issue_summarization/tensor2tensor/github/Makefile
@@ -0,0 +1,62 @@
+# Copyright 2017 The Kubernetes Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Requirements:
+#   https://github.com/mattrobenolt/jinja2-cli
+#   pip install jinja2-cli
+# Update the Airflow deployment
+
+# List any changed files. We only include files in this directory
+# because that is the code in the docker image.
+# In particular we exclude changes to the ksonnet configs.
+CHANGED_FILES := $(shell git diff-files --relative=github_issue_summarization/tensor2tensor/github)
+
+ifeq ($(strip $(CHANGED_FILES)),)
+# Changed files is empty; not dirty
+# Don't include --dirty because it could be dirty if files outside the ones we care
+# about changed.
+TAG := $(shell date +v%Y%m%d)-$(shell git describe --tags --always)
+else
+TAG := $(shell date +v%Y%m%d)-$(shell git describe --tags --always --dirty)-$(shell git diff | shasum -a256 | cut -c -6)
+endif
+
+DIR := $(shell pwd)
+
+# You can override this on the command line as
+# make PROJECT=kubeflow-examples
+PROJECT := kubeflow-examples
+
+IMG := gcr.io/$(PROJECT)/issue-summarization-t2t-trainer
+CPU_IMG := $(IMG)-cpu
+
+CPU_BASE_IMG := gcr.io/kubeflow-images-public/tensorflow-1.7.0-notebook-cpu:v20180419-0ad94c4e
+
+.PHONY: echo push-cpu set-image build-cpu
+
+echo:
+	@echo changed files $(CHANGED_FILES)
+	@echo tag $(TAG)
+
+push-cpu: build-cpu
+	gcloud docker -- push $(CPU_IMG):$(TAG)
+
+set-image: push-cpu
+	# Set the image to use
+	cd ../../ks-kubeflow && ks param set tensor2tensor cpuImage $(CPU_IMG):$(TAG)
+
+# To build without the cache set the environment variable
+# export DOCKER_BUILD_OPTS=--no-cache
+build-cpu:
+	docker build ${DOCKER_BUILD_OPTS} --build-arg BASE_IMAGE=$(CPU_BASE_IMG) -f Dockerfile -t $(CPU_IMG):$(TAG) ./
+	@echo Built $(CPU_IMG):$(TAG)
diff --git a/github_issue_summarization/tensor2tensor/github/ps_dist_launcher.sh b/github_issue_summarization/tensor2tensor/github/ps_dist_launcher.sh
deleted file mode 100755
index fb423670..00000000
--- a/github_issue_summarization/tensor2tensor/github/ps_dist_launcher.sh
+++ /dev/null
@@ -1,26 +0,0 @@
-#!/bin/bash
-# TODO(ankushagarwal): Convert this to a python launcher script
-set -x
-export TF_CONFIG=${TF_CONFIG}
-echo "TF_CONFIG = ${TF_CONFIG}"
-OUTDIR=./out
-DATA_DIR=gs://kubeflow-examples/tensor2tensor/data
-TMP_DIR=./tmp
-PROBLEM=github_issue_summarization_problem
-USR_DIR=./github
-HPARAMS_SET=transformer_github_issues
-WORKER_ID=$(echo ${TF_CONFIG} | jq ".task.index")
-WORKER_TYPE=$(echo ${TF_CONFIG} | jq -r ".task.type")
-MASTER_INSTANCE=$(echo ${TF_CONFIG} | jq -r ".cluster.${WORKER_TYPE}[${WORKER_ID}]")
-rm -rf "${OUTDIR}" "${TMP_DIR}"
-mkdir -p "${OUTDIR}"
-mkdir -p "${TMP_DIR}"
-t2t-trainer \
-  --data_dir=${DATA_DIR} \
-  --t2t_usr_dir=${USR_DIR} \
-  --problems=${PROBLEM} \
-  --model=transformer \
-  --hparams_set=${HPARAMS_SET} \
-  --output_dir=$OUTDIR --job-dir=$OUTDIR --train_steps=1000 \
-  --master=grpc://${MASTER_INSTANCE} \
-  --schedule=run_std_server
diff --git a/github_issue_summarization/tensor2tensor/github/t2t_launcher.sh b/github_issue_summarization/tensor2tensor/github/t2t_launcher.sh
new file mode 100644
index 00000000..aa2dee57
--- /dev/null
+++ b/github_issue_summarization/tensor2tensor/github/t2t_launcher.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+#
+# Runs t2t-trainer for one TFJob replica. The --master address and
+# --worker_id are derived from TF_CONFIG; every other flag is passed
+# through unchanged from the container command.
+set -x
+echo environment
+env | sort
+WORKER_ID=$(echo "${TF_CONFIG}" | jq ".task.index")
+WORKER_TYPE=$(echo "${TF_CONFIG}" | jq -r ".task.type")
+MASTER_INSTANCE=$(echo "${TF_CONFIG}" | jq -r ".cluster.${WORKER_TYPE}[${WORKER_ID}]")
+t2t-trainer \
+  --master=grpc://${MASTER_INSTANCE} \
+  --worker_id=${WORKER_ID} \
+  --tmp_dir=/tmp \
+  "$@"
+# Save the trainer's exit status; otherwise the script would always exit with
+# the status of the sleep below and a failed training run would look successful.
+TRAINER_STATUS=$?
+
+# Sleep to give fluentd time to capture logs
+sleep 120
+exit "${TRAINER_STATUS}"
diff --git a/github_issue_summarization/tensor2tensor/github/train_dist_launcher.sh b/github_issue_summarization/tensor2tensor/github/train_dist_launcher.sh
deleted file mode 100755
index 532287bd..00000000
--- a/github_issue_summarization/tensor2tensor/github/train_dist_launcher.sh
+++ /dev/null
@@ -1,39 +0,0 @@
-#!/bin/bash
-# TODO(ankushagarwal): Convert this to a python launcher script
-set -x
-PS_REPLICAS="${1}"
-WORKER_REPLICAS="${2}"
-WORKER_GPU="${3}"
-TRAIN_STEPS="${4}"
-WORKER_JOB="${5}"
-SYNC="${6}"
-export TF_CONFIG=$(echo ${TF_CONFIG} | sed 's/"worker"/"master"/g')
-echo "TF_CONFIG = ${TF_CONFIG}"
-OUTDIR=./out
-DATA_DIR=gs://kubeflow-examples/tensor2tensor/data
-TMP_DIR=./tmp
-PROBLEM=github_issue_summarization_problem
-USR_DIR=./github
-HPARAMS_SET=transformer_github_issues
-WORKER_ID=$(echo ${TF_CONFIG} | jq ".task.index")
-WORKER_TYPE=$(echo ${TF_CONFIG} | jq -r ".task.type")
-MASTER_INSTANCE=$(echo ${TF_CONFIG} | jq -r ".cluster.${WORKER_TYPE}[${WORKER_ID}]")
-rm -rf "${OUTDIR}" "${TMP_DIR}"
-mkdir -p "${OUTDIR}"
-mkdir -p "${TMP_DIR}"
-t2t-trainer \
-  --data_dir=${DATA_DIR} \
-  --t2t_usr_dir=${USR_DIR} \
-  --problems=${PROBLEM} \
-  --model=transformer \
-  --hparams_set=${HPARAMS_SET} \
-  --output_dir=$OUTDIR --job-dir=$OUTDIR --train_steps=${TRAIN_STEPS} \
-  --master=grpc://${MASTER_INSTANCE} \
-  --ps_replicas=${PS_REPLICAS} \
-  --worker_replicas=${WORKER_REPLICAS} \
-  --worker_gpu=${WORKER_GPU} \
-  --worker_id=${WORKER_ID} \
-  --worker_job=${WORKER_JOB} \
-  --ps_gpu=0 \
-  --schedule=train \
-  --sync=${SYNC}