Fix v1alpha2 version of the T2T training job. (#158)

* Update the Docker image for T2T to use a newer version of the T2T library.

* Add parameters for setting the GCP secret; GCP credentials are needed to
  read from GCS even when the bucket is public. The defaults match the
  secret that a GKE deployment creates automatically (see the parameter
  sketch after this list).

* Create a v1alpha2 template for the job that uses PVC.
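
For reference, on a cluster that is not using the GKE deployment, the new secret parameters can be overridden in the ksonnet app's params.libsonnet. A minimal sketch, assuming a hypothetical secret named my-gcp-sa that holds a key.json service-account key (neither name comes from this commit):

  "tensor2tensor-v1alpha2": {
    name: "tensor2tensor-v1alpha2",
    // Hypothetical secret, e.g. created with:
    //   kubectl create secret generic my-gcp-sa --from-file=key.json
    gcpSecretName: "my-gcp-sa",
    gcpSecretFile: "key.json",
  },
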
Jeremy Lewi 2018-06-29 12:26:18 -07:00 committed by k8s-ci-robot
parent 93db7e369e
commit 98ed4b4a69
5 changed files with 108 additions and 4 deletions

@@ -5,7 +5,6 @@ local env = std.extVar("__ksonnet/environments");
local params = std.extVar("__ksonnet/params").components["data-pvc"];
local k = import "k.libsonnet";
local pvc = {
apiVersion: "v1",
kind: "PersistentVolumeClaim",

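The data-pvc component shown above continues with the usual PersistentVolumeClaim metadata and spec. A rough sketch of the remainder, with the access mode and storage size as assumptions rather than values taken from this commit:

  metadata: {
    name: "data-pvc",
    namespace: env.namespace,
  },
  spec: {
    accessModes: ["ReadWriteOnce"],  // assumed access mode
    resources: {
      requests: {
        storage: "10Gi",  // assumed size
      },
    },
  },
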
@@ -91,5 +91,8 @@
name: "tensor2tensor-v1alpha2",
},
"data-downloader": {},
"tfjob-pvc-v1alpha2": {
name: "tfjob-pvc-v1alpha2",
},
},
}

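These registry entries are merged over each component's own defaults via Jsonnet's right-biased object addition (defaultParams + overrideParams in the component file), which is why a nearly empty entry is enough. A minimal worked example (the 500000 override is hypothetical):

  local defaults = { claim_name: "data-pvc", sample_size: "2000000" };
  local overrides = { sample_size: "500000" };
  // Fields on the right win; everything else is kept:
  defaults + overrides  // == { claim_name: "data-pvc", sample_size: "500000" }
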
@@ -10,16 +10,18 @@ local updatedParams = {
sync: "0",
dataDir: "gs://kubeflow-examples-data/gh_issue_summarization/data",
usrDir: "./github",
// usrDir needs to match the directory inside the container where the problem is defined.
usrDir: "/home/jovyan/github",
problem: "github_issue_summarization_problem",
model: "transformer_encoder",
hparams: "transformer_github_issues",
hparamsSet: "transformer_github_issues",
// Set this to the path you want to write to.
outputGCSPath: "gs://kubecon-gh-demo/gh-t2t-out/temp",
gpuImage: null,
cpuImage: "gcr.io/kubeflow-examples/issue-summarization-t2t-trainer-cpu:v20180629-v0.1-2-g4e8b4cb",
cpuImage: "gcr.io/kubeflow-examples/issue-summarization-t2t-trainer-cpu:v20180629-v0.1-3-g6e7dfda-dirty-6804c5",
trainSteps: 20000,
evalSteps: 10,
@@ -31,6 +33,9 @@ local updatedParams = {
masters: 1,
ps: 1,
gcpSecretFile: "user-gcp-sa.json",
gcpSecretName: "user-gcp-sa",
jobName: "tensor2tensor",
} + params;
@@ -39,6 +44,10 @@ local containerEnv = [
name: "PYTHONPATH",
value: "/home/jovyan",
},
{
name: "GOOGLE_APPLICATION_CREDENTIALS",
value: "/secret/gcp-credentials/" + updatedParams.gcpSecretFile,
},
];
local baseCommand = [
@@ -77,6 +86,23 @@ local masterCommand = workerBaseCommand + [
"--worker_job=/job:master",
];
local volumeMounts = [
{
name: "gcp-credentials",
mountPath: "/secret/gcp-credentials",
readOnly: true,
},
];
local volumes = [
{
name: "gcp-credentials",
secret: {
secretName: updatedParams.gcpSecretName,
},
},
];
local tfjob = {
apiVersion: "kubeflow.org/v1alpha2",
kind: "TFJob",
@@ -96,6 +122,7 @@ local tfjob = {
name: "tensorflow",
command: masterCommand,
env: containerEnv,
volumeMounts: volumeMounts,
resources: if updatedParams.workerGpu > 0 then {
limits: {
"nvidia.com/gpu": updatedParams.workerGpu,
@@ -103,6 +130,7 @@ local tfjob = {
} else null,
},
],
volumes: volumes,
restartPolicy: "OnFailure",
},
},
@@ -118,6 +146,7 @@ local tfjob = {
name: "tensorflow",
command: workerCommand,
env: containerEnv,
volumeMounts: volumeMounts,
resources:
if updatedParams.workerGpu > 0 then {
limits: {
@@ -126,6 +155,7 @@ local tfjob = {
} else null,
},
],
volumes: volumes,
restartPolicy: "OnFailure",
},
},
@@ -140,8 +170,10 @@ local tfjob = {
name: "tensorflow",
command: psCommand,
env: containerEnv,
volumeMounts: volumeMounts,
},
],
volumes: volumes,
restartPolicy: "OnFailure",
},
},

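The same gcp-credentials volume and mount are attached to the master, worker, and PS replicas above. If the secret should be optional (for example on a cluster that has no user-gcp-sa secret), one possible variation, not part of this commit, is to gate both lists on the parameter:

  // Sketch only: treat the string "null" as "no secret configured"
  // (a common ksonnet parameter convention, assumed here).
  local gcpEnabled = updatedParams.gcpSecretName != "null";
  local volumes = if gcpEnabled then [
    {
      name: "gcp-credentials",
      secret: { secretName: updatedParams.gcpSecretName },
    },
  ] else [];
  local volumeMounts = if gcpEnabled then [
    {
      name: "gcp-credentials",
      mountPath: "/secret/gcp-credentials",
      readOnly: true,
    },
  ] else [];
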
@@ -0,0 +1,70 @@
local env = std.extVar("__ksonnet/environments");
local overrideParams = std.extVar("__ksonnet/params").components["tfjob-pvc-v1alpha2"];
local k = import "k.libsonnet";
local namespace = env.namespace;
local defaultParams = {
image: "gcr.io/kubeflow-dev/tf-job-issue-summarization:v20180425-e79f888",
input_data: "/data/github_issues.csv",
output_model: "/data/model.h5",
sample_size: "2000000",
claim_name: "data-pvc",
};
local params = defaultParams + overrideParams;
local name = params.name;
local tfjob = {
apiVersion: "kubeflow.org/v1alpha2",
kind: "TFJob",
metadata: {
name: name,
namespace: namespace,
},
spec: {
tfReplicaSpecs: {
Master: {
replicas: 1,
template: {
spec: {
containers: [
{
image: params.image,
name: "tensorflow",
volumeMounts: [
{
name: "data",
mountPath: "/data",
},
],
command: [
"python",
"/workdir/train.py",
"--sample_size=" + std.toString(params.sample_size),
"--input_data=" + params.input_data,
"--output_model=" + params.output_model,
],
},
],
volumes: [
{
name: "data",
persistentVolumeClaim: {
claimName: params.claim_name,
},
},
],
restartPolicy: "OnFailure",
},
}, // template
},
},
},
};
k.core.v1.list.new([
tfjob,
])

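The final k.core.v1.list.new([tfjob]) call wraps the TFJob in a v1 List so the component renders as a single manifest; assuming the standard ksonnet-lib behavior, it is roughly equivalent to writing:

  {
    apiVersion: "v1",
    kind: "List",
    items: [
      tfjob,
    ],
  }
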
@@ -6,7 +6,7 @@ ARG BASE_IMAGE=gcr.io/kubeflow-images-public/tensorflow-1.7.0-notebook-gpu:lates
FROM $BASE_IMAGE
# Install pip packages as user jovyan
RUN pip install tensor2tensor h5py
RUN pip install tensor2tensor==1.6.6 h5py
USER root