mirror of https://github.com/kubeflow/examples.git
Fix v1alpha2 version of the T2T training job. (#158)
* Update the Docker image for T2T to use a newer version of the T2T library.
* Add parameters to set the GCP secret; GCP credentials are needed to read from GCS even when reading a public bucket. The defaults are the values that are created automatically in the case of a GKE deployment.
* Create a v1alpha2 template for the job that uses a PVC.
This commit is contained in:
parent
93db7e369e
commit
98ed4b4a69
|
|
@ -5,7 +5,6 @@ local env = std.extVar("__ksonnet/environments");
|
|||
local params = std.extVar("__ksonnet/params").components["data-pvc"];
|
||||
local k = import "k.libsonnet";
|
||||
|
||||
|
||||
local pvc = {
|
||||
apiVersion: "v1",
|
||||
kind: "PersistentVolumeClaim",
|
||||
|
|
|
|||
|
|
@ -91,5 +91,8 @@
|
|||
name: "tensor2tensor-v1alpha2",
|
||||
},
|
||||
"data-downloader": {},
|
||||
"tfjob-pvc-v1alpha2": {
|
||||
name: "tfjob-pvc-v1alpha2",
|
||||
},
|
||||
},
|
||||
}
|
||||
|
|
|
|||
|
|
@ -10,16 +10,18 @@ local updatedParams = {
|
|||
sync: "0",
|
||||
|
||||
dataDir: "gs://kubeflow-examples-data/gh_issue_summarization/data",
|
||||
usrDir: "./github",
|
||||
// usrDir needs to match the directory inside the container where the problem is defined.
|
||||
usrDir: "/home/jovyan/github",
|
||||
problem: "github_issue_summarization_problem",
|
||||
|
||||
model: "transformer_encoder",
|
||||
hparams: "transformer_github_issues",
|
||||
hparamsSet: "transformer_github_issues",
|
||||
// Set this to the path you want to write to.
|
||||
outputGCSPath: "gs://kubecon-gh-demo/gh-t2t-out/temp",
|
||||
|
||||
gpuImage: null,
|
||||
cpuImage: "gcr.io/kubeflow-examples/issue-summarization-t2t-trainer-cpu:v20180629-v0.1-2-g4e8b4cb",
|
||||
cpuImage: "gcr.io/kubeflow-examples/issue-summarization-t2t-trainer-cpu:v20180629-v0.1-3-g6e7dfda-dirty-6804c5",
|
||||
|
||||
trainSteps: 20000,
|
||||
evalSteps: 10,
|
||||
|
|
@ -31,6 +33,9 @@ local updatedParams = {
|
|||
masters: 1,
|
||||
ps: 1,
|
||||
|
||||
gcpSecretFile: "user-gcp-sa.json",
|
||||
gcpSecretName: "user-gcp-sa",
|
||||
|
||||
jobName: "tensor2tensor",
|
||||
} + params;
|
||||
|
||||
|
|
@ -39,6 +44,10 @@ local containerEnv = [
|
|||
name: "PYTHONPATH",
|
||||
value: "/home/jovyan",
|
||||
},
|
||||
{
|
||||
name: "GOOGLE_APPLICATION_CREDENTIALS",
|
||||
value: "/secret/gcp-credentials/" + updatedParams.gcpSecretFile,
|
||||
},
|
||||
];
|
||||
|
||||
local baseCommand = [
|
||||
|
|
@ -77,6 +86,23 @@ local masterCommand = workerBaseCommand + [
|
|||
"--worker_job=/job:master",
|
||||
];
|
||||
|
||||
local volumeMounts = [
|
||||
{
|
||||
name: "gcp-credentials",
|
||||
mountPath: "/secret/gcp-credentials",
|
||||
readOnly: true,
|
||||
},
|
||||
];
|
||||
|
||||
local volumes = [
|
||||
{
|
||||
name: "gcp-credentials",
|
||||
secret: {
|
||||
secretName: updatedParams.gcpSecretName,
|
||||
},
|
||||
},
|
||||
];
|
||||
|
||||
local tfjob = {
|
||||
apiVersion: "kubeflow.org/v1alpha2",
|
||||
kind: "TFJob",
|
||||
|
|
@ -96,6 +122,7 @@ local tfjob = {
|
|||
name: "tensorflow",
|
||||
command: masterCommand,
|
||||
env: containerEnv,
|
||||
volumeMounts: volumeMounts,
|
||||
resources: if updatedParams.workerGpu > 0 then {
|
||||
limits: {
|
||||
"nvidia.com/gpu": updatedParams.workerGpu,
|
||||
|
|
@ -103,6 +130,7 @@ local tfjob = {
|
|||
} else null,
|
||||
},
|
||||
],
|
||||
volumes: volumes,
|
||||
restartPolicy: "OnFailure",
|
||||
},
|
||||
},
|
||||
|
|
@ -118,6 +146,7 @@ local tfjob = {
|
|||
name: "tensorflow",
|
||||
command: workerCommand,
|
||||
env: containerEnv,
|
||||
volumeMounts: volumeMounts,
|
||||
resources:
|
||||
if updatedParams.workerGpu > 0 then {
|
||||
limits: {
|
||||
|
|
@ -126,6 +155,7 @@ local tfjob = {
|
|||
} else null,
|
||||
},
|
||||
],
|
||||
volumes: volumes,
|
||||
restartPolicy: "OnFailure",
|
||||
},
|
||||
},
|
||||
|
|
@ -140,8 +170,10 @@ local tfjob = {
|
|||
name: "tensorflow",
|
||||
command: psCommand,
|
||||
env: containerEnv,
|
||||
volumeMounts: volumeMounts,
|
||||
},
|
||||
],
|
||||
volumes: volumes,
|
||||
restartPolicy: "OnFailure",
|
||||
},
|
||||
},
|
||||
|
|
|
|||
|
|
@ -0,0 +1,70 @@
|
|||
// ksonnet component that emits a single-master TFJob (kubeflow.org/v1alpha2).
// The job mounts a PersistentVolumeClaim at /data and runs /workdir/train.py.

// Environment settings supplied by ksonnet; only env.namespace is read here.
local env = std.extVar("__ksonnet/environments");
|
||||
// Per-component parameter overrides, taken from the "tfjob-pvc-v1alpha2"
// entry of the ksonnet params.
local overrideParams = std.extVar("__ksonnet/params").components["tfjob-pvc-v1alpha2"];
|
||||
|
||||
// ksonnet library; used at the end of the file to wrap the job in a list.
local k = import "k.libsonnet";
|
||||
|
||||
local namespace = env.namespace;
|
||||
|
||||
// Defaults for every parameter except `name`. The default input and output
// paths are under /data, which is where the claim is mounted below.
local defaultParams = {
|
||||
image: "gcr.io/kubeflow-dev/tf-job-issue-summarization:v20180425-e79f888",
|
||||
input_data: "/data/github_issues.csv",
|
||||
|
||||
output_model: "/data/model.h5",
|
||||
// Kept as a string by default; it is passed through std.toString when the
// command line is built.
sample_size: "2000000",
|
||||
// Name of the PersistentVolumeClaim mounted into the container.
claim_name: "data-pvc",
|
||||
};
|
||||
|
||||
// Object merge: fields in overrideParams replace same-named defaults.
local params = defaultParams + overrideParams;
|
||||
// `name` has no default above, so it must be provided by overrideParams.
local name = params.name;
|
||||
|
||||
// The TFJob resource: one Master replica running the training container.
local tfjob = {
|
||||
apiVersion: "kubeflow.org/v1alpha2",
|
||||
kind: "TFJob",
|
||||
metadata: {
|
||||
name: name,
|
||||
namespace: namespace,
|
||||
},
|
||||
spec: {
|
||||
tfReplicaSpecs: {
|
||||
Master: {
|
||||
replicas: 1,
|
||||
template: {
|
||||
spec: {
|
||||
containers: [
|
||||
{
|
||||
image: params.image,
|
||||
name: "tensorflow",
|
||||
// Mount the "data" volume (declared under `volumes` below) at /data.
volumeMounts: [
|
||||
{
|
||||
name: "data",
|
||||
mountPath: "/data",
|
||||
},
|
||||
],
|
||||
// Training entry point; each flag is built from the merged params.
command: [
|
||||
"python",
|
||||
"/workdir/train.py",
|
||||
// std.toString allows sample_size to be overridden with a number as well
// as a string.
"--sample_size=" + std.toString(params.sample_size),
|
||||
"--input_data=" + params.input_data,
|
||||
"--output_model=" + params.output_model,
|
||||
],
|
||||
},
|
||||
],
|
||||
// Volume backed by the claim named in params.claim_name; its name matches
// the volumeMounts entry above.
volumes: [
|
||||
{
|
||||
name: "data",
|
||||
persistentVolumeClaim: {
|
||||
claimName: params.claim_name,
|
||||
},
|
||||
},
|
||||
],
|
||||
restartPolicy: "OnFailure",
|
||||
},
|
||||
}, // template
|
||||
},
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
// Component output: a list containing only the TFJob.
k.core.v1.list.new([
|
||||
tfjob,
|
||||
|
||||
])
|
||||
|
|
@ -6,7 +6,7 @@ ARG BASE_IMAGE=gcr.io/kubeflow-images-public/tensorflow-1.7.0-notebook-gpu:lates
|
|||
FROM $BASE_IMAGE
|
||||
|
||||
# Install pip packages as user jovyan
|
||||
RUN pip install tensor2tensor h5py
|
||||
RUN pip install tensor2tensor==1.6.6 h5py
|
||||
|
||||
USER root
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue