mirror of https://github.com/kubeflow/examples.git
				
				
				
			Fix v1alpha2 version of the T2T training job. (#158)
* Update the Docker image for T2T to use a newer version of the T2T library.
* Add parameters to set the GCP secret; we need GCP credentials to read from GCS even when reading a public bucket. We default to the parameters that are created automatically in the case of a GKE deployment.
* Create a v1alpha2 template for the job that uses a PVC.
This commit is contained in:
		
							parent
							
								
									93db7e369e
								
							
						
					
					
						commit
						98ed4b4a69
					
				| 
						 | 
				
			
			@ -5,7 +5,6 @@ local env = std.extVar("__ksonnet/environments");
 | 
			
		|||
local params = std.extVar("__ksonnet/params").components["data-pvc"];
 | 
			
		||||
local k = import "k.libsonnet";
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
local pvc = {
 | 
			
		||||
  apiVersion: "v1",
 | 
			
		||||
  kind: "PersistentVolumeClaim",
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -91,5 +91,8 @@
 | 
			
		|||
      name: "tensor2tensor-v1alpha2",
 | 
			
		||||
    },
 | 
			
		||||
    "data-downloader": {},
 | 
			
		||||
    "tfjob-pvc-v1alpha2": {
 | 
			
		||||
      name: "tfjob-pvc-v1alpha2",
 | 
			
		||||
    },
 | 
			
		||||
  },
 | 
			
		||||
}
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -10,16 +10,18 @@ local updatedParams = {
 | 
			
		|||
  sync: "0",
 | 
			
		||||
 | 
			
		||||
  dataDir: "gs://kubeflow-examples-data/gh_issue_summarization/data",
 | 
			
		||||
  usrDir: "./github",
 | 
			
		||||
  // usrDir needs to match the directory inside the container where the problem is defined.
 | 
			
		||||
  usrDir: "/home/jovyan/github",
 | 
			
		||||
  problem: "github_issue_summarization_problem",
 | 
			
		||||
 | 
			
		||||
  model: "transformer_encoder",
 | 
			
		||||
  hparams: "transformer_github_issues",
 | 
			
		||||
  hparamsSet: "transformer_github_issues",
 | 
			
		||||
  // Set this to the path you want to write to.
 | 
			
		||||
  outputGCSPath: "gs://kubecon-gh-demo/gh-t2t-out/temp",
 | 
			
		||||
 | 
			
		||||
  gpuImage: null,
 | 
			
		||||
  cpuImage: "gcr.io/kubeflow-examples/issue-summarization-t2t-trainer-cpu:v20180629-v0.1-2-g4e8b4cb",
 | 
			
		||||
  cpuImage: "gcr.io/kubeflow-examples/issue-summarization-t2t-trainer-cpu:v20180629-v0.1-3-g6e7dfda-dirty-6804c5",
 | 
			
		||||
 | 
			
		||||
  trainSteps: 20000,
 | 
			
		||||
  evalSteps: 10,
 | 
			
		||||
| 
						 | 
				
			
			@ -31,6 +33,9 @@ local updatedParams = {
 | 
			
		|||
  masters: 1,
 | 
			
		||||
  ps: 1,
 | 
			
		||||
 | 
			
		||||
  gcpSecretFile: "user-gcp-sa.json",
 | 
			
		||||
  gcpSecretName: "user-gcp-sa",
 | 
			
		||||
 | 
			
		||||
  jobName: "tensor2tensor",
 | 
			
		||||
} + params;
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -39,6 +44,10 @@ local containerEnv = [
 | 
			
		|||
    name: "PYTHONPATH",
 | 
			
		||||
    value: "/home/jovyan",
 | 
			
		||||
  },
 | 
			
		||||
  {
 | 
			
		||||
    name: "GOOGLE_APPLICATION_CREDENTIALS",
 | 
			
		||||
    value: "/secret/gcp-credentials/" + updatedParams.gcpSecretFile,
 | 
			
		||||
  },
 | 
			
		||||
];
 | 
			
		||||
 | 
			
		||||
local baseCommand = [
 | 
			
		||||
| 
						 | 
				
			
			@ -77,6 +86,23 @@ local masterCommand = workerBaseCommand + [
 | 
			
		|||
  "--worker_job=/job:master",
 | 
			
		||||
];
 | 
			
		||||
 | 
			
		||||
local volumeMounts = [
 | 
			
		||||
  {
 | 
			
		||||
    name: "gcp-credentials",
 | 
			
		||||
    mountPath: "/secret/gcp-credentials",
 | 
			
		||||
    readOnly: true,
 | 
			
		||||
  },
 | 
			
		||||
];
 | 
			
		||||
 | 
			
		||||
local volumes = [
 | 
			
		||||
  {
 | 
			
		||||
    name: "gcp-credentials",
 | 
			
		||||
    secret: {
 | 
			
		||||
      secretName: updatedParams.gcpSecretName,
 | 
			
		||||
    },
 | 
			
		||||
  },
 | 
			
		||||
];
 | 
			
		||||
 | 
			
		||||
local tfjob = {
 | 
			
		||||
  apiVersion: "kubeflow.org/v1alpha2",
 | 
			
		||||
  kind: "TFJob",
 | 
			
		||||
| 
						 | 
				
			
			@ -96,6 +122,7 @@ local tfjob = {
 | 
			
		|||
                name: "tensorflow",
 | 
			
		||||
                command: masterCommand,
 | 
			
		||||
                env: containerEnv,
 | 
			
		||||
                volumeMounts: volumeMounts,
 | 
			
		||||
                resources: if updatedParams.workerGpu > 0 then {
 | 
			
		||||
                  limits: {
 | 
			
		||||
                    "nvidia.com/gpu": updatedParams.workerGpu,
 | 
			
		||||
| 
						 | 
				
			
			@ -103,6 +130,7 @@ local tfjob = {
 | 
			
		|||
                } else null,
 | 
			
		||||
              },
 | 
			
		||||
            ],
 | 
			
		||||
            volumes: volumes,
 | 
			
		||||
            restartPolicy: "OnFailure",
 | 
			
		||||
          },
 | 
			
		||||
        },
 | 
			
		||||
| 
						 | 
				
			
			@ -118,6 +146,7 @@ local tfjob = {
 | 
			
		|||
                name: "tensorflow",
 | 
			
		||||
                command: workerCommand,
 | 
			
		||||
                env: containerEnv,
 | 
			
		||||
                volumeMounts: volumeMounts,
 | 
			
		||||
                // Key must be spelled "resources" (as on the master container) so the
                // conditional GPU limits below land in the container's resources field.
                resources:
 | 
			
		||||
                  if updatedParams.workerGpu > 0 then {
 | 
			
		||||
                    limits: {
 | 
			
		||||
| 
						 | 
				
			
			@ -126,6 +155,7 @@ local tfjob = {
 | 
			
		|||
                  } else null,
 | 
			
		||||
              },
 | 
			
		||||
            ],
 | 
			
		||||
            volumes: volumes,
 | 
			
		||||
            restartPolicy: "OnFailure",
 | 
			
		||||
          },
 | 
			
		||||
        },
 | 
			
		||||
| 
						 | 
				
			
			@ -140,8 +170,10 @@ local tfjob = {
 | 
			
		|||
                name: "tensorflow",
 | 
			
		||||
                command: psCommand,
 | 
			
		||||
                env: containerEnv,
 | 
			
		||||
                volumeMounts: volumeMounts,
 | 
			
		||||
              },
 | 
			
		||||
            ],
 | 
			
		||||
            volumes: volumes,
 | 
			
		||||
            restartPolicy: "OnFailure",
 | 
			
		||||
          },
 | 
			
		||||
        },
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -0,0 +1,70 @@
 | 
			
		|||
// ksonnet component "tfjob-pvc-v1alpha2".
//
// Emits a kubeflow.org/v1alpha2 TFJob with a single Master replica that runs
// /workdir/train.py, reading its input from and writing its model to a
// PersistentVolumeClaim mounted at /data.
local ksEnv = std.extVar("__ksonnet/environments");
local componentParams = std.extVar("__ksonnet/params").components["tfjob-pvc-v1alpha2"];

local k = import "k.libsonnet";

// Built-in defaults; any key present in the component's ksonnet params
// overrides the value given here.
local defaults = {
  claim_name: "data-pvc",
  image: "gcr.io/kubeflow-dev/tf-job-issue-summarization:v20180425-e79f888",
  input_data: "/data/github_issues.csv",
  output_model: "/data/model.h5",
  sample_size: "2000000",
};

local params = defaults + componentParams;

// The job name has no default: it must come from the component params.
local jobName = params.name;
local jobNamespace = ksEnv.namespace;

// Name shared by the pod volume and the container's volume mount.
local dataVolumeName = "data";

// Command line executed by the training container.
local trainCommand = [
  "python",
  "/workdir/train.py",
  "--sample_size=" + std.toString(params.sample_size),
  "--input_data=" + params.input_data,
  "--output_model=" + params.output_model,
];

local trainerContainer = {
  image: params.image,
  name: "tensorflow",
  volumeMounts: [
    {
      name: dataVolumeName,
      mountPath: "/data",
    },
  ],
  command: trainCommand,
};

local masterPodSpec = {
  containers: [trainerContainer],
  volumes: [
    {
      name: dataVolumeName,
      persistentVolumeClaim: {
        claimName: params.claim_name,
      },
    },
  ],
  restartPolicy: "OnFailure",
};

local tfjob = {
  apiVersion: "kubeflow.org/v1alpha2",
  kind: "TFJob",
  metadata: {
    name: jobName,
    namespace: jobNamespace,
  },
  spec: {
    tfReplicaSpecs: {
      Master: {
        replicas: 1,
        template: {
          spec: masterPodSpec,
        },
      },
    },
  },
};

k.core.v1.list.new([
  tfjob,
])
 | 
			
		||||
| 
						 | 
				
			
			@ -6,7 +6,7 @@ ARG BASE_IMAGE=gcr.io/kubeflow-images-public/tensorflow-1.7.0-notebook-gpu:lates
 | 
			
		|||
FROM $BASE_IMAGE
 | 
			
		||||
 | 
			
		||||
# Install pip packages as user jovyan
 | 
			
		||||
RUN pip install tensor2tensor h5py
 | 
			
		||||
RUN pip install tensor2tensor==1.6.6 h5py
 | 
			
		||||
 | 
			
		||||
USER root
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue