Fix v1alpha2 version of the T2T training job. (#158)

* Update the Docker image for T2T to use a newer version of the T2T library.

* Add parameters for setting the GCP secret; GCP credentials are needed to
  read from GCS even when the bucket is public. The defaults match the
  secret that a GKE deployment creates automatically (see the parameter
  sketch after this list).

* Create a v1alpha2 template for the job that uses PVC.
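
For reference, on a cluster that is not using the GKE deployment, the new secret parameters can be overridden in the ksonnet app's params.libsonnet. A minimal sketch, assuming a hypothetical secret named my-gcp-sa that holds a key.json service-account key (neither name comes from this commit):

  "tensor2tensor-v1alpha2": {
    name: "tensor2tensor-v1alpha2",
    // Hypothetical secret, e.g. created with:
    //   kubectl create secret generic my-gcp-sa --from-file=key.json
    gcpSecretName: "my-gcp-sa",
    gcpSecretFile: "key.json",
  },
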
Jeremy Lewi 2018-06-29 12:26:18 -07:00 committed by k8s-ci-robot
parent 93db7e369e
commit 98ed4b4a69
5 changed files with 108 additions and 4 deletions

@@ -5,7 +5,6 @@ local env = std.extVar("__ksonnet/environments");
local params = std.extVar("__ksonnet/params").components["data-pvc"];
local k = import "k.libsonnet";
local pvc = {
apiVersion: "v1",
kind: "PersistentVolumeClaim",

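The data-pvc component shown above continues with the usual PersistentVolumeClaim metadata and spec. A rough sketch of the remainder, with the access mode and storage size as assumptions rather than values taken from this commit:

  metadata: {
    name: "data-pvc",
    namespace: env.namespace,
  },
  spec: {
    accessModes: ["ReadWriteOnce"],  // assumed access mode
    resources: {
      requests: {
        storage: "10Gi",  // assumed size
      },
    },
  },
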
@@ -91,5 +91,8 @@
name: "tensor2tensor-v1alpha2",
},
"data-downloader": {},
"tfjob-pvc-v1alpha2": {
name: "tfjob-pvc-v1alpha2",
},
},
}

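These registry entries are merged over each component's own defaults via Jsonnet's right-biased object addition (defaultParams + overrideParams in the component file), which is why a nearly empty entry is enough. A minimal worked example (the 500000 override is hypothetical):

  local defaults = { claim_name: "data-pvc", sample_size: "2000000" };
  local overrides = { sample_size: "500000" };
  // Fields on the right win; everything else is kept:
  defaults + overrides  // == { claim_name: "data-pvc", sample_size: "500000" }
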
@@ -10,16 +10,18 @@ local updatedParams = {
sync: "0",
dataDir: "gs://kubeflow-examples-data/gh_issue_summarization/data",
usrDir: "./github",
// usrDir needs to match the directory inside the container where the problem is defined.
usrDir: "/home/jovyan/github",
problem: "github_issue_summarization_problem",
model: "transformer_encoder",
hparams: "transformer_github_issues",
hparamsSet: "transformer_github_issues",
// Set this to the path you want to write to.
outputGCSPath: "gs://kubecon-gh-demo/gh-t2t-out/temp",
gpuImage: null,
cpuImage: "gcr.io/kubeflow-examples/issue-summarization-t2t-trainer-cpu:v20180629-v0.1-2-g4e8b4cb",
cpuImage: "gcr.io/kubeflow-examples/issue-summarization-t2t-trainer-cpu:v20180629-v0.1-3-g6e7dfda-dirty-6804c5",
trainSteps: 20000,
evalSteps: 10,
@@ -31,6 +33,9 @@ local updatedParams = {
masters: 1,
ps: 1,
gcpSecretFile: "user-gcp-sa.json",
gcpSecretName: "user-gcp-sa",
jobName: "tensor2tensor",
} + params;
@@ -39,6 +44,10 @@ local containerEnv = [
name: "PYTHONPATH",
value: "/home/jovyan",
},
{
name: "GOOGLE_APPLICATION_CREDENTIALS",
value: "/secret/gcp-credentials/" + updatedParams.gcpSecretFile,
},
];
local baseCommand = [
@@ -77,6 +86,23 @@ local masterCommand = workerBaseCommand + [
"--worker_job=/job:master",
];
local volumeMounts = [
{
name: "gcp-credentials",
mountPath: "/secret/gcp-credentials",
readOnly: true,
},
];
local volumes = [
{
name: "gcp-credentials",
secret: {
secretName: updatedParams.gcpSecretName,
},
},
];
local tfjob = {
apiVersion: "kubeflow.org/v1alpha2",
kind: "TFJob",
@@ -96,6 +122,7 @@ local tfjob = {
name: "tensorflow",
command: masterCommand,
env: containerEnv,
volumeMounts: volumeMounts,
resources: if updatedParams.workerGpu > 0 then {
limits: {
"nvidia.com/gpu": updatedParams.workerGpu,
@@ -103,6 +130,7 @@ local tfjob = {
} else null,
},
],
volumes: volumes,
restartPolicy: "OnFailure",
},
},
@@ -118,6 +146,7 @@ local tfjob = {
name: "tensorflow",
command: workerCommand,
env: containerEnv,
volumeMounts: volumeMounts,
resources:
if updatedParams.workerGpu > 0 then {
limits: {
@@ -126,6 +155,7 @@ local tfjob = {
} else null,
},
],
volumes: volumes,
restartPolicy: "OnFailure",
},
},
@@ -140,8 +170,10 @@ local tfjob = {
name: "tensorflow",
command: psCommand,
env: containerEnv,
volumeMounts: volumeMounts,
},
],
volumes: volumes,
restartPolicy: "OnFailure",
},
},

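The same gcp-credentials volume and mount are attached to the master, worker, and PS replicas above. If the secret should be optional (for example on a cluster that has no user-gcp-sa secret), one possible variation, not part of this commit, is to gate both lists on the parameter:

  // Sketch only: treat the string "null" as "no secret configured"
  // (a common ksonnet parameter convention, assumed here).
  local gcpEnabled = updatedParams.gcpSecretName != "null";
  local volumes = if gcpEnabled then [
    {
      name: "gcp-credentials",
      secret: { secretName: updatedParams.gcpSecretName },
    },
  ] else [];
  local volumeMounts = if gcpEnabled then [
    {
      name: "gcp-credentials",
      mountPath: "/secret/gcp-credentials",
      readOnly: true,
    },
  ] else [];
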
@@ -0,0 +1,70 @@
local env = std.extVar("__ksonnet/environments");
local overrideParams = std.extVar("__ksonnet/params").components["tfjob-pvc-v1alpha2"];
local k = import "k.libsonnet";
local namespace = env.namespace;
local defaultParams = {
image: "gcr.io/kubeflow-dev/tf-job-issue-summarization:v20180425-e79f888",
input_data: "/data/github_issues.csv",
output_model: "/data/model.h5",
sample_size: "2000000",
claim_name: "data-pvc",
};
local params = defaultParams + overrideParams;
local name = params.name;
local tfjob = {
apiVersion: "kubeflow.org/v1alpha2",
kind: "TFJob",
metadata: {
name: name,
namespace: namespace,
},
spec: {
tfReplicaSpecs: {
Master: {
replicas: 1,
template: {
spec: {
containers: [
{
image: params.image,
name: "tensorflow",
volumeMounts: [
{
name: "data",
mountPath: "/data",
},
],
command: [
"python",
"/workdir/train.py",
"--sample_size=" + std.toString(params.sample_size),
"--input_data=" + params.input_data,
"--output_model=" + params.output_model,
],
},
],
volumes: [
{
name: "data",
persistentVolumeClaim: {
claimName: params.claim_name,
},
},
],
restartPolicy: "OnFailure",
},
}, // template
},
},
},
};
k.core.v1.list.new([
tfjob,
])

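The final k.core.v1.list.new([tfjob]) call wraps the TFJob in a v1 List so the component renders as a single manifest; assuming the standard ksonnet-lib behavior, it is roughly equivalent to writing:

  {
    apiVersion: "v1",
    kind: "List",
    items: [
      tfjob,
    ],
  }
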
@@ -6,7 +6,7 @@ ARG BASE_IMAGE=gcr.io/kubeflow-images-public/tensorflow-1.7.0-notebook-gpu:lates
FROM $BASE_IMAGE
# Install pip packages as user jovyan
RUN pip install tensor2tensor h5py
RUN pip install tensor2tensor==1.6.6 h5py
USER root