From 2487194fbdeeedb267fa35d3e59ffcdd42239e88 Mon Sep 17 00:00:00 2001 From: Jeremy Lewi Date: Mon, 12 Nov 2018 11:09:42 +0800 Subject: [PATCH] Modify K8s models to export the models; tensorboard manifests (#320) * Modify K8s models to export the models; tensorboard manifests * Use a K8s job not a TFJob to export the model. * Start an experiments.libsonnet file to define groups of parameters for different experiments that should be reused * Need to install tensorflow_hub in the Docker image because it is required by t2t exporter. * * Address review comments. --- code_search/docker/t2t/Dockerfile | 2 +- .../demo-export-11-05-dist-sync-gpu.jsonnet | 11 +++ .../components/demo-tensorboard.jsonnet | 8 +-- .../demo-trainer-11-05-dist-sync-gpu.jsonnet | 18 ++--- .../demo-trainer-11-05-single-gpu.jsonnet | 14 ++-- .../demo-trainer-11-05-tinyparams.jsonnet | 8 +-- .../kubeflow/components/experiments.libsonnet | 11 +++ .../components/export-model.libsonnet | 67 +++++++++++++++++++ .../kubeflow/components/params.libsonnet | 5 +- .../t2t-code-search-exporter.jsonnet | 4 +- .../kubeflow/components/t2t-job.libsonnet | 15 +---- .../environments/cs_demo/globals.libsonnet | 14 ++-- .../environments/cs_demo/main.jsonnet | 2 +- .../environments/cs_demo/params.libsonnet | 18 ++--- 14 files changed, 137 insertions(+), 60 deletions(-) create mode 100644 code_search/kubeflow/components/demo-export-11-05-dist-sync-gpu.jsonnet create mode 100644 code_search/kubeflow/components/experiments.libsonnet create mode 100644 code_search/kubeflow/components/export-model.libsonnet diff --git a/code_search/docker/t2t/Dockerfile b/code_search/docker/t2t/Dockerfile index d00f6df4..c9a12367 100644 --- a/code_search/docker/t2t/Dockerfile +++ b/code_search/docker/t2t/Dockerfile @@ -6,7 +6,7 @@ RUN pip --no-cache-dir install oauth2client~=4.1.0 &&\ apt-get update && apt-get install -y jq git &&\ rm -rf /var/lib/apt/lists/* -RUN pip --no-cache-dir install tensor2tensor~=1.10.0 +RUN pip --no-cache-dir install tensor2tensor~=1.10.0 tensorflow-hub~=0.1.1 ADD src/code_search /app/code_search ADD src /src diff --git a/code_search/kubeflow/components/demo-export-11-05-dist-sync-gpu.jsonnet b/code_search/kubeflow/components/demo-export-11-05-dist-sync-gpu.jsonnet new file mode 100644 index 00000000..b2238ec8 --- /dev/null +++ b/code_search/kubeflow/components/demo-export-11-05-dist-sync-gpu.jsonnet @@ -0,0 +1,11 @@ +local experiments = import "experiments.libsonnet"; +local exporter = import "export-model.libsonnet"; +local k = import "k.libsonnet"; +local env = std.extVar("__ksonnet/environments"); + +local params = std.extVar("__ksonnet/params").components["t2t-code-search-exporter"] + + experiments["demo-trainer-11-07-dist-sync-gpu"] + { + name: "demo-export-11-07-dist-sync-gpu", +}; + +std.prune(k.core.v1.list.new([exporter.parts(params, env).job])) diff --git a/code_search/kubeflow/components/demo-tensorboard.jsonnet b/code_search/kubeflow/components/demo-tensorboard.jsonnet index 0731110a..11f538dc 100644 --- a/code_search/kubeflow/components/demo-tensorboard.jsonnet +++ b/code_search/kubeflow/components/demo-tensorboard.jsonnet @@ -13,7 +13,7 @@ local instances = { "demo-trainer-11-07-dist-sync-gpu": "gs://code-search-demo/models/20181107-dist-sync-gpu", }; -local parts(name, logDir) = { +local parts(name, logDir) = { service:: { apiVersion: "v1", kind: "Service", @@ -46,7 +46,7 @@ local parts(name, logDir) = { "tb-job": name, }, }, - }, // service + }, // service deployment:: { apiVersion: "apps/v1beta1", @@ -107,10 +107,10 @@ local parts(name, logDir) = { }, }, }, - }, // deployment + }, // deployment items: [self.service, self.deployment], -}; // parts +}; // parts local tbObjects = std.flattenArrays(std.map(function(f) parts(f, instances[f]).items, std.objectFieldsAll(instances))); diff --git a/code_search/kubeflow/components/demo-trainer-11-05-dist-sync-gpu.jsonnet b/code_search/kubeflow/components/demo-trainer-11-05-dist-sync-gpu.jsonnet index b80059db..79cb77ed 100644 --- a/code_search/kubeflow/components/demo-trainer-11-05-dist-sync-gpu.jsonnet +++ b/code_search/kubeflow/components/demo-trainer-11-05-dist-sync-gpu.jsonnet @@ -7,14 +7,14 @@ local t2tJob = import "t2t-job.libsonnet"; local env = std.extVar("__ksonnet/environments"); // Note we are reusing the parameters for t2t-code-search-trainer and then explicitly overriding them. -local params = std.extVar("__ksonnet/params").components["t2t-code-search-trainer"] + { - name: "demo-trainer-11-07-dist-sync-gpu", - outputDir: "gs://code-search-demo/models/20181107-dist-sync-gpu", - train_steps: 200000, - eval_steps: 100, - hparams_set: "transformer_base", - numWorkerGpu: 1, - numChief: 1, - numWorker: 8, +local params = std.extVar("__ksonnet/params").components["t2t-code-search-trainer"] { + name: "demo-trainer-11-07-dist-sync-gpu", + outputDir: "gs://code-search-demo/models/20181107-dist-sync-gpu", + train_steps: 200000, + eval_steps: 100, + hparams_set: "transformer_base", + numWorkerGpu: 1, + numChief: 1, + numWorker: 8, }; std.prune(k.core.v1.list.new([t2tJob.parts(params, env).jobDistSync])) diff --git a/code_search/kubeflow/components/demo-trainer-11-05-single-gpu.jsonnet b/code_search/kubeflow/components/demo-trainer-11-05-single-gpu.jsonnet index a87e23e1..fdac40f5 100644 --- a/code_search/kubeflow/components/demo-trainer-11-05-single-gpu.jsonnet +++ b/code_search/kubeflow/components/demo-trainer-11-05-single-gpu.jsonnet @@ -5,12 +5,12 @@ local t2tJob = import "t2t-job.libsonnet"; local env = std.extVar("__ksonnet/environments"); // Note we are reusing the parameters for t2t-code-search-trainer and then explicitly overriding them. -local params = std.extVar("__ksonnet/params").components["t2t-code-search-trainer"] + { - name: "demo-trainer-11-05-single-gpu", - outputDir: "gs://code-search-demo/models/20181105-dist-gpu", - train_steps: 200000, - eval_steps: 100, - hparams_set: "transformer_base_single_gpu", - numWorkerGpu: 1, +local params = std.extVar("__ksonnet/params").components["t2t-code-search-trainer"] { + name: "demo-trainer-11-05-single-gpu", + outputDir: "gs://code-search-demo/models/20181105-dist-gpu", + train_steps: 200000, + eval_steps: 100, + hparams_set: "transformer_base_single_gpu", + numWorkerGpu: 1, }; std.prune(k.core.v1.list.new([t2tJob.parts(params, env).job])) diff --git a/code_search/kubeflow/components/demo-trainer-11-05-tinyparams.jsonnet b/code_search/kubeflow/components/demo-trainer-11-05-tinyparams.jsonnet index 6de7a575..f7423ae7 100644 --- a/code_search/kubeflow/components/demo-trainer-11-05-tinyparams.jsonnet +++ b/code_search/kubeflow/components/demo-trainer-11-05-tinyparams.jsonnet @@ -5,9 +5,9 @@ local t2tJob = import "t2t-job.libsonnet"; local env = std.extVar("__ksonnet/environments"); // Note we are reusing the parameters for t2t-code-search-trainer and then explicitly overriding them. -local params = std.extVar("__ksonnet/params").components["t2t-code-search-trainer"] + { - outputDir: "gs://code-search-demo/models/20181105-tinyparams", - train_steps: 200000, - eval_steps: 100, +local params = std.extVar("__ksonnet/params").components["t2t-code-search-trainer"] { + outputDir: "gs://code-search-demo/models/20181105-tinyparams", + train_steps: 200000, + eval_steps: 100, }; std.prune(k.core.v1.list.new([t2tJob.parts(params, env).job])) diff --git a/code_search/kubeflow/components/experiments.libsonnet b/code_search/kubeflow/components/experiments.libsonnet new file mode 100644 index 00000000..ae3c0a94 --- /dev/null +++ b/code_search/kubeflow/components/experiments.libsonnet @@ -0,0 +1,11 @@ +// Data for various experiments. +// Paths are deliberately hard coded so they get versioned and checked into source control. +{ + "demo-trainer-11-07-dist-sync-gpu": { + name: "demo-trainer-11-07-dist-sync-gpu", + outputDir: "gs://code-search-demo/models/20181107-dist-sync-gpu", + train_steps: 200000, + eval_steps: 100, + hparams_set: "transformer_base", + }, +} diff --git a/code_search/kubeflow/components/export-model.libsonnet b/code_search/kubeflow/components/export-model.libsonnet new file mode 100644 index 00000000..4cb7c05b --- /dev/null +++ b/code_search/kubeflow/components/export-model.libsonnet @@ -0,0 +1,67 @@ +{ + parts(params, env):: { + job: { + apiVersion: "batch/v1", + kind: "Job", + metadata: { + name: params.name, + namespace: env.namespace, + labels: { + app: params.name, + }, + }, + spec: { + replicas: 1, + template: { + metadata: { + labels: { + app: params.name, + }, + }, + spec: { + restartPolicy: "OnFailure", + containers: [ + { + name: "exporter", + image: params.image, + command: [ + "t2t-exporter", + "--problem=" + params.problem, + "--data_dir=" + params.dataDir, + // TODO(kubeflow/examples#331): t2t-exporter should have flags --export and --export_dir + // which allow us to control the location of the exported model. + "--output_dir=" + params.outputDir, + "--model=" + params.model, + "--hparams_set=" + params.hparams_set, + // Need to import the problems. + "--t2t_usr_dir=/src/code_search/t2t", + ], + env: [ + { + name: "GOOGLE_APPLICATION_CREDENTIALS", + value: "/secret/gcp-credentials/user-gcp-sa.json", + }, + ], + workingDir: "/src", + volumeMounts: [ + { + mountPath: "/secret/gcp-credentials", + name: "gcp-credentials", + }, + ], //volumeMounts + }, + ], // containers + volumes: [ + { + name: "gcp-credentials", + secret: { + secretName: "user-gcp-sa", + }, + }, + ], + }, // spec + }, + }, + }, + }, // parts +} diff --git a/code_search/kubeflow/components/params.libsonnet b/code_search/kubeflow/components/params.libsonnet index c7954c10..ca6fd3b9 100644 --- a/code_search/kubeflow/components/params.libsonnet +++ b/code_search/kubeflow/components/params.libsonnet @@ -8,11 +8,11 @@ // are not picked up by the individual components. // Need to see if we can find a way to fix this. - local imageTag = "v20181107-30bab1f-dirty-1ac751", + local imageTag = "v20181108-004b5ad-dirty-eba459", "t2t-job": { jobType: "trainer", numChief: 0, - numWorker: 1, + numWorker: 1, numPs: 0, numWorkerGpu: 0, numPsGpu: 0, @@ -60,6 +60,7 @@ outputDir: $.components["t2t-code-search"].workingDir + "/output", model: $.components["t2t-code-search"].model, hparams_set: $.components["t2t-code-search"].hparams_set, + image: $.components["t2t-job"].image, }, "t2t-code-search-serving": { name: "t2t-code-search", diff --git a/code_search/kubeflow/components/t2t-code-search-exporter.jsonnet b/code_search/kubeflow/components/t2t-code-search-exporter.jsonnet index 717e3da2..5e1b7d12 100644 --- a/code_search/kubeflow/components/t2t-code-search-exporter.jsonnet +++ b/code_search/kubeflow/components/t2t-code-search-exporter.jsonnet @@ -1,7 +1,7 @@ +local exporter = import "export-model.libsonnet"; local k = import "k.libsonnet"; -local t2tJob = import "t2t-job.libsonnet"; local env = std.extVar("__ksonnet/environments"); local params = std.extVar("__ksonnet/params").components["t2t-code-search-exporter"]; -std.prune(k.core.v1.list.new([t2tJob.parts(params, env).job])) +std.prune(k.core.v1.list.new([exporter.parts(params, env).job])) diff --git a/code_search/kubeflow/components/t2t-job.libsonnet b/code_search/kubeflow/components/t2t-job.libsonnet index 23206352..e78e3a90 100644 --- a/code_search/kubeflow/components/t2t-job.libsonnet +++ b/code_search/kubeflow/components/t2t-job.libsonnet @@ -1,19 +1,6 @@ local baseParams = std.extVar("__ksonnet/params").components["t2t-job"]; { - getExporterCmd(params):: - [ - // TODO(jlewi): Do we need to use the T2T entrypoint wrapper for the exporter? - // Why would we need to parse TF_CONFIG into command line flags? - "/usr/local/sbin/t2t-entrypoint", - "t2t-exporter", - "--problem=" + params.problem, - "--data_dir=" + params.dataDir, - "--output_dir=" + params.outputDir, - "--model=" + params.model, - "--hparams_set=" + params.hparams_set, - ], - getTrainerCmd(params):: { local trainer = [ // t2t-entrypoint is a wrapper that parses TF_CONFIG @@ -32,7 +19,7 @@ local baseParams = std.extVar("__ksonnet/params").components["t2t-job"]; worker: trainer, worker_dist: trainer + [ - "--schedule=train", + "--schedule=train", "--ps_gpu=" + std.toString(params.numPsGpu), "--worker_gpu=" + std.toString(params.numWorkerGpu), "--worker_replicas=" + std.toString(params.numWorker), diff --git a/code_search/kubeflow/environments/cs_demo/globals.libsonnet b/code_search/kubeflow/environments/cs_demo/globals.libsonnet index e345e213..79a48dde 100644 --- a/code_search/kubeflow/environments/cs_demo/globals.libsonnet +++ b/code_search/kubeflow/environments/cs_demo/globals.libsonnet @@ -1,8 +1,8 @@ { - // Warning: Do not define a global "image" as that will end up overriding - // the image parameter for all components. Define more specific names - // e.g. "dataflowImage", "trainerCpuImage", "trainerGpuImage", - workingDir: 'gs://code-search-demo/20181104', - dataDir: 'gs://code-search-demo/20181104/data', - project: 'code-search-demo', -} \ No newline at end of file + // Warning: Do not define a global "image" as that will end up overriding + // the image parameter for all components. Define more specific names + // e.g. "dataflowImage", "trainerCpuImage", "trainerGpuImage", + workingDir: "gs://code-search-demo/20181104", + dataDir: "gs://code-search-demo/20181104/data", + project: "code-search-demo", +} diff --git a/code_search/kubeflow/environments/cs_demo/main.jsonnet b/code_search/kubeflow/environments/cs_demo/main.jsonnet index 58695a80..1a44c481 100644 --- a/code_search/kubeflow/environments/cs_demo/main.jsonnet +++ b/code_search/kubeflow/environments/cs_demo/main.jsonnet @@ -2,7 +2,7 @@ local base = import "base.libsonnet"; // uncomment if you reference ksonnet-lib // local k = import "k.libsonnet"; -base + { +base { // Insert user-specified overrides here. For example if a component is named \"nginx-deployment\", you might have something like:\n") // "nginx-deployment"+: k.deployment.mixin.metadata.labels({foo: "bar"}) } diff --git a/code_search/kubeflow/environments/cs_demo/params.libsonnet b/code_search/kubeflow/environments/cs_demo/params.libsonnet index 9081c110..083abf40 100644 --- a/code_search/kubeflow/environments/cs_demo/params.libsonnet +++ b/code_search/kubeflow/environments/cs_demo/params.libsonnet @@ -1,14 +1,14 @@ -local params = std.extVar('__ksonnet/params'); -local globals = import 'globals.libsonnet'; -local envParams = params + { +local params = std.extVar("__ksonnet/params"); +local globals = import "globals.libsonnet"; +local envParams = params { components+: { - "t2t-code-search"+: { + "t2t-code-search"+: { }, - "t2t-code-search-datagen"+: { - githubTable: '', + "t2t-code-search-datagen"+: { + githubTable: "", }, - "submit-preprocess-job"+: { - githubTable: '', + "submit-preprocess-job"+: { + githubTable: "", }, }, }; @@ -18,4 +18,4 @@ local envParams = params + { [x]: envParams.components[x] + globals for x in std.objectFields(envParams.components) }, -} \ No newline at end of file +}