Modify K8s models to export the models; tensorboard manifests (#320)

* Modify K8s models to export the models; tensorboard manifests

* Use a K8s Job, not a TFJob, to export the model.
* Start an experiments.libsonnet file to define groups of parameters that
  should be reused across different experiments.

* Need to install tensorflow_hub in the Docker image because it is
  required by t2t exporter.

* Address review comments.
This commit is contained in:
Jeremy Lewi 2018-11-12 11:09:42 +08:00 committed by k8s-ci-robot
parent c6ff5dbef8
commit 2487194fbd
14 changed files with 137 additions and 60 deletions

View File

@ -6,7 +6,7 @@ RUN pip --no-cache-dir install oauth2client~=4.1.0 &&\
apt-get update && apt-get install -y jq git &&\
rm -rf /var/lib/apt/lists/*
RUN pip --no-cache-dir install tensor2tensor~=1.10.0
RUN pip --no-cache-dir install tensor2tensor~=1.10.0 tensorflow-hub~=0.1.1
ADD src/code_search /app/code_search
ADD src /src

View File

@ -0,0 +1,11 @@
// ksonnet component that emits a v1 List holding the single job produced by
// export-model.libsonnet, configured for the "demo-trainer-11-07-dist-sync-gpu" experiment.
local experiments = import "experiments.libsonnet";
local exporter = import "export-model.libsonnet";
local k = import "k.libsonnet";
local env = std.extVar("__ksonnet/environments");
// Parameters are layered left to right; on a field conflict the right-hand object wins:
//   1. the shared "t2t-code-search-exporter" component params,
//   2. the experiment-specific values from experiments.libsonnet,
//   3. a `name` override so the export job gets its own name instead of the experiment's name.
local params = std.extVar("__ksonnet/params").components["t2t-code-search-exporter"] +
experiments["demo-trainer-11-07-dist-sync-gpu"] + {
name: "demo-export-11-07-dist-sync-gpu",
};
// std.prune strips null/empty fields from the generated list before it is emitted.
std.prune(k.core.v1.list.new([exporter.parts(params, env).job]))

View File

@ -13,7 +13,7 @@ local instances = {
"demo-trainer-11-07-dist-sync-gpu": "gs://code-search-demo/models/20181107-dist-sync-gpu",
};
local parts(name, logDir) = {
local parts(name, logDir) = {
service:: {
apiVersion: "v1",
kind: "Service",
@ -46,7 +46,7 @@ local parts(name, logDir) = {
"tb-job": name,
},
},
}, // service
}, // service
deployment:: {
apiVersion: "apps/v1beta1",
@ -107,10 +107,10 @@ local parts(name, logDir) = {
},
},
},
}, // deployment
}, // deployment
items: [self.service, self.deployment],
}; // parts
}; // parts
local tbObjects = std.flattenArrays(std.map(function(f) parts(f, instances[f]).items,
std.objectFieldsAll(instances)));

View File

@ -7,14 +7,14 @@ local t2tJob = import "t2t-job.libsonnet";
local env = std.extVar("__ksonnet/environments");
// Note we are reusing the parameters for t2t-code-search-trainer and then explicitly overriding them.
local params = std.extVar("__ksonnet/params").components["t2t-code-search-trainer"] + {
name: "demo-trainer-11-07-dist-sync-gpu",
outputDir: "gs://code-search-demo/models/20181107-dist-sync-gpu",
train_steps: 200000,
eval_steps: 100,
hparams_set: "transformer_base",
numWorkerGpu: 1,
numChief: 1,
numWorker: 8,
local params = std.extVar("__ksonnet/params").components["t2t-code-search-trainer"] {
name: "demo-trainer-11-07-dist-sync-gpu",
outputDir: "gs://code-search-demo/models/20181107-dist-sync-gpu",
train_steps: 200000,
eval_steps: 100,
hparams_set: "transformer_base",
numWorkerGpu: 1,
numChief: 1,
numWorker: 8,
};
std.prune(k.core.v1.list.new([t2tJob.parts(params, env).jobDistSync]))

View File

@ -5,12 +5,12 @@ local t2tJob = import "t2t-job.libsonnet";
local env = std.extVar("__ksonnet/environments");
// Note we are reusing the parameters for t2t-code-search-trainer and then explicitly overriding them.
local params = std.extVar("__ksonnet/params").components["t2t-code-search-trainer"] + {
name: "demo-trainer-11-05-single-gpu",
outputDir: "gs://code-search-demo/models/20181105-dist-gpu",
train_steps: 200000,
eval_steps: 100,
hparams_set: "transformer_base_single_gpu",
numWorkerGpu: 1,
local params = std.extVar("__ksonnet/params").components["t2t-code-search-trainer"] {
name: "demo-trainer-11-05-single-gpu",
outputDir: "gs://code-search-demo/models/20181105-dist-gpu",
train_steps: 200000,
eval_steps: 100,
hparams_set: "transformer_base_single_gpu",
numWorkerGpu: 1,
};
std.prune(k.core.v1.list.new([t2tJob.parts(params, env).job]))

View File

@ -5,9 +5,9 @@ local t2tJob = import "t2t-job.libsonnet";
local env = std.extVar("__ksonnet/environments");
// Note we are reusing the parameters for t2t-code-search-trainer and then explicitly overriding them.
local params = std.extVar("__ksonnet/params").components["t2t-code-search-trainer"] + {
outputDir: "gs://code-search-demo/models/20181105-tinyparams",
train_steps: 200000,
eval_steps: 100,
local params = std.extVar("__ksonnet/params").components["t2t-code-search-trainer"] {
outputDir: "gs://code-search-demo/models/20181105-tinyparams",
train_steps: 200000,
eval_steps: 100,
};
std.prune(k.core.v1.list.new([t2tJob.parts(params, env).job]))

View File

@ -0,0 +1,11 @@
// Data for various experiments.
// Paths are deliberately hard coded so they get versioned and checked into source control.
//
// Each top-level key names an experiment. Its value is a partial params object that a
// component merges over its base params with `+`, so fields set here override the base values.
{
// Carries the same name/outputDir/train_steps/eval_steps/hparams_set values that the
// trainer component of the same name sets as overrides.
"demo-trainer-11-07-dist-sync-gpu": {
// Components that create a different resource from this experiment (e.g. the export job)
// override `name` after merging.
name: "demo-trainer-11-07-dist-sync-gpu",
outputDir: "gs://code-search-demo/models/20181107-dist-sync-gpu",
train_steps: 200000,
eval_steps: 100,
hparams_set: "transformer_base",
},
}

View File

@ -0,0 +1,67 @@
{
  // parts(params, env) builds the Kubernetes objects used to export a trained T2T model.
  //
  // params fields read here:
  //   name        - used as the Job name and as the value of the `app` label.
  //   image       - container image that provides the `t2t-exporter` binary.
  //   problem, dataDir, outputDir, model, hparams_set
  //               - forwarded verbatim as the corresponding `t2t-exporter` flags.
  // env fields read here:
  //   namespace   - namespace the Job is created in.
  //
  // Returns an object whose `job` field is a batch/v1 Job manifest.
  parts(params, env):: {
    job: {
      apiVersion: "batch/v1",
      kind: "Job",
      metadata: {
        name: params.name,
        namespace: env.namespace,
        labels: {
          app: params.name,
        },
      },
      spec: {
        // Deliberately no `replicas` field: batch/v1 JobSpec does not define one
        // (Jobs are sized with `parallelism`/`completions`, both of which default to 1),
        // so setting it is either rejected by schema validation or silently dropped.
        template: {
          metadata: {
            labels: {
              app: params.name,
            },
          },
          spec: {
            restartPolicy: "OnFailure",
            containers: [
              {
                name: "exporter",
                image: params.image,
                command: [
                  "t2t-exporter",
                  "--problem=" + params.problem,
                  "--data_dir=" + params.dataDir,
                  // TODO(kubeflow/examples#331): t2t-exporter should have flags --export and --export_dir
                  // which allow us to control the location of the exported model.
                  "--output_dir=" + params.outputDir,
                  "--model=" + params.model,
                  "--hparams_set=" + params.hparams_set,
                  // Need to import the problems.
                  "--t2t_usr_dir=/src/code_search/t2t",
                ],
                env: [
                  {
                    // Points at the service-account key file supplied by the
                    // "gcp-credentials" volume mounted below.
                    name: "GOOGLE_APPLICATION_CREDENTIALS",
                    value: "/secret/gcp-credentials/user-gcp-sa.json",
                  },
                ],
                workingDir: "/src",
                volumeMounts: [
                  {
                    mountPath: "/secret/gcp-credentials",
                    name: "gcp-credentials",
                  },
                ],  // volumeMounts
              },
            ],  // containers
            volumes: [
              {
                // Backed by the "user-gcp-sa" secret.
                name: "gcp-credentials",
                secret: {
                  secretName: "user-gcp-sa",
                },
              },
            ],
          },  // pod spec
        },  // template
      },  // job spec
    },  // job
  },  // parts
}

View File

@ -8,11 +8,11 @@
// are not picked up by the individual components.
// Need to see if we can find a way to fix this.
local imageTag = "v20181107-30bab1f-dirty-1ac751",
local imageTag = "v20181108-004b5ad-dirty-eba459",
"t2t-job": {
jobType: "trainer",
numChief: 0,
numWorker: 1,
numWorker: 1,
numPs: 0,
numWorkerGpu: 0,
numPsGpu: 0,
@ -60,6 +60,7 @@
outputDir: $.components["t2t-code-search"].workingDir + "/output",
model: $.components["t2t-code-search"].model,
hparams_set: $.components["t2t-code-search"].hparams_set,
image: $.components["t2t-job"].image,
},
"t2t-code-search-serving": {
name: "t2t-code-search",

View File

@ -1,7 +1,7 @@
local exporter = import "export-model.libsonnet";
local k = import "k.libsonnet";
local t2tJob = import "t2t-job.libsonnet";
local env = std.extVar("__ksonnet/environments");
local params = std.extVar("__ksonnet/params").components["t2t-code-search-exporter"];
std.prune(k.core.v1.list.new([t2tJob.parts(params, env).job]))
std.prune(k.core.v1.list.new([exporter.parts(params, env).job]))

View File

@ -1,19 +1,6 @@
local baseParams = std.extVar("__ksonnet/params").components["t2t-job"];
{
getExporterCmd(params)::
[
// TODO(jlewi): Do we need to use the T2T entrypoint wrapper for the exporter?
// Why would we need to parse TF_CONFIG into command line flags?
"/usr/local/sbin/t2t-entrypoint",
"t2t-exporter",
"--problem=" + params.problem,
"--data_dir=" + params.dataDir,
"--output_dir=" + params.outputDir,
"--model=" + params.model,
"--hparams_set=" + params.hparams_set,
],
getTrainerCmd(params):: {
local trainer = [
// t2t-entrypoint is a wrapper that parses TF_CONFIG
@ -32,7 +19,7 @@ local baseParams = std.extVar("__ksonnet/params").components["t2t-job"];
worker: trainer,
worker_dist: trainer + [
"--schedule=train",
"--schedule=train",
"--ps_gpu=" + std.toString(params.numPsGpu),
"--worker_gpu=" + std.toString(params.numWorkerGpu),
"--worker_replicas=" + std.toString(params.numWorker),

View File

@ -1,8 +1,8 @@
{
// Warning: Do not define a global "image" as that will end up overriding
// the image parameter for all components. Define more specific names
// e.g. "dataflowImage", "trainerCpuImage", "trainerGpuImage",
workingDir: 'gs://code-search-demo/20181104',
dataDir: 'gs://code-search-demo/20181104/data',
project: 'code-search-demo',
}
// Warning: Do not define a global "image" as that will end up overriding
// the image parameter for all components. Define more specific names
// e.g. "dataflowImage", "trainerCpuImage", "trainerGpuImage",
workingDir: "gs://code-search-demo/20181104",
dataDir: "gs://code-search-demo/20181104/data",
project: "code-search-demo",
}

View File

@ -2,7 +2,7 @@ local base = import "base.libsonnet";
// uncomment if you reference ksonnet-lib
// local k = import "k.libsonnet";
base + {
base {
// Insert user-specified overrides here. For example, if a component is named "nginx-deployment", you might have something like:
// "nginx-deployment"+: k.deployment.mixin.metadata.labels({foo: "bar"})
}

View File

@ -1,14 +1,14 @@
local params = std.extVar('__ksonnet/params');
local globals = import 'globals.libsonnet';
local envParams = params + {
local params = std.extVar("__ksonnet/params");
local globals = import "globals.libsonnet";
local envParams = params {
components+: {
"t2t-code-search"+: {
"t2t-code-search"+: {
},
"t2t-code-search-datagen"+: {
githubTable: '',
"t2t-code-search-datagen"+: {
githubTable: "",
},
"submit-preprocess-job"+: {
githubTable: '',
"submit-preprocess-job"+: {
githubTable: "",
},
},
};
@ -18,4 +18,4 @@ local envParams = params + {
[x]: envParams.components[x] + globals
for x in std.objectFields(envParams.components)
},
}
}