mirror of https://github.com/kubeflow/examples.git
Modify K8s models to export the models; tensorboard manifests (#320)
* Modify K8s models to export the models; tensorboard manifests * Use a K8s job not a TFJob to export the model. * Start an experiments.libsonnet file to define groups of parameters for different experiments that should be reused * Need to install tensorflow_hub in the Docker image because it is required by t2t exporter. * Address review comments.
This commit is contained in:
parent
c6ff5dbef8
commit
2487194fbd
|
@ -6,7 +6,7 @@ RUN pip --no-cache-dir install oauth2client~=4.1.0 &&\
|
|||
apt-get update && apt-get install -y jq git &&\
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
RUN pip --no-cache-dir install tensor2tensor~=1.10.0
|
||||
RUN pip --no-cache-dir install tensor2tensor~=1.10.0 tensorflow-hub~=0.1.1
|
||||
|
||||
ADD src/code_search /app/code_search
|
||||
ADD src /src
|
||||
|
|
|
@ -0,0 +1,11 @@
|
|||
local experiments = import "experiments.libsonnet";
|
||||
local exporter = import "export-model.libsonnet";
|
||||
local k = import "k.libsonnet";
|
||||
local env = std.extVar("__ksonnet/environments");
|
||||
|
||||
// Parameters for exporting the model produced by the
// "demo-trainer-11-07-dist-sync-gpu" experiment. They are layered in this
// order, with later layers winning on conflicting fields:
//   1. the ksonnet parameters of the "t2t-code-search-exporter" component,
//   2. the shared values for the experiment from experiments.libsonnet,
//   3. the name of this export job.
local exporterDefaults = std.extVar("__ksonnet/params").components["t2t-code-search-exporter"];
local experiment = experiments["demo-trainer-11-07-dist-sync-gpu"];
local params = exporterDefaults + experiment + { name: "demo-export-11-07-dist-sync-gpu" };

// Wrap the export job in a list (presumably a v1 List, via ksonnet-lib) and
// let std.prune strip nulls and empty objects/arrays from the result.
local exportJob = exporter.parts(params, env).job;
std.prune(k.core.v1.list.new([exportJob]))
|
|
@ -13,7 +13,7 @@ local instances = {
|
|||
"demo-trainer-11-07-dist-sync-gpu": "gs://code-search-demo/models/20181107-dist-sync-gpu",
|
||||
};
|
||||
|
||||
local parts(name, logDir) = {
|
||||
local parts(name, logDir) = {
|
||||
service:: {
|
||||
apiVersion: "v1",
|
||||
kind: "Service",
|
||||
|
@ -46,7 +46,7 @@ local parts(name, logDir) = {
|
|||
"tb-job": name,
|
||||
},
|
||||
},
|
||||
}, // service
|
||||
}, // service
|
||||
|
||||
deployment:: {
|
||||
apiVersion: "apps/v1beta1",
|
||||
|
@ -107,10 +107,10 @@ local parts(name, logDir) = {
|
|||
},
|
||||
},
|
||||
},
|
||||
}, // deployment
|
||||
}, // deployment
|
||||
|
||||
items: [self.service, self.deployment],
|
||||
}; // parts
|
||||
}; // parts
|
||||
|
||||
local tbObjects = std.flattenArrays(std.map(function(f) parts(f, instances[f]).items,
|
||||
std.objectFieldsAll(instances)));
|
||||
|
|
|
@ -7,14 +7,14 @@ local t2tJob = import "t2t-job.libsonnet";
|
|||
local env = std.extVar("__ksonnet/environments");
|
||||
|
||||
// Note we are reusing the parameters for t2t-code-search-trainer and then explicitly overriding them.
|
||||
local params = std.extVar("__ksonnet/params").components["t2t-code-search-trainer"] + {
|
||||
name: "demo-trainer-11-07-dist-sync-gpu",
|
||||
outputDir: "gs://code-search-demo/models/20181107-dist-sync-gpu",
|
||||
train_steps: 200000,
|
||||
eval_steps: 100,
|
||||
hparams_set: "transformer_base",
|
||||
numWorkerGpu: 1,
|
||||
numChief: 1,
|
||||
numWorker: 8,
|
||||
local params = std.extVar("__ksonnet/params").components["t2t-code-search-trainer"] {
|
||||
name: "demo-trainer-11-07-dist-sync-gpu",
|
||||
outputDir: "gs://code-search-demo/models/20181107-dist-sync-gpu",
|
||||
train_steps: 200000,
|
||||
eval_steps: 100,
|
||||
hparams_set: "transformer_base",
|
||||
numWorkerGpu: 1,
|
||||
numChief: 1,
|
||||
numWorker: 8,
|
||||
};
|
||||
std.prune(k.core.v1.list.new([t2tJob.parts(params, env).jobDistSync]))
|
||||
|
|
|
@ -5,12 +5,12 @@ local t2tJob = import "t2t-job.libsonnet";
|
|||
|
||||
local env = std.extVar("__ksonnet/environments");
|
||||
// Note we are reusing the parameters for t2t-code-search-trainer and then explicitly overriding them.
|
||||
local params = std.extVar("__ksonnet/params").components["t2t-code-search-trainer"] + {
|
||||
name: "demo-trainer-11-05-single-gpu",
|
||||
outputDir: "gs://code-search-demo/models/20181105-dist-gpu",
|
||||
train_steps: 200000,
|
||||
eval_steps: 100,
|
||||
hparams_set: "transformer_base_single_gpu",
|
||||
numWorkerGpu: 1,
|
||||
local params = std.extVar("__ksonnet/params").components["t2t-code-search-trainer"] {
|
||||
name: "demo-trainer-11-05-single-gpu",
|
||||
outputDir: "gs://code-search-demo/models/20181105-dist-gpu",
|
||||
train_steps: 200000,
|
||||
eval_steps: 100,
|
||||
hparams_set: "transformer_base_single_gpu",
|
||||
numWorkerGpu: 1,
|
||||
};
|
||||
std.prune(k.core.v1.list.new([t2tJob.parts(params, env).job]))
|
||||
|
|
|
@ -5,9 +5,9 @@ local t2tJob = import "t2t-job.libsonnet";
|
|||
|
||||
local env = std.extVar("__ksonnet/environments");
|
||||
// Note we are reusing the parameters for t2t-code-search-trainer and then explicitly overriding them.
|
||||
local params = std.extVar("__ksonnet/params").components["t2t-code-search-trainer"] + {
|
||||
outputDir: "gs://code-search-demo/models/20181105-tinyparams",
|
||||
train_steps: 200000,
|
||||
eval_steps: 100,
|
||||
local params = std.extVar("__ksonnet/params").components["t2t-code-search-trainer"] {
|
||||
outputDir: "gs://code-search-demo/models/20181105-tinyparams",
|
||||
train_steps: 200000,
|
||||
eval_steps: 100,
|
||||
};
|
||||
std.prune(k.core.v1.list.new([t2tJob.parts(params, env).job]))
|
||||
|
|
|
@ -0,0 +1,11 @@
|
|||
// Data for various experiments.
|
||||
// Paths are deliberately hard coded so they get versioned and checked into source control.
|
||||
// Each experiment is keyed by its trainer job name; the same string is
// repeated as the `name` field so components that merge an entry into their
// parameters pick it up as well.
local distSyncGpu = "demo-trainer-11-07-dist-sync-gpu";

{
  [distSyncGpu]: {
    name: distSyncGpu,
    // Location of the model checkpoints for this experiment.
    outputDir: "gs://code-search-demo/models/20181107-dist-sync-gpu",
    hparams_set: "transformer_base",
    train_steps: 200000,
    eval_steps: 100,
  },
}
|
|
@ -0,0 +1,67 @@
|
|||
{
  // Builds the Kubernetes objects used to export a trained model.
  //
  // params: object; the fields read here are name, image, problem, dataDir,
  //         outputDir, model and hparams_set.
  // env:    object; only env.namespace is read.
  //
  // Returns an object whose `job` field is a batch/v1 Job that runs
  // t2t-exporter in a single container.
  parts(params, env):: {
    job: {
      apiVersion: "batch/v1",
      kind: "Job",
      metadata: {
        name: params.name,
        namespace: env.namespace,
        labels: {
          app: params.name,
        },
      },
      // A batch/v1 JobSpec has no "replicas" field, so none is set here;
      // a Job runs a single pod unless parallelism/completions say otherwise.
      spec: {
        template: {
          metadata: {
            labels: {
              app: params.name,
            },
          },
          spec: {
            restartPolicy: "OnFailure",
            containers: [
              {
                name: "exporter",
                image: params.image,
                command: [
                  "t2t-exporter",
                  "--problem=" + params.problem,
                  "--data_dir=" + params.dataDir,
                  // TODO(kubeflow/examples#331): t2t-exporter should have flags --export and --export_dir
                  // which allow us to control the location of the exported model.
                  "--output_dir=" + params.outputDir,
                  "--model=" + params.model,
                  "--hparams_set=" + params.hparams_set,
                  // Need to import the problems.
                  "--t2t_usr_dir=/src/code_search/t2t",
                ],
                env: [
                  {
                    // Points at the key file inside the secret volume mounted below.
                    name: "GOOGLE_APPLICATION_CREDENTIALS",
                    value: "/secret/gcp-credentials/user-gcp-sa.json",
                  },
                ],
                workingDir: "/src",
                volumeMounts: [
                  {
                    mountPath: "/secret/gcp-credentials",
                    name: "gcp-credentials",
                  },
                ],  // volumeMounts
              },
            ],  // containers
            volumes: [
              {
                name: "gcp-credentials",
                secret: {
                  secretName: "user-gcp-sa",
                },
              },
            ],
          },  // spec
        },
      },
    },
  },  // parts
}
|
|
@ -8,11 +8,11 @@
|
|||
// are not picked up by the individual components.
|
||||
// Need to see if we can find a way to fix this.
|
||||
|
||||
local imageTag = "v20181107-30bab1f-dirty-1ac751",
|
||||
local imageTag = "v20181108-004b5ad-dirty-eba459",
|
||||
"t2t-job": {
|
||||
jobType: "trainer",
|
||||
numChief: 0,
|
||||
numWorker: 1,
|
||||
numWorker: 1,
|
||||
numPs: 0,
|
||||
numWorkerGpu: 0,
|
||||
numPsGpu: 0,
|
||||
|
@ -60,6 +60,7 @@
|
|||
outputDir: $.components["t2t-code-search"].workingDir + "/output",
|
||||
model: $.components["t2t-code-search"].model,
|
||||
hparams_set: $.components["t2t-code-search"].hparams_set,
|
||||
image: $.components["t2t-job"].image,
|
||||
},
|
||||
"t2t-code-search-serving": {
|
||||
name: "t2t-code-search",
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
local exporter = import "export-model.libsonnet";
|
||||
local k = import "k.libsonnet";
|
||||
local t2tJob = import "t2t-job.libsonnet";
|
||||
|
||||
local env = std.extVar("__ksonnet/environments");
|
||||
local params = std.extVar("__ksonnet/params").components["t2t-code-search-exporter"];
|
||||
|
||||
std.prune(k.core.v1.list.new([t2tJob.parts(params, env).job]))
|
||||
std.prune(k.core.v1.list.new([exporter.parts(params, env).job]))
|
||||
|
|
|
@ -1,19 +1,6 @@
|
|||
local baseParams = std.extVar("__ksonnet/params").components["t2t-job"];
|
||||
|
||||
{
|
||||
getExporterCmd(params)::
|
||||
[
|
||||
// TODO(jlewi): Do we need to use the T2T entrypoint wrapper for the exporter?
|
||||
// Why would we need to parse TF_CONFIG into command line flags?
|
||||
"/usr/local/sbin/t2t-entrypoint",
|
||||
"t2t-exporter",
|
||||
"--problem=" + params.problem,
|
||||
"--data_dir=" + params.dataDir,
|
||||
"--output_dir=" + params.outputDir,
|
||||
"--model=" + params.model,
|
||||
"--hparams_set=" + params.hparams_set,
|
||||
],
|
||||
|
||||
getTrainerCmd(params):: {
|
||||
local trainer = [
|
||||
// t2t-entrypoint is a wrapper that parses TF_CONFIG
|
||||
|
@ -32,7 +19,7 @@ local baseParams = std.extVar("__ksonnet/params").components["t2t-job"];
|
|||
worker: trainer,
|
||||
|
||||
worker_dist: trainer + [
|
||||
"--schedule=train",
|
||||
"--schedule=train",
|
||||
"--ps_gpu=" + std.toString(params.numPsGpu),
|
||||
"--worker_gpu=" + std.toString(params.numWorkerGpu),
|
||||
"--worker_replicas=" + std.toString(params.numWorker),
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
{
|
||||
// Warning: Do not define a global "image" as that will end up overriding
|
||||
// the image parameter for all components. Define more specific names
|
||||
// e.g. "dataflowImage", "trainerCpuImage", "trainerGpuImage",
|
||||
workingDir: 'gs://code-search-demo/20181104',
|
||||
dataDir: 'gs://code-search-demo/20181104/data',
|
||||
project: 'code-search-demo',
|
||||
}
|
||||
// Warning: Do not define a global "image" as that will end up overriding
|
||||
// the image parameter for all components. Define more specific names
|
||||
// e.g. "dataflowImage", "trainerCpuImage", "trainerGpuImage",
|
||||
workingDir: "gs://code-search-demo/20181104",
|
||||
dataDir: "gs://code-search-demo/20181104/data",
|
||||
project: "code-search-demo",
|
||||
}
|
||||
|
|
|
@ -2,7 +2,7 @@ local base = import "base.libsonnet";
|
|||
// uncomment if you reference ksonnet-lib
|
||||
// local k = import "k.libsonnet";
|
||||
|
||||
base + {
|
||||
base {
|
||||
// Insert user-specified overrides here. For example, if a component is named "nginx-deployment", you might have something like:
|
||||
// "nginx-deployment"+: k.deployment.mixin.metadata.labels({foo: "bar"})
|
||||
}
|
||||
|
|
|
@ -1,14 +1,14 @@
|
|||
local params = std.extVar('__ksonnet/params');
|
||||
local globals = import 'globals.libsonnet';
|
||||
local envParams = params + {
|
||||
local params = std.extVar("__ksonnet/params");
|
||||
local globals = import "globals.libsonnet";
|
||||
local envParams = params {
|
||||
components+: {
|
||||
"t2t-code-search"+: {
|
||||
"t2t-code-search"+: {
|
||||
},
|
||||
"t2t-code-search-datagen"+: {
|
||||
githubTable: '',
|
||||
"t2t-code-search-datagen"+: {
|
||||
githubTable: "",
|
||||
},
|
||||
"submit-preprocess-job"+: {
|
||||
githubTable: '',
|
||||
"submit-preprocess-job"+: {
|
||||
githubTable: "",
|
||||
},
|
||||
},
|
||||
};
|
||||
|
@ -18,4 +18,4 @@ local envParams = params + {
|
|||
[x]: envParams.components[x] + globals
|
||||
for x in std.objectFields(envParams.components)
|
||||
},
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue