Add tensorboard and check in vendor for the code search example. (#255)

* Add tensorboard and check in vendor for the code search example.

* * Remove the default env; when I ran ks show I got errors but
  removing it and adding a fresh env worked. It also won't point to
  the correct cluster for users.
This commit is contained in:
Jeremy Lewi 2018-10-04 10:18:58 -07:00 committed by k8s-ci-robot
parent 2064b43def
commit adf614fc5f
12 changed files with 849 additions and 42 deletions

View File

@ -1,13 +1,10 @@
apiVersion: 0.2.0
environments:
default:
destination:
namespace: kubeflow
server: https://35.237.202.148
k8sVersion: v1.9.7
path: default
kind: ksonnet.io/app
libraries:
examples:
name: examples
registry: kubeflow
version: defc235463799d5600001ee0ed6ef68f7af24a17
tf-serving:
name: tf-serving
registry: kubeflow

View File

@ -79,5 +79,10 @@
indexFile: $.components['t2t-code-search'].workingDir + '/code_search_index.nmslib',
servingUrl: 'http://t2t-code-search.kubeflow:9001/v1/models/t2t-code-search:predict',
},
tensorboard: {
image: "tensorflow/tensorflow:1.8.0",
logDir: "gs://example/to/model/logdir",
name: "tensorboard",
},
},
}

View File

@ -3,5 +3,4 @@ local t2tJob = import "t2t-job.libsonnet";
local env = std.extVar("__ksonnet/environments");
local params = std.extVar("__ksonnet/params").components["t2t-code-search-trainer"];
std.prune(k.core.v1.list.new([t2tJob.parts(params, env).job]))

View File

@ -21,35 +21,35 @@ local baseParams = std.extVar("__ksonnet/params").components["t2t-job"];
],
getTrainerCmd(params):: {
local trainer = [
"/usr/local/sbin/t2t-entrypoint",
"t2t-trainer",
"--problem=" + params.problem,
"--model=" + params.model,
"--hparams_set=" + params.hparams_set,
"--data_dir=" + params.dataDir,
"--output_dir=" + params.outputDir,
"--train_steps=" + std.toString(params.train_steps),
"--eval_steps=" + std.toString(params.eval_steps),
"--t2t_usr_dir=/app/code_search/t2t",
],
local trainer = [
"/usr/local/sbin/t2t-entrypoint",
"t2t-trainer",
"--problem=" + params.problem,
"--model=" + params.model,
"--hparams_set=" + params.hparams_set,
"--data_dir=" + params.dataDir,
"--output_dir=" + params.outputDir,
"--train_steps=" + std.toString(params.train_steps),
"--eval_steps=" + std.toString(params.eval_steps),
"--t2t_usr_dir=/app/code_search/t2t",
],
worker: trainer,
worker: trainer,
worker_dist: trainer + [
"--schedule=train",
"--ps_gpu=" + std.toString(params.numPsGpu),
"--worker_gpu=" + std.toString(params.numWorkerGpu),
"--worker_replicas=" + std.toString(params.numWorker),
"--ps_replicas=" + std.toString(params.numPs),
"--eval_steps=" + std.toString(params.eval_steps),
"--worker_job=/job:worker",
],
worker_dist: trainer + [
"--schedule=train",
"--ps_gpu=" + std.toString(params.numPsGpu),
"--worker_gpu=" + std.toString(params.numWorkerGpu),
"--worker_replicas=" + std.toString(params.numWorker),
"--ps_replicas=" + std.toString(params.numPs),
"--eval_steps=" + std.toString(params.eval_steps),
"--worker_job=/job:worker",
],
ps: trainer + [
"--schedule=run_std_server",
"--ps_job=/job:ps",
],
ps: trainer + [
"--schedule=run_std_server",
"--ps_job=/job:ps",
],
},
tfJobReplica(replicaType, number, args, image, numGpus=0, imagePullSecrets=[], env=[], volumes=[], volumeMounts=[])::
@ -69,7 +69,7 @@ local baseParams = std.extVar("__ksonnet/params").components["t2t-job"];
replicas: number,
template: {
spec: {
containers: [ containerSpec ],
containers: [containerSpec],
[if std.length(imagePullSecrets) > 0 then "imagePullSecrets"]: imagePullSecrets,
[if std.length(volumes) > 0 then "volumes"]: volumes,
// restartPolicy: "OnFailure",
@ -84,7 +84,7 @@ local baseParams = std.extVar("__ksonnet/params").components["t2t-job"];
local workerEnv = [
{
name: "GOOGLE_APPLICATION_CREDENTIALS",
value: "/secret/gcp-credentials/user-gcp-sa.json"
value: "/secret/gcp-credentials/user-gcp-sa.json",
},
],
local workerVolumes = [
@ -104,8 +104,8 @@ local baseParams = std.extVar("__ksonnet/params").components["t2t-job"];
local cmd = $.getTrainerCmd(params),
local workerCmd = if params.jobType == "exporter" then $.getExporterCmd(params)
else if params.jobType == "datagen" then $.getDatagenCmd(params)
else cmd.worker,
else if params.jobType == "datagen" then $.getDatagenCmd(params)
else cmd.worker,
job:: {
apiVersion: "kubeflow.org/v1alpha2",
@ -116,14 +116,19 @@ local baseParams = std.extVar("__ksonnet/params").components["t2t-job"];
},
spec: {
tfReplicaSpecs: {
[if params.numPs > 0 then "PS"]: $.tfJobReplica("PS", params.numPs, cmd.ps, workerImage,
[if params.numPs > 0 then "PS"]: $.tfJobReplica("PS",
params.numPs,
cmd.ps,
workerImage,
numGpus=params.numPsGpu,
env=workerEnv,
volumes=workerVolumes,
volumeMounts=workerVolumeMounts),
[if params.numWorker > 0 then "Worker"]: $.tfJobReplica("WORKER", params.numWorker,
workerCmd, workerImage,
numGpus=params.numPsGpu,
[if params.numWorker > 0 then "Worker"]: $.tfJobReplica("WORKER",
params.numWorker,
workerCmd,
workerImage,
numGpus=params.numWorkerGpu,
env=workerEnv,
volumes=workerVolumes,
volumeMounts=workerVolumeMounts),

View File

@ -0,0 +1,103 @@
local env = std.extVar("__ksonnet/environments");
local params = std.extVar("__ksonnet/params").components.tensorboard;
local k = import "k.libsonnet";
// Component name and target namespace, shared by every resource in this file.
local name = params.name;
local namespace = env.namespace;

// Service in front of the TensorBoard pod. The Ambassador annotation routes
// /tensorboard/<name>/ to this Service.
local service = {
  // Ambassador Mapping document, one YAML line per element.
  local ambassadorMapping = [
    "---",
    "apiVersion: ambassador/v0",
    "kind: Mapping",
    "name: " + name + "_mapping",
    "prefix: /tensorboard/" + name + "/",
    "rewrite: /",
    "service: " + name + "-tb." + namespace,
  ],
  // Labels used to select the backing pods.
  local podSelector = {
    app: "tensorboard",
    "tb-job": name,
  },

  apiVersion: "v1",
  kind: "Service",
  metadata: {
    name: name + "-tb",
    namespace: env.namespace,
    annotations: {
      "getambassador.io/config": std.join("\n", ambassadorMapping),
    },
  },
  spec: {
    selector: podSelector,
    ports: [
      { name: "http", port: 80, targetPort: 80 },
    ],
  },
};
// Single-replica Deployment that runs TensorBoard against params.logDir. The
// "user-gcp-sa" secret is mounted into the container and exposed through
// GOOGLE_APPLICATION_CREDENTIALS.
local deployment = {
  local credentialsVolume = "gcp-credentials",
  local credentialsDir = "/secret/gcp-credentials",
  local podLabels = {
    app: "tensorboard",
    "tb-job": name,
  },

  // The one container in the pod: TensorBoard listening on port 80.
  local tensorboardContainer = {
    name: "tensorboard",
    image: params.image,
    command: [
      "/usr/local/bin/tensorboard",
      "--logdir=" + params.logDir,
      "--port=80",
    ],
    ports: [
      { containerPort: 80 },
    ],
    env: [
      {
        name: "GOOGLE_APPLICATION_CREDENTIALS",
        value: credentialsDir + "/user-gcp-sa.json",
      },
    ],
    volumeMounts: [
      { mountPath: credentialsDir, name: credentialsVolume },
    ],
  },

  apiVersion: "apps/v1beta1",
  kind: "Deployment",
  metadata: {
    name: name + "-tb",
    namespace: env.namespace,
  },
  spec: {
    replicas: 1,
    template: {
      metadata: {
        labels: podLabels,
        name: name,
        namespace: namespace,
      },
      spec: {
        containers: [tensorboardContainer],
        volumes: [
          {
            name: credentialsVolume,
            secret: { secretName: "user-gcp-sa" },
          },
        ],
      },
    },
  },
};

// Emit both resources as one list, dropping empty/null fields.
local resources = [service, deployment];
std.prune(k.core.v1.list.new(resources))

View File

@ -0,0 +1,22 @@
{
"name": "kubeflow examples",
"apiVersion": "0.0.1",
"kind": "ksonnet.io/parts",
"description": "kubeflow examples.\n",
"author": "kubeflow-team <kubeflow-discuss@googlegroups.com>",
"contributors": [
],
"repository": {
"type": "git",
"url": "https://github.com/kubeflow/kubeflow"
},
"bugs": {
"url": "https://github.com/kubeflow/kubeflow/issues"
},
"keywords": [
"kubernetes",
"kubeflow",
"machine learning"
],
"license": "Apache 2.0",
}

View File

@ -0,0 +1,87 @@
// @apiVersion 0.1
// @name io.ksonnet.pkg.tensorboard
// @description Prototype for Tensorboard deployments
// @shortDescription Prototype for Tensorboard deployments
// @param name string Name to give to the tensorboard deployment
// @param logDir string The path containing your TF events files.
// @optionalParam image string tensorflow/tensorflow:1.8.0 The Docker image to use.
local k = import "k.libsonnet";
// Prototype parameters shared by the Service and Deployment.
local name = params.name;
local namespace = env.namespace;

// Service for the TensorBoard pod, annotated with an Ambassador Mapping that
// serves it under /tensorboard/<name>/.
local service = {
  apiVersion: "v1",
  kind: "Service",
  metadata: {
    name: "%s-tb" % name,
    namespace: env.namespace,
    annotations: {
      // The annotation value is a YAML document assembled line by line.
      "getambassador.io/config": std.join("\n", [
        "---",
        "apiVersion: ambassador/v0",
        "kind: Mapping",
        "name: %s_mapping" % name,
        "prefix: /tensorboard/%s/" % name,
        "rewrite: /",
        "service: %s-tb.%s" % [name, namespace],
      ]),
    },
  },
  spec: {
    ports: [
      {
        name: "http",
        port: 80,
        targetPort: 80,
      },
    ],
    // Select pods by the app and tb-job labels.
    selector: {
      app: "tensorboard",
      "tb-job": name,
    },
  },
};
// Pod template for TensorBoard: one container serving params.logDir on port 80.
local tensorboardPodTemplate = {
  metadata: {
    name: name,
    namespace: namespace,
    labels: {
      app: "tensorboard",
      "tb-job": name,
    },
  },
  spec: {
    containers: [
      {
        name: "tensorboard",
        image: params.image,
        command: [
          "/usr/local/bin/tensorboard",
          "--logdir=" + params.logDir,
          "--port=80",
        ],
        ports: [
          { containerPort: 80 },
        ],
      },
    ],
  },
};

// Single-replica Deployment wrapping the pod template above.
local deployment = {
  apiVersion: "apps/v1beta1",
  kind: "Deployment",
  metadata: {
    name: name + "-tb",
    namespace: env.namespace,
  },
  spec: {
    replicas: 1,
    template: tensorboardPodTemplate,
  },
};

// Emit both resources as one list, dropping empty/null fields.
local resources = [service, deployment];
std.prune(k.core.v1.list.new(resources))

View File

@ -0,0 +1,143 @@
// @apiVersion 0.1
// @name io.ksonnet.pkg.tf-batch-predict
// @description TensorFlow batch-predict
// @shortDescription A TensorFlow batch-predict job
// @param name string Name to give to each of the components
// @optionalParam numGpus number 0 number of GPUs to use
// @param modelPath string Path to the model directory
// @param inputFilePatterns string Input file patterns
// @param outputResultPrefix string Output result file prefix
// @param outputErrorPrefix string Output error file prefix
// @optionalParam batchSize number 8 Batch size
// @optionalParam gcpCredentialSecretName string Secret name if used in GCP
local k = import "k.libsonnet";
// Template for the batch-predict Job. `params` holds hidden, late-bound
// defaults; `parts.bpJob` renders the batch/v1 Job from whatever `params`
// resolves to once the prototype has overridden it.
// NOTE(review): every credential check below compares gcpCredentialSecretName
// against ""; the prototype header declares it as an @optionalParam without an
// obvious default value -- confirm what ksonnet passes when it is left unset.
local tfBatchPredictBase = {
  local base = self,
  // True when a credential secret was named; gates the env var, mount and volume.
  local useGcpCredentials = base.params.gcpCredentialSecretName != "",

  // Parameters are intended to be late bound.
  params:: {
    name: null,
    labels: {
      app: base.params.name,
    },
    modelName: self.name,
    modelPath: null,
    inputFilePatterns: null,
    inputFileFormat: null,
    outputResultPrefix: null,
    outputErrorPrefix: null,
    batchSize: 8,
    numGpus: 0,
    gcpCredentialSecretName: "",
    version: "v1",
    // If users want to override the image they can override defaultCpuImage and/or defaultGpuImage,
    // in which case the image used will still depend on whether GPUs are used or not.
    // Users can also override predictImage, in which case the user supplied value will always be used
    // regardless of numGpus.
    defaultCpuImage: "gcr.io/kubeflow-examples/batch-predict:tf18",
    defaultGpuImage: "gcr.io/kubeflow-examples/batch-predict:tf18-gpu",
    predictImage: if self.numGpus == 0 then
      self.defaultCpuImage
    else
      self.defaultGpuImage,
  },

  parts:: {
    bpJob: {
      apiVersion: "batch/v1",
      kind: "Job",
      metadata: {
        name: base.params.name + "-" + base.params.version,
        namespace: base.params.namespace,
        labels: base.params.labels,
      },
      spec: {
        // backoffLimit is a field of the Job spec. It used to sit inside the
        // pod template, where it is not a recognised field.
        backoffLimit: 1,
        template: {
          metadata: {
            labels: base.params.labels,
          },
          spec: {
            containers: [
              {
                name: base.params.name,
                image: base.params.predictImage,
                imagePullPolicy: "IfNotPresent",
                args: [
                  "--model_dir=" + base.params.modelPath,
                  "--input_file_patterns=" + base.params.inputFilePatterns,
                  "--input_file_format=" + base.params.inputFileFormat,
                  "--output_result_prefix=" + base.params.outputResultPrefix,
                  "--output_error_prefix=" + base.params.outputErrorPrefix,
                  "--batch_size=" + base.params.batchSize,
                ],
                env:
                  if useGcpCredentials then
                    [{
                      name: "GOOGLE_APPLICATION_CREDENTIALS",
                      value: "/secret/gcp-credentials/key.json",
                    }]
                  else [],
                resources: {
                  limits: {
                    // Only request GPUs when at least one is asked for.
                    [if base.params.numGpus > 0 then "nvidia.com/gpu"]: base.params.numGpus,
                  },
                },
                // Explicit empty list when no secret is configured, matching
                // the env and volumes fields.
                volumeMounts+:
                  if useGcpCredentials then [
                    {
                      name: "gcp-credentials",
                      readOnly: true,
                      mountPath: "/secret/gcp-credentials",
                    },
                  ] else [],
              },  // container
            ],  // containers
            restartPolicy: "Never",
            activeDeadlineSeconds: 3000,
            // See: https://github.com/kubeflow/kubeflow/tree/master/components/k8s-model-server#set-the-user-optional
            // This user and group should be defined in the Docker image.
            // Per best practices we don't run as the root user.
            securityContext: {
              runAsUser: 1000,
              fsGroup: 1000,
            },
            volumes:
              if useGcpCredentials then [
                {
                  name: "gcp-credentials",
                  secret: {
                    secretName: base.params.gcpCredentialSecretName,
                  },
                },
              ] else [],
          },  // pod spec
        },  // template
      },  // job spec
    },  // bpJob
  },  // parts
};
// ksonnet appears to require name be a parameter of the prototype, so it is
// read on its own and re-applied on top of the merged parameters below.
local name = params.name;
// Environment values first; user-supplied params win on conflicting keys.
local updatedParams = env + params;

local tfBatchPredict = tfBatchPredictBase + {
  // Override the base defaults with the user supplied parameters.
  params+: updatedParams + { name: name },
};

// Emit the Job as a one-element list, dropping empty/null fields.
local jobs = [tfBatchPredict.parts.bpJob];
std.prune(k.core.v1.list.new(jobs))

View File

@ -0,0 +1,91 @@
// @apiVersion 0.1
// @name io.ksonnet.pkg.tf-job-simple-v1alpha1
// @description tf-job-simple
// @shortDescription A simple TFJob to run CNN benchmark
// @param name string Name to give to each of the components
local k = import "k.libsonnet";
local name = import "param://name";
local namespace = "default";
local image = "gcr.io/kubeflow/tf-benchmarks-cpu:v20171202-bdab599-dirty-284af3";

// Container shared by every replica: runs the tf_cnn_benchmarks script on CPU.
local benchmarkContainer = {
  name: "tensorflow",
  image: image,
  workingDir: "/opt/tf-benchmarks/scripts/tf_cnn_benchmarks",
  args: [
    "python",
    "tf_cnn_benchmarks.py",
    "--batch_size=32",
    "--model=resnet50",
    "--variable_update=parameter_server",
    "--flush_stdout=true",
    "--num_gpus=1",
    "--local_parameter_device=cpu",
    "--device=cpu",
    "--data_format=NHWC",
  ],
};

// Builds a single-replica spec of the given tfReplicaType around the shared container.
local replicaSpec(replicaType) = {
  replicas: 1,
  template: {
    spec: {
      containers: [benchmarkContainer],
      restartPolicy: "OnFailure",
    },
  },
  tfReplicaType: replicaType,
};

// v1alpha1 TFJob with one WORKER and one PS replica; WORKER index 0 is the chief.
local tfjob = {
  apiVersion: "kubeflow.org/v1alpha1",
  kind: "TFJob",
  metadata: {
    name: name,
    namespace: namespace,
  },
  spec: {
    replicaSpecs: [replicaSpec(replicaType) for replicaType in ["WORKER", "PS"]],
    terminationPolicy: {
      chief: {
        replicaIndex: 0,
        replicaName: "WORKER",
      },
    },
    tfimage: image,
  },
};

k.core.v1.list.new([tfjob])

View File

@ -0,0 +1,82 @@
// @apiVersion 0.1
// @name io.ksonnet.pkg.tf-job-simple
// @description tf-job-simple
// @shortDescription A simple TFJob to run CNN benchmark
// @param name string Name for the job.
local k = import "k.libsonnet";
local name = params.name;
local namespace = env.namespace;
local image = "gcr.io/kubeflow/tf-benchmarks-cpu:v20171202-bdab599-dirty-284af3";

// Pod template shared by the Worker and PS replicas: a single container that
// runs the tf_cnn_benchmarks script on CPU.
local benchmarkPodTemplate = {
  spec: {
    containers: [
      {
        args: [
          "python",
          "tf_cnn_benchmarks.py",
          "--batch_size=32",
          "--model=resnet50",
          "--variable_update=parameter_server",
          "--flush_stdout=true",
          "--num_gpus=1",
          "--local_parameter_device=cpu",
          "--device=cpu",
          "--data_format=NHWC",
        ],
        image: image,
        name: "tensorflow",
        workingDir: "/opt/tf-benchmarks/scripts/tf_cnn_benchmarks",
      },
    ],
    restartPolicy: "OnFailure",
  },
};

// v1alpha2 TFJob with one Worker and one parameter server.
local tfjob = {
  apiVersion: "kubeflow.org/v1alpha2",
  kind: "TFJob",
  metadata: {
    name: name,
    namespace: namespace,
  },
  spec: {
    // In v1alpha2 the replica type is the map key, so the v1alpha1-style
    // `tfReplicaType` field that used to sit on the parameter server entry is
    // dropped. That entry is keyed "PS" and given an explicit replica count so
    // both entries have the same shape.
    tfReplicaSpecs: {
      Worker: {
        replicas: 1,
        template: benchmarkPodTemplate,
      },
      PS: {
        replicas: 1,
        template: benchmarkPodTemplate,
      },
    },
  },
};

k.core.v1.list.new([
  tfjob,
])

View File

@ -0,0 +1,94 @@
// @apiVersion 0.1
// @name io.ksonnet.pkg.tf-serving-simple
// @description tf-serving-simple
// @shortDescription tf-serving-simple
// @param name string Name to give to each of the components
local k = import "k.libsonnet";
local namespace = "default";
local appName = import "param://name";
local modelBasePath = "gs://kubeflow-models/inception";
local modelName = "inception";
local image = "gcr.io/kubeflow-images-public/tf-model-server-cpu:v20180327-995786ec";

// ClusterIP Service exposing the model server's gRPC port (9000).
local service = {
  // The same label both tags the Service and selects its pods.
  local appLabel = { app: appName },

  apiVersion: "v1",
  kind: "Service",
  metadata: {
    name: appName,
    namespace: namespace,
    labels: appLabel,
  },
  spec: {
    type: "ClusterIP",
    selector: appLabel,
    ports: [
      { name: "grpc-tf-serving", port: 9000, targetPort: 9000 },
    ],
  },
};
// Container running tensorflow_model_server for modelName from modelBasePath on port 9000.
local modelServerContainer = {
  name: "inception",
  image: image,
  imagePullPolicy: "IfNotPresent",
  args: [
    "/usr/bin/tensorflow_model_server",
    "--port=9000",
    "--model_name=" + modelName,
    "--model_base_path=" + modelBasePath,
  ],
  ports: [
    { containerPort: 9000 },
  ],
  resources: {
    requests: { cpu: "1", memory: "1Gi" },
    limits: { cpu: "4", memory: "4Gi" },
  },
};

// Deployment wrapping the model server container; pods carry the app label.
local deployment = {
  local appLabel = { app: appName },

  apiVersion: "extensions/v1beta1",
  kind: "Deployment",
  metadata: {
    name: appName,
    namespace: namespace,
    labels: appLabel,
  },
  spec: {
    template: {
      metadata: {
        labels: appLabel,
      },
      spec: {
        containers: [modelServerContainer],
      },
    },
  },
};

local resources = [service, deployment];
k.core.v1.list.new(resources)

View File

@ -0,0 +1,179 @@
// @apiVersion 0.1
// @name io.ksonnet.pkg.tf-serving-with-istio
// @description tf-serving-with-istio
// @shortDescription tf-serving-with-istio
// @param name string Name to give to each of the components
local k = import "k.libsonnet";
local namespace = "default";
local appName = import "param://name";
local modelBasePath = "gs://kubeflow-models/inception";
local modelName = "inception";
local image = "gcr.io/kubeflow-images-public/tf-model-server-cpu:v20180327-995786ec";
local httpProxyImage = "gcr.io/kubeflow-images-public/tf-model-server-http-proxy:v20180327-995786ec";

// Istio RouteRule sending traffic for the serving Service to its "v1" subset.
// NOTE(review): the pod template in this file only sets the `app` label;
// confirm that pods actually carry `version: v1`, otherwise the route below
// selects nothing.
local routeRule = {
  apiVersion: "config.istio.io/v1alpha2",
  kind: "RouteRule",
  metadata: {
    name: appName,
    namespace: namespace,
  },
  spec: {
    // Target the Service this prototype creates, which is named appName. The
    // destination was previously hard-coded to "tf-serving" and only matched
    // when the component happened to be given that name.
    destination: {
      name: appName,
    },
    precedence: 0,
    route: [
      {
        labels: {
          version: "v1",
        },
      },
    ],
  },
};
// Service exposing the model server's gRPC port (9000) and the HTTP proxy
// (8000). The Ambassador annotation publishes the HTTP proxy under
// /models/<appName>/ for GET and POST.
local service = {
  // Backend address of the HTTP proxy port on this Service. It is derived from
  // appName because that is the name the Service is created with; the mappings
  // used to hard-code "tf-serving" and so pointed at a Service that only
  // existed when the component was given that name. With appName set to
  // "tf-serving" the rendered annotation is unchanged.
  local proxyBackend = appName + "." + namespace + ":8000",
  local routePrefix = "/models/" + appName + "/",

  apiVersion: "v1",
  kind: "Service",
  metadata: {
    annotations: {
      "getambassador.io/config":
        std.join("\n", [
          "---",
          "apiVersion: ambassador/v0",
          "kind: Mapping",
          "name: tfserving-mapping-" + appName + "-get",
          "prefix: " + routePrefix,
          "rewrite: /",
          "method: GET",
          "service: " + proxyBackend,
          "---",
          "apiVersion: ambassador/v0",
          "kind: Mapping",
          "name: tfserving-mapping-" + appName + "-post",
          "prefix: " + routePrefix,
          // NOTE(review): the rewrite target is left as-is; the server is
          // started with a different --model_name, so confirm which name the
          // HTTP proxy expects in this path.
          "rewrite: /model/tf-serving:predict",
          "method: POST",
          "service: " + proxyBackend,
        ]),
    },
    labels: {
      app: appName,
    },
    name: appName,
    namespace: namespace,
  },
  spec: {
    ports: [
      {
        name: "grpc-tf-serving",
        port: 9000,
        targetPort: 9000,
      },
      {
        name: "http-tf-serving-proxy",
        port: 8000,
        targetPort: 8000,
      },
    ],
    selector: {
      app: appName,
    },
    type: "ClusterIP",
  },
};
// Deployment running two containers per pod: the TensorFlow model server
// (gRPC, port 9000) and an HTTP proxy in front of it (port 8000). The pod is
// annotated for Istio sidecar injection.
local deployment = {
  // Model server container serving modelName from modelBasePath.
  local modelServerContainer = {
    args: [
      "/usr/bin/tensorflow_model_server",
      "--port=9000",
      "--model_name=" + modelName,
      "--model_base_path=" + modelBasePath,
    ],
    image: image,
    imagePullPolicy: "IfNotPresent",
    name: "inception",
    ports: [
      {
        containerPort: 9000,
      },
    ],
    resources: {
      limits: {
        cpu: "4",
        memory: "4Gi",
      },
      requests: {
        cpu: "1",
        memory: "1Gi",
      },
    },
  },

  // HTTP proxy container that forwards requests to the gRPC port above.
  local httpProxyContainer = {
    name: appName + "-http-proxy",
    image: httpProxyImage,
    imagePullPolicy: "IfNotPresent",
    command: [
      "python",
      "/usr/src/app/server.py",
      "--port=8000",
      "--rpc_port=9000",
      "--rpc_timeout=10.0",
    ],
    env: [],
    ports: [
      {
        containerPort: 8000,
      },
    ],
    resources: {
      requests: {
        memory: "1Gi",
        cpu: "1",
      },
      limits: {
        memory: "4Gi",
        cpu: "4",
      },
    },
    // Only runAsUser stays here; fsGroup is not a container-level field.
    securityContext: {
      runAsUser: 1000,
    },
  },

  apiVersion: "extensions/v1beta1",
  kind: "Deployment",
  metadata: {
    labels: {
      app: appName,
    },
    name: appName,
    namespace: namespace,
  },
  spec: {
    template: {
      metadata: {
        labels: {
          app: appName,
        },
        annotations: {
          "sidecar.istio.io/inject": "true",
        },
      },
      spec: {
        // fsGroup exists only on the pod-level securityContext, so it is set
        // here instead of on the proxy container. As a pod-level setting it
        // applies to every container in the pod.
        securityContext: {
          fsGroup: 1000,
        },
        containers: [
          modelServerContainer,
          httpProxyContainer,
        ],
      },
    },
  },
};

k.core.v1.list.new([
  routeRule,
  service,
  deployment,
])