diff --git a/mnist/README.md b/mnist/README.md index 3d101c08..e31a4246 100644 --- a/mnist/README.md +++ b/mnist/README.md @@ -473,7 +473,74 @@ kubectl port-forward ${PODNAME} 6006:6006 Tensorboard can now be accessed at [http://127.0.0.1:6006](http://127.0.0.1:6006). -## Using Tensorflow serving +## Serving the model + +The model code will export the model in saved model format which is suitable for serving with TensorFlow serving. + +To serve the model follow the instructions below. The instructions vary slightly based on where you are storing your +model (e.g. GCS, S3, PVC). Depending on the storage system we provide different ksonnet components as a convenience +for setting relevant environment variables. + + +### GCS + +Here we show how to serve the model when it is stored on GCS. This assumes that when you trained the model you set `exportDir` to a GCS +URI; if not you can always copy it to GCS using `gsutil`. + +Check that a model was exported + +``` +gsutil ls -r ${EXPORT_DIR} + +``` + +The output should look something like + +``` +gs://${EXPORT_DIR}/1547100373/saved_model.pb +gs://${EXPORT_DIR}/1547100373/variables/: +gs://${EXPORT_DIR}/1547100373/variables/ +gs://${EXPORT_DIR}/1547100373/variables/variables.data-00000-of-00001 +gs://${EXPORT_DIR}/1547100373/variables/variables.index +``` + +The number `1547100373` is a version number auto-generated by TensorFlow; it will vary on each run but should be monotonically increasing if you save a model to the same location as a previous export. 
+ + +Set your model path + +``` +ks param set ${ENV} mnist-deploy-gcp modelBasePath ${EXPORT_DIR} + +``` + +Deploy it + +``` +ks apply ${ENV} -c mnist-deploy-gcp +``` + +You can check the deployment by running + +``` +kubectl describe deployments mnist-deploy-gcp +``` + +### S3 + +TODO: Add instructions + +### PVC + +TODO: Add instructions + +### Create the K8s service + +Next we need to create a K8s service to route traffic to our model + +``` +ks apply ${ENV} -c mnist-service +``` By default the workflow deploys our model via Tensorflow Serving. Included in this example is a client that can query your model and provide results: diff --git a/mnist/ks_app/app.lock b/mnist/ks_app/app.lock new file mode 100755 index 00000000..e69de29b diff --git a/mnist/ks_app/app.yaml b/mnist/ks_app/app.yaml index 003c82c8..6d85474f 100644 --- a/mnist/ks_app/app.yaml +++ b/mnist/ks_app/app.yaml @@ -2,14 +2,28 @@ apiVersion: 0.3.0 environments: jlewi: destination: - namespace: kubeflow + namespace: jlewi server: https://35.196.210.94 k8sVersion: v1.11.5 path: jlewi + test-env-d5e3: + destination: + namespace: jlewi + server: https://35.196.210.94 + k8sVersion: v1.11.5 + path: test-env-d5e3 kind: ksonnet.io/app +libraries: + kubeflow/tf-serving: + name: tf-serving + registry: kubeflow + version: fed535eaa276220e4edf59530c0629f4375a40a9 name: ks_app registries: incubator: protocol: github uri: github.com/ksonnet/parts/tree/master/incubator + kubeflow: + protocol: github + uri: github.com/kubeflow/kubeflow/tree/v0.4-branch/kubeflow version: 0.0.1 diff --git a/mnist/ks_app/components/mnist-deploy-aws.jsonnet b/mnist/ks_app/components/mnist-deploy-aws.jsonnet new file mode 100644 index 00000000..9b466168 --- /dev/null +++ b/mnist/ks_app/components/mnist-deploy-aws.jsonnet @@ -0,0 +1,39 @@ +local env = std.extVar("__ksonnet/environments"); +local params = std.extVar("__ksonnet/params").components["mnist-deploy-aws"]; + +local k = import "k.libsonnet"; +local deployment = 
k.apps.v1beta1.deployment; +local container = deployment.mixin.spec.template.spec.containersType; + +local util = import "kubeflow/tf-serving/util.libsonnet"; +local tfserving = import "kubeflow/tf-serving/tf-serving-template.libsonnet"; + +local base = tfserving.new(env, params); +local tfDeployment = base.tfDeployment + + deployment.mapContainers( + function(c) { + result:: + c + container.withEnvMixin( + if util.toBool(params.s3Enable) then ( + [ + { + name: "AWS_ACCESS_KEY_ID", + valueFrom: { secretKeyRef: { name: params.s3SecretName, key: params.s3SecretAccesskeyidKeyName } }, + }, + { + name: "AWS_SECRET_ACCESS_KEY", + valueFrom: { secretKeyRef: { name: params.s3SecretName, key: params.s3SecretSecretaccesskeyKeyName } }, + }, + { name: "AWS_REGION", value: params.s3AwsRegion }, + { name: "S3_USE_HTTPS", value: std.toString(params.s3UseHttps) }, + { name: "S3_VERIFY_SSL", value: std.toString(params.s3VerifySsl) }, + { name: "S3_ENDPOINT", value: params.s3Endpoint }, + ] + ) else [], + ), + }.result, + ); +util.list([ + tfDeployment, + base.tfservingConfig, +],) diff --git a/mnist/ks_app/components/mnist-deploy-gcp.jsonnet b/mnist/ks_app/components/mnist-deploy-gcp.jsonnet new file mode 100644 index 00000000..a8fd438b --- /dev/null +++ b/mnist/ks_app/components/mnist-deploy-gcp.jsonnet @@ -0,0 +1,47 @@ +local env = std.extVar("__ksonnet/environments"); +local params = std.extVar("__ksonnet/params").components["mnist-deploy-gcp"]; + +local k = import "k.libsonnet"; +local deployment = k.apps.v1beta1.deployment; +local container = deployment.mixin.spec.template.spec.containersType; + +local util = import "kubeflow/tf-serving/util.libsonnet"; +local tfserving = import "kubeflow/tf-serving/tf-serving-template.libsonnet"; + +local base = tfserving.new(env, params); +local tfDeployment = base.tfDeployment + + deployment.mixin.spec.template.spec.withVolumesMixin( + if params.gcpCredentialSecretName != "null" then ( + [{ + name: "gcp-credentials", + secret: { + 
secretName: params.gcpCredentialSecretName, + }, + }] + ) else [], + ) + + deployment.mapContainers( + function(c) { + result:: + c + container.withEnvMixin( + if params.gcpCredentialSecretName != "null" then ( + [{ + name: "GOOGLE_APPLICATION_CREDENTIALS", + value: "/secret/gcp-credentials/user-gcp-sa.json", + }] + ) else [], + ) + + container.withVolumeMountsMixin( + if params.gcpCredentialSecretName != "null" then ( + [{ + name: "gcp-credentials", + mountPath: "/secret/gcp-credentials", + }] + ) else [], + ), + }.result, + ); +util.list([ + tfDeployment, + base.tfservingConfig, +],) diff --git a/mnist/ks_app/components/mnist-service.jsonnet b/mnist/ks_app/components/mnist-service.jsonnet new file mode 100644 index 00000000..93073b3b --- /dev/null +++ b/mnist/ks_app/components/mnist-service.jsonnet @@ -0,0 +1,8 @@ +local env = std.extVar("__ksonnet/environments"); +local params = std.extVar("__ksonnet/params").components["mnist-service"]; + +local k = import "k.libsonnet"; +local tfservingService = import "kubeflow/tf-serving/tf-serving-service-template.libsonnet"; +local util = import "kubeflow/tf-serving/util.libsonnet"; + +tfservingService.new(env, params).all diff --git a/mnist/ks_app/components/params.libsonnet b/mnist/ks_app/components/params.libsonnet index ae4c47cf..8cd9e65c 100644 --- a/mnist/ks_app/components/params.libsonnet +++ b/mnist/ks_app/components/params.libsonnet @@ -4,15 +4,58 @@ train: { batchSize: 100, envVariables: 'GOOGLE_APPLICATION_CREDENTIALS=/var/secrets/user-gcp-sa.json', - exportDir: 'gs://kubeflow-ci_temp/mnist-jlewi', - image: 'gcr.io/kubeflow-examples/mnist/model:v20190108-v0.2-137-g38daafa-dirty-911944', + exportDir: 'gs://kubeflow-ci_temp/mnist-jlewi/export', + image: 'gcr.io/kubeflow-examples/mnist/model:v20190111-v0.2-148-g313770f', learningRate: '0.01', modelDir: 'gs://kubeflow-ci_temp/mnist-jlewi', name: 'mnist-train', - numPs: 1, - numWorkers: 2, + numPs: 0, + numWorkers: 0, secret: '', trainSteps: 200, }, + 
"mnist-deploy-gcp": { + defaultCpuImage: 'tensorflow/serving:1.11.1', + defaultGpuImage: 'tensorflow/serving:1.11.1-gpu', + deployHttpProxy: 'false', + enablePrometheus: 'true', + gcpCredentialSecretName: 'user-gcp-sa', + httpProxyImage: '', + injectIstio: 'false', + modelBasePath: 'gs://kubeflow-examples-data/mnist', + modelName: 'mnist', + name: 'mnist-deploy-gcp', + numGpus: '0', + versionName: 'v1', + }, + "mnist-deploy-aws": { + defaultCpuImage: 'tensorflow/serving:1.11.1', + defaultGpuImage: 'tensorflow/serving:1.11.1-gpu', + deployHttpProxy: 'false', + enablePrometheus: 'true', + httpProxyImage: 'gcr.io/kubeflow-images-public/tf-model-server-http-proxy:v20180723', + injectIstio: 'false', + modelBasePath: 's3://kubeflow-examples-data/mnist', + modelName: 'null', + name: 'mnist-deploy-aws', + numGpus: '0', + s3AwsRegion: 'us-west-1', + s3Enable: 'false', + s3Endpoint: 's3.us-west-1.amazonaws.com', + s3SecretAccesskeyidKeyName: 'AWS_ACCESS_KEY_ID', + s3SecretName: 'null', + s3SecretSecretaccesskeyKeyName: 'AWS_SECRET_ACCESS_KEY', + s3UseHttps: 'true', + s3VerifySsl: 'true', + versionName: 'v1', + }, + "mnist-service": { + enablePrometheus: 'true', + injectIstio: 'false', + modelName: 'null', + name: 'mnist-service', + serviceType: 'ClusterIP', + trafficRule: 'v1:100', + }, }, } \ No newline at end of file diff --git a/mnist/ks_app/components/train.jsonnet b/mnist/ks_app/components/train.jsonnet index c3c98e33..6a8eeb17 100644 --- a/mnist/ks_app/components/train.jsonnet +++ b/mnist/ks_app/components/train.jsonnet @@ -43,8 +43,9 @@ local trainEnv = [ }, ]; -local secretName = std.split(params.secret, "=")[0]; -local secretMountPath = std.split(params.secret, "=")[1]; +local secretPieces = std.split(params.secret, "="); +local secretName = if std.length(secretPieces) > 0 then secretPieces[0] else ""; +local secretMountPath = if std.length(secretPieces) > 1 then secretPieces[1] else ""; local replicaSpec = { containers: [ diff --git 
a/mnist/ks_app/environments/jlewi/params.libsonnet b/mnist/ks_app/environments/jlewi/params.libsonnet index 2a71c688..9b000e84 100644 --- a/mnist/ks_app/environments/jlewi/params.libsonnet +++ b/mnist/ks_app/environments/jlewi/params.libsonnet @@ -8,6 +8,23 @@ local envParams = params + { train+: { name: 'mnist-train-dist', secret: 'user-gcp-sa=/var/secrets', + numSteps: 10, + image: 'gcr.io/kubeflow-examples/mnist/model:v20190111-v0.2-146-g0bbff62-dirty-12f353', + numWorkers: 2, + numPs: 1, + }, + "deploy-gcp"+: { + modelBasePath: 'gs://kubeflow-ci_temp/mnist-jlewi/export', + }, + "mnist-deploy-gcp"+: { + modelBasePath: 'gs://kubeflow-ci_temp/mnist-jlewi/export', + name: 'jlewi-deploy-test', + namespace: 'jlewi', + }, + "mnist-service"+: { + name: 'jlewi-deploy-test', + namespace: 'jlewi', + modelBasePath: 'gs://kubeflow-ci_temp/mnist-jlewi/export', }, }, }; diff --git a/mnist/ks_app/environments/test-env-d5e3/globals.libsonnet b/mnist/ks_app/environments/test-env-d5e3/globals.libsonnet new file mode 100644 index 00000000..7a73a41b --- /dev/null +++ b/mnist/ks_app/environments/test-env-d5e3/globals.libsonnet @@ -0,0 +1,2 @@ +{ +} \ No newline at end of file diff --git a/mnist/ks_app/environments/test-env-d5e3/main.jsonnet b/mnist/ks_app/environments/test-env-d5e3/main.jsonnet new file mode 100644 index 00000000..1d4f6425 --- /dev/null +++ b/mnist/ks_app/environments/test-env-d5e3/main.jsonnet @@ -0,0 +1,9 @@ +local base = import "base.libsonnet"; +// uncomment if you reference ksonnet-lib +// local k = import "k.libsonnet"; +// local deployment = k.apps.v1beta2.deployment; + +base + { + // Insert user-specified overrides here. 
For example if a component is named \"nginx-deployment\", you might have something like:\n") + // "nginx-deployment"+: deployment.mixin.metadata.withLabels({foo: "bar"}) +} diff --git a/mnist/ks_app/environments/test-env-d5e3/params.libsonnet b/mnist/ks_app/environments/test-env-d5e3/params.libsonnet new file mode 100644 index 00000000..9030ac51 --- /dev/null +++ b/mnist/ks_app/environments/test-env-d5e3/params.libsonnet @@ -0,0 +1,28 @@ +local params = std.extVar('__ksonnet/params'); +local globals = import 'globals.libsonnet'; +local envParams = params + { + components+: { + train+: { + name: 'jlewi-deploy-test', + namespace: 'jlewi', + modelBasePath: 'gs://kubeflow-ci_temp/mnist-jlewi/export', + }, + "mnist-deploy-gcp"+: { + name: 'jlewi-deploy-test', + namespace: 'jlewi', + modelBasePath: 'gs://kubeflow-ci_temp/mnist-jlewi/export', + }, + "mnist-service"+: { + name: 'jlewi-deploy-test', + namespace: 'jlewi', + modelBasePath: 'gs://kubeflow-ci_temp/mnist-jlewi/export', + }, + }, +}; + +{ + components: { + [x]: envParams.components[x] + globals + for x in std.objectFields(envParams.components) + }, +} \ No newline at end of file diff --git a/mnist/ks_app/vendor/kubeflow/tf-serving@fed535eaa276220e4edf59530c0629f4375a40a9/README.md b/mnist/ks_app/vendor/kubeflow/tf-serving@fed535eaa276220e4edf59530c0629f4375a40a9/README.md new file mode 100644 index 00000000..3cf13ada --- /dev/null +++ b/mnist/ks_app/vendor/kubeflow/tf-serving@fed535eaa276220e4edf59530c0629f4375a40a9/README.md @@ -0,0 +1,73 @@ + + +**Table of Contents** *generated with [DocToc](https://github.com/thlorenz/doctoc)* + +- [tf-serving](#tf-serving) + - [Quickstart](#quickstart) + - [Using the library](#using-the-library) + - [io.ksonnet.pkg.tf-serving](#ioksonnetpkgtf-serving) + - [Example](#example) + - [Parameters](#parameters) + + + +# tf-serving + +> TensorFlow serving is a server for TensorFlow models. 
+ + +* [Quickstart](#quickstart) +* [Using Prototypes](#using-prototypes) + * [io.ksonnet.pkg.tf-serving](#io.ksonnet.pkg.tf-serving) + +## Quickstart + +*The following commands use the `io.ksonnet.pkg.tf-serving` prototype to generate Kubernetes YAML for tf-serving, and then deploys it to your Kubernetes cluster.* + +First, create a cluster and install the ksonnet CLI (see root-level [README.md](rootReadme)). + +If you haven't yet created a [ksonnet application](linkToSomewhere), do so using `ks init `. + +Finally, in the ksonnet application directory, run the following: + +```shell +# Expand prototype as a Jsonnet file, place in a file in the +# `components/` directory. (YAML and JSON are also available.) +$ ks prototype use io.ksonnet.pkg.tf-serving tf-serving \ + --name tf-serving \ + --namespace default + +# Apply to server. +$ ks apply -f tf-serving.jsonnet +``` + +## Using the library + +The library files for tf-serving define a set of relevant *parts* (_e.g._, deployments, services, secrets, and so on) that can be combined to configure tf-serving for a wide variety of scenarios. For example, a database like Redis may need a secret to hold the user password, or it may have no password if it's acting as a cache. + +This library provides a set of pre-fabricated "flavors" (or "distributions") of tf-serving, each of which is configured for a different use case. These are captured as ksonnet *prototypes*, which allow users to interactively customize these distributions for their specific needs. + +These prototypes, as well as how to use them, are enumerated below. + +### io.ksonnet.pkg.tf-serving + +TensorFlow serving +#### Example + +```shell +# Expand prototype as a Jsonnet file, place in a file in the +# `components/` directory. (YAML and JSON are also available.) 
+$ ks prototype use io.ksonnet.pkg.tf-serving tf-serving \ + --name YOUR_NAME_HERE \ + --model_path YOUR_MODEL_PATH_HERE +``` + +#### Parameters + +The available options to pass prototype are: + +* `--name=`: Name to give to each of the components [string] +* `--model_path=`: Path to the model. This can be a GCS path. [string] + + +[rootReadme]: https://github.com/ksonnet/mixins diff --git a/mnist/ks_app/vendor/kubeflow/tf-serving@fed535eaa276220e4edf59530c0629f4375a40a9/parts.yaml b/mnist/ks_app/vendor/kubeflow/tf-serving@fed535eaa276220e4edf59530c0629f4375a40a9/parts.yaml new file mode 100644 index 00000000..89955568 --- /dev/null +++ b/mnist/ks_app/vendor/kubeflow/tf-serving@fed535eaa276220e4edf59530c0629f4375a40a9/parts.yaml @@ -0,0 +1,35 @@ +{ + "name": "tf-serving", + "apiVersion": "0.0.1", + "kind": "ksonnet.io/parts", + "description": "TensorFlow serving is a server for TensorFlow models.\n", + "author": "kubeflow team ", + "contributors": [ + { + "name": "Jeremy Lewi", + "email": "jlewi@google.com" + } + ], + "repository": { + "type": "git", + "url": "https://github.com/kubeflow/kubeflow" + }, + "bugs": { + "url": "https://github.com/kubeflow/kubeflow/issues" + }, + "keywords": [ + "kubeflow", + "tensorflow", + "database" + ], + "quickStart": { + "prototype": "io.ksonnet.pkg.tf-serving", + "componentName": "tf-serving", + "flags": { + "name": "tf-serving", + "namespace": "default" + }, + "comment": "Run TensorFlow Serving" + }, + "license": "Apache 2.0" +} diff --git a/mnist/ks_app/vendor/kubeflow/tf-serving@fed535eaa276220e4edf59530c0629f4375a40a9/prototypes/tf-serving-all-features.jsonnet b/mnist/ks_app/vendor/kubeflow/tf-serving@fed535eaa276220e4edf59530c0629f4375a40a9/prototypes/tf-serving-all-features.jsonnet new file mode 100644 index 00000000..e7482ac1 --- /dev/null +++ b/mnist/ks_app/vendor/kubeflow/tf-serving@fed535eaa276220e4edf59530c0629f4375a40a9/prototypes/tf-serving-all-features.jsonnet @@ -0,0 +1,23 @@ +// @apiVersion 0.1 +// @name 
io.ksonnet.pkg.tf-serving +// @description TensorFlow serving +// @shortDescription A TensorFlow serving deployment +// @param name string Name to give to each of the components + +local k = import "k.libsonnet"; + +// ksonnet appears to require name be a parameter of the prototype which is why we handle it differently. +local name = import "param://name"; + +// updatedParams includes the namespace from env by default. +local updatedParams = params + env; + +local tfServingBase = import "kubeflow/tf-serving/tf-serving.libsonnet"; +local tfServing = tfServingBase { + // Override parameters with user supplied parameters. + params+: updatedParams { + name: name, + }, +}; + +std.prune(k.core.v1.list.new(tfServing.components)) diff --git a/mnist/ks_app/vendor/kubeflow/tf-serving@fed535eaa276220e4edf59530c0629f4375a40a9/prototypes/tf-serving-aws.jsonnet b/mnist/ks_app/vendor/kubeflow/tf-serving@fed535eaa276220e4edf59530c0629f4375a40a9/prototypes/tf-serving-aws.jsonnet new file mode 100644 index 00000000..32045d19 --- /dev/null +++ b/mnist/ks_app/vendor/kubeflow/tf-serving@fed535eaa276220e4edf59530c0629f4375a40a9/prototypes/tf-serving-aws.jsonnet @@ -0,0 +1,61 @@ +// @apiVersion 0.1 +// @name io.ksonnet.pkg.tf-serving-deployment-aws +// @description TensorFlow serving +// @shortDescription A TensorFlow serving deployment +// @param name string Name to give to each of the components +// @optionalParam numGpus string 0 Number of gpus to use +// @optionalParam deployHttpProxy string false Whether to deploy http proxy +// @optionalParam injectIstio string false Whether to inject istio sidecar; should be true or false. 
+// @optionalParam enablePrometheus string true Whether to enable prometheus endpoint (requires TF 1.11) +// @optionalParam modelBasePath string s3://kubeflow-examples-data/mnist The model path +// @optionalParam modelName string null The model name +// @optionalParam versionName string v1 The version name +// @optionalParam defaultCpuImage string tensorflow/serving:1.11.1 The default model server image (cpu) +// @optionalParam defaultGpuImage string tensorflow/serving:1.11.1-gpu The default model server image (gpu) +// @optionalParam httpProxyImage string gcr.io/kubeflow-images-public/tf-model-server-http-proxy:v20180723 Http proxy image +// @optionalParam s3Enable string false Whether to enable S3 +// Following parameters are needed only if s3Enable is true +// @optionalParam s3SecretName string null Name of the k8s secrets containing S3 credentials +// @optionalParam s3SecretAccesskeyidKeyName string AWS_ACCESS_KEY_ID Name of the key in the k8s secret containing AWS_ACCESS_KEY_ID +// @optionalParam s3SecretSecretaccesskeyKeyName string AWS_SECRET_ACCESS_KEY Name of the key in the k8s secret containing AWS_SECRET_ACCESS_KEY +// @optionalParam s3AwsRegion string us-west-1 S3 region +// @optionalParam s3UseHttps string true Whether or not to use https +// @optionalParam s3VerifySsl string true Whether or not to verify https certificates for S3 connections +// @optionalParam s3Endpoint string s3.us-west-1.amazonaws.com URL for your s3-compatible endpoint + +local k = import "k.libsonnet"; +local deployment = k.apps.v1beta1.deployment; +local container = deployment.mixin.spec.template.spec.containersType; + +local util = import "kubeflow/tf-serving/util.libsonnet"; +local tfserving = import "kubeflow/tf-serving/tf-serving-template.libsonnet"; + +local base = tfserving.new(env, params); +local tfDeployment = base.tfDeployment + + deployment.mapContainers( + function(c) { + result:: + c + container.withEnvMixin( + if util.toBool(params.s3Enable) then ( + [ + { + name: 
"AWS_ACCESS_KEY_ID", + valueFrom: { secretKeyRef: { name: params.s3SecretName, key: params.s3SecretAccesskeyidKeyName } }, + }, + { + name: "AWS_SECRET_ACCESS_KEY", + valueFrom: { secretKeyRef: { name: params.s3SecretName, key: params.s3SecretSecretaccesskeyKeyName } }, + }, + { name: "AWS_REGION", value: params.s3AwsRegion }, + { name: "S3_USE_HTTPS", value: std.toString(params.s3UseHttps) }, + { name: "S3_VERIFY_SSL", value: std.toString(params.s3VerifySsl) }, + { name: "S3_ENDPOINT", value: params.s3Endpoint }, + ] + ) else [], + ), + }.result, + ); +util.list([ + tfDeployment, + base.tfservingConfig, +],) diff --git a/mnist/ks_app/vendor/kubeflow/tf-serving@fed535eaa276220e4edf59530c0629f4375a40a9/prototypes/tf-serving-gcp.jsonnet b/mnist/ks_app/vendor/kubeflow/tf-serving@fed535eaa276220e4edf59530c0629f4375a40a9/prototypes/tf-serving-gcp.jsonnet new file mode 100644 index 00000000..fd106ce2 --- /dev/null +++ b/mnist/ks_app/vendor/kubeflow/tf-serving@fed535eaa276220e4edf59530c0629f4375a40a9/prototypes/tf-serving-gcp.jsonnet @@ -0,0 +1,61 @@ +// @apiVersion 0.1 +// @name io.ksonnet.pkg.tf-serving-deployment-gcp +// @description TensorFlow serving +// @shortDescription A TensorFlow serving deployment +// @param name string Name to give to each of the components +// @optionalParam numGpus string 0 Number of gpus to use +// @optionalParam deployHttpProxy string false Whether to deploy http proxy +// @optionalParam modelBasePath string gs://kubeflow-examples-data/mnist The model path +// @optionalParam modelName string null The model name +// @optionalParam versionName string v1 The version name +// @optionalParam defaultCpuImage string tensorflow/serving:1.11.1 The default model server image (cpu) +// @optionalParam defaultGpuImage string tensorflow/serving:1.11.1-gpu The default model server image (gpu) +// @optionalParam httpProxyImage string gcr.io/kubeflow-images-public/tf-model-server-http-proxy:v20180723 Http proxy image +// @optionalParam 
gcpCredentialSecretName string null If not empty, insert the secret credential +// @optionalParam injectIstio string false Whether to inject istio sidecar; should be true or false. +// @optionalParam enablePrometheus string true Whether to enable prometheus endpoint (requires TF 1.11) + +local k = import "k.libsonnet"; +local deployment = k.apps.v1beta1.deployment; +local container = deployment.mixin.spec.template.spec.containersType; + +local util = import "kubeflow/tf-serving/util.libsonnet"; +local tfserving = import "kubeflow/tf-serving/tf-serving-template.libsonnet"; + +local base = tfserving.new(env, params); +local tfDeployment = base.tfDeployment + + deployment.mixin.spec.template.spec.withVolumesMixin( + if params.gcpCredentialSecretName != "null" then ( + [{ + name: "gcp-credentials", + secret: { + secretName: params.gcpCredentialSecretName, + }, + }] + ) else [], + ) + + deployment.mapContainers( + function(c) { + result:: + c + container.withEnvMixin( + if params.gcpCredentialSecretName != "null" then ( + [{ + name: "GOOGLE_APPLICATION_CREDENTIALS", + value: "/secret/gcp-credentials/user-gcp-sa.json", + }] + ) else [], + ) + + container.withVolumeMountsMixin( + if params.gcpCredentialSecretName != "null" then ( + [{ + name: "gcp-credentials", + mountPath: "/secret/gcp-credentials", + }] + ) else [], + ), + }.result, + ); +util.list([ + tfDeployment, + base.tfservingConfig, +],) diff --git a/mnist/ks_app/vendor/kubeflow/tf-serving@fed535eaa276220e4edf59530c0629f4375a40a9/prototypes/tf-serving-service.jsonnet b/mnist/ks_app/vendor/kubeflow/tf-serving@fed535eaa276220e4edf59530c0629f4375a40a9/prototypes/tf-serving-service.jsonnet new file mode 100644 index 00000000..18682555 --- /dev/null +++ b/mnist/ks_app/vendor/kubeflow/tf-serving@fed535eaa276220e4edf59530c0629f4375a40a9/prototypes/tf-serving-service.jsonnet @@ -0,0 +1,16 @@ +// @apiVersion 0.1 +// @name io.ksonnet.pkg.tf-serving-service +// @description TensorFlow serving +// @shortDescription A 
TensorFlow serving model +// @param name string Name to give to each of the components +// @optionalParam serviceType string ClusterIP The k8s service type for tf serving. +// @optionalParam modelName string null The model name +// @optionalParam trafficRule string v1:100 The traffic rule, in the format of version:percentage,version:percentage,.. +// @optionalParam injectIstio string false Whether to inject istio sidecar; should be true or false. +// @optionalParam enablePrometheus string true Whether to enable prometheus endpoint (requires TF 1.11) + +local k = import "k.libsonnet"; +local tfservingService = import "kubeflow/tf-serving/tf-serving-service-template.libsonnet"; +local util = import "kubeflow/tf-serving/util.libsonnet"; + +tfservingService.new(env, params).all diff --git a/mnist/ks_app/vendor/kubeflow/tf-serving@fed535eaa276220e4edf59530c0629f4375a40a9/prototypes/tf-serving-with-request-log.jsonnet b/mnist/ks_app/vendor/kubeflow/tf-serving@fed535eaa276220e4edf59530c0629f4375a40a9/prototypes/tf-serving-with-request-log.jsonnet new file mode 100644 index 00000000..44ba455a --- /dev/null +++ b/mnist/ks_app/vendor/kubeflow/tf-serving@fed535eaa276220e4edf59530c0629f4375a40a9/prototypes/tf-serving-with-request-log.jsonnet @@ -0,0 +1,230 @@ +// @apiVersion 0.1 +// @name io.ksonnet.pkg.tf-serving-request-log +// @description tf-serving with request logging +// @shortDescription tf-serving with request logging +// @param name string Name to give to each of the components +// @param gcpProject string The gcp project for Bigquery dataset +// @param dataset string The Bigquery dataset +// @param table string The Bigquery table +// @optionalParam modelBasePath string gs://kubeflow-examples-data/mnist The model path +// @optionalParam modelName string mnist The model name + +local k = import "k.libsonnet"; + +local namespace = "kubeflow"; +local appName = import "param://name"; +local image = "gcr.io/kubeflow-images-public/tf-model-server-cpu:v20180327-995786ec"; 
+local httpProxyImage = "gcr.io/kubeflow-images-public/tf-model-server-http-proxy:v20180723"; +local loggingImage = "gcr.io/kubeflow-images-public/tf-model-server-request-logger:v20180723"; + +local gcpSecretName = "user-gcp-sa"; + +local service = { + apiVersion: "v1", + kind: "Service", + metadata: { + labels: { + app: appName, + }, + name: appName, + namespace: namespace, + }, + spec: { + ports: [ + { + name: "grpc-tf-serving", + port: 9000, + targetPort: 9000, + }, + { + name: "http-tf-serving-proxy", + port: 8000, + targetPort: 8000, + }, + ], + selector: { + app: appName, + }, + type: "ClusterIP", + }, +}; + +local configMap = { + apiVersion: "v1", + kind: "ConfigMap", + metadata: { + name: appName + "fluentd-config", + namespace: namespace, + }, + data: { + "fluent.conf": std.format(||| + + @type tail + path /tmp/logs/request.log + pos_file /tmp/logs/request.log.pos + + @type json + + tag dummy + + + @type bigquery_insert + auth_method application_default + project %s + dataset %s + table %s + fetch_schema true + + |||, [params.gcpProject, params.dataset, params.table]), + }, +}; + +local deployment = { + apiVersion: "extensions/v1beta1", + kind: "Deployment", + metadata: { + labels: { + app: appName, + }, + name: appName, + namespace: namespace, + }, + spec: { + template: { + metadata: { + labels: { + app: appName, + }, + }, + spec: { + containers: [ + // ModelServer + { + args: [ + "/usr/bin/tensorflow_model_server", + "--port=9000", + "--model_name=" + params.modelName, + "--model_base_path=" + params.modelBasePath, + ], + image: image, + imagePullPolicy: "IfNotPresent", + name: "model-server", + ports: [ + { + containerPort: 9000, + }, + ], + resources: { + limits: { + cpu: "4", + memory: "4Gi", + }, + requests: { + cpu: "1", + memory: "1Gi", + }, + }, + }, + // Http proxy + { + name: "http-proxy", + image: httpProxyImage, + imagePullPolicy: "Always", + command: [ + "python", + "/usr/src/app/server.py", + "--port=8000", + "--rpc_port=9000", + 
"--rpc_timeout=10.0", + "--log_request=true", + ], + env: [], + ports: [ + { + containerPort: 8000, + }, + ], + resources: { + requests: { + memory: "1Gi", + cpu: "1", + }, + limits: { + memory: "4Gi", + cpu: "4", + }, + }, + securityContext: { + runAsUser: 1000, + fsGroup: 1000, + }, + volumeMounts: [ + { + name: "request-logs", + mountPath: "/tmp/logs", + }, + ], + }, + // TODO(lunkai): use admission controller to inject. + // Logging container. + { + name: "logging", + image: loggingImage, + imagePullPolicy: "Always", + env: [ + { name: "GOOGLE_APPLICATION_CREDENTIALS", value: "/secret/gcp-credentials/key.json" }, + ], + resources: { + requests: { + memory: "250Mi", + cpu: "0.25", + }, + limits: { + memory: "500Mi", + cpu: "0.5", + }, + }, + volumeMounts: [ + { + name: "request-logs", + mountPath: "/tmp/logs", + }, + { + name: "gcp-credentials", + mountPath: "/secret/gcp-credentials", + }, + { + name: "fluentd-config-volume", + mountPath: "/fluentd/etc/custom", + }, + ], + }, + ], + volumes: [ + { + name: "gcp-credentials", + secret: { + secretName: gcpSecretName, + }, + }, + { + name: "request-logs", + emptyDir: {}, + }, + { + configMap: { + name: "fluentd-config", + }, + name: "fluentd-config-volume", + }, + ], + }, + }, + }, +}; + +k.core.v1.list.new([ + service, + deployment, + configMap, +]) diff --git a/mnist/ks_app/vendor/kubeflow/tf-serving@fed535eaa276220e4edf59530c0629f4375a40a9/tests/tf_serving_test.jsonnet b/mnist/ks_app/vendor/kubeflow/tf-serving@fed535eaa276220e4edf59530c0629f4375a40a9/tests/tf_serving_test.jsonnet new file mode 100644 index 00000000..e931d7a4 --- /dev/null +++ b/mnist/ks_app/vendor/kubeflow/tf-serving@fed535eaa276220e4edf59530c0629f4375a40a9/tests/tf_serving_test.jsonnet @@ -0,0 +1,112 @@ +local tfservingService = import "kubeflow/tf-serving/tf-serving-service-template.libsonnet"; +local tfserving = import "kubeflow/tf-serving/tf-serving-template.libsonnet"; + +local params = { + name: "m", + serviceType: "ClusterIP", + modelName: 
"mnist", + trafficRule: "v1:100", + injectIstio: false, +}; + +local istioParams = params { + injectIstio: true, +}; + +local env = { + namespace: "kubeflow", +}; + +local deploymentParam = { + name: "m", + modelName: "mnist", + versionName: "v1", + modelBasePath: "gs://abc", + numGpus: 0, + defaultCpuImage: "gcr.io/abc", + defaultGpuImage: "gcr.io/abc", + injectIstio: false, + enablePrometheus: true, +}; + +local gpuParam1 = { + name: "m", + modelName: "mnist", + versionName: "v1", + modelBasePath: "gs://abc", + numGpus: 1, + defaultCpuImage: "gcr.io/abc", + defaultGpuImage: "gcr.io/abc", + injectIstio: false, + enablePrometheus: true, +}; + +local gpuParamString0 = { + name: "m", + modelName: "mnist", + versionName: "v1", + modelBasePath: "gs://abc", + numGpus: "0", + defaultCpuImage: "gcr.io/abc", + defaultGpuImage: "gcr.io/abc", + injectIstio: false, + enablePrometheus: true, +}; + +local gpuParamString1 = { + name: "m", + modelName: "mnist", + versionName: "v1", + modelBasePath: "gs://abc", + numGpus: "1", + defaultCpuImage: "gcr.io/abc", + defaultGpuImage: "gcr.io/abc", + injectIstio: false, + enablePrometheus: true, +}; + +local serviceInstance = tfservingService.new(env, params); +local istioServiceInstance = tfservingService.new(env, istioParams); + +local deploymentInstance = tfserving.new(env, deploymentParam); + +local gpuInstance = tfserving.new(env, gpuParam1); +local gpuString0Instance = tfserving.new(env, gpuParamString0); +local gpuString1Instance = tfserving.new(env, gpuParamString1); + +// This one should only have tfService +std.assertEqual( + std.length(serviceInstance.all.items), + 1, +) && + +// This one should have tfService, virtualService, and DestinationRule +std.assertEqual( + std.length(istioServiceInstance.all.items), + 3 +) && + +std.startsWith( + deploymentInstance.tfDeployment.spec.template.spec.containers[0].args[4], + "--monitoring_config_file" +) && + +std.assertEqual( + 
deploymentInstance.tfDeployment.spec.template.spec.containers[0].resources.limits, + { cpu: "4", memory: "4Gi" } +) && + +std.assertEqual( + gpuInstance.tfDeployment.spec.template.spec.containers[0].resources.limits, + { cpu: "4", memory: "4Gi", "nvidia.com/gpu": 1 } +) && + +std.assertEqual( + gpuString0Instance.tfDeployment.spec.template.spec.containers[0].resources.limits, + { cpu: "4", memory: "4Gi" } +) && + +std.assertEqual( + gpuString1Instance.tfDeployment.spec.template.spec.containers[0].resources.limits, + { cpu: "4", memory: "4Gi", "nvidia.com/gpu": 1 } +) diff --git a/mnist/ks_app/vendor/kubeflow/tf-serving@fed535eaa276220e4edf59530c0629f4375a40a9/tf-serving-service-template.libsonnet b/mnist/ks_app/vendor/kubeflow/tf-serving@fed535eaa276220e4edf59530c0629f4375a40a9/tf-serving-service-template.libsonnet new file mode 100644 index 00000000..0d633e58 --- /dev/null +++ b/mnist/ks_app/vendor/kubeflow/tf-serving@fed535eaa276220e4edf59530c0629f4375a40a9/tf-serving-service-template.libsonnet @@ -0,0 +1,147 @@ +{ + local k = import "k.libsonnet", + local util = import "kubeflow/tf-serving/util.libsonnet", + new(_env, _params):: { + local params = _params + _env, + local namespace = params.namespace, + local name = params.name, + local modelName = + if params.modelName == "null" then + params.name + else + params.modelName, + + local tfService = { + apiVersion: "v1", + kind: "Service", + metadata: { + labels: { + app: modelName, + }, + name: name, + namespace: namespace, + annotations: { + "getambassador.io/config": + std.join("\n", [ + "---", + "apiVersion: ambassador/v0", + "kind: Mapping", + "name: tfserving-predict-mapping-" + modelName, + "prefix: /tfserving/models/" + modelName, + "rewrite: /v1/models/" + modelName + ":predict", + "method: POST", + "service: " + name + "." 
+ namespace + ":8500", + "---", + "apiVersion: ambassador/v0", + "kind: Mapping", + "name: tfserving-predict-mapping-" + modelName + "-get", + "prefix: /tfserving/models/" + modelName, + "rewrite: /v1/models/" + modelName, + "method: GET", + "service: " + name + "." + namespace + ":8500", + ]), + } + if util.toBool(params.enablePrometheus) then { + "prometheus.io/scrape": "true", + "prometheus.io/path": "/monitoring/prometheus/metrics", + "prometheus.io/port": "8500", + } else {}, //annotations + }, + spec: { + ports: [ + { + name: "grpc-tf-serving", + port: 9000, + targetPort: 9000, + }, + { + name: "http-tf-serving", + port: 8500, + targetPort: 8500, + }, + ], + selector: { + app: modelName, + }, + type: params.serviceType, + }, + }, // tfService + tfService:: tfService, + + local versionWeights = std.split(params.trafficRule, ","), + local virtualService = { + apiVersion: "networking.istio.io/v1alpha3", + kind: "VirtualService", + metadata: { + name: name, + namespace: namespace, + }, + spec: { + hosts: [ + "*", + ], + gateways: [ + "kubeflow-gateway", + ], + http: [ + { + match: [ + { + uri: { + prefix: "/istio/tfserving/models/" + modelName, + }, + method: { + exact: "POST", + }, + }, + ], + rewrite: { + uri: "/v1/models/" + modelName + ":predict", + }, + route: [ + { + destination: { + host: name, + port: { + number: 8500, + }, + subset: std.split(versionWeight, ":")[0], + }, + weight: std.parseInt(std.split(versionWeight, ":")[1]), + } + for versionWeight in versionWeights + ], + }, + ], + }, + }, + virtualService:: virtualService, + + local destinationRule = { + apiVersion: "networking.istio.io/v1alpha3", + kind: "DestinationRule", + metadata: { + name: name, + namespace: namespace, + }, + spec: { + host: name, + subsets: [ + { + name: std.split(versionWeight, ":")[0], + labels: { + version: std.split(versionWeight, ":")[0], + }, + } + for versionWeight in versionWeights + ], + }, + }, + destinationRule:: destinationRule, + all:: util.list([ + tfService, + 
] + if util.toBool(params.injectIstio) then [ + virtualService, + destinationRule, + ] else []), + }, // new +} diff --git a/mnist/ks_app/vendor/kubeflow/tf-serving@fed535eaa276220e4edf59530c0629f4375a40a9/tf-serving-template.libsonnet b/mnist/ks_app/vendor/kubeflow/tf-serving@fed535eaa276220e4edf59530c0629f4375a40a9/tf-serving-template.libsonnet new file mode 100644 index 00000000..a2227002 --- /dev/null +++ b/mnist/ks_app/vendor/kubeflow/tf-serving@fed535eaa276220e4edf59530c0629f4375a40a9/tf-serving-template.libsonnet @@ -0,0 +1,137 @@ +{ + local k = import "k.libsonnet", + local util = import "kubeflow/tf-serving/util.libsonnet", + new(_env, _params):: { + local params = _params + _env, + local namespace = params.namespace, + local name = params.name, + local modelName = + if params.modelName == "null" then + params.name + else + params.modelName, + local versionName = params.versionName, + local numGpus = + if std.type(params.numGpus) == "string" then + std.parseInt(params.numGpus) + else + params.numGpus, + local modelServerImage = + if numGpus == 0 then + params.defaultCpuImage + else + params.defaultGpuImage, + + // Optional features. 
+ // TODO(lunkai): Add request logging + + local modelServerContainer = { + command: [ + "/usr/bin/tensorflow_model_server", + ], + args: [ + "--port=9000", + "--rest_api_port=8500", + "--model_name=" + modelName, + "--model_base_path=" + params.modelBasePath, + ] + if util.toBool(params.enablePrometheus) then [ + "--monitoring_config_file=/var/config/monitoring_config.txt", + ] else [], + image: modelServerImage, + imagePullPolicy: "IfNotPresent", + name: modelName, + ports: [ + { + containerPort: 9000, + }, + { + containerPort: 8500, + }, + ], + env: [], + resources: { + limits: { + cpu: "4", + memory: "4Gi", + } + if numGpus != 0 then { + "nvidia.com/gpu": numGpus, + } else {}, + requests: { + cpu: "1", + memory: "1Gi", + }, + }, + volumeMounts: [ + { + mountPath: "/var/config/", + name: "config-volume", + }, + ], + // TCP liveness probe on gRPC port + livenessProbe: { + tcpSocket: { + port: 9000, + }, + initialDelaySeconds: 30, + periodSeconds: 30, + }, + }, // modelServerContainer + + local tfDeployment = { + apiVersion: "extensions/v1beta1", + kind: "Deployment", + metadata: { + labels: { + app: modelName, + }, + name: name, + namespace: namespace, + }, + spec: { + template: { + metadata: { + labels: { + app: modelName, + version: versionName, + }, + annotations: { + "sidecar.istio.io/inject": if util.toBool(params.injectIstio) then "true", + }, + }, + spec: { + containers: [ + modelServerContainer, + ], + volumes: [ + { + configMap: { + name: name + "-config", + }, + name: "config-volume", + }, + ], + }, + }, + }, + }, // tfDeployment + tfDeployment:: tfDeployment, + + local tfservingConfig = { + apiVersion: "v1", + kind: "ConfigMap", + metadata: { + name: name + "-config", + namespace: namespace, + }, + data: { + "monitoring_config.txt": std.join("\n", [ + "prometheus_config: {", + " enable: true,", + ' path: "/monitoring/prometheus/metrics"', + "}", + ]), + }, + }, // tfservingConfig + tfservingConfig:: tfservingConfig, + }, // new +} diff --git 
a/mnist/ks_app/vendor/kubeflow/tf-serving@fed535eaa276220e4edf59530c0629f4375a40a9/tf-serving.libsonnet b/mnist/ks_app/vendor/kubeflow/tf-serving@fed535eaa276220e4edf59530c0629f4375a40a9/tf-serving.libsonnet new file mode 100644 index 00000000..a8a2e90c --- /dev/null +++ b/mnist/ks_app/vendor/kubeflow/tf-serving@fed535eaa276220e4edf59530c0629f4375a40a9/tf-serving.libsonnet @@ -0,0 +1,380 @@ +{ + util:: import "kubeflow/tf-serving/util.libsonnet", + + // Parameters are intended to be late bound. + params:: { + name: null, + numGpus: 0, + labels: { + app: $.params.name, + }, + modelName: $.params.name, + modelPath: null, + modelStorageType: "storageType", + + version: "v1", + firstVersion: true, + + deployIstio: false, + + deployHttpProxy: false, + httpProxyImage: "gcr.io/kubeflow-images-public/tf-model-server-http-proxy:v20180606-9dfda4f2", + + serviceType: "ClusterIP", + + // If users want to override the image then can override defaultCpuImage and/or defaultGpuImage + // in which case the image used will still depend on whether GPUs are used or not. + // Users can also override modelServerImage in which case the user supplied value will always be used + // regardless of numGpus. + defaultCpuImage: "tensorflow/serving:1.11.1", + defaultGpuImage: "tensorflow/serving:1.11.1-gpu", + modelServerImage: if $.params.numGpus == 0 then + $.params.defaultCpuImage + else + $.params.defaultGpuImage, + + + // Whether or not to enable s3 parameters + s3Enable:: false, + + // Which storageType to use + storageType:: null, + }, + + // Parametes specific to GCP. + gcpParams:: { + gcpCredentialSecretName: "", + } + $.params, + + // Parameters that control S3 access + // params overrides s3params because params can be overwritten by the user to override the defaults. + s3params:: { + // Name of the k8s secrets containing S3 credentials + s3SecretName: "", + // Name of the key in the k8s secret containing AWS_ACCESS_KEY_ID. 
+ s3SecretAccesskeyidKeyName: "AWS_ACCESS_KEY_ID", + + // Name of the key in the k8s secret containing AWS_SECRET_ACCESS_KEY. + s3SecretSecretaccesskeyKeyName: "AWS_SECRET_ACCESS_KEY", + + // S3 region + s3AwsRegion: "us-west-1", + + // TODO(jlewi): We should use util.toBool to automatically conver to actual boolean values. + // The use of strings is left over from when they were prototype parameters which only supports string type. + + // true Whether or not to use https for S3 connections + s3UseHttps: "true", + + // Whether or not to verify https certificates for S3 connections + s3VerifySsl: "true", + + // URL for your s3-compatible endpoint. + s3Endpoint: "http://s3.us-west-1.amazonaws.com,", + } + $.params, + + + components:: { + + all:: [ + // Default routing rule for the first version of model. + if $.util.toBool($.params.deployIstio) && $.util.toBool($.params.firstVersion) then + $.parts.defaultRouteRule, + ] + + // TODO(jlewi): It would be better to structure s3 as a mixin. + // As an example it would be great to allow S3 and GCS parameters + // to be enabled simultaneously. This should be doable because + // each entails adding a set of environment variables and volumes + // to the containers. These volumes/environment variables shouldn't + // overlap so there's no reason we shouldn't be able to just add + // both modifications to the base container. + // I think we want to restructure things as mixins so they can just + // be added. + if $.params.s3Enable then + [ + $.s3parts.tfService, + $.s3parts.tfDeployment, + ] + else if $.params.storageType == "gcp" then + [ + $.gcpParts.tfService, + $.gcpParts.tfDeployment, + ] + else + [ + $.parts.tfService, + $.parts.tfDeployment, + ], + }.all, + + parts:: { + // We define the containers one level beneath parts because combined with jsonnet late binding + // this makes it easy for users to override specific bits of the container. 
+ tfServingContainerBase:: { + name: $.params.name, + image: $.params.modelServerImage, + imagePullPolicy: "IfNotPresent", + command: [ + "/usr/bin/tensorflow_model_server", + ], + args: [ + "--port=9000", + "--model_name=" + $.params.modelName, + "--model_base_path=" + $.params.modelPath, + ], + ports: [ + { + containerPort: 9000, + }, + ], + // TODO(jlewi): We should add readiness and liveness probes. I think the blocker is that + // model-server doesn't have something we can use out of the box. + resources: { + requests: { + memory: "1Gi", + cpu: "1", + }, + limits: { + memory: "4Gi", + cpu: "4", + }, + }, + // The is user and group should be defined in the Docker image. + // Per best practices we don't run as the root user. + securityContext: { + runAsUser: 1000, + fsGroup: 1000, + }, + volumeMounts+: if $.params.modelStorageType == "nfs" then [{ + name: "nfs", + mountPath: "/mnt", + }] + else [], + }, // tfServingContainer + + tfServingContainer+: $.parts.tfServingContainerBase + + if $.params.numGpus > 0 then + { + resources+: { + limits+: { + "nvidia.com/gpu": $.params.numGpus, + }, + }, + } + else {}, + + tfServingMetadata+: { + labels: $.params.labels { version: $.params.version }, + annotations: { + "sidecar.istio.io/inject": if $.util.toBool($.params.deployIstio) then "true", + }, + }, + + httpProxyContainer:: { + name: $.params.name + "-http-proxy", + image: $.params.httpProxyImage, + imagePullPolicy: "IfNotPresent", + command: [ + "python", + "/usr/src/app/server.py", + "--port=8000", + "--rpc_port=9000", + "--rpc_timeout=10.0", + ], + env: [], + ports: [ + { + containerPort: 8000, + }, + ], + resources: { + requests: { + memory: "500Mi", + cpu: "0.5", + }, + limits: { + memory: "1Gi", + cpu: "1", + }, + }, + securityContext: { + runAsUser: 1000, + fsGroup: 1000, + }, + }, // httpProxyContainer + + + tfDeployment: { + apiVersion: "extensions/v1beta1", + kind: "Deployment", + metadata: { + name: $.params.name + "-" + $.params.version, + namespace: 
$.params.namespace, + labels: $.params.labels, + }, + spec: { + template: { + metadata: $.parts.tfServingMetadata, + spec: { + containers: [ + $.parts.tfServingContainer, + if $.util.toBool($.params.deployHttpProxy) then + $.parts.httpProxyContainer, + ], + volumes+: if $.params.modelStorageType == "nfs" then + [{ + name: "nfs", + persistentVolumeClaim: { + claimName: $.params.nfsPVC, + }, + }] + else [], + }, + }, + }, + }, // tfDeployment + + tfService: { + apiVersion: "v1", + kind: "Service", + metadata: { + labels: $.params.labels, + name: $.params.name, + namespace: $.params.namespace, + annotations: { + "getambassador.io/config": + std.join("\n", [ + "---", + "apiVersion: ambassador/v0", + "kind: Mapping", + "name: tfserving-mapping-" + $.params.name + "-get", + "prefix: /models/" + $.params.name + "/", + "rewrite: /", + "method: GET", + "service: " + $.params.name + "." + $.params.namespace + ":8000", + "---", + "apiVersion: ambassador/v0", + "kind: Mapping", + "name: tfserving-mapping-" + $.params.name + "-post", + "prefix: /models/" + $.params.name + "/", + "rewrite: /model/" + $.params.name + ":predict", + "method: POST", + "service: " + $.params.name + "." 
+ $.params.namespace + ":8000", + ]), + }, //annotations + }, + spec: { + ports: [ + { + name: "grpc-tf-serving", + port: 9000, + targetPort: 9000, + }, + { + name: "http-tf-serving-proxy", + port: 8000, + targetPort: 8000, + }, + ], + selector: $.params.labels, + type: $.params.serviceType, + }, + }, // tfService + + defaultRouteRule: { + apiVersion: "config.istio.io/v1alpha2", + kind: "RouteRule", + metadata: { + name: $.params.name + "-default", + namespace: $.params.namespace, + }, + spec: { + destination: { + name: $.params.name, + }, + precedence: 0, + route: [ + { + labels: { version: $.params.version }, + }, + ], + }, + }, + + }, // parts + + // Parts specific to S3 + s3parts:: $.parts { + s3Env:: [ + { name: "AWS_ACCESS_KEY_ID", valueFrom: { secretKeyRef: { name: $.s3params.s3SecretName, key: $.s3params.s3SecretAccesskeyidKeyName } } }, + { name: "AWS_SECRET_ACCESS_KEY", valueFrom: { secretKeyRef: { name: $.s3params.s3SecretName, key: $.s3params.s3SecretSecretaccesskeyKeyName } } }, + { name: "AWS_REGION", value: $.s3params.s3AwsRegion }, + { name: "S3_REGION", value: $.s3params.s3AwsRegion }, + { name: "S3_USE_HTTPS", value: $.s3params.s3UseHttps }, + { name: "S3_VERIFY_SSL", value: $.s3params.s3VerifySsl }, + { name: "S3_ENDPOINT", value: $.s3params.s3Endpoint }, + ], + + tfServingContainer: $.parts.tfServingContainer { + env+: $.s3parts.s3Env, + }, + + tfDeployment: $.parts.tfDeployment { + spec: +{ + template: +{ + metadata: $.parts.tfServingMetadata, + spec: +{ + containers: [ + $.s3parts.tfServingContainer, + if $.util.toBool($.params.deployHttpProxy) then + $.parts.httpProxyContainer, + ], + }, + }, + }, + }, // tfDeployment + }, // s3parts + + // Parts specific to GCP + gcpParts:: $.parts { + gcpEnv:: [ + if $.gcpParams.gcpCredentialSecretName != "" then + { name: "GOOGLE_APPLICATION_CREDENTIALS", value: "/secret/gcp-credentials/user-gcp-sa.json" }, + ], + + tfServingContainer: $.parts.tfServingContainer { + env+: $.gcpParts.gcpEnv, + 
volumeMounts+: [ + if $.gcpParams.gcpCredentialSecretName != "" then + { + name: "gcp-credentials", + mountPath: "/secret/gcp-credentials", + }, + ], + }, + + tfDeployment: $.parts.tfDeployment { + spec+: { + template+: { + metadata: $.parts.tfServingMetadata, + spec+: { + containers: [ + $.gcpParts.tfServingContainer, + if $.util.toBool($.params.deployHttpProxy) then + $.parts.httpProxyContainer, + ], + volumes: [ + if $.gcpParams.gcpCredentialSecretName != "" then + { + name: "gcp-credentials", + secret: { + secretName: $.gcpParams.gcpCredentialSecretName, + }, + }, + ], + }, + }, + }, + }, // tfDeployment + }, // gcpParts +} diff --git a/mnist/ks_app/vendor/kubeflow/tf-serving@fed535eaa276220e4edf59530c0629f4375a40a9/util.libsonnet b/mnist/ks_app/vendor/kubeflow/tf-serving@fed535eaa276220e4edf59530c0629f4375a40a9/util.libsonnet new file mode 100644 index 00000000..0659cda5 --- /dev/null +++ b/mnist/ks_app/vendor/kubeflow/tf-serving@fed535eaa276220e4edf59530c0629f4375a40a9/util.libsonnet @@ -0,0 +1,21 @@ +// Some useful routines. +{ + local k = import "k.libsonnet", + + // Convert non-boolean types like string,number to a boolean. + // This is primarily intended for dealing with parameters that should be booleans. + toBool:: function(x) { + result:: + if std.type(x) == "boolean" then + x + else if std.type(x) == "string" then + std.asciiUpper(x) == "TRUE" + else if std.type(x) == "number" then + x != 0 + else + false, + }.result, + + // Produce a list of manifests. 
obj must be an array + list(obj):: k.core.v1.list.new(obj,), +} diff --git a/mnist/model.py b/mnist/model.py index c5a468fc..7f80153b 100644 --- a/mnist/model.py +++ b/mnist/model.py @@ -21,6 +21,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import json import os import sys import numpy as np @@ -126,6 +127,22 @@ def linear_serving_input_receiver_fn(): def main(_): tf.logging.set_verbosity(tf.logging.INFO) + tf_config = os.environ.get('TF_CONFIG', '{}') + tf.logging.info("TF_CONFIG %s", tf_config) + tf_config_json = json.loads(tf_config) + cluster = tf_config_json.get('cluster') + job_name = tf_config_json.get('task', {}).get('type') + task_index = tf_config_json.get('task', {}).get('index') + tf.logging.info("cluster=%s job_name=%s task_index=%s", cluster, job_name, + task_index) + + is_chief = False + if not job_name or job_name.lower() in ["chief", "master"]: + is_chief = True + tf.logging.info("Will export model") + else: + tf.logging.info("Will not export model") + # Download and load MNIST dataset. mnist = tf.contrib.learn.datasets.DATASETS['mnist'](TF_DATA_DIR) train_input_fn = tf.estimator.inputs.numpy_input_fn( @@ -151,6 +168,8 @@ def main(_): classifier = tf.estimator.LinearClassifier( feature_columns=feature_columns, n_classes=N_DIGITS, model_dir=TF_MODEL_DIR, config=training_config) + # TODO(jlewi): Should it be linear_serving_input_receiver_fn here? 
+ serving_fn = cnn_serving_input_receiver_fn export_final = tf.estimator.FinalExporter( TF_EXPORT_DIR, serving_input_receiver_fn=cnn_serving_input_receiver_fn) @@ -158,6 +177,7 @@ def main(_): # Convolutional network classifier = tf.estimator.Estimator( model_fn=conv_model, model_dir=TF_MODEL_DIR, config=training_config) + serving_fn = cnn_serving_input_receiver_fn export_final = tf.estimator.FinalExporter( TF_EXPORT_DIR, serving_input_receiver_fn=cnn_serving_input_receiver_fn) else: @@ -171,7 +191,14 @@ def main(_): exporters=export_final, throttle_secs=1, start_delay_secs=1) + print("Train and evaluate") tf.estimator.train_and_evaluate(classifier, train_spec, eval_spec) + print("Training done") + + if is_chief: + print("Export saved model") + classifier.export_savedmodel(TF_EXPORT_DIR, serving_input_receiver_fn=serving_fn) + print("Done exporting the model") if __name__ == '__main__': tf.app.run() diff --git a/mnist/testing/deploy_test.py b/mnist/testing/deploy_test.py new file mode 100644 index 00000000..7ecffeda --- /dev/null +++ b/mnist/testing/deploy_test.py @@ -0,0 +1,76 @@ +"""Test deploying the mnist model. + +This file tests that we can deploy the model. + +TODO(jlewi): Test that we can send predictions to the model. + +It is an integration test as it depends on having access to +a Kubeflow deployment to deploy on. It also depends on having a model. + +Python Path Requirements: + kubeflow/testing/py - https://github.com/kubeflow/testing/tree/master/py + * Provides utilities for testing + +Manually running the test + 1. Configure your KUBECONFIG file to point to the desired cluster + 2. Set --params=name=${NAME},namespace=${NAMESPACE} + * name should be the name for your job + * namespace should be the namespace to use + 3. Use the modelBasePath parameter to the model to test. 
+ --params=...,modelBasePath=${MODEL_BASE_PATH} + +""" + +import logging +import os + +from kubernetes import client as k8s_client +from py import test_runner + +from kubeflow.testing import ks_util +from kubeflow.testing import test_util +from kubeflow.testing import util + +class MnistDeployTest(test_util.TestCase): + def __init__(self, args): + namespace, name, env = test_runner.parse_runtime_params(args) + self.app_dir = args.app_dir + + if not self.app_dir: + self.app_dir = os.path.join(os.path.dirname(__file__), "..", + "ks_app") + self.app_dir = os.path.abspath(self.app_dir) + logging.info("--app_dir not set defaulting to: %s", self.app_dir) + + self.env = env + self.namespace = namespace + self.params = args.params + self.ks_cmd = ks_util.get_ksonnet_cmd(self.app_dir) + super(MnistDeployTest, self).__init__(class_name="MnistDeployTest", + name=name) + + def test_serve(self): + # Apply the ksonnet components used for serving, then wait for the + # deployment self.name in self.namespace (wait_for_deployment is given a + # 4 minute timeout). + api_client = k8s_client.ApiClient() + + # Apply the components + for component in ["mnist-deploy-gcp", "mnist-service"]: + # Setup the ksonnet app + ks_util.setup_ks_app(self.app_dir, self.env, self.namespace, component, + self.params) + + util.run([self.ks_cmd, "apply", self.env, "-c", component], + cwd=self.app_dir) + + logging.info("Created deployment %s in namespaces %s", self.name, self.namespace) + + util.wait_for_deployment(api_client, self.namespace, self.name, + timeout_minutes=4) + + # We don't delete the resources. We depend on the namespace being + # garbage collected.
+ +if __name__ == "__main__": + test_runner.main(module=__name__) diff --git a/test/copy_secret.sh b/test/copy_secret.sh new file mode 100755 index 00000000..ec76df40 --- /dev/null +++ b/test/copy_secret.sh @@ -0,0 +1,33 @@ +#!/bin/bash +# +# A simple script to copy a secret from 1 namespace to another +# The secret ${NAME} is read from the source namespace and its key ${NAME}.json is written to a new secret ${NAME} in the destination namespace. +# Usage +# copy_secret <source namespace> <dest namespace> <secret name> +set -eo pipefail +SOURCE=$1 +DEST=$2 +NAME=$3 + +usage() { + echo "copy_secret <source namespace> <dest namespace> <secret name>" +} + +if [ -z "${SOURCE}" ]; then + usage + exit 1 +fi + +if [ -z "${DEST}" ]; then + usage + exit 1 +fi + +if [ -z "${NAME}" ]; then + usage + exit 1 +fi + +echo getting secret +SECRET=$(kubectl -n "${SOURCE}" get secrets "${NAME}" -o jsonpath="{.data.${NAME}\.json}" | base64 -d) +kubectl create -n "${DEST}" secret generic "${NAME}" --from-literal="${NAME}.json=${SECRET}" \ No newline at end of file diff --git a/test/workflows/components/mnist.jsonnet b/test/workflows/components/mnist.jsonnet index 3f95eb58..0a9d07c1 100644 --- a/test/workflows/components/mnist.jsonnet +++ b/test/workflows/components/mnist.jsonnet @@ -25,6 +25,12 @@ local defaultParams = { // The bucket where the model should be written // This needs to be writable by the GCP service account in the Kubeflow cluster (not the test cluster) modelBucket: "kubeflow-ci_temp", + + // Whether to delete the namespace at the end. + // Leaving the namespace around can be useful for debugging. + // + // TODO(jlewi): We should consider running a cronjob to GC so namespaces. + deleteNamespace: false, }; local params = defaultParams + overrides; @@ -77,6 +83,9 @@ local modelDir = "gs://" + params.modelBucket + "/mnist/models/" + prowDict["BUI // value of KUBECONFIG environment variable. This should be a full path. local kubeConfig = testDir + "/.kube/kubeconfig"; +// Namespace where tests should run +local testNamespace = "mnist-" + prowDict["BUILD_ID"]; + // Build template is a template for constructing Argo step templates.
// // step_name: Name for the template @@ -233,10 +242,48 @@ local dagTemplates = [ params.kfCluster, ]] ), - workingDir: srcDir + "/github_issue_summarization", }, dependencies: ["checkout"], }, // get-kubeconfig + { + // Create the namespace + // TODO(jlewi): We should add some sort of retry. + template: buildTemplate { + name: "create-namespace", + command: util.buildCommand([ + [ + "echo", + "KUBECONFIG=", + "${KUBECONFIG}", + ], + [ + "gcloud", + "auth", + "activate-service-account", + "--key-file=${GOOGLE_APPLICATION_CREDENTIALS}", + ], + [ + "kubectl", + "config" , + "current-context", + ], + [ + "kubectl", + "create", + "namespace", + testNamespace, + ], + # Copy the GCP secret from the kubeflow namespace to the test namespace + [ + srcDir + "/test/copy_secret.sh", + "kubeflow", + testNamespace, + "user-gcp-sa", + ]] + ), + }, + dependencies: ["get-kubeconfig"], + }, // create-namespace { // Run the python test for TFJob template: buildTemplate { @@ -247,7 +294,7 @@ local dagTemplates = [ "--artifacts_path=" + artifactsDir, "--params=" + std.join(",", [ "name=mnist-test-" + prowDict["BUILD_ID"], - "namespace=kubeflow", + "namespace=" + testNamespace, "numTrainSteps=10", "batchSize=10", "image=" + trainerImage, @@ -260,8 +307,25 @@ local dagTemplates = [ ])], workingDir: srcDir + "/mnist/testing", }, - dependencies: ["build-images", "get-kubeconfig"], + dependencies: ["build-images", "create-namespace"], }, // tfjob-test + { + // Run the python test that deploys the trained model for serving + template: buildTemplate { + name: "deploy-test", + command: [ + "python", + "deploy_test.py", + "--params=" + std.join(",", [ + "name=mnist-test-" + prowDict["BUILD_ID"], + "namespace=" + testNamespace, + "modelBasePath=" + modelDir + "/export", + "exportDir=" + modelDir, + ])], + workingDir: srcDir + "/mnist/testing", + }, + dependencies: ["tfjob-test"], + }, // deploy-test // TODO(jlewi): We should add a non-distributed test that just uses the default values.
]; @@ -277,8 +341,35 @@ local dag = { // Define templates for the steps to be performed when the // test exits + +local deleteTemplates = if params.deleteNamespace then + [ + { + // Delete the namespace + // TODO(jlewi): We should add some sort of retry. + template: buildTemplate { + name: "delete-namespace", + command: util.buildCommand([ + [ + "gcloud", + "auth", + "activate-service-account", + "--key-file=${GOOGLE_APPLICATION_CREDENTIALS}", + ], + [ + "kubectl", + "delete", + "namespace", + testNamespace, + ]] + ), + }, + }, // delete-namespace + ] else []; + local exitTemplates = - [ + deleteTemplates + + [ { // Copy artifacts to GCS for gubernator. // TODO(https://github.com/kubeflow/testing/issues/257): Create-pr-symlink @@ -294,7 +385,6 @@ local exitTemplates = "--bucket=" + bucket, ], }, // copy-artifacts, - }, { // Delete the test directory in NFS. @@ -314,7 +404,7 @@ local exitTemplates = }, }, }, // test-dir-delete - dependencies: ["copy-artifacts"], + dependencies: ["copy-artifacts"] + if params.deleteNamespace then ["delete-namespace"] else [], }, ]; diff --git a/test/workflows/environments/test/params.libsonnet b/test/workflows/environments/test/params.libsonnet index a13368a1..01c9acc8 100644 --- a/test/workflows/environments/test/params.libsonnet +++ b/test/workflows/environments/test/params.libsonnet @@ -14,8 +14,8 @@ local envParams = params + { }, mnist+: { namespace: 'kubeflow-test-infra', - name: 'jlewi-mnist-test-465-0109-050605', - prow_env: 'JOB_NAME=mnist-test,JOB_TYPE=presubmit,REPO_NAME=examples,REPO_OWNER=kubeflow,BUILD_NUMBER=0109-050605,BUILD_ID=0109-050605,PULL_NUMBER=465', + name: 'jlewi-mnist-test-469-0111-081531', + prow_env: 'JOB_NAME=mnist-test,JOB_TYPE=presubmit,REPO_NAME=examples,REPO_OWNER=kubeflow,BUILD_NUMBER=0111-081531,BUILD_ID=0111-081531,PULL_NUMBER=469', }, }, };