Update serving in mnist example; use 0.4 and add testing. (#469)

* Add the TFServing component
* Create TFServing components.

* The model.py code doesn't appear to be exporting a model in saved model
  format; it was missing a call to export.

  * I'm not sure how this ever worked.

* It also looks like there is a bug in the code: it uses the CNN serving input fn even if the model is the linear one. I'm going to leave that as is for now.

* Create a namespace for each test run; delete the namespace on teardown
* We need to copy the GCP service account key to the new namespace.
* Add a shell script to do that.
Jeremy Lewi 2019-01-11 14:36:43 -08:00 committed by Kubernetes Prow Robot
parent ef108dbbcc
commit 2494fdf8c5
29 changed files with 1813 additions and 16 deletions


@ -473,7 +473,74 @@ kubectl port-forward ${PODNAME} 6006:6006
Tensorboard can now be accessed at [http://127.0.0.1:6006](http://127.0.0.1:6006).
## Using Tensorflow serving
## Serving the model
The model code exports the model in SavedModel format, which is suitable for serving with TensorFlow Serving.
To serve the model, follow the instructions below. The instructions vary slightly depending on where you are storing your
model (e.g. GCS, S3, PVC). For each storage system we provide a ksonnet component as a convenience
for setting the relevant environment variables.
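For background, the export happens in model.py via `Estimator.export_savedmodel` together with a serving input receiver fn (this change adds the missing export call). The sketch below only illustrates that pattern; it is not the example's actual code, and the feature key and shape are placeholders.
```python
# Illustrative sketch of exporting an Estimator as a SavedModel for TF Serving.
# The feature key "x" and the 28x28 shape are placeholders; the real code uses
# its own serving_input_receiver_fn (cnn/linear variants) defined in model.py.
import tensorflow as tf

def serving_input_receiver_fn():
  # The receiver tensors are what clients send; here we accept raw images.
  images = tf.placeholder(tf.float32, shape=[None, 28, 28], name="images")
  return tf.estimator.export.ServingInputReceiver({"x": images}, {"x": images})

# classifier is an already-trained tf.estimator.Estimator.
# export_savedmodel writes a timestamped version directory under export_dir,
# e.g. gs://<bucket>/mnist/export/1547100373/.
# classifier.export_savedmodel("gs://<bucket>/mnist/export",
#                              serving_input_receiver_fn=serving_input_receiver_fn)
```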
### GCS
Here we show how to serve the model when it is stored on GCS. This assumes that when you trained the model you set `exportDir` to a GCS
URI; if not, you can always copy the exported model to GCS using `gsutil`.
Check that a model was exported
```
gsutil ls -r ${EXPORT_DIR}
```
The output should look something like
```
${EXPORT_DIR}/1547100373/saved_model.pb
${EXPORT_DIR}/1547100373/variables/:
${EXPORT_DIR}/1547100373/variables/
${EXPORT_DIR}/1547100373/variables/variables.data-00000-of-00001
${EXPORT_DIR}/1547100373/variables/variables.index
```
The number `1547100373` is a version number auto-generated by TensorFlow; it will vary on each run but should increase monotonically if you export to the same location as a previous run.
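TensorFlow Serving watches `modelBasePath` and, by default, loads the numerically largest version directory it finds there. A rough sketch for confirming which version would be picked up (assumes `EXPORT_DIR` is a `gs://` URI and a standard TensorFlow build with GCS support):
```python
# Rough sketch: list the numeric version directories under the export dir and
# report the one TF Serving would load (the largest).
import os
import tensorflow as tf

export_dir = os.environ["EXPORT_DIR"]
versions = [d.strip("/") for d in tf.gfile.ListDirectory(export_dir)
            if d.strip("/").isdigit()]
print("TF Serving would load version:", max(versions, key=int))
```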
Set your model path
```
ks param set mnist-deploy-gcp modelBasePath ${EXPORT_DIR} --env=${ENV}
```
Deploy it
```
ks apply ${ENV} -c mnist-deploy-gcp
```
You can check the deployment by running
```
kubectl describe deployments mnist-deploy-gcp
```
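If you prefer to check readiness programmatically (the `deploy_test.py` added in this change does something similar via `util.wait_for_deployment`), here is a minimal sketch with the official Kubernetes Python client; the deployment name and namespace below are assumptions, so use the ones you deployed with.
```python
# Minimal sketch: confirm the serving deployment has available replicas.
from kubernetes import client, config

config.load_kube_config()
apps = client.AppsV1Api()
dep = apps.read_namespaced_deployment("mnist-deploy-gcp", "kubeflow")
print("available replicas:", dep.status.available_replicas)
```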
### S3
TODO: Add instructions
### PVC
TODO: Add instructions
### Create the K8s service
Next we need to create a K8s service to route traffic to our model
```
ks apply ${ENV} -c mnist-service
```
By default the model is deployed and served with TensorFlow Serving. Included in this example is a client that can query your model and provide results:
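The bundled client is not shown in this diff. As a stand-in, here is a rough sketch of querying the model over TensorFlow Serving's REST API (exposed on the service's 8500 port in TF Serving 1.11). It assumes you have run `kubectl port-forward svc/mnist-service 8500:8500`, that the model is named `mnist`, and that the input key/shape matches your serving input receiver fn; adjust as needed.
```python
# Rough sketch of a REST client for the served model; not the example's client.
# Assumes: kubectl port-forward svc/mnist-service 8500:8500, model name "mnist",
# and a model that accepts a batch of 28x28 float images under "instances".
import json
import numpy as np
import requests

base = "http://localhost:8500/v1/models/mnist"

# Model status: shows which version is loaded and whether it is AVAILABLE.
print(requests.get(base).json())

# Prediction request; replace the random image with a real MNIST digit.
image = np.random.rand(28, 28).tolist()
resp = requests.post(base + ":predict", data=json.dumps({"instances": [image]}))
print(resp.json())
```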

mnist/ks_app/app.lock (new, empty executable file)


@ -2,14 +2,28 @@ apiVersion: 0.3.0
environments:
jlewi:
destination:
namespace: kubeflow
namespace: jlewi
server: https://35.196.210.94
k8sVersion: v1.11.5
path: jlewi
test-env-d5e3:
destination:
namespace: jlewi
server: https://35.196.210.94
k8sVersion: v1.11.5
path: test-env-d5e3
kind: ksonnet.io/app
libraries:
kubeflow/tf-serving:
name: tf-serving
registry: kubeflow
version: fed535eaa276220e4edf59530c0629f4375a40a9
name: ks_app
registries:
incubator:
protocol: github
uri: github.com/ksonnet/parts/tree/master/incubator
kubeflow:
protocol: github
uri: github.com/kubeflow/kubeflow/tree/v0.4-branch/kubeflow
version: 0.0.1


@ -0,0 +1,39 @@
local env = std.extVar("__ksonnet/environments");
local params = std.extVar("__ksonnet/params").components["mnist-deploy-aws"];
local k = import "k.libsonnet";
local deployment = k.apps.v1beta1.deployment;
local container = deployment.mixin.spec.template.spec.containersType;
local util = import "kubeflow/tf-serving/util.libsonnet";
local tfserving = import "kubeflow/tf-serving/tf-serving-template.libsonnet";
local base = tfserving.new(env, params);
local tfDeployment = base.tfDeployment +
deployment.mapContainers(
function(c) {
result::
c + container.withEnvMixin(
if util.toBool(params.s3Enable) then (
[
{
name: "AWS_ACCESS_KEY_ID",
valueFrom: { secretKeyRef: { name: params.s3SecretName, key: params.s3SecretAccesskeyidKeyName } },
},
{
name: "AWS_SECRET_ACCESS_KEY",
valueFrom: { secretKeyRef: { name: params.s3SecretName, key: params.s3SecretSecretaccesskeyKeyName } },
},
{ name: "AWS_REGION", value: params.s3AwsRegion },
{ name: "S3_USE_HTTPS", value: std.toString(params.s3UseHttps) },
{ name: "S3_VERIFY_SSL", value: std.toString(params.s3VerifySsl) },
{ name: "S3_ENDPOINT", value: params.s3Endpoint },
]
) else [],
),
}.result,
);
util.list([
tfDeployment,
base.tfservingConfig,
],)


@ -0,0 +1,47 @@
local env = std.extVar("__ksonnet/environments");
local params = std.extVar("__ksonnet/params").components["mnist-deploy-gcp"];
local k = import "k.libsonnet";
local deployment = k.apps.v1beta1.deployment;
local container = deployment.mixin.spec.template.spec.containersType;
local util = import "kubeflow/tf-serving/util.libsonnet";
local tfserving = import "kubeflow/tf-serving/tf-serving-template.libsonnet";
local base = tfserving.new(env, params);
local tfDeployment = base.tfDeployment +
deployment.mixin.spec.template.spec.withVolumesMixin(
if params.gcpCredentialSecretName != "null" then (
[{
name: "gcp-credentials",
secret: {
secretName: params.gcpCredentialSecretName,
},
}]
) else [],
) +
deployment.mapContainers(
function(c) {
result::
c + container.withEnvMixin(
if params.gcpCredentialSecretName != "null" then (
[{
name: "GOOGLE_APPLICATION_CREDENTIALS",
value: "/secret/gcp-credentials/user-gcp-sa.json",
}]
) else [],
) +
container.withVolumeMountsMixin(
if params.gcpCredentialSecretName != "null" then (
[{
name: "gcp-credentials",
mountPath: "/secret/gcp-credentials",
}]
) else [],
),
}.result,
);
util.list([
tfDeployment,
base.tfservingConfig,
],)


@ -0,0 +1,8 @@
local env = std.extVar("__ksonnet/environments");
local params = std.extVar("__ksonnet/params").components["mnist-service"];
local k = import "k.libsonnet";
local tfservingService = import "kubeflow/tf-serving/tf-serving-service-template.libsonnet";
local util = import "kubeflow/tf-serving/util.libsonnet";
tfservingService.new(env, params).all


@ -4,15 +4,58 @@
train: {
batchSize: 100,
envVariables: 'GOOGLE_APPLICATION_CREDENTIALS=/var/secrets/user-gcp-sa.json',
exportDir: 'gs://kubeflow-ci_temp/mnist-jlewi',
image: 'gcr.io/kubeflow-examples/mnist/model:v20190108-v0.2-137-g38daafa-dirty-911944',
exportDir: 'gs://kubeflow-ci_temp/mnist-jlewi/export',
image: 'gcr.io/kubeflow-examples/mnist/model:v20190111-v0.2-148-g313770f',
learningRate: '0.01',
modelDir: 'gs://kubeflow-ci_temp/mnist-jlewi',
name: 'mnist-train',
numPs: 1,
numWorkers: 2,
numPs: 0,
numWorkers: 0,
secret: '',
trainSteps: 200,
},
"mnist-deploy-gcp": {
defaultCpuImage: 'tensorflow/serving:1.11.1',
defaultGpuImage: 'tensorflow/serving:1.11.1-gpu',
deployHttpProxy: 'false',
enablePrometheus: 'true',
gcpCredentialSecretName: 'user-gcp-sa',
httpProxyImage: '',
injectIstio: 'false',
modelBasePath: 'gs://kubeflow-examples-data/mnist',
modelName: 'mnist',
name: 'mnist-deploy-gcp',
numGpus: '0',
versionName: 'v1',
},
"mnist-deploy-aws": {
defaultCpuImage: 'tensorflow/serving:1.11.1',
defaultGpuImage: 'tensorflow/serving:1.11.1-gpu',
deployHttpProxy: 'false',
enablePrometheus: 'true',
httpProxyImage: 'gcr.io/kubeflow-images-public/tf-model-server-http-proxy:v20180723',
injectIstio: 'false',
modelBasePath: 's3://kubeflow-examples-data/mnist',
modelName: 'null',
name: 'mnist-deploy-aws',
numGpus: '0',
s3AwsRegion: 'us-west-1',
s3Enable: 'false',
s3Endpoint: 's3.us-west-1.amazonaws.com',
s3SecretAccesskeyidKeyName: 'AWS_ACCESS_KEY_ID',
s3SecretName: 'null',
s3SecretSecretaccesskeyKeyName: 'AWS_SECRET_ACCESS_KEY',
s3UseHttps: 'true',
s3VerifySsl: 'true',
versionName: 'v1',
},
"mnist-service": {
enablePrometheus: 'true',
injectIstio: 'false',
modelName: 'null',
name: 'mnist-service',
serviceType: 'ClusterIP',
trafficRule: 'v1:100',
},
},
}


@ -43,8 +43,9 @@ local trainEnv = [
},
];
local secretName = std.split(params.secret, "=")[0];
local secretMountPath = std.split(params.secret, "=")[1];
local secretPieces = std.split(params.secret, "=");
local secretName = if std.length(secretPieces) > 0 then secretPieces[0] else "";
local secretMountPath = if std.length(secretPieces) > 1 then secretPieces[1] else "";
local replicaSpec = {
containers: [


@ -8,6 +8,23 @@ local envParams = params + {
train+: {
name: 'mnist-train-dist',
secret: 'user-gcp-sa=/var/secrets',
numSteps: 10,
image: 'gcr.io/kubeflow-examples/mnist/model:v20190111-v0.2-146-g0bbff62-dirty-12f353',
numWorkers: 2,
numPs: 1,
},
"deploy-gcp"+: {
modelBasePath: 'gs://kubeflow-ci_temp/mnist-jlewi/export',
},
"mnist-deploy-gcp"+: {
modelBasePath: 'gs://kubeflow-ci_temp/mnist-jlewi/export',
name: 'jlewi-deploy-test',
namespace: 'jlewi',
},
"mnist-service"+: {
name: 'jlewi-deploy-test',
namespace: 'jlewi',
modelBasePath: 'gs://kubeflow-ci_temp/mnist-jlewi/export',
},
},
};


@ -0,0 +1,2 @@
{
}


@ -0,0 +1,9 @@
local base = import "base.libsonnet";
// uncomment if you reference ksonnet-lib
// local k = import "k.libsonnet";
// local deployment = k.apps.v1beta2.deployment;
base + {
// Insert user-specified overrides here. For example if a component is named \"nginx-deployment\", you might have something like:\n")
// "nginx-deployment"+: deployment.mixin.metadata.withLabels({foo: "bar"})
}


@ -0,0 +1,28 @@
local params = std.extVar('__ksonnet/params');
local globals = import 'globals.libsonnet';
local envParams = params + {
components+: {
train+: {
name: 'jlewi-deploy-test',
namespace: 'jlewi',
modelBasePath: 'gs://kubeflow-ci_temp/mnist-jlewi/export',
},
"mnist-deploy-gcp"+: {
name: 'jlewi-deploy-test',
namespace: 'jlewi',
modelBasePath: 'gs://kubeflow-ci_temp/mnist-jlewi/export',
},
"mnist-service"+: {
name: 'jlewi-deploy-test',
namespace: 'jlewi',
modelBasePath: 'gs://kubeflow-ci_temp/mnist-jlewi/export',
},
},
};
{
components: {
[x]: envParams.components[x] + globals
for x in std.objectFields(envParams.components)
},
}


@ -0,0 +1,73 @@
<!-- START doctoc generated TOC please keep comment here to allow auto update -->
<!-- DON'T EDIT THIS SECTION, INSTEAD RE-RUN doctoc TO UPDATE -->
**Table of Contents** *generated with [DocToc](https://github.com/thlorenz/doctoc)*
- [tf-serving](#tf-serving)
- [Quickstart](#quickstart)
- [Using the library](#using-the-library)
- [io.ksonnet.pkg.tf-serving](#ioksonnetpkgtf-serving)
- [Example](#example)
- [Parameters](#parameters)
<!-- END doctoc generated TOC please keep comment here to allow auto update -->
# tf-serving
> TensorFlow serving is a server for TensorFlow models.
* [Quickstart](#quickstart)
* [Using Prototypes](#using-prototypes)
* [io.ksonnet.pkg.tf-serving](#io.ksonnet.pkg.tf-serving)
## Quickstart
*The following commands use the `io.ksonnet.pkg.tf-serving` prototype to generate Kubernetes YAML for tf-serving, and then deploys it to your Kubernetes cluster.*
First, create a cluster and install the ksonnet CLI (see root-level [README.md](rootReadme)).
If you haven't yet created a [ksonnet application](linkToSomewhere), do so using `ks init <app-name>`.
Finally, in the ksonnet application directory, run the following:
```shell
# Expand prototype as a Jsonnet file, place in a file in the
# `components/` directory. (YAML and JSON are also available.)
$ ks prototype use io.ksonnet.pkg.tf-serving tf-serving \
--name tf-serving \
--namespace default
# Apply to server.
$ ks apply -f tf-serving.jsonnet
```
## Using the library
The library files for tf-serving define a set of relevant *parts* (_e.g._, deployments, services, secrets, and so on) that can be combined to configure tf-serving for a wide variety of scenarios. For example, a database like Redis may need a secret to hold the user password, or it may have no password if it's acting as a cache.
This library provides a set of pre-fabricated "flavors" (or "distributions") of tf-serving, each of which is configured for a different use case. These are captured as ksonnet *prototypes*, which allow users to interactively customize these distributions for their specific needs.
These prototypes, as well as how to use them, are enumerated below.
### io.ksonnet.pkg.tf-serving
TensorFlow serving
#### Example
```shell
# Expand prototype as a Jsonnet file, place in a file in the
# `components/` directory. (YAML and JSON are also available.)
$ ks prototype use io.ksonnet.pkg.tf-serving tf-serving \
--name YOUR_NAME_HERE \
--model_path YOUR_MODEL_PATH_HERE
```
#### Parameters
The available options to pass prototype are:
* `--name=<name>`: Name to give to each of the components [string]
* `--model_path=<model_path>`: Path to the model. This can be a GCS path. [string]
[rootReadme]: https://github.com/ksonnet/mixins


@ -0,0 +1,35 @@
{
"name": "tf-serving",
"apiVersion": "0.0.1",
"kind": "ksonnet.io/parts",
"description": "TensorFlow serving is a server for TensorFlow models.\n",
"author": "kubeflow team <kubeflow-team@google.com>",
"contributors": [
{
"name": "Jeremy Lewi",
"email": "jlewi@google.com"
}
],
"repository": {
"type": "git",
"url": "https://github.com/kubeflow/kubeflow"
},
"bugs": {
"url": "https://github.com/kubeflow/kubeflow/issues"
},
"keywords": [
"kubeflow",
"tensorflow",
"database"
],
"quickStart": {
"prototype": "io.ksonnet.pkg.tf-serving",
"componentName": "tf-serving",
"flags": {
"name": "tf-serving",
"namespace": "default"
},
"comment": "Run TensorFlow Serving"
},
"license": "Apache 2.0"
}


@ -0,0 +1,23 @@
// @apiVersion 0.1
// @name io.ksonnet.pkg.tf-serving
// @description TensorFlow serving
// @shortDescription A TensorFlow serving deployment
// @param name string Name to give to each of the components
local k = import "k.libsonnet";
// ksonnet appears to require name be a parameter of the prototype which is why we handle it differently.
local name = import "param://name";
// updatedParams includes the namespace from env by default.
local updatedParams = params + env;
local tfServingBase = import "kubeflow/tf-serving/tf-serving.libsonnet";
local tfServing = tfServingBase {
// Override parameters with user supplied parameters.
params+: updatedParams {
name: name,
},
};
std.prune(k.core.v1.list.new(tfServing.components))


@ -0,0 +1,61 @@
// @apiVersion 0.1
// @name io.ksonnet.pkg.tf-serving-deployment-aws
// @description TensorFlow serving
// @shortDescription A TensorFlow serving deployment
// @param name string Name to give to each of the components
// @optionalParam numGpus string 0 Number of gpus to use
// @optionalParam deployHttpProxy string false Whether to deploy http proxy
// @optionalParam injectIstio string false Whether to inject istio sidecar; should be true or false.
// @optionalParam enablePrometheus string true Whether to enable prometheus endpoint (requires TF 1.11)
// @optionalParam modelBasePath string s3://kubeflow-examples-data/mnist The model path
// @optionalParam modelName string null The model name
// @optionalParam versionName string v1 The version name
// @optionalParam defaultCpuImage string tensorflow/serving:1.11.1 The default model server image (cpu)
// @optionalParam defaultGpuImage string tensorflow/serving:1.11.1-gpu The default model server image (gpu)
// @optionalParam httpProxyImage string gcr.io/kubeflow-images-public/tf-model-server-http-proxy:v20180723 Http proxy image
// @optionalParam s3Enable string false Whether to enable S3
// Following parameters are needed only if s3Enable is true
// @optionalParam s3SecretName string null Name of the k8s secrets containing S3 credentials
// @optionalParam s3SecretAccesskeyidKeyName string AWS_ACCESS_KEY_ID Name of the key in the k8s secret containing AWS_ACCESS_KEY_ID
// @optionalParam s3SecretSecretaccesskeyKeyName string AWS_SECRET_ACCESS_KEY Name of the key in the k8s secret containing AWS_SECRET_ACCESS_KEY
// @optionalParam s3AwsRegion string us-west-1 S3 region
// @optionalParam s3UseHttps string true Whether or not to use https
// @optionalParam s3VerifySsl string true Whether or not to verify https certificates for S3 connections
// @optionalParam s3Endpoint string s3.us-west-1.amazonaws.com URL for your s3-compatible endpoint
local k = import "k.libsonnet";
local deployment = k.apps.v1beta1.deployment;
local container = deployment.mixin.spec.template.spec.containersType;
local util = import "kubeflow/tf-serving/util.libsonnet";
local tfserving = import "kubeflow/tf-serving/tf-serving-template.libsonnet";
local base = tfserving.new(env, params);
local tfDeployment = base.tfDeployment +
deployment.mapContainers(
function(c) {
result::
c + container.withEnvMixin(
if util.toBool(params.s3Enable) then (
[
{
name: "AWS_ACCESS_KEY_ID",
valueFrom: { secretKeyRef: { name: params.s3SecretName, key: params.s3SecretAccesskeyidKeyName } },
},
{
name: "AWS_SECRET_ACCESS_KEY",
valueFrom: { secretKeyRef: { name: params.s3SecretName, key: params.s3SecretSecretaccesskeyKeyName } },
},
{ name: "AWS_REGION", value: params.s3AwsRegion },
{ name: "S3_USE_HTTPS", value: std.toString(params.s3UseHttps) },
{ name: "S3_VERIFY_SSL", value: std.toString(params.s3VerifySsl) },
{ name: "S3_ENDPOINT", value: params.s3Endpoint },
]
) else [],
),
}.result,
);
util.list([
tfDeployment,
base.tfservingConfig,
],)


@ -0,0 +1,61 @@
// @apiVersion 0.1
// @name io.ksonnet.pkg.tf-serving-deployment-gcp
// @description TensorFlow serving
// @shortDescription A TensorFlow serving deployment
// @param name string Name to give to each of the components
// @optionalParam numGpus string 0 Number of gpus to use
// @optionalParam deployHttpProxy string false Whether to deploy http proxy
// @optionalParam modelBasePath string gs://kubeflow-examples-data/mnist The model path
// @optionalParam modelName string null The model name
// @optionalParam versionName string v1 The version name
// @optionalParam defaultCpuImage string tensorflow/serving:1.11.1 The default model server image (cpu)
// @optionalParam defaultGpuImage string tensorflow/serving:1.11.1-gpu The default model server image (gpu)
// @optionalParam httpProxyImage string gcr.io/kubeflow-images-public/tf-model-server-http-proxy:v20180723 Http proxy image
// @optionalParam gcpCredentialSecretName string null If not empty, insert the secret credential
// @optionalParam injectIstio string false Whether to inject istio sidecar; should be true or false.
// @optionalParam enablePrometheus string true Whether to enable prometheus endpoint (requires TF 1.11)
local k = import "k.libsonnet";
local deployment = k.apps.v1beta1.deployment;
local container = deployment.mixin.spec.template.spec.containersType;
local util = import "kubeflow/tf-serving/util.libsonnet";
local tfserving = import "kubeflow/tf-serving/tf-serving-template.libsonnet";
local base = tfserving.new(env, params);
local tfDeployment = base.tfDeployment +
deployment.mixin.spec.template.spec.withVolumesMixin(
if params.gcpCredentialSecretName != "null" then (
[{
name: "gcp-credentials",
secret: {
secretName: params.gcpCredentialSecretName,
},
}]
) else [],
) +
deployment.mapContainers(
function(c) {
result::
c + container.withEnvMixin(
if params.gcpCredentialSecretName != "null" then (
[{
name: "GOOGLE_APPLICATION_CREDENTIALS",
value: "/secret/gcp-credentials/user-gcp-sa.json",
}]
) else [],
) +
container.withVolumeMountsMixin(
if params.gcpCredentialSecretName != "null" then (
[{
name: "gcp-credentials",
mountPath: "/secret/gcp-credentials",
}]
) else [],
),
}.result,
);
util.list([
tfDeployment,
base.tfservingConfig,
],)


@ -0,0 +1,16 @@
// @apiVersion 0.1
// @name io.ksonnet.pkg.tf-serving-service
// @description TensorFlow serving
// @shortDescription A TensorFlow serving model
// @param name string Name to give to each of the components
// @optionalParam serviceType string ClusterIP The k8s service type for tf serving.
// @optionalParam modelName string null The model name
// @optionalParam trafficRule string v1:100 The traffic rule, in the format of version:percentage,version:percentage,..
// @optionalParam injectIstio string false Whether to inject istio sidecar; should be true or false.
// @optionalParam enablePrometheus string true Whether to enable prometheus endpoint (requires TF 1.11)
local k = import "k.libsonnet";
local tfservingService = import "kubeflow/tf-serving/tf-serving-service-template.libsonnet";
local util = import "kubeflow/tf-serving/util.libsonnet";
tfservingService.new(env, params).all


@ -0,0 +1,230 @@
// @apiVersion 0.1
// @name io.ksonnet.pkg.tf-serving-request-log
// @description tf-serving with request logging
// @shortDescription tf-serving with request logging
// @param name string Name to give to each of the components
// @param gcpProject string The gcp project for Bigquery dataset
// @param dataset string The Bigquery dataset
// @param table string The Bigquery table
// @optionalParam modelBasePath string gs://kubeflow-examples-data/mnist The model path
// @optionalParam modelName string mnist The model name
local k = import "k.libsonnet";
local namespace = "kubeflow";
local appName = import "param://name";
local image = "gcr.io/kubeflow-images-public/tf-model-server-cpu:v20180327-995786ec";
local httpProxyImage = "gcr.io/kubeflow-images-public/tf-model-server-http-proxy:v20180723";
local loggingImage = "gcr.io/kubeflow-images-public/tf-model-server-request-logger:v20180723";
local gcpSecretName = "user-gcp-sa";
local service = {
apiVersion: "v1",
kind: "Service",
metadata: {
labels: {
app: appName,
},
name: appName,
namespace: namespace,
},
spec: {
ports: [
{
name: "grpc-tf-serving",
port: 9000,
targetPort: 9000,
},
{
name: "http-tf-serving-proxy",
port: 8000,
targetPort: 8000,
},
],
selector: {
app: appName,
},
type: "ClusterIP",
},
};
local configMap = {
apiVersion: "v1",
kind: "ConfigMap",
metadata: {
name: appName + "fluentd-config",
namespace: namespace,
},
data: {
"fluent.conf": std.format(|||
<source>
@type tail
path /tmp/logs/request.log
pos_file /tmp/logs/request.log.pos
<parse>
@type json
</parse>
tag dummy
</source>
<match dummy>
@type bigquery_insert
auth_method application_default
project %s
dataset %s
table %s
fetch_schema true
</match>
|||, [params.gcpProject, params.dataset, params.table]),
},
};
local deployment = {
apiVersion: "extensions/v1beta1",
kind: "Deployment",
metadata: {
labels: {
app: appName,
},
name: appName,
namespace: namespace,
},
spec: {
template: {
metadata: {
labels: {
app: appName,
},
},
spec: {
containers: [
// ModelServer
{
args: [
"/usr/bin/tensorflow_model_server",
"--port=9000",
"--model_name=" + params.modelName,
"--model_base_path=" + params.modelBasePath,
],
image: image,
imagePullPolicy: "IfNotPresent",
name: "model-server",
ports: [
{
containerPort: 9000,
},
],
resources: {
limits: {
cpu: "4",
memory: "4Gi",
},
requests: {
cpu: "1",
memory: "1Gi",
},
},
},
// Http proxy
{
name: "http-proxy",
image: httpProxyImage,
imagePullPolicy: "Always",
command: [
"python",
"/usr/src/app/server.py",
"--port=8000",
"--rpc_port=9000",
"--rpc_timeout=10.0",
"--log_request=true",
],
env: [],
ports: [
{
containerPort: 8000,
},
],
resources: {
requests: {
memory: "1Gi",
cpu: "1",
},
limits: {
memory: "4Gi",
cpu: "4",
},
},
securityContext: {
runAsUser: 1000,
fsGroup: 1000,
},
volumeMounts: [
{
name: "request-logs",
mountPath: "/tmp/logs",
},
],
},
// TODO(lunkai): use admission controller to inject.
// Logging container.
{
name: "logging",
image: loggingImage,
imagePullPolicy: "Always",
env: [
{ name: "GOOGLE_APPLICATION_CREDENTIALS", value: "/secret/gcp-credentials/key.json" },
],
resources: {
requests: {
memory: "250Mi",
cpu: "0.25",
},
limits: {
memory: "500Mi",
cpu: "0.5",
},
},
volumeMounts: [
{
name: "request-logs",
mountPath: "/tmp/logs",
},
{
name: "gcp-credentials",
mountPath: "/secret/gcp-credentials",
},
{
name: "fluentd-config-volume",
mountPath: "/fluentd/etc/custom",
},
],
},
],
volumes: [
{
name: "gcp-credentials",
secret: {
secretName: gcpSecretName,
},
},
{
name: "request-logs",
emptyDir: {},
},
{
configMap: {
name: "fluentd-config",
},
name: "fluentd-config-volume",
},
],
},
},
},
};
k.core.v1.list.new([
service,
deployment,
configMap,
])


@ -0,0 +1,112 @@
local tfservingService = import "kubeflow/tf-serving/tf-serving-service-template.libsonnet";
local tfserving = import "kubeflow/tf-serving/tf-serving-template.libsonnet";
local params = {
name: "m",
serviceType: "ClusterIP",
modelName: "mnist",
trafficRule: "v1:100",
injectIstio: false,
};
local istioParams = params {
injectIstio: true,
};
local env = {
namespace: "kubeflow",
};
local deploymentParam = {
name: "m",
modelName: "mnist",
versionName: "v1",
modelBasePath: "gs://abc",
numGpus: 0,
defaultCpuImage: "gcr.io/abc",
defaultGpuImage: "gcr.io/abc",
injectIstio: false,
enablePrometheus: true,
};
local gpuParam1 = {
name: "m",
modelName: "mnist",
versionName: "v1",
modelBasePath: "gs://abc",
numGpus: 1,
defaultCpuImage: "gcr.io/abc",
defaultGpuImage: "gcr.io/abc",
injectIstio: false,
enablePrometheus: true,
};
local gpuParamString0 = {
name: "m",
modelName: "mnist",
versionName: "v1",
modelBasePath: "gs://abc",
numGpus: "0",
defaultCpuImage: "gcr.io/abc",
defaultGpuImage: "gcr.io/abc",
injectIstio: false,
enablePrometheus: true,
};
local gpuParamString1 = {
name: "m",
modelName: "mnist",
versionName: "v1",
modelBasePath: "gs://abc",
numGpus: "1",
defaultCpuImage: "gcr.io/abc",
defaultGpuImage: "gcr.io/abc",
injectIstio: false,
enablePrometheus: true,
};
local serviceInstance = tfservingService.new(env, params);
local istioServiceInstance = tfservingService.new(env, istioParams);
local deploymentInstance = tfserving.new(env, deploymentParam);
local gpuInstance = tfserving.new(env, gpuParam1);
local gpuString0Instance = tfserving.new(env, gpuParamString0);
local gpuString1Instance = tfserving.new(env, gpuParamString1);
// This one should only have tfService
std.assertEqual(
std.length(serviceInstance.all.items),
1,
) &&
// This one should have tfService, virtualService, and DestinationRule
std.assertEqual(
std.length(istioServiceInstance.all.items),
3
) &&
std.startsWith(
deploymentInstance.tfDeployment.spec.template.spec.containers[0].args[4],
"--monitoring_config_file"
) &&
std.assertEqual(
deploymentInstance.tfDeployment.spec.template.spec.containers[0].resources.limits,
{ cpu: "4", memory: "4Gi" }
) &&
std.assertEqual(
gpuInstance.tfDeployment.spec.template.spec.containers[0].resources.limits,
{ cpu: "4", memory: "4Gi", "nvidia.com/gpu": 1 }
) &&
std.assertEqual(
gpuString0Instance.tfDeployment.spec.template.spec.containers[0].resources.limits,
{ cpu: "4", memory: "4Gi" }
) &&
std.assertEqual(
gpuString1Instance.tfDeployment.spec.template.spec.containers[0].resources.limits,
{ cpu: "4", memory: "4Gi", "nvidia.com/gpu": 1 }
)


@ -0,0 +1,147 @@
{
local k = import "k.libsonnet",
local util = import "kubeflow/tf-serving/util.libsonnet",
new(_env, _params):: {
local params = _params + _env,
local namespace = params.namespace,
local name = params.name,
local modelName =
if params.modelName == "null" then
params.name
else
params.modelName,
local tfService = {
apiVersion: "v1",
kind: "Service",
metadata: {
labels: {
app: modelName,
},
name: name,
namespace: namespace,
annotations: {
"getambassador.io/config":
std.join("\n", [
"---",
"apiVersion: ambassador/v0",
"kind: Mapping",
"name: tfserving-predict-mapping-" + modelName,
"prefix: /tfserving/models/" + modelName,
"rewrite: /v1/models/" + modelName + ":predict",
"method: POST",
"service: " + name + "." + namespace + ":8500",
"---",
"apiVersion: ambassador/v0",
"kind: Mapping",
"name: tfserving-predict-mapping-" + modelName + "-get",
"prefix: /tfserving/models/" + modelName,
"rewrite: /v1/models/" + modelName,
"method: GET",
"service: " + name + "." + namespace + ":8500",
]),
} + if util.toBool(params.enablePrometheus) then {
"prometheus.io/scrape": "true",
"prometheus.io/path": "/monitoring/prometheus/metrics",
"prometheus.io/port": "8500",
} else {}, //annotations
},
spec: {
ports: [
{
name: "grpc-tf-serving",
port: 9000,
targetPort: 9000,
},
{
name: "http-tf-serving",
port: 8500,
targetPort: 8500,
},
],
selector: {
app: modelName,
},
type: params.serviceType,
},
}, // tfService
tfService:: tfService,
local versionWeights = std.split(params.trafficRule, ","),
local virtualService = {
apiVersion: "networking.istio.io/v1alpha3",
kind: "VirtualService",
metadata: {
name: name,
namespace: namespace,
},
spec: {
hosts: [
"*",
],
gateways: [
"kubeflow-gateway",
],
http: [
{
match: [
{
uri: {
prefix: "/istio/tfserving/models/" + modelName,
},
method: {
exact: "POST",
},
},
],
rewrite: {
uri: "/v1/models/" + modelName + ":predict",
},
route: [
{
destination: {
host: name,
port: {
number: 8500,
},
subset: std.split(versionWeight, ":")[0],
},
weight: std.parseInt(std.split(versionWeight, ":")[1]),
}
for versionWeight in versionWeights
],
},
],
},
},
virtualService:: virtualService,
local destinationRule = {
apiVersion: "networking.istio.io/v1alpha3",
kind: "DestinationRule",
metadata: {
name: name,
namespace: namespace,
},
spec: {
host: name,
subsets: [
{
name: std.split(versionWeight, ":")[0],
labels: {
version: std.split(versionWeight, ":")[0],
},
}
for versionWeight in versionWeights
],
},
},
destinationRule:: destinationRule,
all:: util.list([
tfService,
] + if util.toBool(params.injectIstio) then [
virtualService,
destinationRule,
] else []),
}, // new
}


@ -0,0 +1,137 @@
{
local k = import "k.libsonnet",
local util = import "kubeflow/tf-serving/util.libsonnet",
new(_env, _params):: {
local params = _params + _env,
local namespace = params.namespace,
local name = params.name,
local modelName =
if params.modelName == "null" then
params.name
else
params.modelName,
local versionName = params.versionName,
local numGpus =
if std.type(params.numGpus) == "string" then
std.parseInt(params.numGpus)
else
params.numGpus,
local modelServerImage =
if numGpus == 0 then
params.defaultCpuImage
else
params.defaultGpuImage,
// Optional features.
// TODO(lunkai): Add request logging
local modelServerContainer = {
command: [
"/usr/bin/tensorflow_model_server",
],
args: [
"--port=9000",
"--rest_api_port=8500",
"--model_name=" + modelName,
"--model_base_path=" + params.modelBasePath,
] + if util.toBool(params.enablePrometheus) then [
"--monitoring_config_file=/var/config/monitoring_config.txt",
] else [],
image: modelServerImage,
imagePullPolicy: "IfNotPresent",
name: modelName,
ports: [
{
containerPort: 9000,
},
{
containerPort: 8500,
},
],
env: [],
resources: {
limits: {
cpu: "4",
memory: "4Gi",
} + if numGpus != 0 then {
"nvidia.com/gpu": numGpus,
} else {},
requests: {
cpu: "1",
memory: "1Gi",
},
},
volumeMounts: [
{
mountPath: "/var/config/",
name: "config-volume",
},
],
// TCP liveness probe on gRPC port
livenessProbe: {
tcpSocket: {
port: 9000,
},
initialDelaySeconds: 30,
periodSeconds: 30,
},
}, // modelServerContainer
local tfDeployment = {
apiVersion: "extensions/v1beta1",
kind: "Deployment",
metadata: {
labels: {
app: modelName,
},
name: name,
namespace: namespace,
},
spec: {
template: {
metadata: {
labels: {
app: modelName,
version: versionName,
},
annotations: {
"sidecar.istio.io/inject": if util.toBool(params.injectIstio) then "true",
},
},
spec: {
containers: [
modelServerContainer,
],
volumes: [
{
configMap: {
name: name + "-config",
},
name: "config-volume",
},
],
},
},
},
}, // tfDeployment
tfDeployment:: tfDeployment,
local tfservingConfig = {
apiVersion: "v1",
kind: "ConfigMap",
metadata: {
name: name + "-config",
namespace: namespace,
},
data: {
"monitoring_config.txt": std.join("\n", [
"prometheus_config: {",
" enable: true,",
' path: "/monitoring/prometheus/metrics"',
"}",
]),
},
}, // tfservingConfig
tfservingConfig:: tfservingConfig,
}, // new
}


@ -0,0 +1,380 @@
{
util:: import "kubeflow/tf-serving/util.libsonnet",
// Parameters are intended to be late bound.
params:: {
name: null,
numGpus: 0,
labels: {
app: $.params.name,
},
modelName: $.params.name,
modelPath: null,
modelStorageType: "storageType",
version: "v1",
firstVersion: true,
deployIstio: false,
deployHttpProxy: false,
httpProxyImage: "gcr.io/kubeflow-images-public/tf-model-server-http-proxy:v20180606-9dfda4f2",
serviceType: "ClusterIP",
// If users want to override the image then can override defaultCpuImage and/or defaultGpuImage
// in which case the image used will still depend on whether GPUs are used or not.
// Users can also override modelServerImage in which case the user supplied value will always be used
// regardless of numGpus.
defaultCpuImage: "tensorflow/serving:1.11.1",
defaultGpuImage: "tensorflow/serving:1.11.1-gpu",
modelServerImage: if $.params.numGpus == 0 then
$.params.defaultCpuImage
else
$.params.defaultGpuImage,
// Whether or not to enable s3 parameters
s3Enable:: false,
// Which storageType to use
storageType:: null,
},
// Parametes specific to GCP.
gcpParams:: {
gcpCredentialSecretName: "",
} + $.params,
// Parameters that control S3 access
// params overrides s3params because params can be overwritten by the user to override the defaults.
s3params:: {
// Name of the k8s secrets containing S3 credentials
s3SecretName: "",
// Name of the key in the k8s secret containing AWS_ACCESS_KEY_ID.
s3SecretAccesskeyidKeyName: "AWS_ACCESS_KEY_ID",
// Name of the key in the k8s secret containing AWS_SECRET_ACCESS_KEY.
s3SecretSecretaccesskeyKeyName: "AWS_SECRET_ACCESS_KEY",
// S3 region
s3AwsRegion: "us-west-1",
// TODO(jlewi): We should use util.toBool to automatically conver to actual boolean values.
// The use of strings is left over from when they were prototype parameters which only supports string type.
// true Whether or not to use https for S3 connections
s3UseHttps: "true",
// Whether or not to verify https certificates for S3 connections
s3VerifySsl: "true",
// URL for your s3-compatible endpoint.
s3Endpoint: "http://s3.us-west-1.amazonaws.com,",
} + $.params,
components:: {
all:: [
// Default routing rule for the first version of model.
if $.util.toBool($.params.deployIstio) && $.util.toBool($.params.firstVersion) then
$.parts.defaultRouteRule,
] +
// TODO(jlewi): It would be better to structure s3 as a mixin.
// As an example it would be great to allow S3 and GCS parameters
// to be enabled simultaneously. This should be doable because
// each entails adding a set of environment variables and volumes
// to the containers. These volumes/environment variables shouldn't
// overlap so there's no reason we shouldn't be able to just add
// both modifications to the base container.
// I think we want to restructure things as mixins so they can just
// be added.
if $.params.s3Enable then
[
$.s3parts.tfService,
$.s3parts.tfDeployment,
]
else if $.params.storageType == "gcp" then
[
$.gcpParts.tfService,
$.gcpParts.tfDeployment,
]
else
[
$.parts.tfService,
$.parts.tfDeployment,
],
}.all,
parts:: {
// We define the containers one level beneath parts because combined with jsonnet late binding
// this makes it easy for users to override specific bits of the container.
tfServingContainerBase:: {
name: $.params.name,
image: $.params.modelServerImage,
imagePullPolicy: "IfNotPresent",
command: [
"/usr/bin/tensorflow_model_server",
],
args: [
"--port=9000",
"--model_name=" + $.params.modelName,
"--model_base_path=" + $.params.modelPath,
],
ports: [
{
containerPort: 9000,
},
],
// TODO(jlewi): We should add readiness and liveness probes. I think the blocker is that
// model-server doesn't have something we can use out of the box.
resources: {
requests: {
memory: "1Gi",
cpu: "1",
},
limits: {
memory: "4Gi",
cpu: "4",
},
},
// The is user and group should be defined in the Docker image.
// Per best practices we don't run as the root user.
securityContext: {
runAsUser: 1000,
fsGroup: 1000,
},
volumeMounts+: if $.params.modelStorageType == "nfs" then [{
name: "nfs",
mountPath: "/mnt",
}]
else [],
}, // tfServingContainer
tfServingContainer+: $.parts.tfServingContainerBase +
if $.params.numGpus > 0 then
{
resources+: {
limits+: {
"nvidia.com/gpu": $.params.numGpus,
},
},
}
else {},
tfServingMetadata+: {
labels: $.params.labels { version: $.params.version },
annotations: {
"sidecar.istio.io/inject": if $.util.toBool($.params.deployIstio) then "true",
},
},
httpProxyContainer:: {
name: $.params.name + "-http-proxy",
image: $.params.httpProxyImage,
imagePullPolicy: "IfNotPresent",
command: [
"python",
"/usr/src/app/server.py",
"--port=8000",
"--rpc_port=9000",
"--rpc_timeout=10.0",
],
env: [],
ports: [
{
containerPort: 8000,
},
],
resources: {
requests: {
memory: "500Mi",
cpu: "0.5",
},
limits: {
memory: "1Gi",
cpu: "1",
},
},
securityContext: {
runAsUser: 1000,
fsGroup: 1000,
},
}, // httpProxyContainer
tfDeployment: {
apiVersion: "extensions/v1beta1",
kind: "Deployment",
metadata: {
name: $.params.name + "-" + $.params.version,
namespace: $.params.namespace,
labels: $.params.labels,
},
spec: {
template: {
metadata: $.parts.tfServingMetadata,
spec: {
containers: [
$.parts.tfServingContainer,
if $.util.toBool($.params.deployHttpProxy) then
$.parts.httpProxyContainer,
],
volumes+: if $.params.modelStorageType == "nfs" then
[{
name: "nfs",
persistentVolumeClaim: {
claimName: $.params.nfsPVC,
},
}]
else [],
},
},
},
}, // tfDeployment
tfService: {
apiVersion: "v1",
kind: "Service",
metadata: {
labels: $.params.labels,
name: $.params.name,
namespace: $.params.namespace,
annotations: {
"getambassador.io/config":
std.join("\n", [
"---",
"apiVersion: ambassador/v0",
"kind: Mapping",
"name: tfserving-mapping-" + $.params.name + "-get",
"prefix: /models/" + $.params.name + "/",
"rewrite: /",
"method: GET",
"service: " + $.params.name + "." + $.params.namespace + ":8000",
"---",
"apiVersion: ambassador/v0",
"kind: Mapping",
"name: tfserving-mapping-" + $.params.name + "-post",
"prefix: /models/" + $.params.name + "/",
"rewrite: /model/" + $.params.name + ":predict",
"method: POST",
"service: " + $.params.name + "." + $.params.namespace + ":8000",
]),
}, //annotations
},
spec: {
ports: [
{
name: "grpc-tf-serving",
port: 9000,
targetPort: 9000,
},
{
name: "http-tf-serving-proxy",
port: 8000,
targetPort: 8000,
},
],
selector: $.params.labels,
type: $.params.serviceType,
},
}, // tfService
defaultRouteRule: {
apiVersion: "config.istio.io/v1alpha2",
kind: "RouteRule",
metadata: {
name: $.params.name + "-default",
namespace: $.params.namespace,
},
spec: {
destination: {
name: $.params.name,
},
precedence: 0,
route: [
{
labels: { version: $.params.version },
},
],
},
},
}, // parts
// Parts specific to S3
s3parts:: $.parts {
s3Env:: [
{ name: "AWS_ACCESS_KEY_ID", valueFrom: { secretKeyRef: { name: $.s3params.s3SecretName, key: $.s3params.s3SecretAccesskeyidKeyName } } },
{ name: "AWS_SECRET_ACCESS_KEY", valueFrom: { secretKeyRef: { name: $.s3params.s3SecretName, key: $.s3params.s3SecretSecretaccesskeyKeyName } } },
{ name: "AWS_REGION", value: $.s3params.s3AwsRegion },
{ name: "S3_REGION", value: $.s3params.s3AwsRegion },
{ name: "S3_USE_HTTPS", value: $.s3params.s3UseHttps },
{ name: "S3_VERIFY_SSL", value: $.s3params.s3VerifySsl },
{ name: "S3_ENDPOINT", value: $.s3params.s3Endpoint },
],
tfServingContainer: $.parts.tfServingContainer {
env+: $.s3parts.s3Env,
},
tfDeployment: $.parts.tfDeployment {
spec: +{
template: +{
metadata: $.parts.tfServingMetadata,
spec: +{
containers: [
$.s3parts.tfServingContainer,
if $.util.toBool($.params.deployHttpProxy) then
$.parts.httpProxyContainer,
],
},
},
},
}, // tfDeployment
}, // s3parts
// Parts specific to GCP
gcpParts:: $.parts {
gcpEnv:: [
if $.gcpParams.gcpCredentialSecretName != "" then
{ name: "GOOGLE_APPLICATION_CREDENTIALS", value: "/secret/gcp-credentials/user-gcp-sa.json" },
],
tfServingContainer: $.parts.tfServingContainer {
env+: $.gcpParts.gcpEnv,
volumeMounts+: [
if $.gcpParams.gcpCredentialSecretName != "" then
{
name: "gcp-credentials",
mountPath: "/secret/gcp-credentials",
},
],
},
tfDeployment: $.parts.tfDeployment {
spec+: {
template+: {
metadata: $.parts.tfServingMetadata,
spec+: {
containers: [
$.gcpParts.tfServingContainer,
if $.util.toBool($.params.deployHttpProxy) then
$.parts.httpProxyContainer,
],
volumes: [
if $.gcpParams.gcpCredentialSecretName != "" then
{
name: "gcp-credentials",
secret: {
secretName: $.gcpParams.gcpCredentialSecretName,
},
},
],
},
},
},
}, // tfDeployment
}, // gcpParts
}


@ -0,0 +1,21 @@
// Some useful routines.
{
local k = import "k.libsonnet",
// Convert non-boolean types like string,number to a boolean.
// This is primarily intended for dealing with parameters that should be booleans.
toBool:: function(x) {
result::
if std.type(x) == "boolean" then
x
else if std.type(x) == "string" then
std.asciiUpper(x) == "TRUE"
else if std.type(x) == "number" then
x != 0
else
false,
}.result,
// Produce a list of manifests. obj must be an array
list(obj):: k.core.v1.list.new(obj,),
}


@ -21,6 +21,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import json
import os
import sys
import numpy as np
@ -126,6 +127,22 @@ def linear_serving_input_receiver_fn():
def main(_):
tf.logging.set_verbosity(tf.logging.INFO)
tf_config = os.environ.get('TF_CONFIG', '{}')
tf.logging.info("TF_CONFIG %s", tf_config)
tf_config_json = json.loads(tf_config)
cluster = tf_config_json.get('cluster')
job_name = tf_config_json.get('task', {}).get('type')
task_index = tf_config_json.get('task', {}).get('index')
tf.logging.info("cluster=%s job_name=%s task_index=%s", cluster, job_name,
task_index)
is_chief = False
if not job_name or job_name.lower() in ["chief", "master"]:
is_chief = True
tf.logging.info("Will export model")
else:
tf.logging.info("Will not export model")
# Download and load MNIST dataset.
mnist = tf.contrib.learn.datasets.DATASETS['mnist'](TF_DATA_DIR)
train_input_fn = tf.estimator.inputs.numpy_input_fn(
@ -151,6 +168,8 @@ def main(_):
classifier = tf.estimator.LinearClassifier(
feature_columns=feature_columns, n_classes=N_DIGITS,
model_dir=TF_MODEL_DIR, config=training_config)
# TODO(jlewi): Should it be linear_serving_input_receiver_fn here?
serving_fn = cnn_serving_input_receiver_fn
export_final = tf.estimator.FinalExporter(
TF_EXPORT_DIR, serving_input_receiver_fn=cnn_serving_input_receiver_fn)
@ -158,6 +177,7 @@ def main(_):
# Convolutional network
classifier = tf.estimator.Estimator(
model_fn=conv_model, model_dir=TF_MODEL_DIR, config=training_config)
serving_fn = cnn_serving_input_receiver_fn
export_final = tf.estimator.FinalExporter(
TF_EXPORT_DIR, serving_input_receiver_fn=cnn_serving_input_receiver_fn)
else:
@ -171,7 +191,14 @@ def main(_):
exporters=export_final,
throttle_secs=1,
start_delay_secs=1)
print("Train and evaluate")
tf.estimator.train_and_evaluate(classifier, train_spec, eval_spec)
print("Training done")
if is_chief:
print("Export saved model")
classifier.export_savedmodel(TF_EXPORT_DIR, serving_input_receiver_fn=serving_fn)
print("Done exporting the model")
if __name__ == '__main__':
tf.app.run()


@ -0,0 +1,76 @@
"""Test deploying the mnist model.
This file tests that we can deploy the model.
TODO(jlewi): Test that we can send predictions to the model.
It is an integration test as it depends on having access to
a Kubeflow deployment to deploy on. It also depends on having a model.
Python Path Requirements:
kubeflow/testing/py - https://github.com/kubeflow/testing/tree/master/py
* Provides utilities for testing
Manually running the test
1. Configure your KUBECONFIG file to point to the desired cluster
2. Set --params=name=${NAME},namespace=${NAMESPACE}
* name should be the name for your job
* namespace should be the namespace to use
3. Use the modelBasePath parameter to point at the model to test.
--params=...,modelBasePath=${MODEL_BASE_PATH}
"""
import logging
import os
from kubernetes import client as k8s_client
from py import test_runner
from kubeflow.testing import ks_util
from kubeflow.testing import test_util
from kubeflow.testing import util
class MnistDeployTest(test_util.TestCase):
def __init__(self, args):
namespace, name, env = test_runner.parse_runtime_params(args)
self.app_dir = args.app_dir
if not self.app_dir:
self.app_dir = os.path.join(os.path.dirname(__file__), "..",
"ks_app")
self.app_dir = os.path.abspath(self.app_dir)
logging.info("--app_dir not set defaulting to: %s", self.app_dir)
self.env = env
self.namespace = namespace
self.params = args.params
self.ks_cmd = ks_util.get_ksonnet_cmd(self.app_dir)
super(MnistDeployTest, self).__init__(class_name="MnistDeployTest",
name=name)
def test_serve(self):
# Deploy the model with the mnist-deploy-gcp and mnist-service components,
# then wait for the serving deployment to become available.
api_client = k8s_client.ApiClient()
# Apply the components
for component in ["mnist-deploy-gcp", "mnist-service"]:
# Setup the ksonnet app
ks_util.setup_ks_app(self.app_dir, self.env, self.namespace, component,
self.params)
util.run([self.ks_cmd, "apply", self.env, "-c", component],
cwd=self.app_dir)
logging.info("Created deployment %s in namespaces %s", self.name, self.namespace)
util.wait_for_deployment(api_client, self.namespace, self.name,
timeout_minutes=4)
# We don't delete the resources. We depend on the namespace being
# garbage collected.
if __name__ == "__main__":
test_runner.main(module=__name__)

test/copy_secret.sh (new executable file, 33 lines)

@ -0,0 +1,33 @@
#!/bin/bash
#
# A simple script to copy a secret from 1 namespace to another
#
# Usage
# copy_secret <source namespace> <dest namespace> <secret name>
set -e
SOURCE=$1
DEST=$2
NAME=$3
usage() {
echo copy_secret "<source namespace> <dest namespace> <secret name>"
}
if [ -z ${SOURCE} ]; then
usage
exit -1
fi
if [ -z ${DEST} ]; then
usage
exit -1
fi
if [ -z ${NAME} ]; then
usage
exit -1
fi
echo getting secret
SECRET=$(kubectl -n ${SOURCE} get secrets ${NAME} -o jsonpath="{.data.${NAME}\.json}" | base64 -d)
kubectl create -n ${DEST} secret generic ${NAME} --from-literal="${NAME}.json=${SECRET}"
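For reference, the same copy can be done with the Kubernetes Python client. This is a hypothetical sketch, not part of this change; the secret name and namespaces below simply mirror how the CI workflow calls the script.
```python
# Minimal sketch: copy a secret between namespaces with the Kubernetes client.
# The name, source, and destination values are illustrative.
from kubernetes import client, config

config.load_kube_config()
v1 = client.CoreV1Api()

name, source, dest = "user-gcp-sa", "kubeflow", "mnist-1234"
secret = v1.read_namespaced_secret(name, source)
copied = client.V1Secret(
    metadata=client.V1ObjectMeta(name=name, namespace=dest),
    data=secret.data,
    type=secret.type,
)
v1.create_namespaced_secret(dest, copied)
```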


@ -25,6 +25,12 @@ local defaultParams = {
// The bucket where the model should be written
// This needs to be writable by the GCP service account in the Kubeflow cluster (not the test cluster)
modelBucket: "kubeflow-ci_temp",
// Whether to delete the namespace at the end.
// Leaving the namespace around can be useful for debugging.
//
// TODO(jlewi): We should consider running a cronjob to GC old namespaces.
deleteNamespace: false,
};
local params = defaultParams + overrides;
@ -77,6 +83,9 @@ local modelDir = "gs://" + params.modelBucket + "/mnist/models/" + prowDict["BUI
// value of KUBECONFIG environment variable. This should be a full path.
local kubeConfig = testDir + "/.kube/kubeconfig";
// Namespace where tests should run
local testNamespace = "mnist-" + prowDict["BUILD_ID"];
// Build template is a template for constructing Argo step templates.
//
// step_name: Name for the template
@ -233,10 +242,48 @@ local dagTemplates = [
params.kfCluster,
]]
),
workingDir: srcDir + "/github_issue_summarization",
},
dependencies: ["checkout"],
}, // get-kubeconfig
{
// Create the namespace
// TODO(jlewi): We should add some sort of retry.
template: buildTemplate {
name: "create-namespace",
command: util.buildCommand([
[
"echo",
"KUBECONFIG=",
"${KUBECONFIG}",
],
[
"gcloud",
"auth",
"activate-service-account",
"--key-file=${GOOGLE_APPLICATION_CREDENTIALS}",
],
[
"kubectl",
"config" ,
"current-context",
],
[
"kubectl",
"create",
"namespace",
testNamespace,
],
# Copy the GCP secret from the kubeflow namespace to the test namespace
[
srcDir + "/test/copy_secret.sh",
"kubeflow",
testNamespace,
"user-gcp-sa",
]]
),
},
dependencies: ["get-kubeconfig"],
}, // create-namespace
{
// Run the python test for TFJob
template: buildTemplate {
@ -247,7 +294,7 @@ local dagTemplates = [
"--artifacts_path=" + artifactsDir,
"--params=" + std.join(",", [
"name=mnist-test-" + prowDict["BUILD_ID"],
"namespace=kubeflow",
"namespace=" + testNamespace,
"numTrainSteps=10",
"batchSize=10",
"image=" + trainerImage,
@ -260,8 +307,25 @@ local dagTemplates = [
])],
workingDir: srcDir + "/mnist/testing",
},
dependencies: ["build-images", "get-kubeconfig"],
dependencies: ["build-images", "create-namespace"],
}, // tfjob-test
{
// Run the python test that deploys and serves the model
template: buildTemplate {
name: "deploy-test",
command: [
"python",
"deploy_test.py",
"--params=" + std.join(",", [
"name=mnist-test-" + prowDict["BUILD_ID"],
"namespace=" + testNamespace,
"modelBasePath=" + modelDir + "/export",
"exportDir=" + modelDir,
])],
workingDir: srcDir + "/mnist/testing",
},
dependencies: ["tfjob-test"],
}, // deploy-test
// TODO(jlewi): We should add a non-distributed test that just uses the default values.
];
@ -277,8 +341,35 @@ local dag = {
// Define templates for the steps to be performed when the
// test exits
local deleteTemplates = if params.deleteNamespace then
[
{
// Delete the namespace
// TODO(jlewi): We should add some sort of retry.
template: buildTemplate {
name: "delete-namespace",
command: util.buildCommand([
[
"gcloud",
"auth",
"activate-service-account",
"--key-file=${GOOGLE_APPLICATION_CREDENTIALS}",
],
[
"kubectl",
"delete",
"namespace",
testNamespace,
]]
),
},
}, // delete-namespace
] else [];
local exitTemplates =
[
deleteTemplates +
[
{
// Copy artifacts to GCS for gubernator.
// TODO(https://github.com/kubeflow/testing/issues/257): Create-pr-symlink
@ -294,7 +385,6 @@ local exitTemplates =
"--bucket=" + bucket,
],
}, // copy-artifacts,
},
{
// Delete the test directory in NFS.
@ -314,7 +404,7 @@ local exitTemplates =
},
},
}, // test-dir-delete
dependencies: ["copy-artifacts"],
dependencies: ["copy-artifacts"] + if params.deleteNamespace then ["delete-namespace"] else [],
},
];


@ -14,8 +14,8 @@ local envParams = params + {
},
mnist+: {
namespace: 'kubeflow-test-infra',
name: 'jlewi-mnist-test-465-0109-050605',
prow_env: 'JOB_NAME=mnist-test,JOB_TYPE=presubmit,REPO_NAME=examples,REPO_OWNER=kubeflow,BUILD_NUMBER=0109-050605,BUILD_ID=0109-050605,PULL_NUMBER=465',
name: 'jlewi-mnist-test-469-0111-081531',
prow_env: 'JOB_NAME=mnist-test,JOB_TYPE=presubmit,REPO_NAME=examples,REPO_OWNER=kubeflow,BUILD_NUMBER=0111-081531,BUILD_ID=0111-081531,PULL_NUMBER=469',
},
},
};