mirror of https://github.com/kubeflow/examples.git
Update serving in mnist example; use 0.4 and add testing. (#469)
* Add the TFServing component.
* Create TFServing components.
* The model.py code doesn't appear to be exporting a model in saved model format; it was missing a call to export. I'm not sure how this ever worked.
* It also looks like there is a bug in the code in that it's using the cnn input fn even if the model is the linear one. I'm going to leave that as is for now.
* Create a namespace for each test run; delete the namespace on teardown.
* We need to copy the GCP service account key to the new namespace. Add a shell script to do that.

This commit is contained in:
parent ef108dbbcc
commit 2494fdf8c5

@@ -473,7 +473,74 @@ kubectl port-forward ${PODNAME} 6006:6006
Tensorboard can now be accessed at [http://127.0.0.1:6006](http://127.0.0.1:6006).

## Using TensorFlow serving
## Serving the model

The model code will export the model in saved model format which is suitable for serving with TensorFlow serving.

To serve the model follow the instructions below. The instructions vary slightly based on where you are storing your
model (e.g. GCS, S3, PVC). Depending on the storage system we provide different ksonnet components as a convenience
for setting relevant environment variables.
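
For example, you can list the parameters a component exposes before overriding any of them; a minimal sketch, assuming you run it from the ksonnet app directory (`ks_app`):

```
ks param list mnist-deploy-gcp
```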

### GCS

Here we show how to serve the model when it is stored on GCS. This assumes that when you trained the model you set `exportDir` to a GCS
URI; if not you can always copy it to GCS using `gsutil`.
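
For instance, a minimal sketch of copying a local export directory to GCS, assuming `./export` is where the model was exported locally and `${EXPORT_DIR}` is a GCS URI you can write to:

```
gsutil -m cp -r ./export/* ${EXPORT_DIR}
```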

Check that a model was exported

```
gsutil ls -r ${EXPORT_DIR}
```

The output should look something like

```
gs://${EXPORT_DIR}/1547100373/saved_model.pb
gs://${EXPORT_DIR}/1547100373/variables/:
gs://${EXPORT_DIR}/1547100373/variables/
gs://${EXPORT_DIR}/1547100373/variables/variables.data-00000-of-00001
gs://${EXPORT_DIR}/1547100373/variables/variables.index
```

The number `1547100373` is a version number auto-generated by TensorFlow; it will vary on each run but should be monotonically increasing if you export a model to the same location as a previous run.

Set your model path

```
ks param set ${ENV} mnist-deploy-gcp modelBasePath ${EXPORT_DIR}
```

Deploy it

```
ks apply ${ENV} -c mnist-deploy-gcp
```

You can check the deployment by running

```
kubectl describe deployments mnist-deploy-gcp
```

### S3

TODO: Add instructions

### PVC

TODO: Add instructions

### Create the K8s service

Next we need to create a K8s service to route traffic to our model

```
ks apply ${ENV} -c mnist-service
```

By default the workflow deploys our model via TensorFlow Serving. Included in this example is a client that can query your model and provide results:
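
One simple way to smoke-test the deployed model is to hit TensorFlow Serving's REST API directly; a minimal sketch, assuming the service is named `mnist-service`, the model is named `mnist`, and the service exposes the REST port 8500 as in the components above:

```
# Forward the serving service's REST port to your machine.
kubectl port-forward svc/mnist-service 8500:8500 &

# Query the model's status; a healthy model reports state AVAILABLE.
curl http://localhost:8500/v1/models/mnist
```

Predictions can then be posted to `http://localhost:8500/v1/models/mnist:predict` with a JSON body whose `instances` field matches the exported serving signature.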

@@ -2,14 +2,28 @@ apiVersion: 0.3.0
environments:
  jlewi:
    destination:
      namespace: kubeflow
      namespace: jlewi
      server: https://35.196.210.94
    k8sVersion: v1.11.5
    path: jlewi
  test-env-d5e3:
    destination:
      namespace: jlewi
      server: https://35.196.210.94
    k8sVersion: v1.11.5
    path: test-env-d5e3
kind: ksonnet.io/app
libraries:
  kubeflow/tf-serving:
    name: tf-serving
    registry: kubeflow
    version: fed535eaa276220e4edf59530c0629f4375a40a9
name: ks_app
registries:
  incubator:
    protocol: github
    uri: github.com/ksonnet/parts/tree/master/incubator
  kubeflow:
    protocol: github
    uri: github.com/kubeflow/kubeflow/tree/v0.4-branch/kubeflow
version: 0.0.1

@@ -0,0 +1,39 @@
local env = std.extVar("__ksonnet/environments");
local params = std.extVar("__ksonnet/params").components["mnist-deploy-aws"];

local k = import "k.libsonnet";
local deployment = k.apps.v1beta1.deployment;
local container = deployment.mixin.spec.template.spec.containersType;

local util = import "kubeflow/tf-serving/util.libsonnet";
local tfserving = import "kubeflow/tf-serving/tf-serving-template.libsonnet";

local base = tfserving.new(env, params);
local tfDeployment = base.tfDeployment +
  deployment.mapContainers(
    function(c) {
      result::
        c + container.withEnvMixin(
          if util.toBool(params.s3Enable) then (
            [
              {
                name: "AWS_ACCESS_KEY_ID",
                valueFrom: { secretKeyRef: { name: params.s3SecretName, key: params.s3SecretAccesskeyidKeyName } },
              },
              {
                name: "AWS_SECRET_ACCESS_KEY",
                valueFrom: { secretKeyRef: { name: params.s3SecretName, key: params.s3SecretSecretaccesskeyKeyName } },
              },
              { name: "AWS_REGION", value: params.s3AwsRegion },
              { name: "S3_USE_HTTPS", value: std.toString(params.s3UseHttps) },
              { name: "S3_VERIFY_SSL", value: std.toString(params.s3VerifySsl) },
              { name: "S3_ENDPOINT", value: params.s3Endpoint },
            ]
          ) else [],
        ),
    }.result,
  );
util.list([
  tfDeployment,
  base.tfservingConfig,
])

@@ -0,0 +1,47 @@
local env = std.extVar("__ksonnet/environments");
local params = std.extVar("__ksonnet/params").components["mnist-deploy-gcp"];

local k = import "k.libsonnet";
local deployment = k.apps.v1beta1.deployment;
local container = deployment.mixin.spec.template.spec.containersType;

local util = import "kubeflow/tf-serving/util.libsonnet";
local tfserving = import "kubeflow/tf-serving/tf-serving-template.libsonnet";

local base = tfserving.new(env, params);
local tfDeployment = base.tfDeployment +
  deployment.mixin.spec.template.spec.withVolumesMixin(
    if params.gcpCredentialSecretName != "null" then (
      [{
        name: "gcp-credentials",
        secret: {
          secretName: params.gcpCredentialSecretName,
        },
      }]
    ) else [],
  ) +
  deployment.mapContainers(
    function(c) {
      result::
        c + container.withEnvMixin(
          if params.gcpCredentialSecretName != "null" then (
            [{
              name: "GOOGLE_APPLICATION_CREDENTIALS",
              value: "/secret/gcp-credentials/user-gcp-sa.json",
            }]
          ) else [],
        ) +
        container.withVolumeMountsMixin(
          if params.gcpCredentialSecretName != "null" then (
            [{
              name: "gcp-credentials",
              mountPath: "/secret/gcp-credentials",
            }]
          ) else [],
        ),
    }.result,
  );
util.list([
  tfDeployment,
  base.tfservingConfig,
])

@@ -0,0 +1,8 @@
local env = std.extVar("__ksonnet/environments");
local params = std.extVar("__ksonnet/params").components["mnist-service"];

local k = import "k.libsonnet";
local tfservingService = import "kubeflow/tf-serving/tf-serving-service-template.libsonnet";
local util = import "kubeflow/tf-serving/util.libsonnet";

tfservingService.new(env, params).all

@@ -4,15 +4,58 @@
    train: {
      batchSize: 100,
      envVariables: 'GOOGLE_APPLICATION_CREDENTIALS=/var/secrets/user-gcp-sa.json',
      exportDir: 'gs://kubeflow-ci_temp/mnist-jlewi',
      image: 'gcr.io/kubeflow-examples/mnist/model:v20190108-v0.2-137-g38daafa-dirty-911944',
      exportDir: 'gs://kubeflow-ci_temp/mnist-jlewi/export',
      image: 'gcr.io/kubeflow-examples/mnist/model:v20190111-v0.2-148-g313770f',
      learningRate: '0.01',
      modelDir: 'gs://kubeflow-ci_temp/mnist-jlewi',
      name: 'mnist-train',
      numPs: 1,
      numWorkers: 2,
      numPs: 0,
      numWorkers: 0,
      secret: '',
      trainSteps: 200,
    },
    "mnist-deploy-gcp": {
      defaultCpuImage: 'tensorflow/serving:1.11.1',
      defaultGpuImage: 'tensorflow/serving:1.11.1-gpu',
      deployHttpProxy: 'false',
      enablePrometheus: 'true',
      gcpCredentialSecretName: 'user-gcp-sa',
      httpProxyImage: '',
      injectIstio: 'false',
      modelBasePath: 'gs://kubeflow-examples-data/mnist',
      modelName: 'mnist',
      name: 'mnist-deploy-gcp',
      numGpus: '0',
      versionName: 'v1',
    },
    "mnist-deploy-aws": {
      defaultCpuImage: 'tensorflow/serving:1.11.1',
      defaultGpuImage: 'tensorflow/serving:1.11.1-gpu',
      deployHttpProxy: 'false',
      enablePrometheus: 'true',
      httpProxyImage: 'gcr.io/kubeflow-images-public/tf-model-server-http-proxy:v20180723',
      injectIstio: 'false',
      modelBasePath: 's3://kubeflow-examples-data/mnist',
      modelName: 'null',
      name: 'mnist-deploy-aws',
      numGpus: '0',
      s3AwsRegion: 'us-west-1',
      s3Enable: 'false',
      s3Endpoint: 's3.us-west-1.amazonaws.com',
      s3SecretAccesskeyidKeyName: 'AWS_ACCESS_KEY_ID',
      s3SecretName: 'null',
      s3SecretSecretaccesskeyKeyName: 'AWS_SECRET_ACCESS_KEY',
      s3UseHttps: 'true',
      s3VerifySsl: 'true',
      versionName: 'v1',
    },
    "mnist-service": {
      enablePrometheus: 'true',
      injectIstio: 'false',
      modelName: 'null',
      name: 'mnist-service',
      serviceType: 'ClusterIP',
      trafficRule: 'v1:100',
    },
  },
}

@@ -43,8 +43,9 @@ local trainEnv = [
  },
];

local secretName = std.split(params.secret, "=")[0];
local secretMountPath = std.split(params.secret, "=")[1];
local secretPieces = std.split(params.secret, "=");
local secretName = if std.length(secretPieces) > 0 then secretPieces[0] else "";
local secretMountPath = if std.length(secretPieces) > 1 then secretPieces[1] else "";

local replicaSpec = {
  containers: [

@@ -8,6 +8,23 @@ local envParams = params + {
    train+: {
      name: 'mnist-train-dist',
      secret: 'user-gcp-sa=/var/secrets',
      numSteps: 10,
      image: 'gcr.io/kubeflow-examples/mnist/model:v20190111-v0.2-146-g0bbff62-dirty-12f353',
      numWorkers: 2,
      numPs: 1,
    },
    "deploy-gcp"+: {
      modelBasePath: 'gs://kubeflow-ci_temp/mnist-jlewi/export',
    },
    "mnist-deploy-gcp"+: {
      modelBasePath: 'gs://kubeflow-ci_temp/mnist-jlewi/export',
      name: 'jlewi-deploy-test',
      namespace: 'jlewi',
    },
    "mnist-service"+: {
      name: 'jlewi-deploy-test',
      namespace: 'jlewi',
      modelBasePath: 'gs://kubeflow-ci_temp/mnist-jlewi/export',
    },
  },
};

@@ -0,0 +1,2 @@
{
}

@@ -0,0 +1,9 @@
local base = import "base.libsonnet";
// uncomment if you reference ksonnet-lib
// local k = import "k.libsonnet";
// local deployment = k.apps.v1beta2.deployment;

base + {
  // Insert user-specified overrides here. For example if a component is named "nginx-deployment", you might have something like:
  // "nginx-deployment"+: deployment.mixin.metadata.withLabels({foo: "bar"})
}

@@ -0,0 +1,28 @@
local params = std.extVar('__ksonnet/params');
local globals = import 'globals.libsonnet';
local envParams = params + {
  components+: {
    train+: {
      name: 'jlewi-deploy-test',
      namespace: 'jlewi',
      modelBasePath: 'gs://kubeflow-ci_temp/mnist-jlewi/export',
    },
    "mnist-deploy-gcp"+: {
      name: 'jlewi-deploy-test',
      namespace: 'jlewi',
      modelBasePath: 'gs://kubeflow-ci_temp/mnist-jlewi/export',
    },
    "mnist-service"+: {
      name: 'jlewi-deploy-test',
      namespace: 'jlewi',
      modelBasePath: 'gs://kubeflow-ci_temp/mnist-jlewi/export',
    },
  },
};

{
  components: {
    [x]: envParams.components[x] + globals
    for x in std.objectFields(envParams.components)
  },
}

mnist/ks_app/vendor/kubeflow/tf-serving@fed535eaa276220e4edf59530c0629f4375a40a9/README.md (vendored, new file, 73 lines)

@@ -0,0 +1,73 @@
<!-- START doctoc generated TOC please keep comment here to allow auto update -->
<!-- DON'T EDIT THIS SECTION, INSTEAD RE-RUN doctoc TO UPDATE -->
**Table of Contents** *generated with [DocToc](https://github.com/thlorenz/doctoc)*

- [tf-serving](#tf-serving)
  - [Quickstart](#quickstart)
  - [Using the library](#using-the-library)
    - [io.ksonnet.pkg.tf-serving](#ioksonnetpkgtf-serving)
      - [Example](#example)
      - [Parameters](#parameters)

<!-- END doctoc generated TOC please keep comment here to allow auto update -->

# tf-serving

> TensorFlow serving is a server for TensorFlow models.


* [Quickstart](#quickstart)
* [Using Prototypes](#using-prototypes)
  * [io.ksonnet.pkg.tf-serving](#io.ksonnet.pkg.tf-serving)

## Quickstart

*The following commands use the `io.ksonnet.pkg.tf-serving` prototype to generate Kubernetes YAML for tf-serving, and then deploy it to your Kubernetes cluster.*

First, create a cluster and install the ksonnet CLI (see root-level [README.md](rootReadme)).

If you haven't yet created a [ksonnet application](linkToSomewhere), do so using `ks init <app-name>`.

Finally, in the ksonnet application directory, run the following:

```shell
# Expand prototype as a Jsonnet file, place in a file in the
# `components/` directory. (YAML and JSON are also available.)
$ ks prototype use io.ksonnet.pkg.tf-serving tf-serving \
  --name tf-serving \
  --namespace default

# Apply to server.
$ ks apply -f tf-serving.jsonnet
```

## Using the library

The library files for tf-serving define a set of relevant *parts* (_e.g._, deployments, services, secrets, and so on) that can be combined to configure tf-serving for a wide variety of scenarios. For example, a database like Redis may need a secret to hold the user password, or it may have no password if it's acting as a cache.

This library provides a set of pre-fabricated "flavors" (or "distributions") of tf-serving, each of which is configured for a different use case. These are captured as ksonnet *prototypes*, which allow users to interactively customize these distributions for their specific needs.
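
Before the prototypes are usable in an application, the package has to be installed from the registry; a minimal sketch, assuming the kubeflow registry configured in app.yaml above (the version pin shown is illustrative):

```shell
ks pkg install kubeflow/tf-serving@fed535eaa276220e4edf59530c0629f4375a40a9
```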

These prototypes, as well as how to use them, are enumerated below.

### io.ksonnet.pkg.tf-serving

TensorFlow serving

#### Example

```shell
# Expand prototype as a Jsonnet file, place in a file in the
# `components/` directory. (YAML and JSON are also available.)
$ ks prototype use io.ksonnet.pkg.tf-serving tf-serving \
  --name YOUR_NAME_HERE \
  --model_path YOUR_MODEL_PATH_HERE
```

#### Parameters

The available options to pass the prototype are:

* `--name=<name>`: Name to give to each of the components [string]
* `--model_path=<model_path>`: Path to the model. This can be a GCS path. [string]

[rootReadme]: https://github.com/ksonnet/mixins

mnist/ks_app/vendor/kubeflow/tf-serving@fed535eaa276220e4edf59530c0629f4375a40a9/parts.yaml (vendored, new file, 35 lines)

@@ -0,0 +1,35 @@
{
  "name": "tf-serving",
  "apiVersion": "0.0.1",
  "kind": "ksonnet.io/parts",
  "description": "TensorFlow serving is a server for TensorFlow models.\n",
  "author": "kubeflow team <kubeflow-team@google.com>",
  "contributors": [
    {
      "name": "Jeremy Lewi",
      "email": "jlewi@google.com"
    }
  ],
  "repository": {
    "type": "git",
    "url": "https://github.com/kubeflow/kubeflow"
  },
  "bugs": {
    "url": "https://github.com/kubeflow/kubeflow/issues"
  },
  "keywords": [
    "kubeflow",
    "tensorflow",
    "database"
  ],
  "quickStart": {
    "prototype": "io.ksonnet.pkg.tf-serving",
    "componentName": "tf-serving",
    "flags": {
      "name": "tf-serving",
      "namespace": "default"
    },
    "comment": "Run TensorFlow Serving"
  },
  "license": "Apache 2.0"
}

@@ -0,0 +1,23 @@
// @apiVersion 0.1
// @name io.ksonnet.pkg.tf-serving
// @description TensorFlow serving
// @shortDescription A TensorFlow serving deployment
// @param name string Name to give to each of the components

local k = import "k.libsonnet";

// ksonnet appears to require name be a parameter of the prototype which is why we handle it differently.
local name = import "param://name";

// updatedParams includes the namespace from env by default.
local updatedParams = params + env;

local tfServingBase = import "kubeflow/tf-serving/tf-serving.libsonnet";
local tfServing = tfServingBase {
  // Override parameters with user supplied parameters.
  params+: updatedParams {
    name: name,
  },
};

std.prune(k.core.v1.list.new(tfServing.components))

@@ -0,0 +1,61 @@
// @apiVersion 0.1
// @name io.ksonnet.pkg.tf-serving-deployment-aws
// @description TensorFlow serving
// @shortDescription A TensorFlow serving deployment
// @param name string Name to give to each of the components
// @optionalParam numGpus string 0 Number of gpus to use
// @optionalParam deployHttpProxy string false Whether to deploy http proxy
// @optionalParam injectIstio string false Whether to inject istio sidecar; should be true or false.
// @optionalParam enablePrometheus string true Whether to enable prometheus endpoint (requires TF 1.11)
// @optionalParam modelBasePath string s3://kubeflow-examples-data/mnist The model path
// @optionalParam modelName string null The model name
// @optionalParam versionName string v1 The version name
// @optionalParam defaultCpuImage string tensorflow/serving:1.11.1 The default model server image (cpu)
// @optionalParam defaultGpuImage string tensorflow/serving:1.11.1-gpu The default model server image (gpu)
// @optionalParam httpProxyImage string gcr.io/kubeflow-images-public/tf-model-server-http-proxy:v20180723 Http proxy image
// @optionalParam s3Enable string false Whether to enable S3
// Following parameters are needed only if s3Enable is true
// @optionalParam s3SecretName string null Name of the k8s secrets containing S3 credentials
// @optionalParam s3SecretAccesskeyidKeyName string AWS_ACCESS_KEY_ID Name of the key in the k8s secret containing AWS_ACCESS_KEY_ID
// @optionalParam s3SecretSecretaccesskeyKeyName string AWS_SECRET_ACCESS_KEY Name of the key in the k8s secret containing AWS_SECRET_ACCESS_KEY
// @optionalParam s3AwsRegion string us-west-1 S3 region
// @optionalParam s3UseHttps string true Whether or not to use https
// @optionalParam s3VerifySsl string true Whether or not to verify https certificates for S3 connections
// @optionalParam s3Endpoint string s3.us-west-1.amazonaws.com URL for your s3-compatible endpoint

local k = import "k.libsonnet";
local deployment = k.apps.v1beta1.deployment;
local container = deployment.mixin.spec.template.spec.containersType;

local util = import "kubeflow/tf-serving/util.libsonnet";
local tfserving = import "kubeflow/tf-serving/tf-serving-template.libsonnet";

local base = tfserving.new(env, params);
local tfDeployment = base.tfDeployment +
  deployment.mapContainers(
    function(c) {
      result::
        c + container.withEnvMixin(
          if util.toBool(params.s3Enable) then (
            [
              {
                name: "AWS_ACCESS_KEY_ID",
                valueFrom: { secretKeyRef: { name: params.s3SecretName, key: params.s3SecretAccesskeyidKeyName } },
              },
              {
                name: "AWS_SECRET_ACCESS_KEY",
                valueFrom: { secretKeyRef: { name: params.s3SecretName, key: params.s3SecretSecretaccesskeyKeyName } },
              },
              { name: "AWS_REGION", value: params.s3AwsRegion },
              { name: "S3_USE_HTTPS", value: std.toString(params.s3UseHttps) },
              { name: "S3_VERIFY_SSL", value: std.toString(params.s3VerifySsl) },
              { name: "S3_ENDPOINT", value: params.s3Endpoint },
            ]
          ) else [],
        ),
    }.result,
  );
util.list([
  tfDeployment,
  base.tfservingConfig,
])

@@ -0,0 +1,61 @@
// @apiVersion 0.1
// @name io.ksonnet.pkg.tf-serving-deployment-gcp
// @description TensorFlow serving
// @shortDescription A TensorFlow serving deployment
// @param name string Name to give to each of the components
// @optionalParam numGpus string 0 Number of gpus to use
// @optionalParam deployHttpProxy string false Whether to deploy http proxy
// @optionalParam modelBasePath string gs://kubeflow-examples-data/mnist The model path
// @optionalParam modelName string null The model name
// @optionalParam versionName string v1 The version name
// @optionalParam defaultCpuImage string tensorflow/serving:1.11.1 The default model server image (cpu)
// @optionalParam defaultGpuImage string tensorflow/serving:1.11.1-gpu The default model server image (gpu)
// @optionalParam httpProxyImage string gcr.io/kubeflow-images-public/tf-model-server-http-proxy:v20180723 Http proxy image
// @optionalParam gcpCredentialSecretName string null If not empty, insert the secret credential
// @optionalParam injectIstio string false Whether to inject istio sidecar; should be true or false.
// @optionalParam enablePrometheus string true Whether to enable prometheus endpoint (requires TF 1.11)

local k = import "k.libsonnet";
local deployment = k.apps.v1beta1.deployment;
local container = deployment.mixin.spec.template.spec.containersType;

local util = import "kubeflow/tf-serving/util.libsonnet";
local tfserving = import "kubeflow/tf-serving/tf-serving-template.libsonnet";

local base = tfserving.new(env, params);
local tfDeployment = base.tfDeployment +
  deployment.mixin.spec.template.spec.withVolumesMixin(
    if params.gcpCredentialSecretName != "null" then (
      [{
        name: "gcp-credentials",
        secret: {
          secretName: params.gcpCredentialSecretName,
        },
      }]
    ) else [],
  ) +
  deployment.mapContainers(
    function(c) {
      result::
        c + container.withEnvMixin(
          if params.gcpCredentialSecretName != "null" then (
            [{
              name: "GOOGLE_APPLICATION_CREDENTIALS",
              value: "/secret/gcp-credentials/user-gcp-sa.json",
            }]
          ) else [],
        ) +
        container.withVolumeMountsMixin(
          if params.gcpCredentialSecretName != "null" then (
            [{
              name: "gcp-credentials",
              mountPath: "/secret/gcp-credentials",
            }]
          ) else [],
        ),
    }.result,
  );
util.list([
  tfDeployment,
  base.tfservingConfig,
])

@@ -0,0 +1,16 @@
// @apiVersion 0.1
// @name io.ksonnet.pkg.tf-serving-service
// @description TensorFlow serving
// @shortDescription A TensorFlow serving model
// @param name string Name to give to each of the components
// @optionalParam serviceType string ClusterIP The k8s service type for tf serving.
// @optionalParam modelName string null The model name
// @optionalParam trafficRule string v1:100 The traffic rule, in the format of version:percentage,version:percentage,..
// @optionalParam injectIstio string false Whether to inject istio sidecar; should be true or false.
// @optionalParam enablePrometheus string true Whether to enable prometheus endpoint (requires TF 1.11)

local k = import "k.libsonnet";
local tfservingService = import "kubeflow/tf-serving/tf-serving-service-template.libsonnet";
local util = import "kubeflow/tf-serving/util.libsonnet";

tfservingService.new(env, params).all

@@ -0,0 +1,230 @@
// @apiVersion 0.1
// @name io.ksonnet.pkg.tf-serving-request-log
// @description tf-serving with request logging
// @shortDescription tf-serving with request logging
// @param name string Name to give to each of the components
// @param gcpProject string The gcp project for Bigquery dataset
// @param dataset string The Bigquery dataset
// @param table string The Bigquery table
// @optionalParam modelBasePath string gs://kubeflow-examples-data/mnist The model path
// @optionalParam modelName string mnist The model name

local k = import "k.libsonnet";

local namespace = "kubeflow";
local appName = import "param://name";
local image = "gcr.io/kubeflow-images-public/tf-model-server-cpu:v20180327-995786ec";
local httpProxyImage = "gcr.io/kubeflow-images-public/tf-model-server-http-proxy:v20180723";
local loggingImage = "gcr.io/kubeflow-images-public/tf-model-server-request-logger:v20180723";

local gcpSecretName = "user-gcp-sa";

local service = {
  apiVersion: "v1",
  kind: "Service",
  metadata: {
    labels: {
      app: appName,
    },
    name: appName,
    namespace: namespace,
  },
  spec: {
    ports: [
      {
        name: "grpc-tf-serving",
        port: 9000,
        targetPort: 9000,
      },
      {
        name: "http-tf-serving-proxy",
        port: 8000,
        targetPort: 8000,
      },
    ],
    selector: {
      app: appName,
    },
    type: "ClusterIP",
  },
};

local configMap = {
  apiVersion: "v1",
  kind: "ConfigMap",
  metadata: {
    name: appName + "fluentd-config",
    namespace: namespace,
  },
  data: {
    "fluent.conf": std.format(|||
      <source>
        @type tail
        path /tmp/logs/request.log
        pos_file /tmp/logs/request.log.pos
        <parse>
          @type json
        </parse>
        tag dummy
      </source>
      <match dummy>
        @type bigquery_insert
        auth_method application_default
        project %s
        dataset %s
        table %s
        fetch_schema true
      </match>
    |||, [params.gcpProject, params.dataset, params.table]),
  },
};

local deployment = {
  apiVersion: "extensions/v1beta1",
  kind: "Deployment",
  metadata: {
    labels: {
      app: appName,
    },
    name: appName,
    namespace: namespace,
  },
  spec: {
    template: {
      metadata: {
        labels: {
          app: appName,
        },
      },
      spec: {
        containers: [
          // ModelServer
          {
            args: [
              "/usr/bin/tensorflow_model_server",
              "--port=9000",
              "--model_name=" + params.modelName,
              "--model_base_path=" + params.modelBasePath,
            ],
            image: image,
            imagePullPolicy: "IfNotPresent",
            name: "model-server",
            ports: [
              {
                containerPort: 9000,
              },
            ],
            resources: {
              limits: {
                cpu: "4",
                memory: "4Gi",
              },
              requests: {
                cpu: "1",
                memory: "1Gi",
              },
            },
          },
          // Http proxy
          {
            name: "http-proxy",
            image: httpProxyImage,
            imagePullPolicy: "Always",
            command: [
              "python",
              "/usr/src/app/server.py",
              "--port=8000",
              "--rpc_port=9000",
              "--rpc_timeout=10.0",
              "--log_request=true",
            ],
            env: [],
            ports: [
              {
                containerPort: 8000,
              },
            ],
            resources: {
              requests: {
                memory: "1Gi",
                cpu: "1",
              },
              limits: {
                memory: "4Gi",
                cpu: "4",
              },
            },
            securityContext: {
              runAsUser: 1000,
              fsGroup: 1000,
            },
            volumeMounts: [
              {
                name: "request-logs",
                mountPath: "/tmp/logs",
              },
            ],
          },
          // TODO(lunkai): use admission controller to inject.
          // Logging container.
          {
            name: "logging",
            image: loggingImage,
            imagePullPolicy: "Always",
            env: [
              { name: "GOOGLE_APPLICATION_CREDENTIALS", value: "/secret/gcp-credentials/key.json" },
            ],
            resources: {
              requests: {
                memory: "250Mi",
                cpu: "0.25",
              },
              limits: {
                memory: "500Mi",
                cpu: "0.5",
              },
            },
            volumeMounts: [
              {
                name: "request-logs",
                mountPath: "/tmp/logs",
              },
              {
                name: "gcp-credentials",
                mountPath: "/secret/gcp-credentials",
              },
              {
                name: "fluentd-config-volume",
                mountPath: "/fluentd/etc/custom",
              },
            ],
          },
        ],
        volumes: [
          {
            name: "gcp-credentials",
            secret: {
              secretName: gcpSecretName,
            },
          },
          {
            name: "request-logs",
            emptyDir: {},
          },
          {
            configMap: {
              name: "fluentd-config",
            },
            name: "fluentd-config-volume",
          },
        ],
      },
    },
  },
};

k.core.v1.list.new([
  service,
  deployment,
  configMap,
])

@@ -0,0 +1,112 @@
local tfservingService = import "kubeflow/tf-serving/tf-serving-service-template.libsonnet";
local tfserving = import "kubeflow/tf-serving/tf-serving-template.libsonnet";

local params = {
  name: "m",
  serviceType: "ClusterIP",
  modelName: "mnist",
  trafficRule: "v1:100",
  injectIstio: false,
};

local istioParams = params {
  injectIstio: true,
};

local env = {
  namespace: "kubeflow",
};

local deploymentParam = {
  name: "m",
  modelName: "mnist",
  versionName: "v1",
  modelBasePath: "gs://abc",
  numGpus: 0,
  defaultCpuImage: "gcr.io/abc",
  defaultGpuImage: "gcr.io/abc",
  injectIstio: false,
  enablePrometheus: true,
};

local gpuParam1 = {
  name: "m",
  modelName: "mnist",
  versionName: "v1",
  modelBasePath: "gs://abc",
  numGpus: 1,
  defaultCpuImage: "gcr.io/abc",
  defaultGpuImage: "gcr.io/abc",
  injectIstio: false,
  enablePrometheus: true,
};

local gpuParamString0 = {
  name: "m",
  modelName: "mnist",
  versionName: "v1",
  modelBasePath: "gs://abc",
  numGpus: "0",
  defaultCpuImage: "gcr.io/abc",
  defaultGpuImage: "gcr.io/abc",
  injectIstio: false,
  enablePrometheus: true,
};

local gpuParamString1 = {
  name: "m",
  modelName: "mnist",
  versionName: "v1",
  modelBasePath: "gs://abc",
  numGpus: "1",
  defaultCpuImage: "gcr.io/abc",
  defaultGpuImage: "gcr.io/abc",
  injectIstio: false,
  enablePrometheus: true,
};

local serviceInstance = tfservingService.new(env, params);
local istioServiceInstance = tfservingService.new(env, istioParams);

local deploymentInstance = tfserving.new(env, deploymentParam);

local gpuInstance = tfserving.new(env, gpuParam1);
local gpuString0Instance = tfserving.new(env, gpuParamString0);
local gpuString1Instance = tfserving.new(env, gpuParamString1);

// This one should only have tfService
std.assertEqual(
  std.length(serviceInstance.all.items),
  1,
) &&

// This one should have tfService, virtualService, and DestinationRule
std.assertEqual(
  std.length(istioServiceInstance.all.items),
  3
) &&

std.startsWith(
  deploymentInstance.tfDeployment.spec.template.spec.containers[0].args[4],
  "--monitoring_config_file"
) &&

std.assertEqual(
  deploymentInstance.tfDeployment.spec.template.spec.containers[0].resources.limits,
  { cpu: "4", memory: "4Gi" }
) &&

std.assertEqual(
  gpuInstance.tfDeployment.spec.template.spec.containers[0].resources.limits,
  { cpu: "4", memory: "4Gi", "nvidia.com/gpu": 1 }
) &&

std.assertEqual(
  gpuString0Instance.tfDeployment.spec.template.spec.containers[0].resources.limits,
  { cpu: "4", memory: "4Gi" }
) &&

std.assertEqual(
  gpuString1Instance.tfDeployment.spec.template.spec.containers[0].resources.limits,
  { cpu: "4", memory: "4Gi", "nvidia.com/gpu": 1 }
)

@@ -0,0 +1,147 @@
{
  local k = import "k.libsonnet",
  local util = import "kubeflow/tf-serving/util.libsonnet",
  new(_env, _params):: {
    local params = _params + _env,
    local namespace = params.namespace,
    local name = params.name,
    local modelName =
      if params.modelName == "null" then
        params.name
      else
        params.modelName,

    local tfService = {
      apiVersion: "v1",
      kind: "Service",
      metadata: {
        labels: {
          app: modelName,
        },
        name: name,
        namespace: namespace,
        annotations: {
          "getambassador.io/config":
            std.join("\n", [
              "---",
              "apiVersion: ambassador/v0",
              "kind: Mapping",
              "name: tfserving-predict-mapping-" + modelName,
              "prefix: /tfserving/models/" + modelName,
              "rewrite: /v1/models/" + modelName + ":predict",
              "method: POST",
              "service: " + name + "." + namespace + ":8500",
              "---",
              "apiVersion: ambassador/v0",
              "kind: Mapping",
              "name: tfserving-predict-mapping-" + modelName + "-get",
              "prefix: /tfserving/models/" + modelName,
              "rewrite: /v1/models/" + modelName,
              "method: GET",
              "service: " + name + "." + namespace + ":8500",
            ]),
        } + if util.toBool(params.enablePrometheus) then {
          "prometheus.io/scrape": "true",
          "prometheus.io/path": "/monitoring/prometheus/metrics",
          "prometheus.io/port": "8500",
        } else {},  // annotations
      },
      spec: {
        ports: [
          {
            name: "grpc-tf-serving",
            port: 9000,
            targetPort: 9000,
          },
          {
            name: "http-tf-serving",
            port: 8500,
            targetPort: 8500,
          },
        ],
        selector: {
          app: modelName,
        },
        type: params.serviceType,
      },
    },  // tfService
    tfService:: tfService,

    local versionWeights = std.split(params.trafficRule, ","),
    local virtualService = {
      apiVersion: "networking.istio.io/v1alpha3",
      kind: "VirtualService",
      metadata: {
        name: name,
        namespace: namespace,
      },
      spec: {
        hosts: [
          "*",
        ],
        gateways: [
          "kubeflow-gateway",
        ],
        http: [
          {
            match: [
              {
                uri: {
                  prefix: "/istio/tfserving/models/" + modelName,
                },
                method: {
                  exact: "POST",
                },
              },
            ],
            rewrite: {
              uri: "/v1/models/" + modelName + ":predict",
            },
            route: [
              {
                destination: {
                  host: name,
                  port: {
                    number: 8500,
                  },
                  subset: std.split(versionWeight, ":")[0],
                },
                weight: std.parseInt(std.split(versionWeight, ":")[1]),
              }
              for versionWeight in versionWeights
            ],
          },
        ],
      },
    },
    virtualService:: virtualService,

    local destinationRule = {
      apiVersion: "networking.istio.io/v1alpha3",
      kind: "DestinationRule",
      metadata: {
        name: name,
        namespace: namespace,
      },
      spec: {
        host: name,
        subsets: [
          {
            name: std.split(versionWeight, ":")[0],
            labels: {
              version: std.split(versionWeight, ":")[0],
            },
          }
          for versionWeight in versionWeights
        ],
      },
    },
    destinationRule:: destinationRule,
    all:: util.list([
      tfService,
    ] + if util.toBool(params.injectIstio) then [
      virtualService,
      destinationRule,
    ] else []),
  },  // new
}

@@ -0,0 +1,137 @@
{
  local k = import "k.libsonnet",
  local util = import "kubeflow/tf-serving/util.libsonnet",
  new(_env, _params):: {
    local params = _params + _env,
    local namespace = params.namespace,
    local name = params.name,
    local modelName =
      if params.modelName == "null" then
        params.name
      else
        params.modelName,
    local versionName = params.versionName,
    local numGpus =
      if std.type(params.numGpus) == "string" then
        std.parseInt(params.numGpus)
      else
        params.numGpus,
    local modelServerImage =
      if numGpus == 0 then
        params.defaultCpuImage
      else
        params.defaultGpuImage,

    // Optional features.
    // TODO(lunkai): Add request logging

    local modelServerContainer = {
      command: [
        "/usr/bin/tensorflow_model_server",
      ],
      args: [
        "--port=9000",
        "--rest_api_port=8500",
        "--model_name=" + modelName,
        "--model_base_path=" + params.modelBasePath,
      ] + if util.toBool(params.enablePrometheus) then [
        "--monitoring_config_file=/var/config/monitoring_config.txt",
      ] else [],
      image: modelServerImage,
      imagePullPolicy: "IfNotPresent",
      name: modelName,
      ports: [
        {
          containerPort: 9000,
        },
        {
          containerPort: 8500,
        },
      ],
      env: [],
      resources: {
        limits: {
          cpu: "4",
          memory: "4Gi",
        } + if numGpus != 0 then {
          "nvidia.com/gpu": numGpus,
        } else {},
        requests: {
          cpu: "1",
          memory: "1Gi",
        },
      },
      volumeMounts: [
        {
          mountPath: "/var/config/",
          name: "config-volume",
        },
      ],
      // TCP liveness probe on gRPC port
      livenessProbe: {
        tcpSocket: {
          port: 9000,
        },
        initialDelaySeconds: 30,
        periodSeconds: 30,
      },
    },  // modelServerContainer

    local tfDeployment = {
      apiVersion: "extensions/v1beta1",
      kind: "Deployment",
      metadata: {
        labels: {
          app: modelName,
        },
        name: name,
        namespace: namespace,
      },
      spec: {
        template: {
          metadata: {
            labels: {
              app: modelName,
              version: versionName,
            },
            annotations: {
              "sidecar.istio.io/inject": if util.toBool(params.injectIstio) then "true",
            },
          },
          spec: {
            containers: [
              modelServerContainer,
            ],
            volumes: [
              {
                configMap: {
                  name: name + "-config",
                },
                name: "config-volume",
              },
            ],
          },
        },
      },
    },  // tfDeployment
    tfDeployment:: tfDeployment,

    local tfservingConfig = {
      apiVersion: "v1",
      kind: "ConfigMap",
      metadata: {
        name: name + "-config",
        namespace: namespace,
      },
      data: {
        "monitoring_config.txt": std.join("\n", [
          "prometheus_config: {",
          "  enable: true,",
          '  path: "/monitoring/prometheus/metrics"',
          "}",
        ]),
      },
    },  // tfservingConfig
    tfservingConfig:: tfservingConfig,
  },  // new
}

@@ -0,0 +1,380 @@
{
  util:: import "kubeflow/tf-serving/util.libsonnet",

  // Parameters are intended to be late bound.
  params:: {
    name: null,
    numGpus: 0,
    labels: {
      app: $.params.name,
    },
    modelName: $.params.name,
    modelPath: null,
    modelStorageType: "storageType",

    version: "v1",
    firstVersion: true,

    deployIstio: false,

    deployHttpProxy: false,
    httpProxyImage: "gcr.io/kubeflow-images-public/tf-model-server-http-proxy:v20180606-9dfda4f2",

    serviceType: "ClusterIP",

    // If users want to override the image then can override defaultCpuImage and/or defaultGpuImage
    // in which case the image used will still depend on whether GPUs are used or not.
    // Users can also override modelServerImage in which case the user supplied value will always be used
    // regardless of numGpus.
    defaultCpuImage: "tensorflow/serving:1.11.1",
    defaultGpuImage: "tensorflow/serving:1.11.1-gpu",
    modelServerImage: if $.params.numGpus == 0 then
      $.params.defaultCpuImage
    else
      $.params.defaultGpuImage,


    // Whether or not to enable s3 parameters
    s3Enable:: false,

    // Which storageType to use
    storageType:: null,
  },

  // Parameters specific to GCP.
  gcpParams:: {
    gcpCredentialSecretName: "",
  } + $.params,

  // Parameters that control S3 access
  // params overrides s3params because params can be overwritten by the user to override the defaults.
  s3params:: {
    // Name of the k8s secrets containing S3 credentials
    s3SecretName: "",
    // Name of the key in the k8s secret containing AWS_ACCESS_KEY_ID.
    s3SecretAccesskeyidKeyName: "AWS_ACCESS_KEY_ID",

    // Name of the key in the k8s secret containing AWS_SECRET_ACCESS_KEY.
    s3SecretSecretaccesskeyKeyName: "AWS_SECRET_ACCESS_KEY",

    // S3 region
    s3AwsRegion: "us-west-1",

    // TODO(jlewi): We should use util.toBool to automatically convert to actual boolean values.
    // The use of strings is left over from when they were prototype parameters which only supports string type.

    // Whether or not to use https for S3 connections
    s3UseHttps: "true",

    // Whether or not to verify https certificates for S3 connections
    s3VerifySsl: "true",

    // URL for your s3-compatible endpoint.
    s3Endpoint: "http://s3.us-west-1.amazonaws.com",
  } + $.params,


  components:: {

    all:: [
      // Default routing rule for the first version of model.
      if $.util.toBool($.params.deployIstio) && $.util.toBool($.params.firstVersion) then
        $.parts.defaultRouteRule,
    ] +
    // TODO(jlewi): It would be better to structure s3 as a mixin.
    // As an example it would be great to allow S3 and GCS parameters
    // to be enabled simultaneously. This should be doable because
    // each entails adding a set of environment variables and volumes
    // to the containers. These volumes/environment variables shouldn't
    // overlap so there's no reason we shouldn't be able to just add
    // both modifications to the base container.
    // I think we want to restructure things as mixins so they can just
    // be added.
    if $.params.s3Enable then
      [
        $.s3parts.tfService,
        $.s3parts.tfDeployment,
      ]
    else if $.params.storageType == "gcp" then
      [
        $.gcpParts.tfService,
        $.gcpParts.tfDeployment,
      ]
    else
      [
        $.parts.tfService,
        $.parts.tfDeployment,
      ],
  }.all,

  parts:: {
    // We define the containers one level beneath parts because combined with jsonnet late binding
    // this makes it easy for users to override specific bits of the container.
    tfServingContainerBase:: {
      name: $.params.name,
      image: $.params.modelServerImage,
      imagePullPolicy: "IfNotPresent",
      command: [
        "/usr/bin/tensorflow_model_server",
      ],
      args: [
        "--port=9000",
        "--model_name=" + $.params.modelName,
        "--model_base_path=" + $.params.modelPath,
      ],
      ports: [
        {
          containerPort: 9000,
        },
      ],
      // TODO(jlewi): We should add readiness and liveness probes. I think the blocker is that
      // model-server doesn't have something we can use out of the box.
      resources: {
        requests: {
          memory: "1Gi",
          cpu: "1",
        },
        limits: {
          memory: "4Gi",
          cpu: "4",
        },
      },
      // This user and group should be defined in the Docker image.
      // Per best practices we don't run as the root user.
      securityContext: {
        runAsUser: 1000,
        fsGroup: 1000,
      },
      volumeMounts+: if $.params.modelStorageType == "nfs" then [{
        name: "nfs",
        mountPath: "/mnt",
      }]
      else [],
    },  // tfServingContainer

    tfServingContainer+: $.parts.tfServingContainerBase +
      if $.params.numGpus > 0 then
        {
          resources+: {
            limits+: {
              "nvidia.com/gpu": $.params.numGpus,
            },
          },
        }
      else {},

    tfServingMetadata+: {
      labels: $.params.labels { version: $.params.version },
      annotations: {
        "sidecar.istio.io/inject": if $.util.toBool($.params.deployIstio) then "true",
      },
    },

    httpProxyContainer:: {
      name: $.params.name + "-http-proxy",
      image: $.params.httpProxyImage,
      imagePullPolicy: "IfNotPresent",
      command: [
        "python",
        "/usr/src/app/server.py",
        "--port=8000",
        "--rpc_port=9000",
        "--rpc_timeout=10.0",
      ],
      env: [],
      ports: [
        {
          containerPort: 8000,
        },
      ],
      resources: {
        requests: {
          memory: "500Mi",
          cpu: "0.5",
        },
        limits: {
          memory: "1Gi",
          cpu: "1",
        },
      },
      securityContext: {
        runAsUser: 1000,
        fsGroup: 1000,
      },
    },  // httpProxyContainer


    tfDeployment: {
      apiVersion: "extensions/v1beta1",
      kind: "Deployment",
      metadata: {
        name: $.params.name + "-" + $.params.version,
        namespace: $.params.namespace,
        labels: $.params.labels,
      },
      spec: {
        template: {
          metadata: $.parts.tfServingMetadata,
          spec: {
            containers: [
              $.parts.tfServingContainer,
              if $.util.toBool($.params.deployHttpProxy) then
                $.parts.httpProxyContainer,
            ],
            volumes+: if $.params.modelStorageType == "nfs" then
              [{
                name: "nfs",
                persistentVolumeClaim: {
                  claimName: $.params.nfsPVC,
                },
              }]
            else [],
          },
        },
      },
    },  // tfDeployment

    tfService: {
      apiVersion: "v1",
      kind: "Service",
      metadata: {
        labels: $.params.labels,
        name: $.params.name,
        namespace: $.params.namespace,
        annotations: {
          "getambassador.io/config":
            std.join("\n", [
              "---",
              "apiVersion: ambassador/v0",
              "kind: Mapping",
              "name: tfserving-mapping-" + $.params.name + "-get",
              "prefix: /models/" + $.params.name + "/",
              "rewrite: /",
              "method: GET",
              "service: " + $.params.name + "." + $.params.namespace + ":8000",
              "---",
              "apiVersion: ambassador/v0",
              "kind: Mapping",
              "name: tfserving-mapping-" + $.params.name + "-post",
              "prefix: /models/" + $.params.name + "/",
              "rewrite: /model/" + $.params.name + ":predict",
              "method: POST",
              "service: " + $.params.name + "." + $.params.namespace + ":8000",
            ]),
        },  // annotations
      },
      spec: {
        ports: [
          {
            name: "grpc-tf-serving",
            port: 9000,
            targetPort: 9000,
          },
          {
            name: "http-tf-serving-proxy",
            port: 8000,
            targetPort: 8000,
          },
        ],
        selector: $.params.labels,
        type: $.params.serviceType,
      },
    },  // tfService

    defaultRouteRule: {
      apiVersion: "config.istio.io/v1alpha2",
      kind: "RouteRule",
      metadata: {
        name: $.params.name + "-default",
        namespace: $.params.namespace,
      },
      spec: {
        destination: {
          name: $.params.name,
        },
        precedence: 0,
        route: [
          {
            labels: { version: $.params.version },
          },
        ],
      },
    },

  },  // parts

  // Parts specific to S3
  s3parts:: $.parts {
    s3Env:: [
      { name: "AWS_ACCESS_KEY_ID", valueFrom: { secretKeyRef: { name: $.s3params.s3SecretName, key: $.s3params.s3SecretAccesskeyidKeyName } } },
      { name: "AWS_SECRET_ACCESS_KEY", valueFrom: { secretKeyRef: { name: $.s3params.s3SecretName, key: $.s3params.s3SecretSecretaccesskeyKeyName } } },
      { name: "AWS_REGION", value: $.s3params.s3AwsRegion },
      { name: "S3_REGION", value: $.s3params.s3AwsRegion },
      { name: "S3_USE_HTTPS", value: $.s3params.s3UseHttps },
      { name: "S3_VERIFY_SSL", value: $.s3params.s3VerifySsl },
      { name: "S3_ENDPOINT", value: $.s3params.s3Endpoint },
    ],

    tfServingContainer: $.parts.tfServingContainer {
      env+: $.s3parts.s3Env,
    },

    tfDeployment: $.parts.tfDeployment {
      spec+: {
        template+: {
          metadata: $.parts.tfServingMetadata,
          spec+: {
            containers: [
              $.s3parts.tfServingContainer,
              if $.util.toBool($.params.deployHttpProxy) then
                $.parts.httpProxyContainer,
            ],
          },
        },
      },
    },  // tfDeployment
  },  // s3parts

  // Parts specific to GCP
  gcpParts:: $.parts {
    gcpEnv:: [
      if $.gcpParams.gcpCredentialSecretName != "" then
        { name: "GOOGLE_APPLICATION_CREDENTIALS", value: "/secret/gcp-credentials/user-gcp-sa.json" },
    ],

    tfServingContainer: $.parts.tfServingContainer {
      env+: $.gcpParts.gcpEnv,
      volumeMounts+: [
        if $.gcpParams.gcpCredentialSecretName != "" then
          {
            name: "gcp-credentials",
            mountPath: "/secret/gcp-credentials",
          },
      ],
    },

    tfDeployment: $.parts.tfDeployment {
      spec+: {
        template+: {
          metadata: $.parts.tfServingMetadata,
          spec+: {
            containers: [
              $.gcpParts.tfServingContainer,
              if $.util.toBool($.params.deployHttpProxy) then
                $.parts.httpProxyContainer,
            ],
            volumes: [
              if $.gcpParams.gcpCredentialSecretName != "" then
                {
                  name: "gcp-credentials",
                  secret: {
                    secretName: $.gcpParams.gcpCredentialSecretName,
                  },
                },
            ],
          },
        },
      },
    },  // tfDeployment
  },  // gcpParts
}

mnist/ks_app/vendor/kubeflow/tf-serving@fed535eaa276220e4edf59530c0629f4375a40a9/util.libsonnet (vendored, new file, 21 lines)

@@ -0,0 +1,21 @@
// Some useful routines.
{
  local k = import "k.libsonnet",

  // Convert non-boolean types like string,number to a boolean.
  // This is primarily intended for dealing with parameters that should be booleans.
  toBool:: function(x) {
    result::
      if std.type(x) == "boolean" then
        x
      else if std.type(x) == "string" then
        std.asciiUpper(x) == "TRUE"
      else if std.type(x) == "number" then
        x != 0
      else
        false,
  }.result,

  // Produce a list of manifests. obj must be an array
  list(obj):: k.core.v1.list.new(obj),
}

@@ -21,6 +21,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import json
import os
import sys
import numpy as np

@@ -126,6 +127,22 @@ def linear_serving_input_receiver_fn():
def main(_):
  tf.logging.set_verbosity(tf.logging.INFO)

  tf_config = os.environ.get('TF_CONFIG', '{}')
  tf.logging.info("TF_CONFIG %s", tf_config)
  tf_config_json = json.loads(tf_config)
  cluster = tf_config_json.get('cluster')
  job_name = tf_config_json.get('task', {}).get('type')
  task_index = tf_config_json.get('task', {}).get('index')
  tf.logging.info("cluster=%s job_name=%s task_index=%s", cluster, job_name,
                  task_index)

  is_chief = False
  if not job_name or job_name.lower() in ["chief", "master"]:
    is_chief = True
    tf.logging.info("Will export model")
  else:
    tf.logging.info("Will not export model")

  # Download and load MNIST dataset.
  mnist = tf.contrib.learn.datasets.DATASETS['mnist'](TF_DATA_DIR)
  train_input_fn = tf.estimator.inputs.numpy_input_fn(

@@ -151,6 +168,8 @@ def main(_):
    classifier = tf.estimator.LinearClassifier(
        feature_columns=feature_columns, n_classes=N_DIGITS,
        model_dir=TF_MODEL_DIR, config=training_config)
    # TODO(jlewi): Should it be linear_serving_input_receiver_fn here?
    serving_fn = cnn_serving_input_receiver_fn
    export_final = tf.estimator.FinalExporter(
        TF_EXPORT_DIR, serving_input_receiver_fn=cnn_serving_input_receiver_fn)

@@ -158,6 +177,7 @@
    # Convolutional network
    classifier = tf.estimator.Estimator(
        model_fn=conv_model, model_dir=TF_MODEL_DIR, config=training_config)
    serving_fn = cnn_serving_input_receiver_fn
    export_final = tf.estimator.FinalExporter(
        TF_EXPORT_DIR, serving_input_receiver_fn=cnn_serving_input_receiver_fn)
  else:

@@ -171,7 +191,14 @@ def main(_):
      exporters=export_final,
      throttle_secs=1,
      start_delay_secs=1)
  print("Train and evaluate")
  tf.estimator.train_and_evaluate(classifier, train_spec, eval_spec)
  print("Training done")

  if is_chief:
    print("Export saved model")
    classifier.export_savedmodel(TF_EXPORT_DIR, serving_input_receiver_fn=serving_fn)
    print("Done exporting the model")

if __name__ == '__main__':
  tf.app.run()
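
After the chief exports the model, you can verify the SavedModel and inspect its serving signature from the command line; a minimal sketch using TensorFlow's `saved_model_cli`, where the timestamped version directory is the example value from earlier:

```
saved_model_cli show --dir ${EXPORT_DIR}/1547100373 --tag_set serve --signature_def serving_default
```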

@@ -0,0 +1,76 @@
"""Test deploying the mnist model.
|
||||
|
||||
This file tests that we can deploy the model.
|
||||
|
||||
TODO(jlewi): Test that we can send predictions to the model.
|
||||
|
||||
It is an integration test as it depends on having access to
|
||||
a Kubeflow deployment to deploy on. It also depends on having a model.
|
||||
|
||||
Python Path Requirements:
|
||||
kubeflow/testing/py - https://github.com/kubeflow/testing/tree/master/py
|
||||
* Provides utilities for testing
|
||||
|
||||
Manually running the test
|
||||
1. Configure your KUBECONFIG file to point to the desired cluster
|
||||
2. Set --params=name=${NAME},namespace=${NAMESPACE}
|
||||
* name should be the name for your job
|
||||
* namespace should be the namespace to use
|
||||
3. Use the modelBasePath parameter to the model to test.
|
||||
--params=...,modelBasePath=${MODEL_BASE_PATH}
|
||||
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
|
||||
from kubernetes import client as k8s_client
|
||||
from py import test_runner
|
||||
|
||||
from kubeflow.testing import ks_util
|
||||
from kubeflow.testing import test_util
|
||||
from kubeflow.testing import util
|
||||
|
||||
class MnistDeployTest(test_util.TestCase):
|
||||
def __init__(self, args):
|
||||
namespace, name, env = test_runner.parse_runtime_params(args)
|
||||
self.app_dir = args.app_dir
|
||||
|
||||
if not self.app_dir:
|
||||
self.app_dir = os.path.join(os.path.dirname(__file__), "..",
|
||||
"ks_app")
|
||||
self.app_dir = os.path.abspath(self.app_dir)
|
||||
logging.info("--app_dir not set defaulting to: %s", self.app_dir)
|
||||
|
||||
self.env = env
|
||||
self.namespace = namespace
|
||||
self.params = args.params
|
||||
self.ks_cmd = ks_util.get_ksonnet_cmd(self.app_dir)
|
||||
super(MnistDeployTest, self).__init__(class_name="MnistDeployTest",
|
||||
name=name)
|
||||
|
||||
def test_serve(self):
|
||||
# We repeat the test multiple times.
|
||||
# This ensures that if we delete the job we can create a new job with the
|
||||
# same name.
|
||||
api_client = k8s_client.ApiClient()
|
||||
|
||||
# Apply the components
|
||||
for component in ["mnist-deploy-gcp", "mnist-service"]:
|
||||
# Setup the ksonnet app
|
||||
ks_util.setup_ks_app(self.app_dir, self.env, self.namespace, component,
|
||||
self.params)
|
||||
|
||||
util.run([self.ks_cmd, "apply", self.env, "-c", component],
|
||||
cwd=self.app_dir)
|
||||
|
||||
logging.info("Created deployment %s in namespaces %s", self.name, self.namespace)
|
||||
|
||||
util.wait_for_deployment(api_client, self.namespace, self.name,
|
||||
timeout_minutes=4)
|
||||
|
||||
# We don't delete the resources. We depend on the namespace being
|
||||
# garbage collected.
|
||||
|
||||
if __name__ == "__main__":
|
||||
test_runner.main(module=__name__)
|
||||
|
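For manual runs, an invocation along these lines should work (all parameter values are placeholders):

```
# Hypothetical manual invocation of the deploy test; name, namespace,
# and modelBasePath are placeholders.
export KUBECONFIG=~/.kube/config
python deploy_test.py \
  --params=name=mnist-test-manual,namespace=mnist-manual,modelBasePath=gs://my-bucket/mnist/export
```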
@@ -0,0 +1,33 @@
#!/bin/bash
#
# A simple script to copy a secret from one namespace to another.
#
# Usage
#   copy_secret.sh <source namespace> <dest namespace> <secret name>
set -e
SOURCE=$1
DEST=$2
NAME=$3

usage() {
  echo copy_secret.sh "<source namespace> <dest namespace> <secret name>"
}

if [ -z "${SOURCE}" ]; then
  usage
  exit 1
fi

if [ -z "${DEST}" ]; then
  usage
  exit 1
fi

if [ -z "${NAME}" ]; then
  usage
  exit 1
fi

echo getting secret
# Extract the key file stored under ${NAME}.json in the source secret.
SECRET=$(kubectl -n "${SOURCE}" get secrets "${NAME}" -o jsonpath="{.data.${NAME}\.json}" | base64 -d)
kubectl create -n "${DEST}" secret generic "${NAME}" --from-literal="${NAME}.json=${SECRET}"
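For example, the workflow below uses this script to copy the GCP service account secret into a per-test namespace; run by hand it would look like (the destination namespace is illustrative):

```
# Copy the user-gcp-sa secret from the kubeflow namespace into a
# per-test namespace (the destination namespace here is illustrative).
./copy_secret.sh kubeflow mnist-0111-081531 user-gcp-sa
```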
@@ -25,6 +25,12 @@ local defaultParams = {
  // The bucket where the model should be written
  // This needs to be writable by the GCP service account in the Kubeflow cluster (not the test cluster)
  modelBucket: "kubeflow-ci_temp",

  // Whether to delete the namespace at the end.
  // Leaving the namespace around can be useful for debugging.
  //
  // TODO(jlewi): We should consider running a cronjob to GC such namespaces.
  deleteNamespace: false,
};

local params = defaultParams + overrides;
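Making the bucket writable by the Kubeflow cluster's GCP service account can be done with gsutil; a sketch, where the service account email is a placeholder:

```
# Sketch: grant the Kubeflow cluster's GCP service account write access
# to the model bucket. The account email is a placeholder.
gsutil iam ch serviceAccount:my-sa@my-project.iam.gserviceaccount.com:objectAdmin gs://kubeflow-ci_temp
```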
@@ -77,6 +83,9 @@ local modelDir = "gs://" + params.modelBucket + "/mnist/models/" + prowDict["BUILD_ID"];
// value of KUBECONFIG environment variable. This should be a full path.
local kubeConfig = testDir + "/.kube/kubeconfig";

// Namespace where tests should run
local testNamespace = "mnist-" + prowDict["BUILD_ID"];

// Build template is a template for constructing Argo step templates.
//
// step_name: Name for the template
@@ -233,10 +242,48 @@ local dagTemplates = [
          params.kfCluster,
        ]]
      ),
      workingDir: srcDir + "/github_issue_summarization",
    },
    dependencies: ["checkout"],
  },  // get-kubeconfig
  {
    // Create the namespace
    // TODO(jlewi): We should add some sort of retry.
    template: buildTemplate {
      name: "create-namespace",
      command: util.buildCommand([
        [
          "echo",
          "KUBECONFIG=",
          "${KUBECONFIG}",
        ],
        [
          "gcloud",
          "auth",
          "activate-service-account",
          "--key-file=${GOOGLE_APPLICATION_CREDENTIALS}",
        ],
        [
          "kubectl",
          "config",
          "current-context",
        ],
        [
          "kubectl",
          "create",
          "namespace",
          testNamespace,
        ],
        // Copy the GCP secret from the kubeflow namespace to the test namespace
        [
          srcDir + "/test/copy_secret.sh",
          "kubeflow",
          testNamespace,
          "user-gcp-sa",
        ]]
      ),
    },
    dependencies: ["get-kubeconfig"],
  },  // create-namespace
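Assuming util.buildCommand chains the command groups into a single shell invocation (an assumption about that helper), the create-namespace step boils down to roughly:

```
# Rough expansion of the create-namespace step; assumes util.buildCommand
# chains the groups, and uses 0111-081531 as an example BUILD_ID.
echo KUBECONFIG= ${KUBECONFIG}
gcloud auth activate-service-account --key-file=${GOOGLE_APPLICATION_CREDENTIALS}
kubectl config current-context
kubectl create namespace mnist-0111-081531
${SRC_DIR}/test/copy_secret.sh kubeflow mnist-0111-081531 user-gcp-sa
```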
  {
    // Run the python test for TFJob
    template: buildTemplate {
@@ -247,7 +294,7 @@ local dagTemplates = [
        "--artifacts_path=" + artifactsDir,
        "--params=" + std.join(",", [
          "name=mnist-test-" + prowDict["BUILD_ID"],
          "namespace=kubeflow",
          "namespace=" + testNamespace,
          "numTrainSteps=10",
          "batchSize=10",
          "image=" + trainerImage,
@@ -260,8 +307,25 @@ local dagTemplates = [
        ])],
      workingDir: srcDir + "/mnist/testing",
    },
    dependencies: ["build-images", "get-kubeconfig"],
    dependencies: ["build-images", "create-namespace"],
  },  // tfjob-test
  {
    // Run the python test for deploying the model
    template: buildTemplate {
      name: "deploy-test",
      command: [
        "python",
        "deploy_test.py",
        "--params=" + std.join(",", [
          "name=mnist-test-" + prowDict["BUILD_ID"],
          "namespace=" + testNamespace,
          "modelBasePath=" + modelDir + "/export",
          "exportDir=" + modelDir,
        ])],
      workingDir: srcDir + "/mnist/testing",
    },
    dependencies: ["tfjob-test"],
  },  // deploy-test
  // TODO(jlewi): We should add a non-distributed test that just uses the default values.
];
@@ -277,8 +341,35 @@ local dag = {

// Define templates for the steps to be performed when the
// test exits

local deleteTemplates = if params.deleteNamespace then
  [
    {
      // Delete the namespace
      // TODO(jlewi): We should add some sort of retry.
      template: buildTemplate {
        name: "delete-namespace",
        command: util.buildCommand([
          [
            "gcloud",
            "auth",
            "activate-service-account",
            "--key-file=${GOOGLE_APPLICATION_CREDENTIALS}",
          ],
          [
            "kubectl",
            "delete",
            "namespace",
            testNamespace,
          ]]
        ),
      },
    },  // delete-namespace
  ] else [];

local exitTemplates =
  [
  deleteTemplates +
  [
    {
      // Copy artifacts to GCS for gubernator.
      // TODO(https://github.com/kubeflow/testing/issues/257): Create-pr-symlink
@@ -294,7 +385,6 @@ local exitTemplates =
        "--bucket=" + bucket,
      ],
    },  // copy-artifacts,

  },
  {
    // Delete the test directory in NFS.
@@ -314,7 +404,7 @@ local exitTemplates =
      },
    },
  },  // test-dir-delete
  dependencies: ["copy-artifacts"],
  dependencies: ["copy-artifacts"] + if params.deleteNamespace then ["delete-namespace"] else [],
},
];
@@ -14,8 +14,8 @@ local envParams = params + {
  },
  mnist+: {
    namespace: 'kubeflow-test-infra',
    name: 'jlewi-mnist-test-465-0109-050605',
    prow_env: 'JOB_NAME=mnist-test,JOB_TYPE=presubmit,REPO_NAME=examples,REPO_OWNER=kubeflow,BUILD_NUMBER=0109-050605,BUILD_ID=0109-050605,PULL_NUMBER=465',
    name: 'jlewi-mnist-test-469-0111-081531',
    prow_env: 'JOB_NAME=mnist-test,JOB_TYPE=presubmit,REPO_NAME=examples,REPO_OWNER=kubeflow,BUILD_NUMBER=0111-081531,BUILD_ID=0111-081531,PULL_NUMBER=469',
  },
},
};