Enhance mnist training and add serving steps for local mode (#528)

This commit is contained in:
Jin Chi He 2019-04-10 05:54:45 +08:00 committed by Kubernetes Prow Robot
parent fb0c5eb115
commit 335c2a1d6e
4 changed files with 125 additions and 17 deletions

View File

@ -4,7 +4,7 @@
- [Training MNIST](#training-mnist)
- [Prerequisites](#prerequisites)
- [Kubernetes Cluster Environment](#kubernetes-cluster-environment)
- [Deploy Kubeflow](#deploy-kubeflow)
- [Local Setup](#local-setup)
- [Modifying existing examples](#modifying-existing-examples)
- [Prepare model](#prepare-model)
@ -16,7 +16,16 @@
- [Using S3](#using-s3)
- [Monitoring](#monitoring)
- [Tensorboard](#tensorboard)
- [Using Tensorflow serving](#using-tensorflow-serving)
- [Using GCS](#using-gcs-1)
- [Using S3](#using-s3-1)
- [Deploying TensorBoard](#deploying-tensorboard)
- [Serving the model](#serving-the-model)
- [GCS](#gcs)
- [S3](#s3)
- [Local storage](#local-storage-1)
- [Web Front End](#web-front-end)
- [Connecting via port forwarding](#connecting-via-port-forwarding)
- [Using IAP on GCP](#using-iap-on-gcp)
- [Conclusion and Next Steps](#conclusion-and-next-steps)
<!-- END doctoc generated TOC please keep comment here to allow auto update -->
@ -85,11 +94,11 @@ In the following instructions we will install our required components to a singl
#### Local storage
Let's start by running the training job on Kubeflow and storing the model in a directory local to the pod e.g. '/tmp'.
This is useful as a smoke test to ensure everything works. Since `/tmp` is not a filesystem external to the container, all data
is lost once the job finishes. So to make the model available after the job finishes we will need to use an external filesystem
like GCS or S3 as discussed in the next section.
Let's start by running the training job on Kubeflow and storing the model in local storage.
First, refer to the [documentation](https://kubernetes.io/docs/concepts/storage/persistent-volumes/) to create a Persistent Volume (PV) and a Persistent Volume Claim (PVC); the PVC name (${PVC_NAME}) will be used by the training and serving pods in the local-mode steps below.
Create an environment to store the parameters particular to local mode.
```
KSENV=local
cd ks_app
@ -108,13 +117,18 @@ Point the job at your custom training image
ks param set --env=${KSENV} train image $DOCKER_URL
```
Mount the PVC to store the exported model; by default the PVC will be mounted at `/mnt` in the training pod.
```
ks param set --env=${KSENV} train pvcName ${PVC_NAME}
```
Configure a filepath for the exported model and checkpoints.
```
ks param set --env=${KSENV} train modelDir ./output
ks param set --env=${KSENV} train exportDir ./output/export
ks param set --env=${KSENV} train modelDir /mnt
ks param set --env=${KSENV} train exportDir /mnt/export
```
You can now submit the job
```
@ -133,11 +147,6 @@ And to check the logs
kubectl logs mnist-train-local-chief-0
```
Storing the model in a directory inside the container isn't useful because the directory is
lost as soon as the pod is deleted.
So in the next sections we cover saving the model on a suitable filesystem like GCS or S3.
#### Using GCS
In this section we describe how to save the model to Google Cloud Storage (GCS).
@ -722,9 +731,40 @@ kubectl describe service mnist-service
TODO: Add instructions
### PVC
### Local storage
TODO: Add instructions
This section shows how to serve the local model that was stored in the PVC during training.
Mount the PVC; by default it will be mounted at `/mnt` in the pod.
```
ks param set --env=${KSENV} mnist-deploy-local pvcName ${PVC_NAME}
```
Configure a filepath for the exported model.
```
ks param set --env=${KSENV} mnist-deploy-local modelBasePath /mnt/export
```
Deploy it.
```
ks apply ${KSENV} -c mnist-deploy-local
```
You can check the deployment by running
```
kubectl describe deployments mnist-deploy-local
```
Finally, run a service to make the deployment accessible to other pods in the cluster.
```
ks apply ${KSENV} -c mnist-service
```
The service should make the `mnist-deploy-local` deployment accessible over port 9000.
```
kubectl describe service mnist-service
```
## Web Front End

View File

@ -0,0 +1,39 @@
// TF-Serving deployment for the mnist model in local (PVC-backed) mode.
// Extends the shared kubeflow/tf-serving template so that, when a PVC name
// is supplied via params, the claim is attached as a volume and mounted at
// /mnt in every serving container.
local env = std.extVar("__ksonnet/environments");
local params = std.extVar("__ksonnet/params").components["mnist-deploy-local"];
local k = import "k.libsonnet";
local deployment = k.apps.v1beta1.deployment;
local container = deployment.mixin.spec.template.spec.containersType;
local util = import "kubeflow/tf-serving/util.libsonnet";
local tfserving = import "kubeflow/tf-serving/tf-serving-template.libsonnet";

local base = tfserving.new(env, params);

// A PVC is considered configured unless the parameter is empty or the
// ksonnet string placeholder "null" (unset params arrive as that string).
local hasPvc = params.pvcName != "null" && params.pvcName != "";

local tfDeployment = base.tfDeployment +
  // Attach the claim as a pod volume only when a PVC was configured.
  deployment.mixin.spec.template.spec.withVolumesMixin(
    if hasPvc then [{
      name: "local-storage",
      persistentVolumeClaim: {
        claimName: params.pvcName,
      },
    }] else [],
  ) +
  // Mount that volume at /mnt in each container so the exported model
  // under params.modelBasePath is visible to the server.
  deployment.mapContainers(
    function(c) {
      result::
        c + container.withVolumeMountsMixin(
          if hasPvc then [{
            name: "local-storage",
            mountPath: "/mnt",
          }] else [],
        ),
    }.result,
  );

util.list([
  tfDeployment,
  base.tfservingConfig,
])

View File

@ -14,6 +14,21 @@
secret: '',
secretKeyRefs: '',
trainSteps: 200,
pvcName: '',
},
"mnist-deploy-local": {
defaultCpuImage: 'tensorflow/serving:1.11.1',
defaultGpuImage: 'tensorflow/serving:1.11.1-gpu',
deployHttpProxy: 'false',
enablePrometheus: 'true',
httpProxyImage: '',
injectIstio: 'false',
pvcName: '',
modelBasePath: '/mnt/export',
modelName: 'mnist',
name: 'mnist-deploy-local',
numGpus: '0',
versionName: 'v1',
},
"mnist-deploy-gcp": {
defaultCpuImage: 'tensorflow/serving:1.11.1',
@ -75,4 +90,4 @@
type: "ClusterIP",
},
},
}
}

View File

@ -49,6 +49,12 @@ local replicaSpec = {
mountPath: secretMountPath,
readOnly: true,
},
] else if params.pvcName != "null" && params.pvcName != "" then
[
{
name: "local-storage",
mountPath: "/mnt",
},
] else [],
workingDir: "/opt",
},
@ -62,6 +68,14 @@ local replicaSpec = {
secretName: secretName,
},
},
] else if params.pvcName != "null" && params.pvcName != "" then
[
{
name: "local-storage",
persistentVolumeClaim: {
claimName: params.pvcName,
},
},
] else [],
restartPolicy: "OnFailure",
};