mirror of https://github.com/kubeflow/examples.git
Mnist fixes (#495)
* removed environments * fixed issues with README * addressed PR comments * updated aws yaml to match master
This commit is contained in:
parent
74378a2990
commit
b18cec9b3b
201
mnist/README.md
201
mnist/README.md
|
@ -69,10 +69,10 @@ The resulting model is [model.py](model.py).
|
|||
With our code ready, we will now build/push the docker image.
|
||||
|
||||
```
|
||||
DOCKER_BASE_URL=docker.io/elsonrodriguez # Put your docker registry here
|
||||
docker build . --no-cache -f Dockerfile.model -t ${DOCKER_BASE_URL}/mytfmodel:1.7
|
||||
DOCKER_URL=docker.io/reponame/mytfmodel # Put your docker registry here
|
||||
docker build . --no-cache -f Dockerfile.model -t ${DOCKER_URL}
|
||||
|
||||
docker push ${DOCKER_BASE_URL}/mytfmodel:1.7
|
||||
docker push ${DOCKER_URL}
|
||||
```
|
||||
|
||||
## Preparing your Kubernetes Cluster
|
||||
|
@ -102,6 +102,19 @@ Give the job a name to indicate it is running locally
|
|||
ks param set --env=${KSENV} train name mnist-train-local
|
||||
```
|
||||
|
||||
Point the job at your custom training image
|
||||
|
||||
```
|
||||
ks param set --env=${KSENV} train image $DOCKER_URL
|
||||
```
|
||||
|
||||
Configure a filepath for the exported model and checkpoints.
|
||||
|
||||
```
|
||||
ks param set --env=${KSENV} train modelDir ./output
|
||||
ks param set --env=${KSENV} train exportDir ./output/export
|
||||
```
|
||||
|
||||
You can now submit the job
|
||||
|
||||
```
|
||||
|
@ -145,6 +158,17 @@ cd ks_app
|
|||
ks env add ${KSENV}
|
||||
```
|
||||
|
||||
Set an environment variable that points to your GCP project ID
|
||||
```
|
||||
PROJECT=<your project id>
|
||||
```
|
||||
|
||||
Create a bucket on GCS to store our model. The name must be unique across all GCS buckets
|
||||
```
|
||||
BUCKET=$KSENV-$(date +%s)
|
||||
gsutil mb gs://$BUCKET/
|
||||
```
|
||||
|
||||
Give the job a different name (to distinguish it from your job which didn't use GCS)
|
||||
|
||||
```
|
||||
|
@ -160,6 +184,7 @@ ks param set --env=${KSENV} train numWorkers 2
|
|||
Now we need to configure parameters telling the code to save the model to GCS.
|
||||
|
||||
```
|
||||
MODEL_PATH=my-model
|
||||
ks param set --env=${KSENV} train modelDir gs://${BUCKET}/${MODEL_PATH}
|
||||
ks param set --env=${KSENV} train exportDir gs://${BUCKET}/${MODEL_PATH}/export
|
||||
```
|
||||
|
@ -183,7 +208,7 @@ then a number of steps have already been performed for you
|
|||
* To see the secrets in your cluster
|
||||
|
||||
```
|
||||
kubectl get secrets
|
||||
kubectl get secrets -n kubeflow
|
||||
```
|
||||
|
||||
1. We granted this service account permission to read/write GCS buckets in this project
|
||||
|
@ -194,7 +219,7 @@ then a number of steps have already been performed for you
|
|||
gcloud projects get-iam-policy ${PROJECT} --format=yaml
|
||||
```
|
||||
|
||||
* The output should look like
|
||||
* The output should look like the following
|
||||
|
||||
```
|
||||
bindings:
|
||||
|
@ -206,49 +231,46 @@ then a number of steps have already been performed for you
|
|||
...
|
||||
etag: BwV_BqSmSCY=
|
||||
version: 1
|
||||
```
|
||||
```
|
||||
|
||||
To use this service account we perform the following steps
|
||||
|
||||
1. Mount the secret into the pod
|
||||
|
||||
```
|
||||
ks param set --env=${KSENV} train secret user-gcp-sa=/var/secrets
|
||||
```
|
||||
1. Mount the secret into the pod
|
||||
```
|
||||
ks param set --env=${KSENV} train secret user-gcp-sa=/var/secrets
|
||||
```
|
||||
|
||||
* Note: ensure your environment is pointed at the same `kubeflow` namespace as the `user-gcp-sa` secret
|
||||
* Setting this ksonnet parameter causes a volumeMount and volume to be added to your TFJob
|
||||
* To see this you can run
|
||||
|
||||
```
|
||||
ks show ${KSENV} -c train
|
||||
```
|
||||
* To see this you can run `ks show ${KSENV} -c train`
|
||||
|
||||
* The output should now include a volumeMount and volume section
|
||||
|
||||
```
|
||||
apiVersion: kubeflow.org/v1beta1
|
||||
kind: TFJob
|
||||
metadata:
|
||||
...
|
||||
spec:
|
||||
tfReplicaSpecs:
|
||||
Chief:
|
||||
...
|
||||
template:
|
||||
...
|
||||
apiVersion: kubeflow.org/v1beta1
|
||||
kind: TFJob
|
||||
metadata:
|
||||
...
|
||||
spec:
|
||||
containers:
|
||||
- command:
|
||||
...
|
||||
volumeMounts:
|
||||
- mountPath: /var/secrets
|
||||
name: user-gcp-sa
|
||||
readOnly: true
|
||||
...
|
||||
volumes:
|
||||
- name: user-gcp-sa
|
||||
secret:
|
||||
secretName: user-gcp-sa
|
||||
...
|
||||
tfReplicaSpecs:
|
||||
Chief:
|
||||
...
|
||||
template:
|
||||
...
|
||||
spec:
|
||||
containers:
|
||||
- command:
|
||||
...
|
||||
volumeMounts:
|
||||
- mountPath: /var/secrets
|
||||
name: user-gcp-sa
|
||||
readOnly: true
|
||||
...
|
||||
volumes:
|
||||
- name: user-gcp-sa
|
||||
secret:
|
||||
secretName: user-gcp-sa
|
||||
...
|
||||
```
|
||||
|
||||
1. Next we need to set the environment variable `GOOGLE_APPLICATION_CREDENTIALS` so that our code knows
|
||||
|
@ -262,7 +284,8 @@ spec:
|
|||
|
||||
```
|
||||
ks show ${KSENV} -c train
|
||||
|
||||
```
|
||||
```
|
||||
apiVersion: kubeflow.org/v1beta1
|
||||
kind: TFJob
|
||||
metadata:
|
||||
|
@ -292,7 +315,7 @@ You can now submit the job
|
|||
ks apply ${KSENV} -c train
|
||||
```
|
||||
|
||||
And you can check the job
|
||||
And you can check the job status
|
||||
|
||||
```
|
||||
kubectl get tfjobs -o yaml mnist-train-dist
|
||||
|
@ -301,7 +324,7 @@ kubectl get tfjobs -o yaml mnist-train-dist
|
|||
And to check the logs
|
||||
|
||||
```
|
||||
kubectl logs mnist-train-dist-chief-0
|
||||
kubectl logs -f mnist-train-dist-chief-0
|
||||
```
|
||||
|
||||
|
||||
|
@ -367,34 +390,33 @@ various environment variables configuring access to S3.
|
|||
* The output should now include two environment variables referencing K8s secret
|
||||
|
||||
```
|
||||
apiVersion: kubeflow.org/v1beta1
|
||||
kind: TFJob
|
||||
metadata:
|
||||
...
|
||||
spec:
|
||||
tfReplicaSpecs:
|
||||
Chief:
|
||||
...
|
||||
template:
|
||||
...
|
||||
spec:
|
||||
containers:
|
||||
- command:
|
||||
...
|
||||
env:
|
||||
...
|
||||
- name: AWS_ACCESS_KEY_ID
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
key: awsAccessKeyID
|
||||
name: aws-creds
|
||||
- name: AWS_SECRET_ACCESS_KEY
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
key: awsSecretAccessKey
|
||||
name: aws-creds
|
||||
...
|
||||
```
|
||||
apiVersion: kubeflow.org/v1beta1
|
||||
kind: TFJob
|
||||
metadata:
|
||||
...
|
||||
spec:
|
||||
tfReplicaSpecs:
|
||||
Chief:
|
||||
...
|
||||
template:
|
||||
...
|
||||
spec:
|
||||
containers:
|
||||
- command:
|
||||
...
|
||||
env:
|
||||
- name: AWS_ACCESS_KEY_ID
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
key: awsAccessKeyID
|
||||
name: aws-creds
|
||||
- name: AWS_SECRET_ACCESS_KEY
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
key: awsSecretAccessKey
|
||||
name: aws-creds
|
||||
...
|
||||
```
|
||||
|
||||
1. Next we need to set a whole bunch of S3 related environment variables so that TensorFlow
|
||||
knows how to talk to S3
|
||||
|
@ -410,7 +432,7 @@ various environment variables configuring access to S3.
|
|||
ks param set --env=${KSENV} train envVariables ${AWSENV}
|
||||
```
|
||||
|
||||
* If we look at the spec for our job we can see that the environment variable `GOOGLE_APPLICATION_CREDENTIALS` is set.
|
||||
* If we look at the spec for our job we can see that the environment variable `AWS_BUCKET` is set.
|
||||
|
||||
```
|
||||
ks show ${KSENV} -c train
|
||||
|
@ -453,7 +475,7 @@ kubectl get tfjobs -o yaml mnist-train-dist
|
|||
And to check the logs
|
||||
|
||||
```
|
||||
kubectl logs mnist-train-dist-chief-0
|
||||
kubectl logs -f mnist-train-dist-chief-0
|
||||
```
|
||||
|
||||
## Monitoring
|
||||
|
@ -508,18 +530,18 @@ URI; if not you can always copy it to GCS using `gsutil`.
|
|||
Check that a model was exported
|
||||
|
||||
```
|
||||
EXPORT_DIR=gs://${BUCKET}/${MODEL_PATH}/export
|
||||
gsutil ls -r ${EXPORT_DIR}
|
||||
|
||||
```
|
||||
|
||||
The output should look something like
|
||||
|
||||
```
|
||||
gs://${EXPORT_DIR}/1547100373/saved_model.pb
|
||||
gs://${EXPORT_DIR}/1547100373/variables/:
|
||||
gs://${EXPORT_DIR}/1547100373/variables/
|
||||
gs://${EXPORT_DIR}/1547100373/variables/variables.data-00000-of-00001
|
||||
gs://${EXPORT_DIR}/1547100373/variables/variables.index
|
||||
${EXPORT_DIR}/1547100373/saved_model.pb
|
||||
${EXPORT_DIR}/1547100373/variables/:
|
||||
${EXPORT_DIR}/1547100373/variables/
|
||||
${EXPORT_DIR}/1547100373/variables/variables.data-00000-of-00001
|
||||
${EXPORT_DIR}/1547100373/variables/variables.index
|
||||
```
|
||||
|
||||
The number `1547100373` is a version number auto-generated by TensorFlow; it will vary on each run but should be monotonically increasing if you save a model to the same location as a previous run.
|
||||
|
@ -528,14 +550,13 @@ The number `1547100373` is a version number auto-generated by TensorFlow; it wil
|
|||
Set your model path
|
||||
|
||||
```
|
||||
ks param set ${ENV} mnist-deploy-gcp modelBasePath ${EXPORT_DIR}
|
||||
|
||||
ks param set --env=${KSENV} mnist-deploy-gcp modelBasePath ${EXPORT_DIR}
|
||||
```
|
||||
|
||||
Deploy it
|
||||
|
||||
```
|
||||
ks param apply ${ENV} -c mnist-deploy-gcp
|
||||
ks apply ${KSENV} -c mnist-deploy-gcp
|
||||
```
|
||||
|
||||
You can check the deployment by running
|
||||
|
@ -544,6 +565,18 @@ You can check the deployment by running
|
|||
kubectl describe deployments mnist-deploy-gcp
|
||||
```
|
||||
|
||||
Finally, run a service to make the deployment accessible to other pods in the cluster
|
||||
|
||||
```
|
||||
ks apply ${KSENV} -c mnist-service
|
||||
```
|
||||
|
||||
The service should make the `mnist-deploy-gcp` deployment accessible over port 9000
|
||||
|
||||
```
|
||||
kubectl describe service mnist-service
|
||||
```
|
||||
|
||||
### S3
|
||||
|
||||
TODO: Add instructions
|
||||
|
@ -559,7 +592,7 @@ The example comes with a simple web front end that can be used with your model.
|
|||
To deploy the web front end
|
||||
|
||||
```
|
||||
ks apply ${ENV} -c web-ui
|
||||
ks apply ${KSENV} -c web-ui
|
||||
```
|
||||
|
||||
### Connecting via port forwarding
|
||||
|
@ -567,7 +600,9 @@ ks apply ${ENV} -c web-ui
|
|||
To connect to the web app via port-forwarding
|
||||
|
||||
```
|
||||
kubectl -n ${NAMESPACE} port-forward svc/web-ui 8080:80
|
||||
POD_NAME=$(kubectl get pods --selector=app=web-ui --template '{{range .items}}{{.metadata.name}}{{"\n"}}{{end}}')
|
||||
|
||||
kubectl port-forward ${POD_NAME} 8080:5000
|
||||
```
|
||||
|
||||
You should now be able to open up the web app at [http://localhost:8080](http://localhost:8080).
|
||||
|
|
|
@ -1,17 +1,4 @@
|
|||
apiVersion: 0.3.0
|
||||
environments:
|
||||
jlewi:
|
||||
destination:
|
||||
namespace: jlewi
|
||||
server: https://35.196.210.94
|
||||
k8sVersion: v1.11.5
|
||||
path: jlewi
|
||||
test-env-d5e3:
|
||||
destination:
|
||||
namespace: jlewi
|
||||
server: https://35.196.210.94
|
||||
k8sVersion: v1.11.5
|
||||
path: test-env-d5e3
|
||||
kind: ksonnet.io/app
|
||||
libraries:
|
||||
kubeflow/tf-serving:
|
||||
|
|
|
@ -1,2 +0,0 @@
|
|||
{
|
||||
}
|
|
@ -1,9 +0,0 @@
|
|||
local base = import "base.libsonnet";
|
||||
// uncomment if you reference ksonnet-lib
|
||||
// local k = import "k.libsonnet";
|
||||
// local deployment = k.apps.v1beta2.deployment;
|
||||
|
||||
base + {
|
||||
// Insert user-specified overrides here. For example if a component is named \"nginx-deployment\", you might have something like:\n")
|
||||
// "nginx-deployment"+: deployment.mixin.metadata.withLabels({foo: "bar"})
|
||||
}
|
|
@ -1,38 +0,0 @@
|
|||
local params = std.extVar('__ksonnet/params');
|
||||
local globals = import 'globals.libsonnet';
|
||||
local envParams = params + {
|
||||
components+: {
|
||||
"mnist-train"+: {
|
||||
envVariables: 'GOOGLE_APPLICATION_CREDENTIALS=/var/secrets/user-gcp-sa.json',
|
||||
},
|
||||
train+: {
|
||||
name: 'mnist-train-dist',
|
||||
secret: 'user-gcp-sa=/var/secrets',
|
||||
numSteps: 10,
|
||||
image: 'gcr.io/kubeflow-examples/mnist/model:v20190111-v0.2-146-g0bbff62-dirty-12f353',
|
||||
numWorkers: 2,
|
||||
numPs: 1,
|
||||
},
|
||||
"deploy-gcp"+: {
|
||||
modelBasePath: 'gs://kubeflow-ci_temp/mnist-jlewi/export',
|
||||
},
|
||||
"mnist-deploy-gcp"+: {
|
||||
modelBasePath: 'gs://kubeflow-ci_temp/mnist-jlewi/export',
|
||||
name: 'jlewi-deploy-test',
|
||||
namespace: 'jlewi',
|
||||
},
|
||||
"mnist-service"+: {
|
||||
namespace: 'jlewi',
|
||||
},
|
||||
tensorboard+: {
|
||||
logDir: 'gs://kubeflow-ci_temp/mnist-jlewi/',
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
{
|
||||
components: {
|
||||
[x]: envParams.components[x] + globals
|
||||
for x in std.objectFields(envParams.components)
|
||||
},
|
||||
}
|
|
@ -1,2 +0,0 @@
|
|||
{
|
||||
}
|
|
@ -1,9 +0,0 @@
|
|||
local base = import "base.libsonnet";
|
||||
// uncomment if you reference ksonnet-lib
|
||||
// local k = import "k.libsonnet";
|
||||
// local deployment = k.apps.v1beta2.deployment;
|
||||
|
||||
base + {
|
||||
// Insert user-specified overrides here. For example if a component is named \"nginx-deployment\", you might have something like:\n")
|
||||
// "nginx-deployment"+: deployment.mixin.metadata.withLabels({foo: "bar"})
|
||||
}
|
|
@ -1,28 +0,0 @@
|
|||
local params = std.extVar('__ksonnet/params');
|
||||
local globals = import 'globals.libsonnet';
|
||||
local envParams = params + {
|
||||
components+: {
|
||||
train+: {
|
||||
name: 'jlewi-deploy-test',
|
||||
namespace: 'jlewi',
|
||||
modelBasePath: 'gs://kubeflow-ci_temp/mnist-jlewi/export',
|
||||
},
|
||||
"mnist-deploy-gcp"+: {
|
||||
name: 'jlewi-deploy-test',
|
||||
namespace: 'jlewi',
|
||||
modelBasePath: 'gs://kubeflow-ci_temp/mnist-jlewi/export',
|
||||
},
|
||||
"mnist-service"+: {
|
||||
name: 'jlewi-deploy-test',
|
||||
namespace: 'jlewi',
|
||||
modelBasePath: 'gs://kubeflow-ci_temp/mnist-jlewi/export',
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
{
|
||||
components: {
|
||||
[x]: envParams.components[x] + globals
|
||||
for x in std.objectFields(envParams.components)
|
||||
},
|
||||
}
|
Loading…
Reference in New Issue