drop_ksonnet_from_mnist (#546)

This commit is contained in:
Jin Chi He 2019-05-08 10:54:32 +08:00 committed by Kubernetes Prow Robot
parent b23adc1f0b
commit 5fac627725
77 changed files with 1512 additions and 2174 deletions

View File

@ -1,20 +1,20 @@
# This container is for running ksonnet within Kubernetes
# This container is for running kustomize within Kubernetes
FROM ubuntu:16.04
ENV KUBECTL_VERSION v1.9.2
ENV KSONNET_VERSION 0.10.1
ENV KUSTOMIZE_VERSION 2.0.3
RUN apt-get update && apt-get -y install curl && rm -rf /var/lib/apt/lists/*
#RUN apk add --update ca-certificates openssl && update-ca-certificates
RUN curl -O -L https://github.com/ksonnet/ksonnet/releases/download/v${KSONNET_VERSION}/ks_${KSONNET_VERSION}_linux_amd64.tar.gz
RUN tar -zxvf ks_${KSONNET_VERSION}_linux_amd64.tar.gz -C /usr/bin/ --strip-components=1 ks_${KSONNET_VERSION}_linux_amd64/ks
RUN chmod +x /usr/bin/ks
RUN curl -O -L https://github.com/kubernetes-sigs/kustomize/releases/download/v${KUSTOMIZE_VERSION}/kustomize_${KUSTOMIZE_VERSION}_linux_amd64
RUN mv kustomize_${KUSTOMIZE_VERSION}_linux_amd64 /usr/bin/kustomize
RUN chmod +x /usr/bin/kustomize
RUN curl -L https://storage.googleapis.com/kubernetes-release/release/${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /usr/bin/kubectl
RUN chmod +x /usr/bin/kubectl
#ksonnet doesn't work without a kubeconfig, the following is just to add a utility to generate a kubeconfig from a service account.
# The following is just to add a utility to generate a kubeconfig from a service account.
ADD https://raw.githubusercontent.com/zlabjp/kubernetes-scripts/cb265de1d4c4dc4ad0f15f4aaaf5b936dcf639a5/create-kubeconfig /usr/bin/
ADD https://raw.githubusercontent.com/zlabjp/kubernetes-scripts/cb265de1d4c4dc4ad0f15f4aaaf5b936dcf639a5/LICENSE.txt /usr/bin/create-kubeconfig.LICENSE
RUN chmod +x /usr/bin/create-kubeconfig
@ -24,7 +24,7 @@ RUN kubectl config use-context default
ENV USER root
ADD ksonnet-entrypoint.sh /
RUN chmod +x /ksonnet-entrypoint.sh
ADD kustomize-entrypoint.sh /
RUN chmod +x /kustomize-entrypoint.sh
ENTRYPOINT ["/ksonnet-entrypoint.sh"]
ENTRYPOINT ["/kustomize-entrypoint.sh"]

View File

@ -27,7 +27,6 @@ IMG ?= gcr.io/kubeflow-examples/mnist
# List any changed files. We only include files in the notebooks directory.
# because that is the code in the docker image.
# In particular we exclude changes to the ksonnet configs.
CHANGED_FILES := $(shell git diff-files --relative=mnist/)
# Whether to use cached images with GCB

View File

@ -2,7 +2,7 @@
<!-- DON'T EDIT THIS SECTION, INSTEAD RE-RUN doctoc TO UPDATE -->
**Table of Contents** *generated with [DocToc](https://github.com/thlorenz/doctoc)*
- [Training MNIST](#training-mnist)
- [MNIST on Kubeflow](#mnist-on-kubeflow)
- [Prerequisites](#prerequisites)
- [Deploy Kubeflow](#deploy-kubeflow)
- [Local Setup](#local-setup)
@ -30,24 +30,25 @@
<!-- END doctoc generated TOC please keep comment here to allow auto update -->
# Training MNIST
# MNIST on Kubeflow
This example guides you through the process of taking an example model, modifying it to run better within Kubeflow, and serving the resulting trained model.
## Prerequisites
Before we get started there a few requirements.
Before we get started there are a few requirements.
### Deploy Kubeflow
Follow the [Getting Started Guide](https://www.kubeflow.org/docs/started/getting-started/) to deploy Kubeflow
Follow the [Getting Started Guide](https://www.kubeflow.org/docs/started/getting-started/) to deploy Kubeflow.
### Local Setup
You also need the following command line tools:
- [kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl/)
- [ksonnet](https://ksonnet.io/#get-started)
- [kustomize](https://kustomize.io/)
To run the client at the end of the example, you must have [requirements.txt](requirements.txt) installed in your active Python environment.
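A minimal way to do that, assuming `pip` manages your active environment, is:
```
pip install -r requirements.txt
```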
@ -78,7 +79,7 @@ The resulting model is [model.py](model.py).
With our code ready, we will now build/push the docker image.
```
DOCKER_URL=docker.io/reponame/mytfmodel # Put your docker registry here
DOCKER_URL=docker.io/reponame/mytfmodel:tag # Put your docker registry here
docker build . --no-cache -f Dockerfile.model -t ${DOCKER_URL}
docker push ${DOCKER_URL}
@ -88,7 +89,9 @@ docker push ${DOCKER_URL}
With our data and workloads ready, the cluster must now be prepared. We will be deploying the TF Operator and Argo to help manage our training job.
In the following instructions we will install our required components to a single namespace. For these instructions we will assume the chosen namespace is `tfworkflow`:
In the following instructions we will install our required components to a single namespace. For these instructions we will assume the chosen namespace is `kubeflow`.
### Training your model
@ -98,41 +101,55 @@ Let's start by running the training job on Kubeflow and storing the model in a lo
First, refer to the [document](https://kubernetes.io/docs/concepts/storage/persistent-volumes/) to create a Persistent Volume (PV) and Persistent Volume Claim (PVC). The PVC name (${PVC_NAME}) will be used by the training and serving pods for local mode in the steps below.
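For illustration, a minimal PVC sketch (the name, namespace, size, and access mode here are assumptions; if your cluster has a default StorageClass this will also provision the backing PV for you):
```
PVC_NAME=mnist-pvc   # hypothetical name
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: ${PVC_NAME}
  namespace: kubeflow
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 10Gi
EOF
```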
Creating an environment to store parameters particular for local mode.
From the `mnist` application directory, enter the `training/local` directory.
```
KSENV=local
cd ks_app
ks env add ${KSENV}
cd training/local
```
Give the job a name to indicate it is running locally
```
ks param set --env=${KSENV} train name mnist-train-local
kustomize edit add configmap mnist-map-training --from-literal=name=mnist-train-local
```
Point the job at your custom training image
```
ks param set --env=${KSENV} train image $DOCKER_URL
kustomize edit set image training-image=$DOCKER_URL:$TAG
```
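For example, with hypothetical values (use the registry and tag you pushed above):
```
DOCKER_URL=docker.io/reponame/mytfmodel   # hypothetical registry/repository
TAG=v1                                    # hypothetical tag
kustomize edit set image training-image=${DOCKER_URL}:${TAG}
```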
Mount the pvc to store the exported model, by default the pvc will be mounted to the `/mnt` of the training pod.
Optionally, configure it to run distributed by setting the number of parameter servers and workers to use. `numPs` is the number of parameter servers and `numWorkers` is the number of workers.
```
ks param set --env=${KSENV} train pvcName ${PVC_NAME}
../base/definition.sh --numPs 1 --numWorkers 2
```
Configure a filepath for the exported model and checkpoints.
Set the training parameters, such as training steps, batch size and learning rate.
```
ks param set --env=${KSENV} train modelDir /mnt
ks param set --env=${KSENV} train exportDir /mnt/export
kustomize edit add configmap mnist-map-training --from-literal=trainSteps=200
kustomize edit add configmap mnist-map-training --from-literal=batchSize=100
kustomize edit add configmap mnist-map-training --from-literal=learningRate=0.01
```
To store the exported model and checkpoints, configure the PVC name and mount point.
```
kustomize edit add configmap mnist-map-training --from-literal=pvcName=${PVC_NAME}
kustomize edit add configmap mnist-map-training --from-literal=pvcMountPath=/mnt
```
Now we need to configure parameters telling the code to save the model to the PVC.
```
kustomize edit add configmap mnist-map-training --from-literal=modelDir=/mnt
kustomize edit add configmap mnist-map-training --from-literal=exportDir=/mnt/export
```
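These `kustomize edit add configmap` commands record entries under the `configMapGenerator` section of `kustomization.yaml`. You can preview the generated manifests, including the `mnist-map-training` ConfigMap, before applying anything:
```
kustomize build .
```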
You can now submit the job
```
ks apply ${KSENV} -c train
kustomize build . |kubectl apply -f -
```
And you can check the job
@ -147,24 +164,22 @@ And to check the logs
kubectl logs mnist-train-local-chief-0
```
#### Using GCS
In this section we describe how to save the model to Google Cloud Storage (GCS).
Storing the model in GCS has the advantages
Storing the model in GCS has the following advantages:
* The model is readily available after the job finishes
* We can run distributed training
* Distributed training requires a storage system accessible to all the machines
Lets start by creating an environment to store parameters particular to writing the model to GCS
and running distributed.
From the `mnist` application directory, enter the `training/GCS` directory.
```
KSENV=distributed
cd ks_app
ks env add ${KSENV}
cd training/GCS
```
Set an environment variable that points to your GCP project ID.
@ -174,28 +189,42 @@ PROJECT=<your project id>
Create a bucket on GCS to store our model. The name must be unique across all GCS buckets.
```
BUCKET=$KSENV-$(date +%s)
BUCKET=distributed-$(date +%s)
gsutil mb gs://$BUCKET/
```
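You can confirm the bucket was created (an optional check, not required by the steps below):
```
gsutil ls -b gs://${BUCKET}/
```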
Give the job a different name (to distinguish it from your job which didn't use GCS)
```
ks param set --env=${KSENV} train name mnist-train-dist
kustomize edit add configmap mnist-map-training --from-literal=name=mnist-train-dist
```
Next we configure it to run distributed by setting the number of parameter servers and workers to use.
Optionally, if you want to use your custom training image, configure it as below.
```
ks param set --env=${KSENV} train numPs 1
ks param set --env=${KSENV} train numWorkers 2
kustomize edit set image training-image=$DOCKER_URL:$TAG
```
Now we need to configure parameters telling the code to save the model to GCS.
Next we configure it to run distributed by setting the number of parameter servers and workers to use. `numPs` is the number of parameter servers and `numWorkers` is the number of workers.
```
../base/definition.sh --numPs 1 --numWorkers 2
```
Set the training parameters, such as training steps, batch size and learning rate.
```
kustomize edit add configmap mnist-map-training --from-literal=trainSteps=200
kustomize edit add configmap mnist-map-training --from-literal=batchSize=100
kustomize edit add configmap mnist-map-training --from-literal=learningRate=0.01
```
Now we need to configure parameters telling the code to save the model to GCS.
```
MODEL_PATH=my-model
ks param set --env=${KSENV} train modelDir gs://${BUCKET}/${MODEL_PATH}
ks param set --env=${KSENV} train exportDir gs://${BUCKET}/${MODEL_PATH}/export
kustomize edit add configmap mnist-map-training --from-literal=modelDir=gs://${BUCKET}/${MODEL_PATH}
kustomize edit add configmap mnist-map-training --from-literal=exportDir=gs://${BUCKET}/${MODEL_PATH}/export
```
In order to write to GCS we need to supply the TFJob with GCP credentials. We do
@ -212,7 +241,7 @@ then a number of steps have already been performed for you
gcloud --project=${PROJECT} iam service-accounts list
```
1. We stored the private key for this account in a K8s secret named `user-gcp-sa`
2. We stored the private key for this account in a K8s secret named `user-gcp-sa`
* To see the secrets in your cluster
@ -220,7 +249,7 @@ then a number of steps have already been performed for you
kubectl get secrets -n kubeflow
```
1. We granted this service account permission to read/write GCS buckets in this project
3. We granted this service account permission to read/write GCS buckets in this project
* To see the IAM policy you can do
@ -244,55 +273,24 @@ then a number of steps have already been performed for you
To use this service account we perform the following steps
1. Mount the secret into the pod
1. Mount the secret `user-gcp-sa` into the pod and configure the mount path of the secret.
```
ks param set --env=${KSENV} train secret user-gcp-sa=/var/secrets
kustomize edit add configmap mnist-map-training --from-literal=secretName=user-gcp-sa
kustomize edit add configmap mnist-map-training --from-literal=secretMountPath=/var/secrets
```
* Note: ensure your envrionment is pointed at the same `kubeflow` namespace as the `user-gcp-sa` secret
* Setting this ksonnet parameter causes a volumeMount and volume to be added to your TFJob
* To see this you can run `ks show ${KSENV} -c train`
* The output should now include a volumeMount and volume section
```
apiVersion: kubeflow.org/v1beta1
kind: TFJob
metadata:
...
spec:
tfReplicaSpecs:
Chief:
...
template:
...
spec:
containers:
- command:
...
volumeMounts:
- mountPath: /var/secrets
name: user-gcp-sa
readOnly: true
...
volumes:
- name: user-gcp-sa
secret:
secretName: user-gcp-sa
...
```
1. Next we need to set the environment variable `GOOGLE_APPLICATION_CREDENTIALS` so that our code knows
where to look for the service account key.
2. Next we need to set the environment variable `GOOGLE_APPLICATION_CREDENTIALS` so that our code knows where to look for the service account key.
```
ks param set --env=${KSENV} train envVariables GOOGLE_APPLICATION_CREDENTIALS=/var/secrets/user-gcp-sa.json
kustomize edit add configmap mnist-map-training --from-literal=GOOGLE_APPLICATION_CREDENTIALS=/var/secrets/user-gcp-sa.json
```
* If we look at the spec for our job we can see that the environment variable `GOOGLE_APPLICATION_CREDENTIALS` is set.
```
ks show ${KSENV} -c train
kustomize build .
```
```
apiVersion: kubeflow.org/v1beta1
@ -321,7 +319,7 @@ To use this service account we perform the following steps
You can now submit the job
```
ks apply ${KSENV} -c train
kustomize build . |kubectl apply -f -
```
And you can check the job status
@ -336,41 +334,50 @@ And to check the logs
kubectl logs -f mnist-train-dist-chief-0
```
#### Using S3
To use S3 we need we need to configure TensorFlow to use S3 credentials and variables. These credentials will be provided as kubernetes secrets and the variables will be passed in as environment variables. Modify the below values to suit your environment.
To use S3 we need to configure TensorFlow to use S3 credentials and variables. These credentials will be provided as kubernetes secrets and the variables will be passed in as environment variables. Modify the below values to suit your environment.
Lets start by creating an environment to store parameters particular to writing the model to S3
and running distributed.
From the `mnist` application directory, enter the `training/S3` directory.
```
KSENV=distributed
cd ks_app
ks env add ${KSENV}
cd training/S3
```
Give the job a different name (to distinguish it from your job which didn't use S3)
```
ks param set --env=${KSENV} train name mnist-train-dist
kustomize edit add configmap mnist-map-training --from-literal=name=mnist-train-dist
```
Next we configure it to run distributed by setting the number of parameter servers and workers to use.
Optionally, if you want to use your custom training image, configure it as below.
```
ks param set --env=${KSENV} train numPs 1
ks param set --env=${KSENV} train numWorkers 2
```
Now we need to configure parameters telling the code to save the model to S3.
```
ks param set --env=${KSENV} train modelDir ${S3_MODEL_PATH_URI}
ks param set --env=${KSENV} train exportDir ${S3_MODEL_EXPORT_URI}
kustomize edit set image training-image=$DOCKER_URL:$TAG
```
In order to write to S3 we need to supply the TensorFlow code with AWS credentials we also need to set
various environment variables configuring access to S3.
Next we configure it to run distributed by setting the number of parameter servers and workers to use. `numPs` is the number of parameter servers and `numWorkers` is the number of workers.
```
../base/definition.sh --numPs 1 --numWorkers 2
```
Set the training parameters, such as training steps, batch size and learning rate.
```
kustomize edit add configmap mnist-map-training --from-literal=trainSteps=200
kustomize edit add configmap mnist-map-training --from-literal=batchSize=100
kustomize edit add configmap mnist-map-training --from-literal=learningRate=0.01
```
Now we need to configure parameters telling the code to save the model to S3. Replace `${S3_MODEL_PATH_URI}` and `${S3_MODEL_EXPORT_URI}` below with real values.
```
kustomize edit add configmap mnist-map-training --from-literal=modelDir=${S3_MODEL_PATH_URI}
kustomize edit add configmap mnist-map-training --from-literal=exportDir=${S3_MODEL_EXPORT_URI}
```
In order to write to S3 we need to supply the TensorFlow code with AWS credentials. We also need to set various environment variables configuring access to S3.
1. Define the environment variables corresponding to your S3 settings; these will be used in subsequent steps (an illustrative set is sketched after the snippet below)
@ -385,76 +392,36 @@ various environment variables configuring access to S3.
export S3_VERIFY_SSL=1 # set to 0 for default minio installs
```
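As an illustration, hypothetical values for an S3-compatible endpoint (the variable names match those consumed below; adjust everything to your environment):
```
export AWS_ACCESS_KEY_ID=xxxxx            # hypothetical credentials
export AWS_SECRET_ACCESS_KEY=xxxxx
export AWS_REGION=us-west-2
export S3_ENDPOINT=s3.us-west-2.amazonaws.com
export AWS_ENDPOINT_URL=https://${S3_ENDPOINT}
export BUCKET_NAME=mybucket
export S3_USE_HTTPS=1
export S3_VERIFY_SSL=1                    # set to 0 for default minio installs
```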
1. Create a K8s secret containing your AWS credentials
2. Create a K8s secret containing your AWS credentials
```
kubectl create secret generic aws-creds --from-literal=awsAccessKeyID=${AWS_ACCESS_KEY_ID} \
kustomize edit add secret aws-creds --from-literal=awsAccessKeyID=${AWS_ACCESS_KEY_ID} \
--from-literal=awsSecretAccessKey=${AWS_SECRET_ACCESS_KEY}
```
1. Pass secrets as environment variables into pod
3. Pass secrets as environment variables into pod
```
ks param set --env=${KSENV} train secretKeyRefs AWS_ACCESS_KEY_ID=aws-creds.awsAccessKeyID,AWS_SECRET_ACCESS_KEY=aws-creds.awsSecretAccessKey
```
kustomize edit add configmap mnist-map-training --from-literal=awsSecretName=aws-creds
kustomize edit add configmap mnist-map-training --from-literal=awsAccessKeyIDName=awsAccessKeyID
kustomize edit add configmap mnist-map-training --from-literal=awsSecretAccessKeyName=awsSecretAccessKey
```
* Setting this ksonnet parameter causes a two new environment variables to be added to your TFJob
* To see this you can run
```
ks show ${KSENV} -c train
```
* The output should now include two environment variables referencing K8s secret
```
apiVersion: kubeflow.org/v1beta1
kind: TFJob
metadata:
...
spec:
tfReplicaSpecs:
Chief:
...
template:
...
spec:
containers:
- command:
...
env:
- name: AWS_ACCESS_KEY_ID
valueFrom:
secretKeyRef:
key: awsAccessKeyID
name: aws-creds
- name: AWS_SECRET_ACCESS_KEY
valueFrom:
secretKeyRef:
key: awsSecretAccessKey
name: aws-creds
...
```
1. Next we need to set a whole bunch of S3 related environment variables so that TensorFlow
knows how to talk to S3
4. Next we need to set a number of S3-related environment variables so that TensorFlow knows how to talk to S3
```
AWSENV="S3_ENDPOINT=${S3_ENDPOINT}"
AWSENV="${AWSENV},AWS_ENDPOINT_URL=${AWS_ENDPOINT_URL}"
AWSENV="${AWSENV},AWS_REGION=${AWS_REGION}"
AWSENV="${AWSENV},BUCKET_NAME=${BUCKET_NAME}"
AWSENV="${AWSENV},S3_USE_HTTPS=${S3_USE_HTTPS}"
AWSENV="${AWSENV},S3_VERIFY_SSL=${S3_VERIFY_SSL}"
ks param set --env=${KSENV} train envVariables ${AWSENV}
kustomize edit add configmap mnist-map-training --from-literal=S3_ENDPOINT=${S3_ENDPOINT}
kustomize edit add configmap mnist-map-training --from-literal=AWS_ENDPOINT_URL=${AWS_ENDPOINT_URL}
kustomize edit add configmap mnist-map-training --from-literal=AWS_REGION=${AWS_REGION}
kustomize edit add configmap mnist-map-training --from-literal=BUCKET_NAME=${BUCKET_NAME}
kustomize edit add configmap mnist-map-training --from-literal=S3_USE_HTTPS=${S3_USE_HTTPS}
kustomize edit add configmap mnist-map-training --from-literal=S3_VERIFY_SSL=${S3_VERIFY_SSL}
```
* If we look at the spec for our job we can see that the environment variables related
to S3 are set.
* If we look at the spec for our job we can see that the environment variables related to S3 are set.
```
ks show ${KSENV} -c train
kustomize build .
apiVersion: kubeflow.org/v1beta1
kind: TFJob
@ -484,7 +451,7 @@ various environment variables configuring access to S3.
You can now submit the job
```
ks apply ${KSENV} -c train
kustomize build . |kubectl apply -f -
```
And you can check the job
@ -507,10 +474,16 @@ There are various ways to monitor workflow/training job. In addition to using `k
#### Using GCS
From the `mnist` application directory, enter the `monitoring/GCS` directory.
```
cd monitoring/GCS
```
Configure TensorBoard to point to your model location
```
ks param set tensorboard --env=${KSENV} logDir ${LOGDIR}
kustomize edit add configmap mnist-map-monitoring --from-literal=logDir=${LOGDIR}
```
Assuming you followed the directions above and used GCS, you can use the following value
@ -521,33 +494,25 @@ LOGDIR=gs://${BUCKET}/${MODEL_PATH}
You need to point TensorBoard to GCP credentials to access the GCS bucket with the model.
1. Mount the secret into the pod
```
ks param set --env=${KSENV} tensorboatd secret user-gcp-sa=/var/secrets
```
* Setting this ksonnet parameter causes a volumeMount and volume to be added to TensorBoard
deployment
* To see this you can run
1. Mount the secret `user-gcp-sa` into the pod and configure the mount path of the secret.
```
ks show ${KSENV} -c tensorboard
kustomize edit add configmap mnist-map-monitoring --from-literal=secretName=user-gcp-sa
kustomize edit add configmap mnist-map-monitoring --from-literal=secretMountPath=/var/secrets
```
* The output should now include a volumeMount and volume section
* Setting this parameter causes a volumeMount and volume to be added to the TensorBoard deployment
1. Next we need to set the environment variable `GOOGLE_APPLICATION_CREDENTIALS` so that our code knows
2. Next we need to set the environment variable `GOOGLE_APPLICATION_CREDENTIALS` so that our code knows
where to look for the service account key.
```
ks param set --env=${KSENV} tensorboard envVariables GOOGLE_APPLICATION_CREDENTIALS=/var/secrets/user-gcp-sa.json
kustomize edit add configmap mnist-map-monitoring --from-literal=GOOGLE_APPLICATION_CREDENTIALS=/var/secrets/user-gcp-sa.json
```
* If we look at the spec for the TensorBoard deployment we can see that the environment variable `GOOGLE_APPLICATION_CREDENTIALS` is set.
```
ks show ${KSENV} -c tensorboard
kustomize build .
```
```
...
@ -559,10 +524,16 @@ You need to point TensorBoard to GCP credentials to access GCS bucket with model
#### Using S3
From the `mnist` application directory, enter the `monitoring/S3` directory.
```
cd monitoring/S3
```
Configure TensorBoard to point to your model location
```
ks param set tensorboard --env=${KSENV} logDir ${LOGDIR}
kustomize edit add configmap mnist-map-monitoring --from-literal=logDir=${LOGDIR}
```
Assuming you followed the directions above and used S3, you can use the following value
@ -576,58 +547,26 @@ You need to point TensorBoard to AWS credentials to access S3 bucket with model.
1. Pass secrets as environment variables into pod
```
ks param set --env=${KSENV} tensorboard secretKeyRefs AWS_ACCESS_KEY_ID=aws-creds.awsAccessKeyID,AWS_SECRET_ACCESS_KEY=aws-creds.awsSecretAccessKey
kustomize edit add configmap mnist-map-monitoring --from-literal=awsSecretName=aws-creds
kustomize edit add configmap mnist-map-monitoring --from-literal=awsAccessKeyIDName=awsAccessKeyID
kustomize edit add configmap mnist-map-monitoring --from-literal=awsSecretAccessKeyName=awsSecretAccessKey
```
* Setting this ksonnet parameter causes a two new environment variables to be added to TensorBoard
deployment
* To see this you can run
```
ks show ${KSENV} -c tensorboard
```
* The output should now include two environment variables referencing K8s secret
```
...
spec:
containers:
- command:
...
env:
...
- name: AWS_ACCESS_KEY_ID
valueFrom:
secretKeyRef:
key: awsAccessKeyID
name: aws-creds
- name: AWS_SECRET_ACCESS_KEY
valueFrom:
secretKeyRef:
key: awsSecretAccessKey
name: aws-creds
...
```
1. Next we need to set a whole bunch of S3 related environment variables so that TensorBoard
knows how to talk to S3
2. Next we need to set a number of S3-related environment variables so that TensorBoard knows how to talk to S3
```
AWSENV="S3_ENDPOINT=${S3_ENDPOINT}"
AWSENV="${AWSENV},AWS_ENDPOINT_URL=${AWS_ENDPOINT_URL}"
AWSENV="${AWSENV},AWS_REGION=${AWS_REGION}"
AWSENV="${AWSENV},BUCKET_NAME=${BUCKET_NAME}"
AWSENV="${AWSENV},S3_USE_HTTPS=${S3_USE_HTTPS}"
AWSENV="${AWSENV},S3_VERIFY_SSL=${S3_VERIFY_SSL}"
ks param set --env=${KSENV} tensorboard envVariables ${AWSENV}
kustomize edit add configmap mnist-map-monitoring --from-literal=S3_ENDPOINT=${S3_ENDPOINT}
kustomize edit add configmap mnist-map-monitoring --from-literal=AWS_ENDPOINT_URL=${AWS_ENDPOINT_URL}
kustomize edit add configmap mnist-map-monitoring --from-literal=AWS_REGION=${AWS_REGION}
kustomize edit add configmap mnist-map-monitoring --from-literal=BUCKET_NAME=${BUCKET_NAME}
kustomize edit add configmap mnist-map-monitoring --from-literal=S3_USE_HTTPS=${S3_USE_HTTPS}
kustomize edit add configmap mnist-map-monitoring --from-literal=S3_VERIFY_SSL=${S3_VERIFY_SSL}
```
* If we look at the spec for the TensorBoard deployment we can see that the environment variables related to S3 are set.
```
ks show ${KSENV} -c tensorboard
kustomize build .
```
```
@ -648,34 +587,29 @@ You need to point TensorBoard to AWS credentials to access S3 bucket with model.
#### Deploying TensorBoard
Now you can deploy TensorBoard
```
ks apply ${KSENV} -c tensorboard
kustomize build . | kubectl apply -f -
```
To access TensorBoard using port-forwarding
```
kubectl -n jlewi port-forward service/tensorboard-tb 8090:80
kubectl -n kubeflow port-forward service/tensorboard-tb 8090:80
```
TensorBoard can now be accessed at [http://127.0.0.1:8090](http://127.0.0.1:8090).
## Serving the model
The model code will export the model in the SavedModel format, which is suitable for serving with TensorFlow Serving.
To serve the model follow the instructions below. The instructins vary slightly based on where you are storing your
model (e.g. GCS, S3, PVC). Depending on the storage system we provide different ksonnet components as a convenience
for setting relevant environment variables.
To serve the model follow the instructions below. The instructions vary slightly based on where you are storing your model (e.g. GCS, S3, PVC). Depending on the storage system we provide different kustomizations as a convenience for setting the relevant environment variables.
### GCS
Here we show to serve the model when it is stored on GCS. This assumes that when you trained the model you set `exportDir` to a GCS
URI; if not you can always copy it to GCS using `gsutil`.
Here we show how to serve the model when it is stored on GCS. This assumes that when you trained the model you set `exportDir` to a GCS URI; if not you can always copy it to GCS using `gsutil`.
Check that a model was exported
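A quick way to list the exported artifacts, assuming `exportDir` was the GCS URI set during training:
```
EXPORT_DIR=gs://${BUCKET}/${MODEL_PATH}/export
gsutil ls -r ${EXPORT_DIR}
```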
@ -696,35 +630,39 @@ ${EXPORT_DIR}/1547100373/variables/variables.index
The number `1547100373` is a version number auto-generated by TensorFlow; it will vary on each run but should be monotonically increasing if you save a model to the same location as a previous location.
From the `mnist` application directory, enter the `serving/GCS` directory.
```
cd serving/GCS
```
Set a different name for the TF Serving deployment.
```
kustomize edit add configmap mnist-map-serving --from-literal=name=mnist-gcs-dist
```
Set your model path
```
ks param set --env=${KSENV} mnist-deploy-gcp modelBasePath ${EXPORT_DIR}
kustomize edit add configmap mnist-map-serving --from-literal=modelBasePath=${EXPORT_DIR}
```
Deploy it
Deploy it, and run a service to make the deployment accessible to other pods in the cluster.
```
ks apply ${KSENV} -c mnist-deploy-gcp
kustomize build . |kubectl apply -f -
```
You can check the deployment by running
```
kubectl describe deployments mnist-deploy-gcp
kubectl describe deployments mnist-gcs-dist
```
Finally, run a service to make the deployment accessible to other pods in the cluster
The service should make the `mnist-gcs-dist` deployment accessible over port 9000.
```
ks apply ${KSENV} -c mnist-service
```
The service should make the `mnist-deploy-gcp` deployment accessible over port 9000
```
kubectl describe service mnist-service
kubectl describe service mnist-gcs-dist
```
### S3
@ -735,32 +673,42 @@ TODO: Add instructions
This section shows how to serve the local model that was stored in the PVC during training.
From the `mnist` application directory, enter the `serving/local` directory.
```
cd serving/local
```
Set a different name for the TF Serving deployment.
```
kustomize edit add configmap mnist-map-serving --from-literal=name=mnist-service-local
```
Mount the PVC; by default the PVC will be mounted to `/mnt` of the pod.
```
ks param set --env=${KSENV} mnist-deploy-local pvcName ${PVC_NAME}
kustomize edit add configmap mnist-map-serving --from-literal=pvcName=${PVC_NAME}
kustomize edit add configmap mnist-map-serving --from-literal=pvcMountPath=/mnt
```
Configure the file path for the exported model.
```
ks param set --env=${KSENV} mnist-deploy-local modelBasePath /mnt/export
kustomize edit add configmap mnist-map-serving --from-literal=modelBasePath=/mnt/export
```
Deploy it.
Deploy it, and run a service to make the deployment accessible to other pods in the cluster.
```
ks apply ${KSENV} -c mnist-deploy-local
kustomize build . |kubectl apply -f -
```
You can check the deployment by running
```
kubectl describe deployments mnist-deploy-local
```
Finally, run a service to make the deployment accessible to other pods in the cluster.
```
ks apply ${KSENV} -c mnist-service
```
The service should make the `mnist-deploy-local` deployment accessible over port 9000.
```
kubectl describe service mnist-service
@ -770,10 +718,16 @@ kubectl describe service mnist-service
The example comes with a simple web front end that can be used with your model.
From the `mnist` application directory, enter the `front` directory.
```
cd front
```
To deploy the web front end
```
ks apply ${KSENV} -c web-ui
kustomize build . |kubectl apply -f -
```
### Connecting via port forwarding
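A minimal sketch, assuming the `web-ui` service defined in `front/service.yaml` (ClusterIP on port 80 in the `kubeflow` namespace):
```
kubectl -n kubeflow port-forward svc/web-ui 8080:80
```
The front end should then be reachable at http://localhost:8080 (the local port choice is arbitrary).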

View File

@ -0,0 +1,20 @@
apiVersion: apps/v1beta2
kind: Deployment
metadata:
name: web-ui
namespace: kubeflow
spec:
replicas: 1
selector:
matchLabels:
app: web-ui
template:
metadata:
labels:
app: web-ui
spec:
containers:
- image: gcr.io/kubeflow-examples/mnist/web-ui:v20190112-v0.2-142-g3b38225
name: web-ui
ports:
- containerPort: 5000

View File

@ -0,0 +1,11 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
generatorOptions:
disableNameSuffixHash: true
resources:
- deployment.yaml
- service.yaml
namespace: kubeflow

21
mnist/front/service.yaml Normal file
View File

@ -0,0 +1,21 @@
apiVersion: v1
kind: Service
metadata:
annotations:
getambassador.io/config: |-
---
apiVersion: ambassador/v0
kind: Mapping
name: web-ui_mapping
prefix: /kubeflow/mnist/
rewrite: /
service: web-ui.kubeflow
name: web-ui
namespace: kubeflow
spec:
ports:
- port: 80
targetPort: 5000
selector:
app: web-ui
type: ClusterIP

View File

@ -82,9 +82,9 @@
contextDir: "."
},
local ksonnetSteps = subGraphTemplate {
local kustomizeSteps = subGraphTemplate {
name: "ksonnet",
dockerFile: "./Dockerfile.ksonnet",
dockerFile: "./Dockerfile.kustomize",
contextDir: "."
},
@ -94,6 +94,6 @@
contextDir: "./web-ui"
},
steps: modelSteps.steps + ksonnetSteps.steps + uiSteps.steps,
images: modelSteps.images + ksonnetSteps.images + uiSteps.images,
steps: modelSteps.steps + kustomizeSteps.steps + uiSteps.steps,
images: modelSteps.images + kustomizeSteps.images + uiSteps.images,
}

View File

@ -1,4 +0,0 @@
/lib
/.ksonnet/registries
/app.override.yaml
/.ks_environment

View File

View File

@ -1,16 +0,0 @@
apiVersion: 0.3.0
kind: ksonnet.io/app
libraries:
kubeflow/tf-serving:
name: tf-serving
registry: kubeflow
version: fed535eaa276220e4edf59530c0629f4375a40a9
name: ks_app
registries:
incubator:
protocol: github
uri: github.com/ksonnet/parts/tree/master/incubator
kubeflow:
protocol: github
uri: github.com/kubeflow/kubeflow/tree/v0.4-branch/kubeflow
version: 0.0.1

View File

@ -1,39 +0,0 @@
local env = std.extVar("__ksonnet/environments");
local params = std.extVar("__ksonnet/params").components["mnist-deploy-aws"];
local k = import "k.libsonnet";
local deployment = k.apps.v1beta1.deployment;
local container = deployment.mixin.spec.template.spec.containersType;
local util = import "kubeflow/tf-serving/util.libsonnet";
local tfserving = import "kubeflow/tf-serving/tf-serving-template.libsonnet";
local base = tfserving.new(env, params);
local tfDeployment = base.tfDeployment +
deployment.mapContainers(
function(c) {
result::
c + container.withEnvMixin(
if util.toBool(params.s3Enable) then (
[
{
name: "AWS_ACCESS_KEY_ID",
valueFrom: { secretKeyRef: { name: params.s3SecretName, key: params.s3SecretAccesskeyidKeyName } },
},
{
name: "AWS_SECRET_ACCESS_KEY",
valueFrom: { secretKeyRef: { name: params.s3SecretName, key: params.s3SecretSecretaccesskeyKeyName } },
},
{ name: "AWS_REGION", value: params.s3AwsRegion },
{ name: "S3_USE_HTTPS", value: std.toString(params.s3UseHttps) },
{ name: "S3_VERIFY_SSL", value: std.toString(params.s3VerifySsl) },
{ name: "S3_ENDPOINT", value: params.s3Endpoint },
]
) else [],
),
}.result,
);
util.list([
tfDeployment,
base.tfservingConfig,
],)

View File

@ -1,47 +0,0 @@
local env = std.extVar("__ksonnet/environments");
local params = std.extVar("__ksonnet/params").components["mnist-deploy-gcp"];
local k = import "k.libsonnet";
local deployment = k.apps.v1beta1.deployment;
local container = deployment.mixin.spec.template.spec.containersType;
local util = import "kubeflow/tf-serving/util.libsonnet";
local tfserving = import "kubeflow/tf-serving/tf-serving-template.libsonnet";
local base = tfserving.new(env, params);
local tfDeployment = base.tfDeployment +
deployment.mixin.spec.template.spec.withVolumesMixin(
if params.gcpCredentialSecretName != "null" then (
[{
name: "gcp-credentials",
secret: {
secretName: params.gcpCredentialSecretName,
},
}]
) else [],
) +
deployment.mapContainers(
function(c) {
result::
c + container.withEnvMixin(
if params.gcpCredentialSecretName != "null" then (
[{
name: "GOOGLE_APPLICATION_CREDENTIALS",
value: "/secret/gcp-credentials/user-gcp-sa.json",
}]
) else [],
) +
container.withVolumeMountsMixin(
if params.gcpCredentialSecretName != "null" then (
[{
name: "gcp-credentials",
mountPath: "/secret/gcp-credentials",
}]
) else [],
),
}.result,
);
util.list([
tfDeployment,
base.tfservingConfig,
],)

View File

@ -1,39 +0,0 @@
local env = std.extVar("__ksonnet/environments");
local params = std.extVar("__ksonnet/params").components["mnist-deploy-local"];
local k = import "k.libsonnet";
local deployment = k.apps.v1beta1.deployment;
local container = deployment.mixin.spec.template.spec.containersType;
local util = import "kubeflow/tf-serving/util.libsonnet";
local tfserving = import "kubeflow/tf-serving/tf-serving-template.libsonnet";
local base = tfserving.new(env, params);
local tfDeployment = base.tfDeployment +
deployment.mixin.spec.template.spec.withVolumesMixin(
if params.pvcName != "null" && params.pvcName != "" then (
[{
name: "local-storage",
persistentVolumeClaim: {
claimName: params.pvcName,
},
}]
) else [],
) +
deployment.mapContainers(
function(c) {
result::
c + container.withVolumeMountsMixin(
if params.pvcName != "null" && params.pvcName != "" then (
[{
name: "local-storage",
mountPath: "/mnt",
}]
) else [],
),
}.result,
);
util.list([
tfDeployment,
base.tfservingConfig,
],)

View File

@ -1,8 +0,0 @@
local env = std.extVar("__ksonnet/environments");
local params = std.extVar("__ksonnet/params").components["mnist-service"];
local k = import "k.libsonnet";
local tfservingService = import "kubeflow/tf-serving/tf-serving-service-template.libsonnet";
local util = import "kubeflow/tf-serving/util.libsonnet";
tfservingService.new(env, params).all

View File

@ -1,93 +0,0 @@
{
global: {},
components: {
train: {
batchSize: 100,
envVariables: 'GOOGLE_APPLICATION_CREDENTIALS=/var/secrets/user-gcp-sa.json',
exportDir: 'gs://kubeflow-ci_temp/mnist-jlewi/export',
image: 'gcr.io/kubeflow-examples/mnist/model:v20190111-v0.2-148-g313770f',
learningRate: '0.01',
modelDir: 'gs://kubeflow-ci_temp/mnist-jlewi',
name: 'mnist-train',
numPs: 0,
numWorkers: 0,
secret: '',
secretKeyRefs: '',
trainSteps: 200,
pvcName: '',
},
"mnist-deploy-local": {
defaultCpuImage: 'tensorflow/serving:1.11.1',
defaultGpuImage: 'tensorflow/serving:1.11.1-gpu',
deployHttpProxy: 'false',
enablePrometheus: 'true',
httpProxyImage: '',
injectIstio: 'false',
pvcName: '',
modelBasePath: '/mnt/export',
modelName: 'mnist',
name: 'mnist-deploy-local',
numGpus: '0',
versionName: 'v1',
},
"mnist-deploy-gcp": {
defaultCpuImage: 'tensorflow/serving:1.11.1',
defaultGpuImage: 'tensorflow/serving:1.11.1-gpu',
deployHttpProxy: 'false',
enablePrometheus: 'true',
gcpCredentialSecretName: 'user-gcp-sa',
httpProxyImage: '',
injectIstio: 'false',
modelBasePath: 'gs://kubeflow-examples-data/mnist',
modelName: 'mnist',
name: 'mnist-deploy-gcp',
numGpus: '0',
versionName: 'v1',
},
"mnist-deploy-aws": {
defaultCpuImage: 'tensorflow/serving:1.11.1',
defaultGpuImage: 'tensorflow/serving:1.11.1-gpu',
deployHttpProxy: 'false',
enablePrometheus: 'true',
httpProxyImage: 'gcr.io/kubeflow-images-public/tf-model-server-http-proxy:v20180723',
injectIstio: 'false',
modelBasePath: 's3://kubeflow-examples-data/mnist',
modelName: 'null',
name: 'mnist-deploy-aws',
numGpus: '0',
s3AwsRegion: 'us-west-1',
s3Enable: 'false',
s3Endpoint: 's3.us-west-1.amazonaws.com',
s3SecretAccesskeyidKeyName: 'AWS_ACCESS_KEY_ID',
s3SecretName: 'null',
s3SecretSecretaccesskeyKeyName: 'AWS_SECRET_ACCESS_KEY',
s3UseHttps: 'true',
s3VerifySsl: 'true',
versionName: 'v1',
},
"mnist-service": {
enablePrometheus: 'true',
injectIstio: 'false',
modelName: 'mnist',
name: 'mnist-service',
serviceType: 'ClusterIP',
trafficRule: 'v1:100',
},
"tensorboard": {
envVariables: 'GOOGLE_APPLICATION_CREDENTIALS=/var/secrets/user-gcp-sa.json',
image: "tensorflow/tensorflow:1.11.0",
logDir: "gs://example/to/model/logdir",
name: "tensorboard",
secret: '',
secretKeyRefs: '',
},
"web-ui": {
containerPort: 5000,
image: "gcr.io/kubeflow-examples/mnist/web-ui:v20190112-v0.2-142-g3b38225",
name: "web-ui",
replicas: 1,
servicePort: 80,
type: "ClusterIP",
},
},
}

View File

@ -1,120 +0,0 @@
// TODO: Generalize to use S3. We can follow the pattern of training that
// takes parameters to specify environment variables and secret which can be customized
// for GCS, S3 as needed.
local env = std.extVar("__ksonnet/environments");
local params = std.extVar("__ksonnet/params").components.tensorboard;
local util = import "util.libsonnet";
local k = import "k.libsonnet";
local name = params.name;
local namespace = env.namespace;
local service = {
apiVersion: "v1",
kind: "Service",
metadata: {
name: name + "-tb",
namespace: env.namespace,
annotations: {
"getambassador.io/config":
std.join("\n", [
"---",
"apiVersion: ambassador/v0",
"kind: Mapping",
"name: " + name + "_mapping",
"prefix: /" + env.namespace + "/tensorboard/mnist",
"rewrite: /",
"service: " + name + "-tb." + namespace,
"---",
"apiVersion: ambassador/v0",
"kind: Mapping",
"name: " + name + "_mapping_data",
"prefix: /" + env.namespace + "/tensorboard/mnist/data/",
"rewrite: /data/",
"service: " + name + "-tb." + namespace,
]),
}, //annotations
},
spec: {
ports: [
{
name: "http",
port: 80,
targetPort: 80,
},
],
selector: {
app: "tensorboard",
"tb-job": name,
},
},
};
local tbSecrets = util.parseSecrets(params.secretKeyRefs);
local secretPieces = std.split(params.secret, "=");
local secretName = if std.length(secretPieces) > 0 then secretPieces[0] else "";
local secretMountPath = if std.length(secretPieces) > 1 then secretPieces[1] else "";
local deployment = {
apiVersion: "apps/v1beta1",
kind: "Deployment",
metadata: {
name: name + "-tb",
namespace: env.namespace,
},
spec: {
replicas: 1,
template: {
metadata: {
labels: {
app: "tensorboard",
"tb-job": name,
},
name: name,
namespace: namespace,
},
spec: {
containers: [
{
command: [
"/usr/local/bin/tensorboard",
"--logdir=" + params.logDir,
"--port=80",
],
image: params.image,
name: "tensorboard",
ports: [
{
containerPort: 80,
},
],
env: util.parseEnv(params.envVariables) + tbSecrets,
volumeMounts: if secretMountPath != "" then
[
{
name: secretName,
mountPath: secretMountPath,
readOnly: true,
},
] else [],
},
],
volumes:
if secretName != "" then
[
{
name: secretName,
secret: {
secretName: secretName,
},
},
] else [],
},
},
},
};
std.prune(k.core.v1.list.new([service, deployment]))

View File

@ -1,117 +0,0 @@
// Component to train a model.
//
// Parameters are used to control training
// image: Docker iamge to use
// modelDir: Location to write the model this can be a local path (e.g. to a PV)
// or it can be any filesystem URI that TF understands (e.g GCS, S3, HDFS)
// exportDir: Location to export the model
// trainSteps: Number of training steps to run
// batchSize: Batch size
// learningRate: Learning rate
// envVariables: Comma separated list of environment variables to set.
// Use this to set environment variables needed to configure S3 access.
// numWorkers: Number of workers
// numPs: Number of parameter servers
//
local k = import "k.libsonnet";
local env = std.extVar("__ksonnet/environments");
local params = std.extVar("__ksonnet/params").components.train;
local util = import "util.libsonnet";
local trainSecrets = util.parseSecrets(params.secretKeyRefs);
local secretPieces = std.split(params.secret, "=");
local secretName = if std.length(secretPieces) > 0 then secretPieces[0] else "";
local secretMountPath = if std.length(secretPieces) > 1 then secretPieces[1] else "";
local replicaSpec = {
containers: [
{
command: [
"/usr/bin/python",
"/opt/model.py",
],
args: [
"--tf-model-dir=" + params.modelDir,
"--tf-export-dir=" + params.exportDir,
"--tf-train-steps=" + params.trainSteps,
"--tf-batch-size=" + params.batchSize,
"--tf-learning-rate=" + params.learningRate,
],
env: util.parseEnv(params.envVariables) + trainSecrets,
image: params.image,
name: "tensorflow",
volumeMounts: if secretMountPath != "" then
[
{
name: secretName,
mountPath: secretMountPath,
readOnly: true,
},
] else if params.pvcName != "null" && params.pvcName != "" then
[
{
name: "local-storage",
mountPath: "/mnt",
},
] else [],
workingDir: "/opt",
},
],
volumes:
if secretName != "" then
[
{
name: secretName,
secret: {
secretName: secretName,
},
},
] else if params.pvcName != "null" && params.pvcName != "" then
[
{
name: "local-storage",
persistentVolumeClaim: {
claimName: params.pvcName,
},
},
] else [],
restartPolicy: "OnFailure",
};
local tfjob = {
apiVersion: "kubeflow.org/v1beta1",
kind: "TFJob",
metadata: {
name: params.name,
namespace: env.namespace,
},
spec: {
tfReplicaSpecs: {
Chief: {
replicas: 1,
template: {
spec: replicaSpec,
},
},
[if params.numWorkers > 0 then "Worker"]: {
replicas: params.numWorkers,
template: {
spec: replicaSpec,
},
},
[if params.numWorkers > 0 then "Ps"]: {
replicas: params.numPs,
template: {
spec: replicaSpec,
},
},
},
},
};
k.core.v1.list.new([
tfjob,
])

View File

@ -1,41 +0,0 @@
{
// convert a list of two items into a map representing an environment variable
// TODO(jlewi): Should we move this into kubeflow/core/util.libsonnet
listToMap:: function(v)
{
name: v[0],
value: v[1],
},
// convert a list of two items into a map representing an env variable referencing k8s secret
listToSecretMap:: function(v)
{
name: v[0],
valueFrom: {
secretKeyRef: {
name: std.split(v[1], ".")[0],
key: std.split(v[1], ".")[1],
}
}
},
// Function to turn comma separated list of environment variables into a dictionary.
parseEnv:: function(v)
local pieces = std.split(v, ",");
if v != "" && std.length(pieces) > 0 then
std.map(
function(i) $.listToMap(std.split(i, "=")),
std.split(v, ",")
)
else [],
// Function to turn comma separated list of env variables referencing secrets into a dictionary.
parseSecrets:: function(v)
local pieces = std.split(v, ",");
if v != "" && std.length(pieces) > 0 then
std.map(
function(i) $.listToSecretMap(std.split(i, "=")),
std.split(v, ",")
)
else [],
}

View File

@ -1,72 +0,0 @@
local env = std.extVar("__ksonnet/environments");
local params = std.extVar("__ksonnet/params").components["web-ui"];
[
{
"apiVersion": "v1",
"kind": "Service",
"metadata": {
"name": params.name,
"namespace": env.namespace,
annotations: {
"getambassador.io/config":
std.join("\n", [
"---",
"apiVersion: ambassador/v0",
"kind: Mapping",
"name: " + params.name + "_mapping",
"prefix: /" + env.namespace + "/mnist/",
"rewrite: /",
"service: " + params.name + "." + env.namespace,
]),
}, //annotations
},
"spec": {
"ports": [
{
"port": params.servicePort,
"targetPort": params.containerPort
}
],
"selector": {
"app": params.name
},
"type": params.type
}
},
{
"apiVersion": "apps/v1beta2",
"kind": "Deployment",
"metadata": {
"name": params.name,
"namespace": env.namespace,
},
"spec": {
"replicas": params.replicas,
"selector": {
"matchLabels": {
"app": params.name
},
},
"template": {
"metadata": {
"labels": {
"app": params.name
}
},
"spec": {
"containers": [
{
"image": params.image,
"name": params.name,
"ports": [
{
"containerPort": params.containerPort
}
]
}
]
}
}
}
}
]

View File

@ -1,4 +0,0 @@
local components = std.extVar("__ksonnet/components");
components + {
// Insert user-specified overrides here.
}

View File

@ -1,73 +0,0 @@
<!-- START doctoc generated TOC please keep comment here to allow auto update -->
<!-- DON'T EDIT THIS SECTION, INSTEAD RE-RUN doctoc TO UPDATE -->
**Table of Contents** *generated with [DocToc](https://github.com/thlorenz/doctoc)*
- [tf-serving](#tf-serving)
- [Quickstart](#quickstart)
- [Using the library](#using-the-library)
- [io.ksonnet.pkg.tf-serving](#ioksonnetpkgtf-serving)
- [Example](#example)
- [Parameters](#parameters)
<!-- END doctoc generated TOC please keep comment here to allow auto update -->
# tf-serving
> TensorFlow serving is a server for TensorFlow models.
* [Quickstart](#quickstart)
* [Using Prototypes](#using-prototypes)
* [io.ksonnet.pkg.tf-serving](#io.ksonnet.pkg.tf-serving)
## Quickstart
*The following commands use the `io.ksonnet.pkg.tf-serving` prototype to generate Kubernetes YAML for tf-serving, and then deploys it to your Kubernetes cluster.*
First, create a cluster and install the ksonnet CLI (see root-level [README.md](rootReadme)).
If you haven't yet created a [ksonnet application](linkToSomewhere), do so using `ks init <app-name>`.
Finally, in the ksonnet application directory, run the following:
```shell
# Expand prototype as a Jsonnet file, place in a file in the
# `components/` directory. (YAML and JSON are also available.)
$ ks prototype use io.ksonnet.pkg.tf-serving tf-serving \
--name tf-serving \
--namespace default
# Apply to server.
$ ks apply -f tf-serving.jsonnet
```
## Using the library
The library files for tf-serving define a set of relevant *parts* (_e.g._, deployments, services, secrets, and so on) that can be combined to configure tf-serving for a wide variety of scenarios. For example, a database like Redis may need a secret to hold the user password, or it may have no password if it's acting as a cache.
This library provides a set of pre-fabricated "flavors" (or "distributions") of tf-serving, each of which is configured for a different use case. These are captured as ksonnet *prototypes*, which allow users to interactively customize these distributions for their specific needs.
These prototypes, as well as how to use them, are enumerated below.
### io.ksonnet.pkg.tf-serving
TensorFlow serving
#### Example
```shell
# Expand prototype as a Jsonnet file, place in a file in the
# `components/` directory. (YAML and JSON are also available.)
$ ks prototype use io.ksonnet.pkg.tf-serving tf-serving \
--name YOUR_NAME_HERE \
--model_path YOUR_MODEL_PATH_HERE
```
#### Parameters
The available options to pass prototype are:
* `--name=<name>`: Name to give to each of the components [string]
* `--model_path=<model_path>`: Path to the model. This can be a GCS path. [string]
[rootReadme]: https://github.com/ksonnet/mixins

View File

@ -1,35 +0,0 @@
{
"name": "tf-serving",
"apiVersion": "0.0.1",
"kind": "ksonnet.io/parts",
"description": "TensorFlow serving is a server for TensorFlow models.\n",
"author": "kubeflow team <kubeflow-team@google.com>",
"contributors": [
{
"name": "Jeremy Lewi",
"email": "jlewi@google.com"
}
],
"repository": {
"type": "git",
"url": "https://github.com/kubeflow/kubeflow"
},
"bugs": {
"url": "https://github.com/kubeflow/kubeflow/issues"
},
"keywords": [
"kubeflow",
"tensorflow",
"database"
],
"quickStart": {
"prototype": "io.ksonnet.pkg.tf-serving",
"componentName": "tf-serving",
"flags": {
"name": "tf-serving",
"namespace": "default"
},
"comment": "Run TensorFlow Serving"
},
"license": "Apache 2.0"
}

View File

@ -1,23 +0,0 @@
// @apiVersion 0.1
// @name io.ksonnet.pkg.tf-serving
// @description TensorFlow serving
// @shortDescription A TensorFlow serving deployment
// @param name string Name to give to each of the components
local k = import "k.libsonnet";
// ksonnet appears to require name be a parameter of the prototype which is why we handle it differently.
local name = import "param://name";
// updatedParams includes the namespace from env by default.
local updatedParams = params + env;
local tfServingBase = import "kubeflow/tf-serving/tf-serving.libsonnet";
local tfServing = tfServingBase {
// Override parameters with user supplied parameters.
params+: updatedParams {
name: name,
},
};
std.prune(k.core.v1.list.new(tfServing.components))

View File

@ -1,61 +0,0 @@
// @apiVersion 0.1
// @name io.ksonnet.pkg.tf-serving-deployment-aws
// @description TensorFlow serving
// @shortDescription A TensorFlow serving deployment
// @param name string Name to give to each of the components
// @optionalParam numGpus string 0 Number of gpus to use
// @optionalParam deployHttpProxy string false Whether to deploy http proxy
// @optionalParam injectIstio string false Whether to inject istio sidecar; should be true or false.
// @optionalParam enablePrometheus string true Whether to enable prometheus endpoint (requires TF 1.11)
// @optionalParam modelBasePath string s3://kubeflow-examples-data/mnist The model path
// @optionalParam modelName string null The model name
// @optionalParam versionName string v1 The version name
// @optionalParam defaultCpuImage string tensorflow/serving:1.11.1 The default model server image (cpu)
// @optionalParam defaultGpuImage string tensorflow/serving:1.11.1-gpu The default model server image (gpu)
// @optionalParam httpProxyImage string gcr.io/kubeflow-images-public/tf-model-server-http-proxy:v20180723 Http proxy image
// @optionalParam s3Enable string false Whether to enable S3
// Following parameters are needed only if s3Enable is true
// @optionalParam s3SecretName string null Name of the k8s secrets containing S3 credentials
// @optionalParam s3SecretAccesskeyidKeyName string AWS_ACCESS_KEY_ID Name of the key in the k8s secret containing AWS_ACCESS_KEY_ID
// @optionalParam s3SecretSecretaccesskeyKeyName string AWS_SECRET_ACCESS_KEY Name of the key in the k8s secret containing AWS_SECRET_ACCESS_KEY
// @optionalParam s3AwsRegion string us-west-1 S3 region
// @optionalParam s3UseHttps string true Whether or not to use https
// @optionalParam s3VerifySsl string true Whether or not to verify https certificates for S3 connections
// @optionalParam s3Endpoint string s3.us-west-1.amazonaws.com URL for your s3-compatible endpoint
local k = import "k.libsonnet";
local deployment = k.apps.v1beta1.deployment;
local container = deployment.mixin.spec.template.spec.containersType;
local util = import "kubeflow/tf-serving/util.libsonnet";
local tfserving = import "kubeflow/tf-serving/tf-serving-template.libsonnet";
local base = tfserving.new(env, params);
local tfDeployment = base.tfDeployment +
deployment.mapContainers(
function(c) {
result::
c + container.withEnvMixin(
if util.toBool(params.s3Enable) then (
[
{
name: "AWS_ACCESS_KEY_ID",
valueFrom: { secretKeyRef: { name: params.s3SecretName, key: params.s3SecretAccesskeyidKeyName } },
},
{
name: "AWS_SECRET_ACCESS_KEY",
valueFrom: { secretKeyRef: { name: params.s3SecretName, key: params.s3SecretSecretaccesskeyKeyName } },
},
{ name: "AWS_REGION", value: params.s3AwsRegion },
{ name: "S3_USE_HTTPS", value: std.toString(params.s3UseHttps) },
{ name: "S3_VERIFY_SSL", value: std.toString(params.s3VerifySsl) },
{ name: "S3_ENDPOINT", value: params.s3Endpoint },
]
) else [],
),
}.result,
);
util.list([
tfDeployment,
base.tfservingConfig,
],)

View File

@ -1,61 +0,0 @@
// @apiVersion 0.1
// @name io.ksonnet.pkg.tf-serving-deployment-gcp
// @description TensorFlow serving
// @shortDescription A TensorFlow serving deployment
// @param name string Name to give to each of the components
// @optionalParam numGpus string 0 Number of gpus to use
// @optionalParam deployHttpProxy string false Whether to deploy http proxy
// @optionalParam modelBasePath string gs://kubeflow-examples-data/mnist The model path
// @optionalParam modelName string null The model name
// @optionalParam versionName string v1 The version name
// @optionalParam defaultCpuImage string tensorflow/serving:1.11.1 The default model server image (cpu)
// @optionalParam defaultGpuImage string tensorflow/serving:1.11.1-gpu The default model server image (gpu)
// @optionalParam httpProxyImage string gcr.io/kubeflow-images-public/tf-model-server-http-proxy:v20180723 Http proxy image
// @optionalParam gcpCredentialSecretName string null If not empty, insert the secret credential
// @optionalParam injectIstio string false Whether to inject istio sidecar; should be true or false.
// @optionalParam enablePrometheus string true Whether to enable prometheus endpoint (requires TF 1.11)
local k = import "k.libsonnet";
local deployment = k.apps.v1beta1.deployment;
local container = deployment.mixin.spec.template.spec.containersType;
local util = import "kubeflow/tf-serving/util.libsonnet";
local tfserving = import "kubeflow/tf-serving/tf-serving-template.libsonnet";
local base = tfserving.new(env, params);
local tfDeployment = base.tfDeployment +
deployment.mixin.spec.template.spec.withVolumesMixin(
if params.gcpCredentialSecretName != "null" then (
[{
name: "gcp-credentials",
secret: {
secretName: params.gcpCredentialSecretName,
},
}]
) else [],
) +
deployment.mapContainers(
function(c) {
result::
c + container.withEnvMixin(
if params.gcpCredentialSecretName != "null" then (
[{
name: "GOOGLE_APPLICATION_CREDENTIALS",
value: "/secret/gcp-credentials/user-gcp-sa.json",
}]
) else [],
) +
container.withVolumeMountsMixin(
if params.gcpCredentialSecretName != "null" then (
[{
name: "gcp-credentials",
mountPath: "/secret/gcp-credentials",
}]
) else [],
),
}.result,
);
util.list([
tfDeployment,
base.tfservingConfig,
],)

View File

@ -1,16 +0,0 @@
// @apiVersion 0.1
// @name io.ksonnet.pkg.tf-serving-service
// @description TensorFlow serving
// @shortDescription A TensorFlow serving model
// @param name string Name to give to each of the components
// @optionalParam serviceType string ClusterIP The k8s service type for tf serving.
// @optionalParam modelName string null The model name
// @optionalParam trafficRule string v1:100 The traffic rule, in the format of version:percentage,version:percentage,..
// @optionalParam injectIstio string false Whether to inject istio sidecar; should be true or false.
// @optionalParam enablePrometheus string true Whether to enable prometheus endpoint (requires TF 1.11)
local k = import "k.libsonnet";
local tfservingService = import "kubeflow/tf-serving/tf-serving-service-template.libsonnet";
local util = import "kubeflow/tf-serving/util.libsonnet";
tfservingService.new(env, params).all

View File

@ -1,230 +0,0 @@
// @apiVersion 0.1
// @name io.ksonnet.pkg.tf-serving-request-log
// @description tf-serving with request logging
// @shortDescription tf-serving with request logging
// @param name string Name to give to each of the components
// @param gcpProject string The gcp project for Bigquery dataset
// @param dataset string The Bigquery dataset
// @param table string The Bigquery table
// @optionalParam modelBasePath string gs://kubeflow-examples-data/mnist The model path
// @optionalParam modelName string mnist The model name
local k = import "k.libsonnet";
local namespace = "kubeflow";
local appName = import "param://name";
local image = "gcr.io/kubeflow-images-public/tf-model-server-cpu:v20180327-995786ec";
local httpProxyImage = "gcr.io/kubeflow-images-public/tf-model-server-http-proxy:v20180723";
local loggingImage = "gcr.io/kubeflow-images-public/tf-model-server-request-logger:v20180723";
local gcpSecretName = "user-gcp-sa";
local service = {
apiVersion: "v1",
kind: "Service",
metadata: {
labels: {
app: appName,
},
name: appName,
namespace: namespace,
},
spec: {
ports: [
{
name: "grpc-tf-serving",
port: 9000,
targetPort: 9000,
},
{
name: "http-tf-serving-proxy",
port: 8000,
targetPort: 8000,
},
],
selector: {
app: appName,
},
type: "ClusterIP",
},
};
local configMap = {
apiVersion: "v1",
kind: "ConfigMap",
metadata: {
name: appName + "fluentd-config",
namespace: namespace,
},
data: {
"fluent.conf": std.format(|||
<source>
@type tail
path /tmp/logs/request.log
pos_file /tmp/logs/request.log.pos
<parse>
@type json
</parse>
tag dummy
</source>
<match dummy>
@type bigquery_insert
auth_method application_default
project %s
dataset %s
table %s
fetch_schema true
</match>
|||, [params.gcpProject, params.dataset, params.table]),
},
};
local deployment = {
apiVersion: "extensions/v1beta1",
kind: "Deployment",
metadata: {
labels: {
app: appName,
},
name: appName,
namespace: namespace,
},
spec: {
template: {
metadata: {
labels: {
app: appName,
},
},
spec: {
containers: [
// ModelServer
{
args: [
"/usr/bin/tensorflow_model_server",
"--port=9000",
"--model_name=" + params.modelName,
"--model_base_path=" + params.modelBasePath,
],
image: image,
imagePullPolicy: "IfNotPresent",
name: "model-server",
ports: [
{
containerPort: 9000,
},
],
resources: {
limits: {
cpu: "4",
memory: "4Gi",
},
requests: {
cpu: "1",
memory: "1Gi",
},
},
},
// Http proxy
{
name: "http-proxy",
image: httpProxyImage,
imagePullPolicy: "Always",
command: [
"python",
"/usr/src/app/server.py",
"--port=8000",
"--rpc_port=9000",
"--rpc_timeout=10.0",
"--log_request=true",
],
env: [],
ports: [
{
containerPort: 8000,
},
],
resources: {
requests: {
memory: "1Gi",
cpu: "1",
},
limits: {
memory: "4Gi",
cpu: "4",
},
},
securityContext: {
runAsUser: 1000,
fsGroup: 1000,
},
volumeMounts: [
{
name: "request-logs",
mountPath: "/tmp/logs",
},
],
},
// TODO(lunkai): use admission controller to inject.
// Logging container.
{
name: "logging",
image: loggingImage,
imagePullPolicy: "Always",
env: [
{ name: "GOOGLE_APPLICATION_CREDENTIALS", value: "/secret/gcp-credentials/key.json" },
],
resources: {
requests: {
memory: "250Mi",
cpu: "0.25",
},
limits: {
memory: "500Mi",
cpu: "0.5",
},
},
volumeMounts: [
{
name: "request-logs",
mountPath: "/tmp/logs",
},
{
name: "gcp-credentials",
mountPath: "/secret/gcp-credentials",
},
{
name: "fluentd-config-volume",
mountPath: "/fluentd/etc/custom",
},
],
},
],
volumes: [
{
name: "gcp-credentials",
secret: {
secretName: gcpSecretName,
},
},
{
name: "request-logs",
emptyDir: {},
},
{
configMap: {
name: "fluentd-config",
},
name: "fluentd-config-volume",
},
],
},
},
},
};
k.core.v1.list.new([
service,
deployment,
configMap,
])

View File

@ -1,112 +0,0 @@
local tfservingService = import "kubeflow/tf-serving/tf-serving-service-template.libsonnet";
local tfserving = import "kubeflow/tf-serving/tf-serving-template.libsonnet";
local params = {
name: "m",
serviceType: "ClusterIP",
modelName: "mnist",
trafficRule: "v1:100",
injectIstio: false,
};
local istioParams = params {
injectIstio: true,
};
local env = {
namespace: "kubeflow",
};
local deploymentParam = {
name: "m",
modelName: "mnist",
versionName: "v1",
modelBasePath: "gs://abc",
numGpus: 0,
defaultCpuImage: "gcr.io/abc",
defaultGpuImage: "gcr.io/abc",
injectIstio: false,
enablePrometheus: true,
};
local gpuParam1 = {
name: "m",
modelName: "mnist",
versionName: "v1",
modelBasePath: "gs://abc",
numGpus: 1,
defaultCpuImage: "gcr.io/abc",
defaultGpuImage: "gcr.io/abc",
injectIstio: false,
enablePrometheus: true,
};
local gpuParamString0 = {
name: "m",
modelName: "mnist",
versionName: "v1",
modelBasePath: "gs://abc",
numGpus: "0",
defaultCpuImage: "gcr.io/abc",
defaultGpuImage: "gcr.io/abc",
injectIstio: false,
enablePrometheus: true,
};
local gpuParamString1 = {
name: "m",
modelName: "mnist",
versionName: "v1",
modelBasePath: "gs://abc",
numGpus: "1",
defaultCpuImage: "gcr.io/abc",
defaultGpuImage: "gcr.io/abc",
injectIstio: false,
enablePrometheus: true,
};
local serviceInstance = tfservingService.new(env, params);
local istioServiceInstance = tfservingService.new(env, istioParams);
local deploymentInstance = tfserving.new(env, deploymentParam);
local gpuInstance = tfserving.new(env, gpuParam1);
local gpuString0Instance = tfserving.new(env, gpuParamString0);
local gpuString1Instance = tfserving.new(env, gpuParamString1);
// This one should only have tfService
std.assertEqual(
std.length(serviceInstance.all.items),
1,
) &&
// This one should have tfService, virtualService, and DestinationRule
std.assertEqual(
std.length(istioServiceInstance.all.items),
3
) &&
std.startsWith(
deploymentInstance.tfDeployment.spec.template.spec.containers[0].args[4],
"--monitoring_config_file"
) &&
std.assertEqual(
deploymentInstance.tfDeployment.spec.template.spec.containers[0].resources.limits,
{ cpu: "4", memory: "4Gi" }
) &&
std.assertEqual(
gpuInstance.tfDeployment.spec.template.spec.containers[0].resources.limits,
{ cpu: "4", memory: "4Gi", "nvidia.com/gpu": 1 }
) &&
std.assertEqual(
gpuString0Instance.tfDeployment.spec.template.spec.containers[0].resources.limits,
{ cpu: "4", memory: "4Gi" }
) &&
std.assertEqual(
gpuString1Instance.tfDeployment.spec.template.spec.containers[0].resources.limits,
{ cpu: "4", memory: "4Gi", "nvidia.com/gpu": 1 }
)

View File

@ -1,147 +0,0 @@
{
local k = import "k.libsonnet",
local util = import "kubeflow/tf-serving/util.libsonnet",
new(_env, _params):: {
local params = _params + _env,
local namespace = params.namespace,
local name = params.name,
local modelName =
if params.modelName == "null" then
params.name
else
params.modelName,
local tfService = {
apiVersion: "v1",
kind: "Service",
metadata: {
labels: {
app: modelName,
},
name: name,
namespace: namespace,
annotations: {
"getambassador.io/config":
std.join("\n", [
"---",
"apiVersion: ambassador/v0",
"kind: Mapping",
"name: tfserving-predict-mapping-" + modelName,
"prefix: /tfserving/models/" + modelName,
"rewrite: /v1/models/" + modelName + ":predict",
"method: POST",
"service: " + name + "." + namespace + ":8500",
"---",
"apiVersion: ambassador/v0",
"kind: Mapping",
"name: tfserving-predict-mapping-" + modelName + "-get",
"prefix: /tfserving/models/" + modelName,
"rewrite: /v1/models/" + modelName,
"method: GET",
"service: " + name + "." + namespace + ":8500",
]),
} + if util.toBool(params.enablePrometheus) then {
"prometheus.io/scrape": "true",
"prometheus.io/path": "/monitoring/prometheus/metrics",
"prometheus.io/port": "8500",
} else {}, //annotations
},
spec: {
ports: [
{
name: "grpc-tf-serving",
port: 9000,
targetPort: 9000,
},
{
name: "http-tf-serving",
port: 8500,
targetPort: 8500,
},
],
selector: {
app: modelName,
},
type: params.serviceType,
},
}, // tfService
tfService:: tfService,
local versionWeights = std.split(params.trafficRule, ","),
local virtualService = {
apiVersion: "networking.istio.io/v1alpha3",
kind: "VirtualService",
metadata: {
name: name,
namespace: namespace,
},
spec: {
hosts: [
"*",
],
gateways: [
"kubeflow-gateway",
],
http: [
{
match: [
{
uri: {
prefix: "/istio/tfserving/models/" + modelName,
},
method: {
exact: "POST",
},
},
],
rewrite: {
uri: "/v1/models/" + modelName + ":predict",
},
route: [
{
destination: {
host: name,
port: {
number: 8500,
},
subset: std.split(versionWeight, ":")[0],
},
weight: std.parseInt(std.split(versionWeight, ":")[1]),
}
for versionWeight in versionWeights
],
},
],
},
},
virtualService:: virtualService,
local destinationRule = {
apiVersion: "networking.istio.io/v1alpha3",
kind: "DestinationRule",
metadata: {
name: name,
namespace: namespace,
},
spec: {
host: name,
subsets: [
{
name: std.split(versionWeight, ":")[0],
labels: {
version: std.split(versionWeight, ":")[0],
},
}
for versionWeight in versionWeights
],
},
},
destinationRule:: destinationRule,
all:: util.list([
tfService,
] + if util.toBool(params.injectIstio) then [
virtualService,
destinationRule,
] else []),
}, // new
}

View File

@ -1,137 +0,0 @@
{
local k = import "k.libsonnet",
local util = import "kubeflow/tf-serving/util.libsonnet",
new(_env, _params):: {
local params = _params + _env,
local namespace = params.namespace,
local name = params.name,
local modelName =
if params.modelName == "null" then
params.name
else
params.modelName,
local versionName = params.versionName,
local numGpus =
if std.type(params.numGpus) == "string" then
std.parseInt(params.numGpus)
else
params.numGpus,
local modelServerImage =
if numGpus == 0 then
params.defaultCpuImage
else
params.defaultGpuImage,
// Optional features.
// TODO(lunkai): Add request logging
local modelServerContainer = {
command: [
"/usr/bin/tensorflow_model_server",
],
args: [
"--port=9000",
"--rest_api_port=8500",
"--model_name=" + modelName,
"--model_base_path=" + params.modelBasePath,
] + if util.toBool(params.enablePrometheus) then [
"--monitoring_config_file=/var/config/monitoring_config.txt",
] else [],
image: modelServerImage,
imagePullPolicy: "IfNotPresent",
name: modelName,
ports: [
{
containerPort: 9000,
},
{
containerPort: 8500,
},
],
env: [],
resources: {
limits: {
cpu: "4",
memory: "4Gi",
} + if numGpus != 0 then {
"nvidia.com/gpu": numGpus,
} else {},
requests: {
cpu: "1",
memory: "1Gi",
},
},
volumeMounts: [
{
mountPath: "/var/config/",
name: "config-volume",
},
],
// TCP liveness probe on gRPC port
livenessProbe: {
tcpSocket: {
port: 9000,
},
initialDelaySeconds: 30,
periodSeconds: 30,
},
}, // modelServerContainer
local tfDeployment = {
apiVersion: "extensions/v1beta1",
kind: "Deployment",
metadata: {
labels: {
app: modelName,
},
name: name,
namespace: namespace,
},
spec: {
template: {
metadata: {
labels: {
app: modelName,
version: versionName,
},
annotations: {
"sidecar.istio.io/inject": if util.toBool(params.injectIstio) then "true",
},
},
spec: {
containers: [
modelServerContainer,
],
volumes: [
{
configMap: {
name: name + "-config",
},
name: "config-volume",
},
],
},
},
},
}, // tfDeployment
tfDeployment:: tfDeployment,
local tfservingConfig = {
apiVersion: "v1",
kind: "ConfigMap",
metadata: {
name: name + "-config",
namespace: namespace,
},
data: {
"monitoring_config.txt": std.join("\n", [
"prometheus_config: {",
" enable: true,",
' path: "/monitoring/prometheus/metrics"',
"}",
]),
},
}, // tfservingConfig
tfservingConfig:: tfservingConfig,
}, // new
}

View File

@ -1,380 +0,0 @@
{
util:: import "kubeflow/tf-serving/util.libsonnet",
// Parameters are intended to be late bound.
params:: {
name: null,
numGpus: 0,
labels: {
app: $.params.name,
},
modelName: $.params.name,
modelPath: null,
modelStorageType: "storageType",
version: "v1",
firstVersion: true,
deployIstio: false,
deployHttpProxy: false,
httpProxyImage: "gcr.io/kubeflow-images-public/tf-model-server-http-proxy:v20180606-9dfda4f2",
serviceType: "ClusterIP",
// If users want to override the image they can override defaultCpuImage and/or defaultGpuImage,
// in which case the image used will still depend on whether GPUs are used or not.
// Users can also override modelServerImage in which case the user supplied value will always be used
// regardless of numGpus.
defaultCpuImage: "tensorflow/serving:1.11.1",
defaultGpuImage: "tensorflow/serving:1.11.1-gpu",
modelServerImage: if $.params.numGpus == 0 then
$.params.defaultCpuImage
else
$.params.defaultGpuImage,
// Whether or not to enable s3 parameters
s3Enable:: false,
// Which storageType to use
storageType:: null,
},
// Parameters specific to GCP.
gcpParams:: {
gcpCredentialSecretName: "",
} + $.params,
// Parameters that control S3 access
// params overrides s3params because params can be overwritten by the user to override the defaults.
s3params:: {
// Name of the k8s secrets containing S3 credentials
s3SecretName: "",
// Name of the key in the k8s secret containing AWS_ACCESS_KEY_ID.
s3SecretAccesskeyidKeyName: "AWS_ACCESS_KEY_ID",
// Name of the key in the k8s secret containing AWS_SECRET_ACCESS_KEY.
s3SecretSecretaccesskeyKeyName: "AWS_SECRET_ACCESS_KEY",
// S3 region
s3AwsRegion: "us-west-1",
// TODO(jlewi): We should use util.toBool to automatically convert to actual boolean values.
// The use of strings is left over from when these were prototype parameters, which only supported string types.
// Whether or not to use https for S3 connections
s3UseHttps: "true",
// Whether or not to verify https certificates for S3 connections
s3VerifySsl: "true",
// URL for your s3-compatible endpoint.
s3Endpoint: "http://s3.us-west-1.amazonaws.com,",
} + $.params,
components:: {
all:: [
// Default routing rule for the first version of model.
if $.util.toBool($.params.deployIstio) && $.util.toBool($.params.firstVersion) then
$.parts.defaultRouteRule,
] +
// TODO(jlewi): It would be better to structure s3 as a mixin.
// As an example it would be great to allow S3 and GCS parameters
// to be enabled simultaneously. This should be doable because
// each entails adding a set of environment variables and volumes
// to the containers. These volumes/environment variables shouldn't
// overlap so there's no reason we shouldn't be able to just add
// both modifications to the base container.
// I think we want to restructure things as mixins so they can just
// be added.
if $.params.s3Enable then
[
$.s3parts.tfService,
$.s3parts.tfDeployment,
]
else if $.params.storageType == "gcp" then
[
$.gcpParts.tfService,
$.gcpParts.tfDeployment,
]
else
[
$.parts.tfService,
$.parts.tfDeployment,
],
}.all,
parts:: {
// We define the containers one level beneath parts because combined with jsonnet late binding
// this makes it easy for users to override specific bits of the container.
tfServingContainerBase:: {
name: $.params.name,
image: $.params.modelServerImage,
imagePullPolicy: "IfNotPresent",
command: [
"/usr/bin/tensorflow_model_server",
],
args: [
"--port=9000",
"--model_name=" + $.params.modelName,
"--model_base_path=" + $.params.modelPath,
],
ports: [
{
containerPort: 9000,
},
],
// TODO(jlewi): We should add readiness and liveness probes. I think the blocker is that
// model-server doesn't have something we can use out of the box.
resources: {
requests: {
memory: "1Gi",
cpu: "1",
},
limits: {
memory: "4Gi",
cpu: "4",
},
},
// This user and group should be defined in the Docker image.
// Per best practices we don't run as the root user.
securityContext: {
runAsUser: 1000,
fsGroup: 1000,
},
volumeMounts+: if $.params.modelStorageType == "nfs" then [{
name: "nfs",
mountPath: "/mnt",
}]
else [],
}, // tfServingContainer
tfServingContainer+: $.parts.tfServingContainerBase +
if $.params.numGpus > 0 then
{
resources+: {
limits+: {
"nvidia.com/gpu": $.params.numGpus,
},
},
}
else {},
tfServingMetadata+: {
labels: $.params.labels { version: $.params.version },
annotations: {
"sidecar.istio.io/inject": if $.util.toBool($.params.deployIstio) then "true",
},
},
httpProxyContainer:: {
name: $.params.name + "-http-proxy",
image: $.params.httpProxyImage,
imagePullPolicy: "IfNotPresent",
command: [
"python",
"/usr/src/app/server.py",
"--port=8000",
"--rpc_port=9000",
"--rpc_timeout=10.0",
],
env: [],
ports: [
{
containerPort: 8000,
},
],
resources: {
requests: {
memory: "500Mi",
cpu: "0.5",
},
limits: {
memory: "1Gi",
cpu: "1",
},
},
securityContext: {
runAsUser: 1000,
fsGroup: 1000,
},
}, // httpProxyContainer
tfDeployment: {
apiVersion: "extensions/v1beta1",
kind: "Deployment",
metadata: {
name: $.params.name + "-" + $.params.version,
namespace: $.params.namespace,
labels: $.params.labels,
},
spec: {
template: {
metadata: $.parts.tfServingMetadata,
spec: {
containers: [
$.parts.tfServingContainer,
if $.util.toBool($.params.deployHttpProxy) then
$.parts.httpProxyContainer,
],
volumes+: if $.params.modelStorageType == "nfs" then
[{
name: "nfs",
persistentVolumeClaim: {
claimName: $.params.nfsPVC,
},
}]
else [],
},
},
},
}, // tfDeployment
tfService: {
apiVersion: "v1",
kind: "Service",
metadata: {
labels: $.params.labels,
name: $.params.name,
namespace: $.params.namespace,
annotations: {
"getambassador.io/config":
std.join("\n", [
"---",
"apiVersion: ambassador/v0",
"kind: Mapping",
"name: tfserving-mapping-" + $.params.name + "-get",
"prefix: /models/" + $.params.name + "/",
"rewrite: /",
"method: GET",
"service: " + $.params.name + "." + $.params.namespace + ":8000",
"---",
"apiVersion: ambassador/v0",
"kind: Mapping",
"name: tfserving-mapping-" + $.params.name + "-post",
"prefix: /models/" + $.params.name + "/",
"rewrite: /model/" + $.params.name + ":predict",
"method: POST",
"service: " + $.params.name + "." + $.params.namespace + ":8000",
]),
}, //annotations
},
spec: {
ports: [
{
name: "grpc-tf-serving",
port: 9000,
targetPort: 9000,
},
{
name: "http-tf-serving-proxy",
port: 8000,
targetPort: 8000,
},
],
selector: $.params.labels,
type: $.params.serviceType,
},
}, // tfService
defaultRouteRule: {
apiVersion: "config.istio.io/v1alpha2",
kind: "RouteRule",
metadata: {
name: $.params.name + "-default",
namespace: $.params.namespace,
},
spec: {
destination: {
name: $.params.name,
},
precedence: 0,
route: [
{
labels: { version: $.params.version },
},
],
},
},
}, // parts
// Parts specific to S3
s3parts:: $.parts {
s3Env:: [
{ name: "AWS_ACCESS_KEY_ID", valueFrom: { secretKeyRef: { name: $.s3params.s3SecretName, key: $.s3params.s3SecretAccesskeyidKeyName } } },
{ name: "AWS_SECRET_ACCESS_KEY", valueFrom: { secretKeyRef: { name: $.s3params.s3SecretName, key: $.s3params.s3SecretSecretaccesskeyKeyName } } },
{ name: "AWS_REGION", value: $.s3params.s3AwsRegion },
{ name: "S3_REGION", value: $.s3params.s3AwsRegion },
{ name: "S3_USE_HTTPS", value: $.s3params.s3UseHttps },
{ name: "S3_VERIFY_SSL", value: $.s3params.s3VerifySsl },
{ name: "S3_ENDPOINT", value: $.s3params.s3Endpoint },
],
tfServingContainer: $.parts.tfServingContainer {
env+: $.s3parts.s3Env,
},
tfDeployment: $.parts.tfDeployment {
spec: +{
template: +{
metadata: $.parts.tfServingMetadata,
spec: +{
containers: [
$.s3parts.tfServingContainer,
if $.util.toBool($.params.deployHttpProxy) then
$.parts.httpProxyContainer,
],
},
},
},
}, // tfDeployment
}, // s3parts
// Parts specific to GCP
gcpParts:: $.parts {
gcpEnv:: [
if $.gcpParams.gcpCredentialSecretName != "" then
{ name: "GOOGLE_APPLICATION_CREDENTIALS", value: "/secret/gcp-credentials/user-gcp-sa.json" },
],
tfServingContainer: $.parts.tfServingContainer {
env+: $.gcpParts.gcpEnv,
volumeMounts+: [
if $.gcpParams.gcpCredentialSecretName != "" then
{
name: "gcp-credentials",
mountPath: "/secret/gcp-credentials",
},
],
},
tfDeployment: $.parts.tfDeployment {
spec+: {
template+: {
metadata: $.parts.tfServingMetadata,
spec+: {
containers: [
$.gcpParts.tfServingContainer,
if $.util.toBool($.params.deployHttpProxy) then
$.parts.httpProxyContainer,
],
volumes: [
if $.gcpParams.gcpCredentialSecretName != "" then
{
name: "gcp-credentials",
secret: {
secretName: $.gcpParams.gcpCredentialSecretName,
},
},
],
},
},
},
}, // tfDeployment
}, // gcpParts
}

View File

@ -1,21 +0,0 @@
// Some useful routines.
{
local k = import "k.libsonnet",
// Convert non-boolean types like string,number to a boolean.
// This is primarily intended for dealing with parameters that should be booleans.
toBool:: function(x) {
result::
if std.type(x) == "boolean" then
x
else if std.type(x) == "string" then
std.asciiUpper(x) == "TRUE"
else if std.type(x) == "number" then
x != 0
else
false,
}.result,
// Produce a list of manifests. obj must be an array
list(obj):: k.core.v1.list.new(obj,),
}

View File

@ -0,0 +1,17 @@
- op: add
path: /spec/template/spec/containers/0/volumeMounts
value:
- mountPath: $(secretMountPath)
name: user-gcp-sa
readOnly: true
- op: add
path: /spec/template/spec/volumes
value:
- name: user-gcp-sa
secret:
secretName: $(secretName)
- op: add
path: /spec/template/spec/containers/0/env/-
value:
name: GOOGLE_APPLICATION_CREDENTIALS
value: $(GOOGLE_APPLICATION_CREDENTIALS)

View File

@ -0,0 +1,39 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
bases:
- ../base
configurations:
- params.yaml
vars:
- fieldref:
fieldPath: data.GOOGLE_APPLICATION_CREDENTIALS
name: GOOGLE_APPLICATION_CREDENTIALS
objref:
apiVersion: v1
kind: ConfigMap
name: mnist-map-monitoring
- fieldref:
fieldPath: data.secretName
name: secretName
objref:
apiVersion: v1
kind: ConfigMap
name: mnist-map-monitoring
- fieldref:
fieldPath: data.secretMountPath
name: secretMountPath
objref:
apiVersion: v1
kind: ConfigMap
name: mnist-map-monitoring
patchesJson6902:
- path: deployment_patch.yaml
target:
group: apps
kind: Deployment
name: tensorboard-tb
version: v1beta1

View File

@ -0,0 +1,5 @@
varReference:
- path: spec/template/spec/volumes/secret/secretName
kind: Deployment
- path: spec/template/spec/containers/volumeMounts/mountPath
kind: Deployment

View File

@ -0,0 +1,46 @@
- op: add
path: /spec/template/spec/containers/0/env/-
value:
name: S3_ENDPOINT
value: $(S3_ENDPOINT)
- op: add
path: /spec/template/spec/containers/0/env/-
value:
name: AWS_ENDPOINT_URL
value: $(AWS_ENDPOINT_URL)
- op: add
path: /spec/template/spec/containers/0/env/-
value:
name: AWS_REGION
value: $(AWS_REGION)
- op: add
path: /spec/template/spec/containers/0/env/-
value:
name: BUCKET_NAME
value: $(BUCKET_NAME)
- op: add
path: /spec/template/spec/containers/0/env/-
value:
name: S3_USE_HTTPS
value: $(S3_USE_HTTPS)
- op: add
path: /spec/template/spec/containers/0/env/-
value:
name: S3_VERIFY_SSL
value: $(S3_VERIFY_SSL)
- op: add
path: /spec/template/spec/containers/0/env/-
value:
name: AWS_ACCESS_KEY_ID
valueFrom:
secretKeyRef:
key: $(awsAccessKeyIDName)
name: $(awsSecretName)
- op: add
path: /spec/template/spec/containers/0/env/-
value:
name: AWS_SECRET_ACCESS_KEY
valueFrom:
secretKeyRef:
key: $(awsSecretAccessKeyName)
name: $(awsSecretName)

View File

@ -0,0 +1,81 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
bases:
- ../base
configurations:
- params.yaml
vars:
- fieldref:
fieldPath: data.S3_ENDPOINT
name: S3_ENDPOINT
objref:
apiVersion: v1
kind: ConfigMap
name: mnist-map-monitoring
- fieldref:
fieldPath: data.AWS_ENDPOINT_URL
name: AWS_ENDPOINT_URL
objref:
apiVersion: v1
kind: ConfigMap
name: mnist-map-monitoring
- fieldref:
fieldPath: data.AWS_REGION
name: AWS_REGION
objref:
apiVersion: v1
kind: ConfigMap
name: mnist-map-monitoring
- fieldref:
fieldPath: data.BUCKET_NAME
name: BUCKET_NAME
objref:
apiVersion: v1
kind: ConfigMap
name: mnist-map-monitoring
- fieldref:
fieldPath: data.S3_USE_HTTPS
name: S3_USE_HTTPS
objref:
apiVersion: v1
kind: ConfigMap
name: mnist-map-monitoring
- fieldref:
fieldPath: data.S3_VERIFY_SSL
name: S3_VERIFY_SSL
objref:
apiVersion: v1
kind: ConfigMap
name: mnist-map-monitoring
- fieldref:
fieldPath: data.awsSecretName
name: awsSecretName
objref:
apiVersion: v1
kind: ConfigMap
name: mnist-map-monitoring
- fieldref:
fieldPath: data.awsAccessKeyIDName
name: awsAccessKeyIDName
objref:
apiVersion: v1
kind: ConfigMap
name: mnist-map-monitoring
- fieldref:
fieldPath: data.awsSecretAccessKeyName
name: awsSecretAccessKeyName
objref:
apiVersion: v1
kind: ConfigMap
name: mnist-map-monitoring
patchesJson6902:
- path: deployment_patch.yaml
target:
group: apps
kind: Deployment
name: tensorboard-tb
version: v1beta1

View File

@ -0,0 +1,5 @@
varReference:
- path: spec/template/spec/containers/env/valueFrom/secretKeyRef/name
kind: Deployment
- path: spec/template/spec/containers/env/valueFrom/secretKeyRef/key
kind: Deployment

View File

@ -0,0 +1,27 @@
apiVersion: apps/v1beta1
kind: Deployment
metadata:
name: tensorboard-tb
namespace: kubeflow
spec:
replicas: 1
template:
metadata:
labels:
app: tensorboard
tb-job: tensorboard
name: tensorboard
namespace: kubeflow
spec:
containers:
- command:
- /usr/local/bin/tensorboard
- --logdir=$(logDir)
- --port=80
env:
- name: logDir
value: $(logDir)
image: tensorflow/tensorflow:1.11.0
name: tensorboard
ports:
- containerPort: 80

View File

@ -0,0 +1,23 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- deployment.yaml
- service.yaml
namespace: kubeflow
generatorOptions:
disableNameSuffixHash: true
configurations:
- params.yaml
vars:
- fieldref:
fieldPath: data.logDir
name: logDir
objref:
apiVersion: v1
kind: ConfigMap
name: mnist-map-monitoring

View File

@ -0,0 +1,3 @@
varReference:
- path: spec/template/spec/containers/env/value
kind: Deployment

View File

@ -0,0 +1,29 @@
apiVersion: v1
kind: Service
metadata:
annotations:
getambassador.io/config: |-
---
apiVersion: ambassador/v0
kind: Mapping
name: tensorboard_mapping
prefix: /kubeflow/tensorboard/mnist
rewrite: /
service: tensorboard-tb.kubeflow
---
apiVersion: ambassador/v0
kind: Mapping
name: tensorboard_mapping_data
prefix: /kubeflow/tensorboard/mnist/data/
rewrite: /data/
service: tensorboard-tb.kubeflow
name: tensorboard-tb
namespace: kubeflow
spec:
ports:
- name: http
port: 80
targetPort: 80
selector:
app: tensorboard
tb-job: tensorboard

View File

@ -0,0 +1,17 @@
- op: add
path: /spec/template/spec/containers/0/volumeMounts/-
value:
mountPath: /secret/gcp-credentials
name: user-gcp-sa
readOnly: true
- op: add
path: /spec/template/spec/volumes/-
value:
name: user-gcp-sa
secret:
secretName: user-gcp-sa
- op: add
path: /spec/template/spec/containers/0/env/-
value:
name: GOOGLE_APPLICATION_CREDENTIALS
value: /secret/gcp-credentials/user-gcp-sa.json

View File

@ -0,0 +1,13 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
bases:
- ../base
patchesJson6902:
- path: deployment_patch.yaml
target:
group: extensions
kind: Deployment
name: $(svcName)
version: v1beta1

View File

@ -0,0 +1,51 @@
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
labels:
app: mnist
name: $(svcName)
namespace: kubeflow
spec:
template:
metadata:
labels:
app: mnist
version: v1
spec:
containers:
- args:
- --port=9000
- --rest_api_port=8500
- --model_name=mnist
- --model_base_path=$(modelBasePath)
- --monitoring_config_file=/var/config/monitoring_config.txt
command:
- /usr/bin/tensorflow_model_server
env:
- name: modelBasePath
value: $(modelBasePath)
image: tensorflow/serving:1.11.1
imagePullPolicy: IfNotPresent
livenessProbe:
initialDelaySeconds: 30
periodSeconds: 30
tcpSocket:
port: 9000
name: mnist
ports:
- containerPort: 9000
- containerPort: 8500
resources:
limits:
cpu: "4"
memory: 4Gi
requests:
cpu: "1"
memory: 1Gi
volumeMounts:
- mountPath: /var/config/
name: config-volume
volumes:
- configMap:
name: mnist-deploy-config
name: config-volume

View File

@ -0,0 +1,31 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- deployment.yaml
- mnist-deploy-config.yaml
- service.yaml
namespace: kubeflow
generatorOptions:
disableNameSuffixHash: true
configurations:
- params.yaml
vars:
- fieldref:
fieldPath: data.name
name: svcName
objref:
apiVersion: v1
kind: ConfigMap
name: mnist-map-serving
- fieldref:
fieldPath: data.modelBasePath
name: modelBasePath
objref:
apiVersion: v1
kind: ConfigMap
name: mnist-map-serving

View File

@ -0,0 +1,11 @@
apiVersion: v1
data:
monitoring_config.txt: |-
prometheus_config: {
enable: true,
path: "/monitoring/prometheus/metrics"
}
kind: ConfigMap
metadata:
name: mnist-deploy-config
namespace: kubeflow

View File

@ -0,0 +1,7 @@
varReference:
- path: spec/template/spec/containers/env/value
kind: Deployment
- path: metadata/name
kind: Service
- path: metadata/name
kind: Deployment

View File

@ -0,0 +1,39 @@
apiVersion: v1
kind: Service
metadata:
annotations:
getambassador.io/config: |-
---
apiVersion: ambassador/v0
kind: Mapping
name: tfserving-predict-mapping-mnist
prefix: /tfserving/models/mnist
rewrite: /v1/models/mnist:predict
method: POST
service: mnist-service.kubeflow:8500
---
apiVersion: ambassador/v0
kind: Mapping
name: tfserving-predict-mapping-mnist-get
prefix: /tfserving/models/mnist
rewrite: /v1/models/mnist
method: GET
service: mnist-service.kubeflow:8500
prometheus.io/path: /monitoring/prometheus/metrics
prometheus.io/port: "8500"
prometheus.io/scrape: "true"
labels:
app: mnist
name: $(svcName)
namespace: kubeflow
spec:
ports:
- name: grpc-tf-serving
port: 9000
targetPort: 9000
- name: http-tf-serving
port: 8500
targetPort: 8500
selector:
app: mnist
type: ClusterIP

View File

@ -0,0 +1,12 @@
- op: add
path: /spec/template/spec/containers/0/volumeMounts/-
value:
mountPath: $(pvcMountPath)
name: local-storage
- op: add
path: /spec/template/spec/volumes/-
value:
name: local-storage
persistentVolumeClaim:
claimName: $(pvcName)

View File

@ -0,0 +1,32 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
bases:
- ../base
vars:
- fieldref:
fieldPath: data.pvcName
name: pvcName
objref:
apiVersion: v1
kind: ConfigMap
name: mnist-map-serving
- fieldref:
fieldPath: data.pvcMountPath
name: pvcMountPath
objref:
apiVersion: v1
kind: ConfigMap
name: mnist-map-serving
configurations:
- params.yaml
patchesJson6902:
- path: deployment_patch.yaml
target:
group: extensions
kind: Deployment
name: $(svcName)
version: v1beta1

View File

@ -0,0 +1,5 @@
varReference:
- path: spec/template/spec/volumes/persistentVolumeClaim/claimName
kind: Deployment
- path: spec/template/spec/containers/volumeMounts/mountPath
kind: Deployment

View File

@ -21,11 +21,11 @@ Manually running the test
import logging
import os
import subprocess
from kubernetes import client as k8s_client
from kubeflow.tf_operator import test_runner #pylint: disable=no-name-in-module
from kubeflow.testing import ks_util
from kubeflow.testing import test_util
from kubeflow.testing import util
@ -38,14 +38,13 @@ class MnistDeployTest(test_util.TestCase):
if not self.app_dir:
self.app_dir = os.path.join(os.path.dirname(__file__), "..",
"ks_app")
"serving/GCS")
self.app_dir = os.path.abspath(self.app_dir)
logging.info("--app_dir not set defaulting to: %s", self.app_dir)
self.env = env
self.namespace = namespace
self.params = args.params
self.ks_cmd = ks_util.get_ksonnet_cmd(self.app_dir)
super(MnistDeployTest, self).__init__(class_name="MnistDeployTest",
name=name)
@ -55,16 +54,26 @@ class MnistDeployTest(test_util.TestCase):
# same name.
api_client = k8s_client.ApiClient()
# TODO (jinchihe) The below code will be removed once a new test-worker image
# is published per https://github.com/kubeflow/testing/issues/373.
kusUrl = 'https://github.com/kubernetes-sigs/kustomize/' \
'releases/download/v2.0.3/kustomize_2.0.3_linux_amd64'
util.run(['wget', '-O', '/usr/local/bin/kustomize', kusUrl], cwd=self.app_dir)
util.run(['chmod', 'a+x', '/usr/local/bin/kustomize'], cwd=self.app_dir)
# Apply the components
for component in ["mnist-deploy-gcp", "mnist-service"]:
# Setup the ksonnet app
ks_util.setup_ks_app(self.app_dir, self.env, self.namespace, component,
self.params)
configmap = 'mnist-map-serving'
for pair in self.params.split(","):
k, v = pair.split("=", 1)
if k == "namespace":
util.run(['kustomize', 'edit', 'set', k, v], cwd=self.app_dir)
else:
util.run(['kustomize', 'edit', 'add', 'configmap', configmap,
'--from-literal=' + k + '=' + v], cwd=self.app_dir)
util.run([self.ks_cmd, "apply", self.env, "-c", component],
cwd=self.app_dir)
logging.info("Created deployment %s in namespaces %s", self.name, self.namespace)
# It seems util.run cannot handle pipes, so use subprocess.check_call instead.
subCmd = 'kustomize build ' + self.app_dir + '| kubectl apply -f -'
subprocess.check_call(subCmd, shell=True)
util.wait_for_deployment(api_client, self.namespace, self.name,
timeout_minutes=4)
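
Note: the workflow the test automates can also be run by hand. A minimal sketch (the overlay directory serving/GCS and the configmap mnist-map-serving come from the test above; the literal values are only illustrative):

cd serving/GCS   # the app_dir the test defaults to, relative to the mnist example
# Record parameters as configmap literals so kustomize can resolve the $(...) vars.
kustomize edit add configmap mnist-map-serving --from-literal=name=mnist-service
kustomize edit add configmap mnist-map-serving --from-literal=modelBasePath=gs://my-bucket/export
kustomize edit set namespace kubeflow
# Render and apply the manifests, mirroring the piped check_call in the test.
kustomize build . | kubectl apply -f -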

View File

@ -1,6 +1,6 @@
"""Test training using TFJob.
This file tests that we can submit the job from ksonnet
This file tests that we can submit the job
and that the job runs to completion.
It is an integration test as it depends on having access to
@ -20,18 +20,18 @@ Manually running the test
3. To test a new image set the parameter image e.g
--params=name=${NAME},namespace=${NAMESPACE},image=${IMAGE}
4. To control how long it trains set sample_size and num_epochs
--params=numTrainSteps=10,batchSize=10,...
--params=trainSteps=10,batchSize=10,...
"""
import json
import logging
import os
import subprocess
from kubernetes import client as k8s_client
from kubeflow.tf_operator import tf_job_client #pylint: disable=no-name-in-module
from kubeflow.tf_operator import test_runner #pylint: disable=no-name-in-module
from kubeflow.testing import ks_util
from kubeflow.testing import test_util
from kubeflow.testing import util
@ -42,14 +42,13 @@ class TFJobTest(test_util.TestCase):
if not self.app_dir:
self.app_dir = os.path.join(os.path.dirname(__file__), "..",
"ks_app")
"training/GCS")
self.app_dir = os.path.abspath(self.app_dir)
logging.info("--app_dir not set defaulting to: %s", self.app_dir)
self.env = env
self.namespace = namespace
self.params = args.params
self.ks_cmd = ks_util.get_ksonnet_cmd(self.app_dir)
super(TFJobTest, self).__init__(class_name="TFJobTest", name=name)
def test_train(self):
@ -58,15 +57,43 @@ class TFJobTest(test_util.TestCase):
# same name.
api_client = k8s_client.ApiClient()
component = "train"
# Setup the ksonnet app
ks_util.setup_ks_app(self.app_dir, self.env, self.namespace, component,
self.params)
# TODO (jinchihe) The below code will be removed once a new test-worker image
# is published per https://github.com/kubeflow/testing/issues/373.
kusUrl = 'https://github.com/kubernetes-sigs/kustomize/' \
'releases/download/v2.0.3/kustomize_2.0.3_linux_amd64'
util.run(['wget', '-O', '/usr/local/bin/kustomize', kusUrl], cwd=self.app_dir)
util.run(['chmod', 'a+x', '/usr/local/bin/kustomize'], cwd=self.app_dir)
# Set up parameters for kustomize
configmap = 'mnist-map-training'
for pair in self.params.split(","):
k, v = pair.split("=", 1)
if k == "namespace":
util.run(['kustomize', 'edit', 'set', k, v], cwd=self.app_dir)
elif k == "image":
util.run(['kustomize', 'edit', 'set', k, 'training-image=' + v], cwd=self.app_dir)
elif k == "numPs":
util.run(['../base/definition.sh', '--numPs', v], cwd=self.app_dir)
elif k == "numWorkers":
util.run(['../base/definition.sh', '--numWorkers', v], cwd=self.app_dir)
elif k == "secret":
secretName, secretMountPath = v.split("=", 1)
util.run(['kustomize', 'edit', 'add', 'configmap', configmap,
'--from-literal=secretName=' + secretName], cwd=self.app_dir)
util.run(['kustomize', 'edit', 'add', 'configmap', configmap,
'--from-literal=secretMountPath=' + secretMountPath], cwd=self.app_dir)
elif k == "envVariables":
var_k, var_v = v.split("=", 1)
util.run(['kustomize', 'edit', 'add', 'configmap', configmap,
'--from-literal=' + var_k + '=' + var_v], cwd=self.app_dir)
else:
util.run(['kustomize', 'edit', 'add', 'configmap', configmap,
'--from-literal=' + k + '=' + v], cwd=self.app_dir)
# Create the TF job
util.run([self.ks_cmd, "apply", self.env, "-c", component],
cwd=self.app_dir)
# It seems util.run cannot handle pipes, so use subprocess.check_call instead.
subCmd = 'kustomize build ' + self.app_dir + '| kubectl apply -f -'
subprocess.check_call(subCmd, shell=True)
logging.info("Created job %s in namespaces %s", self.name, self.namespace)
# Wait for the job to complete.
@ -89,6 +116,21 @@ class TFJobTest(test_util.TestCase):
self.failure = "Job {0} in namespace {1} in status {2}".format( # pylint: disable=attribute-defined-outside-init
self.name, self.namespace, results.get("status", {}))
logging.error(self.failure)
# if the TFJob failed, print out the pod logs for debugging.
pod_names = tf_job_client.get_pod_names(
api_client, self.namespace, self.name)
logging.info("The Pods name:\n %s", pod_names)
core_api = k8s_client.CoreV1Api(api_client)
for pod in pod_names:
logging.info("Getting logs of Pod %s.", pod)
try:
pod_logs = core_api.read_namespaced_pod_log(pod, self.namespace)
logging.info("The logs of Pod %s log:\n %s", pod, pod_logs)
except k8s_client.rest.ApiException as e:
logging.info("Exception when calling CoreV1Api->read_namespaced_pod_log: %s\n", e)
return
# We don't delete the jobs. We rely on TTLSecondsAfterFinished

View File

@ -0,0 +1,17 @@
- op: add
path: /spec/tfReplicaSpecs/Chief/template/spec/containers/0/volumeMounts
value:
- mountPath: $(secretMountPath)
name: user-gcp-sa
readOnly: true
- op: add
path: /spec/tfReplicaSpecs/Chief/template/spec/volumes
value:
- name: user-gcp-sa
secret:
secretName: $(secretName)
- op: add
path: /spec/tfReplicaSpecs/Chief/template/spec/containers/0/env/-
value:
name: GOOGLE_APPLICATION_CREDENTIALS
value: $(GOOGLE_APPLICATION_CREDENTIALS)

View File

@ -0,0 +1,17 @@
- op: add
path: /spec/tfReplicaSpecs/Ps/template/spec/containers/0/volumeMounts
value:
- mountPath: $(secretMountPath)
name: user-gcp-sa
readOnly: true
- op: add
path: /spec/tfReplicaSpecs/Ps/template/spec/volumes
value:
- name: user-gcp-sa
secret:
secretName: $(secretName)
- op: add
path: /spec/tfReplicaSpecs/Ps/template/spec/containers/0/env/-
value:
name: GOOGLE_APPLICATION_CREDENTIALS
value: $(GOOGLE_APPLICATION_CREDENTIALS)

View File

@ -0,0 +1,17 @@
- op: add
path: /spec/tfReplicaSpecs/Worker/template/spec/containers/0/volumeMounts
value:
- mountPath: $(secretMountPath)
name: user-gcp-sa
readOnly: true
- op: add
path: /spec/tfReplicaSpecs/Worker/template/spec/volumes
value:
- name: user-gcp-sa
secret:
secretName: $(secretName)
- op: add
path: /spec/tfReplicaSpecs/Worker/template/spec/containers/0/env/-
value:
name: GOOGLE_APPLICATION_CREDENTIALS
value: $(GOOGLE_APPLICATION_CREDENTIALS)

View File

@ -0,0 +1,48 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
bases:
- ../base
configurations:
- params.yaml
# TBD (jinchihe) Need to move the image to the base file once
# this issue is addressed: kubernetes-sigs/kustomize/issues/1040
# TBD (jinchihe) Need to update the image once
# this issue is addressed: kubeflow/testing/issues/373
images:
- name: training-image
newName: gcr.io/kubeflow-examples/mnist/model
newTag: v20190111-v0.2-148-g313770f
vars:
- fieldref:
fieldPath: data.GOOGLE_APPLICATION_CREDENTIALS
name: GOOGLE_APPLICATION_CREDENTIALS
objref:
apiVersion: v1
kind: ConfigMap
name: mnist-map-training
- fieldref:
fieldPath: data.secretName
name: secretName
objref:
apiVersion: v1
kind: ConfigMap
name: mnist-map-training
- fieldref:
fieldPath: data.secretMountPath
name: secretMountPath
objref:
apiVersion: v1
kind: ConfigMap
name: mnist-map-training
patchesJson6902:
- path: Chief_patch.yaml
target:
group: kubeflow.org
kind: TFJob
name: $(trainingName)
version: v1beta1

View File

@ -0,0 +1,15 @@
varReference:
- path: metadata/name
kind: TFJob
- path: spec/tfReplicaSpecs/Chief/template/spec/volumes/secret/secretName
kind: TFJob
- path: spec/tfReplicaSpecs/Chief/template/spec/containers/volumeMounts/mountPath
kind: TFJob
- path: spec/tfReplicaSpecs/Worker/template/spec/volumes/secret/secretName
kind: TFJob
- path: spec/tfReplicaSpecs/Worker/template/spec/containers/volumeMounts/mountPath
kind: TFJob
- path: spec/tfReplicaSpecs/Ps/template/spec/volumes/secret/secretName
kind: TFJob
- path: spec/tfReplicaSpecs/Ps/template/spec/containers/volumeMounts/mountPath
kind: TFJob

View File

@ -0,0 +1,46 @@
- op: add
path: /spec/tfReplicaSpecs/Chief/template/spec/containers/0/env/-
value:
name: S3_ENDPOINT
value: $(S3_ENDPOINT)
- op: add
path: /spec/tfReplicaSpecs/Chief/template/spec/containers/0/env/-
value:
name: AWS_ENDPOINT_URL
value: $(AWS_ENDPOINT_URL)
- op: add
path: /spec/tfReplicaSpecs/Chief/template/spec/containers/0/env/-
value:
name: AWS_REGION
value: $(AWS_REGION)
- op: add
path: /spec/tfReplicaSpecs/Chief/template/spec/containers/0/env/-
value:
name: BUCKET_NAME
value: $(BUCKET_NAME)
- op: add
path: /spec/tfReplicaSpecs/Chief/template/spec/containers/0/env/-
value:
name: S3_USE_HTTPS
value: $(S3_USE_HTTPS)
- op: add
path: /spec/tfReplicaSpecs/Chief/template/spec/containers/0/env/-
value:
name: S3_VERIFY_SSL
value: $(S3_VERIFY_SSL)
- op: add
path: /spec/tfReplicaSpecs/Chief/template/spec/containers/0/env/-
value:
name: AWS_ACCESS_KEY_ID
valueFrom:
secretKeyRef:
key: $(awsAccessKeyIDName)
name: $(awsSecretName)
- op: add
path: /spec/tfReplicaSpecs/Chief/template/spec/containers/0/env/-
value:
name: AWS_SECRET_ACCESS_KEY
valueFrom:
secretKeyRef:
key: $(awsSecretAccessKeyName)
name: $(awsSecretName)

View File

@ -0,0 +1,46 @@
- op: add
path: /spec/tfReplicaSpecs/Ps/template/spec/containers/0/env/-
value:
name: S3_ENDPOINT
value: $(S3_ENDPOINT)
- op: add
path: /spec/tfReplicaSpecs/Ps/template/spec/containers/0/env/-
value:
name: AWS_ENDPOINT_URL
value: $(AWS_ENDPOINT_URL)
- op: add
path: /spec/tfReplicaSpecs/Ps/template/spec/containers/0/env/-
value:
name: AWS_REGION
value: $(AWS_REGION)
- op: add
path: /spec/tfReplicaSpecs/Ps/template/spec/containers/0/env/-
value:
name: BUCKET_NAME
value: $(BUCKET_NAME)
- op: add
path: /spec/tfReplicaSpecs/Ps/template/spec/containers/0/env/-
value:
name: S3_USE_HTTPS
value: $(S3_USE_HTTPS)
- op: add
path: /spec/tfReplicaSpecs/Ps/template/spec/containers/0/env/-
value:
name: S3_VERIFY_SSL
value: $(S3_VERIFY_SSL)
- op: add
path: /spec/tfReplicaSpecs/Ps/template/spec/containers/0/env/-
value:
name: AWS_ACCESS_KEY_ID
valueFrom:
secretKeyRef:
key: $(awsAccessKeyIDName)
name: $(awsSecretName)
- op: add
path: /spec/tfReplicaSpecs/Ps/template/spec/containers/0/env/-
value:
name: AWS_SECRET_ACCESS_KEY
valueFrom:
secretKeyRef:
key: $(awsSecretAccessKeyName)
name: $(awsSecretName)

View File

@ -0,0 +1,46 @@
- op: add
path: /spec/tfReplicaSpecs/Worker/template/spec/containers/0/env/-
value:
name: S3_ENDPOINT
value: $(S3_ENDPOINT)
- op: add
path: /spec/tfReplicaSpecs/Worker/template/spec/containers/0/env/-
value:
name: AWS_ENDPOINT_URL
value: $(AWS_ENDPOINT_URL)
- op: add
path: /spec/tfReplicaSpecs/Worker/template/spec/containers/0/env/-
value:
name: AWS_REGION
value: $(AWS_REGION)
- op: add
path: /spec/tfReplicaSpecs/Worker/template/spec/containers/0/env/-
value:
name: BUCKET_NAME
value: $(BUCKET_NAME)
- op: add
path: /spec/tfReplicaSpecs/Worker/template/spec/containers/0/env/-
value:
name: S3_USE_HTTPS
value: $(S3_USE_HTTPS)
- op: add
path: /spec/tfReplicaSpecs/Worker/template/spec/containers/0/env/-
value:
name: S3_VERIFY_SSL
value: $(S3_VERIFY_SSL)
- op: add
path: /spec/tfReplicaSpecs/Worker/template/spec/containers/0/env/-
value:
name: AWS_ACCESS_KEY_ID
valueFrom:
secretKeyRef:
key: $(awsAccessKeyIDName)
name: $(awsSecretName)
- op: add
path: /spec/tfReplicaSpecs/Worker/template/spec/containers/0/env/-
value:
name: AWS_SECRET_ACCESS_KEY
valueFrom:
secretKeyRef:
key: $(awsSecretAccessKeyName)
name: $(awsSecretName)

View File

@ -0,0 +1,90 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
bases:
- ../base
configurations:
- params.yaml
# TBD (jinchihe) Need to move the image to the base file once
# this issue is addressed: kubernetes-sigs/kustomize/issues/1040
# TBD (jinchihe) Need to update the image once
# this issue is addressed: kubeflow/testing/issues/373
images:
- name: training-image
newName: gcr.io/kubeflow-examples/mnist/model
newTag: v20190111-v0.2-148-g313770f
vars:
- fieldref:
fieldPath: data.S3_ENDPOINT
name: S3_ENDPOINT
objref:
apiVersion: v1
kind: ConfigMap
name: mnist-map-training
- fieldref:
fieldPath: data.AWS_ENDPOINT_URL
name: AWS_ENDPOINT_URL
objref:
apiVersion: v1
kind: ConfigMap
name: mnist-map-training
- fieldref:
fieldPath: data.AWS_REGION
name: AWS_REGION
objref:
apiVersion: v1
kind: ConfigMap
name: mnist-map-training
- fieldref:
fieldPath: data.BUCKET_NAME
name: BUCKET_NAME
objref:
apiVersion: v1
kind: ConfigMap
name: mnist-map-training
- fieldref:
fieldPath: data.S3_USE_HTTPS
name: S3_USE_HTTPS
objref:
apiVersion: v1
kind: ConfigMap
name: mnist-map-training
- fieldref:
fieldPath: data.S3_VERIFY_SSL
name: S3_VERIFY_SSL
objref:
apiVersion: v1
kind: ConfigMap
name: mnist-map-training
- fieldref:
fieldPath: data.awsSecretName
name: awsSecretName
objref:
apiVersion: v1
kind: ConfigMap
name: mnist-map-training
- fieldref:
fieldPath: data.awsAccessKeyIDName
name: awsAccessKeyIDName
objref:
apiVersion: v1
kind: ConfigMap
name: mnist-map-training
- fieldref:
fieldPath: data.awsSecretAccessKeyName
name: awsSecretAccessKeyName
objref:
apiVersion: v1
kind: ConfigMap
name: mnist-map-training
patchesJson6902:
- path: Chief_patch.yaml
target:
group: kubeflow.org
kind: TFJob
name: $(trainingName)
version: v1beta1

View File

@ -0,0 +1,9 @@
varReference:
- path: metadata/name
kind: TFJob
- path: metadata/name
kind: TFJob
- path: spec/tfReplicaSpecs/Chief/template/spec/containers/env/valueFrom/secretKeyRef/name
kind: TFJob
- path: spec/tfReplicaSpecs/Chief/template/spec/containers/env/valueFrom/secretKeyRef/key
kind: TFJob

View File

@ -0,0 +1,35 @@
apiVersion: kubeflow.org/v1beta1
kind: TFJob
metadata:
name: $(trainingName)
namespace: kubeflow
spec:
tfReplicaSpecs:
Chief:
replicas: 1
template:
spec:
containers:
- name: tensorflow
command:
- /usr/bin/python
- /opt/model.py
- --tf-model-dir=$(modelDir)
- --tf-export-dir=$(exportDir)
- --tf-train-steps=$(trainSteps)
- --tf-batch-size=$(batchSize)
- --tf-learning-rate=$(learningRate)
env:
- name: modelDir
value: $(modelDir)
- name: exportDir
value: $(exportDir)
- name: trainSteps
value: $(trainSteps)
- name: batchSize
value: $(batchSize)
- name: learningRate
value: $(learningRate)
image: training-image
workingDir: /opt
restartPolicy: OnFailure

View File

@ -0,0 +1,35 @@
apiVersion: kubeflow.org/v1beta1
kind: TFJob
metadata:
name: $(trainingName)
namespace: kubeflow
spec:
tfReplicaSpecs:
Ps:
replicas: %numPs%
template:
spec:
containers:
- name: tensorflow
command:
- /usr/bin/python
- /opt/model.py
- --tf-model-dir=$(modelDir)
- --tf-export-dir=$(exportDir)
- --tf-train-steps=$(trainSteps)
- --tf-batch-size=$(batchSize)
- --tf-learning-rate=$(learningRate)
env:
- name: modelDir
value: $(modelDir)
- name: exportDir
value: $(exportDir)
- name: trainSteps
value: $(trainSteps)
- name: batchSize
value: $(batchSize)
- name: learningRate
value: $(learningRate)
image: training-image
workingDir: /opt
restartPolicy: OnFailure

View File

@ -0,0 +1,36 @@
apiVersion: kubeflow.org/v1beta1
kind: TFJob
metadata:
name: $(trainingName)
namespace: kubeflow
spec:
tfReplicaSpecs:
Worker:
replicas: %numWorkers%
template:
spec:
containers:
- name: tensorflow
command:
- /usr/bin/python
- /opt/model.py
- --tf-model-dir=$(modelDir)
- --tf-export-dir=$(exportDir)
- --tf-train-steps=$(trainSteps)
- --tf-batch-size=$(batchSize)
- --tf-learning-rate=$(learningRate)
env:
- name: modelDir
value: $(modelDir)
- name: exportDir
value: $(exportDir)
- name: trainSteps
value: $(trainSteps)
- name: batchSize
value: $(batchSize)
- name: learningRate
value: $(learningRate)
image: training-image
workingDir: /opt
restartPolicy: OnFailure

View File

@ -0,0 +1,72 @@
#!/bin/bash
# This script sets the number of Ps and Workers for the TFJob.
# Usage: definition.sh --numPs number_of_ps --numWorkers number_of_workers
while (($#)); do
case $1 in
"--numPs")
shift
numPs="$1"
shift
;;
"--numWorkers")
shift
numWorkers="$1"
shift
;;
"--help")
shift
echo "Usage: definition.sh --numPs number_of_PS --numWorkers number_of_worker"
shift
;;
*)
echo "Unknown argument: '$1'"
echo "Usage: definition.sh --numPs number_of_PS --numWorkers number_of_worker"
exit 1
;;
esac
done
BASE_PATH=$(dirname "$0")
if [ "x${numPs}" != "x" ]; then
if [[ ${numPs} =~ ^[0-9]+$ ]] && [ ${numPs} -gt 0 ]; then
(cd ${BASE_PATH}; sed -i.sedbak s/%numPs%/${numPs}/ Ps.yaml >> /dev/null)
(cd ${BASE_PATH}; kustomize edit add patch Ps.yaml)
sed -i.sedbak '/patchesJson6902/a \
- path: Ps_patch.yaml \
\ target: \
\ group: kubeflow.org \
\ kind: TFJob \
\ name: \$(trainingName) \
\ version: v1beta1 \
\
' kustomization.yaml
else
echo "ERROR: numPS must be an integer greater than or equal to 1."
exit 1
fi
fi
if [ "x${numWorkers}" != "x" ]; then
if [[ ${numWorkers} =~ ^[0-9]+$ ]] && [ ${numWorkers} -gt 0 ]; then
(cd ${BASE_PATH}; sed -i.sedbak s/%numWorkers%/${numWorkers}/ Worker.yaml >> /dev/null)
(cd ${BASE_PATH}; kustomize edit add patch Worker.yaml)
sed -i.sedbak '/patchesJson6902/a \
- path: Worker_patch.yaml \
\ target: \
\ group: kubeflow.org \
\ kind: TFJob \
\ name: \$(trainingName) \
\ version: v1beta1 \
\
' kustomization.yaml
else
echo "ERROR: numWorkers must be an integer greater than or equal to 1."
exit 1
fi
fi
rm -rf ${BASE_PATH}/*.sedbak
rm -rf *.sedbak
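
For illustration, a hypothetical invocation from one of the training overlays (for example training/GCS, the app_dir used by the TFJob test earlier in this diff) might look like:

cd training/GCS
# Set replica counts in the base Ps/Worker templates and register the Ps/Worker patches
# in this overlay's kustomization.yaml (the replica values here are only examples).
../base/definition.sh --numPs 1 --numWorkers 2
kustomize build . | kubectl apply -f -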

View File

@ -0,0 +1,57 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- Chief.yaml
namespace: kubeflow
generatorOptions:
disableNameSuffixHash: true
configurations:
- params.yaml
vars:
- fieldref:
fieldPath: data.name
name: trainingName
objref:
apiVersion: v1
kind: ConfigMap
name: mnist-map-training
- fieldref:
fieldPath: data.modelDir
name: modelDir
objref:
apiVersion: v1
kind: ConfigMap
name: mnist-map-training
- fieldref:
fieldPath: data.exportDir
name: exportDir
objref:
apiVersion: v1
kind: ConfigMap
name: mnist-map-training
- fieldref:
fieldPath: data.trainSteps
name: trainSteps
objref:
apiVersion: v1
kind: ConfigMap
name: mnist-map-training
- fieldref:
fieldPath: data.batchSize
name: batchSize
objref:
apiVersion: v1
kind: ConfigMap
name: mnist-map-training
- fieldref:
fieldPath: data.learningRate
name: learningRate
objref:
apiVersion: v1
kind: ConfigMap
name: mnist-map-training

View File

@ -0,0 +1,9 @@
varReference:
- path: metadata/name
kind: TFJob
- path: spec/tfReplicaSpecs/Chief/template/spec/containers/env/value
kind: TFJob
- path: spec/tfReplicaSpecs/Worker/template/spec/containers/env/value
kind: TFJob
- path: spec/tfReplicaSpecs/Ps/template/spec/containers/env/value
kind: TFJob

View File

@ -0,0 +1,11 @@
- op: add
path: /spec/tfReplicaSpecs/Chief/template/spec/containers/0/volumeMounts
value:
- mountPath: $(pvcMountPath)
name: local-storage
- op: add
path: /spec/tfReplicaSpecs/Chief/template/spec/volumes
value:
- name: local-storage
persistentVolumeClaim:
claimName: $(pvcName)

View File

@ -0,0 +1,11 @@
- op: add
path: /spec/tfReplicaSpecs/Ps/template/spec/containers/0/volumeMounts
value:
- mountPath: $(pvcMountPath)
name: local-storage
- op: add
path: /spec/tfReplicaSpecs/Ps/template/spec/volumes
value:
- name: local-storage
persistentVolumeClaim:
claimName: $(pvcName)

View File

@ -0,0 +1,11 @@
- op: add
path: /spec/tfReplicaSpecs/Worker/template/spec/containers/0/volumeMounts
value:
- mountPath: $(pvcMountPath)
name: local-storage
- op: add
path: /spec/tfReplicaSpecs/Worker/template/spec/volumes
value:
- name: local-storage
persistentVolumeClaim:
claimName: $(pvcName)

View File

@ -0,0 +1,41 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
bases:
- ../base
configurations:
- params.yaml
# TBD (jinchihe) Need to move the image to the base file once
# this issue is addressed: kubernetes-sigs/kustomize/issues/1040
# TBD (jinchihe) Need to update the image once
# this issue is addressed: kubeflow/testing/issues/373
images:
- name: training-image
newName: gcr.io/kubeflow-examples/mnist/model
newTag: v20190111-v0.2-148-g313770f
vars:
- fieldref:
fieldPath: data.pvcName
name: pvcName
objref:
apiVersion: v1
kind: ConfigMap
name: mnist-map-training
- fieldref:
fieldPath: data.pvcMountPath
name: pvcMountPath
objref:
apiVersion: v1
kind: ConfigMap
name: mnist-map-training
patchesJson6902:
- path: Chief_patch.yaml
target:
group: kubeflow.org
kind: TFJob
name: $(trainingName)
version: v1beta1

View File

@ -0,0 +1,15 @@
varReference:
- path: metadata/name
kind: TFJob
- path: spec/tfReplicaSpecs/Chief/template/spec/volumes/persistentVolumeClaim/claimName
kind: TFJob
- path: spec/tfReplicaSpecs/Worker/template/spec/volumes/persistentVolumeClaim/claimName
kind: TFJob
- path: spec/tfReplicaSpecs/Ps/template/spec/volumes/persistentVolumeClaim/claimName
kind: TFJob
- path: spec/tfReplicaSpecs/Chief/template/spec/containers/volumeMounts/mountPath
kind: TFJob
- path: spec/tfReplicaSpecs/Worker/template/spec/containers/volumeMounts/mountPath
kind: TFJob
- path: spec/tfReplicaSpecs/Ps/template/spec/containers/volumeMounts/mountPath
kind: TFJob

View File

@ -20,7 +20,7 @@ local defaultParams = {
// Which Kubeflow cluster to use for running TFJobs on.
kfProject: "kubeflow-ci-deployment",
kfZone: "us-east1-b",
kfCluster: "kf-vmaster-n00",
kfCluster: "kf-v0-5-n04",
// The bucket where the model should be written
// This needs to be writable by the GCP service account in the Kubeflow cluster (not the test cluster)
@ -311,8 +311,9 @@ local dagTemplates = [
"--params=" + std.join(",", [
"name=mnist-test-" + prowDict["BUILD_ID"],
"namespace=" + testNamespace,
"numTrainSteps=10",
"trainSteps=10",
"batchSize=10",
"learningRate=0.01",
"image=" + trainerImage,
"numPs=1",
"numWorkers=2",