mirror of https://github.com/kubeflow/examples.git
drop_ksonnet_from_mnist (#546)
This commit is contained in:
parent
b23adc1f0b
commit
5fac627725
|
|
@ -1,20 +1,20 @@
|
|||
# This container is for running ksonnet within Kubernetes
|
||||
# This container is for running kustomize within Kubernetes
|
||||
FROM ubuntu:16.04
|
||||
|
||||
ENV KUBECTL_VERSION v1.9.2
|
||||
ENV KSONNET_VERSION 0.10.1
|
||||
ENV KUSTOMIZE_VERSION 2.0.3
|
||||
|
||||
RUN apt-get update && apt-get -y install curl && rm -rf /var/lib/apt/lists/*
|
||||
#RUN apk add --update ca-certificates openssl && update-ca-certificates
|
||||
|
||||
RUN curl -O -L https://github.com/ksonnet/ksonnet/releases/download/v${KSONNET_VERSION}/ks_${KSONNET_VERSION}_linux_amd64.tar.gz
|
||||
RUN tar -zxvf ks_${KSONNET_VERSION}_linux_amd64.tar.gz -C /usr/bin/ --strip-components=1 ks_${KSONNET_VERSION}_linux_amd64/ks
|
||||
RUN chmod +x /usr/bin/ks
|
||||
RUN curl -O -L https://github.com/kubernetes-sigs/kustomize/releases/download/v${KUSTOMIZE_VERSION}/kustomize_${KUSTOMIZE_VERSION}_linux_amd64
|
||||
RUN mv kustomize_${KUSTOMIZE_VERSION}_linux_amd64 /usr/bin/kustomize
|
||||
RUN chmod +x /usr/bin/kustomize
|
||||
|
||||
RUN curl -L https://storage.googleapis.com/kubernetes-release/release/${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /usr/bin/kubectl
|
||||
RUN chmod +x /usr/bin/kubectl
|
||||
|
||||
#ksonnet doesn't work without a kubeconfig, the following is just to add a utility to generate a kubeconfig from a service account.
|
||||
# The following is just to add a utility to generate a kubeconfig from a service account.
|
||||
ADD https://raw.githubusercontent.com/zlabjp/kubernetes-scripts/cb265de1d4c4dc4ad0f15f4aaaf5b936dcf639a5/create-kubeconfig /usr/bin/
|
||||
ADD https://raw.githubusercontent.com/zlabjp/kubernetes-scripts/cb265de1d4c4dc4ad0f15f4aaaf5b936dcf639a5/LICENSE.txt /usr/bin/create-kubeconfig.LICENSE
|
||||
RUN chmod +x /usr/bin/create-kubeconfig
|
||||
|
|
@ -24,7 +24,7 @@ RUN kubectl config use-context default
|
|||
|
||||
ENV USER root
|
||||
|
||||
ADD ksonnet-entrypoint.sh /
|
||||
RUN chmod +x /ksonnet-entrypoint.sh
|
||||
ADD kustomize-entrypoint.sh /
|
||||
RUN chmod +x /kustomize-entrypoint.sh
|
||||
|
||||
ENTRYPOINT ["/ksonnet-entrypoint.sh"]
|
||||
ENTRYPOINT ["/kustomize-entrypoint.sh"]
|
||||
|
|
@ -27,7 +27,6 @@ IMG ?= gcr.io/kubeflow-examples/mnist
|
|||
|
||||
# List any changed files. We only include files in the notebooks directory.
|
||||
# because that is the code in the docker image.
|
||||
# In particular we exclude changes to the ksonnet configs.
|
||||
CHANGED_FILES := $(shell git diff-files --relative=mnist/)
|
||||
|
||||
# Whether to use cached images with GCB
|
||||
|
|
|
|||
434
mnist/README.md
434
mnist/README.md
|
|
@ -2,7 +2,7 @@
|
|||
<!-- DON'T EDIT THIS SECTION, INSTEAD RE-RUN doctoc TO UPDATE -->
|
||||
**Table of Contents** *generated with [DocToc](https://github.com/thlorenz/doctoc)*
|
||||
|
||||
- [Training MNIST](#training-mnist)
|
||||
- [MNIST on Kubeflow](#mnist-on-kubeflow)
|
||||
- [Prerequisites](#prerequisites)
|
||||
- [Deploy Kubeflow](#deploy-kubeflow)
|
||||
- [Local Setup](#local-setup)
|
||||
|
|
@ -30,24 +30,25 @@
|
|||
|
||||
<!-- END doctoc generated TOC please keep comment here to allow auto update -->
|
||||
|
||||
# Training MNIST
|
||||
|
||||
# MNIST on Kubeflow
|
||||
|
||||
This example guides you through the process of taking an example model, modifying it to run better within Kubeflow, and serving the resulting trained model.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
Before we get started there a few requirements.
|
||||
Before we get started there are a few requirements.
|
||||
|
||||
### Deploy Kubeflow
|
||||
|
||||
Follow the [Getting Started Guide](https://www.kubeflow.org/docs/started/getting-started/) to deploy Kubeflow
|
||||
Follow the [Getting Started Guide](https://www.kubeflow.org/docs/started/getting-started/) to deploy Kubeflow.
|
||||
|
||||
### Local Setup
|
||||
|
||||
You also need the following command line tools:
|
||||
|
||||
- [kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl/)
|
||||
- [ksonnet](https://ksonnet.io/#get-started)
|
||||
- [kustomize](https://kustomize.io/)
|
||||
|
||||
To run the client at the end of the example, you must have [requirements.txt](requirements.txt) installed in your active python environment.
|
||||
|
||||
|
|
@ -78,7 +79,7 @@ The resulting model is [model.py](model.py).
|
|||
With our code ready, we will now build/push the docker image.
|
||||
|
||||
```
|
||||
DOCKER_URL=docker.io/reponame/mytfmodel # Put your docker registry here
|
||||
DOCKER_URL=docker.io/reponame/mytfmodel:tag # Put your docker registry here
|
||||
docker build . --no-cache -f Dockerfile.model -t ${DOCKER_URL}
|
||||
|
||||
docker push ${DOCKER_URL}
|
||||
|
|
@ -88,7 +89,9 @@ docker push ${DOCKER_URL}
|
|||
|
||||
With our data and workloads ready, now the cluster must be prepared. We will be deploying the TF Operator, and Argo to help manage our training job.
|
||||
|
||||
In the following instructions we will install our required components to a single namespace. For these instructions we will assume the chosen namespace is `tfworkflow`:
|
||||
In the following instructions we will install our required components to a single namespace. For these instructions we will assume the chosen namespace is `kubeflow`.
|
||||
|
||||
|
||||
|
||||
### Training your model
|
||||
|
||||
|
|
@ -98,41 +101,55 @@ Let's start by runing the training job on Kubeflow and storing the model in a lo
|
|||
|
||||
Firstly, refer to the [document](https://kubernetes.io/docs/concepts/storage/persistent-volumes/) to create a Persistent Volume (PV) and Persistent Volume Claim (PVC); the PVC name (${PVC_NAME}) will be used by the training and serving pods for local mode in the steps below.
|
||||
|
||||
Creating an environment to store parameters particular for local mode.
|
||||
Enter the `training/local` from the `mnist` application directory.
|
||||
```
|
||||
KSENV=local
|
||||
cd ks_app
|
||||
ks env add ${KSENV}
|
||||
cd training/local
|
||||
```
|
||||
|
||||
Give the job a name to indicate it is running locally
|
||||
|
||||
```
|
||||
ks param set --env=${KSENV} train name mnist-train-local
|
||||
kustomize edit add configmap mnist-map-training --from-literal=name=mnist-train-local
|
||||
```
|
||||
|
||||
Point the job at your custom training image
|
||||
|
||||
```
|
||||
ks param set --env=${KSENV} train image $DOCKER_URL
|
||||
kustomize edit set image training-image=$DOCKER_URL:$TAG
|
||||
```
|
||||
|
||||
Mount the pvc to store the exported model, by default the pvc will be mounted to the `/mnt` of the training pod.
|
||||
Optionally, configure it to run distributed by setting the number of parameter servers and workers to use. `numPs` is the number of parameter servers and `numWorkers` is the number of workers.
|
||||
|
||||
```
|
||||
ks param set --env=${KSENV} train pvcName ${PVC_NAME}
|
||||
../base/definition.sh --numPs 1 --numWorkers 2
|
||||
```
|
||||
|
||||
Configure a filepath for the exported model and checkpoints.
|
||||
Set the training parameters, such as training steps, batch size and learning rate.
|
||||
|
||||
```
|
||||
ks param set --env=${KSENV} train modelDir /mnt
|
||||
ks param set --env=${KSENV} train exportDir /mnt/export
|
||||
kustomize edit add configmap mnist-map-training --from-literal=trainSteps=200
|
||||
kustomize edit add configmap mnist-map-training --from-literal=batchSize=100
|
||||
kustomize edit add configmap mnist-map-training --from-literal=learningRate=0.01
|
||||
```
|
||||
|
||||
To store the exported model and checkpoints, configure the PVC name and mount point.
|
||||
|
||||
```
|
||||
kustomize edit add configmap mnist-map-training --from-literal=pvcName=${PVC_NAME}
|
||||
kustomize edit add configmap mnist-map-training --from-literal=pvcMountPath=/mnt
|
||||
```
|
||||
|
||||
Now we need to configure parameters telling the code to save the model to the PVC.
|
||||
|
||||
```
|
||||
kustomize edit add configmap mnist-map-training --from-literal=modelDir=/mnt
|
||||
kustomize edit add configmap mnist-map-training --from-literal=exportDir=/mnt/export
|
||||
```
|
||||
|
||||
You can now submit the job
|
||||
|
||||
```
|
||||
ks apply ${KSENV} -c train
|
||||
kustomize build . |kubectl apply -f -
|
||||
```
|
||||
|
||||
And you can check the job
|
||||
|
|
@ -147,24 +164,22 @@ And to check the logs
|
|||
kubectl logs mnist-train-local-chief-0
|
||||
```
|
||||
|
||||
|
||||
#### Using GCS
|
||||
|
||||
In this section we describe how to save the model to Google Cloud Storage (GCS).
|
||||
|
||||
Storing the model in GCS has the advantages
|
||||
Storing the model in GCS has the advantages:
|
||||
|
||||
* The model is readily available after the job finishes
|
||||
* We can run distributed training
|
||||
|
||||
* Distributed training requires a storage system accessible to all the machines
|
||||
|
||||
Lets start by creating an environment to store parameters particular to writing the model to GCS
|
||||
and running distributed.
|
||||
Enter the `training/GCS` from the `mnist` application directory.
|
||||
|
||||
```
|
||||
KSENV=distributed
|
||||
cd ks_app
|
||||
ks env add ${KSENV}
|
||||
cd training/GCS
|
||||
```
|
||||
|
||||
Set an environment variable that points to your GCP project Id
|
||||
|
|
@ -174,28 +189,42 @@ PROJECT=<your project id>
|
|||
|
||||
Create a bucket on GCS to store our model. The name must be unique across all GCS buckets
|
||||
```
|
||||
BUCKET=$KSENV-$(date +%s)
|
||||
BUCKET=distributed-$(date +%s)
|
||||
gsutil mb gs://$BUCKET/
|
||||
```
|
||||
|
||||
Give the job a different name (to distinguish it from your job which didn't use GCS)
|
||||
|
||||
```
|
||||
ks param set --env=${KSENV} train name mnist-train-dist
|
||||
kustomize edit add configmap mnist-map-training --from-literal=name=mnist-train-dist
|
||||
```
|
||||
|
||||
Next we configure it to run distributed by setting the number of parameter servers and workers to use.
|
||||
Optionally, if you want to use your custom training image, configure it as below.
|
||||
|
||||
```
|
||||
ks param set --env=${KSENV} train numPs 1
|
||||
ks param set --env=${KSENV} train numWorkers 2
|
||||
kustomize edit set image training-image=$DOCKER_URL:$TAG
|
||||
```
|
||||
Now we need to configure parameters telling the code to save the model to GCS.
|
||||
|
||||
Next we configure it to run distributed by setting the number of parameter servers and workers to use. `numPs` is the number of parameter servers and `numWorkers` is the number of workers.
|
||||
|
||||
```
|
||||
../base/definition.sh --numPs 1 --numWorkers 2
|
||||
```
|
||||
|
||||
Set the training parameters, such as training steps, batch size and learning rate.
|
||||
|
||||
```
|
||||
kustomize edit add configmap mnist-map-training --from-literal=trainSteps=200
|
||||
kustomize edit add configmap mnist-map-training --from-literal=batchSize=100
|
||||
kustomize edit add configmap mnist-map-training --from-literal=learningRate=0.01
|
||||
```
|
||||
|
||||
Now we need to configure parameters telling the code to save the model to GCS.
|
||||
|
||||
```
|
||||
MODEL_PATH=my-model
|
||||
ks param set --env=${KSENV} train modelDir gs://${BUCKET}/${MODEL_PATH}
|
||||
ks param set --env=${KSENV} train exportDir gs://${BUCKET}/${MODEL_PATH}/export
|
||||
kustomize edit add configmap mnist-map-training --from-literal=modelDir=gs://${BUCKET}/${MODEL_PATH}
|
||||
kustomize edit add configmap mnist-map-training --from-literal=exportDir=gs://${BUCKET}/${MODEL_PATH}/export
|
||||
```
|
||||
|
||||
In order to write to GCS we need to supply the TFJob with GCP credentials. We do
|
||||
|
|
@ -212,7 +241,7 @@ then a number of steps have already been performed for you
|
|||
gcloud --project=${PROJECT} iam service-accounts list
|
||||
```
|
||||
|
||||
1. We stored the private key for this account in a K8s secret named `user-gcp-sa`
|
||||
2. We stored the private key for this account in a K8s secret named `user-gcp-sa`
|
||||
|
||||
* To see the secrets in your cluster
|
||||
|
||||
|
|
@ -220,7 +249,7 @@ then a number of steps have already been performed for you
|
|||
kubectl get secrets -n kubeflow
|
||||
```
|
||||
|
||||
1. We granted this service account permission to read/write GCS buckets in this project
|
||||
3. We granted this service account permission to read/write GCS buckets in this project
|
||||
|
||||
* To see the IAM policy you can do
|
||||
|
||||
|
|
@ -244,55 +273,24 @@ then a number of steps have already been performed for you
|
|||
|
||||
To use this service account we perform the following steps
|
||||
|
||||
1. Mount the secret into the pod
|
||||
1. Mount the secret `user-gcp-sa` into the pod and configure the mount path of the secret.
|
||||
```
|
||||
ks param set --env=${KSENV} train secret user-gcp-sa=/var/secrets
|
||||
kustomize edit add configmap mnist-map-training --from-literal=secretName=user-gcp-sa
|
||||
kustomize edit add configmap mnist-map-training --from-literal=secretMountPath=/var/secrets
|
||||
```
|
||||
|
||||
* Note: ensure your envrionment is pointed at the same `kubeflow` namespace as the `user-gcp-sa` secret
|
||||
* Setting this ksonnet parameter causes a volumeMount and volume to be added to your TFJob
|
||||
* To see this you can run `ks show ${KSENV} -c train`
|
||||
|
||||
* The output should now include a volumeMount and volume section
|
||||
|
||||
```
|
||||
apiVersion: kubeflow.org/v1beta1
|
||||
kind: TFJob
|
||||
metadata:
|
||||
...
|
||||
spec:
|
||||
tfReplicaSpecs:
|
||||
Chief:
|
||||
...
|
||||
template:
|
||||
...
|
||||
spec:
|
||||
containers:
|
||||
- command:
|
||||
...
|
||||
volumeMounts:
|
||||
- mountPath: /var/secrets
|
||||
name: user-gcp-sa
|
||||
readOnly: true
|
||||
...
|
||||
volumes:
|
||||
- name: user-gcp-sa
|
||||
secret:
|
||||
secretName: user-gcp-sa
|
||||
...
|
||||
```
|
||||
|
||||
1. Next we need to set the environment variable `GOOGLE_APPLICATION_CREDENTIALS` so that our code knows
|
||||
where to look for the service account key.
|
||||
2. Next we need to set the environment variable `GOOGLE_APPLICATION_CREDENTIALS` so that our code knows where to look for the service account key.
|
||||
|
||||
```
|
||||
ks param set --env=${KSENV} train envVariables GOOGLE_APPLICATION_CREDENTIALS=/var/secrets/user-gcp-sa.json
|
||||
kustomize edit add configmap mnist-map-training --from-literal=GOOGLE_APPLICATION_CREDENTIALS=/var/secrets/user-gcp-sa.json
|
||||
```
|
||||
|
||||
* If we look at the spec for our job we can see that the environment variable `GOOGLE_APPLICATION_CREDENTIALS` is set.
|
||||
|
||||
```
|
||||
ks show ${KSENV} -c train
|
||||
kustomize build .
|
||||
```
|
||||
```
|
||||
apiVersion: kubeflow.org/v1beta1
|
||||
|
|
@ -321,7 +319,7 @@ To use this service account we perform the following steps
|
|||
You can now submit the job
|
||||
|
||||
```
|
||||
ks apply ${KSENV} -c train
|
||||
kustomize build . |kubectl apply -f -
|
||||
```
|
||||
|
||||
And you can check the job status
|
||||
|
|
@ -336,41 +334,50 @@ And to check the logs
|
|||
kubectl logs -f mnist-train-dist-chief-0
|
||||
```
|
||||
|
||||
|
||||
#### Using S3
|
||||
|
||||
To use S3 we need we need to configure TensorFlow to use S3 credentials and variables. These credentials will be provided as kubernetes secrets and the variables will be passed in as environment variables. Modify the below values to suit your environment.
|
||||
To use S3 we need to configure TensorFlow to use S3 credentials and variables. These credentials will be provided as kubernetes secrets and the variables will be passed in as environment variables. Modify the below values to suit your environment.
|
||||
|
||||
Lets start by creating an environment to store parameters particular to writing the model to S3
|
||||
and running distributed.
|
||||
Enter the `training/S3` from the `mnist` application directory.
|
||||
|
||||
```
|
||||
KSENV=distributed
|
||||
cd ks_app
|
||||
ks env add ${KSENV}
|
||||
cd training/S3
|
||||
```
|
||||
|
||||
Give the job a different name (to distinguish it from your job which didn't use S3)
|
||||
|
||||
```
|
||||
ks param set --env=${KSENV} train name mnist-train-dist
|
||||
kustomize edit add configmap mnist-map-training --from-literal=name=mnist-train-dist
|
||||
```
|
||||
|
||||
Next we configure it to run distributed by setting the number of parameter servers and workers to use.
|
||||
Optionally, if you want to use your custom training image, configure it as below.
|
||||
|
||||
```
|
||||
ks param set --env=${KSENV} train numPs 1
|
||||
ks param set --env=${KSENV} train numWorkers 2
|
||||
```
|
||||
Now we need to configure parameters telling the code to save the model to S3.
|
||||
|
||||
```
|
||||
ks param set --env=${KSENV} train modelDir ${S3_MODEL_PATH_URI}
|
||||
ks param set --env=${KSENV} train exportDir ${S3_MODEL_EXPORT_URI}
|
||||
kustomize edit set image training-image=$DOCKER_URL:$TAG
|
||||
```
|
||||
|
||||
In order to write to S3 we need to supply the TensorFlow code with AWS credentials we also need to set
|
||||
various environment variables configuring access to S3.
|
||||
Next we configure it to run distributed by setting the number of parameter servers and workers to use. `numPs` is the number of parameter servers and `numWorkers` is the number of workers.
|
||||
|
||||
```
|
||||
../base/definition.sh --numPs 1 --numWorkers 2
|
||||
```
|
||||
|
||||
Set the training parameters, such as training steps, batch size and learning rate.
|
||||
|
||||
```
|
||||
kustomize edit add configmap mnist-map-training --from-literal=trainSteps=200
|
||||
kustomize edit add configmap mnist-map-training --from-literal=batchSize=100
|
||||
kustomize edit add configmap mnist-map-training --from-literal=learningRate=0.01
|
||||
```
|
||||
|
||||
Now we need to configure parameters telling the code to save the model to S3; replace `${S3_MODEL_PATH_URI}` and `${S3_MODEL_EXPORT_URI}` below with real values.
|
||||
|
||||
```
|
||||
kustomize edit add configmap mnist-map-training --from-literal=modelDir=${S3_MODEL_PATH_URI}
|
||||
kustomize edit add configmap mnist-map-training --from-literal=exportDir=${S3_MODEL_EXPORT_URI}
|
||||
```
|
||||
|
||||
In order to write to S3 we need to supply the TensorFlow code with AWS credentials we also need to set various environment variables configuring access to S3.
|
||||
|
||||
1. Define a bunch of environment variables corresponding to your S3 settings; these will be used in subsequent steps
|
||||
|
||||
|
|
@ -385,76 +392,36 @@ various environment variables configuring access to S3.
|
|||
export S3_VERIFY_SSL=1 #set to 0 for default minio installs
|
||||
```
|
||||
|
||||
1. Create a K8s secret containing your AWS credentials
|
||||
2. Create a K8s secret containing your AWS credentials
|
||||
|
||||
```
|
||||
kubectl create secret generic aws-creds --from-literal=awsAccessKeyID=${AWS_ACCESS_KEY_ID} \
|
||||
kustomize edit add secret aws-creds --from-literal=awsAccessKeyID=${AWS_ACCESS_KEY_ID} \
|
||||
--from-literal=awsSecretAccessKey=${AWS_SECRET_ACCESS_KEY}
|
||||
```
|
||||
|
||||
1. Pass secrets as environment variables into pod
|
||||
|
||||
3. Pass secrets as environment variables into pod
|
||||
|
||||
```
|
||||
ks param set --env=${KSENV} train secretKeyRefs AWS_ACCESS_KEY_ID=aws-creds.awsAccessKeyID,AWS_SECRET_ACCESS_KEY=aws-creds.awsSecretAccessKey
|
||||
```
|
||||
kustomize edit add configmap mnist-map-training --from-literal=awsSecretName=aws-creds
|
||||
kustomize edit add configmap mnist-map-training --from-literal=awsAccessKeyIDName=awsAccessKeyID
|
||||
kustomize edit add configmap mnist-map-training --from-literal=awsSecretAccessKeyName=awsSecretAccessKey
|
||||
```
|
||||
|
||||
* Setting this ksonnet parameter causes a two new environment variables to be added to your TFJob
|
||||
* To see this you can run
|
||||
|
||||
```
|
||||
ks show ${KSENV} -c train
|
||||
```
|
||||
|
||||
* The output should now include two environment variables referencing K8s secret
|
||||
|
||||
```
|
||||
apiVersion: kubeflow.org/v1beta1
|
||||
kind: TFJob
|
||||
metadata:
|
||||
...
|
||||
spec:
|
||||
tfReplicaSpecs:
|
||||
Chief:
|
||||
...
|
||||
template:
|
||||
...
|
||||
spec:
|
||||
containers:
|
||||
- command:
|
||||
...
|
||||
env:
|
||||
- name: AWS_ACCESS_KEY_ID
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
key: awsAccessKeyID
|
||||
name: aws-creds
|
||||
- name: AWS_SECRET_ACCESS_KEY
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
key: awsSecretAccessKey
|
||||
name: aws-creds
|
||||
...
|
||||
```
|
||||
|
||||
1. Next we need to set a whole bunch of S3 related environment variables so that TensorFlow
|
||||
knows how to talk to S3
|
||||
4. Next we need to set a whole bunch of S3 related environment variables so that TensorFlow knows how to talk to S3
|
||||
|
||||
```
|
||||
AWSENV="S3_ENDPOINT=${S3_ENDPOINT}"
|
||||
AWSENV="${AWSENV},AWS_ENDPOINT_URL=${AWS_ENDPOINT_URL}"
|
||||
AWSENV="${AWSENV},AWS_REGION=${AWS_REGION}"
|
||||
AWSENV="${AWSENV},BUCKET_NAME=${BUCKET_NAME}"
|
||||
AWSENV="${AWSENV},S3_USE_HTTPS=${S3_USE_HTTPS}"
|
||||
AWSENV="${AWSENV},S3_VERIFY_SSL=${S3_VERIFY_SSL}"
|
||||
|
||||
ks param set --env=${KSENV} train envVariables ${AWSENV}
|
||||
kustomize edit add configmap mnist-map-training --from-literal=S3_ENDPOINT=${S3_ENDPOINT}
|
||||
kustomize edit add configmap mnist-map-training --from-literal=AWS_ENDPOINT_URL=${AWS_ENDPOINT_URL}
|
||||
kustomize edit add configmap mnist-map-training --from-literal=AWS_REGION=${AWS_REGION}
|
||||
kustomize edit add configmap mnist-map-training --from-literal=BUCKET_NAME=${BUCKET_NAME}
|
||||
kustomize edit add configmap mnist-map-training --from-literal=S3_USE_HTTPS=${S3_USE_HTTPS}
|
||||
kustomize edit add configmap mnist-map-training --from-literal=S3_VERIFY_SSL=${S3_VERIFY_SSL}
|
||||
```
|
||||
|
||||
* If we look at the spec for our job we can see that the environment variables related
|
||||
to S3 are set.
|
||||
* If we look at the spec for our job we can see that the environment variables related to S3 are set.
|
||||
|
||||
```
|
||||
ks show ${KSENV} -c train
|
||||
kustomize build .
|
||||
|
||||
apiVersion: kubeflow.org/v1beta1
|
||||
kind: TFJob
|
||||
|
|
@ -484,7 +451,7 @@ various environment variables configuring access to S3.
|
|||
You can now submit the job
|
||||
|
||||
```
|
||||
ks apply ${KSENV} -c train
|
||||
kustomize build . |kubectl apply -f -
|
||||
```
|
||||
|
||||
And you can check the job
|
||||
|
|
@ -507,10 +474,16 @@ There are various ways to monitor workflow/training job. In addition to using `k
|
|||
|
||||
#### Using GCS
|
||||
|
||||
Enter the `monitoring/GCS` from the `mnist` application directory.
|
||||
|
||||
```
|
||||
cd monitoring/GCS
|
||||
```
|
||||
|
||||
Configure TensorBoard to point to your model location
|
||||
|
||||
```
|
||||
ks param set tensorboard --env=${KSENV} logDir ${LOGDIR}
|
||||
kustomize edit add configmap mnist-map-monitoring --from-literal=logDir=${LOGDIR}
|
||||
```
|
||||
|
||||
Assuming you followed the directions above if you used GCS you can use the following value
|
||||
|
|
@ -521,33 +494,25 @@ LOGDIR=gs://${BUCKET}/${MODEL_PATH}
|
|||
|
||||
You need to point TensorBoard to GCP credentials to access GCS bucket with model.
|
||||
|
||||
1. Mount the secret into the pod
|
||||
|
||||
```
|
||||
ks param set --env=${KSENV} tensorboatd secret user-gcp-sa=/var/secrets
|
||||
```
|
||||
|
||||
* Setting this ksonnet parameter causes a volumeMount and volume to be added to TensorBoard
|
||||
deployment
|
||||
* To see this you can run
|
||||
|
||||
1. Mount the secret `user-gcp-sa` into the pod and configure the mount path of the secret.
|
||||
```
|
||||
ks show ${KSENV} -c tensorboard
|
||||
kustomize edit add configmap mnist-map-monitoring --from-literal=secretName=user-gcp-sa
|
||||
kustomize edit add configmap mnist-map-monitoring --from-literal=secretMountPath=/var/secrets
|
||||
```
|
||||
|
||||
* The output should now include a volumeMount and volume section
|
||||
* Setting this parameter causes a volumeMount and volume to be added to TensorBoard deployment
|
||||
|
||||
1. Next we need to set the environment variable `GOOGLE_APPLICATION_CREDENTIALS` so that our code knows
|
||||
2. Next we need to set the environment variable `GOOGLE_APPLICATION_CREDENTIALS` so that our code knows
|
||||
where to look for the service account key.
|
||||
|
||||
```
|
||||
ks param set --env=${KSENV} tensorboard envVariables GOOGLE_APPLICATION_CREDENTIALS=/var/secrets/user-gcp-sa.json
|
||||
kustomize edit add configmap mnist-map-monitoring --from-literal=GOOGLE_APPLICATION_CREDENTIALS=/var/secrets/user-gcp-sa.json
|
||||
```
|
||||
|
||||
* If we look at the spec for TensorBoard deployment we can see that the environment variable `GOOGLE_APPLICATION_CREDENTIALS` is set.
|
||||
|
||||
```
|
||||
ks show ${KSENV} -c tensorboard
|
||||
kustomize build .
|
||||
```
|
||||
```
|
||||
...
|
||||
|
|
@ -559,10 +524,16 @@ You need to point TensorBoard to GCP credentials to access GCS bucket with model
|
|||
|
||||
#### Using S3
|
||||
|
||||
Enter the `monitoring/S3` from the `mnist` application directory.
|
||||
|
||||
```
|
||||
cd monitoring/S3
|
||||
```
|
||||
|
||||
Configure TensorBoard to point to your model location
|
||||
|
||||
```
|
||||
ks param set tensorboard --env=${KSENV} logDir ${LOGDIR}
|
||||
kustomize edit add configmap mnist-map-monitoring --from-literal=logDir=${LOGDIR}
|
||||
```
|
||||
|
||||
Assuming you followed the directions above if you used S3 you can use the following value
|
||||
|
|
@ -576,58 +547,26 @@ You need to point TensorBoard to AWS credentials to access S3 bucket with model.
|
|||
1. Pass secrets as environment variables into pod
|
||||
|
||||
```
|
||||
ks param set --env=${KSENV} tensorboard secretKeyRefs AWS_ACCESS_KEY_ID=aws-creds.awsAccessKeyID,AWS_SECRET_ACCESS_KEY=aws-creds.awsSecretAccessKey
|
||||
kustomize edit add configmap mnist-map-monitoring --from-literal=awsSecretName=aws-creds
|
||||
kustomize edit add configmap mnist-map-monitoring --from-literal=awsAccessKeyIDName=awsAccessKeyID
|
||||
kustomize edit add configmap mnist-map-monitoring --from-literal=awsSecretAccessKeyName=awsSecretAccessKey
|
||||
```
|
||||
|
||||
* Setting this ksonnet parameter causes a two new environment variables to be added to TensorBoard
|
||||
deployment
|
||||
* To see this you can run
|
||||
|
||||
```
|
||||
ks show ${KSENV} -c tensorboard
|
||||
```
|
||||
|
||||
* The output should now include two environment variables referencing K8s secret
|
||||
|
||||
```
|
||||
...
|
||||
spec:
|
||||
containers:
|
||||
- command:
|
||||
...
|
||||
env:
|
||||
...
|
||||
- name: AWS_ACCESS_KEY_ID
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
key: awsAccessKeyID
|
||||
name: aws-creds
|
||||
- name: AWS_SECRET_ACCESS_KEY
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
key: awsSecretAccessKey
|
||||
name: aws-creds
|
||||
...
|
||||
```
|
||||
|
||||
1. Next we need to set a whole bunch of S3 related environment variables so that TensorBoard
|
||||
knows how to talk to S3
|
||||
2. Next we need to set a whole bunch of S3 related environment variables so that TensorBoard knows how to talk to S3
|
||||
|
||||
```
|
||||
AWSENV="S3_ENDPOINT=${S3_ENDPOINT}"
|
||||
AWSENV="${AWSENV},AWS_ENDPOINT_URL=${AWS_ENDPOINT_URL}"
|
||||
AWSENV="${AWSENV},AWS_REGION=${AWS_REGION}"
|
||||
AWSENV="${AWSENV},BUCKET_NAME=${BUCKET_NAME}"
|
||||
AWSENV="${AWSENV},S3_USE_HTTPS=${S3_USE_HTTPS}"
|
||||
AWSENV="${AWSENV},S3_VERIFY_SSL=${S3_VERIFY_SSL}"
|
||||
|
||||
ks param set --env=${KSENV} tensorboard envVariables ${AWSENV}
|
||||
kustomize edit add configmap mnist-map-monitoring --from-literal=S3_ENDPOINT=${S3_ENDPOINT}
|
||||
kustomize edit add configmap mnist-map-monitoring --from-literal=AWS_ENDPOINT_URL=${AWS_ENDPOINT_URL}
|
||||
kustomize edit add configmap mnist-map-monitoring --from-literal=AWS_REGION=${AWS_REGION}
|
||||
kustomize edit add configmap mnist-map-monitoring --from-literal=BUCKET_NAME=${BUCKET_NAME}
|
||||
kustomize edit add configmap mnist-map-monitoring --from-literal=S3_USE_HTTPS=${S3_USE_HTTPS}
|
||||
kustomize edit add configmap mnist-map-monitoring --from-literal=S3_VERIFY_SSL=${S3_VERIFY_SSL}
|
||||
```
|
||||
|
||||
* If we look at the spec for TensorBoard deployment we can see that the environment variables related to S3 are set.
|
||||
|
||||
```
|
||||
ks show ${KSENV} -c tensorboard
|
||||
kustomize build .
|
||||
```
|
||||
|
||||
```
|
||||
|
|
@ -648,34 +587,29 @@ You need to point TensorBoard to AWS credentials to access S3 bucket with model.
|
|||
|
||||
#### Deploying TensorBoard
|
||||
|
||||
|
||||
Now you can deploy TensorBoard
|
||||
|
||||
```
|
||||
ks apply ${KSENV} -c tensorboard
|
||||
kustomize build . | kubectl apply -f -
|
||||
```
|
||||
|
||||
To access TensorBoard using port-forwarding
|
||||
|
||||
```
|
||||
kubectl -n jlewi port-forward service/tensorboard-tb 8090:80
|
||||
kubectl -n kubeflow port-forward service/tensorboard-tb 8090:80
|
||||
```
|
||||
TensorBoard can now be accessed at [http://127.0.0.1:8090](http://127.0.0.1:8090).
|
||||
|
||||
|
||||
## Serving the model
|
||||
|
||||
The model code will export the model in saved model format which is suitable for serving with TensorFlow serving.
|
||||
|
||||
To serve the model follow the instructions below. The instructins vary slightly based on where you are storing your
|
||||
model (e.g. GCS, S3, PVC). Depending on the storage system we provide different ksonnet components as a convenience
|
||||
for setting relevant environment variables.
|
||||
To serve the model follow the instructions below. The instructions vary slightly based on where you are storing your model (e.g. GCS, S3, PVC). Depending on the storage system we provide different kustomizations as a convenience for setting relevant environment variables.
|
||||
|
||||
|
||||
### GCS
|
||||
|
||||
Here we show to serve the model when it is stored on GCS. This assumes that when you trained the model you set `exportDir` to a GCS
|
||||
URI; if not you can always copy it to GCS using `gsutil`.
|
||||
Here we show how to serve the model when it is stored on GCS. This assumes that when you trained the model you set `exportDir` to a GCS URI; if not you can always copy it to GCS using `gsutil`.
|
||||
|
||||
Check that a model was exported
|
||||
|
||||
|
|
@ -696,35 +630,39 @@ ${EXPORT_DIR}/1547100373/variables/variables.index
|
|||
|
||||
The number `1547100373` is a version number auto-generated by TensorFlow; it will vary on each run but should be monotonically increasing if you save a model to the same location as a previous location.
|
||||
|
||||
Enter the `serving/GCS` from the `mnist` application directory.
|
||||
```
|
||||
cd serving/GCS
|
||||
```
|
||||
|
||||
Set a different name for the tf-serving.
|
||||
|
||||
```
|
||||
kustomize edit add configmap mnist-map-serving --from-literal=name=mnist-gcs-dist
|
||||
```
|
||||
|
||||
Set your model path
|
||||
|
||||
```
|
||||
ks param set --env=${KSENV} mnist-deploy-gcp modelBasePath ${EXPORT_DIR}
|
||||
kustomize edit add configmap mnist-map-serving --from-literal=modelBasePath=${EXPORT_DIR}
|
||||
```
|
||||
|
||||
Deploy it
|
||||
Deploy it, and run a service to make the deployment accessible to other pods in the cluster
|
||||
|
||||
```
|
||||
ks apply ${KSENV} -c mnist-deploy-gcp
|
||||
kustomize build . |kubectl apply -f -
|
||||
```
|
||||
|
||||
You can check the deployment by running
|
||||
|
||||
```
|
||||
kubectl describe deployments mnist-deploy-gcp
|
||||
kubectl describe deployments mnist-gcs-dist
|
||||
```
|
||||
|
||||
Finally, run a service to make the deployment accessible to other pods in the cluster
|
||||
The service should make the `mnist-gcs-dist` deployment accessible over port 9000
|
||||
|
||||
```
|
||||
ks apply ${KSENV} -c mnist-service
|
||||
```
|
||||
|
||||
The service should make the `mnist-deploy-gcp` deployment accessible over port 9000
|
||||
|
||||
```
|
||||
kubectl describe service mnist-service
|
||||
kubectl describe service mnist-gcs-dist
|
||||
```
|
||||
|
||||
### S3
|
||||
|
|
@ -735,32 +673,42 @@ TODO: Add instructions
|
|||
|
||||
The section shows how to serve the local model that was stored in PVC while training.
|
||||
|
||||
Enter the `serving/local` from the `mnist` application directory.
|
||||
|
||||
```
|
||||
cd serving/local
|
||||
```
|
||||
|
||||
Set a different name for the tf-serving.
|
||||
|
||||
```
|
||||
kustomize edit add configmap mnist-map-serving --from-literal=name=mnist-service-local
|
||||
```
|
||||
|
||||
Mount the PVC, by default the pvc will be mounted to the `/mnt` of the pod.
|
||||
|
||||
```
|
||||
ks param set --env=${KSENV} mnist-deploy-local pvcName ${PVC_NAME}
|
||||
kustomize edit add configmap mnist-map-serving --from-literal=pvcName=${PVC_NAME}
|
||||
kustomize edit add configmap mnist-map-serving --from-literal=pvcMountPath=/mnt
|
||||
```
|
||||
|
||||
Configure a filepath for the exported model.
|
||||
|
||||
```
|
||||
ks param set --env=${KSENV} mnist-deploy-local modelBasePath /mnt/export
|
||||
kustomize edit add configmap mnist-map-serving --from-literal=modelBasePath=/mnt/export
|
||||
```
|
||||
|
||||
Deploy it.
|
||||
Deploy it, and run a service to make the deployment accessible to other pods in the cluster.
|
||||
|
||||
```
|
||||
ks apply ${KSENV} -c mnist-deploy-local
|
||||
kustomize build . |kubectl apply -f -
|
||||
```
|
||||
|
||||
You can check the deployment by running
|
||||
```
|
||||
kubectl describe deployments mnist-deploy-local
|
||||
```
|
||||
Finally, run a service to make the deployment accessible to other pods in the cluster.
|
||||
```
|
||||
ks apply ${KSENV} -c mnist-service
|
||||
```
|
||||
|
||||
The service should make the `mnist-deploy-local` deployment accessible over port 9000.
|
||||
```
|
||||
kubectl describe service mnist-service
|
||||
|
|
@ -770,10 +718,16 @@ kubectl describe service mnist-service
|
|||
|
||||
The example comes with a simple web front end that can be used with your model.
|
||||
|
||||
Enter the `front` from the `mnist` application directory.
|
||||
|
||||
```
|
||||
cd front
|
||||
```
|
||||
|
||||
To deploy the web front end
|
||||
|
||||
```
|
||||
ks apply ${KSENV} -c web-ui
|
||||
kustomize build . |kubectl apply -f -
|
||||
```
|
||||
|
||||
### Connecting via port forwarding
|
||||
|
|
|
|||
|
|
@ -0,0 +1,20 @@
|
|||
apiVersion: apps/v1beta2
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: web-ui
|
||||
namespace: kubeflow
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app: web-ui
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: web-ui
|
||||
spec:
|
||||
containers:
|
||||
- image: gcr.io/kubeflow-examples/mnist/web-ui:v20190112-v0.2-142-g3b38225
|
||||
name: web-ui
|
||||
ports:
|
||||
- containerPort: 5000
|
||||
|
|
@ -0,0 +1,11 @@
|
|||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
|
||||
generatorOptions:
|
||||
disableNameSuffixHash: true
|
||||
|
||||
resources:
|
||||
- deployment.yaml
|
||||
- service.yaml
|
||||
|
||||
namespace: kubeflow
|
||||
|
|
@ -0,0 +1,21 @@
|
|||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
annotations:
|
||||
getambassador.io/config: |-
|
||||
---
|
||||
apiVersion: ambassador/v0
|
||||
kind: Mapping
|
||||
name: web-ui_mapping
|
||||
prefix: /kubeflow/mnist/
|
||||
rewrite: /
|
||||
service: web-ui.kubeflow
|
||||
name: web-ui
|
||||
namespace: kubeflow
|
||||
spec:
|
||||
ports:
|
||||
- port: 80
|
||||
targetPort: 5000
|
||||
selector:
|
||||
app: web-ui
|
||||
type: ClusterIP
|
||||
|
|
@ -82,9 +82,9 @@
|
|||
contextDir: "."
|
||||
},
|
||||
|
||||
local ksonnetSteps = subGraphTemplate {
|
||||
local kustomizeSteps = subGraphTemplate {
|
||||
name: "ksonnet",
|
||||
dockerFile: "./Dockerfile.ksonnet",
|
||||
dockerFile: "./Dockerfile.kustomize",
|
||||
contextDir: "."
|
||||
},
|
||||
|
||||
|
|
@ -94,6 +94,6 @@
|
|||
contextDir: "./web-ui"
|
||||
},
|
||||
|
||||
steps: modelSteps.steps + ksonnetSteps.steps + uiSteps.steps,
|
||||
images: modelSteps.images + ksonnetSteps.images + uiSteps.images,
|
||||
steps: modelSteps.steps + kustomizeSteps.steps + uiSteps.steps,
|
||||
images: modelSteps.images + kustomizeSteps.images + uiSteps.images,
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,4 +0,0 @@
|
|||
/lib
|
||||
/.ksonnet/registries
|
||||
/app.override.yaml
|
||||
/.ks_environment
|
||||
|
|
@ -1,16 +0,0 @@
|
|||
apiVersion: 0.3.0
|
||||
kind: ksonnet.io/app
|
||||
libraries:
|
||||
kubeflow/tf-serving:
|
||||
name: tf-serving
|
||||
registry: kubeflow
|
||||
version: fed535eaa276220e4edf59530c0629f4375a40a9
|
||||
name: ks_app
|
||||
registries:
|
||||
incubator:
|
||||
protocol: github
|
||||
uri: github.com/ksonnet/parts/tree/master/incubator
|
||||
kubeflow:
|
||||
protocol: github
|
||||
uri: github.com/kubeflow/kubeflow/tree/v0.4-branch/kubeflow
|
||||
version: 0.0.1
|
||||
|
|
@ -1,39 +0,0 @@
|
|||
local env = std.extVar("__ksonnet/environments");
|
||||
local params = std.extVar("__ksonnet/params").components["mnist-deploy-aws"];
|
||||
|
||||
local k = import "k.libsonnet";
|
||||
local deployment = k.apps.v1beta1.deployment;
|
||||
local container = deployment.mixin.spec.template.spec.containersType;
|
||||
|
||||
local util = import "kubeflow/tf-serving/util.libsonnet";
|
||||
local tfserving = import "kubeflow/tf-serving/tf-serving-template.libsonnet";
|
||||
|
||||
local base = tfserving.new(env, params);
|
||||
local tfDeployment = base.tfDeployment +
|
||||
deployment.mapContainers(
|
||||
function(c) {
|
||||
result::
|
||||
c + container.withEnvMixin(
|
||||
if util.toBool(params.s3Enable) then (
|
||||
[
|
||||
{
|
||||
name: "AWS_ACCESS_KEY_ID",
|
||||
valueFrom: { secretKeyRef: { name: params.s3SecretName, key: params.s3SecretAccesskeyidKeyName } },
|
||||
},
|
||||
{
|
||||
name: "AWS_SECRET_ACCESS_KEY",
|
||||
valueFrom: { secretKeyRef: { name: params.s3SecretName, key: params.s3SecretSecretaccesskeyKeyName } },
|
||||
},
|
||||
{ name: "AWS_REGION", value: params.s3AwsRegion },
|
||||
{ name: "S3_USE_HTTPS", value: std.toString(params.s3UseHttps) },
|
||||
{ name: "S3_VERIFY_SSL", value: std.toString(params.s3VerifySsl) },
|
||||
{ name: "S3_ENDPOINT", value: params.s3Endpoint },
|
||||
]
|
||||
) else [],
|
||||
),
|
||||
}.result,
|
||||
);
|
||||
util.list([
|
||||
tfDeployment,
|
||||
base.tfservingConfig,
|
||||
],)
|
||||
|
|
@ -1,47 +0,0 @@
|
|||
local env = std.extVar("__ksonnet/environments");
|
||||
local params = std.extVar("__ksonnet/params").components["mnist-deploy-gcp"];
|
||||
|
||||
local k = import "k.libsonnet";
|
||||
local deployment = k.apps.v1beta1.deployment;
|
||||
local container = deployment.mixin.spec.template.spec.containersType;
|
||||
|
||||
local util = import "kubeflow/tf-serving/util.libsonnet";
|
||||
local tfserving = import "kubeflow/tf-serving/tf-serving-template.libsonnet";
|
||||
|
||||
local base = tfserving.new(env, params);
|
||||
local tfDeployment = base.tfDeployment +
|
||||
deployment.mixin.spec.template.spec.withVolumesMixin(
|
||||
if params.gcpCredentialSecretName != "null" then (
|
||||
[{
|
||||
name: "gcp-credentials",
|
||||
secret: {
|
||||
secretName: params.gcpCredentialSecretName,
|
||||
},
|
||||
}]
|
||||
) else [],
|
||||
) +
|
||||
deployment.mapContainers(
|
||||
function(c) {
|
||||
result::
|
||||
c + container.withEnvMixin(
|
||||
if params.gcpCredentialSecretName != "null" then (
|
||||
[{
|
||||
name: "GOOGLE_APPLICATION_CREDENTIALS",
|
||||
value: "/secret/gcp-credentials/user-gcp-sa.json",
|
||||
}]
|
||||
) else [],
|
||||
) +
|
||||
container.withVolumeMountsMixin(
|
||||
if params.gcpCredentialSecretName != "null" then (
|
||||
[{
|
||||
name: "gcp-credentials",
|
||||
mountPath: "/secret/gcp-credentials",
|
||||
}]
|
||||
) else [],
|
||||
),
|
||||
}.result,
|
||||
);
|
||||
util.list([
|
||||
tfDeployment,
|
||||
base.tfservingConfig,
|
||||
],)
|
||||
|
|
@ -1,39 +0,0 @@
|
|||
local env = std.extVar("__ksonnet/environments");
|
||||
local params = std.extVar("__ksonnet/params").components["mnist-deploy-local"];
|
||||
|
||||
local k = import "k.libsonnet";
|
||||
local deployment = k.apps.v1beta1.deployment;
|
||||
local container = deployment.mixin.spec.template.spec.containersType;
|
||||
|
||||
local util = import "kubeflow/tf-serving/util.libsonnet";
|
||||
local tfserving = import "kubeflow/tf-serving/tf-serving-template.libsonnet";
|
||||
|
||||
local base = tfserving.new(env, params);
|
||||
local tfDeployment = base.tfDeployment +
|
||||
deployment.mixin.spec.template.spec.withVolumesMixin(
|
||||
if params.pvcName != "null" && params.pvcName != "" then (
|
||||
[{
|
||||
name: "local-storage",
|
||||
persistentVolumeClaim: {
|
||||
claimName: params.pvcName,
|
||||
},
|
||||
}]
|
||||
) else [],
|
||||
) +
|
||||
deployment.mapContainers(
|
||||
function(c) {
|
||||
result::
|
||||
c + container.withVolumeMountsMixin(
|
||||
if params.pvcName != "null" && params.pvcName != "" then (
|
||||
[{
|
||||
name: "local-storage",
|
||||
mountPath: "/mnt",
|
||||
}]
|
||||
) else [],
|
||||
),
|
||||
}.result,
|
||||
);
|
||||
util.list([
|
||||
tfDeployment,
|
||||
base.tfservingConfig,
|
||||
],)
|
||||
|
|
@ -1,8 +0,0 @@
|
|||
local env = std.extVar("__ksonnet/environments");
|
||||
local params = std.extVar("__ksonnet/params").components["mnist-service"];
|
||||
|
||||
local k = import "k.libsonnet";
|
||||
local tfservingService = import "kubeflow/tf-serving/tf-serving-service-template.libsonnet";
|
||||
local util = import "kubeflow/tf-serving/util.libsonnet";
|
||||
|
||||
tfservingService.new(env, params).all
|
||||
|
|
@ -1,93 +0,0 @@
|
|||
{
|
||||
global: {},
|
||||
components: {
|
||||
train: {
|
||||
batchSize: 100,
|
||||
envVariables: 'GOOGLE_APPLICATION_CREDENTIALS=/var/secrets/user-gcp-sa.json',
|
||||
exportDir: 'gs://kubeflow-ci_temp/mnist-jlewi/export',
|
||||
image: 'gcr.io/kubeflow-examples/mnist/model:v20190111-v0.2-148-g313770f',
|
||||
learningRate: '0.01',
|
||||
modelDir: 'gs://kubeflow-ci_temp/mnist-jlewi',
|
||||
name: 'mnist-train',
|
||||
numPs: 0,
|
||||
numWorkers: 0,
|
||||
secret: '',
|
||||
secretKeyRefs: '',
|
||||
trainSteps: 200,
|
||||
pvcName: '',
|
||||
},
|
||||
"mnist-deploy-local": {
|
||||
defaultCpuImage: 'tensorflow/serving:1.11.1',
|
||||
defaultGpuImage: 'tensorflow/serving:1.11.1-gpu',
|
||||
deployHttpProxy: 'false',
|
||||
enablePrometheus: 'true',
|
||||
httpProxyImage: '',
|
||||
injectIstio: 'false',
|
||||
pvcName: '',
|
||||
modelBasePath: '/mnt/export',
|
||||
modelName: 'mnist',
|
||||
name: 'mnist-deploy-local',
|
||||
numGpus: '0',
|
||||
versionName: 'v1',
|
||||
},
|
||||
"mnist-deploy-gcp": {
|
||||
defaultCpuImage: 'tensorflow/serving:1.11.1',
|
||||
defaultGpuImage: 'tensorflow/serving:1.11.1-gpu',
|
||||
deployHttpProxy: 'false',
|
||||
enablePrometheus: 'true',
|
||||
gcpCredentialSecretName: 'user-gcp-sa',
|
||||
httpProxyImage: '',
|
||||
injectIstio: 'false',
|
||||
modelBasePath: 'gs://kubeflow-examples-data/mnist',
|
||||
modelName: 'mnist',
|
||||
name: 'mnist-deploy-gcp',
|
||||
numGpus: '0',
|
||||
versionName: 'v1',
|
||||
},
|
||||
"mnist-deploy-aws": {
|
||||
defaultCpuImage: 'tensorflow/serving:1.11.1',
|
||||
defaultGpuImage: 'tensorflow/serving:1.11.1-gpu',
|
||||
deployHttpProxy: 'false',
|
||||
enablePrometheus: 'true',
|
||||
httpProxyImage: 'gcr.io/kubeflow-images-public/tf-model-server-http-proxy:v20180723',
|
||||
injectIstio: 'false',
|
||||
modelBasePath: 's3://kubeflow-examples-data/mnist',
|
||||
modelName: 'null',
|
||||
name: 'mnist-deploy-aws',
|
||||
numGpus: '0',
|
||||
s3AwsRegion: 'us-west-1',
|
||||
s3Enable: 'false',
|
||||
s3Endpoint: 's3.us-west-1.amazonaws.com',
|
||||
s3SecretAccesskeyidKeyName: 'AWS_ACCESS_KEY_ID',
|
||||
s3SecretName: 'null',
|
||||
s3SecretSecretaccesskeyKeyName: 'AWS_SECRET_ACCESS_KEY',
|
||||
s3UseHttps: 'true',
|
||||
s3VerifySsl: 'true',
|
||||
versionName: 'v1',
|
||||
},
|
||||
"mnist-service": {
|
||||
enablePrometheus: 'true',
|
||||
injectIstio: 'false',
|
||||
modelName: 'mnist',
|
||||
name: 'mnist-service',
|
||||
serviceType: 'ClusterIP',
|
||||
trafficRule: 'v1:100',
|
||||
},
|
||||
"tensorboard": {
|
||||
envVariables: 'GOOGLE_APPLICATION_CREDENTIALS=/var/secrets/user-gcp-sa.json',
|
||||
image: "tensorflow/tensorflow:1.11.0",
|
||||
logDir: "gs://example/to/model/logdir",
|
||||
name: "tensorboard",
|
||||
secret: '',
|
||||
secretKeyRefs: '',
|
||||
},
|
||||
"web-ui": {
|
||||
containerPort: 5000,
|
||||
image: "gcr.io/kubeflow-examples/mnist/web-ui:v20190112-v0.2-142-g3b38225",
|
||||
name: "web-ui",
|
||||
replicas: 1,
|
||||
servicePort: 80,
|
||||
type: "ClusterIP",
|
||||
},
|
||||
},
|
||||
}
|
||||
|
|
@ -1,120 +0,0 @@
|
|||
// TODO: Generalize to use S3. We can follow the pattern of training that
|
||||
// takes parameters to specify environment variables and secret which can be customized
|
||||
// for GCS, S3 as needed.
|
||||
local env = std.extVar("__ksonnet/environments");
|
||||
local params = std.extVar("__ksonnet/params").components.tensorboard;
|
||||
|
||||
local util = import "util.libsonnet";
|
||||
|
||||
local k = import "k.libsonnet";
|
||||
|
||||
local name = params.name;
|
||||
local namespace = env.namespace;
|
||||
local service = {
|
||||
apiVersion: "v1",
|
||||
kind: "Service",
|
||||
metadata: {
|
||||
name: name + "-tb",
|
||||
namespace: env.namespace,
|
||||
annotations: {
|
||||
"getambassador.io/config":
|
||||
std.join("\n", [
|
||||
"---",
|
||||
"apiVersion: ambassador/v0",
|
||||
"kind: Mapping",
|
||||
"name: " + name + "_mapping",
|
||||
"prefix: /" + env.namespace + "/tensorboard/mnist",
|
||||
"rewrite: /",
|
||||
"service: " + name + "-tb." + namespace,
|
||||
"---",
|
||||
"apiVersion: ambassador/v0",
|
||||
"kind: Mapping",
|
||||
"name: " + name + "_mapping_data",
|
||||
"prefix: /" + env.namespace + "/tensorboard/mnist/data/",
|
||||
"rewrite: /data/",
|
||||
"service: " + name + "-tb." + namespace,
|
||||
]),
|
||||
}, //annotations
|
||||
},
|
||||
spec: {
|
||||
ports: [
|
||||
{
|
||||
name: "http",
|
||||
port: 80,
|
||||
targetPort: 80,
|
||||
},
|
||||
],
|
||||
selector: {
|
||||
app: "tensorboard",
|
||||
"tb-job": name,
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
local tbSecrets = util.parseSecrets(params.secretKeyRefs);
|
||||
|
||||
local secretPieces = std.split(params.secret, "=");
|
||||
local secretName = if std.length(secretPieces) > 0 then secretPieces[0] else "";
|
||||
local secretMountPath = if std.length(secretPieces) > 1 then secretPieces[1] else "";
|
||||
|
||||
local deployment = {
|
||||
apiVersion: "apps/v1beta1",
|
||||
kind: "Deployment",
|
||||
metadata: {
|
||||
name: name + "-tb",
|
||||
namespace: env.namespace,
|
||||
},
|
||||
spec: {
|
||||
replicas: 1,
|
||||
template: {
|
||||
metadata: {
|
||||
labels: {
|
||||
app: "tensorboard",
|
||||
"tb-job": name,
|
||||
},
|
||||
name: name,
|
||||
namespace: namespace,
|
||||
},
|
||||
spec: {
|
||||
containers: [
|
||||
{
|
||||
command: [
|
||||
"/usr/local/bin/tensorboard",
|
||||
"--logdir=" + params.logDir,
|
||||
"--port=80",
|
||||
],
|
||||
image: params.image,
|
||||
name: "tensorboard",
|
||||
ports: [
|
||||
{
|
||||
containerPort: 80,
|
||||
},
|
||||
],
|
||||
env: util.parseEnv(params.envVariables) + tbSecrets,
|
||||
volumeMounts: if secretMountPath != "" then
|
||||
[
|
||||
{
|
||||
name: secretName,
|
||||
mountPath: secretMountPath,
|
||||
readOnly: true,
|
||||
},
|
||||
] else [],
|
||||
},
|
||||
],
|
||||
volumes:
|
||||
if secretName != "" then
|
||||
[
|
||||
{
|
||||
name: secretName,
|
||||
secret: {
|
||||
secretName: secretName,
|
||||
},
|
||||
},
|
||||
] else [],
|
||||
},
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
std.prune(k.core.v1.list.new([service, deployment]))
|
||||
|
||||
|
|
@ -1,117 +0,0 @@
|
|||
// Component to train a model.
|
||||
//
|
||||
// Parameters are used to control training
|
||||
// image: Docker iamge to use
|
||||
// modelDir: Location to write the model this can be a local path (e.g. to a PV)
|
||||
// or it can be any filesystem URI that TF understands (e.g GCS, S3, HDFS)
|
||||
// exportDir: Location to export the model
|
||||
// trainSteps: Number of training steps to run
|
||||
// batchSize: Batch size
|
||||
// learningRate: Learning rate
|
||||
// envVariables: Comma separated list of environment variables to set.
|
||||
// Use this to set environment variables needed to configure S3 access.
|
||||
// numWorkers: Number of workers
|
||||
// numPs: Number of parameter servers
|
||||
//
|
||||
local k = import "k.libsonnet";
|
||||
local env = std.extVar("__ksonnet/environments");
|
||||
local params = std.extVar("__ksonnet/params").components.train;
|
||||
|
||||
local util = import "util.libsonnet";
|
||||
|
||||
local trainSecrets = util.parseSecrets(params.secretKeyRefs);
|
||||
|
||||
local secretPieces = std.split(params.secret, "=");
|
||||
local secretName = if std.length(secretPieces) > 0 then secretPieces[0] else "";
|
||||
local secretMountPath = if std.length(secretPieces) > 1 then secretPieces[1] else "";
|
||||
|
||||
local replicaSpec = {
|
||||
containers: [
|
||||
{
|
||||
command: [
|
||||
"/usr/bin/python",
|
||||
"/opt/model.py",
|
||||
],
|
||||
args: [
|
||||
"--tf-model-dir=" + params.modelDir,
|
||||
"--tf-export-dir=" + params.exportDir,
|
||||
"--tf-train-steps=" + params.trainSteps,
|
||||
"--tf-batch-size=" + params.batchSize,
|
||||
"--tf-learning-rate=" + params.learningRate,
|
||||
],
|
||||
env: util.parseEnv(params.envVariables) + trainSecrets,
|
||||
image: params.image,
|
||||
name: "tensorflow",
|
||||
volumeMounts: if secretMountPath != "" then
|
||||
[
|
||||
{
|
||||
name: secretName,
|
||||
mountPath: secretMountPath,
|
||||
readOnly: true,
|
||||
},
|
||||
] else if params.pvcName != "null" && params.pvcName != "" then
|
||||
[
|
||||
{
|
||||
name: "local-storage",
|
||||
mountPath: "/mnt",
|
||||
},
|
||||
] else [],
|
||||
workingDir: "/opt",
|
||||
},
|
||||
],
|
||||
volumes:
|
||||
if secretName != "" then
|
||||
[
|
||||
{
|
||||
name: secretName,
|
||||
secret: {
|
||||
secretName: secretName,
|
||||
},
|
||||
},
|
||||
] else if params.pvcName != "null" && params.pvcName != "" then
|
||||
[
|
||||
{
|
||||
name: "local-storage",
|
||||
persistentVolumeClaim: {
|
||||
claimName: params.pvcName,
|
||||
},
|
||||
},
|
||||
] else [],
|
||||
restartPolicy: "OnFailure",
|
||||
};
|
||||
|
||||
|
||||
local tfjob = {
|
||||
apiVersion: "kubeflow.org/v1beta1",
|
||||
kind: "TFJob",
|
||||
metadata: {
|
||||
name: params.name,
|
||||
namespace: env.namespace,
|
||||
},
|
||||
spec: {
|
||||
tfReplicaSpecs: {
|
||||
Chief: {
|
||||
replicas: 1,
|
||||
template: {
|
||||
spec: replicaSpec,
|
||||
},
|
||||
},
|
||||
[if params.numWorkers > 0 then "Worker"]: {
|
||||
replicas: params.numWorkers,
|
||||
template: {
|
||||
spec: replicaSpec,
|
||||
},
|
||||
},
|
||||
[if params.numWorkers > 0 then "Ps"]: {
|
||||
replicas: params.numPs,
|
||||
template: {
|
||||
spec: replicaSpec,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
k.core.v1.list.new([
|
||||
tfjob,
|
||||
])
|
||||
|
|
@ -1,41 +0,0 @@
|
|||
{
|
||||
// convert a list of two items into a map representing an environment variable
|
||||
// TODO(jlewi): Should we move this into kubeflow/core/util.libsonnet
|
||||
listToMap:: function(v)
|
||||
{
|
||||
name: v[0],
|
||||
value: v[1],
|
||||
},
|
||||
|
||||
// convert a list of two items into a map representing an env variable referencing k8s secret
|
||||
listToSecretMap:: function(v)
|
||||
{
|
||||
name: v[0],
|
||||
valueFrom: {
|
||||
secretKeyRef: {
|
||||
name: std.split(v[1], ".")[0],
|
||||
key: std.split(v[1], ".")[1],
|
||||
}
|
||||
}
|
||||
},
|
||||
|
||||
// Function to turn comma separated list of environment variables into a dictionary.
|
||||
parseEnv:: function(v)
|
||||
local pieces = std.split(v, ",");
|
||||
if v != "" && std.length(pieces) > 0 then
|
||||
std.map(
|
||||
function(i) $.listToMap(std.split(i, "=")),
|
||||
std.split(v, ",")
|
||||
)
|
||||
else [],
|
||||
|
||||
// Function to turn comma separated list of env variables referencing secrets into a dictionary.
|
||||
parseSecrets:: function(v)
|
||||
local pieces = std.split(v, ",");
|
||||
if v != "" && std.length(pieces) > 0 then
|
||||
std.map(
|
||||
function(i) $.listToSecretMap(std.split(i, "=")),
|
||||
std.split(v, ",")
|
||||
)
|
||||
else [],
|
||||
}
|
||||
|
|
@ -1,72 +0,0 @@
|
|||
local env = std.extVar("__ksonnet/environments");
|
||||
local params = std.extVar("__ksonnet/params").components["web-ui"];
|
||||
[
|
||||
{
|
||||
"apiVersion": "v1",
|
||||
"kind": "Service",
|
||||
"metadata": {
|
||||
"name": params.name,
|
||||
"namespace": env.namespace,
|
||||
annotations: {
|
||||
"getambassador.io/config":
|
||||
std.join("\n", [
|
||||
"---",
|
||||
"apiVersion: ambassador/v0",
|
||||
"kind: Mapping",
|
||||
"name: " + params.name + "_mapping",
|
||||
"prefix: /" + env.namespace + "/mnist/",
|
||||
"rewrite: /",
|
||||
"service: " + params.name + "." + env.namespace,
|
||||
]),
|
||||
}, //annotations
|
||||
},
|
||||
"spec": {
|
||||
"ports": [
|
||||
{
|
||||
"port": params.servicePort,
|
||||
"targetPort": params.containerPort
|
||||
}
|
||||
],
|
||||
"selector": {
|
||||
"app": params.name
|
||||
},
|
||||
"type": params.type
|
||||
}
|
||||
},
|
||||
{
|
||||
"apiVersion": "apps/v1beta2",
|
||||
"kind": "Deployment",
|
||||
"metadata": {
|
||||
"name": params.name,
|
||||
"namespace": env.namespace,
|
||||
},
|
||||
"spec": {
|
||||
"replicas": params.replicas,
|
||||
"selector": {
|
||||
"matchLabels": {
|
||||
"app": params.name
|
||||
},
|
||||
},
|
||||
"template": {
|
||||
"metadata": {
|
||||
"labels": {
|
||||
"app": params.name
|
||||
}
|
||||
},
|
||||
"spec": {
|
||||
"containers": [
|
||||
{
|
||||
"image": params.image,
|
||||
"name": params.name,
|
||||
"ports": [
|
||||
{
|
||||
"containerPort": params.containerPort
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
|
|
@ -1,4 +0,0 @@
|
|||
local components = std.extVar("__ksonnet/components");
|
||||
components + {
|
||||
// Insert user-specified overrides here.
|
||||
}
|
||||
|
|
@ -1,73 +0,0 @@
|
|||
<!-- START doctoc generated TOC please keep comment here to allow auto update -->
|
||||
<!-- DON'T EDIT THIS SECTION, INSTEAD RE-RUN doctoc TO UPDATE -->
|
||||
**Table of Contents** *generated with [DocToc](https://github.com/thlorenz/doctoc)*
|
||||
|
||||
- [tf-serving](#tf-serving)
|
||||
- [Quickstart](#quickstart)
|
||||
- [Using the library](#using-the-library)
|
||||
- [io.ksonnet.pkg.tf-serving](#ioksonnetpkgtf-serving)
|
||||
- [Example](#example)
|
||||
- [Parameters](#parameters)
|
||||
|
||||
<!-- END doctoc generated TOC please keep comment here to allow auto update -->
|
||||
|
||||
# tf-serving
|
||||
|
||||
> TensorFlow serving is a server for TensorFlow models.
|
||||
|
||||
|
||||
* [Quickstart](#quickstart)
|
||||
* [Using Prototypes](#using-prototypes)
|
||||
* [io.ksonnet.pkg.tf-serving](#io.ksonnet.pkg.tf-serving)
|
||||
|
||||
## Quickstart
|
||||
|
||||
*The following commands use the `io.ksonnet.pkg.tf-serving` prototype to generate Kubernetes YAML for tf-serving, and then deploys it to your Kubernetes cluster.*
|
||||
|
||||
First, create a cluster and install the ksonnet CLI (see root-level [README.md](rootReadme)).
|
||||
|
||||
If you haven't yet created a [ksonnet application](linkToSomewhere), do so using `ks init <app-name>`.
|
||||
|
||||
Finally, in the ksonnet application directory, run the following:
|
||||
|
||||
```shell
|
||||
# Expand prototype as a Jsonnet file, place in a file in the
|
||||
# `components/` directory. (YAML and JSON are also available.)
|
||||
$ ks prototype use io.ksonnet.pkg.tf-serving tf-serving \
|
||||
--name tf-serving \
|
||||
--namespace default
|
||||
|
||||
# Apply to server.
|
||||
$ ks apply -f tf-serving.jsonnet
|
||||
```
|
||||
|
||||
## Using the library
|
||||
|
||||
The library files for tf-serving define a set of relevant *parts* (_e.g._, deployments, services, secrets, and so on) that can be combined to configure tf-serving for a wide variety of scenarios. For example, a database like Redis may need a secret to hold the user password, or it may have no password if it's acting as a cache.
|
||||
|
||||
This library provides a set of pre-fabricated "flavors" (or "distributions") of tf-serving, each of which is configured for a different use case. These are captured as ksonnet *prototypes*, which allow users to interactively customize these distributions for their specific needs.
|
||||
|
||||
These prototypes, as well as how to use them, are enumerated below.
|
||||
|
||||
### io.ksonnet.pkg.tf-serving
|
||||
|
||||
TensorFlow serving
|
||||
#### Example
|
||||
|
||||
```shell
|
||||
# Expand prototype as a Jsonnet file, place in a file in the
|
||||
# `components/` directory. (YAML and JSON are also available.)
|
||||
$ ks prototype use io.ksonnet.pkg.tf-serving tf-serving \
|
||||
--name YOUR_NAME_HERE \
|
||||
--model_path YOUR_MODEL_PATH_HERE
|
||||
```
|
||||
|
||||
#### Parameters
|
||||
|
||||
The available options to pass prototype are:
|
||||
|
||||
* `--name=<name>`: Name to give to each of the components [string]
|
||||
* `--model_path=<model_path>`: Path to the model. This can be a GCS path. [string]
|
||||
|
||||
|
||||
[rootReadme]: https://github.com/ksonnet/mixins
|
||||
|
|
@ -1,35 +0,0 @@
|
|||
{
|
||||
"name": "tf-serving",
|
||||
"apiVersion": "0.0.1",
|
||||
"kind": "ksonnet.io/parts",
|
||||
"description": "TensorFlow serving is a server for TensorFlow models.\n",
|
||||
"author": "kubeflow team <kubeflow-team@google.com>",
|
||||
"contributors": [
|
||||
{
|
||||
"name": "Jeremy Lewi",
|
||||
"email": "jlewi@google.com"
|
||||
}
|
||||
],
|
||||
"repository": {
|
||||
"type": "git",
|
||||
"url": "https://github.com/kubeflow/kubeflow"
|
||||
},
|
||||
"bugs": {
|
||||
"url": "https://github.com/kubeflow/kubeflow/issues"
|
||||
},
|
||||
"keywords": [
|
||||
"kubeflow",
|
||||
"tensorflow",
|
||||
"database"
|
||||
],
|
||||
"quickStart": {
|
||||
"prototype": "io.ksonnet.pkg.tf-serving",
|
||||
"componentName": "tf-serving",
|
||||
"flags": {
|
||||
"name": "tf-serving",
|
||||
"namespace": "default"
|
||||
},
|
||||
"comment": "Run TensorFlow Serving"
|
||||
},
|
||||
"license": "Apache 2.0"
|
||||
}
|
||||
|
|
@ -1,23 +0,0 @@
|
|||
// @apiVersion 0.1
|
||||
// @name io.ksonnet.pkg.tf-serving
|
||||
// @description TensorFlow serving
|
||||
// @shortDescription A TensorFlow serving deployment
|
||||
// @param name string Name to give to each of the components
|
||||
|
||||
local k = import "k.libsonnet";
|
||||
|
||||
// ksonnet appears to require name be a parameter of the prototype which is why we handle it differently.
|
||||
local name = import "param://name";
|
||||
|
||||
// updatedParams includes the namespace from env by default.
|
||||
local updatedParams = params + env;
|
||||
|
||||
local tfServingBase = import "kubeflow/tf-serving/tf-serving.libsonnet";
|
||||
local tfServing = tfServingBase {
|
||||
// Override parameters with user supplied parameters.
|
||||
params+: updatedParams {
|
||||
name: name,
|
||||
},
|
||||
};
|
||||
|
||||
std.prune(k.core.v1.list.new(tfServing.components))
|
||||
|
|
@ -1,61 +0,0 @@
|
|||
// @apiVersion 0.1
|
||||
// @name io.ksonnet.pkg.tf-serving-deployment-aws
|
||||
// @description TensorFlow serving
|
||||
// @shortDescription A TensorFlow serving deployment
|
||||
// @param name string Name to give to each of the components
|
||||
// @optionalParam numGpus string 0 Number of gpus to use
|
||||
// @optionalParam deployHttpProxy string false Whether to deploy http proxy
|
||||
// @optionalParam injectIstio string false Whether to inject istio sidecar; should be true or false.
|
||||
// @optionalParam enablePrometheus string true Whether to enable prometheus endpoint (requires TF 1.11)
|
||||
// @optionalParam modelBasePath string s3://kubeflow-examples-data/mnist The model path
|
||||
// @optionalParam modelName string null The model name
|
||||
// @optionalParam versionName string v1 The version name
|
||||
// @optionalParam defaultCpuImage string tensorflow/serving:1.11.1 The default model server image (cpu)
|
||||
// @optionalParam defaultGpuImage string tensorflow/serving:1.11.1-gpu The default model server image (gpu)
|
||||
// @optionalParam httpProxyImage string gcr.io/kubeflow-images-public/tf-model-server-http-proxy:v20180723 Http proxy image
|
||||
// @optionalParam s3Enable string false Whether to enable S3
|
||||
// Following parameters are needed only if s3Enable is true
|
||||
// @optionalParam s3SecretName string null Name of the k8s secrets containing S3 credentials
|
||||
// @optionalParam s3SecretAccesskeyidKeyName string AWS_ACCESS_KEY_ID Name of the key in the k8s secret containing AWS_ACCESS_KEY_ID
|
||||
// @optionalParam s3SecretSecretaccesskeyKeyName string AWS_SECRET_ACCESS_KEY Name of the key in the k8s secret containing AWS_SECRET_ACCESS_KEY
|
||||
// @optionalParam s3AwsRegion string us-west-1 S3 region
|
||||
// @optionalParam s3UseHttps string true Whether or not to use https
|
||||
// @optionalParam s3VerifySsl string true Whether or not to verify https certificates for S3 connections
|
||||
// @optionalParam s3Endpoint string s3.us-west-1.amazonaws.com URL for your s3-compatible endpoint
|
||||
|
||||
local k = import "k.libsonnet";
|
||||
local deployment = k.apps.v1beta1.deployment;
|
||||
local container = deployment.mixin.spec.template.spec.containersType;
|
||||
|
||||
local util = import "kubeflow/tf-serving/util.libsonnet";
|
||||
local tfserving = import "kubeflow/tf-serving/tf-serving-template.libsonnet";
|
||||
|
||||
local base = tfserving.new(env, params);
|
||||
local tfDeployment = base.tfDeployment +
|
||||
deployment.mapContainers(
|
||||
function(c) {
|
||||
result::
|
||||
c + container.withEnvMixin(
|
||||
if util.toBool(params.s3Enable) then (
|
||||
[
|
||||
{
|
||||
name: "AWS_ACCESS_KEY_ID",
|
||||
valueFrom: { secretKeyRef: { name: params.s3SecretName, key: params.s3SecretAccesskeyidKeyName } },
|
||||
},
|
||||
{
|
||||
name: "AWS_SECRET_ACCESS_KEY",
|
||||
valueFrom: { secretKeyRef: { name: params.s3SecretName, key: params.s3SecretSecretaccesskeyKeyName } },
|
||||
},
|
||||
{ name: "AWS_REGION", value: params.s3AwsRegion },
|
||||
{ name: "S3_USE_HTTPS", value: std.toString(params.s3UseHttps) },
|
||||
{ name: "S3_VERIFY_SSL", value: std.toString(params.s3VerifySsl) },
|
||||
{ name: "S3_ENDPOINT", value: params.s3Endpoint },
|
||||
]
|
||||
) else [],
|
||||
),
|
||||
}.result,
|
||||
);
|
||||
util.list([
|
||||
tfDeployment,
|
||||
base.tfservingConfig,
|
||||
],)
|
||||
|
|
@ -1,61 +0,0 @@
|
|||
// @apiVersion 0.1
|
||||
// @name io.ksonnet.pkg.tf-serving-deployment-gcp
|
||||
// @description TensorFlow serving
|
||||
// @shortDescription A TensorFlow serving deployment
|
||||
// @param name string Name to give to each of the components
|
||||
// @optionalParam numGpus string 0 Number of gpus to use
|
||||
// @optionalParam deployHttpProxy string false Whether to deploy http proxy
|
||||
// @optionalParam modelBasePath string gs://kubeflow-examples-data/mnist The model path
|
||||
// @optionalParam modelName string null The model name
|
||||
// @optionalParam versionName string v1 The version name
|
||||
// @optionalParam defaultCpuImage string tensorflow/serving:1.11.1 The default model server image (cpu)
|
||||
// @optionalParam defaultGpuImage string tensorflow/serving:1.11.1-gpu The default model server image (gpu)
|
||||
// @optionalParam httpProxyImage string gcr.io/kubeflow-images-public/tf-model-server-http-proxy:v20180723 Http proxy image
|
||||
// @optionalParam gcpCredentialSecretName string null If not empty, insert the secret credential
|
||||
// @optionalParam injectIstio string false Whether to inject istio sidecar; should be true or false.
|
||||
// @optionalParam enablePrometheus string true Whether to enable prometheus endpoint (requires TF 1.11)
|
||||
|
||||
local k = import "k.libsonnet";
|
||||
local deployment = k.apps.v1beta1.deployment;
|
||||
local container = deployment.mixin.spec.template.spec.containersType;
|
||||
|
||||
local util = import "kubeflow/tf-serving/util.libsonnet";
|
||||
local tfserving = import "kubeflow/tf-serving/tf-serving-template.libsonnet";
|
||||
|
||||
local base = tfserving.new(env, params);
|
||||
local tfDeployment = base.tfDeployment +
|
||||
deployment.mixin.spec.template.spec.withVolumesMixin(
|
||||
if params.gcpCredentialSecretName != "null" then (
|
||||
[{
|
||||
name: "gcp-credentials",
|
||||
secret: {
|
||||
secretName: params.gcpCredentialSecretName,
|
||||
},
|
||||
}]
|
||||
) else [],
|
||||
) +
|
||||
deployment.mapContainers(
|
||||
function(c) {
|
||||
result::
|
||||
c + container.withEnvMixin(
|
||||
if params.gcpCredentialSecretName != "null" then (
|
||||
[{
|
||||
name: "GOOGLE_APPLICATION_CREDENTIALS",
|
||||
value: "/secret/gcp-credentials/user-gcp-sa.json",
|
||||
}]
|
||||
) else [],
|
||||
) +
|
||||
container.withVolumeMountsMixin(
|
||||
if params.gcpCredentialSecretName != "null" then (
|
||||
[{
|
||||
name: "gcp-credentials",
|
||||
mountPath: "/secret/gcp-credentials",
|
||||
}]
|
||||
) else [],
|
||||
),
|
||||
}.result,
|
||||
);
|
||||
util.list([
|
||||
tfDeployment,
|
||||
base.tfservingConfig,
|
||||
],)
|
||||
|
|
@ -1,16 +0,0 @@
|
|||
// @apiVersion 0.1
|
||||
// @name io.ksonnet.pkg.tf-serving-service
|
||||
// @description TensorFlow serving
|
||||
// @shortDescription A TensorFlow serving model
|
||||
// @param name string Name to give to each of the components
|
||||
// @optionalParam serviceType string ClusterIP The k8s service type for tf serving.
|
||||
// @optionalParam modelName string null The model name
|
||||
// @optionalParam trafficRule string v1:100 The traffic rule, in the format of version:percentage,version:percentage,..
|
||||
// @optionalParam injectIstio string false Whether to inject istio sidecar; should be true or false.
|
||||
// @optionalParam enablePrometheus string true Whether to enable prometheus endpoint (requires TF 1.11)
|
||||
|
||||
local k = import "k.libsonnet";
|
||||
local tfservingService = import "kubeflow/tf-serving/tf-serving-service-template.libsonnet";
|
||||
local util = import "kubeflow/tf-serving/util.libsonnet";
|
||||
|
||||
tfservingService.new(env, params).all
|
||||
|
|
@ -1,230 +0,0 @@
|
|||
// @apiVersion 0.1
|
||||
// @name io.ksonnet.pkg.tf-serving-request-log
|
||||
// @description tf-serving with request logging
|
||||
// @shortDescription tf-serving with request logging
|
||||
// @param name string Name to give to each of the components
|
||||
// @param gcpProject string The gcp project for Bigquery dataset
|
||||
// @param dataset string The Bigquery dataset
|
||||
// @param table string The Bigquery table
|
||||
// @optionalParam modelBasePath string gs://kubeflow-examples-data/mnist The model path
|
||||
// @optionalParam modelName string mnist The model name
|
||||
|
||||
local k = import "k.libsonnet";
|
||||
|
||||
local namespace = "kubeflow";
|
||||
local appName = import "param://name";
|
||||
local image = "gcr.io/kubeflow-images-public/tf-model-server-cpu:v20180327-995786ec";
|
||||
local httpProxyImage = "gcr.io/kubeflow-images-public/tf-model-server-http-proxy:v20180723";
|
||||
local loggingImage = "gcr.io/kubeflow-images-public/tf-model-server-request-logger:v20180723";
|
||||
|
||||
local gcpSecretName = "user-gcp-sa";
|
||||
|
||||
local service = {
|
||||
apiVersion: "v1",
|
||||
kind: "Service",
|
||||
metadata: {
|
||||
labels: {
|
||||
app: appName,
|
||||
},
|
||||
name: appName,
|
||||
namespace: namespace,
|
||||
},
|
||||
spec: {
|
||||
ports: [
|
||||
{
|
||||
name: "grpc-tf-serving",
|
||||
port: 9000,
|
||||
targetPort: 9000,
|
||||
},
|
||||
{
|
||||
name: "http-tf-serving-proxy",
|
||||
port: 8000,
|
||||
targetPort: 8000,
|
||||
},
|
||||
],
|
||||
selector: {
|
||||
app: appName,
|
||||
},
|
||||
type: "ClusterIP",
|
||||
},
|
||||
};
|
||||
|
||||
local configMap = {
|
||||
apiVersion: "v1",
|
||||
kind: "ConfigMap",
|
||||
metadata: {
|
||||
name: appName + "fluentd-config",
|
||||
namespace: namespace,
|
||||
},
|
||||
data: {
|
||||
"fluent.conf": std.format(|||
|
||||
<source>
|
||||
@type tail
|
||||
path /tmp/logs/request.log
|
||||
pos_file /tmp/logs/request.log.pos
|
||||
<parse>
|
||||
@type json
|
||||
</parse>
|
||||
tag dummy
|
||||
</source>
|
||||
<match dummy>
|
||||
@type bigquery_insert
|
||||
auth_method application_default
|
||||
project %s
|
||||
dataset %s
|
||||
table %s
|
||||
fetch_schema true
|
||||
</match>
|
||||
|||, [params.gcpProject, params.dataset, params.table]),
|
||||
},
|
||||
};
|
||||
|
||||
local deployment = {
|
||||
apiVersion: "extensions/v1beta1",
|
||||
kind: "Deployment",
|
||||
metadata: {
|
||||
labels: {
|
||||
app: appName,
|
||||
},
|
||||
name: appName,
|
||||
namespace: namespace,
|
||||
},
|
||||
spec: {
|
||||
template: {
|
||||
metadata: {
|
||||
labels: {
|
||||
app: appName,
|
||||
},
|
||||
},
|
||||
spec: {
|
||||
containers: [
|
||||
// ModelServer
|
||||
{
|
||||
args: [
|
||||
"/usr/bin/tensorflow_model_server",
|
||||
"--port=9000",
|
||||
"--model_name=" + params.modelName,
|
||||
"--model_base_path=" + params.modelBasePath,
|
||||
],
|
||||
image: image,
|
||||
imagePullPolicy: "IfNotPresent",
|
||||
name: "model-server",
|
||||
ports: [
|
||||
{
|
||||
containerPort: 9000,
|
||||
},
|
||||
],
|
||||
resources: {
|
||||
limits: {
|
||||
cpu: "4",
|
||||
memory: "4Gi",
|
||||
},
|
||||
requests: {
|
||||
cpu: "1",
|
||||
memory: "1Gi",
|
||||
},
|
||||
},
|
||||
},
|
||||
// Http proxy
|
||||
{
|
||||
name: "http-proxy",
|
||||
image: httpProxyImage,
|
||||
imagePullPolicy: "Always",
|
||||
command: [
|
||||
"python",
|
||||
"/usr/src/app/server.py",
|
||||
"--port=8000",
|
||||
"--rpc_port=9000",
|
||||
"--rpc_timeout=10.0",
|
||||
"--log_request=true",
|
||||
],
|
||||
env: [],
|
||||
ports: [
|
||||
{
|
||||
containerPort: 8000,
|
||||
},
|
||||
],
|
||||
resources: {
|
||||
requests: {
|
||||
memory: "1Gi",
|
||||
cpu: "1",
|
||||
},
|
||||
limits: {
|
||||
memory: "4Gi",
|
||||
cpu: "4",
|
||||
},
|
||||
},
|
||||
securityContext: {
|
||||
runAsUser: 1000,
|
||||
fsGroup: 1000,
|
||||
},
|
||||
volumeMounts: [
|
||||
{
|
||||
name: "request-logs",
|
||||
mountPath: "/tmp/logs",
|
||||
},
|
||||
],
|
||||
},
|
||||
// TODO(lunkai): use admission controller to inject.
|
||||
// Logging container.
|
||||
{
|
||||
name: "logging",
|
||||
image: loggingImage,
|
||||
imagePullPolicy: "Always",
|
||||
env: [
|
||||
{ name: "GOOGLE_APPLICATION_CREDENTIALS", value: "/secret/gcp-credentials/key.json" },
|
||||
],
|
||||
resources: {
|
||||
requests: {
|
||||
memory: "250Mi",
|
||||
cpu: "0.25",
|
||||
},
|
||||
limits: {
|
||||
memory: "500Mi",
|
||||
cpu: "0.5",
|
||||
},
|
||||
},
|
||||
volumeMounts: [
|
||||
{
|
||||
name: "request-logs",
|
||||
mountPath: "/tmp/logs",
|
||||
},
|
||||
{
|
||||
name: "gcp-credentials",
|
||||
mountPath: "/secret/gcp-credentials",
|
||||
},
|
||||
{
|
||||
name: "fluentd-config-volume",
|
||||
mountPath: "/fluentd/etc/custom",
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
volumes: [
|
||||
{
|
||||
name: "gcp-credentials",
|
||||
secret: {
|
||||
secretName: gcpSecretName,
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "request-logs",
|
||||
emptyDir: {},
|
||||
},
|
||||
{
|
||||
configMap: {
|
||||
name: "fluentd-config",
|
||||
},
|
||||
name: "fluentd-config-volume",
|
||||
},
|
||||
],
|
||||
},
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
k.core.v1.list.new([
|
||||
service,
|
||||
deployment,
|
||||
configMap,
|
||||
])
|
||||
|
|
@ -1,112 +0,0 @@
|
|||
local tfservingService = import "kubeflow/tf-serving/tf-serving-service-template.libsonnet";
|
||||
local tfserving = import "kubeflow/tf-serving/tf-serving-template.libsonnet";
|
||||
|
||||
local params = {
|
||||
name: "m",
|
||||
serviceType: "ClusterIP",
|
||||
modelName: "mnist",
|
||||
trafficRule: "v1:100",
|
||||
injectIstio: false,
|
||||
};
|
||||
|
||||
local istioParams = params {
|
||||
injectIstio: true,
|
||||
};
|
||||
|
||||
local env = {
|
||||
namespace: "kubeflow",
|
||||
};
|
||||
|
||||
local deploymentParam = {
|
||||
name: "m",
|
||||
modelName: "mnist",
|
||||
versionName: "v1",
|
||||
modelBasePath: "gs://abc",
|
||||
numGpus: 0,
|
||||
defaultCpuImage: "gcr.io/abc",
|
||||
defaultGpuImage: "gcr.io/abc",
|
||||
injectIstio: false,
|
||||
enablePrometheus: true,
|
||||
};
|
||||
|
||||
local gpuParam1 = {
|
||||
name: "m",
|
||||
modelName: "mnist",
|
||||
versionName: "v1",
|
||||
modelBasePath: "gs://abc",
|
||||
numGpus: 1,
|
||||
defaultCpuImage: "gcr.io/abc",
|
||||
defaultGpuImage: "gcr.io/abc",
|
||||
injectIstio: false,
|
||||
enablePrometheus: true,
|
||||
};
|
||||
|
||||
local gpuParamString0 = {
|
||||
name: "m",
|
||||
modelName: "mnist",
|
||||
versionName: "v1",
|
||||
modelBasePath: "gs://abc",
|
||||
numGpus: "0",
|
||||
defaultCpuImage: "gcr.io/abc",
|
||||
defaultGpuImage: "gcr.io/abc",
|
||||
injectIstio: false,
|
||||
enablePrometheus: true,
|
||||
};
|
||||
|
||||
local gpuParamString1 = {
|
||||
name: "m",
|
||||
modelName: "mnist",
|
||||
versionName: "v1",
|
||||
modelBasePath: "gs://abc",
|
||||
numGpus: "1",
|
||||
defaultCpuImage: "gcr.io/abc",
|
||||
defaultGpuImage: "gcr.io/abc",
|
||||
injectIstio: false,
|
||||
enablePrometheus: true,
|
||||
};
|
||||
|
||||
local serviceInstance = tfservingService.new(env, params);
|
||||
local istioServiceInstance = tfservingService.new(env, istioParams);
|
||||
|
||||
local deploymentInstance = tfserving.new(env, deploymentParam);
|
||||
|
||||
local gpuInstance = tfserving.new(env, gpuParam1);
|
||||
local gpuString0Instance = tfserving.new(env, gpuParamString0);
|
||||
local gpuString1Instance = tfserving.new(env, gpuParamString1);
|
||||
|
||||
// This one should only have tfService
|
||||
std.assertEqual(
|
||||
std.length(serviceInstance.all.items),
|
||||
1,
|
||||
) &&
|
||||
|
||||
// This one should have tfService, virtualService, and DestinationRule
|
||||
std.assertEqual(
|
||||
std.length(istioServiceInstance.all.items),
|
||||
3
|
||||
) &&
|
||||
|
||||
std.startsWith(
|
||||
deploymentInstance.tfDeployment.spec.template.spec.containers[0].args[4],
|
||||
"--monitoring_config_file"
|
||||
) &&
|
||||
|
||||
std.assertEqual(
|
||||
deploymentInstance.tfDeployment.spec.template.spec.containers[0].resources.limits,
|
||||
{ cpu: "4", memory: "4Gi" }
|
||||
) &&
|
||||
|
||||
std.assertEqual(
|
||||
gpuInstance.tfDeployment.spec.template.spec.containers[0].resources.limits,
|
||||
{ cpu: "4", memory: "4Gi", "nvidia.com/gpu": 1 }
|
||||
) &&
|
||||
|
||||
std.assertEqual(
|
||||
gpuString0Instance.tfDeployment.spec.template.spec.containers[0].resources.limits,
|
||||
{ cpu: "4", memory: "4Gi" }
|
||||
) &&
|
||||
|
||||
std.assertEqual(
|
||||
gpuString1Instance.tfDeployment.spec.template.spec.containers[0].resources.limits,
|
||||
{ cpu: "4", memory: "4Gi", "nvidia.com/gpu": 1 }
|
||||
)
|
||||
|
|
@ -1,147 +0,0 @@
|
|||
{
|
||||
local k = import "k.libsonnet",
|
||||
local util = import "kubeflow/tf-serving/util.libsonnet",
|
||||
new(_env, _params):: {
|
||||
local params = _params + _env,
|
||||
local namespace = params.namespace,
|
||||
local name = params.name,
|
||||
local modelName =
|
||||
if params.modelName == "null" then
|
||||
params.name
|
||||
else
|
||||
params.modelName,
|
||||
|
||||
local tfService = {
|
||||
apiVersion: "v1",
|
||||
kind: "Service",
|
||||
metadata: {
|
||||
labels: {
|
||||
app: modelName,
|
||||
},
|
||||
name: name,
|
||||
namespace: namespace,
|
||||
annotations: {
|
||||
"getambassador.io/config":
|
||||
std.join("\n", [
|
||||
"---",
|
||||
"apiVersion: ambassador/v0",
|
||||
"kind: Mapping",
|
||||
"name: tfserving-predict-mapping-" + modelName,
|
||||
"prefix: /tfserving/models/" + modelName,
|
||||
"rewrite: /v1/models/" + modelName + ":predict",
|
||||
"method: POST",
|
||||
"service: " + name + "." + namespace + ":8500",
|
||||
"---",
|
||||
"apiVersion: ambassador/v0",
|
||||
"kind: Mapping",
|
||||
"name: tfserving-predict-mapping-" + modelName + "-get",
|
||||
"prefix: /tfserving/models/" + modelName,
|
||||
"rewrite: /v1/models/" + modelName,
|
||||
"method: GET",
|
||||
"service: " + name + "." + namespace + ":8500",
|
||||
]),
|
||||
} + if util.toBool(params.enablePrometheus) then {
|
||||
"prometheus.io/scrape": "true",
|
||||
"prometheus.io/path": "/monitoring/prometheus/metrics",
|
||||
"prometheus.io/port": "8500",
|
||||
} else {}, //annotations
|
||||
},
|
||||
spec: {
|
||||
ports: [
|
||||
{
|
||||
name: "grpc-tf-serving",
|
||||
port: 9000,
|
||||
targetPort: 9000,
|
||||
},
|
||||
{
|
||||
name: "http-tf-serving",
|
||||
port: 8500,
|
||||
targetPort: 8500,
|
||||
},
|
||||
],
|
||||
selector: {
|
||||
app: modelName,
|
||||
},
|
||||
type: params.serviceType,
|
||||
},
|
||||
}, // tfService
|
||||
tfService:: tfService,
|
||||
|
||||
local versionWeights = std.split(params.trafficRule, ","),
|
||||
local virtualService = {
|
||||
apiVersion: "networking.istio.io/v1alpha3",
|
||||
kind: "VirtualService",
|
||||
metadata: {
|
||||
name: name,
|
||||
namespace: namespace,
|
||||
},
|
||||
spec: {
|
||||
hosts: [
|
||||
"*",
|
||||
],
|
||||
gateways: [
|
||||
"kubeflow-gateway",
|
||||
],
|
||||
http: [
|
||||
{
|
||||
match: [
|
||||
{
|
||||
uri: {
|
||||
prefix: "/istio/tfserving/models/" + modelName,
|
||||
},
|
||||
method: {
|
||||
exact: "POST",
|
||||
},
|
||||
},
|
||||
],
|
||||
rewrite: {
|
||||
uri: "/v1/models/" + modelName + ":predict",
|
||||
},
|
||||
route: [
|
||||
{
|
||||
destination: {
|
||||
host: name,
|
||||
port: {
|
||||
number: 8500,
|
||||
},
|
||||
subset: std.split(versionWeight, ":")[0],
|
||||
},
|
||||
weight: std.parseInt(std.split(versionWeight, ":")[1]),
|
||||
}
|
||||
for versionWeight in versionWeights
|
||||
],
|
||||
},
|
||||
],
|
||||
},
|
||||
},
|
||||
virtualService:: virtualService,
|
||||
|
||||
local destinationRule = {
|
||||
apiVersion: "networking.istio.io/v1alpha3",
|
||||
kind: "DestinationRule",
|
||||
metadata: {
|
||||
name: name,
|
||||
namespace: namespace,
|
||||
},
|
||||
spec: {
|
||||
host: name,
|
||||
subsets: [
|
||||
{
|
||||
name: std.split(versionWeight, ":")[0],
|
||||
labels: {
|
||||
version: std.split(versionWeight, ":")[0],
|
||||
},
|
||||
}
|
||||
for versionWeight in versionWeights
|
||||
],
|
||||
},
|
||||
},
|
||||
destinationRule:: destinationRule,
|
||||
all:: util.list([
|
||||
tfService,
|
||||
] + if util.toBool(params.injectIstio) then [
|
||||
virtualService,
|
||||
destinationRule,
|
||||
] else []),
|
||||
}, // new
|
||||
}
|
||||
|
|
@ -1,137 +0,0 @@
|
|||
{
|
||||
local k = import "k.libsonnet",
|
||||
local util = import "kubeflow/tf-serving/util.libsonnet",
|
||||
new(_env, _params):: {
|
||||
local params = _params + _env,
|
||||
local namespace = params.namespace,
|
||||
local name = params.name,
|
||||
local modelName =
|
||||
if params.modelName == "null" then
|
||||
params.name
|
||||
else
|
||||
params.modelName,
|
||||
local versionName = params.versionName,
|
||||
local numGpus =
|
||||
if std.type(params.numGpus) == "string" then
|
||||
std.parseInt(params.numGpus)
|
||||
else
|
||||
params.numGpus,
|
||||
local modelServerImage =
|
||||
if numGpus == 0 then
|
||||
params.defaultCpuImage
|
||||
else
|
||||
params.defaultGpuImage,
|
||||
|
||||
// Optional features.
|
||||
// TODO(lunkai): Add request logging
|
||||
|
||||
local modelServerContainer = {
|
||||
command: [
|
||||
"/usr/bin/tensorflow_model_server",
|
||||
],
|
||||
args: [
|
||||
"--port=9000",
|
||||
"--rest_api_port=8500",
|
||||
"--model_name=" + modelName,
|
||||
"--model_base_path=" + params.modelBasePath,
|
||||
] + if util.toBool(params.enablePrometheus) then [
|
||||
"--monitoring_config_file=/var/config/monitoring_config.txt",
|
||||
] else [],
|
||||
image: modelServerImage,
|
||||
imagePullPolicy: "IfNotPresent",
|
||||
name: modelName,
|
||||
ports: [
|
||||
{
|
||||
containerPort: 9000,
|
||||
},
|
||||
{
|
||||
containerPort: 8500,
|
||||
},
|
||||
],
|
||||
env: [],
|
||||
resources: {
|
||||
limits: {
|
||||
cpu: "4",
|
||||
memory: "4Gi",
|
||||
} + if numGpus != 0 then {
|
||||
"nvidia.com/gpu": numGpus,
|
||||
} else {},
|
||||
requests: {
|
||||
cpu: "1",
|
||||
memory: "1Gi",
|
||||
},
|
||||
},
|
||||
volumeMounts: [
|
||||
{
|
||||
mountPath: "/var/config/",
|
||||
name: "config-volume",
|
||||
},
|
||||
],
|
||||
// TCP liveness probe on gRPC port
|
||||
livenessProbe: {
|
||||
tcpSocket: {
|
||||
port: 9000,
|
||||
},
|
||||
initialDelaySeconds: 30,
|
||||
periodSeconds: 30,
|
||||
},
|
||||
}, // modelServerContainer
|
||||
|
||||
local tfDeployment = {
|
||||
apiVersion: "extensions/v1beta1",
|
||||
kind: "Deployment",
|
||||
metadata: {
|
||||
labels: {
|
||||
app: modelName,
|
||||
},
|
||||
name: name,
|
||||
namespace: namespace,
|
||||
},
|
||||
spec: {
|
||||
template: {
|
||||
metadata: {
|
||||
labels: {
|
||||
app: modelName,
|
||||
version: versionName,
|
||||
},
|
||||
annotations: {
|
||||
"sidecar.istio.io/inject": if util.toBool(params.injectIstio) then "true",
|
||||
},
|
||||
},
|
||||
spec: {
|
||||
containers: [
|
||||
modelServerContainer,
|
||||
],
|
||||
volumes: [
|
||||
{
|
||||
configMap: {
|
||||
name: name + "-config",
|
||||
},
|
||||
name: "config-volume",
|
||||
},
|
||||
],
|
||||
},
|
||||
},
|
||||
},
|
||||
}, // tfDeployment
|
||||
tfDeployment:: tfDeployment,
|
||||
|
||||
local tfservingConfig = {
|
||||
apiVersion: "v1",
|
||||
kind: "ConfigMap",
|
||||
metadata: {
|
||||
name: name + "-config",
|
||||
namespace: namespace,
|
||||
},
|
||||
data: {
|
||||
"monitoring_config.txt": std.join("\n", [
|
||||
"prometheus_config: {",
|
||||
" enable: true,",
|
||||
' path: "/monitoring/prometheus/metrics"',
|
||||
"}",
|
||||
]),
|
||||
},
|
||||
}, // tfservingConfig
|
||||
tfservingConfig:: tfservingConfig,
|
||||
}, // new
|
||||
}
|
||||
|
|
@ -1,380 +0,0 @@
|
|||
{
|
||||
util:: import "kubeflow/tf-serving/util.libsonnet",
|
||||
|
||||
// Parameters are intended to be late bound.
|
||||
params:: {
|
||||
name: null,
|
||||
numGpus: 0,
|
||||
labels: {
|
||||
app: $.params.name,
|
||||
},
|
||||
modelName: $.params.name,
|
||||
modelPath: null,
|
||||
modelStorageType: "storageType",
|
||||
|
||||
version: "v1",
|
||||
firstVersion: true,
|
||||
|
||||
deployIstio: false,
|
||||
|
||||
deployHttpProxy: false,
|
||||
httpProxyImage: "gcr.io/kubeflow-images-public/tf-model-server-http-proxy:v20180606-9dfda4f2",
|
||||
|
||||
serviceType: "ClusterIP",
|
||||
|
||||
// If users want to override the image then can override defaultCpuImage and/or defaultGpuImage
|
||||
// in which case the image used will still depend on whether GPUs are used or not.
|
||||
// Users can also override modelServerImage in which case the user supplied value will always be used
|
||||
// regardless of numGpus.
|
||||
defaultCpuImage: "tensorflow/serving:1.11.1",
|
||||
defaultGpuImage: "tensorflow/serving:1.11.1-gpu",
|
||||
modelServerImage: if $.params.numGpus == 0 then
|
||||
$.params.defaultCpuImage
|
||||
else
|
||||
$.params.defaultGpuImage,
|
||||
|
||||
|
||||
// Whether or not to enable s3 parameters
|
||||
s3Enable:: false,
|
||||
|
||||
// Which storageType to use
|
||||
storageType:: null,
|
||||
},
|
||||
|
||||
// Parametes specific to GCP.
|
||||
gcpParams:: {
|
||||
gcpCredentialSecretName: "",
|
||||
} + $.params,
|
||||
|
||||
// Parameters that control S3 access
|
||||
// params overrides s3params because params can be overwritten by the user to override the defaults.
|
||||
s3params:: {
|
||||
// Name of the k8s secrets containing S3 credentials
|
||||
s3SecretName: "",
|
||||
// Name of the key in the k8s secret containing AWS_ACCESS_KEY_ID.
|
||||
s3SecretAccesskeyidKeyName: "AWS_ACCESS_KEY_ID",
|
||||
|
||||
// Name of the key in the k8s secret containing AWS_SECRET_ACCESS_KEY.
|
||||
s3SecretSecretaccesskeyKeyName: "AWS_SECRET_ACCESS_KEY",
|
||||
|
||||
// S3 region
|
||||
s3AwsRegion: "us-west-1",
|
||||
|
||||
// TODO(jlewi): We should use util.toBool to automatically conver to actual boolean values.
|
||||
// The use of strings is left over from when they were prototype parameters which only supports string type.
|
||||
|
||||
// true Whether or not to use https for S3 connections
|
||||
s3UseHttps: "true",
|
||||
|
||||
// Whether or not to verify https certificates for S3 connections
|
||||
s3VerifySsl: "true",
|
||||
|
||||
// URL for your s3-compatible endpoint.
|
||||
s3Endpoint: "http://s3.us-west-1.amazonaws.com,",
|
||||
} + $.params,
|
||||
|
||||
|
||||
components:: {
|
||||
|
||||
all:: [
|
||||
// Default routing rule for the first version of model.
|
||||
if $.util.toBool($.params.deployIstio) && $.util.toBool($.params.firstVersion) then
|
||||
$.parts.defaultRouteRule,
|
||||
] +
|
||||
// TODO(jlewi): It would be better to structure s3 as a mixin.
|
||||
// As an example it would be great to allow S3 and GCS parameters
|
||||
// to be enabled simultaneously. This should be doable because
|
||||
// each entails adding a set of environment variables and volumes
|
||||
// to the containers. These volumes/environment variables shouldn't
|
||||
// overlap so there's no reason we shouldn't be able to just add
|
||||
// both modifications to the base container.
|
||||
// I think we want to restructure things as mixins so they can just
|
||||
// be added.
|
||||
if $.params.s3Enable then
|
||||
[
|
||||
$.s3parts.tfService,
|
||||
$.s3parts.tfDeployment,
|
||||
]
|
||||
else if $.params.storageType == "gcp" then
|
||||
[
|
||||
$.gcpParts.tfService,
|
||||
$.gcpParts.tfDeployment,
|
||||
]
|
||||
else
|
||||
[
|
||||
$.parts.tfService,
|
||||
$.parts.tfDeployment,
|
||||
],
|
||||
}.all,
|
||||
|
||||
parts:: {
|
||||
// We define the containers one level beneath parts because combined with jsonnet late binding
|
||||
// this makes it easy for users to override specific bits of the container.
|
||||
tfServingContainerBase:: {
|
||||
name: $.params.name,
|
||||
image: $.params.modelServerImage,
|
||||
imagePullPolicy: "IfNotPresent",
|
||||
command: [
|
||||
"/usr/bin/tensorflow_model_server",
|
||||
],
|
||||
args: [
|
||||
"--port=9000",
|
||||
"--model_name=" + $.params.modelName,
|
||||
"--model_base_path=" + $.params.modelPath,
|
||||
],
|
||||
ports: [
|
||||
{
|
||||
containerPort: 9000,
|
||||
},
|
||||
],
|
||||
// TODO(jlewi): We should add readiness and liveness probes. I think the blocker is that
|
||||
// model-server doesn't have something we can use out of the box.
|
||||
resources: {
|
||||
requests: {
|
||||
memory: "1Gi",
|
||||
cpu: "1",
|
||||
},
|
||||
limits: {
|
||||
memory: "4Gi",
|
||||
cpu: "4",
|
||||
},
|
||||
},
|
||||
// The is user and group should be defined in the Docker image.
|
||||
// Per best practices we don't run as the root user.
|
||||
securityContext: {
|
||||
runAsUser: 1000,
|
||||
fsGroup: 1000,
|
||||
},
|
||||
volumeMounts+: if $.params.modelStorageType == "nfs" then [{
|
||||
name: "nfs",
|
||||
mountPath: "/mnt",
|
||||
}]
|
||||
else [],
|
||||
}, // tfServingContainer
|
||||
|
||||
tfServingContainer+: $.parts.tfServingContainerBase +
|
||||
if $.params.numGpus > 0 then
|
||||
{
|
||||
resources+: {
|
||||
limits+: {
|
||||
"nvidia.com/gpu": $.params.numGpus,
|
||||
},
|
||||
},
|
||||
}
|
||||
else {},
|
||||
|
||||
tfServingMetadata+: {
|
||||
labels: $.params.labels { version: $.params.version },
|
||||
annotations: {
|
||||
"sidecar.istio.io/inject": if $.util.toBool($.params.deployIstio) then "true",
|
||||
},
|
||||
},
|
||||
|
||||
httpProxyContainer:: {
|
||||
name: $.params.name + "-http-proxy",
|
||||
image: $.params.httpProxyImage,
|
||||
imagePullPolicy: "IfNotPresent",
|
||||
command: [
|
||||
"python",
|
||||
"/usr/src/app/server.py",
|
||||
"--port=8000",
|
||||
"--rpc_port=9000",
|
||||
"--rpc_timeout=10.0",
|
||||
],
|
||||
env: [],
|
||||
ports: [
|
||||
{
|
||||
containerPort: 8000,
|
||||
},
|
||||
],
|
||||
resources: {
|
||||
requests: {
|
||||
memory: "500Mi",
|
||||
cpu: "0.5",
|
||||
},
|
||||
limits: {
|
||||
memory: "1Gi",
|
||||
cpu: "1",
|
||||
},
|
||||
},
|
||||
securityContext: {
|
||||
runAsUser: 1000,
|
||||
fsGroup: 1000,
|
||||
},
|
||||
}, // httpProxyContainer
|
||||
|
||||
|
||||
tfDeployment: {
|
||||
apiVersion: "extensions/v1beta1",
|
||||
kind: "Deployment",
|
||||
metadata: {
|
||||
name: $.params.name + "-" + $.params.version,
|
||||
namespace: $.params.namespace,
|
||||
labels: $.params.labels,
|
||||
},
|
||||
spec: {
|
||||
template: {
|
||||
metadata: $.parts.tfServingMetadata,
|
||||
spec: {
|
||||
containers: [
|
||||
$.parts.tfServingContainer,
|
||||
if $.util.toBool($.params.deployHttpProxy) then
|
||||
$.parts.httpProxyContainer,
|
||||
],
|
||||
volumes+: if $.params.modelStorageType == "nfs" then
|
||||
[{
|
||||
name: "nfs",
|
||||
persistentVolumeClaim: {
|
||||
claimName: $.params.nfsPVC,
|
||||
},
|
||||
}]
|
||||
else [],
|
||||
},
|
||||
},
|
||||
},
|
||||
}, // tfDeployment
|
||||
|
||||
tfService: {
|
||||
apiVersion: "v1",
|
||||
kind: "Service",
|
||||
metadata: {
|
||||
labels: $.params.labels,
|
||||
name: $.params.name,
|
||||
namespace: $.params.namespace,
|
||||
annotations: {
|
||||
"getambassador.io/config":
|
||||
std.join("\n", [
|
||||
"---",
|
||||
"apiVersion: ambassador/v0",
|
||||
"kind: Mapping",
|
||||
"name: tfserving-mapping-" + $.params.name + "-get",
|
||||
"prefix: /models/" + $.params.name + "/",
|
||||
"rewrite: /",
|
||||
"method: GET",
|
||||
"service: " + $.params.name + "." + $.params.namespace + ":8000",
|
||||
"---",
|
||||
"apiVersion: ambassador/v0",
|
||||
"kind: Mapping",
|
||||
"name: tfserving-mapping-" + $.params.name + "-post",
|
||||
"prefix: /models/" + $.params.name + "/",
|
||||
"rewrite: /model/" + $.params.name + ":predict",
|
||||
"method: POST",
|
||||
"service: " + $.params.name + "." + $.params.namespace + ":8000",
|
||||
]),
|
||||
}, //annotations
|
||||
},
|
||||
spec: {
|
||||
ports: [
|
||||
{
|
||||
name: "grpc-tf-serving",
|
||||
port: 9000,
|
||||
targetPort: 9000,
|
||||
},
|
||||
{
|
||||
name: "http-tf-serving-proxy",
|
||||
port: 8000,
|
||||
targetPort: 8000,
|
||||
},
|
||||
],
|
||||
selector: $.params.labels,
|
||||
type: $.params.serviceType,
|
||||
},
|
||||
}, // tfService
|
||||
|
||||
defaultRouteRule: {
|
||||
apiVersion: "config.istio.io/v1alpha2",
|
||||
kind: "RouteRule",
|
||||
metadata: {
|
||||
name: $.params.name + "-default",
|
||||
namespace: $.params.namespace,
|
||||
},
|
||||
spec: {
|
||||
destination: {
|
||||
name: $.params.name,
|
||||
},
|
||||
precedence: 0,
|
||||
route: [
|
||||
{
|
||||
labels: { version: $.params.version },
|
||||
},
|
||||
],
|
||||
},
|
||||
},
|
||||
|
||||
}, // parts
|
||||
|
||||
// Parts specific to S3
|
||||
s3parts:: $.parts {
|
||||
s3Env:: [
|
||||
{ name: "AWS_ACCESS_KEY_ID", valueFrom: { secretKeyRef: { name: $.s3params.s3SecretName, key: $.s3params.s3SecretAccesskeyidKeyName } } },
|
||||
{ name: "AWS_SECRET_ACCESS_KEY", valueFrom: { secretKeyRef: { name: $.s3params.s3SecretName, key: $.s3params.s3SecretSecretaccesskeyKeyName } } },
|
||||
{ name: "AWS_REGION", value: $.s3params.s3AwsRegion },
|
||||
{ name: "S3_REGION", value: $.s3params.s3AwsRegion },
|
||||
{ name: "S3_USE_HTTPS", value: $.s3params.s3UseHttps },
|
||||
{ name: "S3_VERIFY_SSL", value: $.s3params.s3VerifySsl },
|
||||
{ name: "S3_ENDPOINT", value: $.s3params.s3Endpoint },
|
||||
],
|
||||
|
||||
tfServingContainer: $.parts.tfServingContainer {
|
||||
env+: $.s3parts.s3Env,
|
||||
},
|
||||
|
||||
tfDeployment: $.parts.tfDeployment {
|
||||
spec: +{
|
||||
template: +{
|
||||
metadata: $.parts.tfServingMetadata,
|
||||
spec: +{
|
||||
containers: [
|
||||
$.s3parts.tfServingContainer,
|
||||
if $.util.toBool($.params.deployHttpProxy) then
|
||||
$.parts.httpProxyContainer,
|
||||
],
|
||||
},
|
||||
},
|
||||
},
|
||||
}, // tfDeployment
|
||||
}, // s3parts
|
||||
|
||||
// Parts specific to GCP
|
||||
gcpParts:: $.parts {
|
||||
gcpEnv:: [
|
||||
if $.gcpParams.gcpCredentialSecretName != "" then
|
||||
{ name: "GOOGLE_APPLICATION_CREDENTIALS", value: "/secret/gcp-credentials/user-gcp-sa.json" },
|
||||
],
|
||||
|
||||
tfServingContainer: $.parts.tfServingContainer {
|
||||
env+: $.gcpParts.gcpEnv,
|
||||
volumeMounts+: [
|
||||
if $.gcpParams.gcpCredentialSecretName != "" then
|
||||
{
|
||||
name: "gcp-credentials",
|
||||
mountPath: "/secret/gcp-credentials",
|
||||
},
|
||||
],
|
||||
},
|
||||
|
||||
tfDeployment: $.parts.tfDeployment {
|
||||
spec+: {
|
||||
template+: {
|
||||
metadata: $.parts.tfServingMetadata,
|
||||
spec+: {
|
||||
containers: [
|
||||
$.gcpParts.tfServingContainer,
|
||||
if $.util.toBool($.params.deployHttpProxy) then
|
||||
$.parts.httpProxyContainer,
|
||||
],
|
||||
volumes: [
|
||||
if $.gcpParams.gcpCredentialSecretName != "" then
|
||||
{
|
||||
name: "gcp-credentials",
|
||||
secret: {
|
||||
secretName: $.gcpParams.gcpCredentialSecretName,
|
||||
},
|
||||
},
|
||||
],
|
||||
},
|
||||
},
|
||||
},
|
||||
}, // tfDeployment
|
||||
}, // gcpParts
|
||||
}
|
||||
|
|
@ -1,21 +0,0 @@
|
|||
// Some useful routines.
|
||||
{
|
||||
local k = import "k.libsonnet",
|
||||
|
||||
// Convert non-boolean types like string,number to a boolean.
|
||||
// This is primarily intended for dealing with parameters that should be booleans.
|
||||
toBool:: function(x) {
|
||||
result::
|
||||
if std.type(x) == "boolean" then
|
||||
x
|
||||
else if std.type(x) == "string" then
|
||||
std.asciiUpper(x) == "TRUE"
|
||||
else if std.type(x) == "number" then
|
||||
x != 0
|
||||
else
|
||||
false,
|
||||
}.result,
|
||||
|
||||
// Produce a list of manifests. obj must be an array
|
||||
list(obj):: k.core.v1.list.new(obj,),
|
||||
}
|
||||
|
|
@ -0,0 +1,17 @@
|
|||
- op: add
|
||||
path: /spec/template/spec/containers/0/volumeMounts
|
||||
value:
|
||||
- mountPath: $(secretMountPath)
|
||||
name: user-gcp-sa
|
||||
readOnly: true
|
||||
- op: add
|
||||
path: /spec/template/spec/volumes
|
||||
value:
|
||||
- name: user-gcp-sa
|
||||
secret:
|
||||
secretName: $(secretName)
|
||||
- op: add
|
||||
path: /spec/template/spec/containers/0/env/-
|
||||
value:
|
||||
name: GOOGLE_APPLICATION_CREDENTIALS
|
||||
value: $(GOOGLE_APPLICATION_CREDENTIALS)
|
||||
|
|
@ -0,0 +1,39 @@
|
|||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
|
||||
bases:
|
||||
- ../base
|
||||
|
||||
configurations:
|
||||
- params.yaml
|
||||
|
||||
vars:
|
||||
- fieldref:
|
||||
fieldPath: data.GOOGLE_APPLICATION_CREDENTIALS
|
||||
name: GOOGLE_APPLICATION_CREDENTIALS
|
||||
objref:
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
name: mnist-map-monitoring
|
||||
- fieldref:
|
||||
fieldPath: data.secretName
|
||||
name: secretName
|
||||
objref:
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
name: mnist-map-monitoring
|
||||
- fieldref:
|
||||
fieldPath: data.secretMountPath
|
||||
name: secretMountPath
|
||||
objref:
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
name: mnist-map-monitoring
|
||||
|
||||
patchesJson6902:
|
||||
- path: deployment_patch.yaml
|
||||
target:
|
||||
group: apps
|
||||
kind: Deployment
|
||||
name: tensorboard-tb
|
||||
version: v1beta1
|
||||
|
|
@ -0,0 +1,5 @@
|
|||
varReference:
|
||||
- path: spec/template/spec/volumes/secret/secretName
|
||||
kind: Deployment
|
||||
- path: spec/template/spec/containers/volumeMounts/mountPath
|
||||
kind: Deployment
|
||||
|
|
@ -0,0 +1,46 @@
|
|||
- op: add
|
||||
path: /spec/template/spec/containers/0/env/-
|
||||
value:
|
||||
name: S3_ENDPOINT
|
||||
value: $(S3_ENDPOINT)
|
||||
- op: add
|
||||
path: /spec/template/spec/containers/0/env/-
|
||||
value:
|
||||
name: AWS_ENDPOINT_URL
|
||||
value: $(AWS_ENDPOINT_URL)
|
||||
- op: add
|
||||
path: /spec/template/spec/containers/0/env/-
|
||||
value:
|
||||
name: AWS_REGION
|
||||
value: $(AWS_REGION)
|
||||
- op: add
|
||||
path: /spec/template/spec/containers/0/env/-
|
||||
value:
|
||||
name: BUCKET_NAME
|
||||
value: $(BUCKET_NAME)
|
||||
- op: add
|
||||
path: /spec/template/spec/containers/0/env/-
|
||||
value:
|
||||
name: S3_USE_HTTPS
|
||||
value: $(S3_USE_HTTPS)
|
||||
- op: add
|
||||
path: /spec/template/spec/containers/0/env/-
|
||||
value:
|
||||
name: S3_VERIFY_SSL
|
||||
value: $(S3_VERIFY_SSL)
|
||||
- op: add
|
||||
path: /spec/template/spec/containers/0/env/-
|
||||
value:
|
||||
name: AWS_ACCESS_KEY_ID
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
key: $(awsAccessKeyIDName)
|
||||
name: $(awsSecretName)
|
||||
- op: add
|
||||
path: /spec/template/spec/containers/0/env/-
|
||||
value:
|
||||
name: AWS_SECRET_ACCESS_KEY
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
key: $(awsSecretAccessKeyName)
|
||||
name: $(awsSecretName)
|
||||
|
|
@ -0,0 +1,81 @@
|
|||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
|
||||
bases:
|
||||
- ../base
|
||||
|
||||
configurations:
|
||||
- params.yaml
|
||||
|
||||
vars:
|
||||
- fieldref:
|
||||
fieldPath: data.S3_ENDPOINT
|
||||
name: S3_ENDPOINT
|
||||
objref:
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
name: mnist-map-monitoring
|
||||
- fieldref:
|
||||
fieldPath: data.AWS_ENDPOINT_URL
|
||||
name: AWS_ENDPOINT_URL
|
||||
objref:
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
name: mnist-map-monitoring
|
||||
- fieldref:
|
||||
fieldPath: data.AWS_REGION
|
||||
name: AWS_REGION
|
||||
objref:
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
name: mnist-map-monitoring
|
||||
- fieldref:
|
||||
fieldPath: data.BUCKET_NAME
|
||||
name: BUCKET_NAME
|
||||
objref:
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
name: mnist-map-monitoring
|
||||
- fieldref:
|
||||
fieldPath: data.S3_USE_HTTPS
|
||||
name: S3_USE_HTTPS
|
||||
objref:
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
name: mnist-map-monitoring
|
||||
- fieldref:
|
||||
fieldPath: data.S3_VERIFY_SSL
|
||||
name: S3_VERIFY_SSL
|
||||
objref:
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
name: mnist-map-monitoring
|
||||
- fieldref:
|
||||
fieldPath: data.awsSecretName
|
||||
name: awsSecretName
|
||||
objref:
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
name: mnist-map-monitoring
|
||||
- fieldref:
|
||||
fieldPath: data.awsAccessKeyIDName
|
||||
name: awsAccessKeyIDName
|
||||
objref:
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
name: mnist-map-monitoring
|
||||
- fieldref:
|
||||
fieldPath: data.awsSecretAccessKeyName
|
||||
name: awsSecretAccessKeyName
|
||||
objref:
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
name: mnist-map-monitoring
|
||||
|
||||
patchesJson6902:
|
||||
- path: deployment_patch.yaml
|
||||
target:
|
||||
group: apps
|
||||
kind: Deployment
|
||||
name: tensorboard-tb
|
||||
version: v1beta1
|
||||
|
|
@ -0,0 +1,5 @@
|
|||
varReference:
|
||||
- path: spec/template/spec/containers/env/valueFrom/secretKeyRef/name
|
||||
kind: Deployment
|
||||
- path: spec/template/spec/containers/env/valueFrom/secretKeyRef/key
|
||||
kind: Deployment
|
||||
|
|
@ -0,0 +1,27 @@
|
|||
apiVersion: apps/v1beta1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: tensorboard-tb
|
||||
namespace: kubeflow
|
||||
spec:
|
||||
replicas: 1
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: tensorboard
|
||||
tb-job: tensorboard
|
||||
name: tensorboard
|
||||
namespace: kubeflow
|
||||
spec:
|
||||
containers:
|
||||
- command:
|
||||
- /usr/local/bin/tensorboard
|
||||
- --logdir=$(logDir)
|
||||
- --port=80
|
||||
env:
|
||||
- name: logDir
|
||||
value: $(logDir)
|
||||
image: tensorflow/tensorflow:1.11.0
|
||||
name: tensorboard
|
||||
ports:
|
||||
- containerPort: 80
|
||||
|
|
@ -0,0 +1,23 @@
|
|||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
|
||||
resources:
|
||||
- deployment.yaml
|
||||
- service.yaml
|
||||
|
||||
namespace: kubeflow
|
||||
|
||||
generatorOptions:
|
||||
disableNameSuffixHash: true
|
||||
|
||||
configurations:
|
||||
- params.yaml
|
||||
|
||||
vars:
|
||||
- fieldref:
|
||||
fieldPath: data.logDir
|
||||
name: logDir
|
||||
objref:
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
name: mnist-map-monitoring
|
||||
|
|
@ -0,0 +1,3 @@
|
|||
varReference:
|
||||
- path: spec/template/spec/containers/env/value
|
||||
kind: Deployment
|
||||
|
|
@ -0,0 +1,29 @@
|
|||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
annotations:
|
||||
getambassador.io/config: |-
|
||||
---
|
||||
apiVersion: ambassador/v0
|
||||
kind: Mapping
|
||||
name: tensorboard_mapping
|
||||
prefix: /kubeflow/tensorboard/mnist
|
||||
rewrite: /
|
||||
service: tensorboard-tb.kubeflow
|
||||
---
|
||||
apiVersion: ambassador/v0
|
||||
kind: Mapping
|
||||
name: tensorboard_mapping_data
|
||||
prefix: /kubeflow/tensorboard/mnist/data/
|
||||
rewrite: /data/
|
||||
service: tensorboard-tb.kubeflow
|
||||
name: tensorboard-tb
|
||||
namespace: kubeflow
|
||||
spec:
|
||||
ports:
|
||||
- name: http
|
||||
port: 80
|
||||
targetPort: 80
|
||||
selector:
|
||||
app: tensorboard
|
||||
tb-job: tensorboard
|
||||
|
|
@ -0,0 +1,17 @@
|
|||
- op: add
|
||||
path: /spec/template/spec/containers/0/volumeMounts/-
|
||||
value:
|
||||
mountPath: /secret/gcp-credentials
|
||||
name: user-gcp-sa
|
||||
readOnly: true
|
||||
- op: add
|
||||
path: /spec/template/spec/volumes/-
|
||||
value:
|
||||
name: user-gcp-sa
|
||||
secret:
|
||||
secretName: user-gcp-sa
|
||||
- op: add
|
||||
path: /spec/template/spec/containers/0/env/-
|
||||
value:
|
||||
name: GOOGLE_APPLICATION_CREDENTIALS
|
||||
value: /secret/gcp-credentials/user-gcp-sa.json
|
||||
|
|
@ -0,0 +1,13 @@
|
|||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
|
||||
bases:
|
||||
- ../base
|
||||
|
||||
patchesJson6902:
|
||||
- path: deployment_patch.yaml
|
||||
target:
|
||||
group: extensions
|
||||
kind: Deployment
|
||||
name: $(svcName)
|
||||
version: v1beta1
|
||||
|
|
@ -0,0 +1,51 @@
|
|||
apiVersion: extensions/v1beta1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
labels:
|
||||
app: mnist
|
||||
name: $(svcName)
|
||||
namespace: kubeflow
|
||||
spec:
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: mnist
|
||||
version: v1
|
||||
spec:
|
||||
containers:
|
||||
- args:
|
||||
- --port=9000
|
||||
- --rest_api_port=8500
|
||||
- --model_name=mnist
|
||||
- --model_base_path=$(modelBasePath)
|
||||
- --monitoring_config_file=/var/config/monitoring_config.txt
|
||||
command:
|
||||
- /usr/bin/tensorflow_model_server
|
||||
env:
|
||||
- name: modelBasePath
|
||||
value: $(modelBasePath)
|
||||
image: tensorflow/serving:1.11.1
|
||||
imagePullPolicy: IfNotPresent
|
||||
livenessProbe:
|
||||
initialDelaySeconds: 30
|
||||
periodSeconds: 30
|
||||
tcpSocket:
|
||||
port: 9000
|
||||
name: mnist
|
||||
ports:
|
||||
- containerPort: 9000
|
||||
- containerPort: 8500
|
||||
resources:
|
||||
limits:
|
||||
cpu: "4"
|
||||
memory: 4Gi
|
||||
requests:
|
||||
cpu: "1"
|
||||
memory: 1Gi
|
||||
volumeMounts:
|
||||
- mountPath: /var/config/
|
||||
name: config-volume
|
||||
volumes:
|
||||
- configMap:
|
||||
name: mnist-deploy-config
|
||||
name: config-volume
|
||||
|
|
@ -0,0 +1,31 @@
|
|||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
|
||||
resources:
|
||||
- deployment.yaml
|
||||
- mnist-deploy-config.yaml
|
||||
- service.yaml
|
||||
|
||||
namespace: kubeflow
|
||||
|
||||
generatorOptions:
|
||||
disableNameSuffixHash: true
|
||||
|
||||
configurations:
|
||||
- params.yaml
|
||||
|
||||
vars:
|
||||
- fieldref:
|
||||
fieldPath: data.name
|
||||
name: svcName
|
||||
objref:
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
name: mnist-map-serving
|
||||
- fieldref:
|
||||
fieldPath: data.modelBasePath
|
||||
name: modelBasePath
|
||||
objref:
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
name: mnist-map-serving
|
||||
|
|
@ -0,0 +1,11 @@
|
|||
apiVersion: v1
|
||||
data:
|
||||
monitoring_config.txt: |-
|
||||
prometheus_config: {
|
||||
enable: true,
|
||||
path: "/monitoring/prometheus/metrics"
|
||||
}
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: mnist-deploy-config
|
||||
namespace: kubeflow
|
||||
|
|
@ -0,0 +1,7 @@
|
|||
varReference:
|
||||
- path: spec/template/spec/containers/env/value
|
||||
kind: Deployment
|
||||
- path: metadata/name
|
||||
kind: Service
|
||||
- path: metadata/name
|
||||
kind: Deployment
|
||||
|
|
@ -0,0 +1,39 @@
|
|||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
annotations:
|
||||
getambassador.io/config: |-
|
||||
---
|
||||
apiVersion: ambassador/v0
|
||||
kind: Mapping
|
||||
name: tfserving-predict-mapping-mnist
|
||||
prefix: /tfserving/models/mnist
|
||||
rewrite: /v1/models/mnist:predict
|
||||
method: POST
|
||||
service: mnist-service.kubeflow:8500
|
||||
---
|
||||
apiVersion: ambassador/v0
|
||||
kind: Mapping
|
||||
name: tfserving-predict-mapping-mnist-get
|
||||
prefix: /tfserving/models/mnist
|
||||
rewrite: /v1/models/mnist
|
||||
method: GET
|
||||
service: mnist-service.kubeflow:8500
|
||||
prometheus.io/path: /monitoring/prometheus/metrics
|
||||
prometheus.io/port: "8500"
|
||||
prometheus.io/scrape: "true"
|
||||
labels:
|
||||
app: mnist
|
||||
name: $(svcName)
|
||||
namespace: kubeflow
|
||||
spec:
|
||||
ports:
|
||||
- name: grpc-tf-serving
|
||||
port: 9000
|
||||
targetPort: 9000
|
||||
- name: http-tf-serving
|
||||
port: 8500
|
||||
targetPort: 8500
|
||||
selector:
|
||||
app: mnist
|
||||
type: ClusterIP
|
||||
|
|
@ -0,0 +1,12 @@
|
|||
- op: add
|
||||
path: /spec/template/spec/containers/0/volumeMounts/-
|
||||
value:
|
||||
mountPath: $(pvcMountPath)
|
||||
name: local-storage
|
||||
|
||||
- op: add
|
||||
path: /spec/template/spec/volumes/-
|
||||
value:
|
||||
name: local-storage
|
||||
persistentVolumeClaim:
|
||||
claimName: $(pvcName)
|
||||
|
|
@ -0,0 +1,32 @@
|
|||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
|
||||
bases:
|
||||
- ../base
|
||||
|
||||
vars:
|
||||
- fieldref:
|
||||
fieldPath: data.pvcName
|
||||
name: pvcName
|
||||
objref:
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
name: mnist-map-serving
|
||||
- fieldref:
|
||||
fieldPath: data.pvcMountPath
|
||||
name: pvcMountPath
|
||||
objref:
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
name: mnist-map-serving
|
||||
|
||||
configurations:
|
||||
- params.yaml
|
||||
|
||||
patchesJson6902:
|
||||
- path: deployment_patch.yaml
|
||||
target:
|
||||
group: extensions
|
||||
kind: Deployment
|
||||
name: $(svcName)
|
||||
version: v1beta1
|
||||
|
|
@ -0,0 +1,5 @@
|
|||
varReference:
|
||||
- path: spec/template/spec/volumes/persistentVolumeClaim/claimName
|
||||
kind: Deployment
|
||||
- path: spec/template/spec/containers/volumeMounts/mountPath
|
||||
kind: TFJob
|
||||
|
|
@ -21,11 +21,11 @@ Manually running the test
|
|||
|
||||
import logging
|
||||
import os
|
||||
import subprocess
|
||||
|
||||
from kubernetes import client as k8s_client
|
||||
from kubeflow.tf_operator import test_runner #pylint: disable=no-name-in-module
|
||||
|
||||
from kubeflow.testing import ks_util
|
||||
from kubeflow.testing import test_util
|
||||
from kubeflow.testing import util
|
||||
|
||||
|
|
@ -38,14 +38,13 @@ class MnistDeployTest(test_util.TestCase):
|
|||
|
||||
if not self.app_dir:
|
||||
self.app_dir = os.path.join(os.path.dirname(__file__), "..",
|
||||
"ks_app")
|
||||
"serving/GCS")
|
||||
self.app_dir = os.path.abspath(self.app_dir)
|
||||
logging.info("--app_dir not set defaulting to: %s", self.app_dir)
|
||||
|
||||
self.env = env
|
||||
self.namespace = namespace
|
||||
self.params = args.params
|
||||
self.ks_cmd = ks_util.get_ksonnet_cmd(self.app_dir)
|
||||
super(MnistDeployTest, self).__init__(class_name="MnistDeployTest",
|
||||
name=name)
|
||||
|
||||
|
|
@ -55,16 +54,26 @@ class MnistDeployTest(test_util.TestCase):
|
|||
# same name.
|
||||
api_client = k8s_client.ApiClient()
|
||||
|
||||
# TODO (jinchihe) beflow code will be removed once new test-worker image
|
||||
# is publish in https://github.com/kubeflow/testing/issues/373.
|
||||
kusUrl = 'https://github.com/kubernetes-sigs/kustomize/' \
|
||||
'releases/download/v2.0.3/kustomize_2.0.3_linux_amd64'
|
||||
util.run(['wget', '-O', '/usr/local/bin/kustomize', kusUrl], cwd=self.app_dir)
|
||||
util.run(['chmod', 'a+x', '/usr/local/bin/kustomize'], cwd=self.app_dir)
|
||||
|
||||
# Apply the components
|
||||
for component in ["mnist-deploy-gcp", "mnist-service"]:
|
||||
# Setup the ksonnet app
|
||||
ks_util.setup_ks_app(self.app_dir, self.env, self.namespace, component,
|
||||
self.params)
|
||||
configmap = 'mnist-map-serving'
|
||||
for pair in self.params.split(","):
|
||||
k, v = pair.split("=", 1)
|
||||
if k == "namespace":
|
||||
util.run(['kustomize', 'edit', 'set', k, v], cwd=self.app_dir)
|
||||
else:
|
||||
util.run(['kustomize', 'edit', 'add', 'configmap', configmap,
|
||||
'--from-literal=' + k + '=' + v], cwd=self.app_dir)
|
||||
|
||||
util.run([self.ks_cmd, "apply", self.env, "-c", component],
|
||||
cwd=self.app_dir)
|
||||
|
||||
logging.info("Created deployment %s in namespaces %s", self.name, self.namespace)
|
||||
# Seems the util.run cannot handle pipes case, using check_call.
|
||||
subCmd = 'kustomize build ' + self.app_dir + '| kubectl apply -f -'
|
||||
subprocess.check_call(subCmd, shell=True)
|
||||
|
||||
util.wait_for_deployment(api_client, self.namespace, self.name,
|
||||
timeout_minutes=4)
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
"""Test training using TFJob.
|
||||
|
||||
This file tests that we can submit the job from ksonnet
|
||||
This file tests that we can submit the job
|
||||
and that the job runs to completion.
|
||||
|
||||
It is an integration test as it depends on having access to
|
||||
|
|
@ -20,18 +20,18 @@ Manually running the test
|
|||
3. To test a new image set the parameter image e.g
|
||||
--params=name=${NAME},namespace=${NAMESPACE},image=${IMAGE}
|
||||
4. To control how long it trains set sample_size and num_epochs
|
||||
--params=numTrainSteps=10,batchSize=10,...
|
||||
--params=trainSteps=10,batchSize=10,...
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import subprocess
|
||||
|
||||
from kubernetes import client as k8s_client
|
||||
from kubeflow.tf_operator import tf_job_client #pylint: disable=no-name-in-module
|
||||
from kubeflow.tf_operator import test_runner #pylint: disable=no-name-in-module
|
||||
|
||||
from kubeflow.testing import ks_util
|
||||
from kubeflow.testing import test_util
|
||||
from kubeflow.testing import util
|
||||
|
||||
|
|
@ -42,14 +42,13 @@ class TFJobTest(test_util.TestCase):
|
|||
|
||||
if not self.app_dir:
|
||||
self.app_dir = os.path.join(os.path.dirname(__file__), "..",
|
||||
"ks_app")
|
||||
"training/GCS")
|
||||
self.app_dir = os.path.abspath(self.app_dir)
|
||||
logging.info("--app_dir not set defaulting to: %s", self.app_dir)
|
||||
|
||||
self.env = env
|
||||
self.namespace = namespace
|
||||
self.params = args.params
|
||||
self.ks_cmd = ks_util.get_ksonnet_cmd(self.app_dir)
|
||||
super(TFJobTest, self).__init__(class_name="TFJobTest", name=name)
|
||||
|
||||
def test_train(self):
|
||||
|
|
@ -58,15 +57,43 @@ class TFJobTest(test_util.TestCase):
|
|||
# same name.
|
||||
api_client = k8s_client.ApiClient()
|
||||
|
||||
component = "train"
|
||||
# Setup the ksonnet app
|
||||
ks_util.setup_ks_app(self.app_dir, self.env, self.namespace, component,
|
||||
self.params)
|
||||
# TODO (jinchihe) beflow code will be removed once new test-worker image
|
||||
# is publish in https://github.com/kubeflow/testing/issues/373.
|
||||
kusUrl = 'https://github.com/kubernetes-sigs/kustomize/' \
|
||||
'releases/download/v2.0.3/kustomize_2.0.3_linux_amd64'
|
||||
util.run(['wget', '-O', '/usr/local/bin/kustomize', kusUrl], cwd=self.app_dir)
|
||||
util.run(['chmod', 'a+x', '/usr/local/bin/kustomize'], cwd=self.app_dir)
|
||||
|
||||
# Setup parameters for kustomize
|
||||
configmap = 'mnist-map-training'
|
||||
for pair in self.params.split(","):
|
||||
k, v = pair.split("=", 1)
|
||||
if k == "namespace":
|
||||
util.run(['kustomize', 'edit', 'set', k, v], cwd=self.app_dir)
|
||||
elif k == "image":
|
||||
util.run(['kustomize', 'edit', 'set', k, 'training-image=' + v], cwd=self.app_dir)
|
||||
elif k == "numPs":
|
||||
util.run(['../base/definition.sh', '--numPs', v], cwd=self.app_dir)
|
||||
elif k == "numWorkers":
|
||||
util.run(['../base/definition.sh', '--numWorkers', v], cwd=self.app_dir)
|
||||
elif k == "secret":
|
||||
secretName, secretMountPath = v.split("=", 1)
|
||||
util.run(['kustomize', 'edit', 'add', 'configmap', configmap,
|
||||
'--from-literal=secretName=' + secretName], cwd=self.app_dir)
|
||||
util.run(['kustomize', 'edit', 'add', 'configmap', configmap,
|
||||
'--from-literal=secretMountPath=' + secretMountPath], cwd=self.app_dir)
|
||||
elif k == "envVariables":
|
||||
var_k, var_v = v.split("=", 1)
|
||||
util.run(['kustomize', 'edit', 'add', 'configmap', configmap,
|
||||
'--from-literal=' + var_k + '=' + var_v], cwd=self.app_dir)
|
||||
else:
|
||||
util.run(['kustomize', 'edit', 'add', 'configmap', configmap,
|
||||
'--from-literal=' + k + '=' + v], cwd=self.app_dir)
|
||||
|
||||
# Create the TF job
|
||||
util.run([self.ks_cmd, "apply", self.env, "-c", component],
|
||||
cwd=self.app_dir)
|
||||
# Seems the util.run cannot handle pipes case, using check_call.
|
||||
subCmd = 'kustomize build ' + self.app_dir + '| kubectl apply -f -'
|
||||
subprocess.check_call(subCmd, shell=True)
|
||||
logging.info("Created job %s in namespaces %s", self.name, self.namespace)
|
||||
|
||||
# Wait for the job to complete.
|
||||
|
|
@ -89,6 +116,21 @@ class TFJobTest(test_util.TestCase):
|
|||
self.failure = "Job {0} in namespace {1} in status {2}".format( # pylint: disable=attribute-defined-outside-init
|
||||
self.name, self.namespace, results.get("status", {}))
|
||||
logging.error(self.failure)
|
||||
|
||||
# if the TFJob failed, print out the pod logs for debugging.
|
||||
pod_names = tf_job_client.get_pod_names(
|
||||
api_client, self.namespace, self.name)
|
||||
logging.info("The Pods name:\n %s", pod_names)
|
||||
|
||||
core_api = k8s_client.CoreV1Api(api_client)
|
||||
|
||||
for pod in pod_names:
|
||||
logging.info("Getting logs of Pod %s.", pod)
|
||||
try:
|
||||
pod_logs = core_api.read_namespaced_pod_log(pod, self.namespace)
|
||||
logging.info("The logs of Pod %s log:\n %s", pod, pod_logs)
|
||||
except k8s_client.rest.ApiException as e:
|
||||
logging.info("Exception when calling CoreV1Api->read_namespaced_pod_log: %s\n", e)
|
||||
return
|
||||
|
||||
# We don't delete the jobs. We rely on TTLSecondsAfterFinished
|
||||
|
|
|
|||
|
|
@ -0,0 +1,17 @@
|
|||
- op: add
|
||||
path: /spec/tfReplicaSpecs/Chief/template/spec/containers/0/volumeMounts
|
||||
value:
|
||||
- mountPath: $(secretMountPath)
|
||||
name: user-gcp-sa
|
||||
readOnly: true
|
||||
- op: add
|
||||
path: /spec/tfReplicaSpecs/Chief/template/spec/volumes
|
||||
value:
|
||||
- name: user-gcp-sa
|
||||
secret:
|
||||
secretName: $(secretName)
|
||||
- op: add
|
||||
path: /spec/tfReplicaSpecs/Chief/template/spec/containers/0/env/-
|
||||
value:
|
||||
name: GOOGLE_APPLICATION_CREDENTIALS
|
||||
value: $(GOOGLE_APPLICATION_CREDENTIALS)
|
||||
|
|
@ -0,0 +1,17 @@
|
|||
- op: add
|
||||
path: /spec/tfReplicaSpecs/Ps/template/spec/containers/0/volumeMounts
|
||||
value:
|
||||
- mountPath: $(secretMountPath)
|
||||
name: user-gcp-sa
|
||||
readOnly: true
|
||||
- op: add
|
||||
path: /spec/tfReplicaSpecs/Ps/template/spec/volumes
|
||||
value:
|
||||
- name: user-gcp-sa
|
||||
secret:
|
||||
secretName: $(secretName)
|
||||
- op: add
|
||||
path: /spec/tfReplicaSpecs/Ps/template/spec/containers/0/env/-
|
||||
value:
|
||||
name: GOOGLE_APPLICATION_CREDENTIALS
|
||||
value: $(GOOGLE_APPLICATION_CREDENTIALS)
|
||||
|
|
@ -0,0 +1,17 @@
|
|||
- op: add
|
||||
path: /spec/tfReplicaSpecs/Worker/template/spec/containers/0/volumeMounts
|
||||
value:
|
||||
- mountPath: $(secretMountPath)
|
||||
name: user-gcp-sa
|
||||
readOnly: true
|
||||
- op: add
|
||||
path: /spec/tfReplicaSpecs/Worker/template/spec/volumes
|
||||
value:
|
||||
- name: user-gcp-sa
|
||||
secret:
|
||||
secretName: $(secretName)
|
||||
- op: add
|
||||
path: /spec/tfReplicaSpecs/Worker/template/spec/containers/0/env/-
|
||||
value:
|
||||
name: GOOGLE_APPLICATION_CREDENTIALS
|
||||
value: $(GOOGLE_APPLICATION_CREDENTIALS)
|
||||
|
|
@ -0,0 +1,48 @@
|
|||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
|
||||
bases:
|
||||
- ../base
|
||||
|
||||
configurations:
|
||||
- params.yaml
|
||||
|
||||
# TBD (jinchihe) Need move the image to base file once.
|
||||
# the issue addressed: kubernetes-sigs/kustomize/issues/1040
|
||||
# TBD (jinchihe) Need to update the image once
|
||||
# the issue addressed: kubeflow/testing/issues/373
|
||||
images:
|
||||
- name: training-image
|
||||
newName: gcr.io/kubeflow-examples/mnist/model
|
||||
newTag: v20190111-v0.2-148-g313770f
|
||||
|
||||
vars:
|
||||
- fieldref:
|
||||
fieldPath: data.GOOGLE_APPLICATION_CREDENTIALS
|
||||
name: GOOGLE_APPLICATION_CREDENTIALS
|
||||
objref:
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
name: mnist-map-training
|
||||
- fieldref:
|
||||
fieldPath: data.secretName
|
||||
name: secretName
|
||||
objref:
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
name: mnist-map-training
|
||||
- fieldref:
|
||||
fieldPath: data.secretMountPath
|
||||
name: secretMountPath
|
||||
objref:
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
name: mnist-map-training
|
||||
|
||||
patchesJson6902:
|
||||
- path: Chief_patch.yaml
|
||||
target:
|
||||
group: kubeflow.org
|
||||
kind: TFJob
|
||||
name: $(trainingName)
|
||||
version: v1beta1
|
||||
|
|
@ -0,0 +1,15 @@
|
|||
varReference:
|
||||
- path: metadata/name
|
||||
kind: TFJob
|
||||
- path: spec/tfReplicaSpecs/Chief/template/spec/volumes/secret/secretName
|
||||
kind: TFJob
|
||||
- path: spec/tfReplicaSpecs/Chief/template/spec/containers/volumeMounts/mountPath
|
||||
kind: TFJob
|
||||
- path: spec/tfReplicaSpecs/Worker/template/spec/volumes/secret/secretName
|
||||
kind: TFJob
|
||||
- path: spec/tfReplicaSpecs/Worker/template/spec/containers/volumeMounts/mountPath
|
||||
kind: TFJob
|
||||
- path: spec/tfReplicaSpecs/Ps/template/spec/volumes/secret/secretName
|
||||
kind: TFJob
|
||||
- path: spec/tfReplicaSpecs/Ps/template/spec/containers/volumeMounts/mountPath
|
||||
kind: TFJob
|
||||
|
|
@ -0,0 +1,46 @@
|
|||
- op: add
|
||||
path: /spec/tfReplicaSpecs/Chief/template/spec/containers/0/env/-
|
||||
value:
|
||||
name: S3_ENDPOINT
|
||||
value: $(S3_ENDPOINT)
|
||||
- op: add
|
||||
path: /spec/tfReplicaSpecs/Chief/template/spec/containers/0/env/-
|
||||
value:
|
||||
name: AWS_ENDPOINT_URL
|
||||
value: $(AWS_ENDPOINT_URL)
|
||||
- op: add
|
||||
path: /spec/tfReplicaSpecs/Chief/template/spec/containers/0/env/-
|
||||
value:
|
||||
name: AWS_REGION
|
||||
value: $(AWS_REGION)
|
||||
- op: add
|
||||
path: /spec/tfReplicaSpecs/Chief/template/spec/containers/0/env/-
|
||||
value:
|
||||
name: BUCKET_NAME
|
||||
value: $(BUCKET_NAME)
|
||||
- op: add
|
||||
path: /spec/tfReplicaSpecs/Chief/template/spec/containers/0/env/-
|
||||
value:
|
||||
name: S3_USE_HTTPS
|
||||
value: $(S3_USE_HTTPS)
|
||||
- op: add
|
||||
path: /spec/tfReplicaSpecs/Chief/template/spec/containers/0/env/-
|
||||
value:
|
||||
name: S3_VERIFY_SSL
|
||||
value: $(S3_VERIFY_SSL)
|
||||
- op: add
|
||||
path: /spec/tfReplicaSpecs/Chief/template/spec/containers/0/env/-
|
||||
value:
|
||||
name: AWS_ACCESS_KEY_ID
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
key: $(awsAccessKeyIDName)
|
||||
name: $(awsSecretName)
|
||||
- op: add
|
||||
path: /spec/tfReplicaSpecs/Chief/template/spec/containers/0/env/-
|
||||
value:
|
||||
name: AWS_SECRET_ACCESS_KEY
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
key: $(awsSecretAccessKeyName)
|
||||
name: $(awsSecretName)
|
||||
|
|
@ -0,0 +1,46 @@
|
|||
- op: add
|
||||
path: /spec/tfReplicaSpecs/Ps/template/spec/containers/0/env/-
|
||||
value:
|
||||
name: S3_ENDPOINT
|
||||
value: $(S3_ENDPOINT)
|
||||
- op: add
|
||||
path: /spec/tfReplicaSpecs/Ps/template/spec/containers/0/env/-
|
||||
value:
|
||||
name: AWS_ENDPOINT_URL
|
||||
value: $(AWS_ENDPOINT_URL)
|
||||
- op: add
|
||||
path: /spec/tfReplicaSpecs/Ps/template/spec/containers/0/env/-
|
||||
value:
|
||||
name: AWS_REGION
|
||||
value: $(AWS_REGION)
|
||||
- op: add
|
||||
path: /spec/tfReplicaSpecs/Ps/template/spec/containers/0/env/-
|
||||
value:
|
||||
name: BUCKET_NAME
|
||||
value: $(BUCKET_NAME)
|
||||
- op: add
|
||||
path: /spec/tfReplicaSpecs/Ps/template/spec/containers/0/env/-
|
||||
value:
|
||||
name: S3_USE_HTTPS
|
||||
value: $(S3_USE_HTTPS)
|
||||
- op: add
|
||||
path: /spec/tfReplicaSpecs/Ps/template/spec/containers/0/env/-
|
||||
value:
|
||||
name: S3_VERIFY_SSL
|
||||
value: $(S3_VERIFY_SSL)
|
||||
- op: add
|
||||
path: /spec/tfReplicaSpecs/Ps/template/spec/containers/0/env/-
|
||||
value:
|
||||
name: AWS_ACCESS_KEY_ID
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
key: $(awsAccessKeyIDName)
|
||||
name: $(awsSecretName)
|
||||
- op: add
|
||||
path: /spec/tfReplicaSpecs/Ps/template/spec/containers/0/env/-
|
||||
value:
|
||||
name: AWS_SECRET_ACCESS_KEY
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
key: $(awsSecretAccessKeyName)
|
||||
name: $(awsSecretName)
|
||||
|
|
@ -0,0 +1,46 @@
|
|||
- op: add
|
||||
path: /spec/tfReplicaSpecs/Worker/template/spec/containers/0/env/-
|
||||
value:
|
||||
name: S3_ENDPOINT
|
||||
value: $(S3_ENDPOINT)
|
||||
- op: add
|
||||
path: /spec/tfReplicaSpecs/Worker/template/spec/containers/0/env/-
|
||||
value:
|
||||
name: AWS_ENDPOINT_URL
|
||||
value: $(AWS_ENDPOINT_URL)
|
||||
- op: add
|
||||
path: /spec/tfReplicaSpecs/Worker/template/spec/containers/0/env/-
|
||||
value:
|
||||
name: AWS_REGION
|
||||
value: $(AWS_REGION)
|
||||
- op: add
|
||||
path: /spec/tfReplicaSpecs/Worker/template/spec/containers/0/env/-
|
||||
value:
|
||||
name: BUCKET_NAME
|
||||
value: $(BUCKET_NAME)
|
||||
- op: add
|
||||
path: /spec/tfReplicaSpecs/Worker/template/spec/containers/0/env/-
|
||||
value:
|
||||
name: S3_USE_HTTPS
|
||||
value: $(S3_USE_HTTPS)
|
||||
- op: add
|
||||
path: /spec/tfReplicaSpecs/Worker/template/spec/containers/0/env/-
|
||||
value:
|
||||
name: S3_VERIFY_SSL
|
||||
value: $(S3_VERIFY_SSL)
|
||||
- op: add
|
||||
path: /spec/tfReplicaSpecs/Worker/template/spec/containers/0/env/-
|
||||
value:
|
||||
name: AWS_ACCESS_KEY_ID
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
key: $(awsAccessKeyIDName)
|
||||
name: $(awsSecretName)
|
||||
- op: add
|
||||
path: /spec/tfReplicaSpecs/Worker/template/spec/containers/0/env/-
|
||||
value:
|
||||
name: AWS_SECRET_ACCESS_KEY
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
key: $(awsSecretAccessKeyName)
|
||||
name: $(awsSecretName)
|
||||
|
|
@ -0,0 +1,90 @@
|
|||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
|
||||
bases:
|
||||
- ../base
|
||||
|
||||
configurations:
|
||||
- params.yaml
|
||||
|
||||
# TBD (jinchihe) Need move the image to base file once.
|
||||
# the issue addressed: kubernetes-sigs/kustomize/issues/1040
|
||||
# TBD (jinchihe) Need to update the image once
|
||||
# the issue addressed: kubeflow/testing/issues/373
|
||||
images:
|
||||
- name: training-image
|
||||
newName: gcr.io/kubeflow-examples/mnist/model
|
||||
newTag: v20190111-v0.2-148-g313770f
|
||||
|
||||
vars:
|
||||
- fieldref:
|
||||
fieldPath: data.S3_ENDPOINT
|
||||
name: S3_ENDPOINT
|
||||
objref:
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
name: mnist-map-training
|
||||
- fieldref:
|
||||
fieldPath: data.AWS_ENDPOINT_URL
|
||||
name: AWS_ENDPOINT_URL
|
||||
objref:
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
name: mnist-map-training
|
||||
- fieldref:
|
||||
fieldPath: data.AWS_REGION
|
||||
name: AWS_REGION
|
||||
objref:
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
name: mnist-map-training
|
||||
- fieldref:
|
||||
fieldPath: data.BUCKET_NAME
|
||||
name: BUCKET_NAME
|
||||
objref:
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
name: mnist-map-training
|
||||
- fieldref:
|
||||
fieldPath: data.S3_USE_HTTPS
|
||||
name: S3_USE_HTTPS
|
||||
objref:
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
name: mnist-map-training
|
||||
- fieldref:
|
||||
fieldPath: data.S3_VERIFY_SSL
|
||||
name: S3_VERIFY_SSL
|
||||
objref:
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
name: mnist-map-training
|
||||
- fieldref:
|
||||
fieldPath: data.awsSecretName
|
||||
name: awsSecretName
|
||||
objref:
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
name: mnist-map-training
|
||||
- fieldref:
|
||||
fieldPath: data.awsAccessKeyIDName
|
||||
name: awsAccessKeyIDName
|
||||
objref:
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
name: mnist-map-training
|
||||
- fieldref:
|
||||
fieldPath: data.awsSecretAccessKeyName
|
||||
name: awsSecretAccessKeyName
|
||||
objref:
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
name: mnist-map-training
|
||||
|
||||
patchesJson6902:
|
||||
- path: Chief_patch.yaml
|
||||
target:
|
||||
group: kubeflow.org
|
||||
kind: TFJob
|
||||
name: $(trainingName)
|
||||
version: v1beta1
|
||||
|
|
@ -0,0 +1,9 @@
|
|||
varReference:
|
||||
- path: metadata/name
|
||||
kind: TFJob
|
||||
- path: metadata/name
|
||||
kind: TFJob
|
||||
- path: spec/tfReplicaSpecs/Chief/template/spec/containers/env/valueFrom/secretKeyRef/name
|
||||
kind: TFJob
|
||||
- path: spec/tfReplicaSpecs/Chief/template/spec/containers/env/valueFrom/secretKeyRef/key
|
||||
kind: TFJob
|
||||
|
|
@ -0,0 +1,35 @@
|
|||
apiVersion: kubeflow.org/v1beta1
|
||||
kind: TFJob
|
||||
metadata:
|
||||
name: $(trainingName)
|
||||
namespace: kubeflow
|
||||
spec:
|
||||
tfReplicaSpecs:
|
||||
Chief:
|
||||
replicas: 1
|
||||
template:
|
||||
spec:
|
||||
containers:
|
||||
- name: tensorflow
|
||||
command:
|
||||
- /usr/bin/python
|
||||
- /opt/model.py
|
||||
- --tf-model-dir=$(modelDir)
|
||||
- --tf-export-dir=$(exportDir)
|
||||
- --tf-train-steps=$(trainSteps)
|
||||
- --tf-batch-size=$(batchSize)
|
||||
- --tf-learning-rate=$(learningRate)
|
||||
env:
|
||||
- name: modelDir
|
||||
value: $(modelDir)
|
||||
- name: exportDir
|
||||
value: $(exportDir)
|
||||
- name: trainSteps
|
||||
value: $(trainSteps)
|
||||
- name: batchSize
|
||||
value: $(batchSize)
|
||||
- name: learningRate
|
||||
value: $(learningRate)
|
||||
image: training-image
|
||||
workingDir: /opt
|
||||
restartPolicy: OnFailure
|
||||
|
|
@ -0,0 +1,35 @@
|
|||
apiVersion: kubeflow.org/v1beta1
|
||||
kind: TFJob
|
||||
metadata:
|
||||
name: $(trainingName)
|
||||
namespace: kubeflow
|
||||
spec:
|
||||
tfReplicaSpecs:
|
||||
Ps:
|
||||
replicas: %numPs%
|
||||
template:
|
||||
spec:
|
||||
containers:
|
||||
- name: tensorflow
|
||||
command:
|
||||
- /usr/bin/python
|
||||
- /opt/model.py
|
||||
- --tf-model-dir=$(modelDir)
|
||||
- --tf-export-dir=$(exportDir)
|
||||
- --tf-train-steps=$(trainSteps)
|
||||
- --tf-batch-size=$(batchSize)
|
||||
- --tf-learning-rate=$(learningRate)
|
||||
env:
|
||||
- name: modelDir
|
||||
value: $(modelDir)
|
||||
- name: exportDir
|
||||
value: $(exportDir)
|
||||
- name: trainSteps
|
||||
value: $(trainSteps)
|
||||
- name: batchSize
|
||||
value: $(batchSize)
|
||||
- name: learningRate
|
||||
value: $(learningRate)
|
||||
image: training-image
|
||||
workingDir: /opt
|
||||
restartPolicy: OnFailure
|
||||
|
|
@ -0,0 +1,36 @@
|
|||
apiVersion: kubeflow.org/v1beta1
|
||||
kind: TFJob
|
||||
metadata:
|
||||
name: $(trainingName)
|
||||
namespace: kubeflow
|
||||
spec:
|
||||
tfReplicaSpecs:
|
||||
Worker:
|
||||
replicas: %numWorkers%
|
||||
template:
|
||||
spec:
|
||||
containers:
|
||||
- name: tensorflow
|
||||
command:
|
||||
- /usr/bin/python
|
||||
- /opt/model.py
|
||||
- --tf-model-dir=$(modelDir)
|
||||
- --tf-export-dir=$(exportDir)
|
||||
- --tf-train-steps=$(trainSteps)
|
||||
- --tf-batch-size=$(batchSize)
|
||||
- --tf-learning-rate=$(learningRate)
|
||||
env:
|
||||
- name: modelDir
|
||||
value: $(modelDir)
|
||||
- name: exportDir
|
||||
value: $(exportDir)
|
||||
- name: trainSteps
|
||||
value: $(trainSteps)
|
||||
- name: batchSize
|
||||
value: $(batchSize)
|
||||
- name: learningRate
|
||||
value: $(learningRate)
|
||||
image: training-image
|
||||
name: tensorflow
|
||||
workingDir: /opt
|
||||
restartPolicy: OnFailure
|
||||
|
|
@ -0,0 +1,72 @@
|
|||
#!/bin/bash
|
||||
|
||||
# The script is to define the number of Ps and Workers for TFJOB.
|
||||
# Usage: definition.sh --numPs number_of_PS --numWorkers number_of_worker
|
||||
|
||||
while (($#)); do
|
||||
case $1 in
|
||||
"--numPs")
|
||||
shift
|
||||
numPs="$1"
|
||||
shift
|
||||
;;
|
||||
"--numWorkers")
|
||||
shift
|
||||
numWorkers="$1"
|
||||
shift
|
||||
;;
|
||||
"--help")
|
||||
shift
|
||||
echo "Usage: definition.sh --numPs number_of_PS --numWorkers number_of_worker"
|
||||
shift
|
||||
;;
|
||||
*)
|
||||
echo "Unknown argument: '$1'"
|
||||
echo "Usage: definition.sh --numPs number_of_PS --numWorkers number_of_worker"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
BASE_PATH=$(dirname "$0")
|
||||
|
||||
if [ "x${numPs}" != "x" ]; then
|
||||
if [[ ${numPs} =~ ^[0-9]+$ ]] && [ ${numPs} -gt 0 ]; then
|
||||
(cd ${BASE_PATH}; sed -i.sedbak s/%numPs%/${numPs}/ Ps.yaml >> /dev/null)
|
||||
(cd ${BASE_PATH}; kustomize edit add patch Ps.yaml)
|
||||
sed -i.sedbak '/patchesJson6902/a \
|
||||
- path: Ps_patch.yaml \
|
||||
\ target: \
|
||||
\ group: kubeflow.org \
|
||||
\ kind: TFJob \
|
||||
\ name: \$(trainingName) \
|
||||
\ version: v1beta1 \
|
||||
\
|
||||
' kustomization.yaml
|
||||
else
|
||||
echo "ERROR: numPS must be an integer greater than or equal to 1."
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ "x${numWorkers}" != "x" ]; then
|
||||
if [[ ${numWorkers} =~ ^[0-9]+$ ]] && [ ${numWorkers} -gt 0 ]; then
|
||||
(cd ${BASE_PATH}; sed -i.sedbak s/%numWorkers%/${numWorkers}/ Worker.yaml >> /dev/null)
|
||||
(cd ${BASE_PATH}; kustomize edit add patch Worker.yaml)
|
||||
sed -i.sedbak '/patchesJson6902/a \
|
||||
- path: Worker_patch.yaml \
|
||||
\ target: \
|
||||
\ group: kubeflow.org \
|
||||
\ kind: TFJob \
|
||||
\ name: \$(trainingName) \
|
||||
\ version: v1beta1 \
|
||||
\
|
||||
' kustomization.yaml
|
||||
else
|
||||
echo "ERROR: numWorkers must be an integer greater than or equal to 1."
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
rm -rf ${BASE_PATH}/*.sedbak
|
||||
rm -rf *.sedbak
|
||||
|
|
@ -0,0 +1,57 @@
|
|||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
|
||||
resources:
|
||||
- Chief.yaml
|
||||
|
||||
namespace: kubeflow
|
||||
|
||||
generatorOptions:
|
||||
disableNameSuffixHash: true
|
||||
|
||||
configurations:
|
||||
- params.yaml
|
||||
|
||||
vars:
|
||||
- fieldref:
|
||||
fieldPath: data.name
|
||||
name: trainingName
|
||||
objref:
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
name: mnist-map-training
|
||||
- fieldref:
|
||||
fieldPath: data.modelDir
|
||||
name: modelDir
|
||||
objref:
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
name: mnist-map-training
|
||||
- fieldref:
|
||||
fieldPath: data.exportDir
|
||||
name: exportDir
|
||||
objref:
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
name: mnist-map-training
|
||||
- fieldref:
|
||||
fieldPath: data.trainSteps
|
||||
name: trainSteps
|
||||
objref:
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
name: mnist-map-training
|
||||
- fieldref:
|
||||
fieldPath: data.batchSize
|
||||
name: batchSize
|
||||
objref:
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
name: mnist-map-training
|
||||
- fieldref:
|
||||
fieldPath: data.learningRate
|
||||
name: learningRate
|
||||
objref:
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
name: mnist-map-training
|
||||
|
|
@ -0,0 +1,9 @@
|
|||
varReference:
|
||||
- path: metadata/name
|
||||
kind: TFJob
|
||||
- path: spec/tfReplicaSpecs/Chief/template/spec/containers/env/value
|
||||
kind: TFJob
|
||||
- path: spec/tfReplicaSpecs/Worker/template/spec/containers/env/value
|
||||
kind: TFJob
|
||||
- path: spec/tfReplicaSpecs/Ps/template/spec/containers/env/value
|
||||
kind: TFJob
|
||||
|
|
@ -0,0 +1,11 @@
|
|||
- op: add
|
||||
path: /spec/tfReplicaSpecs/Chief/template/spec/containers/0/volumeMounts
|
||||
value:
|
||||
- mountPath: $(pvcMountPath)
|
||||
name: local-storage
|
||||
- op: add
|
||||
path: /spec/tfReplicaSpecs/Chief/template/spec/volumes
|
||||
value:
|
||||
- name: local-storage
|
||||
persistentVolumeClaim:
|
||||
claimName: $(pvcName)
|
||||
|
|
@ -0,0 +1,11 @@
|
|||
- op: add
|
||||
path: /spec/tfReplicaSpecs/Ps/template/spec/containers/0/volumeMounts
|
||||
value:
|
||||
- mountPath: $(pvcMountPath)
|
||||
name: local-storage
|
||||
- op: add
|
||||
path: /spec/tfReplicaSpecs/Ps/template/spec/volumes
|
||||
value:
|
||||
- name: local-storage
|
||||
persistentVolumeClaim:
|
||||
claimName: $(pvcName)
|
||||
|
|
@ -0,0 +1,11 @@
|
|||
- op: add
|
||||
path: /spec/tfReplicaSpecs/Worker/template/spec/containers/0/volumeMounts
|
||||
value:
|
||||
- mountPath: $(pvcMountPath)
|
||||
name: local-storage
|
||||
- op: add
|
||||
path: /spec/tfReplicaSpecs/Worker/template/spec/volumes
|
||||
value:
|
||||
- name: local-storage
|
||||
persistentVolumeClaim:
|
||||
claimName: $(pvcName)
|
||||
|
|
@ -0,0 +1,41 @@
|
|||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
|
||||
bases:
|
||||
- ../base
|
||||
|
||||
configurations:
|
||||
- params.yaml
|
||||
|
||||
# TBD (jinchihe) Need move the image to base file once.
|
||||
# the issue addressed: kubernetes-sigs/kustomize/issues/1040
|
||||
# TBD (jinchihe) Need to update the image once
|
||||
# the issue addressed: kubeflow/testing/issues/373
|
||||
images:
|
||||
- name: training-image
|
||||
newName: gcr.io/kubeflow-examples/mnist/model
|
||||
newTag: v20190111-v0.2-148-g313770f
|
||||
|
||||
vars:
|
||||
- fieldref:
|
||||
fieldPath: data.pvcName
|
||||
name: pvcName
|
||||
objref:
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
name: mnist-map-training
|
||||
- fieldref:
|
||||
fieldPath: data.pvcMountPath
|
||||
name: pvcMountPath
|
||||
objref:
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
name: mnist-map-training
|
||||
|
||||
patchesJson6902:
|
||||
- path: Chief_patch.yaml
|
||||
target:
|
||||
group: kubeflow.org
|
||||
kind: TFJob
|
||||
name: $(trainingName)
|
||||
version: v1beta1
|
||||
|
|
@ -0,0 +1,15 @@
|
|||
varReference:
|
||||
- path: metadata/name
|
||||
kind: TFJob
|
||||
- path: spec/tfReplicaSpecs/Chief/template/spec/volumes/persistentVolumeClaim/claimName
|
||||
kind: TFJob
|
||||
- path: spec/tfReplicaSpecs/Worker/template/spec/volumes/persistentVolumeClaim/claimName
|
||||
kind: TFJob
|
||||
- path: spec/tfReplicaSpecs/Ps/template/spec/volumes/persistentVolumeClaim/claimName
|
||||
kind: TFJob
|
||||
- path: spec/tfReplicaSpecs/Chief/template/spec/containers/volumeMounts/mountPath
|
||||
kind: TFJob
|
||||
- path: spec/tfReplicaSpecs/Worker/template/spec/containers/volumeMounts/mountPath
|
||||
kind: TFJob
|
||||
- path: spec/tfReplicaSpecs/Ps/template/spec/containers/volumeMounts/mountPath
|
||||
kind: TFJob
|
||||
|
|
@ -20,7 +20,7 @@ local defaultParams = {
|
|||
// Which Kubeflow cluster to use for running TFJobs on.
|
||||
kfProject: "kubeflow-ci-deployment",
|
||||
kfZone: "us-east1-b",
|
||||
kfCluster: "kf-vmaster-n00",
|
||||
kfCluster: "kf-v0-5-n04",
|
||||
|
||||
// The bucket where the model should be written
|
||||
// This needs to be writable by the GCP service account in the Kubeflow cluster (not the test cluster)
|
||||
|
|
@ -311,8 +311,9 @@ local dagTemplates = [
|
|||
"--params=" + std.join(",", [
|
||||
"name=mnist-test-" + prowDict["BUILD_ID"],
|
||||
"namespace=" + testNamespace,
|
||||
"numTrainSteps=10",
|
||||
"trainSteps=10",
|
||||
"batchSize=10",
|
||||
"learningRate=0.01",
|
||||
"image=" + trainerImage,
|
||||
"numPs=1",
|
||||
"numWorkers=2",
|
||||
|
|
|
|||
Loading…
Reference in New Issue