mirror of https://github.com/kubeflow/examples.git
369 lines
15 KiB
YAML
369 lines
15 KiB
YAML
---
# Argo Workflow for the TensorFlow example. The steps it runs are defined by
# the `tests` template; `exit-handler` is registered as the onExit template.
apiVersion: argoproj.io/v1alpha1
kind: Workflow
metadata:
  # Name prefix; the full workflow name is generated per submission.
  generateName: tf-workflow-
spec:
  # Template the workflow starts from.
  entrypoint: tests
  # Template invoked when the workflow exits.
  onExit: exit-handler
|
|
# Workflow-level parameters. Every default below can be overridden at
# submission time with the argo CLI's -p option, for example:
#   $ argo submit <this-file> -p job-name=myjob2 -p model-train-steps=500
#
# Argo parameters are strings. Defaults that a YAML parser would otherwise
# read as numbers or booleans are quoted so they reach Argo unchanged
# (e.g. 0.01 stays "0.01", true stays "true").
arguments:
  parameters:
  # Number of tf workers.
  - name: tf-worker
    value: "1"
  # Number of tf parameter servers.
  - name: tf-ps
    value: "2"
  - name: tf-model-image
    value: elsonrodriguez/mytfmodel:1.7
  # FIXME: this image is a mirror of a private kubeflow-ci image; once we're
  # building images swap this out.
  # https://github.com/kubeflow/kubeflow/blob/dcf4adfe2dd1cec243647f3dd05d7c26246fddb1/components/k8s-model-server/images/Dockerfile.cpu
  - name: tf-serving-image
    value: elsonrodriguez/model-server:1.6
  - name: tf-tensorboard-image
    value: tensorflow/tensorflow:1.7.0
  - name: ks-image
    value: elsonrodriguez/ksonnet:0.10.1
  - name: model-name
    value: mnist
  - name: model-hidden-units
    value: "100"
  - name: model-train-steps
    value: "200"
  - name: model-batch-size
    value: "100"
  - name: model-learning-rate
    value: "0.01"
  # Gates the serve-model step ("{{workflow.parameters.model-serving}} == true").
  - name: model-serving
    value: "true"
  - name: model-serving-servicetype
    value: ClusterIP
  - name: model-serving-ks-url
    value: github.com/kubeflow/kubeflow/tree/master/kubeflow
  # Revision identifier; quoted so it can never be read as a number.
  - name: model-serving-ks-tag
    value: "1f474f30"
  - name: job-name
    value: myjob
  - name: namespace
    value: default
  - name: s3-data-url
    value: s3://mybucket/data/mnist/
  - name: s3-train-base-url
    value: s3://mybucket/models
  - name: aws-endpoint-url
    value: https://s3.us-west-1.amazonaws.com
  - name: s3-endpoint
    value: s3.us-west-1.amazonaws.com
  - name: s3-use-https
    value: "true"
  - name: s3-verify-ssl
    value: "true"
  - name: aws-region
    value: us-west-1
  # Name of the Secret holding the awsAccessKeyID / awsSecretAccessKey keys.
  - name: aws-secret
    value: aws-creds
|
|
# Workflow-level volume declarations.
# NOTE(review): no template visible in this file mounts either volume;
# confirm whether they are still needed.
volumes:
- name: training-data
  emptyDir: {}
- name: training-output
  # A volume entry needs a source; this one had none. It now uses the same
  # emptyDir source as training-data.
  emptyDir: {}
|
|
templates:
# Entrypoint template. Each "- -" entry is its own step group, so the four
# steps are listed one after another: resolve S3 URLs, start TensorBoard,
# train, then (conditionally) serve.
- name: tests
  steps:
  # Produces the s3-model-url and s3-exported-url outputs used below.
  - - name: get-workflow-info
      template: get-workflow-info

  - - name: tensorboard
      template: tf-tensorboard
      arguments:
        parameters:
        - name: s3-model-url
          value: "{{steps.get-workflow-info.outputs.parameters.s3-model-url}}"

  - - name: train-model
      template: tf-train
      arguments:
        parameters:
        - name: s3-model-url
          value: "{{steps.get-workflow-info.outputs.parameters.s3-model-url}}"

  # Only runs when the model-serving workflow parameter is "true".
  - - name: serve-model
      template: tf-inference
      when: "{{workflow.parameters.model-serving}} == true"
      arguments:
        parameters:
        - name: s3-exported-url
          value: "{{steps.get-workflow-info.outputs.parameters.s3-exported-url}}"
|
|
# Template named by spec.onExit; its single step runs the `clean` template.
- name: exit-handler
  steps:
  - - name: cleanup
      template: clean
|
|
# Builds the two S3 locations the later steps consume and exposes them as
# output parameters:
#   s3-model-url    = <s3-train-base-url>/<job-name>/
#   s3-exported-url = <s3-train-base-url>/<job-name>/export/<model-name>/
# `tr -d '[:space:]'` strips all whitespace (including echo's trailing
# newline) before each value is written to its file under /tmp.
- name: get-workflow-info
  container:
    image: nervana/circleci:master
    imagePullPolicy: Always
    command:
    - "bash"
    - "-c"
    - "echo '{{workflow.parameters.s3-train-base-url}}/{{workflow.parameters.job-name}}/' | tr -d '[:space:]' > /tmp/s3-model-url; echo '{{workflow.parameters.s3-train-base-url}}/{{workflow.parameters.job-name}}/export/{{workflow.parameters.model-name}}/' | tr -d '[:space:]' > /tmp/s3-exported-url"
  outputs:
    parameters:
    # Each output parameter is read back from the file written above.
    - name: s3-model-url
      valueFrom:
        path: /tmp/s3-model-url
    - name: s3-exported-url
      valueFrom:
        path: /tmp/s3-exported-url
|
|
# Applies a kubeflow.org/v1alpha1 TFJob with three replica specs (one MASTER,
# tf-worker WORKERs, tf-ps PSs) and waits for `status.state == Succeeded`.
# All replicas run the tf-model-image with the same environment: model and
# export locations, training hyper-parameters, and S3/AWS settings, with the
# AWS credentials read from the aws-secret Secret.
#
# Input:
#   s3-model-url - S3 location passed to the containers as TF_MODEL_DIR.
#
# NOTE(review): the batch-size variable was previously named TF_TF_BATCH_SIZE
# in all three replica specs, unlike every sibling TF_* variable. It is now
# TF_BATCH_SIZE; confirm this is the name the training image reads.
# NOTE(review): the PS replica sets no serviceAccountName, unlike MASTER and
# WORKER - confirm this is intentional.
- name: tf-train
  inputs:
    parameters:
    - name: s3-model-url
  resource:
    action: apply
    # NOTE: need to detect master node complete
    successCondition: status.state == Succeeded
    # The manifest is a literal string; Argo substitutes the {{...}}
    # placeholders before the resource is applied.
    manifest: |
      apiVersion: "kubeflow.org/v1alpha1"
      kind: "TFJob"
      metadata:
        name: {{workflow.parameters.job-name}}
        namespace: {{workflow.parameters.namespace}}
      spec:
        replicaSpecs:
        - replicas: 1
          tfReplicaType: MASTER
          template:
            spec:
              serviceAccountName: tf-job-operator
              containers:
              - image: {{workflow.parameters.tf-model-image}}
                name: tensorflow
                imagePullPolicy: Always
                env:
                - name: TF_MODEL_DIR
                  value: {{inputs.parameters.s3-model-url}}
                - name: TF_EXPORT_DIR
                  value: {{workflow.parameters.model-name}}
                - name: TF_TRAIN_STEPS
                  value: "{{workflow.parameters.model-train-steps}}"
                - name: TF_BATCH_SIZE
                  value: "{{workflow.parameters.model-batch-size}}"
                - name: TF_LEARNING_RATE
                  value: "{{workflow.parameters.model-learning-rate}}"
                - name: AWS_ACCESS_KEY_ID
                  valueFrom:
                    secretKeyRef:
                      name: {{workflow.parameters.aws-secret}}
                      key: awsAccessKeyID
                - name: AWS_SECRET_ACCESS_KEY
                  valueFrom:
                    secretKeyRef:
                      name: {{workflow.parameters.aws-secret}}
                      key: awsSecretAccessKey
                - name: AWS_DEFAULT_REGION
                  value: {{workflow.parameters.aws-region}}
                - name: AWS_REGION
                  value: {{workflow.parameters.aws-region}}
                - name: S3_REGION
                  value: {{workflow.parameters.aws-region}}
                - name: S3_USE_HTTPS
                  value: "{{workflow.parameters.s3-use-https}}"
                - name: S3_VERIFY_SSL
                  value: "{{workflow.parameters.s3-verify-ssl}}"
                - name: S3_ENDPOINT
                  value: {{workflow.parameters.s3-endpoint}}
              restartPolicy: OnFailure
        - replicas: {{workflow.parameters.tf-worker}}
          tfReplicaType: WORKER
          template:
            spec:
              serviceAccountName: tf-job-operator
              containers:
              - image: {{workflow.parameters.tf-model-image}}
                name: tensorflow
                imagePullPolicy: Always
                env:
                - name: TF_MODEL_DIR
                  value: {{inputs.parameters.s3-model-url}}
                - name: TF_EXPORT_DIR
                  value: {{workflow.parameters.model-name}}
                - name: TF_TRAIN_STEPS
                  value: "{{workflow.parameters.model-train-steps}}"
                - name: TF_BATCH_SIZE
                  value: "{{workflow.parameters.model-batch-size}}"
                - name: TF_LEARNING_RATE
                  value: "{{workflow.parameters.model-learning-rate}}"
                - name: AWS_ACCESS_KEY_ID
                  valueFrom:
                    secretKeyRef:
                      name: {{workflow.parameters.aws-secret}}
                      key: awsAccessKeyID
                - name: AWS_SECRET_ACCESS_KEY
                  valueFrom:
                    secretKeyRef:
                      name: {{workflow.parameters.aws-secret}}
                      key: awsSecretAccessKey
                - name: AWS_DEFAULT_REGION
                  value: {{workflow.parameters.aws-region}}
                - name: AWS_REGION
                  value: {{workflow.parameters.aws-region}}
                - name: S3_REGION
                  value: {{workflow.parameters.aws-region}}
                - name: S3_USE_HTTPS
                  value: "{{workflow.parameters.s3-use-https}}"
                - name: S3_VERIFY_SSL
                  value: "{{workflow.parameters.s3-verify-ssl}}"
                - name: S3_ENDPOINT
                  value: {{workflow.parameters.s3-endpoint}}
              restartPolicy: OnFailure
        - replicas: {{workflow.parameters.tf-ps}}
          tfReplicaType: PS
          template:
            spec:
              containers:
              - image: {{workflow.parameters.tf-model-image}}
                name: tensorflow
                imagePullPolicy: Always
                env:
                - name: TF_MODEL_DIR
                  value: {{inputs.parameters.s3-model-url}}
                - name: TF_EXPORT_DIR
                  value: {{workflow.parameters.model-name}}
                - name: TF_TRAIN_STEPS
                  value: "{{workflow.parameters.model-train-steps}}"
                - name: TF_BATCH_SIZE
                  value: "{{workflow.parameters.model-batch-size}}"
                - name: TF_LEARNING_RATE
                  value: "{{workflow.parameters.model-learning-rate}}"
                - name: AWS_ACCESS_KEY_ID
                  valueFrom:
                    secretKeyRef:
                      name: {{workflow.parameters.aws-secret}}
                      key: awsAccessKeyID
                - name: AWS_SECRET_ACCESS_KEY
                  valueFrom:
                    secretKeyRef:
                      name: {{workflow.parameters.aws-secret}}
                      key: awsSecretAccessKey
                - name: AWS_DEFAULT_REGION
                  value: {{workflow.parameters.aws-region}}
                - name: AWS_REGION
                  value: {{workflow.parameters.aws-region}}
                - name: S3_REGION
                  value: {{workflow.parameters.aws-region}}
                - name: S3_USE_HTTPS
                  value: "{{workflow.parameters.s3-use-https}}"
                - name: S3_VERIFY_SSL
                  value: "{{workflow.parameters.s3-verify-ssl}}"
                - name: S3_ENDPOINT
                  value: {{workflow.parameters.s3-endpoint}}
              restartPolicy: OnFailure
        terminationPolicy:
          chief:
            replicaIndex: 0
            replicaName: MASTER
|
|
# Applies two resources, both named tensorboard-<job-name> in the workflow's
# target namespace:
#   * a single-replica Deployment running /usr/local/bin/tensorboard with
#     --logdir set to the s3-model-url input, with S3/AWS settings and
#     credentials (from the aws-secret Secret) in its environment;
#   * a ClusterIP Service forwarding port 80 to container port 6006.
#
# Input:
#   s3-model-url - S3 location TensorBoard reads its logs from.
#
# NOTE(review): extensions/v1beta1 Deployments are not served by
# Kubernetes 1.16 and later; move to apps/v1 when targeting newer clusters.
- name: tf-tensorboard
  inputs:
    parameters:
    - name: s3-model-url
  resource:
    action: apply
    # Literal two-document manifest; the {{...}} placeholders are
    # substituted by Argo before the resources are applied.
    manifest: |
      apiVersion: extensions/v1beta1
      kind: Deployment
      metadata:
        labels:
          app: tensorboard-{{workflow.parameters.job-name}}
        name: tensorboard-{{workflow.parameters.job-name}}
        namespace: {{workflow.parameters.namespace}}
      spec:
        replicas: 1
        selector:
          matchLabels:
            app: tensorboard-{{workflow.parameters.job-name}}
        template:
          metadata:
            labels:
              app: tensorboard-{{workflow.parameters.job-name}}
          spec:
            containers:
            - name: tensorboard-{{workflow.parameters.job-name}}
              image: {{workflow.parameters.tf-tensorboard-image}}
              imagePullPolicy: Always
              command:
              - /usr/local/bin/tensorboard
              args:
              - --logdir
              - {{inputs.parameters.s3-model-url}}
              env:
              - name: AWS_ACCESS_KEY_ID
                valueFrom:
                  secretKeyRef:
                    key: awsAccessKeyID
                    name: {{workflow.parameters.aws-secret}}
              - name: AWS_SECRET_ACCESS_KEY
                valueFrom:
                  secretKeyRef:
                    key: awsSecretAccessKey
                    name: {{workflow.parameters.aws-secret}}
              - name: AWS_REGION
                value: {{workflow.parameters.aws-region}}
              - name: S3_REGION
                value: {{workflow.parameters.aws-region}}
              - name: S3_USE_HTTPS
                value: "{{workflow.parameters.s3-use-https}}"
              - name: S3_VERIFY_SSL
                value: "{{workflow.parameters.s3-verify-ssl}}"
              - name: S3_ENDPOINT
                value: {{workflow.parameters.s3-endpoint}}
              ports:
              - containerPort: 6006
                protocol: TCP
            dnsPolicy: ClusterFirst
            restartPolicy: Always
      ---
      apiVersion: v1
      kind: Service
      metadata:
        labels:
          app: tensorboard-{{workflow.parameters.job-name}}
        name: tensorboard-{{workflow.parameters.job-name}}
        namespace: {{workflow.parameters.namespace}}
      spec:
        ports:
        - port: 80
          protocol: TCP
          targetPort: 6006
        selector:
          app: tensorboard-{{workflow.parameters.job-name}}
        sessionAffinity: None
        type: ClusterIP
|
|
# Deploys a model server with ksonnet: initialises a ksonnet app, installs
# the kubeflow tf-serving package at the pinned tag, generates a component
# named after model-name, sets its parameters (image, namespace, service
# type, S3 settings and secret key names) and applies it to the `default`
# ksonnet environment.
#
# Input:
#   s3-exported-url - passed to `ks generate` as --model_path.
- name: tf-inference
  inputs:
    parameters:
    - name: s3-exported-url
  script:
    image: "{{workflow.parameters.ks-image}}"
    command:
    - "/ksonnet-entrypoint.sh"
    # FIXME: This doesn't actually work in the current version of argo.
    # We're using a default of `tf-user` in the container entrypoint for now.
    env:
    - name: SERVICE_ACCOUNT
      value: tf-user
    # Script body handed to the entrypoint above; the {{...}} placeholders
    # are substituted by Argo before it runs.
    source: |
      ks init my-model-server
      cd my-model-server
      ks registry add kubeflow {{workflow.parameters.model-serving-ks-url}}
      ks pkg install kubeflow/tf-serving@{{workflow.parameters.model-serving-ks-tag}}
      ks env add default
      # TODO change mnist name to be specific to a job. Right now mnist name is required to serve the model.
      ks generate tf-serving {{workflow.parameters.model-name}} --name=mnist-{{workflow.parameters.job-name}} --namespace={{workflow.parameters.namespace}} --model_path={{inputs.parameters.s3-exported-url}}
      ks param set {{workflow.parameters.model-name}} model_server_image {{workflow.parameters.tf-serving-image}}
      ks param set {{workflow.parameters.model-name}} model_name {{workflow.parameters.model-name}}
      ks param set {{workflow.parameters.model-name}} namespace {{workflow.parameters.namespace}}
      ks param set {{workflow.parameters.model-name}} service_type {{workflow.parameters.model-serving-servicetype}}
      ks param set {{workflow.parameters.model-name}} s3_create_secret false
      ks param set {{workflow.parameters.model-name}} s3_secret_name {{workflow.parameters.aws-secret}}
      ks param set {{workflow.parameters.model-name}} s3_secret_accesskeyid_key_name awsAccessKeyID
      ks param set {{workflow.parameters.model-name}} s3_secret_secretaccesskey_key_name awsSecretAccessKey
      ks param set {{workflow.parameters.model-name}} s3_aws_region {{workflow.parameters.aws-region}}
      ks param set {{workflow.parameters.model-name}} s3_endpoint {{workflow.parameters.s3-endpoint}}
      ks param set {{workflow.parameters.model-name}} s3_use_https {{workflow.parameters.s3-use-https}} --as-string
      ks param set {{workflow.parameters.model-name}} s3_verify_ssl {{workflow.parameters.s3-verify-ssl}} --as-string
      ks apply default -c {{workflow.parameters.model-name}}
|
|
# Cleanup template run from the exit handler: deletes the TFJob created by
# tf-train. The delete is best-effort (`|| true`), so a missing job does not
# fail the step.
- name: clean
  container:
    image: nervana/circleci:master
    imagePullPolicy: Always
    command:
    - "bash"
    - "-c"
    # The TFJob is created in {{workflow.parameters.namespace}}, so the
    # delete must target that namespace explicitly. Without it kubectl uses
    # its own default namespace and, because of `|| true`, a mismatch would
    # silently leave the job behind.
    - "kubectl delete tfjob {{workflow.parameters.job-name}} --namespace={{workflow.parameters.namespace}} || true"
|