# examples/mnist/model-train.yaml
# (Listed by the file viewer this was copied from as: 369 lines, 15 KiB, YAML.)

# Argo Workflow for the MNIST example: resolves this run's S3 locations,
# starts TensorBoard, trains the model as a TFJob and optionally serves it.
apiVersion: argoproj.io/v1alpha1
kind: Workflow
metadata:
  generateName: tf-workflow-
spec:
  entrypoint: tests
  # Template run when the workflow exits (see the exit-handler template).
  onExit: exit-handler
  # Every parameter below can be overridden from the argo CLI with -p, e.g.:
  #   $ argo submit examples/mnist/model-train.yaml -p job-name=myjob2
  #
  # Values are written as quoted strings so the YAML loader cannot re-type
  # them (numbers, booleans, SHA-like tags) before they are substituted into
  # the templates.
  #
  # NOTE(review): model-hidden-units, s3-data-url and aws-endpoint-url are not
  # referenced by any template visible in this file.
  arguments:
    parameters:
      - name: tf-worker  # number of tf workers
        value: "1"
      - name: tf-ps  # number of tf parameter servers
        value: "2"
      - name: tf-model-image
        value: elsonrodriguez/mytfmodel:1.7
      # FIXME: this image is a mirror of a private kubeflow-ci image; once
      # we're building images, swap this out.
      # https://github.com/kubeflow/kubeflow/blob/dcf4adfe2dd1cec243647f3dd05d7c26246fddb1/components/k8s-model-server/images/Dockerfile.cpu
      - name: tf-serving-image
        value: elsonrodriguez/model-server:1.6
      - name: tf-tensorboard-image
        value: tensorflow/tensorflow:1.7.0
      - name: ks-image
        value: elsonrodriguez/ksonnet:0.10.1
      - name: model-name
        value: mnist
      - name: model-hidden-units
        value: "100"
      - name: model-train-steps
        value: "200"
      - name: model-batch-size
        value: "100"
      - name: model-learning-rate
        value: "0.01"
      # Compared against the literal `true` by the serve-model step.
      - name: model-serving
        value: "true"
      - name: model-serving-servicetype
        value: ClusterIP
      - name: model-serving-ks-url
        value: github.com/kubeflow/kubeflow/tree/master/kubeflow
      - name: model-serving-ks-tag
        value: "1f474f30"
      - name: job-name
        value: myjob
      - name: namespace
        value: default
      - name: s3-data-url
        value: s3://mybucket/data/mnist/
      - name: s3-train-base-url
        value: s3://mybucket/models
      - name: aws-endpoint-url
        value: https://s3.us-west-1.amazonaws.com
      - name: s3-endpoint
        value: s3.us-west-1.amazonaws.com
      - name: s3-use-https
        value: "true"
      - name: s3-verify-ssl
        value: "true"
      - name: aws-region
        value: us-west-1
      # Name of the Kubernetes Secret holding the keys awsAccessKeyID and
      # awsSecretAccessKey.
      - name: aws-secret
        value: aws-creds
  # Both volumes are declared as explicit emptyDir volumes.
  # NOTE(review): no template visible in this file mounts them.
  volumes:
    - name: training-data
      emptyDir: {}
    - name: training-output
      emptyDir: {}
  # Template definitions follow.
  templates:
# Entry-point template: the main pipeline of the workflow.
# Each "- -" item below is a separate step group.
- name: tests
  steps:
    # Compute the S3 locations (model dir and export dir) for this run.
    - - name: get-workflow-info
        template: get-workflow-info

    # Bring up TensorBoard pointed at the model directory.
    - - name: tensorboard
        template: tf-tensorboard
        arguments:
          parameters:
            - name: s3-model-url
              value: '{{steps.get-workflow-info.outputs.parameters.s3-model-url}}'

    # Train the model, writing to the same model directory.
    - - name: train-model
        template: tf-train
        arguments:
          parameters:
            - name: s3-model-url
              value: '{{steps.get-workflow-info.outputs.parameters.s3-model-url}}'

    # Serve the exported model; skipped unless model-serving is `true`.
    - - name: serve-model
        template: tf-inference
        arguments:
          parameters:
            - name: s3-exported-url
              value: '{{steps.get-workflow-info.outputs.parameters.s3-exported-url}}'
        when: '{{workflow.parameters.model-serving}} == true'
# Referenced by spec.onExit: a single step that delegates to the `clean`
# template.
- name: exit-handler
  steps:
    - - name: cleanup
        template: clean
# Derives the two S3 URLs used by the rest of the workflow and exposes them
# as output parameters:
#   s3-model-url    = <s3-train-base-url>/<job-name>/
#   s3-exported-url = <s3-train-base-url>/<job-name>/export/<model-name>/
# `tr -d '[:space:]'` strips all whitespace (including echo's trailing
# newline) before each value is written to its file.
- name: get-workflow-info
  container:
    image: nervana/circleci:master
    imagePullPolicy: Always
    command:
      - bash
      - '-c'
      # Folded scalar: the lines below are joined with single spaces into one
      # shell command string.
      - >-
        echo '{{workflow.parameters.s3-train-base-url}}/{{workflow.parameters.job-name}}/'
        | tr -d '[:space:]' > /tmp/s3-model-url;
        echo '{{workflow.parameters.s3-train-base-url}}/{{workflow.parameters.job-name}}/export/{{workflow.parameters.model-name}}/'
        | tr -d '[:space:]' > /tmp/s3-exported-url
  outputs:
    parameters:
      # Each output parameter is read back from the file written above.
      - name: s3-model-url
        valueFrom:
          path: /tmp/s3-model-url
      - name: s3-exported-url
        valueFrom:
          path: /tmp/s3-exported-url
# Applies a TFJob (kubeflow.org/v1alpha1) with three replica groups (one
# MASTER, `tf-worker` WORKERs and `tf-ps` parameter servers) and succeeds
# once the job's status.state is Succeeded.
#
# Input parameter:
#   s3-model-url - S3 location passed to every replica as TF_MODEL_DIR.
#
# All replicas receive the same environment: hyper-parameters (TF_*), AWS
# credentials read from the `aws-secret` Secret, and S3 connection settings.
# Env values are quoted so they always render as strings after parameter
# substitution.
#
# NOTE(review): the env variable names are presumably what the training image
# reads - verify against the image; the image is not visible in this file.
# NOTE(review): the PS replica does not set serviceAccountName, unlike MASTER
# and WORKER - confirm this is intentional.
- name: tf-train
  inputs:
    parameters:
      - name: s3-model-url
  resource:
    action: apply
    # NOTE: need to detect master node complete
    successCondition: status.state == Succeeded
    manifest: |
      apiVersion: "kubeflow.org/v1alpha1"
      kind: "TFJob"
      metadata:
        name: {{workflow.parameters.job-name}}
        namespace: {{workflow.parameters.namespace}}
      spec:
        replicaSpecs:
          - replicas: 1
            tfReplicaType: MASTER
            template:
              spec:
                serviceAccountName: tf-job-operator
                containers:
                  - image: {{workflow.parameters.tf-model-image}}
                    name: tensorflow
                    imagePullPolicy: Always
                    env:
                      - name: TF_MODEL_DIR
                        value: "{{inputs.parameters.s3-model-url}}"
                      - name: TF_EXPORT_DIR
                        value: "{{workflow.parameters.model-name}}"
                      - name: TF_TRAIN_STEPS
                        value: "{{workflow.parameters.model-train-steps}}"
                      - name: TF_BATCH_SIZE
                        value: "{{workflow.parameters.model-batch-size}}"
                      - name: TF_LEARNING_RATE
                        value: "{{workflow.parameters.model-learning-rate}}"
                      - name: AWS_ACCESS_KEY_ID
                        valueFrom:
                          secretKeyRef:
                            name: {{workflow.parameters.aws-secret}}
                            key: awsAccessKeyID
                      - name: AWS_SECRET_ACCESS_KEY
                        valueFrom:
                          secretKeyRef:
                            name: {{workflow.parameters.aws-secret}}
                            key: awsSecretAccessKey
                      - name: AWS_DEFAULT_REGION
                        value: "{{workflow.parameters.aws-region}}"
                      - name: AWS_REGION
                        value: "{{workflow.parameters.aws-region}}"
                      - name: S3_REGION
                        value: "{{workflow.parameters.aws-region}}"
                      - name: S3_USE_HTTPS
                        value: "{{workflow.parameters.s3-use-https}}"
                      - name: S3_VERIFY_SSL
                        value: "{{workflow.parameters.s3-verify-ssl}}"
                      - name: S3_ENDPOINT
                        value: "{{workflow.parameters.s3-endpoint}}"
                restartPolicy: OnFailure
          - replicas: {{workflow.parameters.tf-worker}}
            tfReplicaType: WORKER
            template:
              spec:
                serviceAccountName: tf-job-operator
                containers:
                  - image: {{workflow.parameters.tf-model-image}}
                    name: tensorflow
                    imagePullPolicy: Always
                    env:
                      - name: TF_MODEL_DIR
                        value: "{{inputs.parameters.s3-model-url}}"
                      - name: TF_EXPORT_DIR
                        value: "{{workflow.parameters.model-name}}"
                      - name: TF_TRAIN_STEPS
                        value: "{{workflow.parameters.model-train-steps}}"
                      - name: TF_BATCH_SIZE
                        value: "{{workflow.parameters.model-batch-size}}"
                      - name: TF_LEARNING_RATE
                        value: "{{workflow.parameters.model-learning-rate}}"
                      - name: AWS_ACCESS_KEY_ID
                        valueFrom:
                          secretKeyRef:
                            name: {{workflow.parameters.aws-secret}}
                            key: awsAccessKeyID
                      - name: AWS_SECRET_ACCESS_KEY
                        valueFrom:
                          secretKeyRef:
                            name: {{workflow.parameters.aws-secret}}
                            key: awsSecretAccessKey
                      - name: AWS_DEFAULT_REGION
                        value: "{{workflow.parameters.aws-region}}"
                      - name: AWS_REGION
                        value: "{{workflow.parameters.aws-region}}"
                      - name: S3_REGION
                        value: "{{workflow.parameters.aws-region}}"
                      - name: S3_USE_HTTPS
                        value: "{{workflow.parameters.s3-use-https}}"
                      - name: S3_VERIFY_SSL
                        value: "{{workflow.parameters.s3-verify-ssl}}"
                      - name: S3_ENDPOINT
                        value: "{{workflow.parameters.s3-endpoint}}"
                restartPolicy: OnFailure
          - replicas: {{workflow.parameters.tf-ps}}
            tfReplicaType: PS
            template:
              spec:
                containers:
                  - image: {{workflow.parameters.tf-model-image}}
                    name: tensorflow
                    imagePullPolicy: Always
                    env:
                      - name: TF_MODEL_DIR
                        value: "{{inputs.parameters.s3-model-url}}"
                      - name: TF_EXPORT_DIR
                        value: "{{workflow.parameters.model-name}}"
                      - name: TF_TRAIN_STEPS
                        value: "{{workflow.parameters.model-train-steps}}"
                      - name: TF_BATCH_SIZE
                        value: "{{workflow.parameters.model-batch-size}}"
                      - name: TF_LEARNING_RATE
                        value: "{{workflow.parameters.model-learning-rate}}"
                      - name: AWS_ACCESS_KEY_ID
                        valueFrom:
                          secretKeyRef:
                            name: {{workflow.parameters.aws-secret}}
                            key: awsAccessKeyID
                      - name: AWS_SECRET_ACCESS_KEY
                        valueFrom:
                          secretKeyRef:
                            name: {{workflow.parameters.aws-secret}}
                            key: awsSecretAccessKey
                      - name: AWS_DEFAULT_REGION
                        value: "{{workflow.parameters.aws-region}}"
                      - name: AWS_REGION
                        value: "{{workflow.parameters.aws-region}}"
                      - name: S3_REGION
                        value: "{{workflow.parameters.aws-region}}"
                      - name: S3_USE_HTTPS
                        value: "{{workflow.parameters.s3-use-https}}"
                      - name: S3_VERIFY_SSL
                        value: "{{workflow.parameters.s3-verify-ssl}}"
                      - name: S3_ENDPOINT
                        value: "{{workflow.parameters.s3-endpoint}}"
                restartPolicy: OnFailure
        terminationPolicy:
          chief:
            replicaIndex: 0
            replicaName: MASTER
# Applies a TensorBoard Deployment plus a ClusterIP Service (port 80 ->
# container port 6006), both named tensorboard-<job-name>, in the target
# namespace.
#
# Input parameter:
#   s3-model-url - S3 location passed to tensorboard as --logdir.
#
# The Deployment uses apps/v1, since extensions/v1beta1 Deployments are no
# longer served by Kubernetes 1.16+; the required spec.selector matches the
# pod template labels. Templated env values and the --logdir argument are
# quoted so they always render as strings after parameter substitution.
- name: tf-tensorboard
  inputs:
    parameters:
      - name: s3-model-url
  resource:
    action: apply
    manifest: |
      apiVersion: apps/v1
      kind: Deployment
      metadata:
        labels:
          app: tensorboard-{{workflow.parameters.job-name}}
        name: tensorboard-{{workflow.parameters.job-name}}
        namespace: {{workflow.parameters.namespace}}
      spec:
        replicas: 1
        selector:
          matchLabels:
            app: tensorboard-{{workflow.parameters.job-name}}
        template:
          metadata:
            labels:
              app: tensorboard-{{workflow.parameters.job-name}}
          spec:
            containers:
              - name: tensorboard-{{workflow.parameters.job-name}}
                image: {{workflow.parameters.tf-tensorboard-image}}
                imagePullPolicy: Always
                command:
                  - /usr/local/bin/tensorboard
                args:
                  - --logdir
                  - "{{inputs.parameters.s3-model-url}}"
                env:
                  - name: AWS_ACCESS_KEY_ID
                    valueFrom:
                      secretKeyRef:
                        key: awsAccessKeyID
                        name: {{workflow.parameters.aws-secret}}
                  - name: AWS_SECRET_ACCESS_KEY
                    valueFrom:
                      secretKeyRef:
                        key: awsSecretAccessKey
                        name: {{workflow.parameters.aws-secret}}
                  - name: AWS_REGION
                    value: "{{workflow.parameters.aws-region}}"
                  - name: S3_REGION
                    value: "{{workflow.parameters.aws-region}}"
                  - name: S3_USE_HTTPS
                    value: "{{workflow.parameters.s3-use-https}}"
                  - name: S3_VERIFY_SSL
                    value: "{{workflow.parameters.s3-verify-ssl}}"
                  - name: S3_ENDPOINT
                    value: "{{workflow.parameters.s3-endpoint}}"
                ports:
                  - containerPort: 6006
                    protocol: TCP
            dnsPolicy: ClusterFirst
            restartPolicy: Always
      ---
      apiVersion: v1
      kind: Service
      metadata:
        labels:
          app: tensorboard-{{workflow.parameters.job-name}}
        name: tensorboard-{{workflow.parameters.job-name}}
        namespace: {{workflow.parameters.namespace}}
      spec:
        ports:
          - port: 80
            protocol: TCP
            targetPort: 6006
        selector:
          app: tensorboard-{{workflow.parameters.job-name}}
        sessionAffinity: None
        type: ClusterIP
# Deploys a TF Serving instance for the exported model using ksonnet.
# The script creates a ksonnet app, installs the kubeflow tf-serving package
# at the pinned tag, generates a component named after `model-name`, sets its
# parameters (image, namespace, service type, S3 secret and connection
# settings) and applies it to the `default` ksonnet environment.
#
# Input parameter:
#   s3-exported-url - S3 location of the exported model, used as --model_path.
- name: tf-inference
  inputs:
    parameters:
      - name: s3-exported-url
  script:
    image: '{{workflow.parameters.ks-image}}'
    command:
      - /ksonnet-entrypoint.sh
    # FIXME: This doesn't actually work in the current version of argo. We're
    # using a default of `tf-user` in the container entrypoint for now.
    env:
      - name: SERVICE_ACCOUNT
        value: tf-user
    source: |
      ks init my-model-server
      cd my-model-server
      ks registry add kubeflow {{workflow.parameters.model-serving-ks-url}}
      ks pkg install kubeflow/tf-serving@{{workflow.parameters.model-serving-ks-tag}}
      ks env add default
      # TODO change mnist name to be specific to a job. Right now mnist name is required to serve the model.
      ks generate tf-serving {{workflow.parameters.model-name}} --name=mnist-{{workflow.parameters.job-name}} --namespace={{workflow.parameters.namespace}} --model_path={{inputs.parameters.s3-exported-url}}
      ks param set {{workflow.parameters.model-name}} model_server_image {{workflow.parameters.tf-serving-image}}
      ks param set {{workflow.parameters.model-name}} model_name {{workflow.parameters.model-name}}
      ks param set {{workflow.parameters.model-name}} namespace {{workflow.parameters.namespace}}
      ks param set {{workflow.parameters.model-name}} service_type {{workflow.parameters.model-serving-servicetype}}
      ks param set {{workflow.parameters.model-name}} s3_create_secret false
      ks param set {{workflow.parameters.model-name}} s3_secret_name {{workflow.parameters.aws-secret}}
      ks param set {{workflow.parameters.model-name}} s3_secret_accesskeyid_key_name awsAccessKeyID
      ks param set {{workflow.parameters.model-name}} s3_secret_secretaccesskey_key_name awsSecretAccessKey
      ks param set {{workflow.parameters.model-name}} s3_aws_region {{workflow.parameters.aws-region}}
      ks param set {{workflow.parameters.model-name}} s3_endpoint {{workflow.parameters.s3-endpoint}}
      ks param set {{workflow.parameters.model-name}} s3_use_https {{workflow.parameters.s3-use-https}} --as-string
      ks param set {{workflow.parameters.model-name}} s3_verify_ssl {{workflow.parameters.s3-verify-ssl}} --as-string
      ks apply default -c {{workflow.parameters.model-name}}
# Cleanup step used by the exit handler: deletes the TFJob named after
# `job-name`.
#
# The delete targets the same namespace the TFJob is created in
# (`namespace` parameter); without the flag kubectl would look in its own
# default namespace and the failure would be hidden by `|| true`.
# `|| true` keeps the cleanup best-effort, so a missing TFJob does not fail
# the step.
#
# NOTE(review): the TensorBoard Deployment and Service created by the
# tf-tensorboard template are not removed here - presumably intentional so
# the dashboard outlives the run; confirm.
- name: clean
  container:
    image: nervana/circleci:master
    imagePullPolicy: Always
    command:
      - bash
      - '-c'
      # Folded scalar: the lines below are joined with single spaces into one
      # shell command string.
      - >-
        kubectl delete tfjob {{workflow.parameters.job-name}}
        --namespace {{workflow.parameters.namespace}} || true