example mnist upgrade to v1alpha2 (#246)

* example mnist upgrade to v1alpha2

* Remove cleanPodPolicy

* Fix kubeflow branch to v0.2.4
This commit is contained in:
Inki Hwang 2018-09-10 05:01:21 +09:00 committed by k8s-ci-robot
parent d878462bc5
commit 8e30631c54
2 changed files with 134 additions and 138 deletions

View File

@@ -103,15 +103,15 @@ APP_NAME=my-kubeflow
ks init ${APP_NAME}
cd ${APP_NAME}
ks registry add kubeflow github.com/kubeflow/kubeflow/tree/master/kubeflow
ks registry add kubeflow github.com/kubeflow/kubeflow/tree/v0.2.4/kubeflow
ks pkg install kubeflow/core@v0.1.2
ks pkg install kubeflow/core@v0.2.4
ks pkg install kubeflow/argo
# Deploy TF Operator and Argo
kubectl create namespace ${NAMESPACE}
ks generate core kubeflow-core --name=kubeflow-core --namespace=${NAMESPACE}
ks generate argo kubeflow-argo --name=kubeflow-argo --namespace=${NAMESPACE} --imageTag=v2.1.0
ks generate argo kubeflow-argo --name=kubeflow-argo --namespace=${NAMESPACE}
ks apply default -c kubeflow-core
ks apply default -c kubeflow-argo

View File

@@ -112,147 +112,143 @@ spec:
resource:
action: apply
# NOTE: need to detect when the master node has completed
successCondition: status.state == Succeeded
successCondition: status.tfReplicaStatuses.Master.succeeded == 1
manifest: |
apiVersion: "kubeflow.org/v1alpha1"
apiVersion: "kubeflow.org/v1alpha2"
kind: "TFJob"
metadata:
name: {{workflow.parameters.job-name}}
namespace: {{workflow.parameters.namespace}}
spec:
replicaSpecs:
- replicas: 1
tfReplicaType: MASTER
template:
spec:
serviceAccountName: tf-job-operator
containers:
- image: {{workflow.parameters.tf-model-image}}
name: tensorflow
imagePullPolicy: Always
env:
- name: TF_MODEL_DIR
value: {{inputs.parameters.s3-model-url}}
- name: TF_EXPORT_DIR
value: {{workflow.parameters.model-name}}
- name: TF_TRAIN_STEPS
value: "{{workflow.parameters.model-train-steps}}"
- name: TF_TF_BATCH_SIZE
value: "{{workflow.parameters.model-batch-size}}"
- name: TF_LEARNING_RATE
value: "{{workflow.parameters.model-learning-rate}}"
- name: AWS_ACCESS_KEY_ID
valueFrom:
secretKeyRef:
name: {{workflow.parameters.aws-secret}}
key: awsAccessKeyID
- name: AWS_SECRET_ACCESS_KEY
valueFrom:
secretKeyRef:
name: {{workflow.parameters.aws-secret}}
key: awsSecretAccessKey
- name: AWS_DEFAULT_REGION
value: {{workflow.parameters.aws-region}}
- name: AWS_REGION
value: {{workflow.parameters.aws-region}}
- name: S3_REGION
value: {{workflow.parameters.aws-region}}
- name: S3_USE_HTTPS
value: "{{workflow.parameters.s3-use-https}}"
- name: S3_VERIFY_SSL
value: "{{workflow.parameters.s3-verify-ssl}}"
- name: S3_ENDPOINT
value: {{workflow.parameters.s3-endpoint}}
restartPolicy: OnFailure
- replicas: {{workflow.parameters.tf-worker}}
tfReplicaType: WORKER
template:
spec:
serviceAccountName: tf-job-operator
containers:
- image: {{workflow.parameters.tf-model-image}}
name: tensorflow
imagePullPolicy: Always
env:
- name: TF_MODEL_DIR
value: {{inputs.parameters.s3-model-url}}
- name: TF_EXPORT_DIR
value: {{workflow.parameters.model-name}}
- name: TF_TRAIN_STEPS
value: "{{workflow.parameters.model-train-steps}}"
- name: TF_TF_BATCH_SIZE
value: "{{workflow.parameters.model-batch-size}}"
- name: TF_LEARNING_RATE
value: "{{workflow.parameters.model-learning-rate}}"
- name: AWS_ACCESS_KEY_ID
valueFrom:
secretKeyRef:
name: {{workflow.parameters.aws-secret}}
key: awsAccessKeyID
- name: AWS_SECRET_ACCESS_KEY
valueFrom:
secretKeyRef:
name: {{workflow.parameters.aws-secret}}
key: awsSecretAccessKey
- name: AWS_DEFAULT_REGION
value: {{workflow.parameters.aws-region}}
- name: AWS_REGION
value: {{workflow.parameters.aws-region}}
- name: S3_REGION
value: {{workflow.parameters.aws-region}}
- name: S3_USE_HTTPS
value: "{{workflow.parameters.s3-use-https}}"
- name: S3_VERIFY_SSL
value: "{{workflow.parameters.s3-verify-ssl}}"
- name: S3_ENDPOINT
value: {{workflow.parameters.s3-endpoint}}
restartPolicy: OnFailure
- replicas: {{workflow.parameters.tf-ps}}
tfReplicaType: PS
template:
spec:
containers:
- image: {{workflow.parameters.tf-model-image}}
name: tensorflow
imagePullPolicy: Always
env:
- name: TF_MODEL_DIR
value: {{inputs.parameters.s3-model-url}}
- name: TF_EXPORT_DIR
value: {{workflow.parameters.model-name}}
- name: TF_TRAIN_STEPS
value: "{{workflow.parameters.model-train-steps}}"
- name: TF_TF_BATCH_SIZE
value: "{{workflow.parameters.model-batch-size}}"
- name: TF_LEARNING_RATE
value: "{{workflow.parameters.model-learning-rate}}"
- name: AWS_ACCESS_KEY_ID
valueFrom:
secretKeyRef:
name: {{workflow.parameters.aws-secret}}
key: awsAccessKeyID
- name: AWS_SECRET_ACCESS_KEY
valueFrom:
secretKeyRef:
name: {{workflow.parameters.aws-secret}}
key: awsSecretAccessKey
- name: AWS_DEFAULT_REGION
value: {{workflow.parameters.aws-region}}
- name: AWS_REGION
value: {{workflow.parameters.aws-region}}
- name: S3_REGION
value: {{workflow.parameters.aws-region}}
- name: S3_USE_HTTPS
value: "{{workflow.parameters.s3-use-https}}"
- name: S3_VERIFY_SSL
value: "{{workflow.parameters.s3-verify-ssl}}"
- name: S3_ENDPOINT
value: {{workflow.parameters.s3-endpoint}}
restartPolicy: OnFailure
terminationPolicy:
chief:
replicaIndex: 0
replicaName: MASTER
tfReplicaSpecs:
Master:
replicas: 1
template:
spec:
serviceAccountName: tf-job-operator
containers:
- image: {{workflow.parameters.tf-model-image}}
name: tensorflow
imagePullPolicy: Always
env:
- name: TF_MODEL_DIR
value: {{inputs.parameters.s3-model-url}}
- name: TF_EXPORT_DIR
value: {{workflow.parameters.model-name}}
- name: TF_TRAIN_STEPS
value: "{{workflow.parameters.model-train-steps}}"
- name: TF_BATCH_SIZE
value: "{{workflow.parameters.model-batch-size}}"
- name: TF_LEARNING_RATE
value: "{{workflow.parameters.model-learning-rate}}"
- name: AWS_ACCESS_KEY_ID
valueFrom:
secretKeyRef:
name: {{workflow.parameters.aws-secret}}
key: awsAccessKeyID
- name: AWS_SECRET_ACCESS_KEY
valueFrom:
secretKeyRef:
name: {{workflow.parameters.aws-secret}}
key: awsSecretAccessKey
- name: AWS_DEFAULT_REGION
value: {{workflow.parameters.aws-region}}
- name: AWS_REGION
value: {{workflow.parameters.aws-region}}
- name: S3_REGION
value: {{workflow.parameters.aws-region}}
- name: S3_USE_HTTPS
value: "{{workflow.parameters.s3-use-https}}"
- name: S3_VERIFY_SSL
value: "{{workflow.parameters.s3-verify-ssl}}"
- name: S3_ENDPOINT
value: {{workflow.parameters.s3-endpoint}}
restartPolicy: OnFailure
Worker:
replicas: {{workflow.parameters.tf-worker}}
template:
spec:
serviceAccountName: tf-job-operator
containers:
- image: {{workflow.parameters.tf-model-image}}
name: tensorflow
imagePullPolicy: Always
env:
- name: TF_MODEL_DIR
value: {{inputs.parameters.s3-model-url}}
- name: TF_EXPORT_DIR
value: {{workflow.parameters.model-name}}
- name: TF_TRAIN_STEPS
value: "{{workflow.parameters.model-train-steps}}"
- name: TF_BATCH_SIZE
value: "{{workflow.parameters.model-batch-size}}"
- name: TF_LEARNING_RATE
value: "{{workflow.parameters.model-learning-rate}}"
- name: AWS_ACCESS_KEY_ID
valueFrom:
secretKeyRef:
name: {{workflow.parameters.aws-secret}}
key: awsAccessKeyID
- name: AWS_SECRET_ACCESS_KEY
valueFrom:
secretKeyRef:
name: {{workflow.parameters.aws-secret}}
key: awsSecretAccessKey
- name: AWS_DEFAULT_REGION
value: {{workflow.parameters.aws-region}}
- name: AWS_REGION
value: {{workflow.parameters.aws-region}}
- name: S3_REGION
value: {{workflow.parameters.aws-region}}
- name: S3_USE_HTTPS
value: "{{workflow.parameters.s3-use-https}}"
- name: S3_VERIFY_SSL
value: "{{workflow.parameters.s3-verify-ssl}}"
- name: S3_ENDPOINT
value: {{workflow.parameters.s3-endpoint}}
restartPolicy: OnFailure
Ps:
replicas: {{workflow.parameters.tf-ps}}
template:
spec:
containers:
- image: {{workflow.parameters.tf-model-image}}
name: tensorflow
imagePullPolicy: Always
env:
- name: TF_MODEL_DIR
value: {{inputs.parameters.s3-model-url}}
- name: TF_EXPORT_DIR
value: {{workflow.parameters.model-name}}
- name: TF_TRAIN_STEPS
value: "{{workflow.parameters.model-train-steps}}"
- name: TF_BATCH_SIZE
value: "{{workflow.parameters.model-batch-size}}"
- name: TF_LEARNING_RATE
value: "{{workflow.parameters.model-learning-rate}}"
- name: AWS_ACCESS_KEY_ID
valueFrom:
secretKeyRef:
name: {{workflow.parameters.aws-secret}}
key: awsAccessKeyID
- name: AWS_SECRET_ACCESS_KEY
valueFrom:
secretKeyRef:
name: {{workflow.parameters.aws-secret}}
key: awsSecretAccessKey
- name: AWS_DEFAULT_REGION
value: {{workflow.parameters.aws-region}}
- name: AWS_REGION
value: {{workflow.parameters.aws-region}}
- name: S3_REGION
value: {{workflow.parameters.aws-region}}
- name: S3_USE_HTTPS
value: "{{workflow.parameters.s3-use-https}}"
- name: S3_VERIFY_SSL
value: "{{workflow.parameters.s3-verify-ssl}}"
- name: S3_ENDPOINT
value: {{workflow.parameters.s3-endpoint}}
restartPolicy: OnFailure
- name: tf-tensorboard
inputs:
parameters: