diff --git a/mnist/README.md b/mnist/README.md index 96aea5d9..97d29fbc 100644 --- a/mnist/README.md +++ b/mnist/README.md @@ -103,15 +103,15 @@ APP_NAME=my-kubeflow ks init ${APP_NAME} cd ${APP_NAME} -ks registry add kubeflow github.com/kubeflow/kubeflow/tree/master/kubeflow +ks registry add kubeflow github.com/kubeflow/kubeflow/tree/v0.2.4/kubeflow -ks pkg install kubeflow/core@v0.1.2 +ks pkg install kubeflow/core@v0.2.4 ks pkg install kubeflow/argo # Deploy TF Operator and Argo kubectl create namespace ${NAMESPACE} ks generate core kubeflow-core --name=kubeflow-core --namespace=${NAMESPACE} -ks generate argo kubeflow-argo --name=kubeflow-argo --namespace=${NAMESPACE} --imageTag=v2.1.0 +ks generate argo kubeflow-argo --name=kubeflow-argo --namespace=${NAMESPACE} ks apply default -c kubeflow-core ks apply default -c kubeflow-argo diff --git a/mnist/model-train.yaml b/mnist/model-train.yaml index 15045a6f..2fddf940 100644 --- a/mnist/model-train.yaml +++ b/mnist/model-train.yaml @@ -112,147 +112,143 @@ spec: resource: action: apply # NOTE: need to detect master node complete - successCondition: status.state == Succeeded + successCondition: status.tfReplicaStatuses.Master.succeeded == 1 manifest: | - apiVersion: "kubeflow.org/v1alpha1" + apiVersion: "kubeflow.org/v1alpha2" kind: "TFJob" metadata: name: {{workflow.parameters.job-name}} namespace: {{workflow.parameters.namespace}} spec: - replicaSpecs: - - replicas: 1 - tfReplicaType: MASTER - template: - spec: - serviceAccountName: tf-job-operator - containers: - - image: {{workflow.parameters.tf-model-image}} - name: tensorflow - imagePullPolicy: Always - env: - - name: TF_MODEL_DIR - value: {{inputs.parameters.s3-model-url}} - - name: TF_EXPORT_DIR - value: {{workflow.parameters.model-name}} - - name: TF_TRAIN_STEPS - value: "{{workflow.parameters.model-train-steps}}" - - name: TF_TF_BATCH_SIZE - value: "{{workflow.parameters.model-batch-size}}" - - name: TF_LEARNING_RATE - value: "{{workflow.parameters.model-learning-rate}}" - - name: AWS_ACCESS_KEY_ID - valueFrom: - secretKeyRef: - name: {{workflow.parameters.aws-secret}} - key: awsAccessKeyID - - name: AWS_SECRET_ACCESS_KEY - valueFrom: - secretKeyRef: - name: {{workflow.parameters.aws-secret}} - key: awsSecretAccessKey - - name: AWS_DEFAULT_REGION - value: {{workflow.parameters.aws-region}} - - name: AWS_REGION - value: {{workflow.parameters.aws-region}} - - name: S3_REGION - value: {{workflow.parameters.aws-region}} - - name: S3_USE_HTTPS - value: "{{workflow.parameters.s3-use-https}}" - - name: S3_VERIFY_SSL - value: "{{workflow.parameters.s3-verify-ssl}}" - - name: S3_ENDPOINT - value: {{workflow.parameters.s3-endpoint}} - restartPolicy: OnFailure - - replicas: {{workflow.parameters.tf-worker}} - tfReplicaType: WORKER - template: - spec: - serviceAccountName: tf-job-operator - containers: - - image: {{workflow.parameters.tf-model-image}} - name: tensorflow - imagePullPolicy: Always - env: - - name: TF_MODEL_DIR - value: {{inputs.parameters.s3-model-url}} - - name: TF_EXPORT_DIR - value: {{workflow.parameters.model-name}} - - name: TF_TRAIN_STEPS - value: "{{workflow.parameters.model-train-steps}}" - - name: TF_TF_BATCH_SIZE - value: "{{workflow.parameters.model-batch-size}}" - - name: TF_LEARNING_RATE - value: "{{workflow.parameters.model-learning-rate}}" - - name: AWS_ACCESS_KEY_ID - valueFrom: - secretKeyRef: - name: {{workflow.parameters.aws-secret}} - key: awsAccessKeyID - - name: AWS_SECRET_ACCESS_KEY - valueFrom: - secretKeyRef: - name: {{workflow.parameters.aws-secret}} - key: awsSecretAccessKey - - name: AWS_DEFAULT_REGION - value: {{workflow.parameters.aws-region}} - - name: AWS_REGION - value: {{workflow.parameters.aws-region}} - - name: S3_REGION - value: {{workflow.parameters.aws-region}} - - name: S3_USE_HTTPS - value: "{{workflow.parameters.s3-use-https}}" - - name: S3_VERIFY_SSL - value: "{{workflow.parameters.s3-verify-ssl}}" - - name: S3_ENDPOINT - value: {{workflow.parameters.s3-endpoint}} - restartPolicy: OnFailure - - replicas: {{workflow.parameters.tf-ps}} - tfReplicaType: PS - template: - spec: - containers: - - image: {{workflow.parameters.tf-model-image}} - name: tensorflow - imagePullPolicy: Always - env: - - name: TF_MODEL_DIR - value: {{inputs.parameters.s3-model-url}} - - name: TF_EXPORT_DIR - value: {{workflow.parameters.model-name}} - - name: TF_TRAIN_STEPS - value: "{{workflow.parameters.model-train-steps}}" - - name: TF_TF_BATCH_SIZE - value: "{{workflow.parameters.model-batch-size}}" - - name: TF_LEARNING_RATE - value: "{{workflow.parameters.model-learning-rate}}" - - name: AWS_ACCESS_KEY_ID - valueFrom: - secretKeyRef: - name: {{workflow.parameters.aws-secret}} - key: awsAccessKeyID - - name: AWS_SECRET_ACCESS_KEY - valueFrom: - secretKeyRef: - name: {{workflow.parameters.aws-secret}} - key: awsSecretAccessKey - - name: AWS_DEFAULT_REGION - value: {{workflow.parameters.aws-region}} - - name: AWS_REGION - value: {{workflow.parameters.aws-region}} - - name: S3_REGION - value: {{workflow.parameters.aws-region}} - - name: S3_USE_HTTPS - value: "{{workflow.parameters.s3-use-https}}" - - name: S3_VERIFY_SSL - value: "{{workflow.parameters.s3-verify-ssl}}" - - name: S3_ENDPOINT - value: {{workflow.parameters.s3-endpoint}} - restartPolicy: OnFailure - terminationPolicy: - chief: - replicaIndex: 0 - replicaName: MASTER + tfReplicaSpecs: + Master: + replicas: 1 + template: + spec: + serviceAccountName: tf-job-operator + containers: + - image: {{workflow.parameters.tf-model-image}} + name: tensorflow + imagePullPolicy: Always + env: + - name: TF_MODEL_DIR + value: {{inputs.parameters.s3-model-url}} + - name: TF_EXPORT_DIR + value: {{workflow.parameters.model-name}} + - name: TF_TRAIN_STEPS + value: "{{workflow.parameters.model-train-steps}}" + - name: TF_BATCH_SIZE + value: "{{workflow.parameters.model-batch-size}}" + - name: TF_LEARNING_RATE + value: "{{workflow.parameters.model-learning-rate}}" + - name: AWS_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + name: {{workflow.parameters.aws-secret}} + key: awsAccessKeyID + - name: AWS_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + name: {{workflow.parameters.aws-secret}} + key: awsSecretAccessKey + - name: AWS_DEFAULT_REGION + value: {{workflow.parameters.aws-region}} + - name: AWS_REGION + value: {{workflow.parameters.aws-region}} + - name: S3_REGION + value: {{workflow.parameters.aws-region}} + - name: S3_USE_HTTPS + value: "{{workflow.parameters.s3-use-https}}" + - name: S3_VERIFY_SSL + value: "{{workflow.parameters.s3-verify-ssl}}" + - name: S3_ENDPOINT + value: {{workflow.parameters.s3-endpoint}} + restartPolicy: OnFailure + Worker: + replicas: {{workflow.parameters.tf-worker}} + template: + spec: + serviceAccountName: tf-job-operator + containers: + - image: {{workflow.parameters.tf-model-image}} + name: tensorflow + imagePullPolicy: Always + env: + - name: TF_MODEL_DIR + value: {{inputs.parameters.s3-model-url}} + - name: TF_EXPORT_DIR + value: {{workflow.parameters.model-name}} + - name: TF_TRAIN_STEPS + value: "{{workflow.parameters.model-train-steps}}" + - name: TF_BATCH_SIZE + value: "{{workflow.parameters.model-batch-size}}" + - name: TF_LEARNING_RATE + value: "{{workflow.parameters.model-learning-rate}}" + - name: AWS_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + name: {{workflow.parameters.aws-secret}} + key: awsAccessKeyID + - name: AWS_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + name: {{workflow.parameters.aws-secret}} + key: awsSecretAccessKey + - name: AWS_DEFAULT_REGION + value: {{workflow.parameters.aws-region}} + - name: AWS_REGION + value: {{workflow.parameters.aws-region}} + - name: S3_REGION + value: {{workflow.parameters.aws-region}} + - name: S3_USE_HTTPS + value: "{{workflow.parameters.s3-use-https}}" + - name: S3_VERIFY_SSL + value: "{{workflow.parameters.s3-verify-ssl}}" + - name: S3_ENDPOINT + value: {{workflow.parameters.s3-endpoint}} + restartPolicy: OnFailure + Ps: + replicas: {{workflow.parameters.tf-ps}} + template: + spec: + containers: + - image: {{workflow.parameters.tf-model-image}} + name: tensorflow + imagePullPolicy: Always + env: + - name: TF_MODEL_DIR + value: {{inputs.parameters.s3-model-url}} + - name: TF_EXPORT_DIR + value: {{workflow.parameters.model-name}} + - name: TF_TRAIN_STEPS + value: "{{workflow.parameters.model-train-steps}}" + - name: TF_BATCH_SIZE + value: "{{workflow.parameters.model-batch-size}}" + - name: TF_LEARNING_RATE + value: "{{workflow.parameters.model-learning-rate}}" + - name: AWS_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + name: {{workflow.parameters.aws-secret}} + key: awsAccessKeyID + - name: AWS_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + name: {{workflow.parameters.aws-secret}} + key: awsSecretAccessKey + - name: AWS_DEFAULT_REGION + value: {{workflow.parameters.aws-region}} + - name: AWS_REGION + value: {{workflow.parameters.aws-region}} + - name: S3_REGION + value: {{workflow.parameters.aws-region}} + - name: S3_USE_HTTPS + value: "{{workflow.parameters.s3-use-https}}" + - name: S3_VERIFY_SSL + value: "{{workflow.parameters.s3-verify-ssl}}" + - name: S3_ENDPOINT + value: {{workflow.parameters.s3-endpoint}} + restartPolicy: OnFailure - name: tf-tensorboard inputs: parameters: