Restore removed v1alpha2 YAML manifests (#475)

* Restore removed v1alpha2 YAML manifests
* Add documentation
* Fix format
2019-01-14 17:08:29 -08:00 · 2019-01-14 17:08:29 -08:00 · c83ed09a77
parent 6770b4adcc
commit c83ed09a77
3 changed files with 92 additions and 0 deletions
--- a/pytorch_mnist/02_distributed_training.md
+++ b/pytorch_mnist/02_distributed_training.md
@@ -20,6 +20,14 @@ Since this is a strong scaling example, we should perform an average after the a

 Deploy the PyTorchJob resource to start training the CPU & GPU models:

+### If running on Kubeflow 0.3.x:
+```
+kubectl create -f training/ddp/mnist/cpu/v1alpha2/job_mnist_DDP_CPU.yaml
+kubectl create -f training/ddp/mnist/gpu/v1alpha2/job_mnist_DDP_GPU.yaml
+
+```
+
+### If running on Kubeflow 0.4.x or newer:
 ```
 kubectl create -f training/ddp/mnist/cpu/v1beta1/job_mnist_DDP_CPU.yaml
 kubectl create -f training/ddp/mnist/cpu/v1beta1/job_mnist_DDP_GPU.yaml
--- a/pytorch_mnist/training/ddp/mnist/cpu/v1alpha2/job_mnist_DDP_CPU.yaml
+++ b/pytorch_mnist/training/ddp/mnist/cpu/v1alpha2/job_mnist_DDP_CPU.yaml
@@ -0,0 +1,38 @@
+apiVersion: "kubeflow.org/v1alpha2"  # v1alpha2 manifest, for Kubeflow 0.3.x
+kind: "PyTorchJob"
+metadata:
+  name: "pytorch-mnist-ddp-cpu"
+spec:
+  pytorchReplicaSpecs:
+    Master:  # one master replica
+      replicas: 1
+      restartPolicy: OnFailure
+      template:
+        spec:
+          containers:  # NOTE(review): image is untagged; consider pinning
+            - name: pytorch
+              image: gcr.io/kubeflow-examples/pytorch-mnist-ddp-cpu
+              volumeMounts:  # claim declared under `volumes` below
+                - mountPath: /mnt/kubeflow-gcfs
+                  name: kubeflow-gcfs
+          volumes:
+            - name: kubeflow-gcfs
+              persistentVolumeClaim:
+                claimName: kubeflow-gcfs
+                readOnly: false
+    Worker:  # three worker replicas, same image and volume as Master
+      replicas: 3
+      restartPolicy: OnFailure
+      template:
+        spec:
+          containers:  # NOTE(review): image is untagged; consider pinning
+            - name: pytorch
+              image: gcr.io/kubeflow-examples/pytorch-mnist-ddp-cpu
+              volumeMounts:  # claim declared under `volumes` below
+                - mountPath: /mnt/kubeflow-gcfs
+                  name: kubeflow-gcfs
+          volumes:
+            - name: kubeflow-gcfs
+              persistentVolumeClaim:
+                claimName: kubeflow-gcfs
+                readOnly: false
--- a/pytorch_mnist/training/ddp/mnist/gpu/v1alpha2/job_mnist_DDP_GPU.yaml
+++ b/pytorch_mnist/training/ddp/mnist/gpu/v1alpha2/job_mnist_DDP_GPU.yaml
@@ -0,0 +1,46 @@
+apiVersion: "kubeflow.org/v1alpha2"  # v1alpha2 manifest, for Kubeflow 0.3.x
+kind: "PyTorchJob"
+metadata:
+  name: "pytorch-mnist-ddp-gpu"
+spec:
+  pytorchReplicaSpecs:
+    Master:  # one master replica
+      replicas: 1
+      restartPolicy: OnFailure
+      template:
+        spec:
+          containers:  # NOTE(review): image is untagged; consider pinning
+            - name: pytorch
+              image: gcr.io/kubeflow-examples/pytorch-mnist-ddp-gpu
+              volumeMounts:  # claim declared under `volumes` below
+                - mountPath: /mnt/kubeflow-gcfs
+                  name: kubeflow-gcfs
+              resources:
+                limits:
+                  nvidia.com/gpu: 1  # one GPU per master container
+          volumes:
+            - name: kubeflow-gcfs
+              persistentVolumeClaim:
+                claimName: kubeflow-gcfs
+                readOnly: false
+
+    Worker:  # three worker replicas, same image and volume as Master
+      replicas: 3
+      restartPolicy: OnFailure
+      template:
+        spec:
+          containers:  # NOTE(review): image is untagged; consider pinning
+            - name: pytorch
+              image: gcr.io/kubeflow-examples/pytorch-mnist-ddp-gpu
+              volumeMounts:  # claim declared under `volumes` below
+                - mountPath: /mnt/kubeflow-gcfs
+                  name: kubeflow-gcfs
+              resources:
+                limits:
+                  nvidia.com/gpu: 1  # one GPU per worker container
+          volumes:
+            - name: kubeflow-gcfs
+              persistentVolumeClaim:
+                claimName: kubeflow-gcfs
+                readOnly: false
+