Restore the removed v1alpha2 YAML manifests (#475)

* Restore the removed v1alpha2 YAML manifests

* Add documentation

* Fix format
This commit is contained in:
Hung-Ting Wen 2019-01-14 17:08:29 -08:00 committed by Kubernetes Prow Robot
parent 6770b4adcc
commit c83ed09a77
3 changed files with 92 additions and 0 deletions

View File

@ -20,6 +20,14 @@ Since this is a strong scaling example, we should perform an average after the a
Deploy the PyTorchJob resource to start training the CPU & GPU models:
### If running on Kubeflow 0.3.x:
```
kubectl create -f training/ddp/mnist/cpu/v1alpha2/job_mnist_DDP_CPU.yaml
kubectl create -f training/ddp/mnist/cpu/v1alpha2/job_mnist_DDP_GPU.yaml
```
### If running on Kubeflow 0.4.x or newer:
```
kubectl create -f training/ddp/mnist/cpu/v1beta1/job_mnist_DDP_CPU.yaml
kubectl create -f training/ddp/mnist/cpu/v1beta1/job_mnist_DDP_GPU.yaml

View File

@ -0,0 +1,38 @@
---
# PyTorchJob (kubeflow.org/v1alpha2) with one Master replica and three Worker
# replicas, all running the same CPU image with the "kubeflow-gcfs" claim
# mounted at /mnt/kubeflow-gcfs.
#
# NOTE(review): the nesting below was reconstructed from the PyTorchJob and
# PodSpec schemas because the indentation had been lost; confirm it against
# the original manifest before applying.
apiVersion: "kubeflow.org/v1alpha2"
kind: "PyTorchJob"
metadata:
  name: "pytorch-mnist-ddp-cpu"
spec:
  pytorchReplicaSpecs:
    Master:
      replicas: 1
      restartPolicy: OnFailure
      template:
        spec:
          containers:
            - name: pytorch
              # No tag is pinned, so Kubernetes resolves this to ":latest".
              image: gcr.io/kubeflow-examples/pytorch-mnist-ddp-cpu
              volumeMounts:
                - mountPath: /mnt/kubeflow-gcfs
                  name: kubeflow-gcfs
          volumes:
            # Must match the volumeMounts name used by the container above.
            - name: kubeflow-gcfs
              persistentVolumeClaim:
                claimName: kubeflow-gcfs
                readOnly: false
    Worker:
      replicas: 3
      restartPolicy: OnFailure
      template:
        spec:
          containers:
            - name: pytorch
              # Same image as the Master replica.
              image: gcr.io/kubeflow-examples/pytorch-mnist-ddp-cpu
              volumeMounts:
                - mountPath: /mnt/kubeflow-gcfs
                  name: kubeflow-gcfs
          volumes:
            # Same claim as the Master replica.
            - name: kubeflow-gcfs
              persistentVolumeClaim:
                claimName: kubeflow-gcfs
                readOnly: false

View File

@ -0,0 +1,46 @@
---
# PyTorchJob (kubeflow.org/v1alpha2) with one Master replica and three Worker
# replicas, all running the same GPU image with the "kubeflow-gcfs" claim
# mounted at /mnt/kubeflow-gcfs. Every container is limited to one
# nvidia.com/gpu.
#
# NOTE(review): the nesting below was reconstructed from the PyTorchJob and
# PodSpec schemas because the indentation had been lost; confirm it against
# the original manifest before applying.
apiVersion: "kubeflow.org/v1alpha2"
kind: "PyTorchJob"
metadata:
  name: "pytorch-mnist-ddp-gpu"
spec:
  pytorchReplicaSpecs:
    Master:
      replicas: 1
      restartPolicy: OnFailure
      template:
        spec:
          containers:
            - name: pytorch
              # No tag is pinned, so Kubernetes resolves this to ":latest".
              image: gcr.io/kubeflow-examples/pytorch-mnist-ddp-gpu
              volumeMounts:
                - mountPath: /mnt/kubeflow-gcfs
                  name: kubeflow-gcfs
              resources:
                limits:
                  # One GPU per Master container.
                  nvidia.com/gpu: 1
          volumes:
            # Must match the volumeMounts name used by the container above.
            - name: kubeflow-gcfs
              persistentVolumeClaim:
                claimName: kubeflow-gcfs
                readOnly: false
    Worker:
      replicas: 3
      restartPolicy: OnFailure
      template:
        spec:
          containers:
            - name: pytorch
              # Same image as the Master replica.
              image: gcr.io/kubeflow-examples/pytorch-mnist-ddp-gpu
              volumeMounts:
                - mountPath: /mnt/kubeflow-gcfs
                  name: kubeflow-gcfs
              resources:
                limits:
                  # One GPU per Worker container.
                  nvidia.com/gpu: 1
          volumes:
            # Same claim as the Master replica.
            - name: kubeflow-gcfs
              persistentVolumeClaim:
                claimName: kubeflow-gcfs
                readOnly: false