mirror of https://github.com/kubeflow/examples.git
revert back removed v1alpha2 yaml manifests (#475)
* revert back removed v1alpha2 yaml manifests * Add documentation * Fix format
This commit is contained in:
parent
6770b4adcc
commit
c83ed09a77
|
@ -20,6 +20,14 @@ Since this is a strong scaling example, we should perform an average after the a
|
|||
|
||||
Deploy the PyTorchJob resource to start training the CPU & GPU models:
|
||||
|
||||
### If running on Kubeflow 0.3.x:
|
||||
```
|
||||
kubectl create -f training/ddp/mnist/cpu/v1alpha2/job_mnist_DDP_CPU.yaml
|
||||
kubectl create -f training/ddp/mnist/gpu/v1alpha2/job_mnist_DDP_GPU.yaml
|
||||
|
||||
```
|
||||
|
||||
### If running on Kubeflow 0.4.x or newer:
|
||||
```
|
||||
kubectl create -f training/ddp/mnist/cpu/v1beta1/job_mnist_DDP_CPU.yaml
|
||||
kubectl create -f training/ddp/mnist/gpu/v1beta1/job_mnist_DDP_GPU.yaml
|
||||
|
|
|
@ -0,0 +1,38 @@
|
|||
---
# PyTorchJob (kubeflow.org/v1alpha2) for the CPU variant of the MNIST
# distributed-data-parallel example: one Master replica and three Worker
# replicas, all running the same image with the same volume mounted.
apiVersion: "kubeflow.org/v1alpha2"
kind: "PyTorchJob"
metadata:
  name: "pytorch-mnist-ddp-cpu"
spec:
  pytorchReplicaSpecs:
    # Single Master replica.
    Master:
      replicas: 1
      restartPolicy: OnFailure
      template:
        spec:
          containers:
            - name: pytorch
              image: gcr.io/kubeflow-examples/pytorch-mnist-ddp-cpu
              volumeMounts:
                # Must match the volume name declared under `volumes` below.
                - mountPath: /mnt/kubeflow-gcfs
                  name: kubeflow-gcfs
          volumes:
            # Backed by the existing PersistentVolumeClaim `kubeflow-gcfs`.
            - name: kubeflow-gcfs
              persistentVolumeClaim:
                claimName: kubeflow-gcfs
                readOnly: false
    # Three Worker replicas; pod template mirrors the Master's.
    Worker:
      replicas: 3
      restartPolicy: OnFailure
      template:
        spec:
          containers:
            - name: pytorch
              image: gcr.io/kubeflow-examples/pytorch-mnist-ddp-cpu
              volumeMounts:
                # Must match the volume name declared under `volumes` below.
                - mountPath: /mnt/kubeflow-gcfs
                  name: kubeflow-gcfs
          volumes:
            # Backed by the existing PersistentVolumeClaim `kubeflow-gcfs`.
            - name: kubeflow-gcfs
              persistentVolumeClaim:
                claimName: kubeflow-gcfs
                readOnly: false
|
|
@ -0,0 +1,46 @@
|
|||
---
# PyTorchJob (kubeflow.org/v1alpha2) for the GPU variant of the MNIST
# distributed-data-parallel example: one Master replica and three Worker
# replicas, all running the same image with the same volume mounted and a
# limit of one `nvidia.com/gpu` per container.
apiVersion: "kubeflow.org/v1alpha2"
kind: "PyTorchJob"
metadata:
  name: "pytorch-mnist-ddp-gpu"
spec:
  pytorchReplicaSpecs:
    # Single Master replica.
    Master:
      replicas: 1
      restartPolicy: OnFailure
      template:
        spec:
          containers:
            - name: pytorch
              image: gcr.io/kubeflow-examples/pytorch-mnist-ddp-gpu
              volumeMounts:
                # Must match the volume name declared under `volumes` below.
                - mountPath: /mnt/kubeflow-gcfs
                  name: kubeflow-gcfs
              resources:
                limits:
                  # One GPU per container.
                  nvidia.com/gpu: 1
          volumes:
            # Backed by the existing PersistentVolumeClaim `kubeflow-gcfs`.
            - name: kubeflow-gcfs
              persistentVolumeClaim:
                claimName: kubeflow-gcfs
                readOnly: false

    # Three Worker replicas; pod template mirrors the Master's.
    Worker:
      replicas: 3
      restartPolicy: OnFailure
      template:
        spec:
          containers:
            - name: pytorch
              image: gcr.io/kubeflow-examples/pytorch-mnist-ddp-gpu
              volumeMounts:
                # Must match the volume name declared under `volumes` below.
                - mountPath: /mnt/kubeflow-gcfs
                  name: kubeflow-gcfs
              resources:
                limits:
                  # One GPU per container.
                  nvidia.com/gpu: 1
          volumes:
            # Backed by the existing PersistentVolumeClaim `kubeflow-gcfs`.
            - name: kubeflow-gcfs
              persistentVolumeClaim:
                claimName: kubeflow-gcfs
                readOnly: false
|
||||
|
Loading…
Reference in New Issue