mirror of https://github.com/kubeflow/examples.git
parent
1e385247b0
commit
0d49548b3a
|
|
@ -0,0 +1,27 @@
|
|||
# Kubeflow MPI Horovod example
|
||||
|
||||
This example deploys the MPI operator into a Kubeflow cluster and runs a distributed training example on GPUs.
|
||||
|
||||
## Steps
|
||||
|
||||
|
||||
* Deploy [kubeflow cluster (version v0.7.0)](https://www.kubeflow.org/docs/gke/deploy/)
|
||||
* Add a GPU node pool to the newly created Kubeflow cluster (you may need to increase your quotas first):
|
||||
```
|
||||
export PROJECT=
|
||||
export CLUSTER=
|
||||
gcloud container node-pools create gpu-pool-mpi --accelerator=type=nvidia-tesla-k80,count=4 --cluster=$CLUSTER --project=$PROJECT --machine-type=n1-standard-8 --num-nodes=2
|
||||
```
|
||||
* Deploy the MPI operator into the Kubeflow cluster. From a checkout of the [kubeflow manifests](https://github.com/kubeflow/manifests) repo, run:
|
||||
```
|
||||
kustomize build mpi-job/mpi-operator/base/ | kubectl apply -f -
|
||||
```
|
||||
* Deploy the MPI example job:
|
||||
```
|
||||
kubectl apply -f mpi-job.yaml -n kubeflow
|
||||
```
|
||||
* Once the launcher pod is up and running, its logs can be followed with:
|
||||
```
|
||||
POD_NAME=$(kubectl -n kubeflow get pods -l mpi_job_name=tf-resnet50-horovod-job,mpi_role_type=launcher -o name)
|
||||
kubectl -n kubeflow logs -f ${POD_NAME}
|
||||
```
|
||||
|
|
@ -0,0 +1,49 @@
|
|||
---
# MPIJob that launches the tf_cnn_benchmarks ResNet-50 benchmark through
# mpirun, using Horovod for variable updates.
# Apply with: kubectl apply -f mpi-job.yaml -n kubeflow
apiVersion: kubeflow.org/v1alpha1
kind: MPIJob
metadata:
  labels:
    ksonnet.io/component: tf-resnet50-horovod-job
  # The README's log command selects the launcher pod with the label
  # mpi_job_name=tf-resnet50-horovod-job, so keep the two in sync.
  name: tf-resnet50-horovod-job
  namespace: kubeflow
spec:
  # Sized to match the GPU node pool from the README (--num-nodes=2).
  # NOTE(review): confirm how v1alpha1 maps replicas onto worker pods.
  replicas: 2
  template:
    spec:
      containers:
        - command:
            - mpirun
            - --allow-run-as-root
            # Open MPI MCA transport settings: skip the loopback interface,
            # use the ob1 PML, and disable the openib BTL.
            - -mca
            - btl_tcp_if_exclude
            - lo
            - -mca
            - pml
            - ob1
            - -mca
            - btl
            - ^openib
            # Process placement.
            - --bind-to
            - none
            - -map-by
            - slot
            # Environment variables passed to the launched processes via -x.
            - -x
            - LD_LIBRARY_PATH
            - -x
            - PATH
            - -x
            - NCCL_DEBUG=INFO
            # Benchmark script and its flags.
            - python
            - scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py
            - --data_format=NCHW
            - --batch_size=128
            - --model=resnet50
            - --optimizer=sgd
            - --variable_update=horovod
            - --data_name=imagenet
            - --use_fp16
          # NOTE(review): ":latest" is a floating tag; consider pinning a
          # specific image version for reproducible runs.
          image: mpioperator/tensorflow-benchmarks:latest
          name: tf-resnet50-horovod-job
          resources:
            limits:
              # Matches the 4 GPUs per node requested for the README's
              # node pool (--accelerator=...,count=4).
              nvidia.com/gpu: 4
|
||||
Loading…
Reference in New Issue