mirror of https://github.com/kubeflow/examples.git
				
				
				
			
							parent
							
								
									1e385247b0
								
							
						
					
					
						commit
						0d49548b3a
					
				|  | @ -0,0 +1,27 @@ | ||||||
|  | # Kubeflow MPI Horovod example | ||||||
|  | 
 | ||||||
|  | This example deploys the MPI operator into a kubeflow cluster and runs a distributed training example using GPUs.  | ||||||
|  | 
 | ||||||
|  | ## Steps | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | * Deploy [kubeflow cluster (version v0.7.0)](https://www.kubeflow.org/docs/gke/deploy/) | ||||||
|  | * Add a GPU node pool to the newly created kubeflow cluster (you might need to increase quotas): | ||||||
|  | ``` | ||||||
|  | export PROJECT= | ||||||
|  | export CLUSTER= | ||||||
|  | gcloud container node-pools create gpu-pool-mpi --accelerator=type=nvidia-tesla-k80,count=4 --cluster=$CLUSTER --project=$PROJECT --machine-type=n1-standard-8 --num-nodes=2 | ||||||
|  | ``` | ||||||
|  | * Deploy the MPI operator into the kubeflow cluster: from the [kubeflow manifests](https://github.com/kubeflow/manifests) repo, run  | ||||||
|  | ``` | ||||||
|  | kustomize build mpi-job/mpi-operator/base/ | kubectl apply -f - | ||||||
|  | ``` | ||||||
|  | * Deploy the MPI example job: | ||||||
|  | ``` | ||||||
|  | kubectl apply -f mpi-job.yaml -n kubeflow | ||||||
|  | ``` | ||||||
|  | * Once the launcher pod is up and running, its logs are available via: | ||||||
|  | ``` | ||||||
|  | POD_NAME=$(kubectl -n kubeflow get pods -l mpi_job_name=tf-resnet50-horovod-job,mpi_role_type=launcher -o name) | ||||||
|  | kubectl -n kubeflow logs -f ${POD_NAME} | ||||||
|  | ``` | ||||||
|  | @ -0,0 +1,49 @@ | ||||||
|  | --- | ||||||
|  | apiVersion: kubeflow.org/v1alpha1 | ||||||
|  | kind: MPIJob | ||||||
|  | metadata: | ||||||
|  |   labels: | ||||||
|  |     ksonnet.io/component: tf-resnet50-horovod-job | ||||||
|  |   name: tf-resnet50-horovod-job | ||||||
|  |   namespace: kubeflow | ||||||
|  | spec: | ||||||
|  |   replicas: 2 | ||||||
|  |   template: | ||||||
|  |     spec: | ||||||
|  |       containers: | ||||||
|  |       - command: | ||||||
|  |         - mpirun | ||||||
|  |         - --allow-run-as-root | ||||||
|  |         - -mca | ||||||
|  |         - btl_tcp_if_exclude | ||||||
|  |         - lo | ||||||
|  |         - -mca | ||||||
|  |         - pml | ||||||
|  |         - ob1 | ||||||
|  |         - -mca | ||||||
|  |         - btl | ||||||
|  |         - ^openib | ||||||
|  |         - --bind-to | ||||||
|  |         - none | ||||||
|  |         - -map-by | ||||||
|  |         - slot | ||||||
|  |         - -x | ||||||
|  |         - LD_LIBRARY_PATH | ||||||
|  |         - -x | ||||||
|  |         - PATH | ||||||
|  |         - -x | ||||||
|  |         - NCCL_DEBUG=INFO | ||||||
|  |         - python | ||||||
|  |         - scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py | ||||||
|  |         - --data_format=NCHW | ||||||
|  |         - --batch_size=128 | ||||||
|  |         - --model=resnet50 | ||||||
|  |         - --optimizer=sgd | ||||||
|  |         - --variable_update=horovod | ||||||
|  |         - --data_name=imagenet | ||||||
|  |         - --use_fp16 | ||||||
|  |         image: mpioperator/tensorflow-benchmarks:latest | ||||||
|  |         name: tf-resnet50-horovod-job | ||||||
|  |         resources: | ||||||
|  |           limits: | ||||||
|  |             nvidia.com/gpu: 4 | ||||||
		Loading…
	
		Reference in New Issue