diff --git a/examples/v2beta1/nccl-tests/Dockerfile b/examples/v2beta1/nccl-tests/Dockerfile
new file mode 100644
index 0000000..5d4048b
--- /dev/null
+++ b/examples/v2beta1/nccl-tests/Dockerfile
@@ -0,0 +1,72 @@
+# Image for running NVIDIA nccl-tests under MPI: a CUDA devel base plus NCCL,
+# Open MPI, RDMA userspace libraries, the Google Cloud SDK and an SSH server.
+ARG CUDA_VERSION_MINOR=12.4.1
+ARG BASE_IMAGE=nvidia/cuda:${CUDA_VERSION_MINOR}-devel-ubuntu22.04
+FROM ${BASE_IMAGE} AS base
+
+# Fail piped RUN commands (curl | gpg, wget | tar) when the first command fails.
+SHELL ["/bin/bash", "-o", "pipefail", "-c"]
+
+ARG CUDA_VERSION_MAJOR=12.4
+ARG TARGET_NCCL_VERSION=2.21.5-1
+
+# Build toolchain, Open MPI, sshd, diagnostics, and NCCL pinned to TARGET_NCCL_VERSION.
+ARG DEBIAN_FRONTEND=noninteractive
+RUN apt-get -qq update && \
+    apt-get -qq install -y \
+    --allow-change-held-packages \
+    --no-install-recommends \
+    --allow-downgrades \
+    build-essential libtool autoconf automake autotools-dev unzip \
+    ca-certificates \
+    wget curl openssh-server vim environment-modules \
+    iputils-ping net-tools \
+    libnuma1 libsubunit0 libpci-dev \
+    libpmix-dev \
+    datacenter-gpu-manager \
+    g++ libopenmpi-dev openmpi-bin \
+    libnccl2=${TARGET_NCCL_VERSION}+cuda${CUDA_VERSION_MAJOR} \
+    libnccl-dev=${TARGET_NCCL_VERSION}+cuda${CUDA_VERSION_MAJOR} \
+    git && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+# RDMA / InfiniBand userspace libraries and diagnostic tools.
+RUN apt-get -qq update \
+  && apt-get -qq install -y --no-install-recommends \
+  ibverbs-utils libibverbs-dev libibumad3 libibumad-dev librdmacm-dev rdmacm-utils infiniband-diags \
+  && rm -rf /var/lib/apt/lists/*
+
+# Google Cloud SDK from Google's apt repository. The repository key goes into a
+# dedicated keyring referenced with signed-by, since apt-key is deprecated.
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends apt-transport-https ca-certificates gnupg curl && \
+    curl -fsSL https://packages.cloud.google.com/apt/doc/apt-key.gpg | \
+      gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg && \
+    echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" | \
+      tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && \
+    apt-get update && apt-get install -y google-cloud-sdk && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+
+# NCCL Tests: built from a pinned upstream commit with MPI support. The gencode
+# flags emit sm_90 device code plus compute_90 PTX.
+ENV NCCL_TESTS_COMMITISH=c6afef0
+ENV CUDA12_GENCODE='-gencode=arch=compute_90,code=sm_90'
+ENV CUDA12_PTX='-gencode=arch=compute_90,code=compute_90'
+WORKDIR /opt/nccl-tests
+RUN wget -q -O - https://github.com/NVIDIA/nccl-tests/archive/${NCCL_TESTS_COMMITISH}.tar.gz | tar --strip-components=1 -xzf - && \
+    mpicc -show && \
+    export CXX=mpic++ && \
+    make -j20 MPI=1 MPI_HOME=/usr/include/openmpi NVCC_GENCODE="$CUDA12_GENCODE $CUDA12_PTX" && \
+    ln -s /opt/nccl-tests /opt/nccl_tests
+
+RUN ldconfig
+
+# SSH setup for MPI: disable host-key and StrictModes checks, and use port 2222
+# for both the ssh client and sshd.
+RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \
+    echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \
+    sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config && \
+    sed -i "s/[ #]\(.*Port \).*/ \12222/g" /etc/ssh/ssh_config && \
+    sed -i "s/#\(Port \).*/\12222/g" /etc/ssh/sshd_config && \
+    mkdir -p /var/run/sshd
diff --git a/examples/v2beta1/nccl-tests/nccl-tests.yaml b/examples/v2beta1/nccl-tests/nccl-tests.yaml
new file mode 100644
index 0000000..c343934
--- /dev/null
+++ b/examples/v2beta1/nccl-tests/nccl-tests.yaml
@@ -0,0 +1,73 @@
+# MPIJob running the NCCL all_reduce_perf benchmark on 2 workers with 8 GPUs each.
+apiVersion: kubeflow.org/v2beta1
+kind: MPIJob
+metadata:
+  name: nccl-tests
+spec:
+  slotsPerWorker: 8
+  runPolicy:
+    cleanPodPolicy: Running
+    activeDeadlineSeconds: 666
+  mpiReplicaSpecs:
+    Launcher:
+      replicas: 1
+      template:
+        spec:
+          restartPolicy: OnFailure
+          containers:
+          - image: mpioperator/nccl-tests:latest
+            name: nccl
+            securityContext:
+              privileged: true
+            env:
+            - name: OMPI_ALLOW_RUN_AS_ROOT
+              value: "1"
+            - name: OMPI_ALLOW_RUN_AS_ROOT_CONFIRM
+              value: "1"
+            - name: OMPI_MCA_orte_base_help_aggregate
+              value: "0"
+            command: ["/bin/bash", "-c"]
+            args:
+              - |
+                set -xe
+                export NCCL_DEBUG=INFO
+                # Total MPI ranks = Worker replicas (2) x slotsPerWorker (8).
+                NP=16
+                # Retry every 5s until nvidia-smi can be launched on all ranks.
+                until mpirun -np ${NP} -x LD_LIBRARY_PATH -bind-to none /usr/local/nvidia/bin/nvidia-smi; do sleep 5; done
+                mpirun -np ${NP} -bind-to none \
+                  -x NCCL_DEBUG \
+                  /opt/nccl_tests/build/all_reduce_perf -c 0 -b 8 -e 16G \
+                  -f 4 -g 1 -n 10
+            resources:
+              requests:
+                cpu: 50m
+                memory: 128Mi
+          enableServiceLinks: false
+          automountServiceAccountToken: false
+    Worker:
+      replicas: 2
+      template:
+        metadata:
+          annotations:
+        spec:
+          volumes:
+          # Memory-backed emptyDir mounted at /dev/shm in the worker container.
+          - name: shared-memory
+            emptyDir:
+              medium: "Memory"
+
+          containers:
+          - image: mpioperator/nccl-tests:latest
+            name: nccl
+            securityContext:
+              privileged: true
+            resources:
+              limits:
+                nvidia.com/gpu: 8
+            volumeMounts:
+            - name: shared-memory
+              mountPath: /dev/shm
+
+          enableServiceLinks: false
+          automountServiceAccountToken: false