This commit is contained in:
Sam Stoelinga 2025-09-24 21:03:29 +08:00 committed by GitHub
commit a014d42a90
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 128 additions and 0 deletions

View File

@ -0,0 +1,60 @@
# Build arguments declared before FROM are only usable in the FROM line itself.
# CUDA_VERSION_MINOR selects the tag of the default CUDA base image; override
# BASE_IMAGE to build on a different base altogether.
ARG CUDA_VERSION_MINOR=12.4.1
ARG BASE_IMAGE=nvidia/cuda:${CUDA_VERSION_MINOR}-devel-ubuntu22.04
FROM ${BASE_IMAGE} AS base
# Forms the "+cuda<version>" suffix of the pinned libnccl2 / libnccl-dev packages.
ARG CUDA_VERSION_MAJOR=12.4
# Version of the libnccl2 / libnccl-dev apt packages to install.
ARG TARGET_NCCL_VERSION=2.21.5-1
# Declared as ARG (not ENV) so it applies to build steps only and is not
# persisted into the environment of the final image.
ARG DEBIAN_FRONTEND=noninteractive
# Build toolchain, Open MPI, NCCL and assorted diagnostic utilities.
# libnccl2 / libnccl-dev are pinned to TARGET_NCCL_VERSION built for the CUDA
# release named by CUDA_VERSION_MAJOR.
# NOTE(review): --allow-downgrades / --allow-change-held-packages are presumably
# needed because the base image already ships (and holds) a different NCCL
# version -- confirm against the base image.
RUN apt-get -qq update \
    && apt-get -qq install -y \
        --allow-change-held-packages \
        --allow-downgrades \
        --no-install-recommends \
        autoconf \
        automake \
        autotools-dev \
        build-essential \
        ca-certificates \
        curl \
        datacenter-gpu-manager \
        environment-modules \
        g++ \
        git \
        iputils-ping \
        libnccl-dev=${TARGET_NCCL_VERSION}+cuda${CUDA_VERSION_MAJOR} \
        libnccl2=${TARGET_NCCL_VERSION}+cuda${CUDA_VERSION_MAJOR} \
        libnuma1 \
        libopenmpi-dev \
        libpci-dev \
        libpmix-dev \
        libsubunit0 \
        libtool \
        net-tools \
        openmpi-bin \
        openssh-server \
        unzip \
        vim \
        wget \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*
# InfiniBand / RDMA userspace libraries and diagnostic tools.
# Each package is listed exactly once, one per line and sorted, so additions
# and removals show up cleanly in diffs.
RUN apt-get -qq update \
    && apt-get -qq install -y --no-install-recommends \
        ibverbs-utils \
        infiniband-diags \
        libibumad-dev \
        libibumad3 \
        libibverbs-dev \
        librdmacm-dev \
        rdmacm-utils \
    && rm -rf /var/lib/apt/lists/*
# Google Cloud SDK, installed from Google's apt repository.
# - The repository signing key goes into a dedicated keyring referenced with
#   "signed-by" rather than the deprecated `apt-key add`, so the key is trusted
#   for this repository only.
# - The key is downloaded to a file (curl -f) instead of being piped, because
#   the default /bin/sh has no pipefail and a failed download inside a pipe
#   would not stop the build at the failing step.
# - The sources list is written with ">" so the entry is never duplicated.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        apt-transport-https \
        ca-certificates \
        curl \
        gnupg \
    && curl -fsSL -o /tmp/google-cloud-apt-key.gpg \
        https://packages.cloud.google.com/apt/doc/apt-key.gpg \
    && gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg /tmp/google-cloud-apt-key.gpg \
    && rm -f /tmp/google-cloud-apt-key.gpg \
    && echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" \
        > /etc/apt/sources.list.d/google-cloud-sdk.list \
    && apt-get update \
    && apt-get install -y google-cloud-sdk \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*
# NCCL Tests
# Revision of github.com/NVIDIA/nccl-tests to download and build.
ENV NCCL_TESTS_COMMITISH=c6afef0
# nvcc -gencode flags passed to the build as NVCC_GENCODE:
# native code for sm_90 plus PTX for compute_90.
ENV CUDA12_GENCODE='-gencode=arch=compute_90,code=sm_90'
ENV CUDA12_PTX='-gencode=arch=compute_90,code=compute_90'
WORKDIR /opt/nccl-tests
# The archive is downloaded to a file and then extracted (instead of
# `wget | tar`): the default /bin/sh has no pipefail, so with a pipe a failed
# download is only noticed indirectly through tar. `mpicc -show` prints the
# underlying compiler command line into the build log. The /opt/nccl_tests
# symlink provides the underscore spelling of the install path.
RUN wget -q -O /tmp/nccl-tests.tar.gz \
        https://github.com/NVIDIA/nccl-tests/archive/${NCCL_TESTS_COMMITISH}.tar.gz \
    && tar --strip-components=1 -xzf /tmp/nccl-tests.tar.gz \
    && rm -f /tmp/nccl-tests.tar.gz \
    && mpicc -show \
    && export CXX=mpic++ \
    && make -j20 MPI=1 MPI_HOME=/usr/include/openmpi NVCC_GENCODE="$CUDA12_GENCODE $CUDA12_PTX" \
    && ln -s /opt/nccl-tests /opt/nccl_tests
# Rebuild the dynamic linker cache.
RUN ldconfig
# SSH configuration for MPI (the launcher reaches workers over ssh).
# Client side (/etc/ssh/ssh_config):
#   - rewrites the StrictHostKeyChecking line (commented out or not) to "no"
#   - appends "UserKnownHostsFile /dev/null" so host keys are not recorded
#   - rewrites the Port line to 2222
# Server side (/etc/ssh/sshd_config):
#   - uncomments StrictModes and sets it to "no"
#   - uncomments Port and sets it to 2222
# In the sed replacements "\12222" is back-reference \1 followed by the
# literal text 2222. Each sed only takes effect if the stock config file
# contains a matching line.
# Finally /var/run/sshd is created.
# NOTE(review): presumably sshd's privilege-separation directory -- confirm.
RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \
echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \
sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config && \
sed -i "s/[ #]\(.*Port \).*/ \12222/g" /etc/ssh/ssh_config && \
sed -i "s/#\(Port \).*/\12222/g" /etc/ssh/sshd_config && \
mkdir /var/run/sshd -p

View File

@ -0,0 +1,68 @@
# MPIJob that runs the NCCL all_reduce_perf benchmark across 2 workers with
# 8 GPUs (and 8 MPI slots) each. The launcher first waits until mpirun can
# run nvidia-smi on every rank, then starts the benchmark.
apiVersion: kubeflow.org/v2beta1
kind: MPIJob
metadata:
  name: nccl-tests
spec:
  slotsPerWorker: 8
  runPolicy:
    cleanPodPolicy: Running
    activeDeadlineSeconds: 666
  mpiReplicaSpecs:
    Launcher:
      replicas: 1
      template:
        spec:
          restartPolicy: OnFailure
          containers:
          - image: mpioperator/nccl-tests:latest
            name: nccl
            securityContext:
              privileged: true
            env:
            # Open MPI refuses to run as root unless both variables are set.
            - name: OMPI_ALLOW_RUN_AS_ROOT
              value: "1"
            - name: OMPI_ALLOW_RUN_AS_ROOT_CONFIRM
              value: "1"
            - name: OMPI_MCA_orte_base_help_aggregate
              value: "0"
            command: ["/bin/bash", "-c"]
            args:
            - |
              set -xe
              export NCCL_DEBUG=INFO
              # Total MPI ranks = Worker replicas (2) x slotsPerWorker (8).
              # Keep in sync with those fields; NP was previously used below
              # without ever being defined.
              NP=16
              # Retry until every rank can be reached and sees its GPUs.
              until mpirun -np ${NP} -x LD_LIBRARY_PATH -bind-to none /usr/local/nvidia/bin/nvidia-smi; do sleep 5; done
              mpirun -np ${NP} -bind-to none \
                -x NCCL_DEBUG \
                /opt/nccl_tests/build/all_reduce_perf -c 0 -b 8 -e 16G \
                -f 4 -g 1 -n 10
            resources:
              requests:
                cpu: 50m
                memory: 128Mi
          enableServiceLinks: false
          automountServiceAccountToken: false
    Worker:
      replicas: 2
      template:
        metadata:
          # NOTE(review): empty in the original; presumably a placeholder.
          annotations:
        spec:
          volumes:
          # Memory-backed emptyDir mounted at /dev/shm in the worker container.
          - name: shared-memory
            emptyDir:
              medium: "Memory"
          containers:
          - image: mpioperator/nccl-tests:latest
            name: nccl
            securityContext:
              privileged: true
            resources:
              limits:
                nvidia.com/gpu: 8
            volumeMounts:
            - name: shared-memory
              mountPath: /dev/shm
          enableServiceLinks: false
          automountServiceAccountToken: false