Merge 6dcc0f73e9 into b1c5c9d060
This commit is contained in:
commit
a014d42a90
|
|
@ -0,0 +1,60 @@
|
||||||
|
# Full CUDA release (major.minor.patch); it selects the tag of the default base image.
ARG CUDA_VERSION_MINOR=12.4.1
ARG BASE_IMAGE=nvidia/cuda:${CUDA_VERSION_MINOR}-devel-ubuntu22.04
FROM ${BASE_IMAGE} AS base

# CUDA major.minor; it forms the "+cuda<version>" suffix of the pinned NCCL apt
# package versions. Keep it in step with CUDA_VERSION_MINOR above.
ARG CUDA_VERSION_MAJOR=12.4
# NCCL package version that libnccl2 / libnccl-dev are pinned to.
ARG TARGET_NCCL_VERSION=2.21.5-1

# Declared as ARG (not ENV) so the setting applies during the build only.
ARG DEBIAN_FRONTEND=noninteractive
|
||||||
|
# Build toolchain, OpenMPI, diagnostics utilities, DCGM and the pinned NCCL
# runtime/dev packages. NCCL is pinned to
# ${TARGET_NCCL_VERSION}+cuda${CUDA_VERSION_MAJOR}; held packages may be changed
# and downgraded so the pin can be satisfied.
RUN apt-get -qq update \
    && apt-get -qq install -y \
        --allow-change-held-packages \
        --no-install-recommends \
        --allow-downgrades \
        autoconf \
        automake \
        autotools-dev \
        build-essential \
        ca-certificates \
        curl \
        datacenter-gpu-manager \
        environment-modules \
        g++ \
        git \
        iputils-ping \
        libnccl-dev=${TARGET_NCCL_VERSION}+cuda${CUDA_VERSION_MAJOR} \
        libnccl2=${TARGET_NCCL_VERSION}+cuda${CUDA_VERSION_MAJOR} \
        libnuma1 \
        libopenmpi-dev \
        libpci-dev \
        libpmix-dev \
        libsubunit0 \
        libtool \
        net-tools \
        openmpi-bin \
        openssh-server \
        unzip \
        vim \
        wget \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
# InfiniBand / RDMA userspace libraries and diagnostic tools.
# Each package is listed exactly once (ibverbs-utils used to appear twice).
RUN apt-get -qq update \
    && apt-get -qq install -y --no-install-recommends \
        ibverbs-utils \
        infiniband-diags \
        libibumad-dev \
        libibumad3 \
        libibverbs-dev \
        librdmacm-dev \
        rdmacm-utils \
    && rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
# Install the Google Cloud SDK from Google's apt repository.
# - The repository key goes into a dedicated keyring referenced with
#   "signed-by" rather than the deprecated `apt-key add`, so the key is trusted
#   for this repository only.
# - The key is downloaded to a file with `curl -f`, so an HTTP error fails the
#   build instead of being piped onward as if it were a key.
# - The source list is written with ">" so re-running the step cannot append
#   a duplicate entry.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        apt-transport-https \
        ca-certificates \
        curl \
        gnupg \
    && curl -fsSL -o /tmp/google-cloud-apt-key.asc \
        https://packages.cloud.google.com/apt/doc/apt-key.gpg \
    && gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg /tmp/google-cloud-apt-key.asc \
    && rm -f /tmp/google-cloud-apt-key.asc \
    && echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" \
        > /etc/apt/sources.list.d/google-cloud-sdk.list \
    && apt-get update \
    && apt-get install -y --no-install-recommends google-cloud-sdk \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# NCCL Tests
# Fetch NVIDIA/nccl-tests at the pinned commit, unpack it into the working
# directory and build it with MPI support for the compute_90 / sm_90 targets
# named by the two gencode variables.
ENV NCCL_TESTS_COMMITISH=c6afef0 \
    CUDA12_GENCODE='-gencode=arch=compute_90,code=sm_90' \
    CUDA12_PTX='-gencode=arch=compute_90,code=compute_90'
WORKDIR /opt/nccl-tests
# `mpicc -show` prints the wrapper's underlying compile command into the build log.
# /opt/nccl_tests is an underscore alias for the build directory.
RUN wget -q -O - https://github.com/NVIDIA/nccl-tests/archive/${NCCL_TESTS_COMMITISH}.tar.gz \
        | tar --strip-components=1 -xzf - \
    && mpicc -show \
    && CXX=mpic++ make -j20 MPI=1 MPI_HOME=/usr/include/openmpi \
        NVCC_GENCODE="$CUDA12_GENCODE $CUDA12_PTX" \
    && ln -s /opt/nccl-tests /opt/nccl_tests

# Refresh the dynamic linker cache after the installs above.
RUN ldconfig
|
||||||
|
|
||||||
|
# SSH dependencies for MPI
# Client (ssh_config): set StrictHostKeyChecking to "no", send known hosts to
# /dev/null and use port 2222.
# Server (sshd_config): set StrictModes to "no" and listen on port 2222.
# /var/run/sshd is created for the SSH daemon.
RUN mkdir -p /var/run/sshd \
    && sed -i \
        -e 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' \
        -e "s/[ #]\(.*Port \).*/ \12222/g" \
        /etc/ssh/ssh_config \
    && echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config \
    && sed -i \
        -e 's/#\(StrictModes \).*/\1no/g' \
        -e "s/#\(Port \).*/\12222/g" \
        /etc/ssh/sshd_config
|
||||||
|
|
@ -0,0 +1,68 @@
|
||||||
|
# MPIJob that runs the NCCL all_reduce_perf benchmark across 2 workers with
# 8 GPUs (and 8 MPI slots) each, driven by a single launcher pod.
apiVersion: kubeflow.org/v2beta1
kind: MPIJob
metadata:
  name: nccl-tests
spec:
  slotsPerWorker: 8
  runPolicy:
    cleanPodPolicy: Running
    activeDeadlineSeconds: 666
  mpiReplicaSpecs:
    Launcher:
      replicas: 1
      template:
        spec:
          restartPolicy: OnFailure
          containers:
            - image: mpioperator/nccl-tests:latest
              name: nccl
              securityContext:
                privileged: true
              env:
                - name: OMPI_ALLOW_RUN_AS_ROOT
                  value: "1"
                - name: OMPI_ALLOW_RUN_AS_ROOT_CONFIRM
                  value: "1"
                - name: OMPI_MCA_orte_base_help_aggregate
                  value: "0"
              command: ["/bin/bash", "-c"]
              args:
                - |
                  set -xe
                  export NCCL_DEBUG=INFO
                  # Total MPI ranks = Worker replicas (2) x slotsPerWorker (8).
                  # Update this value whenever either of those changes.
                  NP=16
                  # Retry until every rank can run nvidia-smi on the workers.
                  until mpirun -np "${NP}" -x LD_LIBRARY_PATH -bind-to none /usr/local/nvidia/bin/nvidia-smi; do sleep 5; done
                  mpirun -np "${NP}" -bind-to none \
                    -x NCCL_DEBUG \
                    /opt/nccl_tests/build/all_reduce_perf -c 0 -b 8 -e 16G \
                    -f 4 -g 1 -n 10
              resources:
                requests:
                  cpu: 50m
                  memory: 128Mi
          enableServiceLinks: false
          automountServiceAccountToken: false
    Worker:
      replicas: 2
      template:
        metadata:
          annotations:
        spec:
          volumes:
            # Memory-backed emptyDir mounted at /dev/shm in the worker container.
            - name: shared-memory
              emptyDir:
                medium: "Memory"

          containers:
            - image: mpioperator/nccl-tests:latest
              name: nccl
              securityContext:
                privileged: true
              resources:
                limits:
                  nvidia.com/gpu: 8
              volumeMounts:
                - name: shared-memory
                  mountPath: /dev/shm

          enableServiceLinks: false
          automountServiceAccountToken: false
|
||||||
Loading…
Reference in New Issue