Merge 6dcc0f73e9 into b1c5c9d060
This commit is contained in:
commit
a014d42a90
|
|
@ -0,0 +1,60 @@
|
|||
# Full CUDA release used to select the base image tag.
# Declared before FROM, so it is only visible to the FROM line below.
ARG CUDA_VERSION_MINOR=12.4.1
ARG BASE_IMAGE=nvidia/cuda:${CUDA_VERSION_MINOR}-devel-ubuntu22.04
FROM ${BASE_IMAGE} AS base

# major.minor CUDA version; forms the "+cuda<version>" suffix of the pinned
# NCCL apt package versions installed later in this stage.
ARG CUDA_VERSION_MAJOR=12.4
# NCCL package version requested for libnccl2 / libnccl-dev.
ARG TARGET_NCCL_VERSION=2.21.5-1

# Build-time only: keeps apt non-interactive without persisting the setting
# into the runtime environment of the final image.
ARG DEBIAN_FRONTEND=noninteractive
|
||||
# Toolchain, MPI, SSH, diagnostics and the pinned NCCL runtime/dev packages.
# NCCL is requested at ${TARGET_NCCL_VERSION}+cuda${CUDA_VERSION_MAJOR};
# --allow-downgrades / --allow-change-held-packages permit apt to replace an
# already-installed (possibly held) version — presumably the one shipped in
# the base image; confirm against the base image contents.
RUN apt-get -qq update \
 && apt-get -qq install -y \
      --allow-change-held-packages \
      --allow-downgrades \
      --no-install-recommends \
      autoconf \
      automake \
      autotools-dev \
      build-essential \
      ca-certificates \
      curl \
      datacenter-gpu-manager \
      environment-modules \
      g++ \
      git \
      iputils-ping \
      libnccl-dev=${TARGET_NCCL_VERSION}+cuda${CUDA_VERSION_MAJOR} \
      libnccl2=${TARGET_NCCL_VERSION}+cuda${CUDA_VERSION_MAJOR} \
      libnuma1 \
      libopenmpi-dev \
      libpci-dev \
      libpmix-dev \
      libsubunit0 \
      libtool \
      net-tools \
      openmpi-bin \
      openssh-server \
      unzip \
      vim \
      wget \
 && apt-get clean \
 && rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# RDMA / InfiniBand userspace libraries and diagnostic utilities.
# Each package is listed exactly once (the original named ibverbs-utils twice).
RUN apt-get -qq update \
 && apt-get -qq install -y --no-install-recommends \
      ibverbs-utils \
      infiniband-diags \
      libibumad-dev \
      libibumad3 \
      libibverbs-dev \
      librdmacm-dev \
      rdmacm-utils \
 && rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Google Cloud SDK from Google's apt repository.
# The repository key is written to a dedicated keyring and referenced with
# "signed-by" instead of the deprecated `apt-key add`, so the key is trusted
# for this repository only. The key is downloaded to a file with `curl -f`
# (fail on HTTP errors) rather than piped, because the default /bin/sh has no
# pipefail and would otherwise hide a failed download.
# Recommended packages are skipped for the helper tools only; the SDK install
# line keeps the original flags so its installed package set is unchanged.
RUN apt-get update \
 && apt-get install -y --no-install-recommends \
      apt-transport-https \
      ca-certificates \
      curl \
      gnupg \
 && curl -fsSL https://packages.cloud.google.com/apt/doc/apt-key.gpg -o /tmp/google-cloud.gpg \
 && gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg /tmp/google-cloud.gpg \
 && rm -f /tmp/google-cloud.gpg \
 && echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" \
      > /etc/apt/sources.list.d/google-cloud-sdk.list \
 && apt-get update \
 && apt-get install -y google-cloud-sdk \
 && apt-get clean \
 && rm -rf /var/lib/apt/lists/*
|
||||
|
||||
|
||||
|
||||
# NCCL Tests
# Builds NVIDIA/nccl-tests at a pinned commit with MPI support enabled.
ENV NCCL_TESTS_COMMITISH=c6afef0
# Code generation targets passed to nvcc: native code for sm_90 plus PTX for
# compute_90.
ENV CUDA12_GENCODE='-gencode=arch=compute_90,code=sm_90'
ENV CUDA12_PTX='-gencode=arch=compute_90,code=compute_90'
WORKDIR /opt/nccl-tests
# The archive is downloaded to a file before extraction so a failed download
# stops the build directly (the default /bin/sh has no pipefail for a
# wget | tar pipeline). `mpicc -show` only prints the MPI wrapper's underlying
# compile command into the build log. Build parallelism follows the build
# host's CPU count instead of a fixed -j20. The trailing symlink makes the
# tree reachable as /opt/nccl_tests as well as /opt/nccl-tests.
RUN wget -q -O /tmp/nccl-tests.tar.gz "https://github.com/NVIDIA/nccl-tests/archive/${NCCL_TESTS_COMMITISH}.tar.gz" \
 && tar --strip-components=1 -xzf /tmp/nccl-tests.tar.gz \
 && rm -f /tmp/nccl-tests.tar.gz \
 && mpicc -show \
 && export CXX=mpic++ \
 && make -j"$(nproc)" MPI=1 MPI_HOME=/usr/include/openmpi NVCC_GENCODE="$CUDA12_GENCODE $CUDA12_PTX" \
 && ln -s /opt/nccl-tests /opt/nccl_tests
|
||||
|
||||
# Rebuild the dynamic linker cache — presumably so the shared libraries
# installed by the earlier steps are resolvable at runtime; confirm intent.
RUN ldconfig
|
||||
|
||||
# SSH dependencies for MPI
# Client (/etc/ssh/ssh_config): set StrictHostKeyChecking to "no", set the
# port to 2222, and send known-hosts entries to /dev/null.
# Server (/etc/ssh/sshd_config): set StrictModes to "no" and the port to 2222.
# /var/run/sshd is created as well.
RUN mkdir -p /var/run/sshd \
 && sed -i \
      -e 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' \
      -e "s/[ #]\(.*Port \).*/ \12222/g" \
      /etc/ssh/ssh_config \
 && echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config \
 && sed -i \
      -e 's/#\(StrictModes \).*/\1no/g' \
      -e "s/#\(Port \).*/\12222/g" \
      /etc/ssh/sshd_config
|
||||
|
|
@ -0,0 +1,68 @@
|
|||
# MPIJob that runs the NCCL all_reduce_perf benchmark across the worker pods.
# One launcher pod drives mpirun; each worker requests 8 GPUs and exposes
# 8 MPI slots (slotsPerWorker).
apiVersion: kubeflow.org/v2beta1
kind: MPIJob
metadata:
  name: nccl-tests
spec:
  slotsPerWorker: 8
  runPolicy:
    cleanPodPolicy: Running
    activeDeadlineSeconds: 666
  mpiReplicaSpecs:
    Launcher:
      replicas: 1
      template:
        spec:
          restartPolicy: OnFailure
          containers:
            - image: mpioperator/nccl-tests:latest
              name: nccl
              securityContext:
                privileged: true
              env:
                - name: OMPI_ALLOW_RUN_AS_ROOT
                  value: "1"
                - name: OMPI_ALLOW_RUN_AS_ROOT_CONFIRM
                  value: "1"
                - name: OMPI_MCA_orte_base_help_aggregate
                  value: "0"
                # Total MPI ranks = Worker replicas (2) x slotsPerWorker (8).
                # The script below reads ${NP}; it was previously never set,
                # so the benchmark mpirun received an empty -np argument.
                # Keep this in sync when changing replicas or slotsPerWorker.
                - name: NP
                  value: "16"
              command: ["/bin/bash", "-c"]
              args:
                - |
                  set -xe
                  export NCCL_DEBUG=INFO
                  # Retry until nvidia-smi can be launched on every rank.
                  until mpirun -np ${NP} -x LD_LIBRARY_PATH -bind-to none /usr/local/nvidia/bin/nvidia-smi; do sleep 5; done
                  mpirun -np ${NP} -bind-to none \
                    -x NCCL_DEBUG \
                    /opt/nccl_tests/build/all_reduce_perf -c 0 -b 8 -e 16G \
                    -f 4 -g 1 -n 10
              resources:
                requests:
                  cpu: 50m
                  memory: 128Mi
          enableServiceLinks: false
          automountServiceAccountToken: false
    Worker:
      replicas: 2
      template:
        metadata:
          annotations:
        spec:
          volumes:
            # Memory-backed emptyDir mounted at /dev/shm in the worker container.
            - name: shared-memory
              emptyDir:
                medium: "Memory"
          containers:
            - image: mpioperator/nccl-tests:latest
              name: nccl
              securityContext:
                privileged: true
              resources:
                limits:
                  nvidia.com/gpu: 8
              volumeMounts:
                - name: shared-memory
                  mountPath: /dev/shm
          enableServiceLinks: false
          automountServiceAccountToken: false
|
||||
Loading…
Reference in New Issue