#
# NOTE: Use the Makefiles to build this image correctly.
#

ARG BASE_IMG=
FROM $BASE_IMG

# Content below is based on the scripts/Dockerfiles here:
# https://github.com/HabanaAI/Setup_and_Install/blob/1.17.1/dockerfiles/base/Dockerfile.ubuntu22.04
# https://github.com/HabanaAI/Setup_and_Install/blob/1.17.1/dockerfiles/pytorch/Dockerfile.ubuntu

# args - gaudi version
ARG GAUDI_VERSION=1.17.1
ARG GAUDI_REVISION=40

# args - software versions
# see this support matrix for compatible versions:
# https://docs.habana.ai/en/latest/Support_Matrix/Support_Matrix.html
ARG AWS_EFA_VERSION=1.29.0
ARG HCCL_OFI_WRAPPER_VERSION=1.18.0
ARG LIBFABRIC_VERSION=1.20.0
ARG PYTORCH_VERSION=2.3.1

# Gaudi 1.17 does not currently support Python 3.11, so we downgrade to 3.10
# https://docs.habana.ai/en/latest/Support_Matrix/Support_Matrix.html
ARG PYTHON_VERSION=3.10.14
RUN sed -i "s/python ==.*/python ==${PYTHON_VERSION}/" ${CONDA_DIR}/conda-meta/pinned \
 && conda install -y -q \
    python==${PYTHON_VERSION} \
 && conda clean -a -f -y

USER root

# install - support libraries
RUN export DEBIAN_FRONTEND=noninteractive \
 && apt-get update -yq \
 && apt-get install -yq --no-install-recommends \
    apt-utils \
    bc \
    build-essential \
    graphviz \
    iproute2 \
    libcairo2-dev \
    libgl1 \
    libglib2.0-dev \
    libgnutls30 \
    libgoogle-glog0v5 \
    libgoogle-perftools-dev \
    libhdf5-dev \
    libjemalloc2 \
    libjpeg-dev \
    liblapack-dev \
    libmkl-dev \
    libnuma-dev \
    libopenblas-dev \
    libpcre2-dev \
    libpq-dev \
    libselinux1-dev \
    lsof \
    moreutils \
    numactl \
    protobuf-compiler \
 && apt-get clean \
 && rm -rf /var/lib/apt/lists/*

ENV LD_PRELOAD=/lib/x86_64-linux-gnu/libtcmalloc.so.4

# install - elastic fabric adapter
# NOTE: we use a temporary GNUPGHOME to avoid polluting the user's HOME with root-owned files
RUN export GNUPGHOME=$(mktemp -d) \
 && curl -fsSL "https://efa-installer.amazonaws.com/aws-efa-installer-$AWS_EFA_VERSION.tar.gz" -o /tmp/aws-efa-installer.tar.gz \
 && curl -fsSL "https://efa-installer.amazonaws.com/aws-efa-installer-$AWS_EFA_VERSION.tar.gz.sig" -o /tmp/aws-efa-installer.tar.gz.sig \
 && curl -fsSL "https://efa-installer.amazonaws.com/aws-efa-installer.key" | gpg --import \
 && gpg --verify /tmp/aws-efa-installer.tar.gz.sig /tmp/aws-efa-installer.tar.gz \
 && tar xzf /tmp/aws-efa-installer.tar.gz -C /tmp \
 && cd /tmp/aws-efa-installer \
 && export DEBIAN_FRONTEND=noninteractive \
 && apt-get -yq update \
 && ./efa_installer.sh -y --skip-kmod --skip-limit-conf --no-verify \
 && rm -rf /etc/ld.so.conf.d/efa.conf /etc/profile.d/efa.sh \
 && apt-get clean \
 && rm -rf /var/lib/apt/lists/* \
 && rm -rf /tmp/aws-efa-installer.tar.gz /tmp/aws-efa-installer.tar.gz.sig /tmp/aws-efa-installer \
 && rm -rf "${GNUPGHOME}"

# config - fabric adapter and mpi variables
ENV MPI_ROOT=/opt/amazon/openmpi
ENV PATH=${MPI_ROOT}/bin:${PATH}
ENV MPICC=${MPI_ROOT}/bin/mpicc
ENV OPAL_PREFIX=${MPI_ROOT}
ENV RDMAV_FORK_SAFE=1
ENV FI_EFA_USE_DEVICE_RDMA=1
ENV LD_LIBRARY_PATH=${MPI_ROOT}/lib

# install - habana packages
RUN curl -fsSL "https://vault.habana.ai/artifactory/api/gpg/key/public" | gpg --dearmor -o /usr/share/keyrings/habana-artifactory.gpg \
 && chown root:root /usr/share/keyrings/habana-artifactory.gpg \
 && chmod 644 /usr/share/keyrings/habana-artifactory.gpg \
 && echo "deb [signed-by=/usr/share/keyrings/habana-artifactory.gpg] https://vault.habana.ai/artifactory/debian jammy main" | tee /etc/apt/sources.list.d/habana.list \
 && export DEBIAN_FRONTEND=noninteractive \
 && apt-get update -yq \
 && apt-get install -yq --no-install-recommends \
    habanalabs-firmware-tools="${GAUDI_VERSION}"-"${GAUDI_REVISION}" \
    habanalabs-graph="${GAUDI_VERSION}"-"${GAUDI_REVISION}" \
    habanalabs-rdma-core="${GAUDI_VERSION}"-"${GAUDI_REVISION}" \
    habanalabs-thunk="${GAUDI_VERSION}"-"${GAUDI_REVISION}" \
 && apt-get clean \
 && rm -rf /var/lib/apt/lists/*

# config - habana variables
ENV RDMA_CORE_ROOT=/opt/habanalabs/rdma-core/src
ENV RDMA_CORE_LIB=${RDMA_CORE_ROOT}/build/lib
ENV GC_KERNEL_PATH=/usr/lib/habanalabs/libtpc_kernels.so
ENV HABANA_LOGS=/var/log/habana_logs/
ENV HABANA_SCAL_BIN_PATH=/opt/habanalabs/engines_fw
ENV HABANA_PLUGINS_LIB_PATH=/opt/habanalabs/habana_plugins
ENV DATA_LOADER_AEON_LIB_PATH=/usr/lib/habanalabs/libaeon.so
ENV LD_LIBRARY_PATH=/usr/lib/habanalabs:${LD_LIBRARY_PATH}

# install - libfabric
ENV LIBFABRIC_ROOT="/opt/habanalabs/libfabric-${LIBFABRIC_VERSION}"
RUN curl -fsSL "https://github.com/ofiwg/libfabric/releases/download/v${LIBFABRIC_VERSION}/libfabric-${LIBFABRIC_VERSION}.tar.bz2" -o /tmp/libfabric-${LIBFABRIC_VERSION}.tar.bz2 \
 && tar xjf /tmp/libfabric-${LIBFABRIC_VERSION}.tar.bz2 -C /tmp \
 && cd /tmp/libfabric-${LIBFABRIC_VERSION} \
 && ./configure --prefix=${LIBFABRIC_ROOT} --enable-psm3-verbs --enable-verbs=yes --with-synapseai=/usr \
 && make -j \
 && make install \
 && cd / \
 && rm -rf /tmp/libfabric-${LIBFABRIC_VERSION}.tar.bz2 /tmp/libfabric-${LIBFABRIC_VERSION}

# config - add libfabric to loadable libraries
ENV LD_LIBRARY_PATH="/opt/habanalabs/libfabric-${LIBFABRIC_VERSION}/lib:${LD_LIBRARY_PATH}"
ENV PATH="/opt/habanalabs/libfabric-${LIBFABRIC_VERSION}/bin:${PATH}"

# install - hccl wrapper for ofi
RUN curl -fsSL "https://github.com/HabanaAI/hccl_ofi_wrapper/archive/refs/tags/v${HCCL_OFI_WRAPPER_VERSION}.tar.gz" -o /tmp/hccl_ofi_wrapper.tar.gz \
 && tar xzf /tmp/hccl_ofi_wrapper.tar.gz -C /tmp \
 && cd /tmp/hccl_ofi_wrapper-${HCCL_OFI_WRAPPER_VERSION} \
 && make \
 && cp -f libhccl_ofi_wrapper.so /usr/lib/habanalabs/libhccl_ofi_wrapper.so \
 && cd / \
 && rm -rf /tmp/hccl_ofi_wrapper.tar.gz /tmp/hccl_ofi_wrapper-${HCCL_OFI_WRAPPER_VERSION} \
 && /sbin/ldconfig

# config - add habana modules to PYTHONPATH
ENV PYTHONPATH=/usr/lib/habanalabs

USER $NB_UID

# install - habana pytorch media modules
RUN python3 -m pip install --quiet --no-cache-dir \
    habana_media_loader=="${GAUDI_VERSION}.${GAUDI_REVISION}"

# install - habana pytorch modules
RUN curl -fsSL "https://vault.habana.ai/artifactory/gaudi-pt-modules/${GAUDI_VERSION}/${GAUDI_REVISION}/pytorch/ubuntu2204/pytorch_modules-v${PYTORCH_VERSION}_${GAUDI_VERSION}_${GAUDI_REVISION}.tgz" -o /tmp/gaudi-pt-modules.tgz \
 && mkdir -p /tmp/gaudi-pt-modules \
 && tar xzf /tmp/gaudi-pt-modules.tgz -C /tmp/gaudi-pt-modules \
 && cd /tmp/gaudi-pt-modules \
 && PIP_QUIET=1 PIP_NO_CACHE_DIR=1 PYTHON_VERSION=3 bash install.sh ${GAUDI_VERSION} ${GAUDI_REVISION} \
 && rm -rf /tmp/gaudi-pt-modules.tgz /tmp/gaudi-pt-modules

# install - requirements.txt
COPY --chown=${NB_USER}:users requirements.txt /tmp
RUN python3 -m pip install -r /tmp/requirements.txt --quiet --no-cache-dir \
 && rm -f /tmp/requirements.txt

# home - pre-populate home with files for this image
COPY --chown=${NB_USER}:${NB_GID} home/. ${HOME}/

# s6 - 01-copy-tmp-home
# NOTE: the contents of $HOME_TMP are copied to $HOME at runtime
#       this is a workaround because a PVC will be mounted at $HOME
#       and the contents of $HOME will be hidden
RUN cp -p -r -T "${HOME}" "${HOME_TMP}" \
    # give group same access as user (needed for OpenShift)
 && chmod -R g=u "${HOME_TMP}"
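
# NOTE: a rough way to sanity-check a finished image, assuming it is run on a node with a
#       Gaudi device exposed to the container (the <image> tag is a placeholder; the import
#       below is the standard Habana PyTorch bridge module, not something defined here):
#
#   docker run --rm <image> python3 -c \
#     "import torch; import habana_frameworks.torch.core as htcore; print(torch.__version__)"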