MPICH support (#562)
* Add support for MPICH
* Fix CI errors
* Temporary: manual trigger
* Fix file name
* Add an empty line at the end of the file
* Fix formatting
* Revert "Temporary: manual trigger"
  This reverts commit 15164a8b70.
* Fix formatting
* Regenerate the mpi-operator.yaml
* Add an empty line at the end of Dockerfiles
* Share the same entrypoint for Intel and MPICH
* Share hostfile generation between Intel and MPICH
* Add validation test for MPICH
* Fix formatting
* Don't over-engineer the tests; be explicit
* Add non-root tests for IntelMPI and MPICH
This commit is contained in parent caa1112993, commit 21f326d1d2.
Makefile

@@ -23,6 +23,7 @@ BASE_IMAGE_SSH_PORT?=2222
 IMG_BUILDER=docker
 PLATFORMS ?= linux/amd64
 INTEL_PLATFORMS ?= linux/amd64
+MPICH_PLATFORMS ?= linux/amd64
 LD_FLAGS_V2=" \
 	-X '${REPO_PATH}/pkg/version.GitSHA=${GitSHA}' \
 	-X '${REPO_PATH}/pkg/version.Built=${Date}' \

@@ -71,6 +72,7 @@ test: bin/envtest scheduler-plugins-crd
 test_e2e: export TEST_MPI_OPERATOR_IMAGE=${IMAGE_NAME}:${RELEASE_VERSION}
 test_e2e: export TEST_OPENMPI_IMAGE=mpioperator/mpi-pi:${RELEASE_VERSION}-openmpi
 test_e2e: export TEST_INTELMPI_IMAGE=mpioperator/mpi-pi:${RELEASE_VERSION}-intel
+test_e2e: export TEST_MPICH_IMAGE=mpioperator/mpi-pi:${RELEASE_VERSION}-mpich
 test_e2e: bin/kubectl kind helm images test_images dev_manifest scheduler-plugins-chart
 	go test -v ./test/e2e/...

@@ -108,6 +110,9 @@ test_images:
 	${IMG_BUILDER} build $(BUILD_ARGS) --platform $(INTEL_PLATFORMS) --build-arg BASE_LABEL=${RELEASE_VERSION} -t mpioperator/intel:${RELEASE_VERSION} build/base -f build/base/intel.Dockerfile
 	${IMG_BUILDER} build $(BUILD_ARGS) --platform $(INTEL_PLATFORMS) -t mpioperator/intel-builder:${RELEASE_VERSION} build/base -f build/base/intel-builder.Dockerfile
 	${IMG_BUILDER} build $(BUILD_ARGS) --platform $(INTEL_PLATFORMS) --build-arg BASE_LABEL=${RELEASE_VERSION} -t mpioperator/mpi-pi:${RELEASE_VERSION}-intel examples/v2beta1/pi -f examples/v2beta1/pi/intel.Dockerfile
+	${IMG_BUILDER} build $(BUILD_ARGS) --platform $(MPICH_PLATFORMS) --build-arg BASE_LABEL=${RELEASE_VERSION} -t mpioperator/mpich:${RELEASE_VERSION} build/base -f build/base/mpich.Dockerfile
+	${IMG_BUILDER} build $(BUILD_ARGS) --platform $(MPICH_PLATFORMS) -t mpioperator/mpich-builder:${RELEASE_VERSION} build/base -f build/base/mpich-builder.Dockerfile
+	${IMG_BUILDER} build $(BUILD_ARGS) --platform $(MPICH_PLATFORMS) --build-arg BASE_LABEL=${RELEASE_VERSION} -t mpioperator/mpi-pi:${RELEASE_VERSION}-mpich examples/v2beta1/pi -f examples/v2beta1/pi/mpich.Dockerfile
 .PHONY: tidy
 tidy:
README.md

@@ -218,6 +218,12 @@ For a sample that uses Intel MPI, see:
 cat examples/pi/pi-intel.yaml
 ```
 
+For a sample that uses MPICH, see:
+
+```bash
+cat examples/pi/pi-mpich.yaml
+```
+
 ## Exposed Metrics
 
 | Metric name | Metric type | Description | Labels |
build/base/intel.Dockerfile

@@ -22,5 +22,5 @@ RUN apt update \
     intel-oneapi-mpi \
  && rm -rf /var/lib/apt/lists/*
 
-COPY intel-entrypoint.sh /entrypoint.sh
+COPY entrypoint.sh /entrypoint.sh
 ENTRYPOINT ["/entrypoint.sh"]
build/base/mpich-builder.Dockerfile (new file)

@@ -0,0 +1,7 @@
+FROM debian:bullseye as builder
+
+RUN apt update \
+ && apt install -y --no-install-recommends \
+    g++ \
+    libmpich-dev \
+ && rm -rf /var/lib/apt/lists/*
build/base/mpich.Dockerfile (new file)

@@ -0,0 +1,12 @@
+ARG BASE_LABEL
+
+FROM mpioperator/base:${BASE_LABEL}
+
+RUN apt update \
+ && apt install -y --no-install-recommends \
+    dnsutils \
+    mpich \
+ && rm -rf /var/lib/apt/lists/*
+
+COPY entrypoint.sh /entrypoint.sh
+ENTRYPOINT ["/entrypoint.sh"]
mpi-operator.yaml (regenerated CRD manifest)

@@ -58,10 +58,11 @@ spec:
           mpiImplementation:
             default: OpenMPI
             description: MPIImplementation is the MPI implementation. Options
-              are "OpenMPI" (default) and "Intel".
+              are "OpenMPI" (default), "Intel" and "MPICH".
             enum:
             - OpenMPI
             - Intel
+            - MPICH
             type: string
           mpiReplicaSpecs:
             additionalProperties:
examples/v2beta1/pi/Dockerfile

@@ -5,7 +5,6 @@ FROM mpioperator/openmpi-builder:${BASE_LABEL} as builder
 COPY pi.cc /src/pi.cc
 RUN mpic++ /src/pi.cc -o /pi
 
-
 FROM mpioperator/openmpi:${BASE_LABEL}
 
 COPY --from=builder /pi /home/mpiuser/pi
examples/v2beta1/pi/README.md

@@ -19,9 +19,15 @@ For Intel MPI:
 docker build -t mpi-pi . -f intel.Dockerfile
 ```
 
+For MPICH:
+
+```bash
+docker build -t mpi-pi . -f mpich.Dockerfile
+```
+
 ## Create MPIJob
 
-Modify `pi.yaml` (for OpenMPI) or `pi-intel.yaml` (for Intel MPI) to set up the
+Modify `pi.yaml` (for OpenMPI), `pi-intel.yaml` (for Intel MPI) or `pi-mpich.yaml` (for MPICH) to set up the
 image name from your own registry.
 
 Then, run:
examples/v2beta1/pi/mpich.Dockerfile (new file)

@@ -0,0 +1,10 @@
+ARG BASE_LABEL
+
+FROM mpioperator/mpich-builder:${BASE_LABEL} as builder
+
+COPY pi.cc /src/pi.cc
+RUN mpic++ /src/pi.cc -o /pi
+
+FROM mpioperator/mpich:${BASE_LABEL}
+
+COPY --from=builder /pi /home/mpiuser/pi
examples/v2beta1/pi/pi-mpich.yaml (new file)

@@ -0,0 +1,54 @@
+apiVersion: kubeflow.org/v2beta1
+kind: MPIJob
+metadata:
+  name: pi
+spec:
+  slotsPerWorker: 1
+  runPolicy:
+    cleanPodPolicy: Running
+  sshAuthMountPath: /home/mpiuser/.ssh
+  mpiImplementation: MPICH
+  mpiReplicaSpecs:
+    Launcher:
+      replicas: 1
+      template:
+        spec:
+          containers:
+          - image: mpioperator/mpi-pi:mpich
+            imagePullPolicy: Always
+            name: mpi-launcher
+            securityContext:
+              runAsUser: 1000
+            args:
+            - mpirun
+            - -n
+            - "2"
+            - /home/mpiuser/pi
+            resources:
+              limits:
+                cpu: 1
+                memory: 1Gi
+    Worker:
+      replicas: 2
+      template:
+        spec:
+          containers:
+          - image: mpioperator/mpi-pi:mpich
+            imagePullPolicy: Always
+            name: mpi-worker
+            securityContext:
+              runAsUser: 1000
+            command:
+            args:
+            - /usr/sbin/sshd
+            - -De
+            - -f
+            - /home/mpiuser/.sshd_config
+            readinessProbe:
+              tcpSocket:
+                port: 2222
+              initialDelaySeconds: 2
+            resources:
+              limits:
+                cpu: 1
+                memory: 1Gi
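A detail worth noting in this manifest: with `slotsPerWorker: 1` and two workers, the generated hostfile advertises at most two ranks, which is exactly what the launcher's `mpirun -n 2` asks for. A minimal sketch of that arithmetic (illustrative only, not part of the PR):

```go
package main

import "fmt"

func main() {
	// pi-mpich.yaml: slotsPerWorker: 1, Worker replicas: 2.
	slotsPerWorker, workers := 1, 2
	// Maximum MPI ranks the hostfile can satisfy; matches "mpirun -n 2".
	fmt.Println("max ranks:", slotsPerWorker*workers) // 2
}
```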
CRD manifest (second copy, regenerated)

@@ -35,10 +35,11 @@ spec:
           mpiImplementation:
             default: OpenMPI
             description: MPIImplementation is the MPI implementation. Options
-              are "OpenMPI" (default) and "Intel".
+              are "OpenMPI" (default), "Intel" and "MPICH".
             enum:
             - OpenMPI
             - Intel
+            - MPICH
             type: string
           mpiReplicaSpecs:
             additionalProperties:
pkg/apis/kubeflow/v2beta1/default_test.go

@@ -38,7 +38,7 @@ func TestSetDefaults_MPIJob(t *testing.T) {
 				},
 			},
 		},
-		"base defaults overridden": {
+		"base defaults overridden (intel)": {
 			job: MPIJob{
 				Spec: MPIJobSpec{
 					SlotsPerWorker: newInt32(10),

@@ -66,6 +66,34 @@ func TestSetDefaults_MPIJob(t *testing.T) {
 				},
 			},
 		},
+		"base defaults overridden (mpich)": {
+			job: MPIJob{
+				Spec: MPIJobSpec{
+					SlotsPerWorker: newInt32(10),
+					RunPolicy: RunPolicy{
+						CleanPodPolicy:          NewCleanPodPolicy(CleanPodPolicyRunning),
+						TTLSecondsAfterFinished: newInt32(2),
+						ActiveDeadlineSeconds:   newInt64(3),
+						BackoffLimit:            newInt32(4),
+					},
+					SSHAuthMountPath:  "/home/mpiuser/.ssh",
+					MPIImplementation: MPIImplementationMPICH,
+				},
+			},
+			want: MPIJob{
+				Spec: MPIJobSpec{
+					SlotsPerWorker: newInt32(10),
+					RunPolicy: RunPolicy{
+						CleanPodPolicy:          NewCleanPodPolicy(CleanPodPolicyRunning),
+						TTLSecondsAfterFinished: newInt32(2),
+						ActiveDeadlineSeconds:   newInt64(3),
+						BackoffLimit:            newInt32(4),
+					},
+					SSHAuthMountPath:  "/home/mpiuser/.ssh",
+					MPIImplementation: MPIImplementationMPICH,
+				},
+			},
+		},
 		"launcher defaults": {
 			job: MPIJob{
 				Spec: MPIJobSpec{
Generated OpenAPI schema (Go)

@@ -488,7 +488,7 @@ func schema_pkg_apis_kubeflow_v2beta1_MPIJobSpec(ref common.ReferenceCallback) c
 			},
 			"mpiImplementation": {
 				SchemaProps: spec.SchemaProps{
-					Description: "MPIImplementation is the MPI implementation. Options are \"OpenMPI\" (default) and \"Intel\".",
+					Description: "MPIImplementation is the MPI implementation. Options are \"OpenMPI\" (default), \"Intel\" and \"MPICH\".",
 					Type:        []string{"string"},
 					Format:      "",
 				},
 			},
swagger.json

@@ -322,7 +322,7 @@
     ],
     "properties": {
         "mpiImplementation": {
-            "description": "MPIImplementation is the MPI implementation. Options are \"OpenMPI\" (default) and \"Intel\".",
+            "description": "MPIImplementation is the MPI implementation. Options are \"OpenMPI\" (default), \"Intel\" and \"MPICH\".",
             "type": "string"
         },
         "mpiReplicaSpecs": {
pkg/apis/kubeflow/v2beta1/types.go

@@ -155,8 +155,8 @@ type MPIJobSpec struct {
 	SSHAuthMountPath string `json:"sshAuthMountPath,omitempty"`
 
 	// MPIImplementation is the MPI implementation.
-	// Options are "OpenMPI" (default) and "Intel".
-	// +kubebuilder:validation:Enum:=OpenMPI;Intel
+	// Options are "OpenMPI" (default), "Intel" and "MPICH".
+	// +kubebuilder:validation:Enum:=OpenMPI;Intel;MPICH
 	// +kubebuilder:default:=OpenMPI
 	MPIImplementation MPIImplementation `json:"mpiImplementation,omitempty"`
 }

@@ -177,6 +177,7 @@ type MPIImplementation string
 const (
 	MPIImplementationOpenMPI MPIImplementation = "OpenMPI"
 	MPIImplementationIntel   MPIImplementation = "Intel"
+	MPIImplementationMPICH   MPIImplementation = "MPICH"
 )
 
 // JobStatus represents the current observed state of the training Job.
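For Go clients, opting into MPICH is just a matter of setting the constant added above. A minimal sketch; the module import path is an assumption here, not shown in this diff:

```go
package main

import (
	"fmt"

	// Assumed import path for the v2beta1 API types.
	kubeflow "github.com/kubeflow/mpi-operator/v2/pkg/apis/kubeflow/v2beta1"
)

func main() {
	// Build a spec that selects MPICH; when the field is left unset,
	// the kubebuilder default above keeps it at OpenMPI.
	spec := kubeflow.MPIJobSpec{
		MPIImplementation: kubeflow.MPIImplementationMPICH,
	}
	fmt.Println(spec.MPIImplementation) // MPICH
}
```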
pkg/apis/kubeflow/validation/validation.go

@@ -35,7 +35,8 @@ var (
 
 	validMPIImplementations = sets.NewString(
 		string(kubeflow.MPIImplementationOpenMPI),
-		string(kubeflow.MPIImplementationIntel))
+		string(kubeflow.MPIImplementationIntel),
+		string(kubeflow.MPIImplementationMPICH))
 
 	validRestartPolicies = sets.NewString(
 		string(common.RestartPolicyNever),
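The whitelist above is a `sets.String`, so validating the field reduces to a membership check. A sketch of how such a check is typically written with the apimachinery helpers (not the repo's exact code):

```go
package validation

import (
	"k8s.io/apimachinery/pkg/util/sets"
	"k8s.io/apimachinery/pkg/util/validation/field"
)

var validMPIImplementations = sets.NewString("OpenMPI", "Intel", "MPICH")

// validateImplementation rejects any value outside the whitelist and
// reports the supported options in the returned error.
func validateImplementation(impl string, path *field.Path) field.ErrorList {
	var errs field.ErrorList
	if !validMPIImplementations.Has(impl) {
		errs = append(errs, field.NotSupported(path, impl, validMPIImplementations.List()))
	}
	return errs
}
```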
pkg/apis/kubeflow/validation/validation_test.go

@@ -31,7 +31,7 @@ func TestValidateMPIJob(t *testing.T) {
 		job      kubeflow.MPIJob
 		wantErrs field.ErrorList
 	}{
-		"valid": {
+		"valid (intel)": {
 			job: kubeflow.MPIJob{
 				ObjectMeta: metav1.ObjectMeta{
 					Name: "foo",

@@ -57,7 +57,7 @@ func TestValidateMPIJob(t *testing.T) {
 				},
 			},
 		},
-		"valid with worker": {
+		"valid with worker (intel)": {
 			job: kubeflow.MPIJob{
 				ObjectMeta: metav1.ObjectMeta{
 					Name: "foo",

@@ -92,6 +92,67 @@ func TestValidateMPIJob(t *testing.T) {
 				},
 			},
 		},
+		"valid (mpich)": {
+			job: kubeflow.MPIJob{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: "foo",
+				},
+				Spec: kubeflow.MPIJobSpec{
+					SlotsPerWorker: newInt32(2),
+					RunPolicy: kubeflow.RunPolicy{
+						CleanPodPolicy: kubeflow.NewCleanPodPolicy(kubeflow.CleanPodPolicyRunning),
+					},
+					SSHAuthMountPath:  "/home/mpiuser/.ssh",
+					MPIImplementation: kubeflow.MPIImplementationMPICH,
+					MPIReplicaSpecs: map[kubeflow.MPIReplicaType]*common.ReplicaSpec{
+						kubeflow.MPIReplicaTypeLauncher: {
+							Replicas:      newInt32(1),
+							RestartPolicy: common.RestartPolicyNever,
+							Template: corev1.PodTemplateSpec{
+								Spec: corev1.PodSpec{
+									Containers: []corev1.Container{{}},
+								},
+							},
+						},
+					},
+				},
+			},
+		},
+		"valid with worker (mpich)": {
+			job: kubeflow.MPIJob{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: "foo",
+				},
+				Spec: kubeflow.MPIJobSpec{
+					SlotsPerWorker: newInt32(2),
+					RunPolicy: kubeflow.RunPolicy{
+						CleanPodPolicy: kubeflow.NewCleanPodPolicy(kubeflow.CleanPodPolicyRunning),
+					},
+					SSHAuthMountPath:  "/home/mpiuser/.ssh",
+					MPIImplementation: kubeflow.MPIImplementationMPICH,
+					MPIReplicaSpecs: map[kubeflow.MPIReplicaType]*common.ReplicaSpec{
+						kubeflow.MPIReplicaTypeLauncher: {
+							Replicas:      newInt32(1),
+							RestartPolicy: common.RestartPolicyOnFailure,
+							Template: corev1.PodTemplateSpec{
+								Spec: corev1.PodSpec{
+									Containers: []corev1.Container{{}},
+								},
+							},
+						},
+						kubeflow.MPIReplicaTypeWorker: {
+							Replicas:      newInt32(3),
+							RestartPolicy: common.RestartPolicyNever,
+							Template: corev1.PodTemplateSpec{
+								Spec: corev1.PodSpec{
+									Containers: []corev1.Container{{}},
+								},
+							},
+						},
+					},
+				},
+			},
+		},
 		"empty job": {
 			wantErrs: field.ErrorList{
 				&field.Error{
pkg/controller/mpi_job_controller.go

@@ -202,6 +202,16 @@ var (
 			Value: "-o ConnectionAttempts=10",
 		},
 	}
+	mpichEnvVars = []corev1.EnvVar{
+		{
+			Name:  "HYDRA_HOST_FILE",
+			Value: fmt.Sprintf("%s/%s", configMountPath, hostfileName),
+		},
+		{
+			Name:  "HYDRA_LAUNCH_EXTRA_ARGS",
+			Value: "-o ConnectionAttempts=10",
+		},
+	}
 	nvidiaDisableEnvVars = []corev1.EnvVar{
 		{Name: "NVIDIA_VISIBLE_DEVICES"},
 		{Name: "NVIDIA_DRIVER_CAPABILITIES"},
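`HYDRA_HOST_FILE` points MPICH's Hydra process manager at the generated hostfile, and `HYDRA_LAUNCH_EXTRA_ARGS` forwards extra options to the SSH launcher, mirroring the Intel variables directly above. A runnable sketch of the resolved values, assuming `configMountPath` is `/etc/mpi` and `hostfileName` is `hostfile` (their definitions are not part of this diff, so those values are an assumption):

```go
package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
)

func main() {
	// Illustrative only: mpichEnvVars after fmt.Sprintf has been applied,
	// assuming configMountPath = "/etc/mpi" and hostfileName = "hostfile".
	resolved := []corev1.EnvVar{
		{Name: "HYDRA_HOST_FILE", Value: "/etc/mpi/hostfile"},
		{Name: "HYDRA_LAUNCH_EXTRA_ARGS", Value: "-o ConnectionAttempts=10"},
	}
	for _, e := range resolved {
		fmt.Printf("%s=%s\n", e.Name, e.Value)
	}
}
```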
pkg/controller/mpi_job_controller.go (continued)

@@ -603,8 +613,9 @@ func (c *MPIJobController) syncHandler(key string) error {
 			return err
 		}
 	}
-	if mpiJob.Spec.MPIImplementation == kubeflow.MPIImplementationIntel {
-		// The Intel implementation requires workers to communicate with the
+	if mpiJob.Spec.MPIImplementation == kubeflow.MPIImplementationIntel ||
+		mpiJob.Spec.MPIImplementation == kubeflow.MPIImplementationMPICH {
+		// The Intel and MPICH implementations require workers to communicate with the
 		// launcher through its hostname. For that, we create a Service which
 		// has the same name as the launcher's hostname.
 		_, err := c.getOrCreateService(mpiJob, newLauncherService(mpiJob))
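Since the same two-implementation condition now appears both here and in the controller tests further down, a small predicate could centralize it. A hypothetical refactor, not part of this PR:

```go
package controller

// Assumed import path for the v2beta1 API types.
import kubeflow "github.com/kubeflow/mpi-operator/v2/pkg/apis/kubeflow/v2beta1"

// needsLauncherService reports whether the implementation requires a
// headless Service in front of the launcher (workers dial back to the
// launcher by hostname under Intel MPI and MPICH).
func needsLauncherService(impl kubeflow.MPIImplementation) bool {
	return impl == kubeflow.MPIImplementationIntel ||
		impl == kubeflow.MPIImplementationMPICH
}
```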
pkg/controller/mpi_job_controller.go (continued)

@@ -1216,7 +1227,7 @@ func newConfigMap(mpiJob *kubeflow.MPIJob, workerReplicas int32) *corev1.ConfigM
 		switch mpiJob.Spec.MPIImplementation {
 		case kubeflow.MPIImplementationOpenMPI:
 			buffer.WriteString(fmt.Sprintf("%s%s-%d.%s.%s.svc slots=%d\n", mpiJob.Name, workerSuffix, i, workersService, mpiJob.Namespace, slots))
-		case kubeflow.MPIImplementationIntel:
+		case kubeflow.MPIImplementationIntel, kubeflow.MPIImplementationMPICH:
 			buffer.WriteString(fmt.Sprintf("%s%s-%d.%s.%s.svc:%d\n", mpiJob.Name, workerSuffix, i, workersService, mpiJob.Namespace, slots))
 		}
 	}
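The practical effect of this switch, and the reason hostfile generation can be shared: OpenMPI hostfiles use the `host slots=N` form, while Intel MPI and MPICH both accept `host:N`. A runnable sketch using the names from the `TestNewConfigMap` case added below:

```go
package main

import "fmt"

func main() {
	host := "mpich-with-slots-worker-0.mpich-with-slots-worker.project-x.svc"
	slots := 10
	// OpenMPI hostfile entry.
	fmt.Printf("%s slots=%d\n", host, slots)
	// Intel MPI / MPICH hostfile entry (what this hunk now emits for both).
	fmt.Printf("%s:%d\n", host, slots)
}
```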
pkg/controller/mpi_job_controller.go (continued)

@@ -1444,6 +1455,8 @@ func (c *MPIJobController) newLauncherPodTemplate(mpiJob *kubeflow.MPIJob) corev
 			Name:  intelMPISlotsEnv,
 			Value: slotsStr,
 		})
+	case kubeflow.MPIImplementationMPICH:
+		container.Env = append(container.Env, mpichEnvVars...)
 	}
 
 	container.Env = append(container.Env,
pkg/controller/mpi_job_controller_test.go

@@ -500,7 +500,7 @@ func TestDoNothingWithInvalidMPIJob(t *testing.T) {
 }
 
 func TestAllResourcesCreated(t *testing.T) {
-	impls := []kubeflow.MPIImplementation{kubeflow.MPIImplementationOpenMPI, kubeflow.MPIImplementationIntel}
+	impls := []kubeflow.MPIImplementation{kubeflow.MPIImplementationOpenMPI, kubeflow.MPIImplementationIntel, kubeflow.MPIImplementationMPICH}
 	for _, implementation := range impls {
 		t.Run(string(implementation), func(t *testing.T) {
 			f := newFixture(t, "")

@@ -524,7 +524,8 @@ func TestAllResourcesCreated(t *testing.T) {
 			for i := 0; i < 5; i++ {
 				f.expectCreatePodAction(fmjc.newWorker(mpiJobCopy, i))
 			}
-			if implementation == kubeflow.MPIImplementationIntel {
+			if implementation == kubeflow.MPIImplementationIntel ||
+				implementation == kubeflow.MPIImplementationMPICH {
 				f.expectCreateServiceAction(newLauncherService(mpiJobCopy))
 			}
 			f.expectCreateJobAction(fmjc.newLauncherJob(mpiJobCopy))

@@ -796,7 +797,7 @@ func TestShutdownWorker(t *testing.T) {
 }
 
 func TestCreateSuspendedMPIJob(t *testing.T) {
-	impls := []kubeflow.MPIImplementation{kubeflow.MPIImplementationOpenMPI, kubeflow.MPIImplementationIntel}
+	impls := []kubeflow.MPIImplementation{kubeflow.MPIImplementationOpenMPI, kubeflow.MPIImplementationIntel, kubeflow.MPIImplementationMPICH}
 	for _, implementation := range impls {
 		t.Run(string(implementation), func(t *testing.T) {
 			f := newFixture(t, "")

@@ -819,7 +820,8 @@ func TestCreateSuspendedMPIJob(t *testing.T) {
 				t.Fatalf("Failed creating secret")
 			}
 			f.expectCreateSecretAction(secret)
-			if implementation == kubeflow.MPIImplementationIntel {
+			if implementation == kubeflow.MPIImplementationIntel ||
+				implementation == kubeflow.MPIImplementationMPICH {
 				f.expectCreateServiceAction(newLauncherService(mpiJob))
 			}

@@ -1583,6 +1585,31 @@ func TestNewConfigMap(t *testing.T) {
 			},
 		},
 	},
+	"MPICH with slots": {
+		mpiJob: &kubeflow.MPIJob{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:      "mpich-with-slots",
+				Namespace: "project-x",
+			},
+			Spec: kubeflow.MPIJobSpec{
+				SlotsPerWorker:    pointer.Int32(10),
+				MPIImplementation: kubeflow.MPIImplementationMPICH,
+			},
+		},
+		workerReplicas: 1,
+		wantCM: &corev1.ConfigMap{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:      "mpich-with-slots-config",
+				Namespace: "project-x",
+				Labels: map[string]string{
+					"app": "mpich-with-slots",
+				},
+			},
+			Data: map[string]string{
+				"hostfile": "mpich-with-slots-worker-0.mpich-with-slots-worker.project-x.svc:10\n",
+			},
+		},
+	},
 	}
 	for name, tc := range testCases {
 		t.Run(name, func(t *testing.T) {
Design proposal (proposals/)

@@ -40,7 +40,7 @@ An MPIJob CRD describes the Job. Important fields include:
 - The launcher template, which should have a `mpirun` command.
 
 The images are expected to have the MPI implementation binaries (such as
-OpenMPI, MPICH or Intel MPI) the user’s MPI executable.
+OpenMPI, Intel MPI or MPICH) the user’s MPI executable.
 
 A controller processes the MPIJob, starting a Job with the following steps:
 1. Creates ConfigMap, which contains:

@@ -148,7 +148,7 @@ following changes:
   doesn’t support changes to the completions field. This can be supported
   starting from 1.23. In the meantime, we can replicate the behavior by
   creating a new Job and doing Pod adoption.
-- For Intel MPI, we also need a headless Service to front the launcher,
+- For Intel MPI and MPICH, we also need a headless Service to front the launcher,
   because workers communicate back to the launcher using its hostname.
 - **Revert the use of the Job API for the launcher.**
   - The Job controller handles retries when the launcher or any of the workers fail.
sdk/python/v2beta1/docs/V2beta1MPIJobSpec.md

@@ -4,7 +4,7 @@
 ## Properties
 Name | Type | Description | Notes
 ------------ | ------------- | ------------- | -------------
-**mpi_implementation** | **str** | MPIImplementation is the MPI implementation. Options are \"OpenMPI\" (default) and \"Intel\". | [optional]
+**mpi_implementation** | **str** | MPIImplementation is the MPI implementation. Options are \"OpenMPI\" (default), \"Intel\" and \"MPICH\". | [optional]
 **mpi_replica_specs** | [**dict(str, V1ReplicaSpec)**](V1ReplicaSpec.md) | MPIReplicaSpecs contains maps from `MPIReplicaType` to `ReplicaSpec` that specify the MPI replicas to run. |
 **run_policy** | [**V2beta1RunPolicy**](V2beta1RunPolicy.md) | | [optional]
 **slots_per_worker** | **int** | Specifies the number of slots per worker used in hostfile. Defaults to 1. | [optional]
V2beta1MPIJobSpec Python model (generated SDK)

@@ -75,7 +75,7 @@ class V2beta1MPIJobSpec(object):
     def mpi_implementation(self):
         """Gets the mpi_implementation of this V2beta1MPIJobSpec.  # noqa: E501
 
-        MPIImplementation is the MPI implementation. Options are \"OpenMPI\" (default) and \"Intel\".  # noqa: E501
+        MPIImplementation is the MPI implementation. Options are \"OpenMPI\" (default), \"Intel\" and \"MPICH\".  # noqa: E501
 
         :return: The mpi_implementation of this V2beta1MPIJobSpec.  # noqa: E501
         :rtype: str

@@ -86,7 +86,7 @@ class V2beta1MPIJobSpec(object):
     def mpi_implementation(self, mpi_implementation):
         """Sets the mpi_implementation of this V2beta1MPIJobSpec.
 
-        MPIImplementation is the MPI implementation. Options are \"OpenMPI\" (default) and \"Intel\".  # noqa: E501
+        MPIImplementation is the MPI implementation. Options are \"OpenMPI\" (default), \"Intel\" and \"MPICH\".  # noqa: E501
 
         :param mpi_implementation: The mpi_implementation of this V2beta1MPIJobSpec.  # noqa: E501
         :type mpi_implementation: str
test/e2e/e2e_suite_test.go

@@ -40,6 +40,7 @@ const (
 	envTestMPIOperatorImage    = "TEST_MPI_OPERATOR_IMAGE"
 	envTestOpenMPIImage        = "TEST_OPENMPI_IMAGE"
 	envTestIntelMPIImage       = "TEST_INTELMPI_IMAGE"
+	envTestMPICHImage          = "TEST_MPICH_IMAGE"
 	envTestKindImage           = "TEST_KIND_IMAGE"
 	envSchedulerPluginsVersion = "SCHEDULER_PLUGINS_VERSION"
 

@@ -47,6 +48,7 @@ const (
 	defaultKindImage     = "kindest/node:v1.25.8"
 	defaultOpenMPIImage  = "mpioperator/mpi-pi:openmpi"
 	defaultIntelMPIImage = "mpioperator/mpi-pi:intel"
+	defaultMPICHImage    = "mpioperator/mpi-pi:mpich"
 	rootPath             = "../.."
 	kubectlPath          = rootPath + "/bin/kubectl"
 	kindPath             = rootPath + "/bin/kind"

@@ -71,6 +73,7 @@ var (
 	mpiOperatorImage        string
 	openMPIImage            string
 	intelMPIImage           string
+	mpichImage              string
 	kindImage               string
 	schedulerPluginsVersion string
 

@@ -86,6 +89,7 @@ func init() {
 	mpiOperatorImage = getEnvDefault(envTestMPIOperatorImage, defaultMPIOperatorImage)
 	openMPIImage = getEnvDefault(envTestOpenMPIImage, defaultOpenMPIImage)
 	intelMPIImage = getEnvDefault(envTestIntelMPIImage, defaultIntelMPIImage)
+	mpichImage = getEnvDefault(envTestMPICHImage, defaultMPICHImage)
 	kindImage = getEnvDefault(envTestKindImage, defaultKindImage)
 	schedulerPluginsVersion = getEnvDefault(envSchedulerPluginsVersion, defaultSchedulerPluginsVersion)
 }

@@ -147,7 +151,7 @@ func bootstrapKindCluster() error {
 	if err != nil {
 		return fmt.Errorf("creating kind cluster: %w", err)
 	}
-	err = runCommand(kindPath, "load", "docker-image", mpiOperatorImage, openMPIImage, intelMPIImage)
+	err = runCommand(kindPath, "load", "docker-image", mpiOperatorImage, openMPIImage, intelMPIImage, mpichImage)
 	if err != nil {
 		return fmt.Errorf("loading container images: %w", err)
 	}
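`getEnvDefault` is used above but its body is not part of this diff; its likely shape, inferred from the call sites (an assumption), is a plain env-or-default lookup:

```go
package main

import (
	"fmt"
	"os"
)

// getEnvDefault returns the value of key if it is set, otherwise def.
// This body is an assumption; only the call sites appear in the diff.
func getEnvDefault(key, def string) string {
	if v, ok := os.LookupEnv(key); ok {
		return v
	}
	return def
}

func main() {
	fmt.Println(getEnvDefault("TEST_MPICH_IMAGE", "mpioperator/mpi-pi:mpich"))
}
```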
test/e2e/mpi_job_test.go

@@ -170,7 +170,6 @@ var _ = ginkgo.Describe("MPIJob", func() {
 	})
 
 	ginkgo.Context("with Intel Implementation", func() {
-		ginkgo.When("running as root", func() {
 		ginkgo.BeforeEach(func() {
 			mpiJob.Spec.MPIImplementation = kubeflow.MPIImplementationIntel
 			mpiJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeLauncher].Template.Spec.Containers = []corev1.Container{

@@ -209,12 +208,99 @@ var _ = ginkgo.Describe("MPIJob", func() {
 			}
 		})
 
+		ginkgo.When("running as root", func() {
 			ginkgo.It("should succeed", func() {
 				mpiJob := createJobAndWaitForCompletion(mpiJob)
 				expectConditionToBeTrue(mpiJob, kubeflow.JobSucceeded)
 			})
 		})
+
+		ginkgo.When("running as non-root", func() {
+			ginkgo.BeforeEach(func() {
+				mpiJob.Spec.SSHAuthMountPath = "/home/mpiuser/.ssh"
+
+				mpiJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeLauncher].Template.Spec.Containers[0].SecurityContext = &corev1.SecurityContext{
+					RunAsUser: newInt64(1000),
+				}
+				workerContainer := &mpiJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeWorker].Template.Spec.Containers[0]
+				workerContainer.SecurityContext = &corev1.SecurityContext{
+					RunAsUser: newInt64(1000),
+				}
+				workerContainer.Args = append(workerContainer.Args, "-f", "/home/mpiuser/.sshd_config")
+			})
+
+			ginkgo.It("should succeed", func() {
+				mpiJob := createJobAndWaitForCompletion(mpiJob)
+				expectConditionToBeTrue(mpiJob, kubeflow.JobSucceeded)
+			})
+		})
+	})
+
+	ginkgo.Context("with MPICH Implementation", func() {
+		ginkgo.BeforeEach(func() {
+			mpiJob.Spec.MPIImplementation = kubeflow.MPIImplementationMPICH
+			mpiJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeLauncher].Template.Spec.Containers = []corev1.Container{
+				{
+					Name:            "launcher",
+					Image:           mpichImage,
+					ImagePullPolicy: corev1.PullIfNotPresent, // use locally built image.
+					Command:         []string{},              // uses entrypoint.
+					Args: []string{
+						"mpirun",
+						"-n",
+						"2",
+						"/home/mpiuser/pi",
+					},
+				},
+			}
+			mpiJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeWorker].Template.Spec.Containers = []corev1.Container{
+				{
+					Name:            "worker",
+					Image:           mpichImage,
+					ImagePullPolicy: corev1.PullIfNotPresent, // use locally built image.
+					Command:         []string{},              // uses entrypoint.
+					Args: []string{
+						"/usr/sbin/sshd",
+						"-De",
+					},
+					ReadinessProbe: &corev1.Probe{
+						ProbeHandler: corev1.ProbeHandler{
+							TCPSocket: &corev1.TCPSocketAction{
+								Port: intstr.FromInt(2222),
+							},
+						},
+						InitialDelaySeconds: 3,
+					},
+				},
+			}
+		})
+
+		ginkgo.When("running as root", func() {
+			ginkgo.It("should succeed", func() {
+				mpiJob := createJobAndWaitForCompletion(mpiJob)
+				expectConditionToBeTrue(mpiJob, kubeflow.JobSucceeded)
+			})
+		})
+
+		ginkgo.When("running as non-root", func() {
+			ginkgo.BeforeEach(func() {
+				mpiJob.Spec.SSHAuthMountPath = "/home/mpiuser/.ssh"
+
+				mpiJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeLauncher].Template.Spec.Containers[0].SecurityContext = &corev1.SecurityContext{
+					RunAsUser: newInt64(1000),
+				}
+				workerContainer := &mpiJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeWorker].Template.Spec.Containers[0]
+				workerContainer.SecurityContext = &corev1.SecurityContext{
+					RunAsUser: newInt64(1000),
+				}
+				workerContainer.Args = append(workerContainer.Args, "-f", "/home/mpiuser/.sshd_config")
+			})
+
+			ginkgo.It("should succeed", func() {
+				mpiJob := createJobAndWaitForCompletion(mpiJob)
+				expectConditionToBeTrue(mpiJob, kubeflow.JobSucceeded)
+			})
+		})
 	})
 
 	ginkgo.Context("with scheduler-plugins", func() {