MPICH support (#562)

* Add support for MPICH

* Fix CI errors

* Temporary: manual trigger

* Fix file name

* Add an empty line at the end of the file

* Fix formatting

* Revert "Temporary: manual trigger"

This reverts commit 15164a8b70.

* fix formatting

* Regenerate the mpi-operator.yaml

* Adding an empty line at the end of Dockerfiles

* Share the same entrypoint for Intel and MPICH

* share hostfile generation between Intel and MPICH

* Add validation test for MPICH

* Fix formatting

* Don't over engineer the tests - be explicit

* add non-root tests for IntelMPI and MPICH
Authored by Mateusz Kubica on 2023-06-16 18:57:36 +01:00; committed by GitHub
parent caa1112993
commit 21f326d1d2
26 changed files with 382 additions and 60 deletions
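
For orientation: the user-visible change is a new `MPICH` value for `spec.mpiImplementation`, backed by `mpioperator/mpich` base images and an MPICH variant of the pi example. A minimal smoke test, assuming a cluster running this operator build and the example manifest path referenced in the README hunk further down, could look like:

```bash
# Hedged sketch; the manifest path and job name ("pi") come from the example
# added in this PR, everything else is standard kubectl.
kubectl apply -f examples/pi/pi-mpich.yaml
kubectl get mpijob pi -o jsonpath='{.spec.mpiImplementation}'   # prints: MPICH
```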

View File

@ -23,6 +23,7 @@ BASE_IMAGE_SSH_PORT?=2222
IMG_BUILDER=docker
PLATFORMS ?= linux/amd64
INTEL_PLATFORMS ?= linux/amd64
MPICH_PLATFORMS ?= linux/amd64
LD_FLAGS_V2=" \
-X '${REPO_PATH}/pkg/version.GitSHA=${GitSHA}' \
-X '${REPO_PATH}/pkg/version.Built=${Date}' \
@ -71,6 +72,7 @@ test: bin/envtest scheduler-plugins-crd
test_e2e: export TEST_MPI_OPERATOR_IMAGE=${IMAGE_NAME}:${RELEASE_VERSION}
test_e2e: export TEST_OPENMPI_IMAGE=mpioperator/mpi-pi:${RELEASE_VERSION}-openmpi
test_e2e: export TEST_INTELMPI_IMAGE=mpioperator/mpi-pi:${RELEASE_VERSION}-intel
test_e2e: export TEST_MPICH_IMAGE=mpioperator/mpi-pi:${RELEASE_VERSION}-mpich
test_e2e: bin/kubectl kind helm images test_images dev_manifest scheduler-plugins-chart
go test -v ./test/e2e/...
@ -108,6 +110,9 @@ test_images:
${IMG_BUILDER} build $(BUILD_ARGS) --platform $(INTEL_PLATFORMS) --build-arg BASE_LABEL=${RELEASE_VERSION} -t mpioperator/intel:${RELEASE_VERSION} build/base -f build/base/intel.Dockerfile
${IMG_BUILDER} build $(BUILD_ARGS) --platform $(INTEL_PLATFORMS) -t mpioperator/intel-builder:${RELEASE_VERSION} build/base -f build/base/intel-builder.Dockerfile
${IMG_BUILDER} build $(BUILD_ARGS) --platform $(INTEL_PLATFORMS) --build-arg BASE_LABEL=${RELEASE_VERSION} -t mpioperator/mpi-pi:${RELEASE_VERSION}-intel examples/v2beta1/pi -f examples/v2beta1/pi/intel.Dockerfile
${IMG_BUILDER} build $(BUILD_ARGS) --platform $(MPICH_PLATFORMS) --build-arg BASE_LABEL=${RELEASE_VERSION} -t mpioperator/mpich:${RELEASE_VERSION} build/base -f build/base/mpich.Dockerfile
${IMG_BUILDER} build $(BUILD_ARGS) --platform $(MPICH_PLATFORMS) -t mpioperator/mpich-builder:${RELEASE_VERSION} build/base -f build/base/mpich-builder.Dockerfile
${IMG_BUILDER} build $(BUILD_ARGS) --platform $(MPICH_PLATFORMS) --build-arg BASE_LABEL=${RELEASE_VERSION} -t mpioperator/mpi-pi:${RELEASE_VERSION}-mpich examples/v2beta1/pi -f examples/v2beta1/pi/mpich.Dockerfile
.PHONY: tidy
tidy:
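
The new `test_images` lines above add three MPICH images. Expanded by hand (a sketch; the real targets also pass `$(BUILD_ARGS)` and `--platform $(MPICH_PLATFORMS)`), they amount to:

```bash
# RELEASE_VERSION is whatever tag the Makefile is invoked with.
docker build --build-arg BASE_LABEL=$RELEASE_VERSION -t mpioperator/mpich:$RELEASE_VERSION build/base -f build/base/mpich.Dockerfile
docker build -t mpioperator/mpich-builder:$RELEASE_VERSION build/base -f build/base/mpich-builder.Dockerfile
docker build --build-arg BASE_LABEL=$RELEASE_VERSION -t mpioperator/mpi-pi:$RELEASE_VERSION-mpich examples/v2beta1/pi -f examples/v2beta1/pi/mpich.Dockerfile
```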

View File

@ -218,6 +218,12 @@ For a sample that uses Intel MPI, see:
cat examples/pi/pi-intel.yaml
```
For a sample that uses MPICH, see:
```bash
cat examples/pi/pi-mpich.yaml
```
## Exposed Metrics
| Metric name | Metric type | Description | Labels |

View File

@ -22,5 +22,5 @@ RUN apt update \
intel-oneapi-mpi \
&& rm -rf /var/lib/apt/lists/*
COPY intel-entrypoint.sh /entrypoint.sh
COPY entrypoint.sh /entrypoint.sh
ENTRYPOINT ["/entrypoint.sh"]

View File

@ -0,0 +1,7 @@
FROM debian:bullseye as builder
RUN apt update \
&& apt install -y --no-install-recommends \
g++ \
libmpich-dev \
&& rm -rf /var/lib/apt/lists/*
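
This builder image only provides the MPICH toolchain (`mpic++` via `libmpich-dev`); the pi example's `mpich.Dockerfile` further down uses it as the first stage of a multi-stage build. A hypothetical one-off compile against it could look like:

```bash
# The tag is an assumption; use whatever you built or pulled the builder image as.
docker run --rm -v "$PWD":/src mpioperator/mpich-builder:latest mpic++ /src/pi.cc -o /src/pi
```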

View File

@ -0,0 +1,12 @@
ARG BASE_LABEL
FROM mpioperator/base:${BASE_LABEL}
RUN apt update \
&& apt install -y --no-install-recommends \
dnsutils \
mpich \
&& rm -rf /var/lib/apt/lists/*
COPY entrypoint.sh /entrypoint.sh
ENTRYPOINT ["/entrypoint.sh"]
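
The runtime image installs `mpich` and `dnsutils` on top of the shared SSH base image and reuses the common `entrypoint.sh`. A quick local sanity check, bypassing the entrypoint (a sketch, tag assumed):

```bash
docker run --rm --entrypoint mpirun mpioperator/mpich:latest --version
```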

View File

@ -58,10 +58,11 @@ spec:
mpiImplementation:
default: OpenMPI
description: MPIImplementation is the MPI implementation. Options
are "OpenMPI" (default) and "Intel".
are "OpenMPI" (default), "Intel" and "MPICH".
enum:
- OpenMPI
- Intel
- MPICH
type: string
mpiReplicaSpecs:
additionalProperties:

View File

@ -5,7 +5,6 @@ FROM mpioperator/openmpi-builder:${BASE_LABEL} as builder
COPY pi.cc /src/pi.cc
RUN mpic++ /src/pi.cc -o /pi
FROM mpioperator/openmpi:${BASE_LABEL}
COPY --from=builder /pi /home/mpiuser/pi

View File

@ -19,9 +19,15 @@ For Intel MPI:
docker build -t mpi-pi . -f intel.Dockerfile
```
For MPICH:
```bash
docker build -t mpi-pi . -f mpich.Dockerfile
```
## Create MPIJob
Modify `pi.yaml` (for OpenMPI) or `pi-intel.yaml` (for Intel MPI) to set up the
Modify `pi.yaml` (for OpenMPI), `pi-intel.yaml` (for Intel MPI) or `pi-mpich.yaml` (for MPICH) to set up the
image name from your own registry.
Then, run:
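
The README now tells users to point `pi-mpich.yaml` at their own registry. With a hypothetical registry name, retagging the locally built image would look like:

```bash
# "registry.example.com/ns" is a placeholder for your registry/namespace.
docker tag mpi-pi registry.example.com/ns/mpi-pi:mpich
docker push registry.example.com/ns/mpi-pi:mpich
```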

View File

@ -0,0 +1,10 @@
ARG BASE_LABEL
FROM mpioperator/mpich-builder:${BASE_LABEL} as builder
COPY pi.cc /src/pi.cc
RUN mpic++ /src/pi.cc -o /pi
FROM mpioperator/mpich:${BASE_LABEL}
COPY --from=builder /pi /home/mpiuser/pi

View File

@ -0,0 +1,54 @@
apiVersion: kubeflow.org/v2beta1
kind: MPIJob
metadata:
name: pi
spec:
slotsPerWorker: 1
runPolicy:
cleanPodPolicy: Running
sshAuthMountPath: /home/mpiuser/.ssh
mpiImplementation: MPICH
mpiReplicaSpecs:
Launcher:
replicas: 1
template:
spec:
containers:
- image: mpioperator/mpi-pi:mpich
imagePullPolicy: Always
name: mpi-launcher
securityContext:
runAsUser: 1000
args:
- mpirun
- -n
- "2"
- /home/mpiuser/pi
resources:
limits:
cpu: 1
memory: 1Gi
Worker:
replicas: 2
template:
spec:
containers:
- image: mpioperator/mpi-pi:mpich
imagePullPolicy: Always
name: mpi-worker
securityContext:
runAsUser: 1000
command:
args:
- /usr/sbin/sshd
- -De
- -f
- /home/mpiuser/.sshd_config
readinessProbe:
tcpSocket:
port: 2222
initialDelaySeconds: 2
resources:
limits:
cpu: 1
memory: 1Gi
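
Once the example job above completes, the computed value of pi ends up in the launcher's log. Assuming the operator's usual `<job>-launcher` naming for the launcher Job and a `Succeeded` condition on the MPIJob (both conventions, not confirmed by this diff), it can be read with:

```bash
# Condition and Job names ("Succeeded", "pi-launcher") are assumptions here.
kubectl wait --for=condition=Succeeded mpijob/pi --timeout=10m
kubectl logs job/pi-launcher
```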

View File

@ -35,10 +35,11 @@ spec:
mpiImplementation:
default: OpenMPI
description: MPIImplementation is the MPI implementation. Options
are "OpenMPI" (default) and "Intel".
are "OpenMPI" (default), "Intel" and "MPICH".
enum:
- OpenMPI
- Intel
- MPICH
type: string
mpiReplicaSpecs:
additionalProperties:

View File

@ -38,7 +38,7 @@ func TestSetDefaults_MPIJob(t *testing.T) {
},
},
},
"base defaults overridden": {
"base defaults overridden (intel)": {
job: MPIJob{
Spec: MPIJobSpec{
SlotsPerWorker: newInt32(10),
@ -66,6 +66,34 @@ func TestSetDefaults_MPIJob(t *testing.T) {
},
},
},
"base defaults overridden (mpich)": {
job: MPIJob{
Spec: MPIJobSpec{
SlotsPerWorker: newInt32(10),
RunPolicy: RunPolicy{
CleanPodPolicy: NewCleanPodPolicy(CleanPodPolicyRunning),
TTLSecondsAfterFinished: newInt32(2),
ActiveDeadlineSeconds: newInt64(3),
BackoffLimit: newInt32(4),
},
SSHAuthMountPath: "/home/mpiuser/.ssh",
MPIImplementation: MPIImplementationMPICH,
},
},
want: MPIJob{
Spec: MPIJobSpec{
SlotsPerWorker: newInt32(10),
RunPolicy: RunPolicy{
CleanPodPolicy: NewCleanPodPolicy(CleanPodPolicyRunning),
TTLSecondsAfterFinished: newInt32(2),
ActiveDeadlineSeconds: newInt64(3),
BackoffLimit: newInt32(4),
},
SSHAuthMountPath: "/home/mpiuser/.ssh",
MPIImplementation: MPIImplementationMPICH,
},
},
},
"launcher defaults": {
job: MPIJob{
Spec: MPIJobSpec{

View File

@ -488,7 +488,7 @@ func schema_pkg_apis_kubeflow_v2beta1_MPIJobSpec(ref common.ReferenceCallback) c
},
"mpiImplementation": {
SchemaProps: spec.SchemaProps{
Description: "MPIImplementation is the MPI implementation. Options are \"OpenMPI\" (default) and \"Intel\".",
Description: "MPIImplementation is the MPI implementation. Options are \"OpenMPI\" (default), \"Intel\" and \"MPICH\".",
Type: []string{"string"},
Format: "",
},

View File

@ -322,7 +322,7 @@
],
"properties": {
"mpiImplementation": {
"description": "MPIImplementation is the MPI implementation. Options are \"OpenMPI\" (default) and \"Intel\".",
"description": "MPIImplementation is the MPI implementation. Options are \"OpenMPI\" (default), \"Intel\" and \"MPICH\".",
"type": "string"
},
"mpiReplicaSpecs": {

View File

@ -155,8 +155,8 @@ type MPIJobSpec struct {
SSHAuthMountPath string `json:"sshAuthMountPath,omitempty"`
// MPIImplementation is the MPI implementation.
// Options are "OpenMPI" (default) and "Intel".
// +kubebuilder:validation:Enum:=OpenMPI;Intel
// Options are "OpenMPI" (default), "Intel" and "MPICH".
// +kubebuilder:validation:Enum:=OpenMPI;Intel;MPICH
// +kubebuilder:default:=OpenMPI
MPIImplementation MPIImplementation `json:"mpiImplementation,omitempty"`
}
@ -177,6 +177,7 @@ type MPIImplementation string
const (
MPIImplementationOpenMPI MPIImplementation = "OpenMPI"
MPIImplementationIntel MPIImplementation = "Intel"
MPIImplementationMPICH MPIImplementation = "MPICH"
)
// JobStatus represents the current observed state of the training Job.

View File

@ -35,7 +35,8 @@ var (
validMPIImplementations = sets.NewString(
string(kubeflow.MPIImplementationOpenMPI),
string(kubeflow.MPIImplementationIntel))
string(kubeflow.MPIImplementationIntel),
string(kubeflow.MPIImplementationMPICH))
validRestartPolicies = sets.NewString(
string(common.RestartPolicyNever),

View File

@ -31,7 +31,7 @@ func TestValidateMPIJob(t *testing.T) {
job kubeflow.MPIJob
wantErrs field.ErrorList
}{
"valid": {
"valid (intel)": {
job: kubeflow.MPIJob{
ObjectMeta: metav1.ObjectMeta{
Name: "foo",
@ -57,7 +57,7 @@ func TestValidateMPIJob(t *testing.T) {
},
},
},
"valid with worker": {
"valid with worker (intel)": {
job: kubeflow.MPIJob{
ObjectMeta: metav1.ObjectMeta{
Name: "foo",
@ -92,6 +92,67 @@ func TestValidateMPIJob(t *testing.T) {
},
},
},
"valid (mpich)": {
job: kubeflow.MPIJob{
ObjectMeta: metav1.ObjectMeta{
Name: "foo",
},
Spec: kubeflow.MPIJobSpec{
SlotsPerWorker: newInt32(2),
RunPolicy: kubeflow.RunPolicy{
CleanPodPolicy: kubeflow.NewCleanPodPolicy(kubeflow.CleanPodPolicyRunning),
},
SSHAuthMountPath: "/home/mpiuser/.ssh",
MPIImplementation: kubeflow.MPIImplementationMPICH,
MPIReplicaSpecs: map[kubeflow.MPIReplicaType]*common.ReplicaSpec{
kubeflow.MPIReplicaTypeLauncher: {
Replicas: newInt32(1),
RestartPolicy: common.RestartPolicyNever,
Template: corev1.PodTemplateSpec{
Spec: corev1.PodSpec{
Containers: []corev1.Container{{}},
},
},
},
},
},
},
},
"valid with worker (mpich)": {
job: kubeflow.MPIJob{
ObjectMeta: metav1.ObjectMeta{
Name: "foo",
},
Spec: kubeflow.MPIJobSpec{
SlotsPerWorker: newInt32(2),
RunPolicy: kubeflow.RunPolicy{
CleanPodPolicy: kubeflow.NewCleanPodPolicy(kubeflow.CleanPodPolicyRunning),
},
SSHAuthMountPath: "/home/mpiuser/.ssh",
MPIImplementation: kubeflow.MPIImplementationMPICH,
MPIReplicaSpecs: map[kubeflow.MPIReplicaType]*common.ReplicaSpec{
kubeflow.MPIReplicaTypeLauncher: {
Replicas: newInt32(1),
RestartPolicy: common.RestartPolicyOnFailure,
Template: corev1.PodTemplateSpec{
Spec: corev1.PodSpec{
Containers: []corev1.Container{{}},
},
},
},
kubeflow.MPIReplicaTypeWorker: {
Replicas: newInt32(3),
RestartPolicy: common.RestartPolicyNever,
Template: corev1.PodTemplateSpec{
Spec: corev1.PodSpec{
Containers: []corev1.Container{{}},
},
},
},
},
},
},
},
"empty job": {
wantErrs: field.ErrorList{
&field.Error{

View File

@ -202,6 +202,16 @@ var (
Value: "-o ConnectionAttempts=10",
},
}
mpichEnvVars = []corev1.EnvVar{
{
Name: "HYDRA_HOST_FILE",
Value: fmt.Sprintf("%s/%s", configMountPath, hostfileName),
},
{
Name: "HYDRA_LAUNCH_EXTRA_ARGS",
Value: "-o ConnectionAttempts=10",
},
}
nvidiaDisableEnvVars = []corev1.EnvVar{
{Name: "NVIDIA_VISIBLE_DEVICES"},
{Name: "NVIDIA_DRIVER_CAPABILITIES"},
@ -603,8 +613,9 @@ func (c *MPIJobController) syncHandler(key string) error {
return err
}
}
if mpiJob.Spec.MPIImplementation == kubeflow.MPIImplementationIntel {
// The Intel implementation requires workers to communicate with the
if mpiJob.Spec.MPIImplementation == kubeflow.MPIImplementationIntel ||
mpiJob.Spec.MPIImplementation == kubeflow.MPIImplementationMPICH {
// The Intel and MPICH implementations require workers to communicate with the
// launcher through its hostname. For that, we create a Service which
// has the same name as the launcher's hostname.
_, err := c.getOrCreateService(mpiJob, newLauncherService(mpiJob))
@ -1216,7 +1227,7 @@ func newConfigMap(mpiJob *kubeflow.MPIJob, workerReplicas int32) *corev1.ConfigM
switch mpiJob.Spec.MPIImplementation {
case kubeflow.MPIImplementationOpenMPI:
buffer.WriteString(fmt.Sprintf("%s%s-%d.%s.%s.svc slots=%d\n", mpiJob.Name, workerSuffix, i, workersService, mpiJob.Namespace, slots))
case kubeflow.MPIImplementationIntel:
case kubeflow.MPIImplementationIntel, kubeflow.MPIImplementationMPICH:
buffer.WriteString(fmt.Sprintf("%s%s-%d.%s.%s.svc:%d\n", mpiJob.Name, workerSuffix, i, workersService, mpiJob.Namespace, slots))
}
}
@ -1444,6 +1455,8 @@ func (c *MPIJobController) newLauncherPodTemplate(mpiJob *kubeflow.MPIJob) corev
Name: intelMPISlotsEnv,
Value: slotsStr,
})
case kubeflow.MPIImplementationMPICH:
container.Env = append(container.Env, mpichEnvVars...)
}
container.Env = append(container.Env,
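
Two things tie the controller to MPICH's Hydra launcher: the hostfile written by `newConfigMap` reuses the Intel-style `host:slots` format, and `mpichEnvVars` points Hydra at it. The launcher container therefore ends up running roughly the equivalent of the following (a sketch; `/etc/mpi` and `hostfile` are assumed from the controller's `configMountPath` and `hostfileName` constants):

```bash
# ConnectionAttempts smooths over workers whose sshd is still coming up.
export HYDRA_HOST_FILE=/etc/mpi/hostfile
export HYDRA_LAUNCH_EXTRA_ARGS="-o ConnectionAttempts=10"
mpirun -n 2 /home/mpiuser/pi
```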

View File

@ -500,7 +500,7 @@ func TestDoNothingWithInvalidMPIJob(t *testing.T) {
}
func TestAllResourcesCreated(t *testing.T) {
impls := []kubeflow.MPIImplementation{kubeflow.MPIImplementationOpenMPI, kubeflow.MPIImplementationIntel}
impls := []kubeflow.MPIImplementation{kubeflow.MPIImplementationOpenMPI, kubeflow.MPIImplementationIntel, kubeflow.MPIImplementationMPICH}
for _, implementation := range impls {
t.Run(string(implementation), func(t *testing.T) {
f := newFixture(t, "")
@ -524,7 +524,8 @@ func TestAllResourcesCreated(t *testing.T) {
for i := 0; i < 5; i++ {
f.expectCreatePodAction(fmjc.newWorker(mpiJobCopy, i))
}
if implementation == kubeflow.MPIImplementationIntel {
if implementation == kubeflow.MPIImplementationIntel ||
implementation == kubeflow.MPIImplementationMPICH {
f.expectCreateServiceAction(newLauncherService(mpiJobCopy))
}
f.expectCreateJobAction(fmjc.newLauncherJob(mpiJobCopy))
@ -796,7 +797,7 @@ func TestShutdownWorker(t *testing.T) {
}
func TestCreateSuspendedMPIJob(t *testing.T) {
impls := []kubeflow.MPIImplementation{kubeflow.MPIImplementationOpenMPI, kubeflow.MPIImplementationIntel}
impls := []kubeflow.MPIImplementation{kubeflow.MPIImplementationOpenMPI, kubeflow.MPIImplementationIntel, kubeflow.MPIImplementationMPICH}
for _, implementation := range impls {
t.Run(string(implementation), func(t *testing.T) {
f := newFixture(t, "")
@ -819,7 +820,8 @@ func TestCreateSuspendedMPIJob(t *testing.T) {
t.Fatalf("Failed creating secret")
}
f.expectCreateSecretAction(secret)
if implementation == kubeflow.MPIImplementationIntel {
if implementation == kubeflow.MPIImplementationIntel ||
implementation == kubeflow.MPIImplementationMPICH {
f.expectCreateServiceAction(newLauncherService(mpiJob))
}
@ -1583,6 +1585,31 @@ func TestNewConfigMap(t *testing.T) {
},
},
},
"MPICH with slots": {
mpiJob: &kubeflow.MPIJob{
ObjectMeta: metav1.ObjectMeta{
Name: "mpich-with-slots",
Namespace: "project-x",
},
Spec: kubeflow.MPIJobSpec{
SlotsPerWorker: pointer.Int32(10),
MPIImplementation: kubeflow.MPIImplementationMPICH,
},
},
workerReplicas: 1,
wantCM: &corev1.ConfigMap{
ObjectMeta: metav1.ObjectMeta{
Name: "mpich-with-slots-config",
Namespace: "project-x",
Labels: map[string]string{
"app": "mpich-with-slots",
},
},
Data: map[string]string{
"hostfile": "mpich-with-slots-worker-0.mpich-with-slots-worker.project-x.svc:10\n",
},
},
},
}
for name, tc := range testCases {
t.Run(name, func(t *testing.T) {

View File

@ -40,7 +40,7 @@ An MPIJob CRD describes the Job. Important fields include:
- The launcher template, which should have a `mpirun` command.
The images are expected to have the MPI implementation binaries (such as
OpenMPI, MPICH or Intel MPI) the user's MPI executable.
OpenMPI, Intel MPI or MPICH) the user's MPI executable.
A controller processes the MPIJob, starting a Job with the following steps:
1. Creates ConfigMap, which contains:
@ -148,7 +148,7 @@ following changes:
doesn't support changes to the completions field. This can be supported
starting from 1.23. In the meantime, we can replicate the behavior by
creating a new Job and doing Pod adoption.
- For Intel MPI, we also need a headless Service to front the launcher,
- For Intel MPI and MPICH, we also need a headless Service to front the launcher,
because workers communicate back to the launcher using its hostname.
- **Revert the use of the Job API for the launcher.**
- The Job controller handles retries when the launcher or any of the workers fail.
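
Because workers call back to the launcher by hostname, the headless Service created for Intel MPI and MPICH makes the launcher resolvable cluster-wide. From inside a worker of a job named `pi` in namespace `default`, that is roughly the following (illustrative; `dnsutils` is installed in the worker image for exactly this kind of lookup):

```bash
# The Service shares the launcher's hostname, e.g. pi-launcher.
nslookup pi-launcher.default.svc
```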

View File

@ -4,7 +4,7 @@
## Properties
Name | Type | Description | Notes
------------ | ------------- | ------------- | -------------
**mpi_implementation** | **str** | MPIImplementation is the MPI implementation. Options are \&quot;OpenMPI\&quot; (default) and \&quot;Intel\&quot;. | [optional]
**mpi_implementation** | **str** | MPIImplementation is the MPI implementation. Options are \&quot;OpenMPI\&quot; (default), \&quot;Intel\&quot; and \&quot;MPICH\&quot;. | [optional]
**mpi_replica_specs** | [**dict(str, V1ReplicaSpec)**](V1ReplicaSpec.md) | MPIReplicaSpecs contains maps from &#x60;MPIReplicaType&#x60; to &#x60;ReplicaSpec&#x60; that specify the MPI replicas to run. |
**run_policy** | [**V2beta1RunPolicy**](V2beta1RunPolicy.md) | | [optional]
**slots_per_worker** | **int** | Specifies the number of slots per worker used in hostfile. Defaults to 1. | [optional]

View File

@ -75,7 +75,7 @@ class V2beta1MPIJobSpec(object):
def mpi_implementation(self):
"""Gets the mpi_implementation of this V2beta1MPIJobSpec. # noqa: E501
MPIImplementation is the MPI implementation. Options are \"OpenMPI\" (default) and \"Intel\". # noqa: E501
MPIImplementation is the MPI implementation. Options are \"OpenMPI\" (default), \"Intel\" and \"MPICH\". # noqa: E501
:return: The mpi_implementation of this V2beta1MPIJobSpec. # noqa: E501
:rtype: str
@ -86,7 +86,7 @@ class V2beta1MPIJobSpec(object):
def mpi_implementation(self, mpi_implementation):
"""Sets the mpi_implementation of this V2beta1MPIJobSpec.
MPIImplementation is the MPI implementation. Options are \"OpenMPI\" (default) and \"Intel\". # noqa: E501
MPIImplementation is the MPI implementation. Options are \"OpenMPI\" (default), \"Intel\" and \"MPICH\". # noqa: E501
:param mpi_implementation: The mpi_implementation of this V2beta1MPIJobSpec. # noqa: E501
:type mpi_implementation: str

View File

@ -40,6 +40,7 @@ const (
envTestMPIOperatorImage = "TEST_MPI_OPERATOR_IMAGE"
envTestOpenMPIImage = "TEST_OPENMPI_IMAGE"
envTestIntelMPIImage = "TEST_INTELMPI_IMAGE"
envTestMPICHImage = "TEST_MPICH_IMAGE"
envTestKindImage = "TEST_KIND_IMAGE"
envSchedulerPluginsVersion = "SCHEDULER_PLUGINS_VERSION"
@ -47,6 +48,7 @@ const (
defaultKindImage = "kindest/node:v1.25.8"
defaultOpenMPIImage = "mpioperator/mpi-pi:openmpi"
defaultIntelMPIImage = "mpioperator/mpi-pi:intel"
defaultMPICHImage = "mpioperator/mpi-pi:mpich"
rootPath = "../.."
kubectlPath = rootPath + "/bin/kubectl"
kindPath = rootPath + "/bin/kind"
@ -71,6 +73,7 @@ var (
mpiOperatorImage string
openMPIImage string
intelMPIImage string
mpichImage string
kindImage string
schedulerPluginsVersion string
@ -86,6 +89,7 @@ func init() {
mpiOperatorImage = getEnvDefault(envTestMPIOperatorImage, defaultMPIOperatorImage)
openMPIImage = getEnvDefault(envTestOpenMPIImage, defaultOpenMPIImage)
intelMPIImage = getEnvDefault(envTestIntelMPIImage, defaultIntelMPIImage)
mpichImage = getEnvDefault(envTestMPICHImage, defaultMPICHImage)
kindImage = getEnvDefault(envTestKindImage, defaultKindImage)
schedulerPluginsVersion = getEnvDefault(envSchedulerPluginsVersion, defaultSchedulerPluginsVersion)
}
@ -147,7 +151,7 @@ func bootstrapKindCluster() error {
if err != nil {
return fmt.Errorf("creating kind cluster: %w", err)
}
err = runCommand(kindPath, "load", "docker-image", mpiOperatorImage, openMPIImage, intelMPIImage)
err = runCommand(kindPath, "load", "docker-image", mpiOperatorImage, openMPIImage, intelMPIImage, mpichImage)
if err != nil {
return fmt.Errorf("loading container images: %w", err)
}

View File

@ -170,7 +170,6 @@ var _ = ginkgo.Describe("MPIJob", func() {
})
ginkgo.Context("with Intel Implementation", func() {
ginkgo.When("running as root", func() {
ginkgo.BeforeEach(func() {
mpiJob.Spec.MPIImplementation = kubeflow.MPIImplementationIntel
mpiJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeLauncher].Template.Spec.Containers = []corev1.Container{
@ -209,12 +208,99 @@ var _ = ginkgo.Describe("MPIJob", func() {
}
})
ginkgo.When("running as root", func() {
ginkgo.It("should succeed", func() {
mpiJob := createJobAndWaitForCompletion(mpiJob)
expectConditionToBeTrue(mpiJob, kubeflow.JobSucceeded)
})
})
ginkgo.When("running as non-root", func() {
ginkgo.BeforeEach(func () {
mpiJob.Spec.SSHAuthMountPath = "/home/mpiuser/.ssh"
mpiJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeLauncher].Template.Spec.Containers[0].SecurityContext = &corev1.SecurityContext{
RunAsUser: newInt64(1000),
}
workerContainer := &mpiJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeWorker].Template.Spec.Containers[0]
workerContainer.SecurityContext = &corev1.SecurityContext{
RunAsUser: newInt64(1000),
}
workerContainer.Args = append(workerContainer.Args, "-f", "/home/mpiuser/.sshd_config")
})
ginkgo.It("should succeed", func() {
mpiJob := createJobAndWaitForCompletion(mpiJob)
expectConditionToBeTrue(mpiJob, kubeflow.JobSucceeded)
})
})
})
ginkgo.Context("with MPICH Implementation", func() {
ginkgo.BeforeEach(func() {
mpiJob.Spec.MPIImplementation = kubeflow.MPIImplementationMPICH
mpiJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeLauncher].Template.Spec.Containers = []corev1.Container{
{
Name: "launcher",
Image: mpichImage,
ImagePullPolicy: corev1.PullIfNotPresent, // use locally built image.
Command: []string{}, // uses entrypoint.
Args: []string{
"mpirun",
"-n",
"2",
"/home/mpiuser/pi",
},
},
}
mpiJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeWorker].Template.Spec.Containers = []corev1.Container{
{
Name: "worker",
Image: mpichImage,
ImagePullPolicy: corev1.PullIfNotPresent, // use locally built image.
Command: []string{}, // uses entrypoint.
Args: []string{
"/usr/sbin/sshd",
"-De",
},
ReadinessProbe: &corev1.Probe{
ProbeHandler: corev1.ProbeHandler{
TCPSocket: &corev1.TCPSocketAction{
Port: intstr.FromInt(2222),
},
},
InitialDelaySeconds: 3,
},
},
}
})
ginkgo.When("running as root", func() {
ginkgo.It("should succeed", func() {
mpiJob := createJobAndWaitForCompletion(mpiJob)
expectConditionToBeTrue(mpiJob, kubeflow.JobSucceeded)
})
})
ginkgo.When("running as non-root", func() {
ginkgo.BeforeEach(func () {
mpiJob.Spec.SSHAuthMountPath = "/home/mpiuser/.ssh"
mpiJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeLauncher].Template.Spec.Containers[0].SecurityContext = &corev1.SecurityContext{
RunAsUser: newInt64(1000),
}
workerContainer := &mpiJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeWorker].Template.Spec.Containers[0]
workerContainer.SecurityContext = &corev1.SecurityContext{
RunAsUser: newInt64(1000),
}
workerContainer.Args = append(workerContainer.Args, "-f", "/home/mpiuser/.sshd_config")
})
ginkgo.It("should succeed", func() {
mpiJob := createJobAndWaitForCompletion(mpiJob)
expectConditionToBeTrue(mpiJob, kubeflow.JobSucceeded)
})
})
})
ginkgo.Context("with scheduler-plugins", func() {