mpi-operator/v2/test/integration/mpi_job_controller_test.go

// Copyright 2021 The Kubeflow Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package integration

import (
	"context"
	"fmt"
	"testing"
	"time"

	"github.com/google/go-cmp/cmp"
	"github.com/google/go-cmp/cmp/cmpopts"
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/labels"
	"k8s.io/apimachinery/pkg/util/runtime"
	"k8s.io/apimachinery/pkg/util/wait"
	kubeinformers "k8s.io/client-go/informers"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/reference"

	common "github.com/kubeflow/common/pkg/apis/common/v1"
	kubeflow "github.com/kubeflow/mpi-operator/v2/pkg/apis/kubeflow/v2beta1"
	clientset "github.com/kubeflow/mpi-operator/v2/pkg/client/clientset/versioned"
	"github.com/kubeflow/mpi-operator/v2/pkg/client/clientset/versioned/scheme"
	informers "github.com/kubeflow/mpi-operator/v2/pkg/client/informers/externalversions"
	"github.com/kubeflow/mpi-operator/v2/pkg/controller"
)

const (
	waitInterval = 100 * time.Millisecond
)
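
// TestMPIJobSuccess drives an MPIJob through the happy path: the controller
// creates the launcher and worker Pods along with their Service, ConfigMap and
// Secret, and the job status progresses from Created to Running to Succeeded
// as the Pod phases are updated.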
func TestMPIJobSuccess(t *testing.T) {
	ctx, cancel := context.WithCancel(context.Background())
	t.Cleanup(cancel)
	s := newTestSetup(ctx, t)
	startController(ctx, s.kClient, s.mpiClient)

	mpiJob := &kubeflow.MPIJob{
		ObjectMeta: metav1.ObjectMeta{
			Name:      "job",
			Namespace: s.namespace,
		},
		Spec: kubeflow.MPIJobSpec{
			SlotsPerWorker: newInt32(1),
			CleanPodPolicy: newCleanPodPolicy(common.CleanPodPolicyRunning),
			MPIReplicaSpecs: map[kubeflow.MPIReplicaType]*common.ReplicaSpec{
				kubeflow.MPIReplicaTypeLauncher: {
					Template: corev1.PodTemplateSpec{
						Spec: corev1.PodSpec{
							Containers: []corev1.Container{
								{
									Name:  "main",
									Image: "mpi-image",
								},
							},
						},
					},
				},
				kubeflow.MPIReplicaTypeWorker: {
					Replicas: newInt32(2),
					Template: corev1.PodTemplateSpec{
						Spec: corev1.PodSpec{
							Containers: []corev1.Container{
								{
									Name:  "main",
									Image: "mpi-image",
								},
							},
						},
					},
				},
			},
		},
	}
	var err error
	mpiJob, err = s.mpiClient.KubeflowV2beta1().MPIJobs(s.namespace).Create(ctx, mpiJob, metav1.CreateOptions{})
	if err != nil {
		t.Fatalf("Failed sending job to apiserver: %v", err)
	}

	s.events.expect(eventForJob(corev1.Event{
		Type:   corev1.EventTypeNormal,
		Reason: "MPIJobCreated",
	}, mpiJob))
	podsByRole := validateMPIJobDependencies(ctx, t, s.kClient, mpiJob, 2)
	validateMPIJobStatus(ctx, t, s.mpiClient, mpiJob, map[common.ReplicaType]*common.ReplicaStatus{
		common.ReplicaType(kubeflow.MPIReplicaTypeLauncher): {},
		common.ReplicaType(kubeflow.MPIReplicaTypeWorker):   {},
	})
	s.events.verify(t)

	err = updatePodsToPhase(ctx, s.kClient, podsByRole["worker"], corev1.PodRunning)
	if err != nil {
		t.Fatalf("Updating worker Pods to Running phase: %v", err)
	}
	validateMPIJobStatus(ctx, t, s.mpiClient, mpiJob, map[common.ReplicaType]*common.ReplicaStatus{
		common.ReplicaType(kubeflow.MPIReplicaTypeLauncher): {},
		common.ReplicaType(kubeflow.MPIReplicaTypeWorker): {
			Active: 2,
		},
	})

	s.events.expect(eventForJob(corev1.Event{
		Type:   corev1.EventTypeNormal,
		Reason: "MPIJobRunning",
	}, mpiJob))
	err = updatePodsToPhase(ctx, s.kClient, podsByRole["launcher"], corev1.PodRunning)
	if err != nil {
		t.Fatalf("Updating launcher Pods to Running phase: %v", err)
	}
	validateMPIJobStatus(ctx, t, s.mpiClient, mpiJob, map[common.ReplicaType]*common.ReplicaStatus{
		common.ReplicaType(kubeflow.MPIReplicaTypeLauncher): {
			Active: 1,
		},
		common.ReplicaType(kubeflow.MPIReplicaTypeWorker): {
			Active: 2,
		},
	})
	s.events.verify(t)

	s.events.expect(eventForJob(corev1.Event{
		Type:   corev1.EventTypeNormal,
		Reason: "MPIJobSucceeded",
	}, mpiJob))
	err = updatePodsToPhase(ctx, s.kClient, podsByRole["launcher"], corev1.PodSucceeded)
	if err != nil {
		t.Fatalf("Updating launcher Pods to Succeeded phase: %v", err)
	}
	validateMPIJobDependencies(ctx, t, s.kClient, mpiJob, 0)
	validateMPIJobStatus(ctx, t, s.mpiClient, mpiJob, map[common.ReplicaType]*common.ReplicaStatus{
		common.ReplicaType(kubeflow.MPIReplicaTypeLauncher): {
			Succeeded: 1,
		},
		common.ReplicaType(kubeflow.MPIReplicaTypeWorker): {},
	})
	s.events.verify(t)
}
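
// TestMPIJobFailure verifies that the MPIJob is marked failed and its worker
// Pods are cleaned up once the launcher Pod ends in the Failed phase.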
func TestMPIJobFailure(t *testing.T) {
	ctx, cancel := context.WithCancel(context.Background())
	t.Cleanup(cancel)
	s := newTestSetup(ctx, t)
	startController(ctx, s.kClient, s.mpiClient)

	mpiJob := &kubeflow.MPIJob{
		ObjectMeta: metav1.ObjectMeta{
			Name:      "job",
			Namespace: s.namespace,
		},
		Spec: kubeflow.MPIJobSpec{
			SlotsPerWorker: newInt32(1),
			CleanPodPolicy: newCleanPodPolicy(common.CleanPodPolicyRunning),
			MPIReplicaSpecs: map[kubeflow.MPIReplicaType]*common.ReplicaSpec{
				kubeflow.MPIReplicaTypeLauncher: {
					Template: corev1.PodTemplateSpec{
						Spec: corev1.PodSpec{
							Containers: []corev1.Container{
								{
									Name:  "main",
									Image: "mpi-image",
								},
							},
						},
					},
				},
				kubeflow.MPIReplicaTypeWorker: {
					Replicas: newInt32(2),
					Template: corev1.PodTemplateSpec{
						Spec: corev1.PodSpec{
							Containers: []corev1.Container{
								{
									Name:  "main",
									Image: "mpi-image",
								},
							},
						},
					},
				},
			},
		},
	}
	var err error
	mpiJob, err = s.mpiClient.KubeflowV2beta1().MPIJobs(s.namespace).Create(ctx, mpiJob, metav1.CreateOptions{})
	if err != nil {
		t.Fatalf("Failed sending job to apiserver: %v", err)
	}
	podsByRole := validateMPIJobDependencies(ctx, t, s.kClient, mpiJob, 2)

	s.events.expect(eventForJob(corev1.Event{
		Type:   corev1.EventTypeNormal,
		Reason: "MPIJobRunning",
	}, mpiJob))
	err = updatePodsToPhase(ctx, s.kClient, podsByRole["launcher"], corev1.PodRunning)
	if err != nil {
		t.Fatalf("Updating launcher Pods to Running phase: %v", err)
	}
	err = updatePodsToPhase(ctx, s.kClient, podsByRole["worker"], corev1.PodRunning)
	if err != nil {
		t.Fatalf("Updating worker Pods to Running phase: %v", err)
	}
	s.events.verify(t)

	s.events.expect(eventForJob(corev1.Event{
		Type:   corev1.EventTypeWarning,
		Reason: "MPIJobFailed",
	}, mpiJob))
	err = updatePodsToPhase(ctx, s.kClient, podsByRole["launcher"], corev1.PodFailed)
	if err != nil {
		t.Fatalf("Updating launcher Pods to Failed phase: %v", err)
	}
	validateMPIJobDependencies(ctx, t, s.kClient, mpiJob, 0)
	validateMPIJobStatus(ctx, t, s.mpiClient, mpiJob, map[common.ReplicaType]*common.ReplicaStatus{
		common.ReplicaType(kubeflow.MPIReplicaTypeLauncher): {
			Failed: 1,
		},
		common.ReplicaType(kubeflow.MPIReplicaTypeWorker): {},
	})
	s.events.verify(t)
}
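
// startController wires the MPIJob controller to the given clients and runs it
// in the background with a single worker thread until the test context is
// cancelled.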
func startController(ctx context.Context, kClient kubernetes.Interface, mpiClient clientset.Interface) {
	kubeInformerFactory := kubeinformers.NewSharedInformerFactory(kClient, 0)
	mpiInformerFactory := informers.NewSharedInformerFactory(mpiClient, 0)
	ctrl := controller.NewMPIJobController(
		kClient,
		mpiClient,
		nil,
		kubeInformerFactory.Core().V1().ConfigMaps(),
		kubeInformerFactory.Core().V1().Secrets(),
		kubeInformerFactory.Core().V1().Services(),
		kubeInformerFactory.Core().V1().Pods(),
		nil,
		mpiInformerFactory.Kubeflow().V2beta1().MPIJobs(),
		"")

	go kubeInformerFactory.Start(ctx.Done())
	go mpiInformerFactory.Start(ctx.Done())
	go ctrl.Run(1, ctx.Done())
}
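
// validateMPIJobDependencies waits until the controller has created the
// Service, ConfigMap, Secret and the expected number of launcher and worker
// Pods for the job, then checks that the Service selector matches the Pods and
// that the Secret and ConfigMap are mounted where expected. It returns the
// Pods grouped by role.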
func validateMPIJobDependencies(ctx context.Context, t *testing.T, kubeClient kubernetes.Interface, job *kubeflow.MPIJob, workers int) map[string][]corev1.Pod {
	t.Helper()
	var (
		svc        *corev1.Service
		cfgMap     *corev1.ConfigMap
		secret     *corev1.Secret
		podsByRole map[string][]corev1.Pod
	)
	var problems []string
	if err := wait.Poll(waitInterval, wait.ForeverTestTimeout, func() (bool, error) {
		problems = nil
		var err error
		svc, err = getServiceForJob(ctx, kubeClient, job)
		if err != nil {
			return false, err
		}
		if svc == nil {
			problems = append(problems, "Service not found")
		}
		cfgMap, err = getConfigMapForJob(ctx, kubeClient, job)
		if err != nil {
			return false, err
		}
		if cfgMap == nil {
			problems = append(problems, "ConfigMap not found")
		}
		secret, err = getSecretForJob(ctx, kubeClient, job)
		if err != nil {
			return false, err
		}
		if secret == nil {
			problems = append(problems, "Secret not found")
		}
		pods, err := getPodsForJob(ctx, kubeClient, job)
		if err != nil {
			return false, err
		}
		podsByRole = splitByRole(pods)
		if diff := diffCounts(podsByRole, map[string]int{
			"worker":   workers,
			"launcher": 1,
		}); diff != "" {
			problems = append(problems, fmt.Sprintf("Pods per role don't match (-want,+got):\n%s", diff))
		}
		if len(problems) == 0 {
			return true, nil
		}
		return false, nil
	}); err != nil {
		for _, p := range problems {
			t.Error(p)
		}
		t.Fatalf("Waiting for job dependencies: %v", err)
	}

	svcSelector, err := labels.ValidatedSelectorFromSet(svc.Spec.Selector)
	if err != nil {
		t.Fatalf("Invalid workers Service selector: %v", err)
	}
	for _, pods := range podsByRole {
		for _, p := range pods {
			if !svcSelector.Matches(labels.Set(p.Labels)) {
				t.Errorf("Workers Service selector doesn't match pod %s", p.Name)
			}
			found := false
			for _, v := range p.Spec.Volumes {
				if v.Secret != nil && v.Secret.SecretName == secret.Name {
					found = true
					break
				}
			}
			if !found {
				t.Errorf("Secret %s not mounted in Pod %s", secret.Name, p.Name)
			}
		}
	}

	found := false
	for _, v := range podsByRole["launcher"][0].Spec.Volumes {
		if v.ConfigMap != nil && v.ConfigMap.Name == cfgMap.Name {
			found = true
			break
		}
	}
	if !found {
		t.Errorf("ConfigMap %s not mounted in launcher Pod", cfgMap.Name)
	}
	return podsByRole
}
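
// validateMPIJobStatus polls the MPIJob until its ReplicaStatuses match want,
// failing the test with a diff if the timeout expires first.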
func validateMPIJobStatus(ctx context.Context, t *testing.T, client clientset.Interface, job *kubeflow.MPIJob, want map[common.ReplicaType]*common.ReplicaStatus) {
	t.Helper()
	var got map[common.ReplicaType]*common.ReplicaStatus
	if err := wait.Poll(waitInterval, wait.ForeverTestTimeout, func() (bool, error) {
		newJob, err := client.KubeflowV2beta1().MPIJobs(job.Namespace).Get(ctx, job.Name, metav1.GetOptions{})
		if err != nil {
			return false, err
		}
		got = newJob.Status.ReplicaStatuses
		return cmp.Equal(want, got), nil
	}); err != nil {
		diff := cmp.Diff(want, got)
		t.Fatalf("Waiting for Job status: %v\n(-want,+got)\n%s", err, diff)
	}
}
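
// updatePodsToPhase sets the given phase on each Pod through the status
// subresource, simulating the phase transitions normally reported by kubelets.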
func updatePodsToPhase(ctx context.Context, client kubernetes.Interface, pods []corev1.Pod, phase corev1.PodPhase) error {
	for i, p := range pods {
		p.Status.Phase = phase
		newPod, err := client.CoreV1().Pods(p.Namespace).UpdateStatus(ctx, &p, metav1.UpdateOptions{})
		if err != nil {
			return err
		}
		pods[i] = *newPod
	}
	return nil
}
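
// splitByRole groups Pods by the value of their "mpi-job-role" label.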
func splitByRole(pods []corev1.Pod) map[string][]corev1.Pod {
	got := make(map[string][]corev1.Pod)
	for _, p := range pods {
		role := ""
		if p.Labels != nil {
			role = p.Labels["mpi-job-role"]
		}
		got[role] = append(got[role], p)
	}
	return got
}
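
// diffCounts compares the number of Pods per role against want, ignoring roles
// with a zero count, and returns a cmp diff (empty string when they match).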
func diffCounts(gotMap map[string][]corev1.Pod, want map[string]int) string {
	got := make(map[string]int)
	for k, v := range gotMap {
		got[k] = len(v)
	}
	return cmp.Diff(want, got, cmpopts.IgnoreMapEntries(func(k string, v int) bool {
		return v == 0
	}))
}
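
// getServiceForJob returns the Service owned by the MPIJob, or nil if none
// exists yet. getConfigMapForJob and getSecretForJob below follow the same
// pattern for their resource types.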
func getServiceForJob(ctx context.Context, client kubernetes.Interface, job *kubeflow.MPIJob) (*corev1.Service, error) {
	result, err := client.CoreV1().Services(job.Namespace).List(ctx, metav1.ListOptions{})
	if err != nil {
		return nil, err
	}
	for _, obj := range result.Items {
		if metav1.IsControlledBy(&obj, job) {
			return &obj, nil
		}
	}
	return nil, nil
}

func getConfigMapForJob(ctx context.Context, client kubernetes.Interface, job *kubeflow.MPIJob) (*corev1.ConfigMap, error) {
	result, err := client.CoreV1().ConfigMaps(job.Namespace).List(ctx, metav1.ListOptions{})
	if err != nil {
		return nil, err
	}
	for _, obj := range result.Items {
		if metav1.IsControlledBy(&obj, job) {
			return &obj, nil
		}
	}
	return nil, nil
}

func getSecretForJob(ctx context.Context, client kubernetes.Interface, job *kubeflow.MPIJob) (*corev1.Secret, error) {
	result, err := client.CoreV1().Secrets(job.Namespace).List(ctx, metav1.ListOptions{})
	if err != nil {
		return nil, err
	}
	for _, obj := range result.Items {
		if metav1.IsControlledBy(&obj, job) {
			return &obj, nil
		}
	}
	return nil, nil
}
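
// getPodsForJob lists the Pods owned by the MPIJob that are not being deleted.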
func getPodsForJob(ctx context.Context, client kubernetes.Interface, job *kubeflow.MPIJob) ([]corev1.Pod, error) {
	result, err := client.CoreV1().Pods(job.Namespace).List(ctx, metav1.ListOptions{})
	if err != nil {
		return nil, err
	}
	var pods []corev1.Pod
	for _, p := range result.Items {
		if p.DeletionTimestamp == nil && metav1.IsControlledBy(&p, job) {
			pods = append(pods, p)
		}
	}
	return pods, nil
}

func newInt32(v int32) *int32 {
	return &v
}

func newCleanPodPolicy(policy common.CleanPodPolicy) *common.CleanPodPolicy {
	return &policy
}
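
// eventForJob fills in the namespace, source component and involved object
// reference so the expected Event matches what the controller records for the
// given job.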
func eventForJob(event corev1.Event, job *kubeflow.MPIJob) corev1.Event {
	event.Namespace = job.Namespace
	event.Source.Component = "mpi-job-controller"
	ref, err := reference.GetReference(scheme.Scheme, job)
	runtime.Must(err)
	event.InvolvedObject = *ref
	return event
}