mpi-operator/v2/pkg/controller/mpi_job_controller_test.go

// Copyright 2018 The Kubeflow Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package controller

import (
"fmt"
"reflect"
"testing"
"time"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/util/diff"
kubeinformers "k8s.io/client-go/informers"
k8sfake "k8s.io/client-go/kubernetes/fake"
core "k8s.io/client-go/testing"
"k8s.io/client-go/tools/cache"
"k8s.io/client-go/tools/record"
podgroupv1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
volcanofake "volcano.sh/apis/pkg/client/clientset/versioned/fake"
volcanoinformers "volcano.sh/apis/pkg/client/informers/externalversions"
common "github.com/kubeflow/common/pkg/apis/common/v1"
kubeflow "github.com/kubeflow/mpi-operator/v2/pkg/apis/kubeflow/v2beta1"
"github.com/kubeflow/mpi-operator/v2/pkg/client/clientset/versioned/fake"
"github.com/kubeflow/mpi-operator/v2/pkg/client/clientset/versioned/scheme"
informers "github.com/kubeflow/mpi-operator/v2/pkg/client/informers/externalversions"
"k8s.io/apimachinery/pkg/api/resource"
)
var (
alwaysReady = func() bool { return true }
noResyncPeriodFunc = func() time.Duration { return 0 }
)
const (
gpuResourceName = "nvidia.com/gpu"
extendedGPUResourceName = "vendor-domain/gpu"
)
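
// fixture holds the fake clientsets, the objects preloaded into the informer
// caches and object trackers, and the actions expected against the Kubernetes
// and MPIJob clients during a single controller sync.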
type fixture struct {
t *testing.T
client *fake.Clientset
kubeClient *k8sfake.Clientset
volcanoClient *volcanofake.Clientset
// Objects to put in the store.
configMapLister []*corev1.ConfigMap
serviceLister []*corev1.Service
secretLister []*corev1.Secret
podGroupLister []*podgroupv1beta1.PodGroup
podLister []*corev1.Pod
mpiJobLister []*kubeflow.MPIJob
// Actions expected to happen on the client.
kubeActions []core.Action
actions []core.Action
// Objects from here are pre-loaded into NewSimpleFake.
kubeObjects []runtime.Object
objects []runtime.Object
}
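
// newFixture returns an empty fixture bound to the given testing.T.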
func newFixture(t *testing.T) *fixture {
f := &fixture{}
f.t = t
f.objects = []runtime.Object{}
f.kubeObjects = []runtime.Object{}
return f
}
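
// newMPIJobCommon returns a minimal MPIJob with launcher and worker replica
// specs and the given start/completion timestamps (either may be nil).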
func newMPIJobCommon(name string, startTime, completionTime *metav1.Time) *kubeflow.MPIJob {
cleanPodPolicyAll := common.CleanPodPolicyAll
mpiJob := &kubeflow.MPIJob{
TypeMeta: metav1.TypeMeta{APIVersion: kubeflow.SchemeGroupVersion.String()},
ObjectMeta: metav1.ObjectMeta{
Name: name,
Namespace: metav1.NamespaceDefault,
},
Spec: kubeflow.MPIJobSpec{
CleanPodPolicy: &cleanPodPolicyAll,
MPIReplicaSpecs: map[kubeflow.MPIReplicaType]*common.ReplicaSpec{
kubeflow.MPIReplicaTypeWorker: {
Template: corev1.PodTemplateSpec{
Spec: corev1.PodSpec{
Containers: []corev1.Container{
{
Name: "foo",
Image: "bar",
},
},
},
},
},
kubeflow.MPIReplicaTypeLauncher: {
Template: corev1.PodTemplateSpec{
Spec: corev1.PodSpec{
Containers: []corev1.Container{
{
Name: "foo",
Image: "bar",
},
},
},
},
},
},
},
Status: common.JobStatus{},
}
if startTime != nil {
mpiJob.Status.StartTime = startTime
}
if completionTime != nil {
mpiJob.Status.CompletionTime = completionTime
}
return mpiJob
}
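
// newMPIJob builds on newMPIJobCommon, setting the worker replica count and a
// resource limit (for example GPUs) on every worker container.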
func newMPIJob(name string, replicas *int32, pusPerReplica int64, resourceName string, startTime, completionTime *metav1.Time) *kubeflow.MPIJob {
mpiJob := newMPIJobCommon(name, startTime, completionTime)
mpiJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeWorker].Replicas = replicas
workerContainers := mpiJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeWorker].Template.Spec.Containers
for i := range workerContainers {
container := &workerContainers[i]
container.Resources = corev1.ResourceRequirements{
Limits: corev1.ResourceList{
corev1.ResourceName(resourceName): *resource.NewQuantity(pusPerReplica, resource.DecimalExponent),
},
}
}
return mpiJob
}
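
// newMPIJobWithLauncher additionally sets a single launcher replica and
// applies the same resource limit to the launcher containers.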
func newMPIJobWithLauncher(name string, replicas *int32, pusPerReplica int64, resourceName string, startTime, completionTime *metav1.Time) *kubeflow.MPIJob {
mpiJob := newMPIJob(name, replicas, pusPerReplica, resourceName, startTime, completionTime)
mpiJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeLauncher].Replicas = int32Ptr(1)
launcherContainers := mpiJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeLauncher].Template.Spec.Containers
for i := range launcherContainers {
container := &launcherContainers[i]
container.Resources = corev1.ResourceRequirements{
Limits: corev1.ResourceList{
corev1.ResourceName(resourceName): *resource.NewQuantity(pusPerReplica, resource.DecimalExponent),
},
}
}
return mpiJob
}
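
// newController wires an MPIJobController to fake clientsets, marks all
// informer caches as already synced, and seeds the informer indexers with the
// objects registered in the fixture's listers.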
func (f *fixture) newController(gangSchedulerName string) (*MPIJobController, informers.SharedInformerFactory, kubeinformers.SharedInformerFactory) {
f.client = fake.NewSimpleClientset(f.objects...)
f.kubeClient = k8sfake.NewSimpleClientset(f.kubeObjects...)
f.volcanoClient = volcanofake.NewSimpleClientset()
i := informers.NewSharedInformerFactory(f.client, noResyncPeriodFunc())
k8sI := kubeinformers.NewSharedInformerFactory(f.kubeClient, noResyncPeriodFunc())
volcanoInformerFactory := volcanoinformers.NewSharedInformerFactory(f.volcanoClient, 0)
podgroupsInformer := volcanoInformerFactory.Scheduling().V1beta1().PodGroups()
c := NewMPIJobController(
f.kubeClient,
f.client,
f.volcanoClient,
k8sI.Core().V1().ConfigMaps(),
k8sI.Core().V1().Secrets(),
k8sI.Core().V1().Services(),
k8sI.Core().V1().Pods(),
podgroupsInformer,
i.Kubeflow().V2beta1().MPIJobs(),
gangSchedulerName,
)
c.configMapSynced = alwaysReady
c.serviceSynced = alwaysReady
c.secretSynced = alwaysReady
c.podSynced = alwaysReady
c.podgroupsSynced = alwaysReady
c.mpiJobSynced = alwaysReady
c.recorder = &record.FakeRecorder{}
for _, configMap := range f.configMapLister {
err := k8sI.Core().V1().ConfigMaps().Informer().GetIndexer().Add(configMap)
if err != nil {
fmt.Println("Failed to create config map")
}
}
for _, service := range f.serviceLister {
err := k8sI.Core().V1().Services().Informer().GetIndexer().Add(service)
if err != nil {
fmt.Println("Failed to create service account")
}
}
for _, secret := range f.secretLister {
err := k8sI.Core().V1().Secrets().Informer().GetIndexer().Add(secret)
if err != nil {
fmt.Println("Failed to create role")
}
}
for _, pod := range f.podLister {
err := k8sI.Core().V1().Pods().Informer().GetIndexer().Add(pod)
if err != nil {
fmt.Println("Failed to create pod")
}
}
for _, podGroup := range f.podGroupLister {
err := podgroupsInformer.Informer().GetIndexer().Add(podGroup)
if err != nil {
fmt.Println("Failed to create pod group")
}
}
for _, mpiJob := range f.mpiJobLister {
err := i.Kubeflow().V2beta1().MPIJobs().Informer().GetIndexer().Add(mpiJob)
if err != nil {
fmt.Println("Failed to create mpijob")
}
}
return c, i, k8sI
}
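
// run syncs the MPIJob identified by mpiJobName and compares the resulting
// client actions against the fixture's expectations.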
func (f *fixture) run(mpiJobName string) {
f.runController(mpiJobName, true, false, "")
}
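
// runExpectError is like run but requires syncHandler to return an error.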
func (f *fixture) runExpectError(mpiJobName string) {
f.runController(mpiJobName, true, true, "")
}
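
// runController performs a single syncHandler call and verifies that the
// filtered actions recorded by the fake clients match f.actions and
// f.kubeActions, in order.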
func (f *fixture) runController(mpiJobName string, startInformers, expectError bool, gangSchedulerName string) {
c, i, k8sI := f.newController(gangSchedulerName)
if startInformers {
stopCh := make(chan struct{})
defer close(stopCh)
i.Start(stopCh)
k8sI.Start(stopCh)
}
err := c.syncHandler(mpiJobName)
if !expectError && err != nil {
f.t.Errorf("error syncing mpi job: %v", err)
} else if expectError && err == nil {
f.t.Error("expected error syncing mpi job, got nil")
}
actions := filterInformerActions(f.client.Actions())
for i, action := range actions {
if len(f.actions) < i+1 {
f.t.Errorf("%d unexpected actions: %+v", len(actions)-len(f.actions), actions[i:])
break
}
expectedAction := f.actions[i]
checkAction(expectedAction, action, f.t)
}
if len(f.actions) > len(actions) {
f.t.Errorf("%d additional expected actions:%+v", len(f.actions)-len(actions), f.actions[len(actions):])
}
k8sActions := filterInformerActions(f.kubeClient.Actions())
for i, action := range k8sActions {
if len(f.kubeActions) < i+1 {
f.t.Errorf("%d unexpected actions: %+v", len(k8sActions)-len(f.kubeActions), k8sActions[i:])
break
}
expectedAction := f.kubeActions[i]
checkAction(expectedAction, action, f.t)
}
if len(f.kubeActions) > len(k8sActions) {
f.t.Errorf("%d additional expected actions:%+v", len(f.kubeActions)-len(k8sActions), f.kubeActions[len(k8sActions):])
}
}
// checkAction verifies that the expected and actual actions are equal and
// that both have the same attached resources.
func checkAction(expected, actual core.Action, t *testing.T) {
if !(expected.Matches(actual.GetVerb(), actual.GetResource().Resource) && actual.GetSubresource() == expected.GetSubresource()) {
t.Errorf("Expected\n\t%#v\ngot\n\t%#v", expected, actual)
return
}
if reflect.TypeOf(actual) != reflect.TypeOf(expected) {
t.Errorf("Action has wrong type. Expected: %t. Got: %t", expected, actual)
return
}
//nolint
switch a := actual.(type) {
case core.UpdateAction:
e, _ := expected.(core.UpdateAction)
expObject := e.GetObject()
object := a.GetObject()
expMPIJob, ok1 := expObject.(*kubeflow.MPIJob)
gotMPIJob, ok2 := object.(*kubeflow.MPIJob)
if ok1 && ok2 {
clearConditionTime(expMPIJob)
clearConditionTime(gotMPIJob)
if !reflect.DeepEqual(expMPIJob, gotMPIJob) {
t.Errorf("Action %s %s has wrong object\nDiff:\n %s",
a.GetVerb(), a.GetResource().Resource, diff.ObjectGoPrintDiff(expObject, object))
}
return
}
if !reflect.DeepEqual(expObject, object) {
t.Errorf("Action %s %s has wrong object\nDiff:\n %s",
a.GetVerb(), a.GetResource().Resource, diff.ObjectGoPrintDiff(expObject, object))
}
case core.CreateAction:
e, _ := expected.(core.CreateAction)
expObject := e.GetObject()
object := a.GetObject()
if !reflect.DeepEqual(expObject, object) {
t.Errorf("Action %s %s has wrong object\nDiff:\n %s",
a.GetVerb(), a.GetResource().Resource, diff.ObjectGoPrintDiff(expObject, object))
}
case core.PatchAction:
e, _ := expected.(core.PatchAction)
expPatch := e.GetPatch()
patch := a.GetPatch()
if !reflect.DeepEqual(expPatch, patch) {
t.Errorf("Action %s %s has wrong patch\nDiff:\n %s",
a.GetVerb(), a.GetResource().Resource, diff.ObjectGoPrintDiff(expPatch, patch))
}
}
}
// filterInformerActions filters list and watch actions for testing resources.
// Since list and watch don't change resource state, we can filter them out to
// lower the noise level in our tests.
func filterInformerActions(actions []core.Action) []core.Action {
var ret []core.Action
for _, action := range actions {
if len(action.GetNamespace()) == 0 &&
(action.Matches("list", "configmaps") ||
action.Matches("watch", "configmaps") ||
action.Matches("list", "services") ||
action.Matches("watch", "services") ||
action.Matches("list", "secrets") ||
action.Matches("watch", "secrets") ||
action.Matches("list", "pods") ||
action.Matches("watch", "pods") ||
action.Matches("list", "podgroups") ||
action.Matches("watch", "podgroups") ||
action.Matches("list", "mpijobs") ||
action.Matches("watch", "mpijobs")) {
continue
}
ret = append(ret, action)
}
return ret
}
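
// expectCreateJobAction records an expected create action for the given pod.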
func (f *fixture) expectCreateJobAction(d *corev1.Pod) {
f.kubeActions = append(f.kubeActions, core.NewCreateAction(schema.GroupVersionResource{Resource: "pods"}, d.Namespace, d))
}
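
// expectUpdateMPIJobStatusAction records an expected update of the MPIJob
// status subresource.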
func (f *fixture) expectUpdateMPIJobStatusAction(mpiJob *kubeflow.MPIJob) {
action := core.NewUpdateAction(schema.GroupVersionResource{Resource: "mpijobs"}, mpiJob.Namespace, mpiJob)
action.Subresource = "status"
f.actions = append(f.actions, action)
}
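
// The setUp* helpers below register an object both in the corresponding
// informer lister and in the fake clientset's object tracker.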
func (f *fixture) setUpMPIJob(mpiJob *kubeflow.MPIJob) {
f.mpiJobLister = append(f.mpiJobLister, mpiJob)
f.objects = append(f.objects, mpiJob)
}
func (f *fixture) setUpLauncher(launcher *corev1.Pod) {
f.podLister = append(f.podLister, launcher)
f.kubeObjects = append(f.kubeObjects, launcher)
}
func (f *fixture) setUpWorker(worker *corev1.Pod) {
f.podLister = append(f.podLister, worker)
f.kubeObjects = append(f.kubeObjects, worker)
}
func (f *fixture) setUpConfigMap(configMap *corev1.ConfigMap) {
f.configMapLister = append(f.configMapLister, configMap)
f.kubeObjects = append(f.kubeObjects, configMap)
}
func (f *fixture) setUpService(service *corev1.Service) {
f.serviceLister = append(f.serviceLister, service)
f.kubeObjects = append(f.kubeObjects, service)
}
func (f *fixture) setUpSecret(secret *corev1.Secret) {
f.secretLister = append(f.secretLister, secret)
f.kubeObjects = append(f.kubeObjects, secret)
}
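
// setUpMPIJobTimestamp copies the given start and completion times into the
// MPIJob status, skipping nil values.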
func setUpMPIJobTimestamp(mpiJob *kubeflow.MPIJob, startTime, completionTime *metav1.Time) {
if startTime != nil {
mpiJob.Status.StartTime = startTime
}
if completionTime != nil {
mpiJob.Status.CompletionTime = completionTime
}
}
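
// clearConditionTime zeroes the transition and update timestamps of all job
// conditions so that comparisons ignore wall-clock differences.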
func clearConditionTime(mpiJob *kubeflow.MPIJob) {
var clearConditions []common.JobCondition
for _, condition := range mpiJob.Status.Conditions {
condition.LastTransitionTime = metav1.Time{}
condition.LastUpdateTime = metav1.Time{}
clearConditions = append(clearConditions, condition)
}
mpiJob.Status.Conditions = clearConditions
}
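
// getKey returns the namespace/name cache key for the given MPIJob.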
func getKey(mpiJob *kubeflow.MPIJob, t *testing.T) string {
key, err := cache.DeletionHandlingMetaNamespaceKeyFunc(mpiJob)
if err != nil {
t.Errorf("Unexpected error getting key for mpi job %v: %v", mpiJob.Name, err)
return ""
}
return key
}
func TestDoNothingWithInvalidKey(t *testing.T) {
f := newFixture(t)
f.run("foo/bar/baz")
}
func TestDoNothingWithNonexistentMPIJob(t *testing.T) {
f := newFixture(t)
startTime := metav1.Now()
completionTime := metav1.Now()
mpiJob := newMPIJob("test", int32Ptr(64), 1, gpuResourceName, &startTime, &completionTime)
f.run(getKey(mpiJob, t))
}
func TestLauncherNotControlledByUs(t *testing.T) {
f := newFixture(t)
startTime := metav1.Now()
completionTime := metav1.Now()
mpiJob := newMPIJob("test", int32Ptr(64), 1, gpuResourceName, &startTime, &completionTime)
f.setUpMPIJob(mpiJob)
fmjc := f.newFakeMPIJobController()
launcher := fmjc.newLauncher(mpiJob, isGPULauncher(mpiJob))
launcher.OwnerReferences = nil
f.setUpLauncher(launcher)
f.runExpectError(getKey(mpiJob, t))
}
func TestIsGPULauncher(t *testing.T) {
f := newFixture(t)
startTime := metav1.Now()
completionTime := metav1.Now()
testCases := map[string]struct {
gpu string
expected bool
}{
"isNvidiaGPU": {
gpu: gpuResourceName,
expected: true,
},
"isExtendedGPU": {
gpu: extendedGPUResourceName,
expected: true,
},
"notGPU": {
gpu: "vendor-domain/resourcetype",
expected: false,
},
}
for testName, testCase := range testCases {
mpiJob := newMPIJobWithLauncher("test", int32Ptr(64), 1, testCase.gpu, &startTime, &completionTime)
f.setUpMPIJob(mpiJob)
if result := isGPULauncher(mpiJob); result != testCase.expected {
t.Errorf("%s expected: %v, actual: %v, gpu=%v", testName, testCase.expected, result, testCase.gpu)
}
}
}
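
// TestLauncherSucceeded verifies that a succeeded launcher pod results in a
// status update marking the MPIJob as succeeded.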
func TestLauncherSucceeded(t *testing.T) {
f := newFixture(t)
startTime := metav1.Now()
completionTime := metav1.Now()
mpiJob := newMPIJob("test", int32Ptr(64), 1, gpuResourceName, &startTime, &completionTime)
f.setUpMPIJob(mpiJob)
fmjc := f.newFakeMPIJobController()
launcher := fmjc.newLauncher(mpiJob, isGPULauncher(mpiJob))
launcher.Status.Phase = corev1.PodSucceeded
f.setUpLauncher(launcher)
mpiJobCopy := mpiJob.DeepCopy()
scheme.Scheme.Default(mpiJobCopy)
mpiJobCopy.Status.ReplicaStatuses = map[common.ReplicaType]*common.ReplicaStatus{
common.ReplicaType(kubeflow.MPIReplicaTypeLauncher): {
Active: 0,
Succeeded: 1,
Failed: 0,
},
common.ReplicaType(kubeflow.MPIReplicaTypeWorker): {},
}
setUpMPIJobTimestamp(mpiJobCopy, &startTime, &completionTime)
msg := fmt.Sprintf("MPIJob %s/%s successfully completed.", mpiJob.Namespace, mpiJob.Name)
err := updateMPIJobConditions(mpiJobCopy, common.JobSucceeded, mpiJobSucceededReason, msg)
if err != nil {
t.Errorf("Failed to update MPIJob conditions")
}
f.expectUpdateMPIJobStatusAction(mpiJobCopy)
f.run(getKey(mpiJob, t))
}
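
// TestLauncherFailed verifies that a failed launcher pod results in a status
// update marking the MPIJob as failed.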
func TestLauncherFailed(t *testing.T) {
f := newFixture(t)
startTime := metav1.Now()
completionTime := metav1.Now()
mpiJob := newMPIJob("test", int32Ptr(64), 1, gpuResourceName, &startTime, &completionTime)
f.setUpMPIJob(mpiJob)
fmjc := f.newFakeMPIJobController()
launcher := fmjc.newLauncher(mpiJob, isGPULauncher(mpiJob))
launcher.Status.Phase = corev1.PodFailed
f.setUpLauncher(launcher)
mpiJobCopy := mpiJob.DeepCopy()
scheme.Scheme.Default(mpiJobCopy)
mpiJobCopy.Status.ReplicaStatuses = map[common.ReplicaType]*common.ReplicaStatus{
common.ReplicaType(kubeflow.MPIReplicaTypeLauncher): {
Active: 0,
Succeeded: 0,
Failed: 1,
},
common.ReplicaType(kubeflow.MPIReplicaTypeWorker): {},
}
setUpMPIJobTimestamp(mpiJobCopy, &startTime, &completionTime)
msg := fmt.Sprintf("MPIJob %s/%s has failed", mpiJob.Namespace, mpiJob.Name)
err := updateMPIJobConditions(mpiJobCopy, common.JobFailed, mpiJobFailedReason, msg)
if err != nil {
t.Errorf("Failed to update MPIJob conditions")
}
f.expectUpdateMPIJobStatusAction(mpiJobCopy)
f.run(getKey(mpiJob, t))
}
func TestConfigMapNotControlledByUs(t *testing.T) {
f := newFixture(t)
startTime := metav1.Now()
completionTime := metav1.Now()
var replicas int32 = 64
mpiJob := newMPIJob("test", &replicas, 1, gpuResourceName, &startTime, &completionTime)
f.setUpMPIJob(mpiJob)
f.setUpService(newWorkersService(mpiJob))
configMap := newConfigMap(mpiJob, replicas, isGPULauncher(mpiJob))
updateDiscoverHostsInConfigMap(configMap, mpiJob, nil, isGPULauncher(mpiJob))
configMap.OwnerReferences = nil
f.setUpConfigMap(configMap)
f.runExpectError(getKey(mpiJob, t))
}
func TestServiceNotControlledByUs(t *testing.T) {
f := newFixture(t)
startTime := metav1.Now()
completionTime := metav1.Now()
var replicas int32 = 64
mpiJob := newMPIJob("test", &replicas, 1, gpuResourceName, &startTime, &completionTime)
f.setUpMPIJob(mpiJob)
service := newWorkersService(mpiJob)
service.OwnerReferences = nil
f.setUpService(service)
f.runExpectError(getKey(mpiJob, t))
}
func TestSecretNotControlledByUs(t *testing.T) {
f := newFixture(t)
startTime := metav1.Now()
completionTime := metav1.Now()
var replicas int32 = 64
mpiJob := newMPIJob("test", &replicas, 1, gpuResourceName, &startTime, &completionTime)
f.setUpMPIJob(mpiJob)
configMap := newConfigMap(mpiJob, replicas, isGPULauncher(mpiJob))
updateDiscoverHostsInConfigMap(configMap, mpiJob, nil, isGPULauncher(mpiJob))
f.setUpConfigMap(configMap)
f.setUpService(newWorkersService(mpiJob))
secret, err := newSSHAuthSecret(mpiJob)
if err != nil {
t.Fatalf("Creating SSH auth Secret: %v", err)
}
secret.OwnerReferences = nil
f.setUpSecret(secret)
f.runExpectError(getKey(mpiJob, t))
}
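
// TestShutdownWorker verifies that the worker pods of a succeeded MPIJob are
// deleted.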
func TestShutdownWorker(t *testing.T) {
f := newFixture(t)
startTime := metav1.Now()
completionTime := metav1.Now()
var replicas int32 = 8
mpiJob := newMPIJob("test", &replicas, 1, gpuResourceName, &startTime, &completionTime)
msg := fmt.Sprintf("MPIJob %s/%s successfully completed.", mpiJob.Namespace, mpiJob.Name)
err := updateMPIJobConditions(mpiJob, common.JobSucceeded, mpiJobSucceededReason, msg)
if err != nil {
t.Errorf("Failed to update MPIJob conditions")
}
f.setUpMPIJob(mpiJob)
fmjc := f.newFakeMPIJobController()
launcher := fmjc.newLauncher(mpiJob, isGPULauncher(mpiJob))
launcher.Status.Phase = corev1.PodSucceeded
f.setUpLauncher(launcher)
for i := 0; i < int(replicas); i++ {
name := fmt.Sprintf("%s-%d", mpiJob.Name+workerSuffix, i)
worker := newWorker(mpiJob, name, "")
f.setUpWorker(worker)
}
/*
if err := fmjc.deleteWorkerPods(mpiJob); err != nil {
t.Errorf("Failed to delete worker: %v", err)
}
*/
for i := 0; i < int(replicas); i++ {
name := fmt.Sprintf("%s-%d", mpiJob.Name+workerSuffix, i)
f.kubeActions = append(f.kubeActions, core.NewDeleteAction(schema.GroupVersionResource{Resource: "pods"}, mpiJob.Namespace, name))
}
mpiJobCopy := mpiJob.DeepCopy()
scheme.Scheme.Default(mpiJobCopy)
mpiJobCopy.Status.ReplicaStatuses = map[common.ReplicaType]*common.ReplicaStatus{
common.ReplicaType(kubeflow.MPIReplicaTypeWorker): {
Active: 0,
Succeeded: 0,
Failed: 0,
},
}
setUpMPIJobTimestamp(mpiJobCopy, &startTime, &completionTime)
f.expectUpdateMPIJobStatusAction(mpiJobCopy)
f.run(getKey(mpiJob, t))
}
func TestWorkerNotControlledByUs(t *testing.T) {
f := newFixture(t)
startTime := metav1.Now()
completionTime := metav1.Now()
var replicas int32 = 8
mpiJob := newMPIJob("test", &replicas, 1, gpuResourceName, &startTime, &completionTime)
f.setUpMPIJob(mpiJob)
configMap := newConfigMap(mpiJob, replicas, isGPULauncher(mpiJob))
updateDiscoverHostsInConfigMap(configMap, mpiJob, nil, isGPULauncher(mpiJob))
f.setUpConfigMap(configMap)
f.setUpService(newWorkersService(mpiJob))
secret, err := newSSHAuthSecret(mpiJob)
if err != nil {
t.Fatalf("Creating SSH auth secret: %v", err)
}
f.setUpSecret(secret)
for i := 0; i < int(replicas); i++ {
name := fmt.Sprintf("%s-%d", mpiJob.Name+workerSuffix, i)
worker := newWorker(mpiJob, name, "")
worker.OwnerReferences = nil
f.setUpWorker(worker)
}
f.runExpectError(getKey(mpiJob, t))
}
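
// TestLauncherActiveWorkerNotReady verifies the replica statuses reported
// while the launcher is running but the workers are still pending.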
func TestLauncherActiveWorkerNotReady(t *testing.T) {
f := newFixture(t)
startTime := metav1.Now()
completionTime := metav1.Now()
var replicas int32 = 8
mpiJob := newMPIJob("test", &replicas, 1, gpuResourceName, &startTime, &completionTime)
f.setUpMPIJob(mpiJob)
configMap := newConfigMap(mpiJob, replicas, isGPULauncher(mpiJob))
updateDiscoverHostsInConfigMap(configMap, mpiJob, nil, isGPULauncher(mpiJob))
f.setUpConfigMap(configMap)
f.setUpService(newWorkersService(mpiJob))
secret, err := newSSHAuthSecret(mpiJob)
if err != nil {
t.Fatalf("Creating SSH auth secret: %v", err)
}
f.setUpSecret(secret)
fmjc := f.newFakeMPIJobController()
launcher := fmjc.newLauncher(mpiJob, isGPULauncher(mpiJob))
launcher.Status.Phase = corev1.PodRunning
f.setUpLauncher(launcher)
for i := 0; i < int(replicas); i++ {
name := fmt.Sprintf("%s-%d", mpiJob.Name+workerSuffix, i)
worker := newWorker(mpiJob, name, "")
worker.Status.Phase = corev1.PodPending
f.setUpWorker(worker)
}
mpiJobCopy := mpiJob.DeepCopy()
scheme.Scheme.Default(mpiJobCopy)
mpiJobCopy.Status.ReplicaStatuses = map[common.ReplicaType]*common.ReplicaStatus{
common.ReplicaType(kubeflow.MPIReplicaTypeLauncher): {
Active: 1,
Succeeded: 0,
Failed: 0,
},
common.ReplicaType(kubeflow.MPIReplicaTypeWorker): {
Active: 0,
Succeeded: 0,
Failed: 0,
},
}
setUpMPIJobTimestamp(mpiJobCopy, &startTime, &completionTime)
f.expectUpdateMPIJobStatusAction(mpiJobCopy)
f.run(getKey(mpiJob, t))
}
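
// TestLauncherActiveWorkerReady verifies that a running launcher with all
// workers running moves the MPIJob to the Running condition.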
func TestLauncherActiveWorkerReady(t *testing.T) {
f := newFixture(t)
startTime := metav1.Now()
completionTime := metav1.Now()
var replicas int32 = 8
mpiJob := newMPIJob("test", &replicas, 1, gpuResourceName, &startTime, &completionTime)
f.setUpMPIJob(mpiJob)
f.setUpService(newWorkersService(mpiJob))
secret, err := newSSHAuthSecret(mpiJob)
if err != nil {
t.Fatalf("Creating SSH auth secret: %v", err)
}
f.setUpSecret(secret)
fmjc := f.newFakeMPIJobController()
launcher := fmjc.newLauncher(mpiJob, isGPULauncher(mpiJob))
launcher.Status.Phase = corev1.PodRunning
f.setUpLauncher(launcher)
var runningPodList []*corev1.Pod
for i := 0; i < int(replicas); i++ {
name := fmt.Sprintf("%s-%d", mpiJob.Name+workerSuffix, i)
worker := newWorker(mpiJob, name, "")
worker.Status.Phase = corev1.PodRunning
runningPodList = append(runningPodList, worker)
f.setUpWorker(worker)
}
configMap := newConfigMap(mpiJob, replicas, isGPULauncher(mpiJob))
updateDiscoverHostsInConfigMap(configMap, mpiJob, runningPodList, isGPULauncher(mpiJob))
f.setUpConfigMap(configMap)
mpiJobCopy := mpiJob.DeepCopy()
scheme.Scheme.Default(mpiJobCopy)
mpiJobCopy.Status.ReplicaStatuses = map[common.ReplicaType]*common.ReplicaStatus{
common.ReplicaType(kubeflow.MPIReplicaTypeLauncher): {
Active: 1,
Succeeded: 0,
Failed: 0,
},
common.ReplicaType(kubeflow.MPIReplicaTypeWorker): {
Active: 8,
Succeeded: 0,
Failed: 0,
},
}
setUpMPIJobTimestamp(mpiJobCopy, &startTime, &completionTime)
msg := fmt.Sprintf("MPIJob %s/%s is running.", mpiJob.Namespace, mpiJob.Name)
err = updateMPIJobConditions(mpiJobCopy, common.JobRunning, mpiJobRunningReason, msg)
if err != nil {
t.Errorf("Failed to update MPIJob conditions")
}
f.expectUpdateMPIJobStatusAction(mpiJobCopy)
f.run(getKey(mpiJob, t))
}
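
// TestWorkerReady verifies that the launcher pod is created once all workers
// are running.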
func TestWorkerReady(t *testing.T) {
f := newFixture(t)
startTime := metav1.Now()
completionTime := metav1.Now()
var replicas int32 = 16
mpiJob := newMPIJob("test", &replicas, 1, gpuResourceName, &startTime, &completionTime)
f.setUpMPIJob(mpiJob)
f.setUpService(newWorkersService(mpiJob))
secret, err := newSSHAuthSecret(mpiJob)
if err != nil {
t.Fatalf("Creating SSH auth secret: %v", err)
}
f.setUpSecret(secret)
var runningPodList []*corev1.Pod
for i := 0; i < 16; i++ {
name := fmt.Sprintf("%s-%d", mpiJob.Name+workerSuffix, i)
worker := newWorker(mpiJob, name, "")
worker.Status.Phase = corev1.PodRunning
runningPodList = append(runningPodList, worker)
f.setUpWorker(worker)
}
configMap := newConfigMap(mpiJob, replicas, isGPULauncher(mpiJob))
updateDiscoverHostsInConfigMap(configMap, mpiJob, runningPodList, isGPULauncher(mpiJob))
f.setUpConfigMap(configMap)
mpiJobCopy := mpiJob.DeepCopy()
scheme.Scheme.Default(mpiJobCopy)
fmjc := f.newFakeMPIJobController()
expLauncher := fmjc.newLauncher(mpiJobCopy, isGPULauncher(mpiJob))
f.expectCreateJobAction(expLauncher)
mpiJobCopy.Status.ReplicaStatuses = map[common.ReplicaType]*common.ReplicaStatus{
common.ReplicaType(kubeflow.MPIReplicaTypeLauncher): {
Active: 0,
Succeeded: 0,
Failed: 0,
},
common.ReplicaType(kubeflow.MPIReplicaTypeWorker): {
Active: 16,
Succeeded: 0,
Failed: 0,
},
}
setUpMPIJobTimestamp(mpiJobCopy, &startTime, &completionTime)
f.expectUpdateMPIJobStatusAction(mpiJobCopy)
f.run(getKey(mpiJob, t))
}
func int32Ptr(i int32) *int32 { return &i }
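
// newFakeMPIJobController returns a minimal controller with only the recorder
// and pod lister populated, which is enough to build launcher pods in tests.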
func (f *fixture) newFakeMPIJobController() *MPIJobController {
kubeClient := k8sfake.NewSimpleClientset(f.kubeObjects...)
k8sI := kubeinformers.NewSharedInformerFactory(kubeClient, noResyncPeriodFunc())
return &MPIJobController{
recorder: &record.FakeRecorder{},
podLister: k8sI.Core().V1().Pods().Lister(),
}
}