// Copyright 2018 The Kubeflow Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package controller

import (
	"fmt"
	"reflect"
	"testing"
	"time"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/apimachinery/pkg/runtime/schema"
	"k8s.io/apimachinery/pkg/util/diff"
	kubeinformers "k8s.io/client-go/informers"
	k8sfake "k8s.io/client-go/kubernetes/fake"
	core "k8s.io/client-go/testing"
	"k8s.io/client-go/tools/cache"
	"k8s.io/client-go/tools/record"

	podgroupv1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
	volcanofake "volcano.sh/apis/pkg/client/clientset/versioned/fake"
	volcanoinformers "volcano.sh/apis/pkg/client/informers/externalversions"

	common "github.com/kubeflow/common/pkg/apis/common/v1"
	kubeflow "github.com/kubeflow/mpi-operator/v2/pkg/apis/kubeflow/v2beta1"
	"github.com/kubeflow/mpi-operator/v2/pkg/client/clientset/versioned/fake"
	"github.com/kubeflow/mpi-operator/v2/pkg/client/clientset/versioned/scheme"
	informers "github.com/kubeflow/mpi-operator/v2/pkg/client/informers/externalversions"
)

var (
	alwaysReady        = func() bool { return true }
	noResyncPeriodFunc = func() time.Duration { return 0 }
)

const (
	gpuResourceName         = "nvidia.com/gpu"
	extendedGPUResourceName = "vendor-domain/gpu"
)

type fixture struct {
	t *testing.T

	client        *fake.Clientset
	kubeClient    *k8sfake.Clientset
	volcanoClient *volcanofake.Clientset

	// Objects to put in the store.
	configMapLister []*corev1.ConfigMap
	serviceLister   []*corev1.Service
	secretLister    []*corev1.Secret
	podGroupLister  []*podgroupv1beta1.PodGroup
	podLister       []*corev1.Pod
	mpiJobLister    []*kubeflow.MPIJob

	// Actions expected to happen on the client.
	kubeActions []core.Action
	actions     []core.Action

	// Objects from here are pre-loaded into NewSimpleFake.
	kubeObjects []runtime.Object
	objects     []runtime.Object
}

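// newFixture returns an empty fixture bound to t; individual tests fill in the
// object lists and expected actions before running the controller.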
func newFixture(t *testing.T) *fixture {
	f := &fixture{}
	f.t = t
	f.objects = []runtime.Object{}
	f.kubeObjects = []runtime.Object{}
	return f
}

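// newMPIJobCommon returns an MPIJob in the default namespace with CleanPodPolicy
// All, single-container launcher and worker replica specs, and the given start
// and completion times (when non-nil) set on its status.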
func newMPIJobCommon(name string, startTime, completionTime *metav1.Time) *kubeflow.MPIJob {
	cleanPodPolicyAll := common.CleanPodPolicyAll
	mpiJob := &kubeflow.MPIJob{
		TypeMeta: metav1.TypeMeta{APIVersion: kubeflow.SchemeGroupVersion.String()},
		ObjectMeta: metav1.ObjectMeta{
			Name:      name,
			Namespace: metav1.NamespaceDefault,
		},
		Spec: kubeflow.MPIJobSpec{
			CleanPodPolicy: &cleanPodPolicyAll,
			MPIReplicaSpecs: map[kubeflow.MPIReplicaType]*common.ReplicaSpec{
				kubeflow.MPIReplicaTypeWorker: {
					Template: corev1.PodTemplateSpec{
						Spec: corev1.PodSpec{
							Containers: []corev1.Container{
								{
									Name:  "foo",
									Image: "bar",
								},
							},
						},
					},
				},
				kubeflow.MPIReplicaTypeLauncher: {
					Template: corev1.PodTemplateSpec{
						Spec: corev1.PodSpec{
							Containers: []corev1.Container{
								{
									Name:  "foo",
									Image: "bar",
								},
							},
						},
					},
				},
			},
		},
		Status: common.JobStatus{},
	}

	if startTime != nil {
		mpiJob.Status.StartTime = startTime
	}
	if completionTime != nil {
		mpiJob.Status.CompletionTime = completionTime
	}

	return mpiJob
}

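// newMPIJob extends newMPIJobCommon with a worker replica count and a limit of
// pusPerReplica units of resourceName on every worker container.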
func newMPIJob(name string, replicas *int32, pusPerReplica int64, resourceName string, startTime, completionTime *metav1.Time) *kubeflow.MPIJob {
	mpiJob := newMPIJobCommon(name, startTime, completionTime)

	mpiJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeWorker].Replicas = replicas

	workerContainers := mpiJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeWorker].Template.Spec.Containers
	for i := range workerContainers {
		container := &workerContainers[i]
		container.Resources = corev1.ResourceRequirements{
			Limits: corev1.ResourceList{
				corev1.ResourceName(resourceName): *resource.NewQuantity(pusPerReplica, resource.DecimalExponent),
			},
		}
	}

	return mpiJob
}

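// newMPIJobWithLauncher is newMPIJob with the launcher replica count set to one
// and the same resource limit applied to the launcher containers.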
func newMPIJobWithLauncher(name string, replicas *int32, pusPerReplica int64, resourceName string, startTime, completionTime *metav1.Time) *kubeflow.MPIJob {
	mpiJob := newMPIJob(name, replicas, pusPerReplica, resourceName, startTime, completionTime)

	mpiJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeLauncher].Replicas = int32Ptr(1)

	launcherContainers := mpiJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeLauncher].Template.Spec.Containers
	for i := range launcherContainers {
		container := &launcherContainers[i]
		container.Resources = corev1.ResourceRequirements{
			Limits: corev1.ResourceList{
				corev1.ResourceName(resourceName): *resource.NewQuantity(pusPerReplica, resource.DecimalExponent),
			},
		}
	}

	return mpiJob
}

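// newController builds an MPIJobController wired to fake clientsets and shared
// informer factories, marks all caches as synced, and pre-populates the informer
// indexers from the fixture's lister slices.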
func (f *fixture) newController(gangSchedulerName string) (*MPIJobController, informers.SharedInformerFactory, kubeinformers.SharedInformerFactory) {
	f.client = fake.NewSimpleClientset(f.objects...)
	f.kubeClient = k8sfake.NewSimpleClientset(f.kubeObjects...)

	i := informers.NewSharedInformerFactory(f.client, noResyncPeriodFunc())
	k8sI := kubeinformers.NewSharedInformerFactory(f.kubeClient, noResyncPeriodFunc())

	volcanoInformerFactory := volcanoinformers.NewSharedInformerFactory(f.volcanoClient, 0)
	podgroupsInformer := volcanoInformerFactory.Scheduling().V1beta1().PodGroups()

	c := NewMPIJobController(
		f.kubeClient,
		f.client,
		f.volcanoClient,
		k8sI.Core().V1().ConfigMaps(),
		k8sI.Core().V1().Secrets(),
		k8sI.Core().V1().Services(),
		k8sI.Core().V1().Pods(),
		podgroupsInformer,
		i.Kubeflow().V2beta1().MPIJobs(),
		gangSchedulerName,
	)

	c.configMapSynced = alwaysReady
	c.serviceSynced = alwaysReady
	c.secretSynced = alwaysReady
	c.podSynced = alwaysReady
	c.podgroupsSynced = alwaysReady
	c.mpiJobSynced = alwaysReady
	c.recorder = &record.FakeRecorder{}

	for _, configMap := range f.configMapLister {
		err := k8sI.Core().V1().ConfigMaps().Informer().GetIndexer().Add(configMap)
		if err != nil {
			fmt.Println("Failed to add config map to informer cache")
		}
	}

	for _, service := range f.serviceLister {
		err := k8sI.Core().V1().Services().Informer().GetIndexer().Add(service)
		if err != nil {
			fmt.Println("Failed to add service to informer cache")
		}
	}

	for _, secret := range f.secretLister {
		err := k8sI.Core().V1().Secrets().Informer().GetIndexer().Add(secret)
		if err != nil {
			fmt.Println("Failed to add secret to informer cache")
		}
	}

	for _, pod := range f.podLister {
		err := k8sI.Core().V1().Pods().Informer().GetIndexer().Add(pod)
		if err != nil {
			fmt.Println("Failed to add pod to informer cache")
		}
	}

	for _, podGroup := range f.podGroupLister {
		err := podgroupsInformer.Informer().GetIndexer().Add(podGroup)
		if err != nil {
			fmt.Println("Failed to add pod group to informer cache")
		}
	}

	for _, mpiJob := range f.mpiJobLister {
		err := i.Kubeflow().V2beta1().MPIJobs().Informer().GetIndexer().Add(mpiJob)
		if err != nil {
			fmt.Println("Failed to add mpijob to informer cache")
		}
	}

	return c, i, k8sI
}

func (f *fixture) run(mpiJobName string) {
	f.runController(mpiJobName, true, false, "")
}

func (f *fixture) runExpectError(mpiJobName string) {
	f.runController(mpiJobName, true, true, "")
}

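// runController syncs the MPIJob identified by mpiJobName once and compares the
// filtered actions recorded by the fake clientsets with the expected actions
// stored on the fixture.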
func (f *fixture) runController(mpiJobName string, startInformers, expectError bool, gangSchedulerName string) {
	c, i, k8sI := f.newController(gangSchedulerName)
	if startInformers {
		stopCh := make(chan struct{})
		defer close(stopCh)
		i.Start(stopCh)
		k8sI.Start(stopCh)
	}

	err := c.syncHandler(mpiJobName)
	if !expectError && err != nil {
		f.t.Errorf("error syncing mpi job: %v", err)
	} else if expectError && err == nil {
		f.t.Error("expected error syncing mpi job, got nil")
	}

	actions := filterInformerActions(f.client.Actions())
	for i, action := range actions {
		if len(f.actions) < i+1 {
			f.t.Errorf("%d unexpected actions: %+v", len(actions)-len(f.actions), actions[i:])
			break
		}

		expectedAction := f.actions[i]
		checkAction(expectedAction, action, f.t)
	}

	if len(f.actions) > len(actions) {
		f.t.Errorf("%d additional expected actions:%+v", len(f.actions)-len(actions), f.actions[len(actions):])
	}

	k8sActions := filterInformerActions(f.kubeClient.Actions())
	for i, action := range k8sActions {
		if len(f.kubeActions) < i+1 {
			f.t.Errorf("%d unexpected actions: %+v", len(k8sActions)-len(f.kubeActions), k8sActions[i:])
			break
		}

		expectedAction := f.kubeActions[i]
		checkAction(expectedAction, action, f.t)
	}

	if len(f.kubeActions) > len(k8sActions) {
		f.t.Errorf("%d additional expected actions:%+v", len(f.kubeActions)-len(k8sActions), f.kubeActions[len(k8sActions):])
	}
}

// checkAction verifies that expected and actual actions are equal and both
// have the same attached resources.
func checkAction(expected, actual core.Action, t *testing.T) {
	if !(expected.Matches(actual.GetVerb(), actual.GetResource().Resource) && actual.GetSubresource() == expected.GetSubresource()) {
		t.Errorf("Expected\n\t%#v\ngot\n\t%#v", expected, actual)
		return
	}

	if reflect.TypeOf(actual) != reflect.TypeOf(expected) {
		t.Errorf("Action has wrong type. Expected: %T. Got: %T", expected, actual)
		return
	}

	//nolint
	switch a := actual.(type) {
	case core.UpdateAction:
		e, _ := expected.(core.UpdateAction)
		expObject := e.GetObject()
		object := a.GetObject()

		expMPIJob, ok1 := expObject.(*kubeflow.MPIJob)
		gotMPIJob, ok2 := object.(*kubeflow.MPIJob)
		if ok1 && ok2 {
			clearConditionTime(expMPIJob)
			clearConditionTime(gotMPIJob)

			if !reflect.DeepEqual(expMPIJob, gotMPIJob) {
				t.Errorf("Action %s %s has wrong object\nDiff:\n %s",
					a.GetVerb(), a.GetResource().Resource, diff.ObjectGoPrintDiff(expObject, object))
			}
			return
		}

		if !reflect.DeepEqual(expObject, object) {
			t.Errorf("Action %s %s has wrong object\nDiff:\n %s",
				a.GetVerb(), a.GetResource().Resource, diff.ObjectGoPrintDiff(expObject, object))
		}
	case core.CreateAction:
		e, _ := expected.(core.CreateAction)
		expObject := e.GetObject()
		object := a.GetObject()

		if !reflect.DeepEqual(expObject, object) {
			t.Errorf("Action %s %s has wrong object\nDiff:\n %s",
				a.GetVerb(), a.GetResource().Resource, diff.ObjectGoPrintDiff(expObject, object))
		}
	case core.PatchAction:
		e, _ := expected.(core.PatchAction)
		expPatch := e.GetPatch()
		patch := a.GetPatch()

		if !reflect.DeepEqual(expPatch, patch) {
			t.Errorf("Action %s %s has wrong patch\nDiff:\n %s",
				a.GetVerb(), a.GetResource().Resource, diff.ObjectGoPrintDiff(expPatch, patch))
		}
	}
}

// filterInformerActions filters list and watch actions for testing resources.
// Since list and watch don't change resource state, we can filter them out to
// lower the noise level in our tests.
func filterInformerActions(actions []core.Action) []core.Action {
	var ret []core.Action
	for _, action := range actions {
		if len(action.GetNamespace()) == 0 &&
			(action.Matches("list", "configmaps") ||
				action.Matches("watch", "configmaps") ||
				action.Matches("list", "services") ||
				action.Matches("watch", "services") ||
				action.Matches("list", "secrets") ||
				action.Matches("watch", "secrets") ||
				action.Matches("list", "pods") ||
				action.Matches("watch", "pods") ||
				action.Matches("list", "podgroups") ||
				action.Matches("watch", "podgroups") ||
				action.Matches("list", "mpijobs") ||
				action.Matches("watch", "mpijobs")) {
			continue
		}
		ret = append(ret, action)
	}

	return ret
}

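// expectCreateJobAction records that the controller is expected to create the
// given pod (the launcher) on the Kubernetes fake client.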
func (f *fixture) expectCreateJobAction(d *corev1.Pod) {
	f.kubeActions = append(f.kubeActions, core.NewCreateAction(schema.GroupVersionResource{Resource: "pods"}, d.Namespace, d))
}

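// expectUpdateMPIJobStatusAction records that the controller is expected to
// update the status subresource of the given MPIJob.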
func (f *fixture) expectUpdateMPIJobStatusAction(mpiJob *kubeflow.MPIJob) {
	action := core.NewUpdateAction(schema.GroupVersionResource{Resource: "mpijobs"}, mpiJob.Namespace, mpiJob)
	action.Subresource = "status"
	f.actions = append(f.actions, action)
}

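// The setUp* helpers register pre-existing objects with both the informer
// lister slices and the objects pre-loaded into the fake clientsets.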
func (f *fixture) setUpMPIJob(mpiJob *kubeflow.MPIJob) {
	f.mpiJobLister = append(f.mpiJobLister, mpiJob)
	f.objects = append(f.objects, mpiJob)
}

func (f *fixture) setUpLauncher(launcher *corev1.Pod) {
	f.podLister = append(f.podLister, launcher)
	f.kubeObjects = append(f.kubeObjects, launcher)
}

func (f *fixture) setUpWorker(worker *corev1.Pod) {
	f.podLister = append(f.podLister, worker)
	f.kubeObjects = append(f.kubeObjects, worker)
}

func (f *fixture) setUpConfigMap(configMap *corev1.ConfigMap) {
	f.configMapLister = append(f.configMapLister, configMap)
	f.kubeObjects = append(f.kubeObjects, configMap)
}

func (f *fixture) setUpService(service *corev1.Service) {
	f.serviceLister = append(f.serviceLister, service)
	f.kubeObjects = append(f.kubeObjects, service)
}

func (f *fixture) setUpSecret(secret *corev1.Secret) {
	f.secretLister = append(f.secretLister, secret)
	f.kubeObjects = append(f.kubeObjects, secret)
}

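// setUpMPIJobTimestamp copies the given start and completion times into the job
// status so that expected and observed MPIJobs compare equal.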
func setUpMPIJobTimestamp(mpiJob *kubeflow.MPIJob, startTime, completionTime *metav1.Time) {
	if startTime != nil {
		mpiJob.Status.StartTime = startTime
	}

	if completionTime != nil {
		mpiJob.Status.CompletionTime = completionTime
	}
}

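// clearConditionTime zeroes LastTransitionTime and LastUpdateTime on every job
// condition so that condition comparisons ignore wall-clock timestamps.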
func clearConditionTime(mpiJob *kubeflow.MPIJob) {
	var clearConditions []common.JobCondition
	for _, condition := range mpiJob.Status.Conditions {
		condition.LastTransitionTime = metav1.Time{}
		condition.LastUpdateTime = metav1.Time{}
		clearConditions = append(clearConditions, condition)
	}
	mpiJob.Status.Conditions = clearConditions
}

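// getKey returns the namespace/name cache key for the given MPIJob, reporting a
// test error and returning an empty string if the key cannot be built.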
func getKey(mpiJob *kubeflow.MPIJob, t *testing.T) string {
	key, err := cache.DeletionHandlingMetaNamespaceKeyFunc(mpiJob)
	if err != nil {
		t.Errorf("Unexpected error getting key for mpi job %v: %v", mpiJob.Name, err)
		return ""
	}
	return key
}

func TestDoNothingWithInvalidKey(t *testing.T) {
	f := newFixture(t)
	f.run("foo/bar/baz")
}

func TestDoNothingWithNonexistentMPIJob(t *testing.T) {
	f := newFixture(t)
	startTime := metav1.Now()
	completionTime := metav1.Now()
	mpiJob := newMPIJob("test", int32Ptr(64), 1, gpuResourceName, &startTime, &completionTime)
	f.run(getKey(mpiJob, t))
}

func TestLauncherNotControlledByUs(t *testing.T) {
	f := newFixture(t)
	startTime := metav1.Now()
	completionTime := metav1.Now()

	mpiJob := newMPIJob("test", int32Ptr(64), 1, gpuResourceName, &startTime, &completionTime)
	f.setUpMPIJob(mpiJob)

	fmjc := f.newFakeMPIJobController()
	launcher := fmjc.newLauncher(mpiJob, isGPULauncher(mpiJob))
	launcher.OwnerReferences = nil
	f.setUpLauncher(launcher)

	f.runExpectError(getKey(mpiJob, t))
}

func TestIsGPULauncher(t *testing.T) {
	f := newFixture(t)

	startTime := metav1.Now()
	completionTime := metav1.Now()

	testCases := map[string]struct {
		gpu      string
		expected bool
	}{
		"isNvidiaGPU": {
			gpu:      gpuResourceName,
			expected: true,
		},
		"isExtendedGPU": {
			gpu:      extendedGPUResourceName,
			expected: true,
		},
		"notGPU": {
			gpu:      "vendor-domain/resourcetype",
			expected: false,
		},
	}
	for testName, testCase := range testCases {
		mpiJob := newMPIJobWithLauncher("test", int32Ptr(64), 1, testCase.gpu, &startTime, &completionTime)
		f.setUpMPIJob(mpiJob)
		if result := isGPULauncher(mpiJob); result != testCase.expected {
			t.Errorf("%s expected: %v, actual: %v, gpu=%v", testName, testCase.expected, result, testCase.gpu)
		}
	}
}

func TestLauncherSucceeded(t *testing.T) {
	f := newFixture(t)

	startTime := metav1.Now()
	completionTime := metav1.Now()

	mpiJob := newMPIJob("test", int32Ptr(64), 1, gpuResourceName, &startTime, &completionTime)
	f.setUpMPIJob(mpiJob)

	fmjc := f.newFakeMPIJobController()
	launcher := fmjc.newLauncher(mpiJob, isGPULauncher(mpiJob))
	launcher.Status.Phase = corev1.PodSucceeded
	f.setUpLauncher(launcher)

	mpiJobCopy := mpiJob.DeepCopy()
	scheme.Scheme.Default(mpiJobCopy)
	mpiJobCopy.Status.ReplicaStatuses = map[common.ReplicaType]*common.ReplicaStatus{
		common.ReplicaType(kubeflow.MPIReplicaTypeLauncher): {
			Active:    0,
			Succeeded: 1,
			Failed:    0,
		},
		common.ReplicaType(kubeflow.MPIReplicaTypeWorker): {},
	}

	setUpMPIJobTimestamp(mpiJobCopy, &startTime, &completionTime)

	msg := fmt.Sprintf("MPIJob %s/%s successfully completed.", mpiJob.Namespace, mpiJob.Name)
	err := updateMPIJobConditions(mpiJobCopy, common.JobSucceeded, mpiJobSucceededReason, msg)
	if err != nil {
		t.Errorf("Failed to update MPIJob conditions")
	}
	f.expectUpdateMPIJobStatusAction(mpiJobCopy)

	f.run(getKey(mpiJob, t))
}

func TestLauncherFailed(t *testing.T) {
	f := newFixture(t)
	startTime := metav1.Now()
	completionTime := metav1.Now()

	mpiJob := newMPIJob("test", int32Ptr(64), 1, gpuResourceName, &startTime, &completionTime)
	f.setUpMPIJob(mpiJob)

	fmjc := f.newFakeMPIJobController()
	launcher := fmjc.newLauncher(mpiJob, isGPULauncher(mpiJob))
	launcher.Status.Phase = corev1.PodFailed
	f.setUpLauncher(launcher)

	mpiJobCopy := mpiJob.DeepCopy()
	scheme.Scheme.Default(mpiJobCopy)
	mpiJobCopy.Status.ReplicaStatuses = map[common.ReplicaType]*common.ReplicaStatus{
		common.ReplicaType(kubeflow.MPIReplicaTypeLauncher): {
			Active:    0,
			Succeeded: 0,
			Failed:    1,
		},
		common.ReplicaType(kubeflow.MPIReplicaTypeWorker): {},
	}
	setUpMPIJobTimestamp(mpiJobCopy, &startTime, &completionTime)

	msg := fmt.Sprintf("MPIJob %s/%s has failed", mpiJob.Namespace, mpiJob.Name)
	err := updateMPIJobConditions(mpiJobCopy, common.JobFailed, mpiJobFailedReason, msg)
	if err != nil {
		t.Errorf("Failed to update MPIJob conditions")
	}

	f.expectUpdateMPIJobStatusAction(mpiJobCopy)

	f.run(getKey(mpiJob, t))
}

func TestConfigMapNotControlledByUs(t *testing.T) {
	f := newFixture(t)
	startTime := metav1.Now()
	completionTime := metav1.Now()

	var replicas int32 = 64
	mpiJob := newMPIJob("test", &replicas, 1, gpuResourceName, &startTime, &completionTime)
	f.setUpMPIJob(mpiJob)
	f.setUpService(newWorkersService(mpiJob))

	configMap := newConfigMap(mpiJob, replicas, isGPULauncher(mpiJob))
	updateDiscoverHostsInConfigMap(configMap, mpiJob, nil, isGPULauncher(mpiJob))
	configMap.OwnerReferences = nil
	f.setUpConfigMap(configMap)

	f.runExpectError(getKey(mpiJob, t))
}

func TestServiceNotControlledByUs(t *testing.T) {
	f := newFixture(t)
	startTime := metav1.Now()
	completionTime := metav1.Now()

	var replicas int32 = 64
	mpiJob := newMPIJob("test", &replicas, 1, gpuResourceName, &startTime, &completionTime)
	f.setUpMPIJob(mpiJob)

	service := newWorkersService(mpiJob)
	service.OwnerReferences = nil
	f.setUpService(service)

	f.runExpectError(getKey(mpiJob, t))
}

func TestSecretNotControlledByUs(t *testing.T) {
	f := newFixture(t)
	startTime := metav1.Now()
	completionTime := metav1.Now()

	var replicas int32 = 64
	mpiJob := newMPIJob("test", &replicas, 1, gpuResourceName, &startTime, &completionTime)
	f.setUpMPIJob(mpiJob)

	configMap := newConfigMap(mpiJob, replicas, isGPULauncher(mpiJob))
	updateDiscoverHostsInConfigMap(configMap, mpiJob, nil, isGPULauncher(mpiJob))
	f.setUpConfigMap(configMap)
	f.setUpService(newWorkersService(mpiJob))

	secret, err := newSSHAuthSecret(mpiJob)
	if err != nil {
		t.Fatalf("Creating SSH auth Secret: %v", err)
	}
	secret.OwnerReferences = nil
	f.setUpSecret(secret)

	f.runExpectError(getKey(mpiJob, t))
}

func TestShutdownWorker(t *testing.T) {
	f := newFixture(t)
	startTime := metav1.Now()
	completionTime := metav1.Now()

	var replicas int32 = 8
	mpiJob := newMPIJob("test", &replicas, 1, gpuResourceName, &startTime, &completionTime)
	msg := fmt.Sprintf("MPIJob %s/%s successfully completed.", mpiJob.Namespace, mpiJob.Name)
	err := updateMPIJobConditions(mpiJob, common.JobSucceeded, mpiJobSucceededReason, msg)
	if err != nil {
		t.Errorf("Failed to update MPIJob conditions")
	}
	f.setUpMPIJob(mpiJob)

	fmjc := f.newFakeMPIJobController()
	launcher := fmjc.newLauncher(mpiJob, isGPULauncher(mpiJob))
	launcher.Status.Phase = corev1.PodSucceeded
	f.setUpLauncher(launcher)

	for i := 0; i < int(replicas); i++ {
		name := fmt.Sprintf("%s-%d", mpiJob.Name+workerSuffix, i)
		worker := newWorker(mpiJob, name, "")
		f.setUpWorker(worker)
	}

	/*
		if err := fmjc.deleteWorkerPods(mpiJob); err != nil {
			t.Errorf("Failed to delete worker: %v", err)
		}
	*/
	for i := 0; i < int(replicas); i++ {
		name := fmt.Sprintf("%s-%d", mpiJob.Name+workerSuffix, i)
		f.kubeActions = append(f.kubeActions, core.NewDeleteAction(schema.GroupVersionResource{Resource: "pods"}, mpiJob.Namespace, name))
	}

	mpiJobCopy := mpiJob.DeepCopy()
	scheme.Scheme.Default(mpiJobCopy)
	mpiJobCopy.Status.ReplicaStatuses = map[common.ReplicaType]*common.ReplicaStatus{
		common.ReplicaType(kubeflow.MPIReplicaTypeWorker): {
			Active:    0,
			Succeeded: 0,
			Failed:    0,
		},
	}
	setUpMPIJobTimestamp(mpiJobCopy, &startTime, &completionTime)
	f.expectUpdateMPIJobStatusAction(mpiJobCopy)

	f.run(getKey(mpiJob, t))
}

func TestWorkerNotControlledByUs(t *testing.T) {
	f := newFixture(t)
	startTime := metav1.Now()
	completionTime := metav1.Now()

	var replicas int32 = 8
	mpiJob := newMPIJob("test", &replicas, 1, gpuResourceName, &startTime, &completionTime)
	f.setUpMPIJob(mpiJob)

	configMap := newConfigMap(mpiJob, replicas, isGPULauncher(mpiJob))
	updateDiscoverHostsInConfigMap(configMap, mpiJob, nil, isGPULauncher(mpiJob))
	f.setUpConfigMap(configMap)
	f.setUpService(newWorkersService(mpiJob))
	secret, err := newSSHAuthSecret(mpiJob)
	if err != nil {
		t.Fatalf("Creating SSH auth secret: %v", err)
	}
	f.setUpSecret(secret)

	for i := 0; i < int(replicas); i++ {
		name := fmt.Sprintf("%s-%d", mpiJob.Name+workerSuffix, i)
		worker := newWorker(mpiJob, name, "")
		worker.OwnerReferences = nil
		f.setUpWorker(worker)
	}

	f.runExpectError(getKey(mpiJob, t))
}

func TestLauncherActiveWorkerNotReady(t *testing.T) {
	f := newFixture(t)
	startTime := metav1.Now()
	completionTime := metav1.Now()

	var replicas int32 = 8
	mpiJob := newMPIJob("test", &replicas, 1, gpuResourceName, &startTime, &completionTime)
	f.setUpMPIJob(mpiJob)

	configMap := newConfigMap(mpiJob, replicas, isGPULauncher(mpiJob))
	updateDiscoverHostsInConfigMap(configMap, mpiJob, nil, isGPULauncher(mpiJob))
	f.setUpConfigMap(configMap)
	f.setUpService(newWorkersService(mpiJob))
	secret, err := newSSHAuthSecret(mpiJob)
	if err != nil {
		t.Fatalf("Creating SSH auth secret: %v", err)
	}
	f.setUpSecret(secret)

	fmjc := f.newFakeMPIJobController()
	launcher := fmjc.newLauncher(mpiJob, isGPULauncher(mpiJob))
	launcher.Status.Phase = corev1.PodRunning
	f.setUpLauncher(launcher)

	for i := 0; i < int(replicas); i++ {
		name := fmt.Sprintf("%s-%d", mpiJob.Name+workerSuffix, i)
		worker := newWorker(mpiJob, name, "")
		worker.Status.Phase = corev1.PodPending
		f.setUpWorker(worker)
	}
	mpiJobCopy := mpiJob.DeepCopy()
	scheme.Scheme.Default(mpiJobCopy)
	mpiJobCopy.Status.ReplicaStatuses = map[common.ReplicaType]*common.ReplicaStatus{
		common.ReplicaType(kubeflow.MPIReplicaTypeLauncher): {
			Active:    1,
			Succeeded: 0,
			Failed:    0,
		},
		common.ReplicaType(kubeflow.MPIReplicaTypeWorker): {
			Active:    0,
			Succeeded: 0,
			Failed:    0,
		},
	}
	setUpMPIJobTimestamp(mpiJobCopy, &startTime, &completionTime)
	f.expectUpdateMPIJobStatusAction(mpiJobCopy)

	f.run(getKey(mpiJob, t))
}

func TestLauncherActiveWorkerReady(t *testing.T) {
	f := newFixture(t)
	startTime := metav1.Now()
	completionTime := metav1.Now()

	var replicas int32 = 8
	mpiJob := newMPIJob("test", &replicas, 1, gpuResourceName, &startTime, &completionTime)
	f.setUpMPIJob(mpiJob)
	f.setUpService(newWorkersService(mpiJob))
	secret, err := newSSHAuthSecret(mpiJob)
	if err != nil {
		t.Fatalf("Creating SSH auth secret: %v", err)
	}
	f.setUpSecret(secret)

	fmjc := f.newFakeMPIJobController()
	launcher := fmjc.newLauncher(mpiJob, isGPULauncher(mpiJob))
	launcher.Status.Phase = corev1.PodRunning
	f.setUpLauncher(launcher)

	var runningPodList []*corev1.Pod
	for i := 0; i < int(replicas); i++ {
		name := fmt.Sprintf("%s-%d", mpiJob.Name+workerSuffix, i)
		worker := newWorker(mpiJob, name, "")
		worker.Status.Phase = corev1.PodRunning
		runningPodList = append(runningPodList, worker)
		f.setUpWorker(worker)
	}

	configMap := newConfigMap(mpiJob, replicas, isGPULauncher(mpiJob))
	updateDiscoverHostsInConfigMap(configMap, mpiJob, runningPodList, isGPULauncher(mpiJob))
	f.setUpConfigMap(configMap)

	mpiJobCopy := mpiJob.DeepCopy()
	scheme.Scheme.Default(mpiJobCopy)
	mpiJobCopy.Status.ReplicaStatuses = map[common.ReplicaType]*common.ReplicaStatus{
		common.ReplicaType(kubeflow.MPIReplicaTypeLauncher): {
			Active:    1,
			Succeeded: 0,
			Failed:    0,
		},
		common.ReplicaType(kubeflow.MPIReplicaTypeWorker): {
			Active:    8,
			Succeeded: 0,
			Failed:    0,
		},
	}
	setUpMPIJobTimestamp(mpiJobCopy, &startTime, &completionTime)
	msg := fmt.Sprintf("MPIJob %s/%s is running.", mpiJob.Namespace, mpiJob.Name)
	err = updateMPIJobConditions(mpiJobCopy, common.JobRunning, mpiJobRunningReason, msg)
	if err != nil {
		t.Errorf("Failed to update MPIJob conditions")
	}
	f.expectUpdateMPIJobStatusAction(mpiJobCopy)

	f.run(getKey(mpiJob, t))
}

func TestWorkerReady(t *testing.T) {
	f := newFixture(t)
	startTime := metav1.Now()
	completionTime := metav1.Now()

	var replicas int32 = 16
	mpiJob := newMPIJob("test", &replicas, 1, gpuResourceName, &startTime, &completionTime)
	f.setUpMPIJob(mpiJob)
	f.setUpService(newWorkersService(mpiJob))
	secret, err := newSSHAuthSecret(mpiJob)
	if err != nil {
		t.Fatalf("Creating SSH auth secret: %v", err)
	}
	f.setUpSecret(secret)

	var runningPodList []*corev1.Pod
	for i := 0; i < int(replicas); i++ {
		name := fmt.Sprintf("%s-%d", mpiJob.Name+workerSuffix, i)
		worker := newWorker(mpiJob, name, "")
		worker.Status.Phase = corev1.PodRunning
		runningPodList = append(runningPodList, worker)
		f.setUpWorker(worker)
	}

	configMap := newConfigMap(mpiJob, replicas, isGPULauncher(mpiJob))
	updateDiscoverHostsInConfigMap(configMap, mpiJob, runningPodList, isGPULauncher(mpiJob))
	f.setUpConfigMap(configMap)

	mpiJobCopy := mpiJob.DeepCopy()
	scheme.Scheme.Default(mpiJobCopy)

	fmjc := f.newFakeMPIJobController()
	expLauncher := fmjc.newLauncher(mpiJobCopy, isGPULauncher(mpiJob))
	f.expectCreateJobAction(expLauncher)

	mpiJobCopy.Status.ReplicaStatuses = map[common.ReplicaType]*common.ReplicaStatus{
		common.ReplicaType(kubeflow.MPIReplicaTypeLauncher): {
			Active:    0,
			Succeeded: 0,
			Failed:    0,
		},
		common.ReplicaType(kubeflow.MPIReplicaTypeWorker): {
			Active:    16,
			Succeeded: 0,
			Failed:    0,
		},
	}
	setUpMPIJobTimestamp(mpiJobCopy, &startTime, &completionTime)
	f.expectUpdateMPIJobStatusAction(mpiJobCopy)

	f.run(getKey(mpiJob, t))
}

func int32Ptr(i int32) *int32 { return &i }

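// newFakeMPIJobController returns a minimal MPIJobController with only the
// recorder and pod lister set, enough to build launcher and worker pods in tests.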
func (f *fixture) newFakeMPIJobController() *MPIJobController {
	kubeClient := k8sfake.NewSimpleClientset(f.kubeObjects...)

	k8sI := kubeinformers.NewSharedInformerFactory(kubeClient, noResyncPeriodFunc())
	return &MPIJobController{
		recorder:  &record.FakeRecorder{},
		podLister: k8sI.Core().V1().Pods().Lister(),
	}
}