mpi-operator/pkg/controller/podgroup.go

// Copyright 2023 The Kubeflow Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package controller
import (
"context"
"sort"
"github.com/google/go-cmp/cmp"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/equality"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
schedulinglisters "k8s.io/client-go/listers/scheduling/v1"
"k8s.io/client-go/tools/cache"
"k8s.io/klog"
"k8s.io/utils/ptr"
schedv1alpha1 "sigs.k8s.io/scheduler-plugins/apis/scheduling/v1alpha1"
schedclientset "sigs.k8s.io/scheduler-plugins/pkg/generated/clientset/versioned"
schedinformers "sigs.k8s.io/scheduler-plugins/pkg/generated/informers/externalversions"
schedinformer "sigs.k8s.io/scheduler-plugins/pkg/generated/informers/externalversions/scheduling/v1alpha1"
volcanov1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
volcanoclient "volcano.sh/apis/pkg/client/clientset/versioned"
volcanoinformers "volcano.sh/apis/pkg/client/informers/externalversions"
volcanopodgroupinformer "volcano.sh/apis/pkg/client/informers/externalversions/scheduling/v1beta1"
"github.com/kubeflow/mpi-operator/cmd/mpi-operator/app/options"
kubeflow "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v2beta1"
)
type PodGroupControl interface {
// StartInformerFactory will start the podGroup informer.
StartInformerFactory(stopCh <-chan struct{})
// PodGroupSharedIndexInformer will return the SharedIndexInformer for the podGroup.
PodGroupSharedIndexInformer() cache.SharedIndexInformer
// newPodGroup will generate a new podGroup for an MPIJob resource.
// It also sets the appropriate OwnerReferences on the resource so
// handleObject can discover the MPIJob resource that 'owns' it.
newPodGroup(mpiJob *kubeflow.MPIJob) metav1.Object
// getPodGroup will return a podGroup.
getPodGroup(namespace, name string) (metav1.Object, error)
// createPodGroup will create a podGroup.
createPodGroup(ctx context.Context, pg metav1.Object) (metav1.Object, error)
// updatePodGroup will update a podGroup.
updatePodGroup(ctx context.Context, old, new metav1.Object) (metav1.Object, error)
// deletePodGroup will delete a podGroup.
deletePodGroup(ctx context.Context, namespace, name string) error
// decoratePodTemplateSpec will decorate the podTemplate with gang-scheduling information before it's used to generate a pod.
decoratePodTemplateSpec(pts *corev1.PodTemplateSpec, mpiJobName string)
// calculatePGMinResources will calculate minResources for podGroup.
calculatePGMinResources(minMember *int32, mpiJob *kubeflow.MPIJob) *corev1.ResourceList
// pgSpecsAreEqual will return true if the spec fields of two podGroups are equal.
pgSpecsAreEqual(a, b metav1.Object) bool
}
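// The helper below is an illustrative sketch, not part of the controller wiring:
// it shows how a caller could pick a PodGroupControl implementation from the
// configured gang-scheduler name. examplePodGroupCtrlFor is a hypothetical name,
// and the sketch assumes that any scheduler name other than Volcano's is served
// by the scheduler-plugins coscheduling implementation.
func examplePodGroupCtrlFor(
	gangSchedulerName, watchNamespace string,
	volcanoClient volcanoclient.Interface,
	schedClient schedclientset.Interface,
	pcLister schedulinglisters.PriorityClassLister,
) PodGroupControl {
	if gangSchedulerName == options.GangSchedulerVolcano {
		return NewVolcanoCtrl(volcanoClient, watchNamespace, pcLister)
	}
	// Assumption for this sketch: any other name is passed through as the
	// scheduler name used by the scheduler-plugins implementation.
	return NewSchedulerPluginsCtrl(schedClient, watchNamespace, gangSchedulerName, pcLister)
}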
// VolcanoCtrl is the implementation of PodGroupControl with volcano.
type VolcanoCtrl struct {
Client volcanoclient.Interface
InformerFactory volcanoinformers.SharedInformerFactory
PodGroupInformer volcanopodgroupinformer.PodGroupInformer
PriorityClassLister schedulinglisters.PriorityClassLister
schedulerName string
}
func NewVolcanoCtrl(c volcanoclient.Interface, watchNamespace string, pcLister schedulinglisters.PriorityClassLister) *VolcanoCtrl {
var informerFactoryOpts []volcanoinformers.SharedInformerOption
if watchNamespace != metav1.NamespaceAll {
informerFactoryOpts = append(informerFactoryOpts, volcanoinformers.WithNamespace(watchNamespace))
}
informerFactory := volcanoinformers.NewSharedInformerFactoryWithOptions(c, 0, informerFactoryOpts...)
return &VolcanoCtrl{
Client: c,
InformerFactory: informerFactory,
PodGroupInformer: informerFactory.Scheduling().V1beta1().PodGroups(),
PriorityClassLister: pcLister,
schedulerName: options.GangSchedulerVolcano,
}
}
func (v *VolcanoCtrl) PodGroupSharedIndexInformer() cache.SharedIndexInformer {
return v.PodGroupInformer.Informer()
}
func (v *VolcanoCtrl) StartInformerFactory(stopCh <-chan struct{}) {
go v.InformerFactory.Start(stopCh)
}
// newPodGroup will generate a new PodGroup for an MPIJob resource.
// If the parameters set in the schedulingPolicy aren't empty, it will pass them to the new PodGroup;
// if they are empty, it will set the following default values:
//
// minMember: NUM(workers) + 1
// queue: A "scheduling.volcano.sh/queue-name" annotation value.
// priorityClass: A value returned from the calculatePriorityClassName function.
// minResources: Follows the result of calculatePGMinResources.
//
// However, it doesn't pass the ".schedulingPolicy.scheduleTimeoutSeconds" to the podGroup resource.
func (v *VolcanoCtrl) newPodGroup(mpiJob *kubeflow.MPIJob) metav1.Object {
if mpiJob == nil {
return nil
}
minMember := calculateMinAvailable(mpiJob)
queueName := mpiJob.Annotations[volcanov1beta1.QueueNameAnnotationKey]
if schedulingPolicy := mpiJob.Spec.RunPolicy.SchedulingPolicy; schedulingPolicy != nil && len(schedulingPolicy.Queue) != 0 {
queueName = schedulingPolicy.Queue
}
return &volcanov1beta1.PodGroup{
TypeMeta: metav1.TypeMeta{
APIVersion: volcanov1beta1.SchemeGroupVersion.String(),
Kind: "PodGroup",
},
ObjectMeta: metav1.ObjectMeta{
Name: mpiJob.Name,
Namespace: mpiJob.Namespace,
OwnerReferences: []metav1.OwnerReference{
*metav1.NewControllerRef(mpiJob, kubeflow.SchemeGroupVersionKind),
},
},
Spec: volcanov1beta1.PodGroupSpec{
MinMember: *minMember,
Queue: queueName,
PriorityClassName: calculatePriorityClassName(mpiJob.Spec.MPIReplicaSpecs, mpiJob.Spec.RunPolicy.SchedulingPolicy),
MinResources: v.calculatePGMinResources(minMember, mpiJob),
},
}
}
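// exampleVolcanoPodGroup is an illustrative sketch (a hypothetical helper, not
// used by the controller): it builds a minimal MPIJob with 1 launcher, 2 workers,
// and the Volcano queue annotation, then derives the PodGroup that newPodGroup
// would produce. With no schedulingPolicy set, minMember resolves to
// NUM(workers)+1 = 3 and the queue comes from the annotation.
func exampleVolcanoPodGroup() *volcanov1beta1.PodGroup {
	mpiJob := &kubeflow.MPIJob{
		ObjectMeta: metav1.ObjectMeta{
			Name:        "pi",
			Namespace:   "default",
			Annotations: map[string]string{volcanov1beta1.QueueNameAnnotationKey: "research"},
		},
		Spec: kubeflow.MPIJobSpec{
			MPIReplicaSpecs: map[kubeflow.MPIReplicaType]*kubeflow.ReplicaSpec{
				kubeflow.MPIReplicaTypeLauncher: {Replicas: ptr.To[int32](1)},
				kubeflow.MPIReplicaTypeWorker:   {Replicas: ptr.To[int32](2)},
			},
		},
	}
	// A nil client and priorityClass lister are enough here: newPodGroup only reads the MPIJob.
	ctrl := NewVolcanoCtrl(nil, metav1.NamespaceAll, nil)
	return ctrl.newPodGroup(mpiJob).(*volcanov1beta1.PodGroup)
}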
func (v *VolcanoCtrl) getPodGroup(namespace, name string) (metav1.Object, error) {
return v.PodGroupInformer.Lister().PodGroups(namespace).Get(name)
}
func (v *VolcanoCtrl) createPodGroup(ctx context.Context, pg metav1.Object) (metav1.Object, error) {
podGroup := pg.(*volcanov1beta1.PodGroup)
return v.Client.SchedulingV1beta1().PodGroups(pg.GetNamespace()).Create(ctx, podGroup, metav1.CreateOptions{})
}
func (v *VolcanoCtrl) updatePodGroup(ctx context.Context, old, new metav1.Object) (metav1.Object, error) {
oldPG := old.(*volcanov1beta1.PodGroup)
newPG := new.(*volcanov1beta1.PodGroup)
oldPG.Spec = newPG.Spec
return v.Client.SchedulingV1beta1().PodGroups(oldPG.GetNamespace()).Update(ctx, oldPG, metav1.UpdateOptions{})
}
func (v *VolcanoCtrl) deletePodGroup(ctx context.Context, namespace, name string) error {
return v.Client.SchedulingV1beta1().PodGroups(namespace).Delete(ctx, name, metav1.DeleteOptions{})
}
func (v *VolcanoCtrl) decoratePodTemplateSpec(pts *corev1.PodTemplateSpec, mpiJobName string) {
if pts.Spec.SchedulerName != v.schedulerName {
klog.Warningf("%s scheduler is specified when gang-scheduling is enabled and it will be overwritten", pts.Spec.SchedulerName)
}
pts.Spec.SchedulerName = v.schedulerName
if pts.Annotations == nil {
pts.Annotations = make(map[string]string)
}
// We create the podGroup with the same name as the mpiJob.
pts.Annotations[volcanov1beta1.KubeGroupNameAnnotationKey] = mpiJobName
}
// calculatePGMinResources calculates minResources for the volcano PodGroup.
// The minMember is the sum of the tasks' minAvailable, or of their replicas when a task's minAvailable is not set in the vcJob.
// The PodGroup's minResources is left empty if it is not set, so we calculate minResources over the first minMember replicas with the highest priority.
// ref: https://github.com/volcano-sh/volcano/blob/1933d46bdc4434772518ebb74c4281671ddeffa1/pkg/webhooks/admission/jobs/mutate/mutate_job.go#L168
// ref: https://github.com/volcano-sh/volcano/blob/1933d46bdc4434772518ebb74c4281671ddeffa1/pkg/controllers/job/job_controller_actions.go#L761
func (v *VolcanoCtrl) calculatePGMinResources(minMember *int32, mpiJob *kubeflow.MPIJob) *corev1.ResourceList {
if schedPolicy := mpiJob.Spec.RunPolicy.SchedulingPolicy; schedPolicy != nil && schedPolicy.MinResources != nil {
return schedPolicy.MinResources
}
if minMember != nil && *minMember == 0 {
return nil
}
// sort task by priorityClasses
return calPGMinResource(minMember, mpiJob, v.PriorityClassLister)
}
func (v *VolcanoCtrl) pgSpecsAreEqual(a, b metav1.Object) bool {
PGa := a.(*volcanov1beta1.PodGroup)
PGb := b.(*volcanov1beta1.PodGroup)
return equality.Semantic.DeepEqual(PGa.Spec, PGb.Spec)
}
var _ PodGroupControl = &VolcanoCtrl{}
// SchedulerPluginsCtrl is the implementation of PodGroupControl with scheduler-plugins.
type SchedulerPluginsCtrl struct {
Client schedclientset.Interface
InformerFactory schedinformers.SharedInformerFactory
PodGroupInformer schedinformer.PodGroupInformer
PriorityClassLister schedulinglisters.PriorityClassLister
schedulerName string
}
func NewSchedulerPluginsCtrl(
c schedclientset.Interface,
watchNamespace, schedulerName string,
pcLister schedulinglisters.PriorityClassLister,
) *SchedulerPluginsCtrl {
var informerFactoryOpts []schedinformers.SharedInformerOption
if watchNamespace != metav1.NamespaceAll {
informerFactoryOpts = append(informerFactoryOpts, schedinformers.WithNamespace(watchNamespace))
}
pgInformerFactory := schedinformers.NewSharedInformerFactoryWithOptions(c, 0, informerFactoryOpts...)
return &SchedulerPluginsCtrl{
Client: c,
InformerFactory: pgInformerFactory,
PodGroupInformer: pgInformerFactory.Scheduling().V1alpha1().PodGroups(),
PriorityClassLister: pcLister,
schedulerName: schedulerName,
}
}
func (s *SchedulerPluginsCtrl) PodGroupSharedIndexInformer() cache.SharedIndexInformer {
return s.PodGroupInformer.Informer()
}
func (s *SchedulerPluginsCtrl) StartInformerFactory(stopCh <-chan struct{}) {
go s.InformerFactory.Start(stopCh)
}
// newPodGroup will generate a new PodGroup for an MPIJob resource.
// If the parameters set in the schedulingPolicy aren't empty, it will pass them to the new PodGroup;
// if they are empty, it will set the following default values:
//
// minMember: NUM(workers) + 1
// scheduleTimeoutSeconds: 0
// minResources: Follows the result of calculatePGMinResources.
//
// However, it doesn't pass the ".schedulingPolicy.priorityClass" and "schedulingPolicy.queue" to the podGroup resource.
func (s *SchedulerPluginsCtrl) newPodGroup(mpiJob *kubeflow.MPIJob) metav1.Object {
if mpiJob == nil {
return nil
}
scheduleTimeoutSec := ptr.To[int32](0)
if schedPolicy := mpiJob.Spec.RunPolicy.SchedulingPolicy; schedPolicy != nil && schedPolicy.ScheduleTimeoutSeconds != nil {
scheduleTimeoutSec = schedPolicy.ScheduleTimeoutSeconds
}
minMember := calculateMinAvailable(mpiJob)
var minResources corev1.ResourceList
if origin := s.calculatePGMinResources(minMember, mpiJob); origin != nil {
minResources = *origin
}
return &schedv1alpha1.PodGroup{
TypeMeta: metav1.TypeMeta{
APIVersion: schedv1alpha1.SchemeGroupVersion.String(),
Kind: "PodGroup",
},
ObjectMeta: metav1.ObjectMeta{
Name: mpiJob.Name,
Namespace: mpiJob.Namespace,
OwnerReferences: []metav1.OwnerReference{
*metav1.NewControllerRef(mpiJob, kubeflow.SchemeGroupVersionKind),
},
},
Spec: schedv1alpha1.PodGroupSpec{
MinMember: *minMember,
ScheduleTimeoutSeconds: scheduleTimeoutSec,
MinResources: minResources,
},
}
}
func (s *SchedulerPluginsCtrl) getPodGroup(namespace, name string) (metav1.Object, error) {
return s.PodGroupInformer.Lister().PodGroups(namespace).Get(name)
}
func (s *SchedulerPluginsCtrl) createPodGroup(ctx context.Context, pg metav1.Object) (metav1.Object, error) {
podGroup := pg.(*schedv1alpha1.PodGroup)
return s.Client.SchedulingV1alpha1().PodGroups(pg.GetNamespace()).Create(ctx, podGroup, metav1.CreateOptions{})
}
func (s *SchedulerPluginsCtrl) updatePodGroup(ctx context.Context, old, new metav1.Object) (metav1.Object, error) {
oldPG := old.(*schedv1alpha1.PodGroup)
newPG := new.(*schedv1alpha1.PodGroup)
oldPG.Spec = newPG.Spec
return s.Client.SchedulingV1alpha1().PodGroups(oldPG.GetNamespace()).Update(ctx, oldPG, metav1.UpdateOptions{})
}
func (s *SchedulerPluginsCtrl) deletePodGroup(ctx context.Context, namespace, name string) error {
return s.Client.SchedulingV1alpha1().PodGroups(namespace).Delete(ctx, name, metav1.DeleteOptions{})
}
func (s *SchedulerPluginsCtrl) decoratePodTemplateSpec(pts *corev1.PodTemplateSpec, mpiJobName string) {
if pts.Spec.SchedulerName != s.schedulerName {
klog.Warningf("%s scheduler is specified when gang-scheduling is enabled and it will be overwritten", pts.Spec.SchedulerName)
}
pts.Spec.SchedulerName = s.schedulerName
if pts.Labels == nil {
pts.Labels = make(map[string]string)
}
pts.Labels[schedv1alpha1.PodGroupLabel] = mpiJobName
}
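// exampleDecorate is an illustrative sketch (a hypothetical helper): it contrasts
// how the two implementations tie a pod template to the MPIJob's PodGroup.
// Volcano uses the volcanov1beta1.KubeGroupNameAnnotationKey annotation, while
// scheduler-plugins uses the schedv1alpha1.PodGroupLabel label; both overwrite
// spec.schedulerName with their own scheduler.
func exampleDecorate(volcanoCtrl *VolcanoCtrl, spCtrl *SchedulerPluginsCtrl) {
	launcherTemplate := &corev1.PodTemplateSpec{}
	volcanoCtrl.decoratePodTemplateSpec(launcherTemplate, "pi")
	// launcherTemplate.Spec.SchedulerName is now the Volcano scheduler and
	// launcherTemplate.Annotations[volcanov1beta1.KubeGroupNameAnnotationKey] == "pi".
	workerTemplate := &corev1.PodTemplateSpec{}
	spCtrl.decoratePodTemplateSpec(workerTemplate, "pi")
	// workerTemplate.Spec.SchedulerName is now whatever scheduler name spCtrl was
	// built with, and workerTemplate.Labels[schedv1alpha1.PodGroupLabel] == "pi".
}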
// calculatePGMinResources will calculate minResources for podGroup.
// It calculates the sum of the resources defined in all containers and returns the result.
// If the number of replicas (.spec.mpiReplicaSpecs[Launcher].replicas + .spec.mpiReplicaSpecs[Worker].replicas)
// is greater than minMember, it reorders the replicas according to the priorityClass set in each `podSpec.priorityClassName`,
// and the resources of the replicas beyond the first minMember (in priority order) are not added to minResources.
// Note that when reordering replicas, it doesn't account for a priorityClass specified in podSpec.priorityClassName
// if that priorityClass doesn't exist in the cluster.
//
// By adding appropriate required resources to a podGroup resource,
// the coscheduling plugin can filter out the pods that belong to the podGroup in PreFilter
// if the cluster doesn't have enough resources.
// ref: https://github.com/kubernetes-sigs/scheduler-plugins/blob/93d7c92851c4a17f110907f3b5be873176628441/pkg/coscheduling/core/core.go#L159-L182
func (s *SchedulerPluginsCtrl) calculatePGMinResources(minMember *int32, mpiJob *kubeflow.MPIJob) *corev1.ResourceList {
if schedPolicy := mpiJob.Spec.RunPolicy.SchedulingPolicy; schedPolicy != nil && schedPolicy.MinResources != nil {
return schedPolicy.MinResources
}
if minMember != nil && *minMember == 0 {
return nil
}
return calPGMinResource(minMember, mpiJob, s.PriorityClassLister)
}
func (s *SchedulerPluginsCtrl) pgSpecsAreEqual(a, b metav1.Object) bool {
PGa := a.(*schedv1alpha1.PodGroup)
PGb := b.(*schedv1alpha1.PodGroup)
return equality.Semantic.DeepEqual(PGa.Spec, PGb.Spec)
}
var _ PodGroupControl = &SchedulerPluginsCtrl{}
// calPGMinResource returns the minimum resources for the mpiJob, limited to the first minMember replicas.
func calPGMinResource(minMember *int32, mpiJob *kubeflow.MPIJob, pcLister schedulinglisters.PriorityClassLister) *corev1.ResourceList {
var order replicasOrder
for rt, replica := range mpiJob.Spec.MPIReplicaSpecs {
rp := replicaPriority{
priority: 0,
replicaType: rt,
ReplicaSpec: *replica,
}
pcName := replica.Template.Spec.PriorityClassName
if len(pcName) != 0 && pcLister != nil {
if priorityClass, err := pcLister.Get(pcName); err != nil {
klog.Warningf("Ignore replica %q priority class %q: %v", rt, pcName, err)
} else {
rp.priority = priorityClass.Value
}
}
order = append(order, rp)
}
sort.Sort(sort.Reverse(order))
// Launcher + Worker > minMember
replicas := *order[0].Replicas
if len(order) > 1 {
// When using runLauncherAsWorker, there may be no worker.
replicas += *order[1].Replicas
}
if minMember != nil && replicas > *minMember {
// If the launcher and workers have the same priority, it treats workers as a lower priority.
if order[0].priority == order[1].priority {
wIndex := order.getWorkerIndex()
if wIndex == -1 {
klog.Warningf("Couldn't find the worker replicas")
return nil
}
order[wIndex].Replicas = ptr.To(*minMember - 1)
} else {
order[1].Replicas = ptr.To(*minMember - 1)
}
}
minResources := corev1.ResourceList{}
for _, rp := range order {
if rp.Replicas == nil {
continue
}
for _, c := range rp.Template.Spec.Containers {
addResources(minResources, c.Resources, int64(*rp.Replicas))
}
}
return &minResources
}
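// Worked example (illustrative numbers): suppose an MPIJob has 1 launcher
// requesting 1 CPU and 4 workers requesting 2 CPU each, all with the same
// priority, and minMember is 3. Launcher + workers = 5 > 3, and because the
// priorities are equal the workers are clipped to minMember-1 = 2 replicas, so
//
//	minResources = 1*(1 CPU) + 2*(2 CPU) = 5 CPU
//
// i.e. enough for the launcher plus the first two workers rather than the full
// 9 CPU of the whole job.
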
// calculateMinAvailable calculates minAvailable for the PodGroup.
// If the schedulingPolicy.minAvailable is nil, it returns `NUM(workers) + 1`; otherwise it returns `schedulingPolicy.minAvailable`.
func calculateMinAvailable(mpiJob *kubeflow.MPIJob) *int32 {
if schedulingPolicy := mpiJob.Spec.RunPolicy.SchedulingPolicy; schedulingPolicy != nil && schedulingPolicy.MinAvailable != nil {
return schedulingPolicy.MinAvailable
}
return ptr.To(workerReplicas(mpiJob) + 1)
}
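// For example (illustrative numbers): an MPIJob with 4 workers and no
// schedulingPolicy.minAvailable gets minAvailable = 4 + 1 = 5 (the workers plus
// the launcher); setting schedulingPolicy.minAvailable to 3 lets gang scheduling
// proceed once only 3 of those pods can be placed.
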
// calculatePriorityClassName calculates the priorityClass name needed for the podGroup according to the following order of precedence:
// 1. .spec.runPolicy.schedulingPolicy.priorityClass
// 2. .spec.mpiReplicaSpecs[Launcher].template.spec.priorityClassName
// 3. .spec.mpiReplicaSpecs[Worker].template.spec.priorityClassName
func calculatePriorityClassName(
replicas map[kubeflow.MPIReplicaType]*kubeflow.ReplicaSpec,
schedulingPolicy *kubeflow.SchedulingPolicy,
) string {
if schedulingPolicy != nil && len(schedulingPolicy.PriorityClass) != 0 {
return schedulingPolicy.PriorityClass
} else if l := replicas[kubeflow.MPIReplicaTypeLauncher]; l != nil && len(l.Template.Spec.PriorityClassName) != 0 {
return l.Template.Spec.PriorityClassName
} else if w := replicas[kubeflow.MPIReplicaTypeWorker]; w != nil && len(w.Template.Spec.PriorityClassName) != 0 {
return w.Template.Spec.PriorityClassName
} else {
return ""
}
}
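// Illustrative sketch (hypothetical priorityClass names): the launcher's
// priorityClassName wins over the worker's, and an explicit
// schedulingPolicy.priorityClass wins over both.
//
//	replicas := map[kubeflow.MPIReplicaType]*kubeflow.ReplicaSpec{
//		kubeflow.MPIReplicaTypeLauncher: {Template: corev1.PodTemplateSpec{
//			Spec: corev1.PodSpec{PriorityClassName: "high"}}},
//		kubeflow.MPIReplicaTypeWorker: {Template: corev1.PodTemplateSpec{
//			Spec: corev1.PodSpec{PriorityClassName: "low"}}},
//	}
//	calculatePriorityClassName(replicas, nil)                                                 // "high"
//	calculatePriorityClassName(replicas, &kubeflow.SchedulingPolicy{PriorityClass: "urgent"}) // "urgent"
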
// addResources adds the given resources, multiplied by replicas, to minResources.
// If a resource has no request, it falls back to the limit when one is explicitly specified.
func addResources(minResources corev1.ResourceList, resources corev1.ResourceRequirements, replicas int64) {
if minResources == nil || cmp.Equal(resources, corev1.ResourceRequirements{}) {
return
}
merged := corev1.ResourceList{}
for name, req := range resources.Requests {
merged[name] = req
}
for name, lim := range resources.Limits {
if _, ok := merged[name]; !ok {
merged[name] = lim
}
}
for name, quantity := range merged {
quantity.Mul(replicas)
if q, ok := minResources[name]; !ok {
minResources[name] = quantity.DeepCopy()
} else {
q.Add(quantity)
minResources[name] = q
}
}
}
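// Worked example (illustrative numbers): a container that sets only
//
//	requests: {cpu: "500m"}
//	limits:   {cpu: "1", memory: "1Gi"}
//
// contributes, per replica, cpu 500m (the request wins where both are set) and
// memory 1Gi (no request, so the limit is used). With replicas = 3, addResources
// accumulates cpu 1500m and memory 3Gi into minResources.
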
type replicaPriority struct {
priority int32
replicaType kubeflow.MPIReplicaType
kubeflow.ReplicaSpec
}
type replicasOrder []replicaPriority
func (p replicasOrder) Len() int {
return len(p)
}
func (p replicasOrder) Less(i, j int) bool {
return p[i].priority < p[j].priority
}
func (p replicasOrder) Swap(i, j int) {
p[i], p[j] = p[j], p[i]
}
// getWorkerIndex will return the index of the replicaSpec for the worker.
// If there is no worker, it returns -1.
func (p replicasOrder) getWorkerIndex() int {
for i := range p {
if p[i].replicaType == kubeflow.MPIReplicaTypeWorker {
return i
}
}
return -1
}