volcano/pkg/scheduler/plugins/overcommit/overcommit.go

/*
Copyright 2021 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package overcommit

import (
    "strings"

    v1 "k8s.io/api/core/v1"
    "k8s.io/klog/v2"

    "volcano.sh/apis/pkg/apis/scheduling"
    "volcano.sh/volcano/pkg/scheduler/api"
    "volcano.sh/volcano/pkg/scheduler/framework"
    "volcano.sh/volcano/pkg/scheduler/plugins/util"
)

const (
    // PluginName is the name of the plugin.
    PluginName = "overcommit"
    // overCommitFactor is the resource overCommit factor for the enqueue action.
    // It determines how many `pending` pods the scheduler will tolerate
    // when the resources of the cluster are insufficient.
    // This constant is also used as the default key in factorMaps.
    overCommitFactor = "overcommit-factor"
    // defaultOverCommitFactor defines the default overCommit resource factor for the enqueue action.
    defaultOverCommitFactor = 1.2
)

const (
    // overCommitFactorPrefix is the prefix for per-resource overCommit factors.
    // Keys with this prefix in the configuration file define factors for
    // individual (including custom) resources, e.g. "overcommit-factor.cpu".
    overCommitFactorPrefix = "overcommit-factor."
)

// overcommitFactors holds the resource overCommit factors.
type overcommitFactors struct {
    // factorMaps maps a resource name to its overCommit factor.
    // key: resource name, e.g. "cpu", "memory", "ephemeral-storage", "nvidia.com/gpu"
    // value: overCommit factor
    // On initialization, the map is seeded with a default entry:
    // key: "overcommit-factor", value: defaultOverCommitFactor
    factorMaps map[string]float64
}

type overcommitPlugin struct {
    // pluginArguments are the arguments given to the plugin.
    pluginArguments framework.Arguments
    totalResource   *api.Resource
    idleResource    *api.Resource
    inqueueResource *api.Resource
    // overCommitFactors holds the per-resource overCommit factors.
    overCommitFactors *overcommitFactors
}

// New returns an overcommit plugin instance.
func New(arguments framework.Arguments) framework.Plugin {
    return &overcommitPlugin{
        pluginArguments: arguments,
        totalResource:   api.EmptyResource(),
        idleResource:    api.EmptyResource(),
        inqueueResource: api.EmptyResource(),
        overCommitFactors: &overcommitFactors{
            factorMaps: map[string]float64{
                overCommitFactor: defaultOverCommitFactor,
            },
        },
    }
}

func (op *overcommitPlugin) Name() string {
    return PluginName
}

/*
Users should pass overcommit factors through the overcommit plugin arguments in the format below:

Example:

    actions: "enqueue, allocate, backfill"
    tiers:
    - plugins:
      - name: overcommit
        arguments:
          overcommit-factor.cpu: 1.2
          overcommit-factor.memory: 1.0
          overcommit-factor: 1.2
*/
func (op *overcommitPlugin) OnSessionOpen(ssn *framework.Session) {
    klog.V(5).Infof("Enter overcommit plugin ...")
    defer klog.V(5).Infof("Leaving overcommit plugin.")

    // parse plugin arguments
    op.parse()
    // validate plugin arguments
    op.validate()

    op.totalResource.Add(ssn.TotalResource)
    // calculate idle resources of the whole cluster, overcommit resources included
    used := api.EmptyResource()
    for _, node := range ssn.Nodes {
        used.Add(node.Used)
    }
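    // ScaleResourcesWithRatios multiplies each dimension of the total resource by its
    // configured factor, using the default overcommit-factor for dimensions without
    // one, so the resulting "idle" amount may exceed the physical capacity.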
    op.idleResource = op.totalResource.Clone().
        ScaleResourcesWithRatios(op.overCommitFactors.factorMaps, op.overCommitFactors.factorMaps[overCommitFactor]).
        SubWithoutAssert(used)

    for _, job := range ssn.Jobs {
        // Accumulate the MinResources of inqueue jobs.
        if job.PodGroup.Status.Phase == scheduling.PodGroupInqueue && job.PodGroup.Spec.MinResources != nil {
            op.inqueueResource.Add(api.NewResource(*job.PodGroup.Spec.MinResources))
            continue
        }
        // Calculate the inqueue resources of running jobs.
        // The check 'CalculateAllocatedTaskNum(job) >= MinMember' covers cases such as a Spark job
        // whose driver pod has completed while the podgroup keeps running: without the check,
        // the already-allocated resources would be reserved a second time.
        if job.PodGroup.Status.Phase == scheduling.PodGroupRunning &&
            job.PodGroup.Spec.MinResources != nil &&
            int32(util.CalculateAllocatedTaskNum(job)) >= job.PodGroup.Spec.MinMember {
            inqueued := util.GetInqueueResource(job, job.Allocated)
            op.inqueueResource.Add(inqueued)
        }
    }
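
    // A job may be enqueued only if the already-reserved inqueue resources plus
    // its own MinResources still fit into the (overcommitted) idle resources,
    // compared per requested dimension.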
    ssn.AddJobEnqueueableFn(op.Name(), func(obj interface{}) int {
        job := obj.(*api.JobInfo)
        idle := op.idleResource
        inqueue := api.EmptyResource()
        inqueue.Add(op.inqueueResource)
        if job.PodGroup.Spec.MinResources == nil {
            klog.V(4).Infof("Job <%s/%s> is bestEffort, permit it to be inqueue.", job.Namespace, job.Name)
            return util.Permit
        }

        // TODO: if one more job were allowed to be inqueue beyond the overcommit factor,
        // a large job might be enqueued and create pods.
        jobMinReq := api.NewResource(*job.PodGroup.Spec.MinResources)
        if inqueue.Add(jobMinReq).LessEqualWithDimension(idle, jobMinReq) { // only compare the dimensions the job requests
            klog.V(4).Infof("Sufficient resources, permit job <%s/%s> to be inqueue", job.Namespace, job.Name)
            return util.Permit
        }
        klog.V(4).Infof("Resources in the cluster are overused, reject job <%s/%s> to be inqueue",
            job.Namespace, job.Name)
        ssn.RecordPodGroupEvent(job.PodGroup, v1.EventTypeNormal, string(scheduling.PodGroupUnschedulableType), "resource in cluster is overused")
        return util.Reject
    })
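
    // Once a job is admitted, reserve its MinResources so that later enqueue
    // decisions in this session account for it.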
    ssn.AddJobEnqueuedFn(op.Name(), func(obj interface{}) {
        job := obj.(*api.JobInfo)
        if job.PodGroup.Spec.MinResources == nil {
            return
        }
        jobMinReq := api.NewResource(*job.PodGroup.Spec.MinResources)
        op.inqueueResource.Add(jobMinReq)
    })
}

func (op *overcommitPlugin) OnSessionClose(ssn *framework.Session) {
    op.totalResource = nil
    op.idleResource = nil
    op.inqueueResource = nil
}

// parseFactor walks the arguments map and extracts overcommit factors by key.
// A key equal to overCommitFactor sets the default factor in the target map.
// A key starting with overCommitFactorPrefix sets the factor for the resource
// named by the suffix after the prefix.
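// For example (illustrative values), arguments {"overcommit-factor": 1.2,
// "overcommit-factor.cpu": 1.5} produce target {"overcommit-factor": 1.2, "cpu": 1.5}.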
func (op *overcommitPlugin) parseFactor(arguments framework.Arguments, target map[string]float64) {
    for key, value := range arguments {
        var factor float64
        switch v := value.(type) {
        case float64:
            factor = v
        case int:
            // Integer values from the configuration are converted to float64.
            factor = float64(v)
        default:
            // Log unexpected value types instead of silently ignoring them.
            klog.Warningf("Unexpected value type for key %s: %T", key, value)
            continue
        }
        switch {
        case key == overCommitFactor:
            // The bare key sets the default factor.
            target[overCommitFactor] = factor
        case strings.HasPrefix(key, overCommitFactorPrefix):
            // A prefixed key sets the factor for the named resource.
            target[strings.TrimPrefix(key, overCommitFactorPrefix)] = factor
        }
    }
}

func (op *overcommitPlugin) parse() {
    op.parseFactor(op.pluginArguments, op.overCommitFactors.factorMaps)
}

// validate checks the parsed factors and falls back to the default value
// for any factor that is invalid (less than 1.0).
func (op *overcommitPlugin) validate() {
    for k, v := range op.overCommitFactors.factorMaps {
        if v < 1.0 {
            klog.Warningf("Invalid input %f for %v overcommit factor: the factor cannot be less than 1, "+
                "using default value %f.", v, k, defaultOverCommitFactor)
            op.overCommitFactors.factorMaps[k] = defaultOverCommitFactor
        }
    }
}
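
// Note: like other scheduler plugins, this plugin is wired into the scheduler
// through a plugin builder registration (for example,
// framework.RegisterPluginBuilder(PluginName, New)); the exact registration
// site lives outside this file.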