/*
Copyright 2016 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package estimator

import (
	"fmt"
	"sort"

	apiv1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
	"k8s.io/autoscaler/cluster-autoscaler/simulator/clustersnapshot"
	"k8s.io/autoscaler/cluster-autoscaler/simulator/predicatechecker"
	"k8s.io/autoscaler/cluster-autoscaler/utils/scheduler"
	klog "k8s.io/klog/v2"
	schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
)

// podInfo contains a pod and a score corresponding to how important it is to handle the pod first.
type podInfo struct {
	score float64
	pod   *apiv1.Pod
}

// BinpackingNodeEstimator estimates the number of nodes needed to handle the given set of pods.
type BinpackingNodeEstimator struct {
	predicateChecker predicatechecker.PredicateChecker
	clusterSnapshot  clustersnapshot.ClusterSnapshot
	limiter          EstimationLimiter
}

// NewBinpackingNodeEstimator builds a new BinpackingNodeEstimator.
func NewBinpackingNodeEstimator(
	predicateChecker predicatechecker.PredicateChecker,
	clusterSnapshot clustersnapshot.ClusterSnapshot,
	limiter EstimationLimiter) *BinpackingNodeEstimator {
	return &BinpackingNodeEstimator{
		predicateChecker: predicateChecker,
		clusterSnapshot:  clusterSnapshot,
		limiter:          limiter,
	}
}
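
// A hypothetical usage sketch; construction of the predicate checker, cluster snapshot and
// limiter is assumed and not shown in this file, and pendingPods, nodeTemplate and nodeGroup
// stand for caller-provided values:
//
//	estimator := NewBinpackingNodeEstimator(predicateChecker, clusterSnapshot, limiter)
//	nodeCount, scheduledPods := estimator.Estimate(pendingPods, nodeTemplate, nodeGroup)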

// Estimate implements the First Fit Decreasing bin-packing approximation algorithm.
// See https://en.wikipedia.org/wiki/Bin_packing_problem for more details.
// While this is multi-dimensional bin packing (CPU, memory, ports), in most cases the main
// dimension will be CPU, so the estimated overprovisioning bound of 11/9 * optimal + 6/9
// should still be maintained.
// It is assumed that all pods from the given list can fit on a node created from nodeTemplate.
// Returns the number of nodes needed to accommodate all pods from the list, along with the
// pods that were successfully scheduled onto those nodes.
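//
// As a single-dimension illustration with hypothetical numbers (not taken from this code):
// packing pods that request 300m, 400m, 500m and 600m CPU onto 1000m-allocatable nodes in
// that order opens three nodes (300m+400m, 500m, 600m), whereas the decreasing order used
// here needs only two (600m+400m, 500m+300m).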
func (e *BinpackingNodeEstimator) Estimate(
	pods []*apiv1.Pod,
	nodeTemplate *schedulerframework.NodeInfo,
	nodeGroup cloudprovider.NodeGroup) (int, []*apiv1.Pod) {

	e.limiter.StartEstimation(pods, nodeGroup)
	defer e.limiter.EndEstimation()
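
	// Compute a size score for each pod and process pods in descending order of score:
	// the "Decreasing" half of First Fit Decreasing.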
	podInfos := calculatePodScore(pods, nodeTemplate)
	sort.Slice(podInfos, func(i, j int) bool { return podInfos[i].score > podInfos[j].score })

	newNodeNames := make(map[string]bool)
	newNodesWithPods := make(map[string]bool)
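
	// All simulated placements happen on a fork of the cluster snapshot; the deferred
	// Revert guarantees the estimation leaves the real snapshot untouched.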
	e.clusterSnapshot.Fork()
	defer func() {
		e.clusterSnapshot.Revert()
	}()

	newNodeNameIndex := 0
	scheduledPods := []*apiv1.Pod{}
	lastNodeName := ""
	for _, podInfo := range podInfos {
		found := false

		nodeName, err := e.predicateChecker.FitsAnyNodeMatching(e.clusterSnapshot, podInfo.pod, func(nodeInfo *schedulerframework.NodeInfo) bool {
			return newNodeNames[nodeInfo.Node().Name]
		})
		if err == nil {
			found = true
			if err := e.clusterSnapshot.AddPod(podInfo.pod, nodeName); err != nil {
				klog.Errorf("Error adding pod %v.%v to node %v in ClusterSnapshot; %v", podInfo.pod.Namespace, podInfo.pod.Name, nodeName, err)
				return 0, nil
			}
			scheduledPods = append(scheduledPods, podInfo.pod)
			newNodesWithPods[nodeName] = true
		}

		if !found {
			// Stop binpacking if we reach the limit of nodes we can add.
			// We return the result of the binpacking that we already performed.
			if !e.limiter.PermissionToAddNode() {
				break
			}

			// If the last node we added is still empty and the pod couldn't schedule on it,
			// it wouldn't be able to schedule on a fresh node either. There is no point in
			// adding more nodes to the snapshot in that case, especially given the performance
			// cost each extra node adds to future FitsAnyNodeMatching calls.
			if lastNodeName != "" && !newNodesWithPods[lastNodeName] {
				continue
			}

			// Add new node
			newNodeName, err := e.addNewNodeToSnapshot(nodeTemplate, newNodeNameIndex)
			if err != nil {
				klog.Errorf("Error while adding new node for template to ClusterSnapshot; %v", err)
				return 0, nil
			}
			newNodeNameIndex++
			newNodeNames[newNodeName] = true
			lastNodeName = newNodeName

			// And try to schedule the pod to it.
			// Note that this may still fail (e.g. if topology spreading with a zonal
			// topologyKey is used); in that case we can't help the pending pod. We keep the
			// node in the cluster snapshot to avoid adding and removing a node from the
			// snapshot for each such pod.
			if err := e.predicateChecker.CheckPredicates(e.clusterSnapshot, podInfo.pod, newNodeName); err != nil {
				continue
			}
			if err := e.clusterSnapshot.AddPod(podInfo.pod, newNodeName); err != nil {
				klog.Errorf("Error adding pod %v.%v to node %v in ClusterSnapshot; %v", podInfo.pod.Namespace, podInfo.pod.Name, newNodeName, err)
				return 0, nil
			}
			newNodesWithPods[newNodeName] = true
			scheduledPods = append(scheduledPods, podInfo.pod)
		}
	}
	// Only nodes that actually received at least one pod are counted.
	return len(newNodesWithPods), scheduledPods
}
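
// addNewNodeToSnapshot adds a deep copy of the template node to the cluster snapshot under a
// fresh name ("e-<index>"), together with any pods already present on the template (typically
// DaemonSet pods), and returns the generated node name.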
func (e *BinpackingNodeEstimator) addNewNodeToSnapshot(
	template *schedulerframework.NodeInfo,
	nameIndex int) (string, error) {

	newNodeInfo := scheduler.DeepCopyTemplateNode(template, fmt.Sprintf("e-%d", nameIndex))
	var pods []*apiv1.Pod
	for _, podInfo := range newNodeInfo.Pods {
		pods = append(pods, podInfo.Pod)
	}
	if err := e.clusterSnapshot.AddNodeWithPods(newNodeInfo.Node(), pods); err != nil {
		return "", err
	}
	return newNodeInfo.Node().Name, nil
}

// calculatePodScore calculates a score for each pod and returns the resulting podInfo slice.
// Score is defined as cpu_sum/node_capacity + mem_sum/node_capacity.
// Pods that have bigger requirements should be processed first, and thus have higher scores.
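//
// For example (illustrative numbers, not from this code): on a template node with 4000m CPU
// and 16Gi memory allocatable, a pod requesting 1000m CPU and 4Gi memory scores
// 1000/4000 + 4/16 = 0.5.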
func calculatePodScore(pods []*apiv1.Pod, nodeTemplate *schedulerframework.NodeInfo) []*podInfo {
	podInfos := make([]*podInfo, 0, len(pods))

	for _, pod := range pods {
		cpuSum := resource.Quantity{}
		memorySum := resource.Quantity{}

		for _, container := range pod.Spec.Containers {
			if request, ok := container.Resources.Requests[apiv1.ResourceCPU]; ok {
				cpuSum.Add(request)
			}
			if request, ok := container.Resources.Requests[apiv1.ResourceMemory]; ok {
				memorySum.Add(request)
			}
		}
		score := float64(0)
		if cpuAllocatable, ok := nodeTemplate.Node().Status.Allocatable[apiv1.ResourceCPU]; ok && cpuAllocatable.MilliValue() > 0 {
			score += float64(cpuSum.MilliValue()) / float64(cpuAllocatable.MilliValue())
		}
		if memAllocatable, ok := nodeTemplate.Node().Status.Allocatable[apiv1.ResourceMemory]; ok && memAllocatable.Value() > 0 {
			score += float64(memorySum.Value()) / float64(memAllocatable.Value())
		}

		podInfos = append(podInfos, &podInfo{
			score: score,
			pod:   pod,
		})
	}
	return podInfos
}