Introduce binpacking optimization for similar pods.

The optimization uses the fact that pods which are equivalent do not
need to be checked multiple times against already filled nodes.
This changes the time complexity from O(pods*nodes) to O(pods).
This commit is contained in:
Daniel Gutowski 2024-03-28 09:43:51 -07:00
parent 5d0c973652
commit 5aa6b2cb07
6 changed files with 335 additions and 165 deletions

View File

@ -696,10 +696,10 @@ func (o *ScaleUpOrchestrator) ComputeSimilarNodeGroups(
func matchingSchedulablePodGroups(podGroups []estimator.PodEquivalenceGroup, similarPodGroups []estimator.PodEquivalenceGroup) bool {
schedulableSamplePods := make(map[*apiv1.Pod]bool)
for _, podGroup := range similarPodGroups {
schedulableSamplePods[podGroup.Pods[0]] = true
schedulableSamplePods[podGroup.Exemplar()] = true
}
for _, podGroup := range podGroups {
if _, found := schedulableSamplePods[podGroup.Pods[0]]; !found {
if _, found := schedulableSamplePods[podGroup.Exemplar()]; !found {
return false
}
}

View File

@ -36,6 +36,16 @@ type BinpackingNodeEstimator struct {
podOrderer EstimationPodOrderer
context EstimationContext
estimationAnalyserFunc EstimationAnalyserFunc // optional
}
// estimationState bundles the mutable bookkeeping of a single estimation run so that it
// can be handed to the helper functions as one value instead of copying each variable
// into every function independently.
type estimationState struct {
// scheduledPods collects every pod that was added to a node during the estimation.
scheduledPods []*apiv1.Pod
// newNodeNameIndex is the numeric suffix used when naming the next simulated node.
newNodeNameIndex int
// lastNodeName is the name of the most recently added simulated node; it is empty
// until the first node has been added.
lastNodeName string
// newNodeNames is the set of names of the nodes added during the estimation.
newNodeNames map[string]bool
// newNodesWithPods is the set of node names onto which at least one pod was added.
newNodesWithPods map[string]bool
}
// NewBinpackingNodeEstimator builds a new BinpackingNodeEstimator.
@ -57,6 +67,16 @@ func NewBinpackingNodeEstimator(
}
}
// newEstimationState returns a fresh estimationState: the pod list and both node sets
// are empty but non-nil, the node name index starts at zero and no last node is recorded.
func newEstimationState() *estimationState {
	state := new(estimationState)
	// The index and the last node name keep their zero values (0 and "").
	state.scheduledPods = make([]*apiv1.Pod, 0)
	state.newNodeNames = make(map[string]bool)
	state.newNodesWithPods = make(map[string]bool)
	return state
}
// Estimate implements First-Fit bin-packing approximation algorithm
// The ordering of the pods depends on the EstimationPodOrderer; the default
// order is DecreasingPodOrderer
@ -70,105 +90,149 @@ func NewBinpackingNodeEstimator(
func (e *BinpackingNodeEstimator) Estimate(
podsEquivalenceGroups []PodEquivalenceGroup,
nodeTemplate *schedulerframework.NodeInfo,
nodeGroup cloudprovider.NodeGroup) (int, []*apiv1.Pod) {
nodeGroup cloudprovider.NodeGroup,
) (int, []*apiv1.Pod) {
e.limiter.StartEstimation(podsEquivalenceGroups, nodeGroup, e.context)
defer e.limiter.EndEstimation()
podsEquivalenceGroups = e.podOrderer.Order(podsEquivalenceGroups, nodeTemplate, nodeGroup)
newNodeNames := make(map[string]bool)
newNodesWithPods := make(map[string]bool)
e.clusterSnapshot.Fork()
defer func() {
e.clusterSnapshot.Revert()
}()
newNodeNameIndex := 0
scheduledPods := []*apiv1.Pod{}
lastNodeName := ""
estimationState := newEstimationState()
for _, podsEquivalenceGroup := range podsEquivalenceGroups {
for _, pod := range podsEquivalenceGroup.Pods {
found := false
var err error
var remainingPods []*apiv1.Pod
nodeName, err := e.predicateChecker.FitsAnyNodeMatching(e.clusterSnapshot, pod, func(nodeInfo *schedulerframework.NodeInfo) bool {
return newNodeNames[nodeInfo.Node().Name]
})
if err == nil {
found = true
if err := e.clusterSnapshot.AddPod(pod, nodeName); err != nil {
klog.Errorf("Error adding pod %v.%v to node %v in ClusterSnapshot; %v", pod.Namespace, pod.Name, nodeName, err)
return 0, nil
}
scheduledPods = append(scheduledPods, pod)
newNodesWithPods[nodeName] = true
}
remainingPods, err = e.tryToScheduleOnExistingNodes(estimationState, podsEquivalenceGroup.Pods)
if err != nil {
klog.Errorf(err.Error())
return 0, nil
}
if !found {
// If the last node we've added is empty and the pod couldn't schedule on it, it wouldn't be able to schedule
// on a new node either. There is no point adding more nodes to snapshot in such case, especially because of
// performance cost each extra node adds to future FitsAnyNodeMatching calls.
if lastNodeName != "" && !newNodesWithPods[lastNodeName] {
continue
}
// Stop binpacking if we reach the limit of nodes we can add.
// We return the result of the binpacking that we already performed.
//
// The thresholdBasedEstimationLimiter implementation assumes that for
// each call that returns true, one node gets added. Therefore this
// must be the last check right before really adding a node.
if !e.limiter.PermissionToAddNode() {
break
}
// Add new node
newNodeName, err := e.addNewNodeToSnapshot(nodeTemplate, newNodeNameIndex)
if err != nil {
klog.Errorf("Error while adding new node for template to ClusterSnapshot; %v", err)
return 0, nil
}
newNodeNameIndex++
newNodeNames[newNodeName] = true
lastNodeName = newNodeName
// And try to schedule pod to it.
// Note that this may still fail (ex. if topology spreading with zonal topologyKey is used);
// in this case we can't help the pending pod. We keep the node in clusterSnapshot to avoid
// adding and removing node to snapshot for each such pod.
if err := e.predicateChecker.CheckPredicates(e.clusterSnapshot, pod, newNodeName); err != nil {
continue
}
if err := e.clusterSnapshot.AddPod(pod, newNodeName); err != nil {
klog.Errorf("Error adding pod %v.%v to node %v in ClusterSnapshot; %v", pod.Namespace, pod.Name, newNodeName, err)
return 0, nil
}
newNodesWithPods[newNodeName] = true
scheduledPods = append(scheduledPods, pod)
}
err = e.tryToScheduleOnNewNodes(estimationState, nodeTemplate, remainingPods)
if err != nil {
klog.Errorf(err.Error())
return 0, nil
}
}
if e.estimationAnalyserFunc != nil {
e.estimationAnalyserFunc(e.clusterSnapshot, nodeGroup, newNodesWithPods)
e.estimationAnalyserFunc(e.clusterSnapshot, nodeGroup, estimationState.newNodesWithPods)
}
return len(estimationState.newNodesWithPods), estimationState.scheduledPods
}
return len(newNodesWithPods), scheduledPods
// tryToScheduleOnExistingNodes places pods, in order, on nodes that were already added
// to the snapshot during this estimation.
//
// Every pod is checked with FitsAnyNodeMatching, restricted to the node names recorded
// in estimationState.newNodeNames. Processing stops at the first pod for which that
// check returns an error: this pod and all pods after it are returned as the remainder
// without being checked against those nodes again.
//
// A non-nil error is returned only when adding a pod to the snapshot fails; the
// returned slice is nil in that case.
func (e *BinpackingNodeEstimator) tryToScheduleOnExistingNodes(
	estimationState *estimationState,
	pods []*apiv1.Pod,
) ([]*apiv1.Pod, error) {
	// Only nodes created during the simulation are candidates.
	createdDuringEstimation := func(nodeInfo *schedulerframework.NodeInfo) bool {
		return estimationState.newNodeNames[nodeInfo.Node().Name]
	}

	for i, pod := range pods {
		nodeName, err := e.predicateChecker.FitsAnyNodeMatching(e.clusterSnapshot, pod, createdDuringEstimation)
		if err != nil {
			// No simulated node accepts this pod; hand it and the rest back to the caller.
			return pods[i:], nil
		}
		if err := e.tryToAddNode(estimationState, pod, nodeName); err != nil {
			return nil, err
		}
	}

	// Every pod was placed, so the remainder is empty.
	return pods[len(pods):], nil
}
// tryToScheduleOnNewNodes places pods, in order, on the most recently added simulated
// node and adds further nodes built from nodeTemplate when that node cannot take a pod.
//
// Processing of the remaining pods stops early, without an error, when:
//   - the last added node is still empty and rejected the current pod,
//   - the limiter refuses permission to add another node, or
//   - a freshly added node rejects the current pod.
//
// A non-nil error is returned when a node cannot be added to the snapshot or when
// adding a pod to a node fails.
func (e *BinpackingNodeEstimator) tryToScheduleOnNewNodes(
	estimationState *estimationState,
	nodeTemplate *schedulerframework.NodeInfo,
	pods []*apiv1.Pod,
) error {
	for _, pod := range pods {
		if last := estimationState.lastNodeName; last != "" {
			// Check schedulability on only the newly created node.
			if e.predicateChecker.CheckPredicates(e.clusterSnapshot, pod, last) == nil {
				if err := e.tryToAddNode(estimationState, pod, last); err != nil {
					return err
				}
				continue
			}

			// If the last node we've added is empty and the pod couldn't schedule on it, it
			// wouldn't be able to schedule on a new node either. There is no point adding
			// more nodes to the snapshot in such a case, especially because of the
			// performance cost each extra node adds to future FitsAnyNodeMatching calls.
			if !estimationState.newNodesWithPods[last] {
				break
			}
		}

		// Stop binpacking if we reach the limit of nodes we can add.
		// We return the result of the binpacking that we already performed.
		//
		// The thresholdBasedEstimationLimiter implementation assumes that for
		// each call that returns true, one node gets added. Therefore this
		// must be the last check right before really adding a node.
		if !e.limiter.PermissionToAddNode() {
			break
		}

		// Add a new node; on success it becomes estimationState.lastNodeName.
		if err := e.addNewNodeToSnapshot(estimationState, nodeTemplate); err != nil {
			return fmt.Errorf("Error while adding new node for template to ClusterSnapshot; %w", err)
		}
		fresh := estimationState.lastNodeName

		// And try to schedule the pod on it.
		// Note that this may still fail (ex. if topology spreading with zonal topologyKey is used);
		// in this case we can't help the pending pod. We keep the node in clusterSnapshot to avoid
		// adding and removing a node to the snapshot for each such pod.
		if e.predicateChecker.CheckPredicates(e.clusterSnapshot, pod, fresh) != nil {
			break
		}
		if err := e.tryToAddNode(estimationState, pod, fresh); err != nil {
			return err
		}
	}
	return nil
}
func (e *BinpackingNodeEstimator) addNewNodeToSnapshot(
estimationState *estimationState,
template *schedulerframework.NodeInfo,
nameIndex int) (string, error) {
newNodeInfo := scheduler.DeepCopyTemplateNode(template, fmt.Sprintf("e-%d", nameIndex))
) error {
newNodeInfo := scheduler.DeepCopyTemplateNode(template, fmt.Sprintf("e-%d", estimationState.newNodeNameIndex))
var pods []*apiv1.Pod
for _, podInfo := range newNodeInfo.Pods {
pods = append(pods, podInfo.Pod)
}
if err := e.clusterSnapshot.AddNodeWithPods(newNodeInfo.Node(), pods); err != nil {
return "", err
return err
}
return newNodeInfo.Node().Name, nil
estimationState.newNodeNameIndex++
estimationState.lastNodeName = newNodeInfo.Node().Name
estimationState.newNodeNames[estimationState.lastNodeName] = true
return nil
}
// tryToAddNode adds pod to the node called nodeName in the cluster snapshot and records
// the placement in estimationState. Despite its name it does not create a node.
//
// On success the node is marked in estimationState.newNodesWithPods and the pod is
// appended to estimationState.scheduledPods. If the snapshot rejects the pod, the state
// is left unchanged and the returned error wraps the snapshot error, so callers can
// still inspect the cause with errors.Is / errors.As.
func (e *BinpackingNodeEstimator) tryToAddNode(
	estimationState *estimationState,
	pod *apiv1.Pod,
	nodeName string,
) error {
	if err := e.clusterSnapshot.AddPod(pod, nodeName); err != nil {
		// %w (rather than %v) keeps the underlying error in the chain; the rendered
		// message is identical.
		return fmt.Errorf("Error adding pod %v.%v to node %v in ClusterSnapshot; %w", pod.Namespace, pod.Name, nodeName, err)
	}
	estimationState.newNodesWithPods[nodeName] = true
	estimationState.scheduledPods = append(estimationState.scheduledPods, pod)
	return nil
}

View File

@ -32,49 +32,7 @@ import (
"github.com/stretchr/testify/assert"
)
func makePodEquivalenceGroup(cpuPerPod int64, memoryPerPod int64, hostport int32, maxSkew int32, topologySpreadingKey string, podCount int) PodEquivalenceGroup {
pod := &apiv1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: "estimatee",
Namespace: "universe",
Labels: map[string]string{
"app": "estimatee",
},
},
Spec: apiv1.PodSpec{
Containers: []apiv1.Container{
{
Resources: apiv1.ResourceRequirements{
Requests: apiv1.ResourceList{
apiv1.ResourceCPU: *resource.NewMilliQuantity(cpuPerPod, resource.DecimalSI),
apiv1.ResourceMemory: *resource.NewQuantity(memoryPerPod*units.MiB, resource.DecimalSI),
},
},
},
},
},
}
if hostport > 0 {
pod.Spec.Containers[0].Ports = []apiv1.ContainerPort{
{
HostPort: hostport,
},
}
}
if maxSkew > 0 {
pod.Spec.TopologySpreadConstraints = []apiv1.TopologySpreadConstraint{
{
MaxSkew: maxSkew,
TopologyKey: topologySpreadingKey,
WhenUnsatisfiable: "DoNotSchedule",
LabelSelector: &metav1.LabelSelector{
MatchLabels: map[string]string{
"app": "estimatee",
},
},
},
}
}
func makePodEquivalenceGroup(pod *apiv1.Pod, podCount int) PodEquivalenceGroup {
pods := []*apiv1.Pod{}
for i := 0; i < podCount; i++ {
pods = append(pods, pod)
@ -107,7 +65,18 @@ func makeNode(cpu, mem, podCount int64, name string, zone string) *apiv1.Node {
}
func TestBinpackingEstimate(t *testing.T) {
highResourcePodGroup := makePodEquivalenceGroup(500, 1000, 0, 0, "", 10)
highResourcePodGroup := makePodEquivalenceGroup(
BuildTestPod(
"estimatee",
500,
1000,
WithNamespace("universe"),
WithLabels(map[string]string{
"app": "estimatee",
}),
),
10,
)
testCases := []struct {
name string
millicores int64
@ -120,63 +89,122 @@ func TestBinpackingEstimate(t *testing.T) {
expectProcessedPods []*apiv1.Pod
}{
{
name: "simple resource-based binpacking",
millicores: 350*3 - 50,
memory: 2 * 1000,
podsEquivalenceGroup: []PodEquivalenceGroup{makePodEquivalenceGroup(350, 1000, 0, 0, "", 10)},
expectNodeCount: 5,
expectPodCount: 10,
name: "simple resource-based binpacking",
millicores: 350*3 - 50,
memory: 2 * 1000,
podsEquivalenceGroup: []PodEquivalenceGroup{makePodEquivalenceGroup(
BuildTestPod(
"estimatee",
350,
1000,
WithNamespace("universe"),
WithLabels(map[string]string{
"app": "estimatee",
})), 10)},
expectNodeCount: 5,
expectPodCount: 10,
},
{
name: "pods-per-node bound binpacking",
millicores: 10000,
memory: 20000,
podsEquivalenceGroup: []PodEquivalenceGroup{makePodEquivalenceGroup(10, 100, 0, 0, "", 20)},
expectNodeCount: 2,
expectPodCount: 20,
name: "pods-per-node bound binpacking",
millicores: 10000,
memory: 20000,
podsEquivalenceGroup: []PodEquivalenceGroup{makePodEquivalenceGroup(
BuildTestPod(
"estimatee",
10,
100,
WithNamespace("universe"),
WithLabels(map[string]string{
"app": "estimatee",
})), 20)},
expectNodeCount: 2,
expectPodCount: 20,
},
{
name: "hostport conflict forces pod-per-node",
millicores: 1000,
memory: 5000,
podsEquivalenceGroup: []PodEquivalenceGroup{makePodEquivalenceGroup(200, 1000, 5555, 0, "", 8)},
expectNodeCount: 8,
expectPodCount: 8,
name: "hostport conflict forces pod-per-node",
millicores: 1000,
memory: 5000,
podsEquivalenceGroup: []PodEquivalenceGroup{makePodEquivalenceGroup(
BuildTestPod(
"estimatee",
200,
1000,
WithNamespace("universe"),
WithLabels(map[string]string{
"app": "estimatee",
}),
WithHostPort(5555)), 8)},
expectNodeCount: 8,
expectPodCount: 8,
},
{
name: "limiter cuts binpacking",
millicores: 1000,
memory: 5000,
podsEquivalenceGroup: []PodEquivalenceGroup{makePodEquivalenceGroup(500, 1000, 0, 0, "", 20)},
maxNodes: 5,
expectNodeCount: 5,
expectPodCount: 10,
name: "limiter cuts binpacking",
millicores: 1000,
memory: 5000,
podsEquivalenceGroup: []PodEquivalenceGroup{makePodEquivalenceGroup(
BuildTestPod(
"estimatee",
500,
1000,
WithNamespace("universe"),
WithLabels(map[string]string{
"app": "estimatee",
})), 20)},
maxNodes: 5,
expectNodeCount: 5,
expectPodCount: 10,
},
{
name: "decreasing ordered pods are processed first",
millicores: 1000,
memory: 5000,
podsEquivalenceGroup: append([]PodEquivalenceGroup{makePodEquivalenceGroup(50, 1000, 0, 0, "", 10)}, highResourcePodGroup),
maxNodes: 5,
expectNodeCount: 5,
expectPodCount: 10,
expectProcessedPods: highResourcePodGroup.Pods,
name: "decreasing ordered pods are processed first",
millicores: 1000,
memory: 5000,
podsEquivalenceGroup: append([]PodEquivalenceGroup{makePodEquivalenceGroup(
BuildTestPod(
"estimatee",
50,
1000,
WithNamespace("universe"),
WithLabels(map[string]string{
"app": "estimatee",
})), 10)}, highResourcePodGroup),
maxNodes: 5,
expectNodeCount: 5,
expectPodCount: 10,
expectProcessedPods: highResourcePodGroup.Pods,
},
{
name: "hostname topology spreading with maxSkew=2 forces 2 pods/node",
millicores: 1000,
memory: 5000,
podsEquivalenceGroup: []PodEquivalenceGroup{makePodEquivalenceGroup(20, 100, 0, 2, "kubernetes.io/hostname", 8)},
expectNodeCount: 4,
expectPodCount: 8,
name: "hostname topology spreading with maxSkew=2 forces 2 pods/node",
millicores: 1000,
memory: 5000,
podsEquivalenceGroup: []PodEquivalenceGroup{makePodEquivalenceGroup(
BuildTestPod(
"estimatee",
20,
100,
WithNamespace("universe"),
WithLabels(map[string]string{
"app": "estimatee",
}),
WithMaxSkew(2, "kubernetes.io/hostname")), 8)},
expectNodeCount: 4,
expectPodCount: 8,
},
{
name: "zonal topology spreading with maxSkew=2 only allows 2 pods to schedule",
millicores: 1000,
memory: 5000,
podsEquivalenceGroup: []PodEquivalenceGroup{makePodEquivalenceGroup(20, 100, 0, 2, "topology.kubernetes.io/zone", 8)},
expectNodeCount: 1,
expectPodCount: 2,
name: "zonal topology spreading with maxSkew=2 only allows 2 pods to schedule",
millicores: 1000,
memory: 5000,
podsEquivalenceGroup: []PodEquivalenceGroup{makePodEquivalenceGroup(
BuildTestPod(
"estimatee",
20,
100,
WithNamespace("universe"),
WithLabels(map[string]string{
"app": "estimatee",
}),
WithMaxSkew(2, "topology.kubernetes.io/zone")), 8)},
expectNodeCount: 1,
expectPodCount: 2,
},
}
for _, tc := range testCases {
@ -211,7 +239,30 @@ func BenchmarkBinpackingEstimate(b *testing.B) {
maxNodes := 3000
expectNodeCount := 2595
expectPodCount := 51000
podsEquivalenceGroup := []PodEquivalenceGroup{makePodEquivalenceGroup(50, 100, 0, 0, "", 50000), makePodEquivalenceGroup(95, 190, 0, 0, "", 1000)}
podsEquivalenceGroup := []PodEquivalenceGroup{
makePodEquivalenceGroup(
BuildTestPod(
"estimatee",
50,
100,
WithNamespace("universe"),
WithLabels(map[string]string{
"app": "estimatee",
})),
50000,
),
makePodEquivalenceGroup(
BuildTestPod(
"estimatee",
95,
190,
WithNamespace("universe"),
WithLabels(map[string]string{
"app": "estimatee",
})),
1000,
),
}
for i := 0; i < b.N; i++ {
clusterSnapshot := clustersnapshot.NewBasicClusterSnapshot()

View File

@ -60,14 +60,14 @@ func (d *DecreasingPodOrderer) Order(podsEquivalentGroups []PodEquivalenceGroup,
// Score is defined as cpu_sum/node_capacity + mem_sum/node_capacity.
// Pods that have bigger requirements should be processed first, thus have higher scores.
func (d *DecreasingPodOrderer) calculatePodScore(podsEquivalentGroup PodEquivalenceGroup, nodeTemplate *framework.NodeInfo) *podScoreInfo {
if len(podsEquivalentGroup.Pods) == 0 {
samplePod := podsEquivalentGroup.Exemplar()
if samplePod == nil {
return &podScoreInfo{
score: 0,
podsEquivalentGroup: podsEquivalentGroup,
}
}
samplePod := podsEquivalentGroup.Pods[0]
cpuSum := resource.Quantity{}
memorySum := resource.Quantity{}

View File

@ -41,6 +41,14 @@ type PodEquivalenceGroup struct {
Pods []*apiv1.Pod
}
// Exemplar returns the first pod of the group as its representative,
// or nil when the group contains no pods.
func (p *PodEquivalenceGroup) Exemplar() *apiv1.Pod {
	if len(p.Pods) > 0 {
		return p.Pods[0]
	}
	return nil
}
// Estimator calculates the number of nodes of given type needed to schedule pods.
// It returns the number of new nodes needed as well as the list of pods it managed
// to schedule on those nodes.

View File

@ -103,6 +103,53 @@ func WithNodeName(nodeName string) func(*apiv1.Pod) {
}
}
// WithNamespace returns a pod modifier that sets the pod's namespace.
func WithNamespace(namespace string) func(*apiv1.Pod) {
	return func(pod *apiv1.Pod) {
		pod.Namespace = namespace
	}
}
// WithLabels returns a pod modifier that replaces the pod's labels with the given map.
// The map is stored as-is, not copied.
func WithLabels(labels map[string]string) func(*apiv1.Pod) {
	return func(pod *apiv1.Pod) {
		pod.Labels = labels
	}
}
// WithHostPort returns a pod modifier that gives the pod's first container a single
// port entry bound to the given host port, replacing any ports set on that container.
// A non-positive hostport leaves the pod untouched.
// The pod must already have at least one container: Containers[0] is indexed directly.
func WithHostPort(hostport int32) func(*apiv1.Pod) {
	return func(pod *apiv1.Pod) {
		if hostport <= 0 {
			return
		}
		port := apiv1.ContainerPort{HostPort: hostport}
		pod.Spec.Containers[0].Ports = []apiv1.ContainerPort{port}
	}
}
// WithMaxSkew returns a pod modifier that replaces the pod's topology spread constraints
// with a single "DoNotSchedule" constraint using the given maxSkew and topology key.
// A non-positive maxSkew leaves the pod untouched.
// The constraint's label selector is hard-coded to match the label app=estimatee.
func WithMaxSkew(maxSkew int32, topologySpreadingKey string) func(*apiv1.Pod) {
	return func(pod *apiv1.Pod) {
		if maxSkew <= 0 {
			return
		}
		selector := &metav1.LabelSelector{
			MatchLabels: map[string]string{"app": "estimatee"},
		}
		constraint := apiv1.TopologySpreadConstraint{
			MaxSkew:           maxSkew,
			TopologyKey:       topologySpreadingKey,
			WhenUnsatisfiable: "DoNotSchedule",
			LabelSelector:     selector,
		}
		pod.Spec.TopologySpreadConstraints = []apiv1.TopologySpreadConstraint{constraint}
	}
}
// BuildTestPodWithEphemeralStorage creates a pod with cpu, memory and ephemeral storage resources.
func BuildTestPodWithEphemeralStorage(name string, cpu, mem, ephemeralStorage int64) *apiv1.Pod {
startTime := metav1.Unix(0, 0)