Check if pods fit on the new node in binpacking
Previously we just assumed a pod would always fit on a newly added node during binpacking, because we had already checked earlier in the scale-up logic that the pod fits on an empty template node. This assumption is incorrect: it doesn't take into account the impact of other scheduling decisions already made during binpacking. For pods using zonal Filters (such as PodTopologySpread with a zonal topology key), the pod may no longer be able to schedule even on an empty node as a result of those earlier decisions.
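To make the failure mode concrete, the sketch below (illustrative only, not part of this commit) builds the kind of pod that can hit it: a pod with a PodTopologySpread constraint keyed on the zone label, much like the one the new makePods test helper constructs. The pod and label names are placeholders.

// Illustrative sketch only (not part of this commit): a pod with a zonal
// PodTopologySpread constraint. Once binpacking has placed pods in one zone,
// the PodTopologySpread Filter can reject such a pod on every node in another
// zone, even an empty one. Names ("example") are placeholders.
package main

import (
	"fmt"

	apiv1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func main() {
	pod := &apiv1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name:   "example",
			Labels: map[string]string{"app": "example"},
		},
		Spec: apiv1.PodSpec{
			TopologySpreadConstraints: []apiv1.TopologySpreadConstraint{
				{
					// Hard constraint: a pod may not be placed where it would
					// push the per-zone skew above maxSkew.
					MaxSkew:           1,
					TopologyKey:       "topology.kubernetes.io/zone",
					WhenUnsatisfiable: apiv1.DoNotSchedule,
					LabelSelector: &metav1.LabelSelector{
						MatchLabels: map[string]string{"app": "example"},
					},
				},
			},
		},
	}
	fmt.Printf("%+v\n", pod.Spec.TopologySpreadConstraints[0])
}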
commit 5342f189f1
parent ab891418f6
@@ -73,6 +73,7 @@ func (e *BinpackingNodeEstimator) Estimate(
 	sort.Slice(podInfos, func(i, j int) bool { return podInfos[i].score > podInfos[j].score })
 
 	newNodeNames := make(map[string]bool)
+	newNodesWithPods := make(map[string]bool)
 
 	if err := e.clusterSnapshot.Fork(); err != nil {
 		klog.Errorf("Error while calling ClusterSnapshot.Fork; %v", err)
@@ -86,6 +87,7 @@ func (e *BinpackingNodeEstimator) Estimate(
 
 	newNodeNameIndex := 0
 	scheduledPods := []*apiv1.Pod{}
+	lastNodeName := ""
 
 	for _, podInfo := range podInfos {
 		found := false
@@ -100,6 +102,7 @@ func (e *BinpackingNodeEstimator) Estimate(
 				return 0, nil
 			}
 			scheduledPods = append(scheduledPods, podInfo.pod)
+			newNodesWithPods[nodeName] = true
 		}
 
 		if !found {
@@ -109,6 +112,13 @@ func (e *BinpackingNodeEstimator) Estimate(
 				break
 			}
 
+			// If the last node we've added is empty and the pod couldn't schedule on it, it wouldn't be able to schedule
+			// on a new node either. There is no point adding more nodes to snapshot in such case, especially because of
+			// performance cost each extra node adds to future FitsAnyNodeMatching calls.
+			if lastNodeName != "" && !newNodesWithPods[lastNodeName] {
+				continue
+			}
+
 			// Add new node
 			newNodeName, err := e.addNewNodeToSnapshot(nodeTemplate, newNodeNameIndex)
 			if err != nil {
@@ -116,16 +126,25 @@ func (e *BinpackingNodeEstimator) Estimate(
 				return 0, nil
 			}
 			newNodeNameIndex++
-			// And schedule pod to it
+			newNodeNames[newNodeName] = true
+			lastNodeName = newNodeName
+
+			// And try to schedule pod to it.
+			// Note that this may still fail (ex. if topology spreading with zonal topologyKey is used);
+			// in this case we can't help the pending pod. We keep the node in clusterSnapshot to avoid
+			// adding and removing node to snapshot for each such pod.
+			if err := e.predicateChecker.CheckPredicates(e.clusterSnapshot, podInfo.pod, newNodeName); err != nil {
+				continue
+			}
 			if err := e.clusterSnapshot.AddPod(podInfo.pod, newNodeName); err != nil {
 				klog.Errorf("Error adding pod %v.%v to node %v in ClusterSnapshot; %v", podInfo.pod.Namespace, podInfo.pod.Name, newNodeName, err)
 				return 0, nil
 			}
-			newNodeNames[newNodeName] = true
+			newNodesWithPods[newNodeName] = true
 			scheduledPods = append(scheduledPods, podInfo.pod)
 		}
 	}
-	return len(newNodeNames), scheduledPods
+	return len(newNodesWithPods), scheduledPods
 }
 
 func (e *BinpackingNodeEstimator) addNewNodeToSnapshot(
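Taken together, the hunks above change the per-pod flow in Estimate in three ways: a brand-new node is still predicate-checked, an empty trailing node short-circuits further node creation, and only nodes that actually received a pod are counted. The toy program below is a heavily simplified paraphrase of that flow (illustrative only, not the autoscaler source; canSchedule is a stand-in for the real PredicateChecker, and the "try existing nodes first" step is elided).

package main

import "fmt"

type pod struct{ name string }

func main() {
	pods := []pod{{"a"}, {"b"}, {"c"}}

	// Stand-in for PredicateChecker.CheckPredicates: arbitrarily rejects pod "c"
	// even on an empty node, the way a zonal spread constraint can once earlier
	// placements exist in the snapshot.
	canSchedule := func(p pod, nodeName string) bool { return p.name != "c" }

	newNodeNames := make(map[string]bool)     // every node added to the snapshot
	newNodesWithPods := make(map[string]bool) // only nodes that received a pod
	lastNodeName := ""

	for i, p := range pods {
		// (Trying to fit the pod on already-added nodes is elided here.)

		// If the most recently added node is still empty, a fresh identical node
		// won't help this pod either (mirrors the lastNodeName check above).
		if lastNodeName != "" && !newNodesWithPods[lastNodeName] {
			continue
		}

		nodeName := fmt.Sprintf("new-node-%d", i)
		newNodeNames[nodeName] = true
		lastNodeName = nodeName

		// The pod may still fail scheduling on the brand-new node.
		if !canSchedule(p, nodeName) {
			continue
		}
		newNodesWithPods[nodeName] = true
	}

	// The estimate counts only nodes that ended up with pods: prints "3 2".
	fmt.Println(len(newNodeNames), len(newNodesWithPods))
}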
@@ -22,6 +22,7 @@ import (
 
 	apiv1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/resource"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/autoscaler/cluster-autoscaler/simulator"
 	. "k8s.io/autoscaler/cluster-autoscaler/utils/test"
 	"k8s.io/autoscaler/cluster-autoscaler/utils/units"
@@ -30,8 +31,15 @@ import (
 	"github.com/stretchr/testify/assert"
 )
 
-func makePods(cpuPerPod int64, memoryPerPod int64, hostport int32, podCount int) []*apiv1.Pod {
+func makePods(cpuPerPod int64, memoryPerPod int64, hostport int32, maxSkew int32, topologySpreadingKey string, podCount int) []*apiv1.Pod {
 	pod := &apiv1.Pod{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      "estimatee",
+			Namespace: "universe",
+			Labels: map[string]string{
+				"app": "estimatee",
+			},
+		},
 		Spec: apiv1.PodSpec{
 			Containers: []apiv1.Container{
 				{
@@ -45,13 +53,27 @@ func makePods(cpuPerPod int64, memoryPerPod int64, hostport int32, podCount int)
 			},
 		},
 	}
-	if hostport != 0 {
+	if hostport > 0 {
 		pod.Spec.Containers[0].Ports = []apiv1.ContainerPort{
 			{
 				HostPort: hostport,
 			},
 		}
 	}
+	if maxSkew > 0 {
+		pod.Spec.TopologySpreadConstraints = []apiv1.TopologySpreadConstraint{
+			{
+				MaxSkew:           maxSkew,
+				TopologyKey:       topologySpreadingKey,
+				WhenUnsatisfiable: "DoNotSchedule",
+				LabelSelector: &metav1.LabelSelector{
+					MatchLabels: map[string]string{
+						"app": "estimatee",
+					},
+				},
+			},
+		}
+	}
 	pods := []*apiv1.Pod{}
 	for i := 0; i < podCount; i++ {
 		pods = append(pods, pod)
@@ -59,21 +81,44 @@ func makePods(cpuPerPod int64, memoryPerPod int64, hostport int32, podCount int)
 	return pods
 }
 
+func makeNode(cpu int64, mem int64, name string, zone string) *apiv1.Node {
+	node := &apiv1.Node{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: name,
+			Labels: map[string]string{
+				"kubernetes.io/hostname":      name,
+				"topology.kubernetes.io/zone": zone,
+			},
+		},
+		Status: apiv1.NodeStatus{
+			Capacity: apiv1.ResourceList{
+				apiv1.ResourceCPU:    *resource.NewMilliQuantity(cpu, resource.DecimalSI),
+				apiv1.ResourceMemory: *resource.NewQuantity(mem*units.MiB, resource.DecimalSI),
+				apiv1.ResourcePods:   *resource.NewQuantity(10, resource.DecimalSI),
+			},
+		},
+	}
+	node.Status.Allocatable = node.Status.Capacity
+	SetNodeReadyState(node, true, time.Time{})
+	return node
+}
+
 func TestBinpackingEstimate(t *testing.T) {
 	testCases := []struct {
-		name            string
-		millicores      int64
-		memory          int64
-		maxNodes        int
-		pods            []*apiv1.Pod
-		expectNodeCount int
-		expectPodCount  int
+		name                 string
+		millicores           int64
+		memory               int64
+		maxNodes             int
+		pods                 []*apiv1.Pod
+		topologySpreadingKey string
+		expectNodeCount      int
+		expectPodCount       int
 	}{
 		{
 			name:            "simple resource-based binpacking",
 			millicores:      350*3 - 50,
 			memory:          2 * 1000,
-			pods:            makePods(350, 1000, 0, 10),
+			pods:            makePods(350, 1000, 0, 0, "", 10),
 			expectNodeCount: 5,
 			expectPodCount:  10,
 		},
@@ -81,7 +126,7 @@ func TestBinpackingEstimate(t *testing.T) {
 			name:            "pods-per-node bound binpacking",
 			millicores:      10000,
 			memory:          20000,
-			pods:            makePods(10, 100, 0, 20),
+			pods:            makePods(10, 100, 0, 0, "", 20),
 			expectNodeCount: 2,
 			expectPodCount:  20,
 		},
@@ -89,7 +134,7 @@ func TestBinpackingEstimate(t *testing.T) {
 			name:            "hostport conflict forces pod-per-node",
 			millicores:      1000,
 			memory:          5000,
-			pods:            makePods(200, 1000, 5555, 8),
+			pods:            makePods(200, 1000, 5555, 0, "", 8),
 			expectNodeCount: 8,
 			expectPodCount:  8,
 		},
@@ -97,41 +142,46 @@ func TestBinpackingEstimate(t *testing.T) {
 			name:            "limiter cuts binpacking",
 			millicores:      1000,
 			memory:          5000,
-			pods:            makePods(500, 1000, 0, 20),
+			pods:            makePods(500, 1000, 0, 0, "", 20),
 			maxNodes:        5,
 			expectNodeCount: 5,
 			expectPodCount:  10,
 		},
+		{
+			name:            "hostname topology spreading with maxSkew=2 forces 2 pods/node",
+			millicores:      1000,
+			memory:          5000,
+			pods:            makePods(20, 100, 0, 2, "kubernetes.io/hostname", 8),
+			expectNodeCount: 4,
+			expectPodCount:  8,
+		},
+		{
+			name:            "zonal topology spreading with maxSkew=2 only allows 2 pods to schedule",
+			millicores:      1000,
+			memory:          5000,
+			pods:            makePods(20, 100, 0, 2, "topology.kubernetes.io/zone", 8),
+			expectNodeCount: 1,
+			expectPodCount:  2,
+		},
 	}
 	for _, tc := range testCases {
 		t.Run(tc.name, func(t *testing.T) {
-			limiter := NewThresholdBasedEstimationLimiter(tc.maxNodes, time.Duration(0))
-			estimator := newBinPackingEstimator(t, limiter)
-			node := &apiv1.Node{
-				Status: apiv1.NodeStatus{
-					Capacity: apiv1.ResourceList{
-						apiv1.ResourceCPU:    *resource.NewMilliQuantity(tc.millicores, resource.DecimalSI),
-						apiv1.ResourceMemory: *resource.NewQuantity(tc.memory*units.MiB, resource.DecimalSI),
-						apiv1.ResourcePods:   *resource.NewQuantity(10, resource.DecimalSI),
-					},
-				},
-			}
-			node.Status.Allocatable = node.Status.Capacity
-			SetNodeReadyState(node, true, time.Time{})
+			clusterSnapshot := simulator.NewBasicClusterSnapshot()
+			// Add one node in different zone to trigger topology spread constraints
+			clusterSnapshot.AddNode(makeNode(100, 100, "oldnode", "zone-jupiter"))
+
+			predicateChecker, err := simulator.NewTestPredicateChecker()
+			assert.NoError(t, err)
+			limiter := NewThresholdBasedEstimationLimiter(tc.maxNodes, time.Duration(0))
+			estimator := NewBinpackingNodeEstimator(predicateChecker, clusterSnapshot, limiter)
+
+			node := makeNode(tc.millicores, tc.memory, "template", "zone-mars")
 			nodeInfo := schedulerframework.NewNodeInfo()
 			nodeInfo.SetNode(node)
 
 			estimatedNodes, estimatedPods := estimator.Estimate(tc.pods, nodeInfo, nil)
 			assert.Equal(t, tc.expectNodeCount, estimatedNodes)
 			assert.Equal(t, tc.expectPodCount, len(estimatedPods))
 		})
 	}
 }
-
-func newBinPackingEstimator(t *testing.T, l EstimationLimiter) *BinpackingNodeEstimator {
-	predicateChecker, err := simulator.NewTestPredicateChecker()
-	clusterSnapshot := simulator.NewBasicClusterSnapshot()
-	assert.NoError(t, err)
-	estimator := NewBinpackingNodeEstimator(predicateChecker, clusterSnapshot, l)
-	return estimator
-}