Introduce binpacking optimization for similar pods.

The optimization uses the fact that pods which are equivalent do not need to be checked multiple times against already filled nodes. This changes the time complexity from O(pods*nodes) to O(pods).
2024-03-28 09:43:51 -07:00 · 2024-03-28 09:43:51 -07:00 · 5aa6b2cb07
parent 5d0c973652
commit 5aa6b2cb07
6 changed files with 335 additions and 165 deletions
--- a/cluster-autoscaler/core/scaleup/orchestrator/orchestrator.go
+++ b/cluster-autoscaler/core/scaleup/orchestrator/orchestrator.go
@ -696,10 +696,10 @@ func (o *ScaleUpOrchestrator) ComputeSimilarNodeGroups(
 func matchingSchedulablePodGroups(podGroups []estimator.PodEquivalenceGroup, similarPodGroups []estimator.PodEquivalenceGroup) bool {
 	schedulableSamplePods := make(map[*apiv1.Pod]bool)
 	for _, podGroup := range similarPodGroups {
-		schedulableSamplePods[podGroup.Pods[0]] = true
+		schedulableSamplePods[podGroup.Exemplar()] = true
 	}
 	for _, podGroup := range podGroups {
-		if _, found := schedulableSamplePods[podGroup.Pods[0]]; !found {
+		if _, found := schedulableSamplePods[podGroup.Exemplar()]; !found {
 			return false
 		}
 	}
--- a/cluster-autoscaler/estimator/binpacking_estimator.go
+++ b/cluster-autoscaler/estimator/binpacking_estimator.go
@ -36,6 +36,16 @@ type BinpackingNodeEstimator struct {
 	podOrderer             EstimationPodOrderer
 	context                EstimationContext
 	estimationAnalyserFunc EstimationAnalyserFunc // optional
+
+}
+
+// estimationState contains helper variables to avoid copying them independently in each function.
+type estimationState struct {
+	scheduledPods    []*apiv1.Pod
+	newNodeNameIndex int
+	lastNodeName     string
+	newNodeNames     map[string]bool
+	newNodesWithPods map[string]bool
 }

 // NewBinpackingNodeEstimator builds a new BinpackingNodeEstimator.
@ -57,6 +67,16 @@ func NewBinpackingNodeEstimator(
 	}
 }

+func newEstimationState() *estimationState {
+	return &estimationState{
+		scheduledPods:    []*apiv1.Pod{},
+		newNodeNameIndex: 0,
+		lastNodeName:     "",
+		newNodeNames:     map[string]bool{},
+		newNodesWithPods: map[string]bool{},
+	}
+}
+
 // Estimate implements First-Fit bin-packing approximation algorithm
 // The ordering of the pods depend on the EstimatePodOrderer, the default
 // order is DecreasingPodOrderer
@ -70,105 +90,149 @@ func NewBinpackingNodeEstimator(
 func (e *BinpackingNodeEstimator) Estimate(
 	podsEquivalenceGroups []PodEquivalenceGroup,
 	nodeTemplate *schedulerframework.NodeInfo,
-	nodeGroup cloudprovider.NodeGroup) (int, []*apiv1.Pod) {
+	nodeGroup cloudprovider.NodeGroup,
+) (int, []*apiv1.Pod) {

 	e.limiter.StartEstimation(podsEquivalenceGroups, nodeGroup, e.context)
 	defer e.limiter.EndEstimation()

 	podsEquivalenceGroups = e.podOrderer.Order(podsEquivalenceGroups, nodeTemplate, nodeGroup)

-	newNodeNames := make(map[string]bool)
-	newNodesWithPods := make(map[string]bool)
-
 	e.clusterSnapshot.Fork()
 	defer func() {
 		e.clusterSnapshot.Revert()
 	}()

-	newNodeNameIndex := 0
-	scheduledPods := []*apiv1.Pod{}
-	lastNodeName := ""
-
+	estimationState := newEstimationState()
 	for _, podsEquivalenceGroup := range podsEquivalenceGroups {
-		for _, pod := range podsEquivalenceGroup.Pods {
-			found := false
+		var err error
+		var remainingPods []*apiv1.Pod

-			nodeName, err := e.predicateChecker.FitsAnyNodeMatching(e.clusterSnapshot, pod, func(nodeInfo *schedulerframework.NodeInfo) bool {
-				return newNodeNames[nodeInfo.Node().Name]
-			})
-			if err == nil {
-				found = true
-				if err := e.clusterSnapshot.AddPod(pod, nodeName); err != nil {
-					klog.Errorf("Error adding pod %v.%v to node %v in ClusterSnapshot; %v", pod.Namespace, pod.Name, nodeName, err)
-					return 0, nil
-				}
-				scheduledPods = append(scheduledPods, pod)
-				newNodesWithPods[nodeName] = true
-			}
+		remainingPods, err = e.tryToScheduleOnExistingNodes(estimationState, podsEquivalenceGroup.Pods)
+		if err != nil {
+			klog.Errorf(err.Error())
+			return 0, nil
+		}

-			if !found {
-				// If the last node we've added is empty and the pod couldn't schedule on it, it wouldn't be able to schedule
-				// on a new node either. There is no point adding more nodes to snapshot in such case, especially because of
-				// performance cost each extra node adds to future FitsAnyNodeMatching calls.
-				if lastNodeName != "" && !newNodesWithPods[lastNodeName] {
-					continue
-				}
-
-				// Stop binpacking if we reach the limit of nodes we can add.
-				// We return the result of the binpacking that we already performed.
-				//
-				// The thresholdBasedEstimationLimiter implementation assumes that for
-				// each call that returns true, one node gets added. Therefore this
-				// must be the last check right before really adding a node.
-				if !e.limiter.PermissionToAddNode() {
-					break
-				}
-
-				// Add new node
-				newNodeName, err := e.addNewNodeToSnapshot(nodeTemplate, newNodeNameIndex)
-				if err != nil {
-					klog.Errorf("Error while adding new node for template to ClusterSnapshot; %v", err)
-					return 0, nil
-				}
-				newNodeNameIndex++
-				newNodeNames[newNodeName] = true
-				lastNodeName = newNodeName
-
-				// And try to schedule pod to it.
-				// Note that this may still fail (ex. if topology spreading with zonal topologyKey is used);
-				// in this case we can't help the pending pod. We keep the node in clusterSnapshot to avoid
-				// adding and removing node to snapshot for each such pod.
-				if err := e.predicateChecker.CheckPredicates(e.clusterSnapshot, pod, newNodeName); err != nil {
-					continue
-				}
-				if err := e.clusterSnapshot.AddPod(pod, newNodeName); err != nil {
-					klog.Errorf("Error adding pod %v.%v to node %v in ClusterSnapshot; %v", pod.Namespace, pod.Name, newNodeName, err)
-					return 0, nil
-				}
-				newNodesWithPods[newNodeName] = true
-				scheduledPods = append(scheduledPods, pod)
-			}
+		err = e.tryToScheduleOnNewNodes(estimationState, nodeTemplate, remainingPods)
+		if err != nil {
+			klog.Errorf(err.Error())
+			return 0, nil
 		}
 	}

 	if e.estimationAnalyserFunc != nil {
-		e.estimationAnalyserFunc(e.clusterSnapshot, nodeGroup, newNodesWithPods)
+		e.estimationAnalyserFunc(e.clusterSnapshot, nodeGroup, estimationState.newNodesWithPods)
 	}
+	return len(estimationState.newNodesWithPods), estimationState.scheduledPods
+}

-	return len(newNodesWithPods), scheduledPods
+func (e *BinpackingNodeEstimator) tryToScheduleOnExistingNodes(
+	estimationState *estimationState,
+	pods []*apiv1.Pod,
+) ([]*apiv1.Pod, error) {
+	var index int
+	for index = 0; index < len(pods); index++ {
+		pod := pods[index]
+
+		// Check schedulability on all nodes created during simulation
+		nodeName, err := e.predicateChecker.FitsAnyNodeMatching(e.clusterSnapshot, pod, func(nodeInfo *schedulerframework.NodeInfo) bool {
+			return estimationState.newNodeNames[nodeInfo.Node().Name]
+		})
+		if err != nil {
+			break
+		}
+
+		if err := e.tryToAddNode(estimationState, pod, nodeName); err != nil {
+			return nil, err
+		}
+	}
+	return pods[index:], nil
+}
+
+func (e *BinpackingNodeEstimator) tryToScheduleOnNewNodes(
+	estimationState *estimationState,
+	nodeTemplate *schedulerframework.NodeInfo,
+	pods []*apiv1.Pod,
+) error {
+	for _, pod := range pods {
+		found := false
+
+		if estimationState.lastNodeName != "" {
+			// Check schedulability only on the most recently created node
+			if err := e.predicateChecker.CheckPredicates(e.clusterSnapshot, pod, estimationState.lastNodeName); err == nil {
+				found = true
+				if err := e.tryToAddNode(estimationState, pod, estimationState.lastNodeName); err != nil {
+					return err
+				}
+			}
+		}
+
+		if !found {
+			// If the last node we've added is empty and the pod couldn't schedule on it, it wouldn't be able to schedule
+			// on a new node either. There is no point adding more nodes to snapshot in such case, especially because of
+			// performance cost each extra node adds to future FitsAnyNodeMatching calls.
+			if estimationState.lastNodeName != "" && !estimationState.newNodesWithPods[estimationState.lastNodeName] {
+				break
+			}
+
+			// Stop binpacking if we reach the limit of nodes we can add.
+			// We return the result of the binpacking that we already performed.
+			//
+			// The thresholdBasedEstimationLimiter implementation assumes that for
+			// each call that returns true, one node gets added. Therefore this
+			// must be the last check right before really adding a node.
+			if !e.limiter.PermissionToAddNode() {
+				break
+			}
+
+			// Add new node
+			if err := e.addNewNodeToSnapshot(estimationState, nodeTemplate); err != nil {
+				return fmt.Errorf("Error while adding new node for template to ClusterSnapshot; %w", err)
+			}
+
+			// And try to schedule pod to it.
+			// Note that this may still fail (ex. if topology spreading with zonal topologyKey is used);
+			// in this case we can't help the pending pod. We keep the node in clusterSnapshot to avoid
+			// adding and removing node to snapshot for each such pod.
+			if err := e.predicateChecker.CheckPredicates(e.clusterSnapshot, pod, estimationState.lastNodeName); err != nil {
+				break
+			}
+			if err := e.tryToAddNode(estimationState, pod, estimationState.lastNodeName); err != nil {
+				return err
+			}
+		}
+	}
+	return nil
 }

 func (e *BinpackingNodeEstimator) addNewNodeToSnapshot(
+	estimationState *estimationState,
 	template *schedulerframework.NodeInfo,
-	nameIndex int) (string, error) {
-
-	newNodeInfo := scheduler.DeepCopyTemplateNode(template, fmt.Sprintf("e-%d", nameIndex))
+) error {
+	newNodeInfo := scheduler.DeepCopyTemplateNode(template, fmt.Sprintf("e-%d", estimationState.newNodeNameIndex))
 	var pods []*apiv1.Pod
 	for _, podInfo := range newNodeInfo.Pods {
 		pods = append(pods, podInfo.Pod)
 	}
 	if err := e.clusterSnapshot.AddNodeWithPods(newNodeInfo.Node(), pods); err != nil {
-		return "", err
+		return err
 	}
-	return newNodeInfo.Node().Name, nil
+	estimationState.newNodeNameIndex++
+	estimationState.lastNodeName = newNodeInfo.Node().Name
+	estimationState.newNodeNames[estimationState.lastNodeName] = true
+	return nil
+}
+
+func (e *BinpackingNodeEstimator) tryToAddNode(
+	estimationState *estimationState,
+	pod *apiv1.Pod,
+	nodeName string,
+) error {
+	if err := e.clusterSnapshot.AddPod(pod, nodeName); err != nil {
+		return fmt.Errorf("Error adding pod %v.%v to node %v in ClusterSnapshot; %v", pod.Namespace, pod.Name, nodeName, err)
+	}
+	estimationState.newNodesWithPods[nodeName] = true
+	estimationState.scheduledPods = append(estimationState.scheduledPods, pod)
+	return nil
 }
--- a/cluster-autoscaler/estimator/binpacking_estimator_test.go
+++ b/cluster-autoscaler/estimator/binpacking_estimator_test.go
@ -32,49 +32,7 @@ import (
 	"github.com/stretchr/testify/assert"
 )

-func makePodEquivalenceGroup(cpuPerPod int64, memoryPerPod int64, hostport int32, maxSkew int32, topologySpreadingKey string, podCount int) PodEquivalenceGroup {
-	pod := &apiv1.Pod{
-		ObjectMeta: metav1.ObjectMeta{
-			Name:      "estimatee",
-			Namespace: "universe",
-			Labels: map[string]string{
-				"app": "estimatee",
-			},
-		},
-		Spec: apiv1.PodSpec{
-			Containers: []apiv1.Container{
-				{
-					Resources: apiv1.ResourceRequirements{
-						Requests: apiv1.ResourceList{
-							apiv1.ResourceCPU:    *resource.NewMilliQuantity(cpuPerPod, resource.DecimalSI),
-							apiv1.ResourceMemory: *resource.NewQuantity(memoryPerPod*units.MiB, resource.DecimalSI),
-						},
-					},
-				},
-			},
-		},
-	}
-	if hostport > 0 {
-		pod.Spec.Containers[0].Ports = []apiv1.ContainerPort{
-			{
-				HostPort: hostport,
-			},
-		}
-	}
-	if maxSkew > 0 {
-		pod.Spec.TopologySpreadConstraints = []apiv1.TopologySpreadConstraint{
-			{
-				MaxSkew:           maxSkew,
-				TopologyKey:       topologySpreadingKey,
-				WhenUnsatisfiable: "DoNotSchedule",
-				LabelSelector: &metav1.LabelSelector{
-					MatchLabels: map[string]string{
-						"app": "estimatee",
-					},
-				},
-			},
-		}
-	}
+func makePodEquivalenceGroup(pod *apiv1.Pod, podCount int) PodEquivalenceGroup {
 	pods := []*apiv1.Pod{}
 	for i := 0; i < podCount; i++ {
 		pods = append(pods, pod)
@ -107,7 +65,18 @@ func makeNode(cpu, mem, podCount int64, name string, zone string) *apiv1.Node {
 }

 func TestBinpackingEstimate(t *testing.T) {
-	highResourcePodGroup := makePodEquivalenceGroup(500, 1000, 0, 0, "", 10)
+	highResourcePodGroup := makePodEquivalenceGroup(
+		BuildTestPod(
+			"estimatee",
+			500,
+			1000,
+			WithNamespace("universe"),
+			WithLabels(map[string]string{
+				"app": "estimatee",
+			}),
+		),
+		10,
+	)
 	testCases := []struct {
 		name                 string
 		millicores           int64
@ -120,63 +89,122 @@ func TestBinpackingEstimate(t *testing.T) {
 		expectProcessedPods  []*apiv1.Pod
 	}{
 		{
-			name:                 "simple resource-based binpacking",
-			millicores:           350*3 - 50,
-			memory:               2 * 1000,
-			podsEquivalenceGroup: []PodEquivalenceGroup{makePodEquivalenceGroup(350, 1000, 0, 0, "", 10)},
-			expectNodeCount:      5,
-			expectPodCount:       10,
+			name:       "simple resource-based binpacking",
+			millicores: 350*3 - 50,
+			memory:     2 * 1000,
+			podsEquivalenceGroup: []PodEquivalenceGroup{makePodEquivalenceGroup(
+				BuildTestPod(
+					"estimatee",
+					350,
+					1000,
+					WithNamespace("universe"),
+					WithLabels(map[string]string{
+						"app": "estimatee",
+					})), 10)},
+			expectNodeCount: 5,
+			expectPodCount:  10,
 		},
 		{
-			name:                 "pods-per-node bound binpacking",
-			millicores:           10000,
-			memory:               20000,
-			podsEquivalenceGroup: []PodEquivalenceGroup{makePodEquivalenceGroup(10, 100, 0, 0, "", 20)},
-			expectNodeCount:      2,
-			expectPodCount:       20,
+			name:       "pods-per-node bound binpacking",
+			millicores: 10000,
+			memory:     20000,
+			podsEquivalenceGroup: []PodEquivalenceGroup{makePodEquivalenceGroup(
+				BuildTestPod(
+					"estimatee",
+					10,
+					100,
+					WithNamespace("universe"),
+					WithLabels(map[string]string{
+						"app": "estimatee",
+					})), 20)},
+			expectNodeCount: 2,
+			expectPodCount:  20,
 		},
 		{
-			name:                 "hostport conflict forces pod-per-node",
-			millicores:           1000,
-			memory:               5000,
-			podsEquivalenceGroup: []PodEquivalenceGroup{makePodEquivalenceGroup(200, 1000, 5555, 0, "", 8)},
-			expectNodeCount:      8,
-			expectPodCount:       8,
+			name:       "hostport conflict forces pod-per-node",
+			millicores: 1000,
+			memory:     5000,
+			podsEquivalenceGroup: []PodEquivalenceGroup{makePodEquivalenceGroup(
+				BuildTestPod(
+					"estimatee",
+					200,
+					1000,
+					WithNamespace("universe"),
+					WithLabels(map[string]string{
+						"app": "estimatee",
+					}),
+					WithHostPort(5555)), 8)},
+			expectNodeCount: 8,
+			expectPodCount:  8,
 		},
 		{
-			name:                 "limiter cuts binpacking",
-			millicores:           1000,
-			memory:               5000,
-			podsEquivalenceGroup: []PodEquivalenceGroup{makePodEquivalenceGroup(500, 1000, 0, 0, "", 20)},
-			maxNodes:             5,
-			expectNodeCount:      5,
-			expectPodCount:       10,
+			name:       "limiter cuts binpacking",
+			millicores: 1000,
+			memory:     5000,
+			podsEquivalenceGroup: []PodEquivalenceGroup{makePodEquivalenceGroup(
+				BuildTestPod(
+					"estimatee",
+					500,
+					1000,
+					WithNamespace("universe"),
+					WithLabels(map[string]string{
+						"app": "estimatee",
+					})), 20)},
+			maxNodes:        5,
+			expectNodeCount: 5,
+			expectPodCount:  10,
 		},
 		{
-			name:                 "decreasing ordered pods are processed first",
-			millicores:           1000,
-			memory:               5000,
-			podsEquivalenceGroup: append([]PodEquivalenceGroup{makePodEquivalenceGroup(50, 1000, 0, 0, "", 10)}, highResourcePodGroup),
-			maxNodes:             5,
-			expectNodeCount:      5,
-			expectPodCount:       10,
-			expectProcessedPods:  highResourcePodGroup.Pods,
+			name:       "decreasing ordered pods are processed first",
+			millicores: 1000,
+			memory:     5000,
+			podsEquivalenceGroup: append([]PodEquivalenceGroup{makePodEquivalenceGroup(
+				BuildTestPod(
+					"estimatee",
+					50,
+					1000,
+					WithNamespace("universe"),
+					WithLabels(map[string]string{
+						"app": "estimatee",
+					})), 10)}, highResourcePodGroup),
+			maxNodes:            5,
+			expectNodeCount:     5,
+			expectPodCount:      10,
+			expectProcessedPods: highResourcePodGroup.Pods,
 		},
 		{
-			name:                 "hostname topology spreading with maxSkew=2 forces 2 pods/node",
-			millicores:           1000,
-			memory:               5000,
-			podsEquivalenceGroup: []PodEquivalenceGroup{makePodEquivalenceGroup(20, 100, 0, 2, "kubernetes.io/hostname", 8)},
-			expectNodeCount:      4,
-			expectPodCount:       8,
+			name:       "hostname topology spreading with maxSkew=2 forces 2 pods/node",
+			millicores: 1000,
+			memory:     5000,
+			podsEquivalenceGroup: []PodEquivalenceGroup{makePodEquivalenceGroup(
+				BuildTestPod(
+					"estimatee",
+					20,
+					100,
+					WithNamespace("universe"),
+					WithLabels(map[string]string{
+						"app": "estimatee",
+					}),
+					WithMaxSkew(2, "kubernetes.io/hostname")), 8)},
+			expectNodeCount: 4,
+			expectPodCount:  8,
 		},
 		{
-			name:                 "zonal topology spreading with maxSkew=2 only allows 2 pods to schedule",
-			millicores:           1000,
-			memory:               5000,
-			podsEquivalenceGroup: []PodEquivalenceGroup{makePodEquivalenceGroup(20, 100, 0, 2, "topology.kubernetes.io/zone", 8)},
-			expectNodeCount:      1,
-			expectPodCount:       2,
+			name:       "zonal topology spreading with maxSkew=2 only allows 2 pods to schedule",
+			millicores: 1000,
+			memory:     5000,
+			podsEquivalenceGroup: []PodEquivalenceGroup{makePodEquivalenceGroup(
+				BuildTestPod(
+					"estimatee",
+					20,
+					100,
+					WithNamespace("universe"),
+					WithLabels(map[string]string{
+						"app": "estimatee",
+					}),
+					WithMaxSkew(2, "topology.kubernetes.io/zone")), 8)},
+			expectNodeCount: 1,
+			expectPodCount:  2,
 		},
 	}
 	for _, tc := range testCases {
@ -211,7 +239,30 @@ func BenchmarkBinpackingEstimate(b *testing.B) {
 	maxNodes := 3000
 	expectNodeCount := 2595
 	expectPodCount := 51000
-	podsEquivalenceGroup := []PodEquivalenceGroup{makePodEquivalenceGroup(50, 100, 0, 0, "", 50000), makePodEquivalenceGroup(95, 190, 0, 0, "", 1000)}
+	podsEquivalenceGroup := []PodEquivalenceGroup{
+		makePodEquivalenceGroup(
+			BuildTestPod(
+				"estimatee",
+				50,
+				100,
+				WithNamespace("universe"),
+				WithLabels(map[string]string{
+					"app": "estimatee",
+				})),
+			50000,
+		),
+		makePodEquivalenceGroup(
+			BuildTestPod(
+				"estimatee",
+				95,
+				190,
+				WithNamespace("universe"),
+				WithLabels(map[string]string{
+					"app": "estimatee",
+				})),
+			1000,
+		),
+	}

 	for i := 0; i < b.N; i++ {
 		clusterSnapshot := clustersnapshot.NewBasicClusterSnapshot()
--- a/cluster-autoscaler/estimator/decreasing_pod_orderer.go
+++ b/cluster-autoscaler/estimator/decreasing_pod_orderer.go
@ -60,14 +60,14 @@ func (d *DecreasingPodOrderer) Order(podsEquivalentGroups []PodEquivalenceGroup,
 // Score is defined as cpu_sum/node_capacity + mem_sum/node_capacity.
 // Pods that have bigger requirements should be processed first, thus have higher scores.
 func (d *DecreasingPodOrderer) calculatePodScore(podsEquivalentGroup PodEquivalenceGroup, nodeTemplate *framework.NodeInfo) *podScoreInfo {
-	if len(podsEquivalentGroup.Pods) == 0 {
+	samplePod := podsEquivalentGroup.Exemplar()
+	if samplePod == nil {
 		return &podScoreInfo{
 			score:               0,
 			podsEquivalentGroup: podsEquivalentGroup,
 		}
 	}

-	samplePod := podsEquivalentGroup.Pods[0]
 	cpuSum := resource.Quantity{}
 	memorySum := resource.Quantity{}

--- a/cluster-autoscaler/estimator/estimator.go
+++ b/cluster-autoscaler/estimator/estimator.go
@ -41,6 +41,14 @@ type PodEquivalenceGroup struct {
 	Pods []*apiv1.Pod
 }

+// Exemplar returns an example pod from the group, or nil if the group has no pods.
+func (p *PodEquivalenceGroup) Exemplar() *apiv1.Pod {
+	if len(p.Pods) == 0 {
+		return nil
+	}
+	return p.Pods[0]
+}
+
 // Estimator calculates the number of nodes of given type needed to schedule pods.
 // It returns the number of new nodes needed as well as the list of pods it managed
 // to schedule on those nodes.
--- a/cluster-autoscaler/utils/test/test_utils.go
+++ b/cluster-autoscaler/utils/test/test_utils.go
@ -103,6 +103,53 @@ func WithNodeName(nodeName string) func(*apiv1.Pod) {
 	}
 }

+// WithNamespace sets a namespace to the pod.
+func WithNamespace(namespace string) func(*apiv1.Pod) {
+	return func(pod *apiv1.Pod) {
+		pod.ObjectMeta.Namespace = namespace
+	}
+}
+
+// WithLabels sets the labels on the pod.
+func WithLabels(labels map[string]string) func(*apiv1.Pod) {
+	return func(pod *apiv1.Pod) {
+		pod.ObjectMeta.Labels = labels
+	}
+}
+
+// WithHostPort sets a host port on the pod's first container; non-positive values are ignored.
+func WithHostPort(hostport int32) func(*apiv1.Pod) {
+	return func(pod *apiv1.Pod) {
+		if hostport > 0 {
+			pod.Spec.Containers[0].Ports = []apiv1.ContainerPort{
+				{
+					HostPort: hostport,
+				},
+			}
+		}
+	}
+}
+
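+// WithMaxSkew sets a DoNotSchedule topology spread constraint with the given max skew and topology key on the pod; non-positive maxSkew is ignored.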
+func WithMaxSkew(maxSkew int32, topologySpreadingKey string) func(*apiv1.Pod) {
+	return func(pod *apiv1.Pod) {
+		if maxSkew > 0 {
+			pod.Spec.TopologySpreadConstraints = []apiv1.TopologySpreadConstraint{
+				{
+					MaxSkew:           maxSkew,
+					TopologyKey:       topologySpreadingKey,
+					WhenUnsatisfiable: "DoNotSchedule",
+					LabelSelector: &metav1.LabelSelector{
+						MatchLabels: map[string]string{
+							"app": "estimatee",
+						},
+					},
+				},
+			}
+		}
+	}
+}
+
 // BuildTestPodWithEphemeralStorage creates a pod with cpu, memory and ephemeral storage resources.
 func BuildTestPodWithEphemeralStorage(name string, cpu, mem, ephemeralStorage int64) *apiv1.Pod {
 	startTime := metav1.Unix(0, 0)