add lifted files
Signed-off-by: Garrybest <garrybest@foxmail.com>
This commit is contained in:
parent
40becff2a1
commit
f52043b447
|
@ -0,0 +1,157 @@
|
||||||
|
// This code is mostly lifted from the Kubernetes codebase to establish the internal cache.
|
||||||
|
// https://github.com/kubernetes/kubernetes/blob/release-1.25/pkg/scheduler/eventhandlers.go
|
||||||
|
|
||||||
|
package server
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
|
||||||
|
corev1 "k8s.io/api/core/v1"
|
||||||
|
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
|
||||||
|
"k8s.io/client-go/informers"
|
||||||
|
"k8s.io/client-go/tools/cache"
|
||||||
|
"k8s.io/klog/v2"
|
||||||
|
)
|
||||||
|
|
||||||
|
func addAllEventHandlers(es *AccurateSchedulerEstimatorServer, informerFactory informers.SharedInformerFactory) {
|
||||||
|
// scheduled pod cache
|
||||||
|
informerFactory.Core().V1().Pods().Informer().AddEventHandler(
|
||||||
|
cache.FilteringResourceEventHandler{
|
||||||
|
FilterFunc: func(obj interface{}) bool {
|
||||||
|
switch t := obj.(type) {
|
||||||
|
case *corev1.Pod:
|
||||||
|
return assignedPod(t)
|
||||||
|
case cache.DeletedFinalStateUnknown:
|
||||||
|
if _, ok := t.Obj.(*corev1.Pod); ok {
|
||||||
|
// The carried object may be stale, so we don't use it to check if
|
||||||
|
// it's assigned or not. Attempting to cleanup anyways.
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
utilruntime.HandleError(fmt.Errorf("unable to convert object %T to *v1.Pod", obj))
|
||||||
|
return false
|
||||||
|
default:
|
||||||
|
utilruntime.HandleError(fmt.Errorf("unable to handle object: %T", obj))
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
Handler: cache.ResourceEventHandlerFuncs{
|
||||||
|
AddFunc: es.addPodToCache,
|
||||||
|
UpdateFunc: es.updatePodInCache,
|
||||||
|
DeleteFunc: es.deletePodFromCache,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
)
|
||||||
|
informerFactory.Core().V1().Nodes().Informer().AddEventHandler(
|
||||||
|
cache.ResourceEventHandlerFuncs{
|
||||||
|
AddFunc: es.addNodeToCache,
|
||||||
|
UpdateFunc: es.updateNodeInCache,
|
||||||
|
DeleteFunc: es.deleteNodeFromCache,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (es *AccurateSchedulerEstimatorServer) addPodToCache(obj interface{}) {
|
||||||
|
pod, ok := obj.(*corev1.Pod)
|
||||||
|
if !ok {
|
||||||
|
klog.ErrorS(nil, "Cannot convert to *v1.Pod", "obj", obj)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
klog.V(3).InfoS("Add event for scheduled pod", "pod", klog.KObj(pod))
|
||||||
|
|
||||||
|
if err := es.Cache.AddPod(pod); err != nil {
|
||||||
|
klog.ErrorS(err, "Estimator cache AddPod failed", "pod", klog.KObj(pod))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (es *AccurateSchedulerEstimatorServer) updatePodInCache(oldObj, newObj interface{}) {
|
||||||
|
oldPod, ok := oldObj.(*corev1.Pod)
|
||||||
|
if !ok {
|
||||||
|
klog.ErrorS(nil, "Cannot convert oldObj to *v1.Pod", "oldObj", oldObj)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
newPod, ok := newObj.(*corev1.Pod)
|
||||||
|
if !ok {
|
||||||
|
klog.ErrorS(nil, "Cannot convert newObj to *v1.Pod", "newObj", newObj)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
klog.V(4).InfoS("Update event for scheduled pod", "pod", klog.KObj(oldPod))
|
||||||
|
|
||||||
|
if err := es.Cache.UpdatePod(oldPod, newPod); err != nil {
|
||||||
|
klog.ErrorS(err, "Estimator cache UpdatePod failed", "pod", klog.KObj(oldPod))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (es *AccurateSchedulerEstimatorServer) deletePodFromCache(obj interface{}) {
|
||||||
|
var pod *corev1.Pod
|
||||||
|
switch t := obj.(type) {
|
||||||
|
case *corev1.Pod:
|
||||||
|
pod = t
|
||||||
|
case cache.DeletedFinalStateUnknown:
|
||||||
|
var ok bool
|
||||||
|
pod, ok = t.Obj.(*corev1.Pod)
|
||||||
|
if !ok {
|
||||||
|
klog.ErrorS(nil, "Cannot convert to *v1.Pod", "obj", t.Obj)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
klog.ErrorS(nil, "Cannot convert to *v1.Pod", "obj", t)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
klog.V(3).InfoS("Delete event for scheduled pod", "pod", klog.KObj(pod))
|
||||||
|
if err := es.Cache.RemovePod(pod); err != nil {
|
||||||
|
klog.ErrorS(err, "Estimator cache RemovePod failed", "pod", klog.KObj(pod))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (es *AccurateSchedulerEstimatorServer) addNodeToCache(obj interface{}) {
|
||||||
|
node, ok := obj.(*corev1.Node)
|
||||||
|
if !ok {
|
||||||
|
klog.ErrorS(nil, "Cannot convert to *v1.Node", "obj", obj)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
es.Cache.AddNode(node)
|
||||||
|
klog.V(3).InfoS("Add event for node", "node", klog.KObj(node))
|
||||||
|
}
|
||||||
|
|
||||||
|
func (es *AccurateSchedulerEstimatorServer) updateNodeInCache(oldObj, newObj interface{}) {
|
||||||
|
oldNode, ok := oldObj.(*corev1.Node)
|
||||||
|
if !ok {
|
||||||
|
klog.ErrorS(nil, "Cannot convert oldObj to *v1.Node", "oldObj", oldObj)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
newNode, ok := newObj.(*corev1.Node)
|
||||||
|
if !ok {
|
||||||
|
klog.ErrorS(nil, "Cannot convert newObj to *v1.Node", "newObj", newObj)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
es.Cache.UpdateNode(oldNode, newNode)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (es *AccurateSchedulerEstimatorServer) deleteNodeFromCache(obj interface{}) {
|
||||||
|
var node *corev1.Node
|
||||||
|
switch t := obj.(type) {
|
||||||
|
case *corev1.Node:
|
||||||
|
node = t
|
||||||
|
case cache.DeletedFinalStateUnknown:
|
||||||
|
var ok bool
|
||||||
|
node, ok = t.Obj.(*corev1.Node)
|
||||||
|
if !ok {
|
||||||
|
klog.ErrorS(nil, "Cannot convert to *v1.Node", "obj", t.Obj)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
klog.ErrorS(nil, "Cannot convert to *v1.Node", "obj", t)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
klog.V(3).InfoS("Delete event for node", "node", klog.KObj(node))
|
||||||
|
if err := es.Cache.RemoveNode(node); err != nil {
|
||||||
|
klog.ErrorS(err, "Scheduler cache RemoveNode failed")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// assignedPod selects pods that are assigned (scheduled and running).
|
||||||
|
func assignedPod(pod *corev1.Pod) bool {
|
||||||
|
return len(pod.Spec.NodeName) != 0
|
||||||
|
}
|
|
@ -44,7 +44,6 @@ package lifted
|
||||||
| objectwatcher.go | https://github.com/kubernetes-sigs/kubefed/blob/master/pkg/controller/util/propagatedversion.go#L35-L43 | func ObjectVersion | N |
|
| objectwatcher.go | https://github.com/kubernetes-sigs/kubefed/blob/master/pkg/controller/util/propagatedversion.go#L35-L43 | func ObjectVersion | N |
|
||||||
| objectwatcher.go | https://github.com/kubernetes-sigs/kubefed/blob/master/pkg/controller/util/propagatedversion.go#L45-L59 | func ObjectNeedsUpdate | N |
|
| objectwatcher.go | https://github.com/kubernetes-sigs/kubefed/blob/master/pkg/controller/util/propagatedversion.go#L45-L59 | func ObjectNeedsUpdate | N |
|
||||||
| objectwatcher.go | https://github.com/kubernetes-sigs/kubefed/blob/master/pkg/controller/util/meta.go#L63-L80 | func objectMetaObjEquivalent | Y |
|
| objectwatcher.go | https://github.com/kubernetes-sigs/kubefed/blob/master/pkg/controller/util/meta.go#L63-L80 | func objectMetaObjEquivalent | Y |
|
||||||
| parallelism_test.go | https://github.com/kubernetes/kubernetes/blob/release-1.23/pkg/scheduler/framework/parallelize/parallelism_test.go | func TestChunkSize | N |
|
|
||||||
| podtemplate.go | https://github.com/kubernetes/kubernetes/blob/release-1.23/pkg/controller/controller_utils.go#L466-L472 | func getPodsLabelSet | N |
|
| podtemplate.go | https://github.com/kubernetes/kubernetes/blob/release-1.23/pkg/controller/controller_utils.go#L466-L472 | func getPodsLabelSet | N |
|
||||||
| podtemplate.go | https://github.com/kubernetes/kubernetes/blob/release-1.23/pkg/controller/controller_utils.go#L474-L478 | func getPodsFinalizers | N |
|
| podtemplate.go | https://github.com/kubernetes/kubernetes/blob/release-1.23/pkg/controller/controller_utils.go#L474-L478 | func getPodsFinalizers | N |
|
||||||
| podtemplate.go | https://github.com/kubernetes/kubernetes/blob/release-1.23/pkg/controller/controller_utils.go#L480-L486 | func getPodsAnnotationSet | N |
|
| podtemplate.go | https://github.com/kubernetes/kubernetes/blob/release-1.23/pkg/controller/controller_utils.go#L480-L486 | func getPodsAnnotationSet | N |
|
||||||
|
|
|
@ -0,0 +1,767 @@
|
||||||
|
/*
|
||||||
|
Copyright 2015 The Kubernetes Authors.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
// This code is directly lifted from the Kubernetes codebase in order to avoid relying on the k8s.io/kubernetes package.
|
||||||
|
// For reference:
|
||||||
|
// https://github.com/kubernetes/kubernetes/blob/release-1.25/pkg/scheduler/internal/cache/cache.go
|
||||||
|
|
||||||
|
package cache
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
corev1 "k8s.io/api/core/v1"
|
||||||
|
"k8s.io/apimachinery/pkg/util/sets"
|
||||||
|
"k8s.io/apimachinery/pkg/util/wait"
|
||||||
|
"k8s.io/klog/v2"
|
||||||
|
|
||||||
|
"github.com/karmada-io/karmada/pkg/util/lifted/scheduler/framework"
|
||||||
|
)
|
||||||
|
|
||||||
|
var (
|
||||||
|
cleanAssumedPeriod = 1 * time.Second
|
||||||
|
)
|
||||||
|
|
||||||
|
// New returns a Cache implementation.
|
||||||
|
// It automatically starts a go routine that manages expiration of assumed pods.
|
||||||
|
// "ttl" is how long the assumed pod will get expired, "0" means pod will never expire.
|
||||||
|
// "stop" is the channel that would close the background goroutine.
|
||||||
|
func New(ttl time.Duration, stop <-chan struct{}) Cache {
|
||||||
|
cache := newCache(ttl, cleanAssumedPeriod, stop)
|
||||||
|
cache.run()
|
||||||
|
return cache
|
||||||
|
}
|
||||||
|
|
||||||
|
// nodeInfoListItem holds a NodeInfo pointer and acts as an item in a doubly
|
||||||
|
// linked list. When a NodeInfo is updated, it goes to the head of the list.
|
||||||
|
// The items closer to the head are the most recently updated items.
|
||||||
|
type nodeInfoListItem struct {
|
||||||
|
info *framework.NodeInfo
|
||||||
|
next *nodeInfoListItem
|
||||||
|
prev *nodeInfoListItem
|
||||||
|
}
|
||||||
|
|
||||||
|
type cacheImpl struct {
|
||||||
|
stop <-chan struct{}
|
||||||
|
ttl time.Duration
|
||||||
|
period time.Duration
|
||||||
|
|
||||||
|
// This mutex guards all fields within this cache struct.
|
||||||
|
mu sync.RWMutex
|
||||||
|
// a set of assumed pod keys.
|
||||||
|
// The key could further be used to get an entry in podStates.
|
||||||
|
assumedPods sets.String
|
||||||
|
// a map from pod key to podState.
|
||||||
|
podStates map[string]*podState
|
||||||
|
nodes map[string]*nodeInfoListItem
|
||||||
|
// headNode points to the most recently updated NodeInfo in "nodes". It is the
|
||||||
|
// head of the linked list.
|
||||||
|
headNode *nodeInfoListItem
|
||||||
|
nodeTree *nodeTree
|
||||||
|
// A map from image name to its imageState.
|
||||||
|
imageStates map[string]*imageState
|
||||||
|
}
|
||||||
|
|
||||||
|
type podState struct {
|
||||||
|
pod *corev1.Pod
|
||||||
|
// Used by assumedPod to determinate expiration.
|
||||||
|
// If deadline is nil, assumedPod will never expire.
|
||||||
|
deadline *time.Time
|
||||||
|
// Used to block cache from expiring assumedPod if binding still runs
|
||||||
|
bindingFinished bool
|
||||||
|
}
|
||||||
|
|
||||||
|
type imageState struct {
|
||||||
|
// Size of the image
|
||||||
|
size int64
|
||||||
|
// A set of node names for nodes having this image present
|
||||||
|
nodes sets.String
|
||||||
|
}
|
||||||
|
|
||||||
|
// createImageStateSummary returns a summarizing snapshot of the given image's state.
|
||||||
|
func (cache *cacheImpl) createImageStateSummary(state *imageState) *framework.ImageStateSummary {
|
||||||
|
return &framework.ImageStateSummary{
|
||||||
|
Size: state.size,
|
||||||
|
NumNodes: len(state.nodes),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func newCache(ttl, period time.Duration, stop <-chan struct{}) *cacheImpl {
|
||||||
|
return &cacheImpl{
|
||||||
|
ttl: ttl,
|
||||||
|
period: period,
|
||||||
|
stop: stop,
|
||||||
|
|
||||||
|
nodes: make(map[string]*nodeInfoListItem),
|
||||||
|
nodeTree: newNodeTree(nil),
|
||||||
|
assumedPods: make(sets.String),
|
||||||
|
podStates: make(map[string]*podState),
|
||||||
|
imageStates: make(map[string]*imageState),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// newNodeInfoListItem initializes a new nodeInfoListItem.
|
||||||
|
func newNodeInfoListItem(ni *framework.NodeInfo) *nodeInfoListItem {
|
||||||
|
return &nodeInfoListItem{
|
||||||
|
info: ni,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// moveNodeInfoToHead moves a NodeInfo to the head of "cache.nodes" doubly
|
||||||
|
// linked list. The head is the most recently updated NodeInfo.
|
||||||
|
// We assume cache lock is already acquired.
|
||||||
|
func (cache *cacheImpl) moveNodeInfoToHead(name string) {
|
||||||
|
ni, ok := cache.nodes[name]
|
||||||
|
if !ok {
|
||||||
|
klog.ErrorS(nil, "No node info with given name found in the cache", "node", klog.KRef("", name))
|
||||||
|
return
|
||||||
|
}
|
||||||
|
// if the node info list item is already at the head, we are done.
|
||||||
|
if ni == cache.headNode {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if ni.prev != nil {
|
||||||
|
ni.prev.next = ni.next
|
||||||
|
}
|
||||||
|
if ni.next != nil {
|
||||||
|
ni.next.prev = ni.prev
|
||||||
|
}
|
||||||
|
if cache.headNode != nil {
|
||||||
|
cache.headNode.prev = ni
|
||||||
|
}
|
||||||
|
ni.next = cache.headNode
|
||||||
|
ni.prev = nil
|
||||||
|
cache.headNode = ni
|
||||||
|
}
|
||||||
|
|
||||||
|
// removeNodeInfoFromList removes a NodeInfo from the "cache.nodes" doubly
|
||||||
|
// linked list.
|
||||||
|
// We assume cache lock is already acquired.
|
||||||
|
func (cache *cacheImpl) removeNodeInfoFromList(name string) {
|
||||||
|
ni, ok := cache.nodes[name]
|
||||||
|
if !ok {
|
||||||
|
klog.ErrorS(nil, "No node info with given name found in the cache", "node", klog.KRef("", name))
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if ni.prev != nil {
|
||||||
|
ni.prev.next = ni.next
|
||||||
|
}
|
||||||
|
if ni.next != nil {
|
||||||
|
ni.next.prev = ni.prev
|
||||||
|
}
|
||||||
|
// if the removed item was at the head, we must update the head.
|
||||||
|
if ni == cache.headNode {
|
||||||
|
cache.headNode = ni.next
|
||||||
|
}
|
||||||
|
delete(cache.nodes, name)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Dump produces a dump of the current scheduler cache. This is used for
|
||||||
|
// debugging purposes only and shouldn't be confused with UpdateSnapshot
|
||||||
|
// function.
|
||||||
|
// This method is expensive, and should be only used in non-critical path.
|
||||||
|
func (cache *cacheImpl) Dump() *Dump {
|
||||||
|
cache.mu.RLock()
|
||||||
|
defer cache.mu.RUnlock()
|
||||||
|
|
||||||
|
nodes := make(map[string]*framework.NodeInfo, len(cache.nodes))
|
||||||
|
for k, v := range cache.nodes {
|
||||||
|
nodes[k] = v.info.Clone()
|
||||||
|
}
|
||||||
|
|
||||||
|
return &Dump{
|
||||||
|
Nodes: nodes,
|
||||||
|
AssumedPods: cache.assumedPods.Union(nil),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// UpdateSnapshot takes a snapshot of cached NodeInfo map. This is called at
|
||||||
|
// beginning of every scheduling cycle.
|
||||||
|
// The snapshot only includes Nodes that are not deleted at the time this function is called.
|
||||||
|
// nodeInfo.Node() is guaranteed to be not nil for all the nodes in the snapshot.
|
||||||
|
// This function tracks generation number of NodeInfo and updates only the
|
||||||
|
// entries of an existing snapshot that have changed after the snapshot was taken.
|
||||||
|
//
|
||||||
|
//nolint:gocyclo
|
||||||
|
func (cache *cacheImpl) UpdateSnapshot(nodeSnapshot *Snapshot) error {
|
||||||
|
cache.mu.Lock()
|
||||||
|
defer cache.mu.Unlock()
|
||||||
|
|
||||||
|
// Get the last generation of the snapshot.
|
||||||
|
snapshotGeneration := nodeSnapshot.generation
|
||||||
|
|
||||||
|
// NodeInfoList and HavePodsWithAffinityNodeInfoList must be re-created if a node was added
|
||||||
|
// or removed from the cache.
|
||||||
|
updateAllLists := false
|
||||||
|
// HavePodsWithAffinityNodeInfoList must be re-created if a node changed its
|
||||||
|
// status from having pods with affinity to NOT having pods with affinity or the other
|
||||||
|
// way around.
|
||||||
|
updateNodesHavePodsWithAffinity := false
|
||||||
|
// HavePodsWithRequiredAntiAffinityNodeInfoList must be re-created if a node changed its
|
||||||
|
// status from having pods with required anti-affinity to NOT having pods with required
|
||||||
|
// anti-affinity or the other way around.
|
||||||
|
updateNodesHavePodsWithRequiredAntiAffinity := false
|
||||||
|
// usedPVCSet must be re-created whenever the head node generation is greater than
|
||||||
|
// last snapshot generation.
|
||||||
|
updateUsedPVCSet := false
|
||||||
|
|
||||||
|
// Start from the head of the NodeInfo doubly linked list and update snapshot
|
||||||
|
// of NodeInfos updated after the last snapshot.
|
||||||
|
for node := cache.headNode; node != nil; node = node.next {
|
||||||
|
if node.info.Generation <= snapshotGeneration {
|
||||||
|
// all the nodes are updated before the existing snapshot. We are done.
|
||||||
|
break
|
||||||
|
}
|
||||||
|
if np := node.info.Node(); np != nil {
|
||||||
|
existing, ok := nodeSnapshot.nodeInfoMap[np.Name]
|
||||||
|
if !ok {
|
||||||
|
updateAllLists = true
|
||||||
|
existing = &framework.NodeInfo{}
|
||||||
|
nodeSnapshot.nodeInfoMap[np.Name] = existing
|
||||||
|
}
|
||||||
|
clone := node.info.Clone()
|
||||||
|
// We track nodes that have pods with affinity, here we check if this node changed its
|
||||||
|
// status from having pods with affinity to NOT having pods with affinity or the other
|
||||||
|
// way around.
|
||||||
|
if (len(existing.PodsWithAffinity) > 0) != (len(clone.PodsWithAffinity) > 0) {
|
||||||
|
updateNodesHavePodsWithAffinity = true
|
||||||
|
}
|
||||||
|
if (len(existing.PodsWithRequiredAntiAffinity) > 0) != (len(clone.PodsWithRequiredAntiAffinity) > 0) {
|
||||||
|
updateNodesHavePodsWithRequiredAntiAffinity = true
|
||||||
|
}
|
||||||
|
if !updateUsedPVCSet {
|
||||||
|
if len(existing.PVCRefCounts) != len(clone.PVCRefCounts) {
|
||||||
|
updateUsedPVCSet = true
|
||||||
|
} else {
|
||||||
|
for pvcKey := range clone.PVCRefCounts {
|
||||||
|
if _, found := existing.PVCRefCounts[pvcKey]; !found {
|
||||||
|
updateUsedPVCSet = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// We need to preserve the original pointer of the NodeInfo struct since it
|
||||||
|
// is used in the NodeInfoList, which we may not update.
|
||||||
|
*existing = *clone
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Update the snapshot generation with the latest NodeInfo generation.
|
||||||
|
if cache.headNode != nil {
|
||||||
|
nodeSnapshot.generation = cache.headNode.info.Generation
|
||||||
|
}
|
||||||
|
|
||||||
|
// Comparing to pods in nodeTree.
|
||||||
|
// Deleted nodes get removed from the tree, but they might remain in the nodes map
|
||||||
|
// if they still have non-deleted Pods.
|
||||||
|
if len(nodeSnapshot.nodeInfoMap) > cache.nodeTree.numNodes {
|
||||||
|
cache.removeDeletedNodesFromSnapshot(nodeSnapshot)
|
||||||
|
updateAllLists = true
|
||||||
|
}
|
||||||
|
|
||||||
|
if updateAllLists || updateNodesHavePodsWithAffinity || updateNodesHavePodsWithRequiredAntiAffinity || updateUsedPVCSet {
|
||||||
|
cache.updateNodeInfoSnapshotList(nodeSnapshot, updateAllLists)
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(nodeSnapshot.nodeInfoList) != cache.nodeTree.numNodes {
|
||||||
|
errMsg := fmt.Sprintf("snapshot state is not consistent, length of NodeInfoList=%v not equal to length of nodes in tree=%v "+
|
||||||
|
", length of NodeInfoMap=%v, length of nodes in cache=%v"+
|
||||||
|
", trying to recover",
|
||||||
|
len(nodeSnapshot.nodeInfoList), cache.nodeTree.numNodes,
|
||||||
|
len(nodeSnapshot.nodeInfoMap), len(cache.nodes))
|
||||||
|
klog.ErrorS(nil, errMsg)
|
||||||
|
// We will try to recover by re-creating the lists for the next scheduling cycle, but still return an
|
||||||
|
// error to surface the problem, the error will likely cause a failure to the current scheduling cycle.
|
||||||
|
cache.updateNodeInfoSnapshotList(nodeSnapshot, true)
|
||||||
|
return fmt.Errorf(errMsg)
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (cache *cacheImpl) updateNodeInfoSnapshotList(snapshot *Snapshot, updateAll bool) {
|
||||||
|
snapshot.havePodsWithAffinityNodeInfoList = make([]*framework.NodeInfo, 0, cache.nodeTree.numNodes)
|
||||||
|
snapshot.havePodsWithRequiredAntiAffinityNodeInfoList = make([]*framework.NodeInfo, 0, cache.nodeTree.numNodes)
|
||||||
|
snapshot.usedPVCSet = sets.NewString()
|
||||||
|
if updateAll {
|
||||||
|
// Take a snapshot of the nodes order in the tree
|
||||||
|
snapshot.nodeInfoList = make([]*framework.NodeInfo, 0, cache.nodeTree.numNodes)
|
||||||
|
nodesList, err := cache.nodeTree.list()
|
||||||
|
if err != nil {
|
||||||
|
klog.ErrorS(err, "Error occurred while retrieving the list of names of the nodes from node tree")
|
||||||
|
}
|
||||||
|
for _, nodeName := range nodesList {
|
||||||
|
if nodeInfo := snapshot.nodeInfoMap[nodeName]; nodeInfo != nil {
|
||||||
|
snapshot.nodeInfoList = append(snapshot.nodeInfoList, nodeInfo)
|
||||||
|
if len(nodeInfo.PodsWithAffinity) > 0 {
|
||||||
|
snapshot.havePodsWithAffinityNodeInfoList = append(snapshot.havePodsWithAffinityNodeInfoList, nodeInfo)
|
||||||
|
}
|
||||||
|
if len(nodeInfo.PodsWithRequiredAntiAffinity) > 0 {
|
||||||
|
snapshot.havePodsWithRequiredAntiAffinityNodeInfoList = append(snapshot.havePodsWithRequiredAntiAffinityNodeInfoList, nodeInfo)
|
||||||
|
}
|
||||||
|
for key := range nodeInfo.PVCRefCounts {
|
||||||
|
snapshot.usedPVCSet.Insert(key)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
klog.ErrorS(nil, "Node exists in nodeTree but not in NodeInfoMap, this should not happen", "node", klog.KRef("", nodeName))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for _, nodeInfo := range snapshot.nodeInfoList {
|
||||||
|
if len(nodeInfo.PodsWithAffinity) > 0 {
|
||||||
|
snapshot.havePodsWithAffinityNodeInfoList = append(snapshot.havePodsWithAffinityNodeInfoList, nodeInfo)
|
||||||
|
}
|
||||||
|
if len(nodeInfo.PodsWithRequiredAntiAffinity) > 0 {
|
||||||
|
snapshot.havePodsWithRequiredAntiAffinityNodeInfoList = append(snapshot.havePodsWithRequiredAntiAffinityNodeInfoList, nodeInfo)
|
||||||
|
}
|
||||||
|
for key := range nodeInfo.PVCRefCounts {
|
||||||
|
snapshot.usedPVCSet.Insert(key)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// If certain nodes were deleted after the last snapshot was taken, we should remove them from the snapshot.
|
||||||
|
func (cache *cacheImpl) removeDeletedNodesFromSnapshot(snapshot *Snapshot) {
|
||||||
|
toDelete := len(snapshot.nodeInfoMap) - cache.nodeTree.numNodes
|
||||||
|
for name := range snapshot.nodeInfoMap {
|
||||||
|
if toDelete <= 0 {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
if n, ok := cache.nodes[name]; !ok || n.info.Node() == nil {
|
||||||
|
delete(snapshot.nodeInfoMap, name)
|
||||||
|
toDelete--
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// NodeCount returns the number of nodes in the cache.
|
||||||
|
// DO NOT use outside of tests.
|
||||||
|
func (cache *cacheImpl) NodeCount() int {
|
||||||
|
cache.mu.RLock()
|
||||||
|
defer cache.mu.RUnlock()
|
||||||
|
return len(cache.nodes)
|
||||||
|
}
|
||||||
|
|
||||||
|
// PodCount returns the number of pods in the cache (including those from deleted nodes).
|
||||||
|
// DO NOT use outside of tests.
|
||||||
|
func (cache *cacheImpl) PodCount() (int, error) {
|
||||||
|
cache.mu.RLock()
|
||||||
|
defer cache.mu.RUnlock()
|
||||||
|
// podFilter is expected to return true for most or all of the pods. We
|
||||||
|
// can avoid expensive array growth without wasting too much memory by
|
||||||
|
// pre-allocating capacity.
|
||||||
|
count := 0
|
||||||
|
for _, n := range cache.nodes {
|
||||||
|
count += len(n.info.Pods)
|
||||||
|
}
|
||||||
|
return count, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (cache *cacheImpl) AssumePod(pod *corev1.Pod) error {
|
||||||
|
key, err := framework.GetPodKey(pod)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
cache.mu.Lock()
|
||||||
|
defer cache.mu.Unlock()
|
||||||
|
if _, ok := cache.podStates[key]; ok {
|
||||||
|
return fmt.Errorf("pod %v(%v) is in the cache, so can't be assumed", key, klog.KObj(pod))
|
||||||
|
}
|
||||||
|
|
||||||
|
return cache.addPod(pod, true)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (cache *cacheImpl) FinishBinding(pod *corev1.Pod) error {
|
||||||
|
return cache.finishBinding(pod, time.Now())
|
||||||
|
}
|
||||||
|
|
||||||
|
// finishBinding exists to make tests deterministic by injecting now as an argument
|
||||||
|
func (cache *cacheImpl) finishBinding(pod *corev1.Pod, now time.Time) error {
|
||||||
|
key, err := framework.GetPodKey(pod)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
cache.mu.RLock()
|
||||||
|
defer cache.mu.RUnlock()
|
||||||
|
|
||||||
|
klog.V(5).InfoS("Finished binding for pod, can be expired", "podKey", key, "pod", klog.KObj(pod))
|
||||||
|
currState, ok := cache.podStates[key]
|
||||||
|
if ok && cache.assumedPods.Has(key) {
|
||||||
|
if cache.ttl == time.Duration(0) {
|
||||||
|
currState.deadline = nil
|
||||||
|
} else {
|
||||||
|
dl := now.Add(cache.ttl)
|
||||||
|
currState.deadline = &dl
|
||||||
|
}
|
||||||
|
currState.bindingFinished = true
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (cache *cacheImpl) ForgetPod(pod *corev1.Pod) error {
|
||||||
|
key, err := framework.GetPodKey(pod)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
cache.mu.Lock()
|
||||||
|
defer cache.mu.Unlock()
|
||||||
|
|
||||||
|
currState, ok := cache.podStates[key]
|
||||||
|
if ok && currState.pod.Spec.NodeName != pod.Spec.NodeName {
|
||||||
|
return fmt.Errorf("pod %v(%v) was assumed on %v but assigned to %v", key, klog.KObj(pod), pod.Spec.NodeName, currState.pod.Spec.NodeName)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Only assumed pod can be forgotten.
|
||||||
|
if ok && cache.assumedPods.Has(key) {
|
||||||
|
return cache.removePod(pod)
|
||||||
|
}
|
||||||
|
return fmt.Errorf("pod %v(%v) wasn't assumed so cannot be forgotten", key, klog.KObj(pod))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Assumes that lock is already acquired.
|
||||||
|
func (cache *cacheImpl) addPod(pod *corev1.Pod, assumePod bool) error {
|
||||||
|
key, err := framework.GetPodKey(pod)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
n, ok := cache.nodes[pod.Spec.NodeName]
|
||||||
|
if !ok {
|
||||||
|
n = newNodeInfoListItem(framework.NewNodeInfo())
|
||||||
|
cache.nodes[pod.Spec.NodeName] = n
|
||||||
|
}
|
||||||
|
n.info.AddPod(pod)
|
||||||
|
cache.moveNodeInfoToHead(pod.Spec.NodeName)
|
||||||
|
ps := &podState{
|
||||||
|
pod: pod,
|
||||||
|
}
|
||||||
|
cache.podStates[key] = ps
|
||||||
|
if assumePod {
|
||||||
|
cache.assumedPods.Insert(key)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Assumes that lock is already acquired.
|
||||||
|
func (cache *cacheImpl) updatePod(oldPod, newPod *corev1.Pod) error {
|
||||||
|
if err := cache.removePod(oldPod); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return cache.addPod(newPod, false)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Assumes that lock is already acquired.
|
||||||
|
// Removes a pod from the cached node info. If the node information was already
|
||||||
|
// removed and there are no more pods left in the node, cleans up the node from
|
||||||
|
// the cache.
|
||||||
|
func (cache *cacheImpl) removePod(pod *corev1.Pod) error {
|
||||||
|
key, err := framework.GetPodKey(pod)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
n, ok := cache.nodes[pod.Spec.NodeName]
|
||||||
|
if !ok {
|
||||||
|
klog.ErrorS(nil, "Node not found when trying to remove pod", "node", klog.KRef("", pod.Spec.NodeName), "podKey", key, "pod", klog.KObj(pod))
|
||||||
|
} else {
|
||||||
|
if err := n.info.RemovePod(pod); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if len(n.info.Pods) == 0 && n.info.Node() == nil {
|
||||||
|
cache.removeNodeInfoFromList(pod.Spec.NodeName)
|
||||||
|
} else {
|
||||||
|
cache.moveNodeInfoToHead(pod.Spec.NodeName)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
delete(cache.podStates, key)
|
||||||
|
delete(cache.assumedPods, key)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (cache *cacheImpl) AddPod(pod *corev1.Pod) error {
|
||||||
|
key, err := framework.GetPodKey(pod)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
cache.mu.Lock()
|
||||||
|
defer cache.mu.Unlock()
|
||||||
|
|
||||||
|
currState, ok := cache.podStates[key]
|
||||||
|
switch {
|
||||||
|
case ok && cache.assumedPods.Has(key):
|
||||||
|
// When assuming, we've already added the Pod to cache,
|
||||||
|
// Just update here to make sure the Pod's status is up-to-date.
|
||||||
|
if err = cache.updatePod(currState.pod, pod); err != nil {
|
||||||
|
klog.ErrorS(err, "Error occurred while updating pod")
|
||||||
|
}
|
||||||
|
if currState.pod.Spec.NodeName != pod.Spec.NodeName {
|
||||||
|
// The pod was added to a different node than it was assumed to.
|
||||||
|
klog.InfoS("Pod was added to a different node than it was assumed", "podKey", key, "pod", klog.KObj(pod), "assumedNode", klog.KRef("", pod.Spec.NodeName), "currentNode", klog.KRef("", currState.pod.Spec.NodeName))
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
case !ok:
|
||||||
|
// Pod was expired. We should add it back.
|
||||||
|
if err = cache.addPod(pod, false); err != nil {
|
||||||
|
klog.ErrorS(err, "Error occurred while adding pod")
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
return fmt.Errorf("pod %v(%v) was already in added state", key, klog.KObj(pod))
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (cache *cacheImpl) UpdatePod(oldPod, newPod *corev1.Pod) error {
|
||||||
|
key, err := framework.GetPodKey(oldPod)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
cache.mu.Lock()
|
||||||
|
defer cache.mu.Unlock()
|
||||||
|
|
||||||
|
currState, ok := cache.podStates[key]
|
||||||
|
// An assumed pod won't have Update/Remove event. It needs to have Add event
|
||||||
|
// before Update event, in which case the state would change from Assumed to Added.
|
||||||
|
if ok && !cache.assumedPods.Has(key) {
|
||||||
|
if currState.pod.Spec.NodeName != newPod.Spec.NodeName {
|
||||||
|
klog.ErrorS(nil, "Pod updated on a different node than previously added to", "podKey", key, "pod", klog.KObj(oldPod))
|
||||||
|
klog.ErrorS(nil, "scheduler cache is corrupted and can badly affect scheduling decisions")
|
||||||
|
klog.FlushAndExit(klog.ExitFlushTimeout, 1)
|
||||||
|
}
|
||||||
|
return cache.updatePod(oldPod, newPod)
|
||||||
|
}
|
||||||
|
return fmt.Errorf("pod %v(%v) is not added to scheduler cache, so cannot be updated", key, klog.KObj(oldPod))
|
||||||
|
}
|
||||||
|
|
||||||
|
func (cache *cacheImpl) RemovePod(pod *corev1.Pod) error {
|
||||||
|
key, err := framework.GetPodKey(pod)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
cache.mu.Lock()
|
||||||
|
defer cache.mu.Unlock()
|
||||||
|
|
||||||
|
currState, ok := cache.podStates[key]
|
||||||
|
if !ok {
|
||||||
|
return fmt.Errorf("pod %v(%v) is not found in scheduler cache, so cannot be removed from it", key, klog.KObj(pod))
|
||||||
|
}
|
||||||
|
if currState.pod.Spec.NodeName != pod.Spec.NodeName {
|
||||||
|
klog.ErrorS(nil, "Pod was added to a different node than it was assumed", "podKey", key, "pod", klog.KObj(pod), "assumedNode", klog.KRef("", pod.Spec.NodeName), "currentNode", klog.KRef("", currState.pod.Spec.NodeName))
|
||||||
|
if pod.Spec.NodeName != "" {
|
||||||
|
// An empty NodeName is possible when the scheduler misses a Delete
|
||||||
|
// event and it gets the last known state from the informer cache.
|
||||||
|
klog.ErrorS(nil, "scheduler cache is corrupted and can badly affect scheduling decisions")
|
||||||
|
klog.FlushAndExit(klog.ExitFlushTimeout, 1)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return cache.removePod(currState.pod)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (cache *cacheImpl) IsAssumedPod(pod *corev1.Pod) (bool, error) {
|
||||||
|
key, err := framework.GetPodKey(pod)
|
||||||
|
if err != nil {
|
||||||
|
return false, err
|
||||||
|
}
|
||||||
|
|
||||||
|
cache.mu.RLock()
|
||||||
|
defer cache.mu.RUnlock()
|
||||||
|
|
||||||
|
return cache.assumedPods.Has(key), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetPod might return a pod for which its node has already been deleted from
|
||||||
|
// the main cache. This is useful to properly process pod update events.
|
||||||
|
func (cache *cacheImpl) GetPod(pod *corev1.Pod) (*corev1.Pod, error) {
|
||||||
|
key, err := framework.GetPodKey(pod)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
cache.mu.RLock()
|
||||||
|
defer cache.mu.RUnlock()
|
||||||
|
|
||||||
|
podState, ok := cache.podStates[key]
|
||||||
|
if !ok {
|
||||||
|
return nil, fmt.Errorf("pod %v(%v) does not exist in scheduler cache", key, klog.KObj(pod))
|
||||||
|
}
|
||||||
|
|
||||||
|
return podState.pod, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (cache *cacheImpl) AddNode(node *corev1.Node) *framework.NodeInfo {
|
||||||
|
cache.mu.Lock()
|
||||||
|
defer cache.mu.Unlock()
|
||||||
|
|
||||||
|
n, ok := cache.nodes[node.Name]
|
||||||
|
if !ok {
|
||||||
|
n = newNodeInfoListItem(framework.NewNodeInfo())
|
||||||
|
cache.nodes[node.Name] = n
|
||||||
|
} else {
|
||||||
|
cache.removeNodeImageStates(n.info.Node())
|
||||||
|
}
|
||||||
|
cache.moveNodeInfoToHead(node.Name)
|
||||||
|
|
||||||
|
cache.nodeTree.addNode(node)
|
||||||
|
cache.addNodeImageStates(node, n.info)
|
||||||
|
n.info.SetNode(node)
|
||||||
|
return n.info.Clone()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (cache *cacheImpl) UpdateNode(oldNode, newNode *corev1.Node) *framework.NodeInfo {
|
||||||
|
cache.mu.Lock()
|
||||||
|
defer cache.mu.Unlock()
|
||||||
|
|
||||||
|
n, ok := cache.nodes[newNode.Name]
|
||||||
|
if !ok {
|
||||||
|
n = newNodeInfoListItem(framework.NewNodeInfo())
|
||||||
|
cache.nodes[newNode.Name] = n
|
||||||
|
cache.nodeTree.addNode(newNode)
|
||||||
|
} else {
|
||||||
|
cache.removeNodeImageStates(n.info.Node())
|
||||||
|
}
|
||||||
|
cache.moveNodeInfoToHead(newNode.Name)
|
||||||
|
|
||||||
|
cache.nodeTree.updateNode(oldNode, newNode)
|
||||||
|
cache.addNodeImageStates(newNode, n.info)
|
||||||
|
n.info.SetNode(newNode)
|
||||||
|
return n.info.Clone()
|
||||||
|
}
|
||||||
|
|
||||||
|
// RemoveNode removes a node from the cache's tree.
|
||||||
|
// The node might still have pods because their deletion events didn't arrive
|
||||||
|
// yet. Those pods are considered removed from the cache, being the node tree
|
||||||
|
// the source of truth.
|
||||||
|
// However, we keep a ghost node with the list of pods until all pod deletion
|
||||||
|
// events have arrived. A ghost node is skipped from snapshots.
|
||||||
|
func (cache *cacheImpl) RemoveNode(node *corev1.Node) error {
|
||||||
|
cache.mu.Lock()
|
||||||
|
defer cache.mu.Unlock()
|
||||||
|
|
||||||
|
n, ok := cache.nodes[node.Name]
|
||||||
|
if !ok {
|
||||||
|
return fmt.Errorf("node %v is not found", node.Name)
|
||||||
|
}
|
||||||
|
n.info.RemoveNode()
|
||||||
|
// We remove NodeInfo for this node only if there aren't any pods on this node.
|
||||||
|
// We can't do it unconditionally, because notifications about pods are delivered
|
||||||
|
// in a different watch, and thus can potentially be observed later, even though
|
||||||
|
// they happened before node removal.
|
||||||
|
if len(n.info.Pods) == 0 {
|
||||||
|
cache.removeNodeInfoFromList(node.Name)
|
||||||
|
} else {
|
||||||
|
cache.moveNodeInfoToHead(node.Name)
|
||||||
|
}
|
||||||
|
if err := cache.nodeTree.removeNode(node); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
cache.removeNodeImageStates(node)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// addNodeImageStates adds states of the images on given node to the given nodeInfo and update the imageStates in
|
||||||
|
// scheduler cache. This function assumes the lock to scheduler cache has been acquired.
|
||||||
|
func (cache *cacheImpl) addNodeImageStates(node *corev1.Node, nodeInfo *framework.NodeInfo) {
|
||||||
|
newSum := make(map[string]*framework.ImageStateSummary)
|
||||||
|
|
||||||
|
for _, image := range node.Status.Images {
|
||||||
|
for _, name := range image.Names {
|
||||||
|
// update the entry in imageStates
|
||||||
|
state, ok := cache.imageStates[name]
|
||||||
|
if !ok {
|
||||||
|
state = &imageState{
|
||||||
|
size: image.SizeBytes,
|
||||||
|
nodes: sets.NewString(node.Name),
|
||||||
|
}
|
||||||
|
cache.imageStates[name] = state
|
||||||
|
} else {
|
||||||
|
state.nodes.Insert(node.Name)
|
||||||
|
}
|
||||||
|
// create the imageStateSummary for this image
|
||||||
|
if _, ok := newSum[name]; !ok {
|
||||||
|
newSum[name] = cache.createImageStateSummary(state)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
nodeInfo.ImageStates = newSum
|
||||||
|
}
|
||||||
|
|
||||||
|
// removeNodeImageStates removes the given node record from image entries having the node
|
||||||
|
// in imageStates cache. After the removal, if any image becomes free, i.e., the image
|
||||||
|
// is no longer available on any node, the image entry will be removed from imageStates.
|
||||||
|
func (cache *cacheImpl) removeNodeImageStates(node *corev1.Node) {
|
||||||
|
if node == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, image := range node.Status.Images {
|
||||||
|
for _, name := range image.Names {
|
||||||
|
state, ok := cache.imageStates[name]
|
||||||
|
if ok {
|
||||||
|
state.nodes.Delete(node.Name)
|
||||||
|
if len(state.nodes) == 0 {
|
||||||
|
// Remove the unused image to make sure the length of
|
||||||
|
// imageStates represents the total number of different
|
||||||
|
// images on all nodes
|
||||||
|
delete(cache.imageStates, name)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (cache *cacheImpl) run() {
|
||||||
|
go wait.Until(cache.cleanupExpiredAssumedPods, cache.period, cache.stop)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (cache *cacheImpl) cleanupExpiredAssumedPods() {
|
||||||
|
cache.cleanupAssumedPods(time.Now())
|
||||||
|
}
|
||||||
|
|
||||||
|
// cleanupAssumedPods exists for making test deterministic by taking time as input argument.
|
||||||
|
// It also reports metrics on the cache size for nodes, pods, and assumed pods.
|
||||||
|
func (cache *cacheImpl) cleanupAssumedPods(now time.Time) {
|
||||||
|
cache.mu.Lock()
|
||||||
|
defer cache.mu.Unlock()
|
||||||
|
|
||||||
|
// The size of assumedPods should be small
|
||||||
|
for key := range cache.assumedPods {
|
||||||
|
ps, ok := cache.podStates[key]
|
||||||
|
if !ok {
|
||||||
|
klog.ErrorS(nil, "Key found in assumed set but not in podStates, potentially a logical error")
|
||||||
|
klog.FlushAndExit(klog.ExitFlushTimeout, 1)
|
||||||
|
}
|
||||||
|
if !ps.bindingFinished {
|
||||||
|
klog.V(5).InfoS("Could not expire cache for pod as binding is still in progress", "podKey", key, "pod", klog.KObj(ps.pod))
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if cache.ttl != 0 && now.After(*ps.deadline) {
|
||||||
|
klog.InfoS("Pod expired", "podKey", key, "pod", klog.KObj(ps.pod))
|
||||||
|
if err := cache.removePod(ps.pod); err != nil {
|
||||||
|
klog.ErrorS(err, "ExpirePod failed", "podKey", key, "pod", klog.KObj(ps.pod))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,127 @@
|
||||||
|
/*
|
||||||
|
Copyright 2015 The Kubernetes Authors.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
// This code is directly lifted from the Kubernetes codebase in order to avoid relying on the k8s.io/kubernetes package.
|
||||||
|
// For reference:
|
||||||
|
// https://github.com/kubernetes/kubernetes/blob/release-1.25/pkg/scheduler/internal/cache/interface.go
|
||||||
|
|
||||||
|
package cache
|
||||||
|
|
||||||
|
import (
|
||||||
|
corev1 "k8s.io/api/core/v1"
|
||||||
|
"k8s.io/apimachinery/pkg/util/sets"
|
||||||
|
|
||||||
|
"github.com/karmada-io/karmada/pkg/util/lifted/scheduler/framework"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Cache collects pods' information and provides node-level aggregated information.
|
||||||
|
// It's intended for generic scheduler to do efficient lookup.
|
||||||
|
// Cache's operations are pod centric. It does incremental updates based on pod events.
|
||||||
|
// Pod events are sent via network. We don't have guaranteed delivery of all events:
|
||||||
|
// We use Reflector to list and watch from remote.
|
||||||
|
// Reflector might be slow and do a relist, which would lead to missing events.
|
||||||
|
//
|
||||||
|
// State Machine of a pod's events in scheduler's cache:
|
||||||
|
//
|
||||||
|
// +-------------------------------------------+ +----+
|
||||||
|
// | Add | | |
|
||||||
|
// | | | | Update
|
||||||
|
// + Assume Add v v |
|
||||||
|
//
|
||||||
|
// Initial +--------> Assumed +------------+---> Added <--+
|
||||||
|
//
|
||||||
|
// ^ + + | +
|
||||||
|
// | | | | |
|
||||||
|
// | | | Add | | Remove
|
||||||
|
// | | | | |
|
||||||
|
// | | | + |
|
||||||
|
// +----------------+ +-----------> Expired +----> Deleted
|
||||||
|
// Forget Expire
|
||||||
|
//
|
||||||
|
// Note that an assumed pod can expire, because if we haven't received Add event notifying us
|
||||||
|
// for a while, there might be some problems and we shouldn't keep the pod in cache anymore.
|
||||||
|
//
|
||||||
|
// Note that "Initial", "Expired", and "Deleted" pods do not actually exist in cache.
|
||||||
|
// Based on existing use cases, we are making the following assumptions:
|
||||||
|
// - No pod would be assumed twice
|
||||||
|
// - A pod could be added without going through scheduler. In this case, we will see Add but not Assume event.
|
||||||
|
// - If a pod wasn't added, it wouldn't be removed or updated.
|
||||||
|
// - Both "Expired" and "Deleted" are valid end states. In case of some problems, e.g. network issue,
|
||||||
|
// a pod might have changed its state (e.g. added and deleted) without delivering notification to the cache.
|
||||||
|
type Cache interface {
|
||||||
|
// NodeCount returns the number of nodes in the cache.
|
||||||
|
// DO NOT use outside of tests.
|
||||||
|
NodeCount() int
|
||||||
|
|
||||||
|
// PodCount returns the number of pods in the cache (including those from deleted nodes).
|
||||||
|
// DO NOT use outside of tests.
|
||||||
|
PodCount() (int, error)
|
||||||
|
|
||||||
|
// AssumePod assumes a pod scheduled and aggregates the pod's information into its node.
|
||||||
|
// The implementation also decides the policy to expire pod before being confirmed (receiving Add event).
|
||||||
|
// After expiration, its information would be subtracted.
|
||||||
|
AssumePod(pod *corev1.Pod) error
|
||||||
|
|
||||||
|
// FinishBinding signals that cache for assumed pod can be expired
|
||||||
|
FinishBinding(pod *corev1.Pod) error
|
||||||
|
|
||||||
|
// ForgetPod removes an assumed pod from cache.
|
||||||
|
ForgetPod(pod *corev1.Pod) error
|
||||||
|
|
||||||
|
// AddPod either confirms a pod if it's assumed, or adds it back if it's expired.
|
||||||
|
// If added back, the pod's information would be added again.
|
||||||
|
AddPod(pod *corev1.Pod) error
|
||||||
|
|
||||||
|
// UpdatePod removes oldPod's information and adds newPod's information.
|
||||||
|
UpdatePod(oldPod, newPod *corev1.Pod) error
|
||||||
|
|
||||||
|
// RemovePod removes a pod. The pod's information would be subtracted from assigned node.
|
||||||
|
RemovePod(pod *corev1.Pod) error
|
||||||
|
|
||||||
|
// GetPod returns the pod from the cache with the same namespace and the
|
||||||
|
// same name of the specified pod.
|
||||||
|
GetPod(pod *corev1.Pod) (*corev1.Pod, error)
|
||||||
|
|
||||||
|
// IsAssumedPod returns true if the pod is assumed and not expired.
|
||||||
|
IsAssumedPod(pod *corev1.Pod) (bool, error)
|
||||||
|
|
||||||
|
// AddNode adds overall information about node.
|
||||||
|
// It returns a clone of added NodeInfo object.
|
||||||
|
AddNode(node *corev1.Node) *framework.NodeInfo
|
||||||
|
|
||||||
|
// UpdateNode updates overall information about node.
|
||||||
|
// It returns a clone of updated NodeInfo object.
|
||||||
|
UpdateNode(oldNode, newNode *corev1.Node) *framework.NodeInfo
|
||||||
|
|
||||||
|
// RemoveNode removes overall information about node.
|
||||||
|
RemoveNode(node *corev1.Node) error
|
||||||
|
|
||||||
|
// UpdateSnapshot updates the passed infoSnapshot to the current contents of Cache.
|
||||||
|
// The node info contains aggregated information of pods scheduled (including assumed to be)
|
||||||
|
// on this node.
|
||||||
|
// The snapshot only includes Nodes that are not deleted at the time this function is called.
|
||||||
|
// nodeinfo.Node() is guaranteed to be not nil for all the nodes in the snapshot.
|
||||||
|
UpdateSnapshot(nodeSnapshot *Snapshot) error
|
||||||
|
|
||||||
|
// Dump produces a dump of the current cache.
|
||||||
|
Dump() *Dump
|
||||||
|
}
|
||||||
|
|
||||||
|
// Dump is a dump of the cache state.
|
||||||
|
type Dump struct {
|
||||||
|
AssumedPods sets.String
|
||||||
|
Nodes map[string]*framework.NodeInfo
|
||||||
|
}
|
|
@ -0,0 +1,147 @@
|
||||||
|
/*
|
||||||
|
Copyright 2018 The Kubernetes Authors.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
// This code is directly lifted from the Kubernetes codebase in order to avoid relying on the k8s.io/kubernetes package.
|
||||||
|
// For reference:
|
||||||
|
// https://github.com/kubernetes/kubernetes/blob/release-1.25/pkg/scheduler/internal/cache/node_tree.go
|
||||||
|
|
||||||
|
package cache
|
||||||
|
|
||||||
|
import (
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
|
||||||
|
corev1 "k8s.io/api/core/v1"
|
||||||
|
utilnode "k8s.io/component-helpers/node/topology"
|
||||||
|
"k8s.io/klog/v2"
|
||||||
|
)
|
||||||
|
|
||||||
|
// nodeTree is a tree-like data structure that holds node names in each zone. Zone names are
|
||||||
|
// keys to "NodeTree.tree" and values of "NodeTree.tree" are arrays of node names.
|
||||||
|
// NodeTree is NOT thread-safe, any concurrent updates/reads from it must be synchronized by the caller.
|
||||||
|
// It is used only by schedulerCache, and should stay as such.
|
||||||
|
type nodeTree struct {
|
||||||
|
tree map[string][]string // a map from zone (region-zone) to an array of nodes in the zone.
|
||||||
|
zones []string // a list of all the zones in the tree (keys)
|
||||||
|
numNodes int
|
||||||
|
}
|
||||||
|
|
||||||
|
// newNodeTree creates a NodeTree from nodes.
|
||||||
|
func newNodeTree(nodes []*corev1.Node) *nodeTree {
|
||||||
|
nt := &nodeTree{
|
||||||
|
tree: make(map[string][]string, len(nodes)),
|
||||||
|
}
|
||||||
|
for _, n := range nodes {
|
||||||
|
nt.addNode(n)
|
||||||
|
}
|
||||||
|
return nt
|
||||||
|
}
|
||||||
|
|
||||||
|
// addNode adds a node and its corresponding zone to the tree. If the zone already exists, the node
|
||||||
|
// is added to the array of nodes in that zone.
|
||||||
|
func (nt *nodeTree) addNode(n *corev1.Node) {
|
||||||
|
zone := utilnode.GetZoneKey(n)
|
||||||
|
if na, ok := nt.tree[zone]; ok {
|
||||||
|
for _, nodeName := range na {
|
||||||
|
if nodeName == n.Name {
|
||||||
|
klog.InfoS("Node already exists in the NodeTree", "node", klog.KObj(n))
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
nt.tree[zone] = append(na, n.Name)
|
||||||
|
} else {
|
||||||
|
nt.zones = append(nt.zones, zone)
|
||||||
|
nt.tree[zone] = []string{n.Name}
|
||||||
|
}
|
||||||
|
klog.V(2).InfoS("Added node in listed group to NodeTree", "node", klog.KObj(n), "zone", zone)
|
||||||
|
nt.numNodes++
|
||||||
|
}
|
||||||
|
|
||||||
|
// removeNode removes a node from the NodeTree.
|
||||||
|
func (nt *nodeTree) removeNode(n *corev1.Node) error {
|
||||||
|
zone := utilnode.GetZoneKey(n)
|
||||||
|
if na, ok := nt.tree[zone]; ok {
|
||||||
|
for i, nodeName := range na {
|
||||||
|
if nodeName == n.Name {
|
||||||
|
nt.tree[zone] = append(na[:i], na[i+1:]...)
|
||||||
|
if len(nt.tree[zone]) == 0 {
|
||||||
|
nt.removeZone(zone)
|
||||||
|
}
|
||||||
|
klog.V(2).InfoS("Removed node in listed group from NodeTree", "node", klog.KObj(n), "zone", zone)
|
||||||
|
nt.numNodes--
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
klog.ErrorS(nil, "Node in listed group was not found", "node", klog.KObj(n), "zone", zone)
|
||||||
|
return fmt.Errorf("node %q in group %q was not found", n.Name, zone)
|
||||||
|
}
|
||||||
|
|
||||||
|
// removeZone removes a zone from tree.
|
||||||
|
// This function must be called while writer locks are hold.
|
||||||
|
func (nt *nodeTree) removeZone(zone string) {
|
||||||
|
delete(nt.tree, zone)
|
||||||
|
for i, z := range nt.zones {
|
||||||
|
if z == zone {
|
||||||
|
nt.zones = append(nt.zones[:i], nt.zones[i+1:]...)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// updateNode updates a node in the NodeTree.
|
||||||
|
func (nt *nodeTree) updateNode(old, new *corev1.Node) {
|
||||||
|
var oldZone string
|
||||||
|
if old != nil {
|
||||||
|
oldZone = utilnode.GetZoneKey(old)
|
||||||
|
}
|
||||||
|
newZone := utilnode.GetZoneKey(new)
|
||||||
|
// If the zone ID of the node has not changed, we don't need to do anything. Name of the node
|
||||||
|
// cannot be changed in an update.
|
||||||
|
if oldZone == newZone {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
_ = nt.removeNode(old) // No error checking. We ignore whether the old node exists or not.
|
||||||
|
nt.addNode(new)
|
||||||
|
}
|
||||||
|
|
||||||
|
// list returns the list of names of the node. NodeTree iterates over zones and in each zone iterates
|
||||||
|
// over nodes in a round robin fashion.
|
||||||
|
func (nt *nodeTree) list() ([]string, error) {
|
||||||
|
if len(nt.zones) == 0 {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
nodesList := make([]string, 0, nt.numNodes)
|
||||||
|
numExhaustedZones := 0
|
||||||
|
nodeIndex := 0
|
||||||
|
for len(nodesList) < nt.numNodes {
|
||||||
|
if numExhaustedZones >= len(nt.zones) { // all zones are exhausted.
|
||||||
|
return nodesList, errors.New("all zones exhausted before reaching count of nodes expected")
|
||||||
|
}
|
||||||
|
for zoneIndex := 0; zoneIndex < len(nt.zones); zoneIndex++ {
|
||||||
|
na := nt.tree[nt.zones[zoneIndex]]
|
||||||
|
if nodeIndex >= len(na) { // If the zone is exhausted, continue
|
||||||
|
if nodeIndex == len(na) { // If it is the first time the zone is exhausted
|
||||||
|
numExhaustedZones++
|
||||||
|
}
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
nodesList = append(nodesList, na[nodeIndex])
|
||||||
|
}
|
||||||
|
nodeIndex++
|
||||||
|
}
|
||||||
|
return nodesList, nil
|
||||||
|
}
|
|
@@ -0,0 +1,205 @@
/*
Copyright 2019 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// This code is directly lifted from the Kubernetes codebase in order to avoid relying on the k8s.io/kubernetes package.
// For reference:
// https://github.com/kubernetes/kubernetes/blob/release-1.25/pkg/scheduler/internal/cache/snapshot.go

package cache

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/util/sets"

	"github.com/karmada-io/karmada/pkg/util/lifted/scheduler/framework"
)

// Snapshot is a snapshot of cache NodeInfo and NodeTree order. The scheduler takes a
// snapshot at the beginning of each scheduling cycle and uses it for its operations in that cycle.
type Snapshot struct {
	// nodeInfoMap a map of node name to a snapshot of its NodeInfo.
	nodeInfoMap map[string]*framework.NodeInfo
	// nodeInfoList is the list of nodes as ordered in the cache's nodeTree.
	nodeInfoList []*framework.NodeInfo
	// havePodsWithAffinityNodeInfoList is the list of nodes with at least one pod declaring affinity terms.
	havePodsWithAffinityNodeInfoList []*framework.NodeInfo
	// havePodsWithRequiredAntiAffinityNodeInfoList is the list of nodes with at least one pod declaring
	// required anti-affinity terms.
	havePodsWithRequiredAntiAffinityNodeInfoList []*framework.NodeInfo
	// usedPVCSet contains a set of PVC names that have one or more scheduled pods using them,
	// keyed in the format "namespace/name".
	usedPVCSet sets.String
	generation int64
}

var _ framework.SharedLister = &Snapshot{}

// NewEmptySnapshot initializes a Snapshot struct and returns it.
func NewEmptySnapshot() *Snapshot {
	return &Snapshot{
		nodeInfoMap: make(map[string]*framework.NodeInfo),
		usedPVCSet:  sets.NewString(),
	}
}

// NewSnapshot initializes a Snapshot struct and returns it.
func NewSnapshot(pods []*corev1.Pod, nodes []*corev1.Node) *Snapshot {
	nodeInfoMap := createNodeInfoMap(pods, nodes)
	nodeInfoList := make([]*framework.NodeInfo, 0, len(nodeInfoMap))
	havePodsWithAffinityNodeInfoList := make([]*framework.NodeInfo, 0, len(nodeInfoMap))
	havePodsWithRequiredAntiAffinityNodeInfoList := make([]*framework.NodeInfo, 0, len(nodeInfoMap))
	for _, v := range nodeInfoMap {
		nodeInfoList = append(nodeInfoList, v)
		if len(v.PodsWithAffinity) > 0 {
			havePodsWithAffinityNodeInfoList = append(havePodsWithAffinityNodeInfoList, v)
		}
		if len(v.PodsWithRequiredAntiAffinity) > 0 {
			havePodsWithRequiredAntiAffinityNodeInfoList = append(havePodsWithRequiredAntiAffinityNodeInfoList, v)
		}
	}

	s := NewEmptySnapshot()
	s.nodeInfoMap = nodeInfoMap
	s.nodeInfoList = nodeInfoList
	s.havePodsWithAffinityNodeInfoList = havePodsWithAffinityNodeInfoList
	s.havePodsWithRequiredAntiAffinityNodeInfoList = havePodsWithRequiredAntiAffinityNodeInfoList
	s.usedPVCSet = createUsedPVCSet(pods)

	return s
}

// createNodeInfoMap obtains a list of pods and pivots that list into a map
// where the keys are node names and the values are the aggregated information
// for that node.
func createNodeInfoMap(pods []*corev1.Pod, nodes []*corev1.Node) map[string]*framework.NodeInfo {
	nodeNameToInfo := make(map[string]*framework.NodeInfo)
	for _, pod := range pods {
		nodeName := pod.Spec.NodeName
		if _, ok := nodeNameToInfo[nodeName]; !ok {
			nodeNameToInfo[nodeName] = framework.NewNodeInfo()
		}
		nodeNameToInfo[nodeName].AddPod(pod)
	}
	imageExistenceMap := createImageExistenceMap(nodes)

	for _, node := range nodes {
		if _, ok := nodeNameToInfo[node.Name]; !ok {
			nodeNameToInfo[node.Name] = framework.NewNodeInfo()
		}
		nodeInfo := nodeNameToInfo[node.Name]
		nodeInfo.SetNode(node)
		nodeInfo.ImageStates = getNodeImageStates(node, imageExistenceMap)
	}
	return nodeNameToInfo
}

func createUsedPVCSet(pods []*corev1.Pod) sets.String {
	usedPVCSet := sets.NewString()
	for _, pod := range pods {
		if pod.Spec.NodeName == "" {
			continue
		}

		for _, v := range pod.Spec.Volumes {
			if v.PersistentVolumeClaim == nil {
				continue
			}

			key := framework.GetNamespacedName(pod.Namespace, v.PersistentVolumeClaim.ClaimName)
			usedPVCSet.Insert(key)
		}
	}
	return usedPVCSet
}

// getNodeImageStates returns the given node's image states based on the given imageExistence map.
func getNodeImageStates(node *corev1.Node, imageExistenceMap map[string]sets.String) map[string]*framework.ImageStateSummary {
	imageStates := make(map[string]*framework.ImageStateSummary)

	for _, image := range node.Status.Images {
		for _, name := range image.Names {
			imageStates[name] = &framework.ImageStateSummary{
				Size:     image.SizeBytes,
				NumNodes: len(imageExistenceMap[name]),
			}
		}
	}
	return imageStates
}

// createImageExistenceMap returns a map recording on which nodes the images exist, keyed by the images' names.
func createImageExistenceMap(nodes []*corev1.Node) map[string]sets.String {
	imageExistenceMap := make(map[string]sets.String)
	for _, node := range nodes {
		for _, image := range node.Status.Images {
			for _, name := range image.Names {
				if _, ok := imageExistenceMap[name]; !ok {
					imageExistenceMap[name] = sets.NewString(node.Name)
				} else {
					imageExistenceMap[name].Insert(node.Name)
				}
			}
		}
	}
	return imageExistenceMap
}

// NodeInfos returns a NodeInfoLister.
func (s *Snapshot) NodeInfos() framework.NodeInfoLister {
	return s
}

// StorageInfos returns a StorageInfoLister.
func (s *Snapshot) StorageInfos() framework.StorageInfoLister {
	return s
}

// NumNodes returns the number of nodes in the snapshot.
func (s *Snapshot) NumNodes() int {
	return len(s.nodeInfoList)
}

// List returns the list of nodes in the snapshot.
func (s *Snapshot) List() ([]*framework.NodeInfo, error) {
	return s.nodeInfoList, nil
}

// HavePodsWithAffinityList returns the list of nodes with at least one pod with inter-pod affinity
func (s *Snapshot) HavePodsWithAffinityList() ([]*framework.NodeInfo, error) {
	return s.havePodsWithAffinityNodeInfoList, nil
}

// HavePodsWithRequiredAntiAffinityList returns the list of nodes with at least one pod with
// required inter-pod anti-affinity
func (s *Snapshot) HavePodsWithRequiredAntiAffinityList() ([]*framework.NodeInfo, error) {
	return s.havePodsWithRequiredAntiAffinityNodeInfoList, nil
}

// Get returns the NodeInfo of the given node name.
func (s *Snapshot) Get(nodeName string) (*framework.NodeInfo, error) {
	if v, ok := s.nodeInfoMap[nodeName]; ok && v.Node() != nil {
		return v, nil
	}
	return nil, fmt.Errorf("nodeinfo not found for node name %q", nodeName)
}

// IsPVCUsedByPods returns true/false on whether the PVC is used by one or more scheduled pods,
// keyed in the format "namespace/name".
func (s *Snapshot) IsPVCUsedByPods(key string) bool {
	return s.usedPVCSet.Has(key)
}
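
The snapshot above is the read-only view the estimator works against within one cycle, and it satisfies the framework.SharedLister interface defined later in this commit. A minimal usage sketch follows; the import path github.com/karmada-io/karmada/pkg/util/lifted/scheduler/cache is an assumption, since the diff does not show where this file lands.

package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	"github.com/karmada-io/karmada/pkg/util/lifted/scheduler/cache"
)

func main() {
	// Hypothetical inputs: one member node and one pod already assigned to it.
	node := &corev1.Node{ObjectMeta: metav1.ObjectMeta{Name: "member-node-1"}}
	pod := &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{Name: "nginx", Namespace: "default", UID: "uid-1"},
		Spec:       corev1.PodSpec{NodeName: "member-node-1"},
	}

	// Build a point-in-time snapshot from the listed pods and nodes.
	snapshot := cache.NewSnapshot([]*corev1.Pod{pod}, []*corev1.Node{node})
	fmt.Println("nodes in snapshot:", snapshot.NumNodes())

	// Query it through the lister interfaces, as a scheduler-style consumer would.
	nodeInfos, _ := snapshot.NodeInfos().List()
	for _, ni := range nodeInfos {
		fmt.Println(ni.Node().Name, "has", len(ni.Pods), "pod(s)")
	}
}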
@@ -0,0 +1,46 @@
/*
Copyright 2019 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// This code is directly lifted from the Kubernetes codebase in order to avoid relying on the k8s.io/kubernetes package.
// For reference:
// https://github.com/kubernetes/kubernetes/blob/release-1.25/pkg/scheduler/framework/listers.go

package framework

// NodeInfoLister interface represents anything that can list/get NodeInfo objects from node name.
type NodeInfoLister interface {
	// List returns the list of NodeInfos.
	List() ([]*NodeInfo, error)
	// HavePodsWithAffinityList returns the list of NodeInfos of nodes with pods with affinity terms.
	HavePodsWithAffinityList() ([]*NodeInfo, error)
	// HavePodsWithRequiredAntiAffinityList returns the list of NodeInfos of nodes with pods with required anti-affinity terms.
	HavePodsWithRequiredAntiAffinityList() ([]*NodeInfo, error)
	// Get returns the NodeInfo of the given node name.
	Get(nodeName string) (*NodeInfo, error)
}

// StorageInfoLister interface represents anything that handles storage-related operations and resources.
type StorageInfoLister interface {
	// IsPVCUsedByPods returns true/false on whether the PVC is used by one or more scheduled pods,
	// keyed in the format "namespace/name".
	IsPVCUsedByPods(key string) bool
}

// SharedLister groups scheduler-specific listers.
type SharedLister interface {
	NodeInfos() NodeInfoLister
	StorageInfos() StorageInfoLister
}
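
These interfaces let callers depend on an abstraction rather than on the concrete Snapshot type. The helper below is a hypothetical consumer, not part of this commit, shown only to illustrate how code written against SharedLister stays decoupled from the cache implementation.

package example

import (
	"github.com/karmada-io/karmada/pkg/util/lifted/scheduler/framework"
)

// countScheduledPods sums the pods recorded on every node visible through the lister.
// Any SharedLister implementation (such as the Snapshot above) can be passed in.
func countScheduledPods(lister framework.SharedLister) (int, error) {
	nodeInfos, err := lister.NodeInfos().List()
	if err != nil {
		return 0, err
	}
	total := 0
	for _, ni := range nodeInfos {
		total += len(ni.Pods)
	}
	return total, nil
}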
@@ -18,7 +18,7 @@ limitations under the License.
 // For reference:
 // https://github.com/kubernetes/kubernetes/blob/release-1.23/pkg/scheduler/framework/parallelize/parallelism.go
 
-package lifted
+package parallelize
 
 import (
 	"context"
@@ -18,7 +18,7 @@ limitations under the License.
 // For reference:
 // https://github.com/kubernetes/kubernetes/blob/release-1.23/pkg/scheduler/framework/parallelize/parallelism_test.go
 
-package lifted
+package parallelize
 
 import (
 	"fmt"
@@ -0,0 +1,770 @@
/*
Copyright 2015 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// This code is directly lifted from the Kubernetes codebase in order to avoid relying on the k8s.io/kubernetes package.
// For reference:
// https://github.com/kubernetes/kubernetes/blob/release-1.25/pkg/scheduler/framework/types.go

package framework

import (
	"errors"
	"fmt"
	"sync/atomic"
	"time"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/labels"
	utilerrors "k8s.io/apimachinery/pkg/util/errors"
	"k8s.io/apimachinery/pkg/util/sets"
	"k8s.io/klog/v2"

	"github.com/karmada-io/karmada/pkg/util"
	schedutil "github.com/karmada-io/karmada/pkg/util/lifted/scheduler/util"
)

var generation int64

// ActionType is an integer to represent one type of resource change.
// Different ActionTypes can be bit-wised to compose new semantics.
type ActionType int64

// Constants for ActionTypes.
const (
	Add    ActionType = 1 << iota // 1
	Delete                        // 10
	// UpdateNodeXYZ is only applicable for Node events.
	UpdateNodeAllocatable // 100
	UpdateNodeLabel       // 1000
	UpdateNodeTaint       // 10000
	UpdateNodeCondition   // 100000

	All ActionType = 1<<iota - 1 // 111111

	// Use the general Update type if you don't either know or care the specific sub-Update type to use.
	Update = UpdateNodeAllocatable | UpdateNodeLabel | UpdateNodeTaint | UpdateNodeCondition
)

// GVK is short for group/version/kind, which can uniquely represent a particular API resource.
type GVK string

// Constants for GVKs.
const (
	Pod                   GVK = "Pod"
	Node                  GVK = "Node"
	PersistentVolume      GVK = "PersistentVolume"
	PersistentVolumeClaim GVK = "PersistentVolumeClaim"
	StorageClass          GVK = "storage.k8s.io/StorageClass"
	CSINode               GVK = "storage.k8s.io/CSINode"
	CSIDriver             GVK = "storage.k8s.io/CSIDriver"
	CSIStorageCapacity    GVK = "storage.k8s.io/CSIStorageCapacity"
	WildCard              GVK = "*"
)

// ClusterEvent abstracts how a system resource's state gets changed.
// Resource represents the standard API resources such as Pod, Node, etc.
// ActionType denotes the specific change such as Add, Update or Delete.
type ClusterEvent struct {
	Resource   GVK
	ActionType ActionType
	Label      string
}

// IsWildCard returns true if ClusterEvent follows WildCard semantics
func (ce ClusterEvent) IsWildCard() bool {
	return ce.Resource == WildCard && ce.ActionType == All
}

// QueuedPodInfo is a Pod wrapper with additional information related to
// the pod's status in the scheduling queue, such as the timestamp when
// it's added to the queue.
type QueuedPodInfo struct {
	*PodInfo
	// The time pod added to the scheduling queue.
	Timestamp time.Time
	// Number of schedule attempts before successfully scheduled.
	// It's used to record the # attempts metric.
	Attempts int
	// The time when the pod is added to the queue for the first time. The pod may be added
	// back to the queue multiple times before it's successfully scheduled.
	// It shouldn't be updated once initialized. It's used to record the e2e scheduling
	// latency for a pod.
	InitialAttemptTimestamp time.Time
	// If a Pod failed in a scheduling cycle, record the plugin names it failed by.
	UnschedulablePlugins sets.String
}

// DeepCopy returns a deep copy of the QueuedPodInfo object.
func (pqi *QueuedPodInfo) DeepCopy() *QueuedPodInfo {
	return &QueuedPodInfo{
		PodInfo:                 pqi.PodInfo.DeepCopy(),
		Timestamp:               pqi.Timestamp,
		Attempts:                pqi.Attempts,
		InitialAttemptTimestamp: pqi.InitialAttemptTimestamp,
	}
}

// PodInfo is a wrapper to a Pod with additional pre-computed information to
// accelerate processing. This information is typically immutable (e.g., pre-processed
// inter-pod affinity selectors).
type PodInfo struct {
	Pod                        *corev1.Pod
	RequiredAffinityTerms      []AffinityTerm
	RequiredAntiAffinityTerms  []AffinityTerm
	PreferredAffinityTerms     []WeightedAffinityTerm
	PreferredAntiAffinityTerms []WeightedAffinityTerm
	ParseError                 error
}

// DeepCopy returns a deep copy of the PodInfo object.
func (pi *PodInfo) DeepCopy() *PodInfo {
	return &PodInfo{
		Pod:                        pi.Pod.DeepCopy(),
		RequiredAffinityTerms:      pi.RequiredAffinityTerms,
		RequiredAntiAffinityTerms:  pi.RequiredAntiAffinityTerms,
		PreferredAffinityTerms:     pi.PreferredAffinityTerms,
		PreferredAntiAffinityTerms: pi.PreferredAntiAffinityTerms,
		ParseError:                 pi.ParseError,
	}
}

// Update creates a full new PodInfo by default. And only updates the pod when the PodInfo
// has been instantiated and the passed pod is the exact same one as the original pod.
func (pi *PodInfo) Update(pod *corev1.Pod) {
	if pod != nil && pi.Pod != nil && pi.Pod.UID == pod.UID {
		// PodInfo includes immutable information, and so it is safe to update the pod in place if it is
		// the exact same pod
		pi.Pod = pod
		return
	}
	var preferredAffinityTerms []corev1.WeightedPodAffinityTerm
	var preferredAntiAffinityTerms []corev1.WeightedPodAffinityTerm
	if affinity := pod.Spec.Affinity; affinity != nil {
		if a := affinity.PodAffinity; a != nil {
			preferredAffinityTerms = a.PreferredDuringSchedulingIgnoredDuringExecution
		}
		if a := affinity.PodAntiAffinity; a != nil {
			preferredAntiAffinityTerms = a.PreferredDuringSchedulingIgnoredDuringExecution
		}
	}

	// Attempt to parse the affinity terms
	var parseErrs []error
	requiredAffinityTerms, err := getAffinityTerms(pod, getPodAffinityTerms(pod.Spec.Affinity))
	if err != nil {
		parseErrs = append(parseErrs, fmt.Errorf("requiredAffinityTerms: %w", err))
	}
	requiredAntiAffinityTerms, err := getAffinityTerms(pod,
		getPodAntiAffinityTerms(pod.Spec.Affinity))
	if err != nil {
		parseErrs = append(parseErrs, fmt.Errorf("requiredAntiAffinityTerms: %w", err))
	}
	weightedAffinityTerms, err := getWeightedAffinityTerms(pod, preferredAffinityTerms)
	if err != nil {
		parseErrs = append(parseErrs, fmt.Errorf("preferredAffinityTerms: %w", err))
	}
	weightedAntiAffinityTerms, err := getWeightedAffinityTerms(pod, preferredAntiAffinityTerms)
	if err != nil {
		parseErrs = append(parseErrs, fmt.Errorf("preferredAntiAffinityTerms: %w", err))
	}

	pi.Pod = pod
	pi.RequiredAffinityTerms = requiredAffinityTerms
	pi.RequiredAntiAffinityTerms = requiredAntiAffinityTerms
	pi.PreferredAffinityTerms = weightedAffinityTerms
	pi.PreferredAntiAffinityTerms = weightedAntiAffinityTerms
	pi.ParseError = utilerrors.NewAggregate(parseErrs)
}

// AffinityTerm is a processed version of v1.PodAffinityTerm.
type AffinityTerm struct {
	Namespaces        sets.String
	Selector          labels.Selector
	TopologyKey       string
	NamespaceSelector labels.Selector
}

// Matches returns true if the pod matches the label selector and namespaces or namespace selector.
func (at *AffinityTerm) Matches(pod *corev1.Pod, nsLabels labels.Set) bool {
	if at.Namespaces.Has(pod.Namespace) || at.NamespaceSelector.Matches(nsLabels) {
		return at.Selector.Matches(labels.Set(pod.Labels))
	}
	return false
}

// WeightedAffinityTerm is a "processed" representation of v1.WeightedAffinityTerm.
type WeightedAffinityTerm struct {
	AffinityTerm
	Weight int32
}

func newAffinityTerm(pod *corev1.Pod, term *corev1.PodAffinityTerm) (*AffinityTerm, error) {
	selector, err := metav1.LabelSelectorAsSelector(term.LabelSelector)
	if err != nil {
		return nil, err
	}

	namespaces := getNamespacesFromPodAffinityTerm(pod, term)
	nsSelector, err := metav1.LabelSelectorAsSelector(term.NamespaceSelector)
	if err != nil {
		return nil, err
	}

	return &AffinityTerm{Namespaces: namespaces, Selector: selector, TopologyKey: term.TopologyKey, NamespaceSelector: nsSelector}, nil
}

// getAffinityTerms receives a Pod and affinity terms and returns the namespaces and
// selectors of the terms.
func getAffinityTerms(pod *corev1.Pod, v1Terms []corev1.PodAffinityTerm) ([]AffinityTerm, error) {
	if v1Terms == nil {
		return nil, nil
	}

	var terms []AffinityTerm
	for i := range v1Terms {
		t, err := newAffinityTerm(pod, &v1Terms[i])
		if err != nil {
			// We get here if the label selector failed to process
			return nil, err
		}
		terms = append(terms, *t)
	}
	return terms, nil
}

// getWeightedAffinityTerms returns the list of processed affinity terms.
func getWeightedAffinityTerms(pod *corev1.Pod, v1Terms []corev1.WeightedPodAffinityTerm) ([]WeightedAffinityTerm, error) {
	if v1Terms == nil {
		return nil, nil
	}

	var terms []WeightedAffinityTerm
	for i := range v1Terms {
		t, err := newAffinityTerm(pod, &v1Terms[i].PodAffinityTerm)
		if err != nil {
			// We get here if the label selector failed to process
			return nil, err
		}
		terms = append(terms, WeightedAffinityTerm{AffinityTerm: *t, Weight: v1Terms[i].Weight})
	}
	return terms, nil
}

// NewPodInfo returns a new PodInfo.
func NewPodInfo(pod *corev1.Pod) *PodInfo {
	pInfo := &PodInfo{}
	pInfo.Update(pod)
	return pInfo
}

func getPodAffinityTerms(affinity *corev1.Affinity) (terms []corev1.PodAffinityTerm) {
	if affinity != nil && affinity.PodAffinity != nil {
		if len(affinity.PodAffinity.RequiredDuringSchedulingIgnoredDuringExecution) != 0 {
			terms = affinity.PodAffinity.RequiredDuringSchedulingIgnoredDuringExecution
		}
		// TODO: Uncomment this block when implement RequiredDuringSchedulingRequiredDuringExecution.
		//if len(affinity.PodAffinity.RequiredDuringSchedulingRequiredDuringExecution) != 0 {
		//	terms = append(terms, affinity.PodAffinity.RequiredDuringSchedulingRequiredDuringExecution...)
		//}
	}
	return terms
}

func getPodAntiAffinityTerms(affinity *corev1.Affinity) (terms []corev1.PodAffinityTerm) {
	if affinity != nil && affinity.PodAntiAffinity != nil {
		if len(affinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution) != 0 {
			terms = affinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution
		}
		// TODO: Uncomment this block when implement RequiredDuringSchedulingRequiredDuringExecution.
		//if len(affinity.PodAntiAffinity.RequiredDuringSchedulingRequiredDuringExecution) != 0 {
		//	terms = append(terms, affinity.PodAntiAffinity.RequiredDuringSchedulingRequiredDuringExecution...)
		//}
	}
	return terms
}

// returns a set of names according to the namespaces indicated in podAffinityTerm.
// If namespaces is empty it considers the given pod's namespace.
func getNamespacesFromPodAffinityTerm(pod *corev1.Pod, podAffinityTerm *corev1.PodAffinityTerm) sets.String {
	names := sets.String{}
	if len(podAffinityTerm.Namespaces) == 0 && podAffinityTerm.NamespaceSelector == nil {
		names.Insert(pod.Namespace)
	} else {
		names.Insert(podAffinityTerm.Namespaces...)
	}
	return names
}

// ImageStateSummary provides summarized information about the state of an image.
type ImageStateSummary struct {
	// Size of the image
	Size int64
	// Used to track how many nodes have this image
	NumNodes int
}

// NodeInfo is node level aggregated information.
type NodeInfo struct {
	// Overall node information.
	node *corev1.Node

	// Pods running on the node.
	Pods []*PodInfo

	// The subset of pods with affinity.
	PodsWithAffinity []*PodInfo

	// The subset of pods with required anti-affinity.
	PodsWithRequiredAntiAffinity []*PodInfo

	// Ports allocated on the node.
	UsedPorts HostPortInfo

	// Total requested resources of all pods on this node. This includes assumed
	// pods, which scheduler has sent for binding, but may not be scheduled yet.
	Requested *util.Resource
	// Total requested resources of all pods on this node with a minimum value
	// applied to each container's CPU and memory requests. This does not reflect
	// the actual resource requests for this node, but is used to avoid scheduling
	// many zero-request pods onto one node.
	NonZeroRequested *util.Resource
	// We store allocatedResources (which is Node.Status.Allocatable.*) explicitly
	// as int64, to avoid conversions and accessing map.
	Allocatable *util.Resource

	// ImageStates holds the entry of an image if and only if this image is on the node. The entry can be used for
	// checking an image's existence and advanced usage (e.g., image locality scheduling policy) based on the image
	// state information.
	ImageStates map[string]*ImageStateSummary

	// PVCRefCounts contains a mapping of PVC names to the number of pods on the node using it.
	// Keys are in the format "namespace/name".
	PVCRefCounts map[string]int

	// Whenever NodeInfo changes, generation is bumped.
	// This is used to avoid cloning it if the object didn't change.
	Generation int64
}

// nextGeneration: Let's make sure history never forgets the name...
// Increments the generation number monotonically ensuring that generation numbers never collide.
// Collision of the generation numbers would be particularly problematic if a node was deleted and
// added back with the same name. See issue#63262.
func nextGeneration() int64 {
	return atomic.AddInt64(&generation, 1)
}

// NewNodeInfo returns a ready to use empty NodeInfo object.
// If any pods are given in arguments, their information will be aggregated in
// the returned object.
func NewNodeInfo(pods ...*corev1.Pod) *NodeInfo {
	ni := &NodeInfo{
		Requested:        &util.Resource{},
		NonZeroRequested: &util.Resource{},
		Allocatable:      &util.Resource{},
		Generation:       nextGeneration(),
		UsedPorts:        make(HostPortInfo),
		ImageStates:      make(map[string]*ImageStateSummary),
		PVCRefCounts:     make(map[string]int),
	}
	for _, pod := range pods {
		ni.AddPod(pod)
	}
	return ni
}

// Node returns overall information about this node.
func (n *NodeInfo) Node() *corev1.Node {
	if n == nil {
		return nil
	}
	return n.node
}

// Clone returns a copy of this node.
func (n *NodeInfo) Clone() *NodeInfo {
	clone := &NodeInfo{
		node:             n.node,
		Requested:        n.Requested.Clone(),
		NonZeroRequested: n.NonZeroRequested.Clone(),
		Allocatable:      n.Allocatable.Clone(),
		UsedPorts:        make(HostPortInfo),
		ImageStates:      n.ImageStates,
		PVCRefCounts:     make(map[string]int),
		Generation:       n.Generation,
	}
	if len(n.Pods) > 0 {
		clone.Pods = append([]*PodInfo(nil), n.Pods...)
	}
	if len(n.UsedPorts) > 0 {
		// HostPortInfo is a map-in-map struct
		// make sure it's deep copied
		for ip, portMap := range n.UsedPorts {
			clone.UsedPorts[ip] = make(map[ProtocolPort]struct{})
			for protocolPort, v := range portMap {
				clone.UsedPorts[ip][protocolPort] = v
			}
		}
	}
	if len(n.PodsWithAffinity) > 0 {
		clone.PodsWithAffinity = append([]*PodInfo(nil), n.PodsWithAffinity...)
	}
	if len(n.PodsWithRequiredAntiAffinity) > 0 {
		clone.PodsWithRequiredAntiAffinity = append([]*PodInfo(nil), n.PodsWithRequiredAntiAffinity...)
	}
	for key, value := range n.PVCRefCounts {
		clone.PVCRefCounts[key] = value
	}
	return clone
}

// String returns representation of human readable format of this NodeInfo.
func (n *NodeInfo) String() string {
	podKeys := make([]string, len(n.Pods))
	for i, p := range n.Pods {
		podKeys[i] = p.Pod.Name
	}
	return fmt.Sprintf("&NodeInfo{Pods:%v, RequestedResource:%#v, NonZeroRequest: %#v, UsedPort: %#v, AllocatableResource:%#v}",
		podKeys, n.Requested, n.NonZeroRequested, n.UsedPorts, n.Allocatable)
}

// AddPodInfo adds pod information to this NodeInfo.
// Consider using this instead of AddPod if a PodInfo is already computed.
func (n *NodeInfo) AddPodInfo(podInfo *PodInfo) {
	n.Pods = append(n.Pods, podInfo)
	if podWithAffinity(podInfo.Pod) {
		n.PodsWithAffinity = append(n.PodsWithAffinity, podInfo)
	}
	if podWithRequiredAntiAffinity(podInfo.Pod) {
		n.PodsWithRequiredAntiAffinity = append(n.PodsWithRequiredAntiAffinity, podInfo)
	}
	n.update(podInfo.Pod, 1)
}

// AddPod is a wrapper around AddPodInfo.
func (n *NodeInfo) AddPod(pod *corev1.Pod) {
	n.AddPodInfo(NewPodInfo(pod))
}

func podWithAffinity(p *corev1.Pod) bool {
	affinity := p.Spec.Affinity
	return affinity != nil && (affinity.PodAffinity != nil || affinity.PodAntiAffinity != nil)
}

func podWithRequiredAntiAffinity(p *corev1.Pod) bool {
	affinity := p.Spec.Affinity
	return affinity != nil && affinity.PodAntiAffinity != nil &&
		len(affinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution) != 0
}

func removeFromSlice(s []*PodInfo, k string) []*PodInfo {
	for i := range s {
		k2, err := GetPodKey(s[i].Pod)
		if err != nil {
			klog.ErrorS(err, "Cannot get pod key", "pod", klog.KObj(s[i].Pod))
			continue
		}
		if k == k2 {
			// delete the element
			s[i] = s[len(s)-1]
			s = s[:len(s)-1]
			break
		}
	}
	return s
}

// RemovePod subtracts pod information from this NodeInfo.
func (n *NodeInfo) RemovePod(pod *corev1.Pod) error {
	k, err := GetPodKey(pod)
	if err != nil {
		return err
	}
	if podWithAffinity(pod) {
		n.PodsWithAffinity = removeFromSlice(n.PodsWithAffinity, k)
	}
	if podWithRequiredAntiAffinity(pod) {
		n.PodsWithRequiredAntiAffinity = removeFromSlice(n.PodsWithRequiredAntiAffinity, k)
	}

	for i := range n.Pods {
		k2, err := GetPodKey(n.Pods[i].Pod)
		if err != nil {
			klog.ErrorS(err, "Cannot get pod key", "pod", klog.KObj(n.Pods[i].Pod))
			continue
		}
		if k == k2 {
			// delete the element
			n.Pods[i] = n.Pods[len(n.Pods)-1]
			n.Pods = n.Pods[:len(n.Pods)-1]

			n.update(pod, -1)
			n.resetSlicesIfEmpty()
			return nil
		}
	}
	return fmt.Errorf("no corresponding pod %s in pods of node %s", pod.Name, n.node.Name)
}

// update node info based on the pod and sign.
// The sign will be set to `+1` when AddPod and to `-1` when RemovePod.
func (n *NodeInfo) update(pod *corev1.Pod, sign int64) {
	res, non0CPU, non0Mem := calculateResource(pod)
	n.Requested.MilliCPU += sign * res.MilliCPU
	n.Requested.Memory += sign * res.Memory
	n.Requested.EphemeralStorage += sign * res.EphemeralStorage
	if n.Requested.ScalarResources == nil && len(res.ScalarResources) > 0 {
		n.Requested.ScalarResources = map[corev1.ResourceName]int64{}
	}
	for rName, rQuant := range res.ScalarResources {
		n.Requested.ScalarResources[rName] += sign * rQuant
	}
	n.NonZeroRequested.MilliCPU += sign * non0CPU
	n.NonZeroRequested.Memory += sign * non0Mem

	// Consume ports when pod added or release ports when pod removed.
	n.updateUsedPorts(pod, sign > 0)
	n.updatePVCRefCounts(pod, sign > 0)

	n.Generation = nextGeneration()
}

// resets the slices to nil so that we can do DeepEqual in unit tests.
func (n *NodeInfo) resetSlicesIfEmpty() {
	if len(n.PodsWithAffinity) == 0 {
		n.PodsWithAffinity = nil
	}
	if len(n.PodsWithRequiredAntiAffinity) == 0 {
		n.PodsWithRequiredAntiAffinity = nil
	}
	if len(n.Pods) == 0 {
		n.Pods = nil
	}
}

func max(a, b int64) int64 {
	if a >= b {
		return a
	}
	return b
}

// resourceRequest = max(sum(podSpec.Containers), podSpec.InitContainers) + overHead
func calculateResource(pod *corev1.Pod) (res util.Resource, non0CPU int64, non0Mem int64) {
	resPtr := &res
	for _, c := range pod.Spec.Containers {
		resPtr.Add(c.Resources.Requests)
		non0CPUReq, non0MemReq := schedutil.GetNonzeroRequests(&c.Resources.Requests)
		non0CPU += non0CPUReq
		non0Mem += non0MemReq
		// No non-zero resources for GPUs or opaque resources.
	}

	for _, ic := range pod.Spec.InitContainers {
		resPtr.SetMaxResource(ic.Resources.Requests)
		non0CPUReq, non0MemReq := schedutil.GetNonzeroRequests(&ic.Resources.Requests)
		non0CPU = max(non0CPU, non0CPUReq)
		non0Mem = max(non0Mem, non0MemReq)
	}

	// If Overhead is being utilized, add to the total requests for the pod
	if pod.Spec.Overhead != nil {
		resPtr.Add(pod.Spec.Overhead)
		if _, found := pod.Spec.Overhead[corev1.ResourceCPU]; found {
			non0CPU += pod.Spec.Overhead.Cpu().MilliValue()
		}

		if _, found := pod.Spec.Overhead[corev1.ResourceMemory]; found {
			non0Mem += pod.Spec.Overhead.Memory().Value()
		}
	}

	return
}

// updateUsedPorts updates the UsedPorts of NodeInfo.
func (n *NodeInfo) updateUsedPorts(pod *corev1.Pod, add bool) {
	for _, container := range pod.Spec.Containers {
		for _, podPort := range container.Ports {
			if add {
				n.UsedPorts.Add(podPort.HostIP, string(podPort.Protocol), podPort.HostPort)
			} else {
				n.UsedPorts.Remove(podPort.HostIP, string(podPort.Protocol), podPort.HostPort)
			}
		}
	}
}

// updatePVCRefCounts updates the PVCRefCounts of NodeInfo.
func (n *NodeInfo) updatePVCRefCounts(pod *corev1.Pod, add bool) {
	for _, v := range pod.Spec.Volumes {
		if v.PersistentVolumeClaim == nil {
			continue
		}

		key := GetNamespacedName(pod.Namespace, v.PersistentVolumeClaim.ClaimName)
		if add {
			n.PVCRefCounts[key]++
		} else {
			n.PVCRefCounts[key]--
			if n.PVCRefCounts[key] <= 0 {
				delete(n.PVCRefCounts, key)
			}
		}
	}
}

// SetNode sets the overall node information.
func (n *NodeInfo) SetNode(node *corev1.Node) {
	n.node = node
	n.Allocatable = util.NewResource(node.Status.Allocatable)
	n.Generation = nextGeneration()
}

// RemoveNode removes the node object, leaving all other tracking information.
func (n *NodeInfo) RemoveNode() {
	n.node = nil
	n.Generation = nextGeneration()
}

// GetPodKey returns the string key of a pod.
func GetPodKey(pod *corev1.Pod) (string, error) {
	uid := string(pod.UID)
	if len(uid) == 0 {
		return "", errors.New("cannot get cache key for pod with empty UID")
	}
	return uid, nil
}

// GetNamespacedName returns the string format of a namespaced resource name.
func GetNamespacedName(namespace, name string) string {
	return fmt.Sprintf("%s/%s", namespace, name)
}

// DefaultBindAllHostIP defines the default ip address used to bind to all host.
const DefaultBindAllHostIP = "0.0.0.0"

// ProtocolPort represents a protocol port pair, e.g. tcp:80.
type ProtocolPort struct {
	Protocol string
	Port     int32
}

// NewProtocolPort creates a ProtocolPort instance.
func NewProtocolPort(protocol string, port int32) *ProtocolPort {
	pp := &ProtocolPort{
		Protocol: protocol,
		Port:     port,
	}

	if len(pp.Protocol) == 0 {
		pp.Protocol = string(corev1.ProtocolTCP)
	}

	return pp
}

// HostPortInfo stores mapping from ip to a set of ProtocolPort
type HostPortInfo map[string]map[ProtocolPort]struct{}

// Add adds (ip, protocol, port) to HostPortInfo
func (h HostPortInfo) Add(ip, protocol string, port int32) {
	if port <= 0 {
		return
	}

	h.sanitize(&ip, &protocol)

	pp := NewProtocolPort(protocol, port)
	if _, ok := h[ip]; !ok {
		h[ip] = map[ProtocolPort]struct{}{
			*pp: {},
		}
		return
	}

	h[ip][*pp] = struct{}{}
}

// Remove removes (ip, protocol, port) from HostPortInfo
func (h HostPortInfo) Remove(ip, protocol string, port int32) {
	if port <= 0 {
		return
	}

	h.sanitize(&ip, &protocol)

	pp := NewProtocolPort(protocol, port)
	if m, ok := h[ip]; ok {
		delete(m, *pp)
		if len(h[ip]) == 0 {
			delete(h, ip)
		}
	}
}

// Len returns the total number of (ip, protocol, port) tuple in HostPortInfo
func (h HostPortInfo) Len() int {
	length := 0
	for _, m := range h {
		length += len(m)
	}
	return length
}

// CheckConflict checks if the input (ip, protocol, port) conflicts with the existing
// ones in HostPortInfo.
func (h HostPortInfo) CheckConflict(ip, protocol string, port int32) bool {
	if port <= 0 {
		return false
	}

	h.sanitize(&ip, &protocol)

	pp := NewProtocolPort(protocol, port)

	// If ip is 0.0.0.0 check all IP's (protocol, port) pair
	if ip == DefaultBindAllHostIP {
		for _, m := range h {
			if _, ok := m[*pp]; ok {
				return true
			}
		}
		return false
	}

	// If ip isn't 0.0.0.0, only check IP and 0.0.0.0's (protocol, port) pair
	for _, key := range []string{DefaultBindAllHostIP, ip} {
		if m, ok := h[key]; ok {
			if _, ok2 := m[*pp]; ok2 {
				return true
			}
		}
	}

	return false
}

// sanitize the parameters
func (h HostPortInfo) sanitize(ip, protocol *string) {
	if len(*ip) == 0 {
		*ip = DefaultBindAllHostIP
	}
	if len(*protocol) == 0 {
		*protocol = string(corev1.ProtocolTCP)
	}
}
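
NodeInfo and HostPortInfo carry the per-node bookkeeping (pods, requests, host ports, PVC references) that the estimator's cache relies on. The sketch below shows the intended behavior; it assumes the file above is imported as github.com/karmada-io/karmada/pkg/util/lifted/scheduler/framework, which the diff itself does not state.

package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	"github.com/karmada-io/karmada/pkg/util/lifted/scheduler/framework"
)

func main() {
	// A hypothetical pod already on the node, holding host port tcp/8080.
	existing := &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{Name: "web", Namespace: "default", UID: "uid-web"},
		Spec: corev1.PodSpec{
			Containers: []corev1.Container{{
				Name:  "nginx",
				Ports: []corev1.ContainerPort{{HostPort: 8080, Protocol: corev1.ProtocolTCP}},
			}},
		},
	}

	// NodeInfo aggregates everything assumed to be running on one node.
	ni := framework.NewNodeInfo(existing)
	ni.SetNode(&corev1.Node{ObjectMeta: metav1.ObjectMeta{Name: "member-node-1"}})

	// A second pod asking for the same host port would conflict on this node;
	// an unused port would not. An empty IP is sanitized to 0.0.0.0.
	fmt.Println("tcp/8080 taken:", ni.UsedPorts.CheckConflict("", "TCP", 8080))
	fmt.Println("tcp/9090 taken:", ni.UsedPorts.CheckConflict("", "TCP", 9090))
}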
@@ -0,0 +1,82 @@
/*
Copyright 2016 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// This code is directly lifted from the Kubernetes codebase in order to avoid relying on the k8s.io/kubernetes package.
// For reference:
// https://github.com/kubernetes/kubernetes/blob/release-1.25/pkg/scheduler/util/pod_resources.go

package util

import (
	corev1 "k8s.io/api/core/v1"
)

// For each of these resources, a pod that doesn't request the resource explicitly
// will be treated as having requested the amount indicated below, for the purpose
// of computing priority only. This ensures that when scheduling zero-request pods, such
// pods will not all be scheduled to the node with the smallest in-use request,
// and that when scheduling regular pods, such pods will not see zero-request pods as
// consuming no resources whatsoever. We chose these values to be similar to the
// resources that we give to cluster addon pods (#10653). But they are pretty arbitrary.
// As described in #11713, we use request instead of limit to deal with resource requirements.
const (
	// DefaultMilliCPURequest defines default milli cpu request number.
	DefaultMilliCPURequest int64 = 100 // 0.1 core
	// DefaultMemoryRequest defines default memory request size.
	DefaultMemoryRequest int64 = 200 * 1024 * 1024 // 200 MB
)

// GetNonzeroRequests returns the default cpu and memory resource request if none is found or
// what is provided on the request.
func GetNonzeroRequests(requests *corev1.ResourceList) (int64, int64) {
	return GetRequestForResource(corev1.ResourceCPU, requests, true),
		GetRequestForResource(corev1.ResourceMemory, requests, true)
}

// GetRequestForResource returns the requested values unless nonZero is true and there is no defined request
// for CPU and memory.
// If nonZero is true and the resource has no defined request for CPU or memory, it returns a default value.
func GetRequestForResource(resource corev1.ResourceName, requests *corev1.ResourceList, nonZero bool) int64 {
	if requests == nil {
		return 0
	}
	switch resource {
	case corev1.ResourceCPU:
		// Override if un-set, but not if explicitly set to zero
		if _, found := (*requests)[corev1.ResourceCPU]; !found && nonZero {
			return DefaultMilliCPURequest
		}
		return requests.Cpu().MilliValue()
	case corev1.ResourceMemory:
		// Override if un-set, but not if explicitly set to zero
		if _, found := (*requests)[corev1.ResourceMemory]; !found && nonZero {
			return DefaultMemoryRequest
		}
		return requests.Memory().Value()
	case corev1.ResourceEphemeralStorage:
		quantity, found := (*requests)[corev1.ResourceEphemeralStorage]
		if !found {
			return 0
		}
		return quantity.Value()
	default:
		quantity, found := (*requests)[resource]
		if !found {
			return 0
		}
		return quantity.Value()
	}
}
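
GetNonzeroRequests substitutes the defaults above only when a request is missing entirely; an explicitly set zero is returned as-is. A quick sketch of the expected values (the import path is inferred from the schedutil import in types.go above):

package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"

	schedutil "github.com/karmada-io/karmada/pkg/util/lifted/scheduler/util"
)

func main() {
	// No requests at all: falls back to the defaults (100m CPU, 200 MB memory).
	empty := corev1.ResourceList{}
	cpu, mem := schedutil.GetNonzeroRequests(&empty)
	fmt.Println(cpu, mem) // expected: 100 209715200

	// Explicit requests are returned as-is (CPU in millicores, memory in bytes).
	explicit := corev1.ResourceList{
		corev1.ResourceCPU:    resource.MustParse("250m"),
		corev1.ResourceMemory: resource.MustParse("64Mi"),
	}
	cpu, mem = schedutil.GetNonzeroRequests(&explicit)
	fmt.Println(cpu, mem) // expected: 250 67108864
}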