355 lines
14 KiB
Go
355 lines
14 KiB
Go
/*
|
|
Copyright 2016 The Kubernetes Authors.
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
you may not use this file except in compliance with the License.
|
|
You may obtain a copy of the License at
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
See the License for the specific language governing permissions and
|
|
limitations under the License.
|
|
*/
|
|
|
|
package simulator
|
|
|
|
import (
|
|
"fmt"
|
|
"strings"
|
|
|
|
apiv1 "k8s.io/api/core/v1"
|
|
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
|
"k8s.io/apimachinery/pkg/runtime"
|
|
kube_util "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes"
|
|
informers "k8s.io/client-go/informers"
|
|
kube_client "k8s.io/client-go/kubernetes"
|
|
"k8s.io/kubernetes/pkg/scheduler"
|
|
"k8s.io/kubernetes/pkg/scheduler/algorithm/predicates"
|
|
schedulerapi "k8s.io/kubernetes/pkg/scheduler/api"
|
|
"k8s.io/kubernetes/pkg/scheduler/factory"
|
|
schedulernodeinfo "k8s.io/kubernetes/pkg/scheduler/nodeinfo"
|
|
|
|
// We need to import provider to initialize default scheduler.
|
|
"k8s.io/kubernetes/pkg/scheduler/algorithmprovider"
|
|
|
|
"k8s.io/klog"
|
|
)
|
|
|
|
const (
|
|
// We want to disable affinity predicate for performance reasons if no pod
|
|
// requires it.
|
|
affinityPredicateName = "MatchInterPodAffinity"
|
|
)
|
|
|
|
// PredicateInfo assigns a name to a predicate
|
|
type PredicateInfo struct {
|
|
Name string
|
|
Predicate predicates.FitPredicate
|
|
}
|
|
|
|
// PredicateChecker checks whether all required predicates pass for given Pod and Node.
|
|
type PredicateChecker struct {
|
|
predicates []PredicateInfo
|
|
predicateMetadataProducer predicates.PredicateMetadataProducer
|
|
enableAffinityPredicate bool
|
|
}
|
|
|
|
// We run some predicates first as they are cheap to check and they should be enough
|
|
// to fail predicates in most of our simulations (especially binpacking).
|
|
// There are no const arrays in Go, this is meant to be used as a const.
|
|
var priorityPredicates = []string{"PodFitsResources", "PodToleratesNodeTaints", "GeneralPredicates", "ready"}
|
|
|
|
func init() {
|
|
// This results in filtering out some predicate functions registered by defaults.init() method.
|
|
// In scheduler this method is run from app.runCommand().
|
|
// We also need to call it in CA to have simulation behaviour consistent with scheduler.
|
|
// Note: the logic of method is conditional and depends on feature gates enabled. To have same
|
|
// behaviour in CA and scheduler both need to be run with same set of feature gates.
|
|
algorithmprovider.ApplyFeatureGates()
|
|
}
|
|
|
|
// NoOpEventRecorder is a noop implementation of EventRecorder
|
|
type NoOpEventRecorder struct{}
|
|
|
|
// Event is a noop method implementation
|
|
func (NoOpEventRecorder) Event(object runtime.Object, eventtype, reason, message string) {
|
|
}
|
|
|
|
// Eventf is a noop method implementation
|
|
func (NoOpEventRecorder) Eventf(object runtime.Object, eventtype, reason, messageFmt string, args ...interface{}) {
|
|
}
|
|
|
|
// PastEventf is a noop method implementation
|
|
func (NoOpEventRecorder) PastEventf(object runtime.Object, timestamp metav1.Time, eventtype, reason, messageFmt string, args ...interface{}) {
|
|
}
|
|
|
|
// AnnotatedEventf is a noop method implementation
|
|
func (NoOpEventRecorder) AnnotatedEventf(object runtime.Object, annotations map[string]string, eventtype, reason, messageFmt string, args ...interface{}) {
|
|
}
|
|
|
|
// NewPredicateChecker builds PredicateChecker.
|
|
func NewPredicateChecker(kubeClient kube_client.Interface, stop <-chan struct{}) (*PredicateChecker, error) {
|
|
informerFactory := informers.NewSharedInformerFactory(kubeClient, 0)
|
|
algorithmProvider := factory.DefaultProvider
|
|
|
|
// Set up the configurator which can create schedulers from configs.
|
|
nodeInformer := informerFactory.Core().V1().Nodes()
|
|
podInformer := informerFactory.Core().V1().Pods()
|
|
pvInformer := informerFactory.Core().V1().PersistentVolumes()
|
|
pvcInformer := informerFactory.Core().V1().PersistentVolumeClaims()
|
|
replicationControllerInformer := informerFactory.Core().V1().ReplicationControllers()
|
|
replicaSetInformer := informerFactory.Apps().V1().ReplicaSets()
|
|
statefulSetInformer := informerFactory.Apps().V1().StatefulSets()
|
|
serviceInformer := informerFactory.Core().V1().Services()
|
|
pdbInformer := informerFactory.Policy().V1beta1().PodDisruptionBudgets()
|
|
storageClassInformer := informerFactory.Storage().V1().StorageClasses()
|
|
configurator := factory.NewConfigFactory(&factory.ConfigFactoryArgs{
|
|
SchedulerName: apiv1.DefaultSchedulerName,
|
|
Client: kubeClient,
|
|
NodeInformer: nodeInformer,
|
|
PodInformer: podInformer,
|
|
PvInformer: pvInformer,
|
|
PvcInformer: pvcInformer,
|
|
ReplicationControllerInformer: replicationControllerInformer,
|
|
ReplicaSetInformer: replicaSetInformer,
|
|
StatefulSetInformer: statefulSetInformer,
|
|
ServiceInformer: serviceInformer,
|
|
PdbInformer: pdbInformer,
|
|
StorageClassInformer: storageClassInformer,
|
|
HardPodAffinitySymmetricWeight: apiv1.DefaultHardPodAffinitySymmetricWeight,
|
|
DisablePreemption: false,
|
|
PercentageOfNodesToScore: schedulerapi.DefaultPercentageOfNodesToScore,
|
|
BindTimeoutSeconds: scheduler.BindTimeoutSeconds,
|
|
})
|
|
// Create the config from a named algorithm provider.
|
|
config, err := configurator.CreateFromProvider(algorithmProvider)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("couldn't create scheduler using provider %q: %v", algorithmProvider, err)
|
|
}
|
|
// Additional tweaks to the config produced by the configurator.
|
|
config.Recorder = NoOpEventRecorder{}
|
|
config.DisablePreemption = false
|
|
config.StopEverything = stop
|
|
|
|
// Create the scheduler.
|
|
sched := scheduler.NewFromConfig(config)
|
|
|
|
scheduler.AddAllEventHandlers(sched, apiv1.DefaultSchedulerName,
|
|
nodeInformer, podInformer, pvInformer, pvcInformer, replicationControllerInformer, replicaSetInformer, statefulSetInformer, serviceInformer, pdbInformer, storageClassInformer)
|
|
|
|
predicateMap := map[string]predicates.FitPredicate{}
|
|
for predicateName, predicateFunc := range sched.Config().Algorithm.Predicates() {
|
|
predicateMap[predicateName] = predicateFunc
|
|
}
|
|
// We want to make sure that some predicates are present to run them first
|
|
// as they are cheap to check and they should be enough to fail predicates
|
|
// in most of our simulations (especially binpacking).
|
|
predicateMap["ready"] = IsNodeReadyAndSchedulablePredicate
|
|
if _, found := predicateMap["PodFitsResources"]; !found {
|
|
predicateMap["PodFitsResources"] = predicates.PodFitsResources
|
|
}
|
|
if _, found := predicateMap["PodToleratesNodeTaints"]; !found {
|
|
predicateMap["PodToleratesNodeTaints"] = predicates.PodToleratesNodeTaints
|
|
}
|
|
|
|
predicateList := make([]PredicateInfo, len(predicateMap))
|
|
for _, predicateName := range priorityPredicates {
|
|
if predicate, found := predicateMap[predicateName]; found {
|
|
predicateList = append(predicateList, PredicateInfo{Name: predicateName, Predicate: predicate})
|
|
delete(predicateMap, predicateName)
|
|
}
|
|
}
|
|
|
|
for predicateName, predicate := range predicateMap {
|
|
predicateList = append(predicateList, PredicateInfo{Name: predicateName, Predicate: predicate})
|
|
}
|
|
|
|
for _, predInfo := range predicateList {
|
|
klog.V(1).Infof("Using predicate %s", predInfo.Name)
|
|
}
|
|
|
|
informerFactory.Start(stop)
|
|
|
|
metadataProducer, err := configurator.GetPredicateMetadataProducer()
|
|
if err != nil {
|
|
return nil, fmt.Errorf("could not obtain predicateMetadataProducer; %v", err.Error())
|
|
}
|
|
|
|
return &PredicateChecker{
|
|
predicates: predicateList,
|
|
predicateMetadataProducer: metadataProducer,
|
|
enableAffinityPredicate: true,
|
|
}, nil
|
|
}
|
|
|
|
// IsNodeReadyAndSchedulablePredicate checks if node is ready.
|
|
func IsNodeReadyAndSchedulablePredicate(pod *apiv1.Pod, meta predicates.PredicateMetadata, nodeInfo *schedulernodeinfo.NodeInfo) (bool,
|
|
[]predicates.PredicateFailureReason, error) {
|
|
ready := kube_util.IsNodeReadyAndSchedulable(nodeInfo.Node())
|
|
if !ready {
|
|
return false, []predicates.PredicateFailureReason{predicates.NewFailureReason("node is unready")}, nil
|
|
}
|
|
return true, []predicates.PredicateFailureReason{}, nil
|
|
}
|
|
|
|
// NewTestPredicateChecker builds test version of PredicateChecker.
|
|
func NewTestPredicateChecker() *PredicateChecker {
|
|
return &PredicateChecker{
|
|
predicates: []PredicateInfo{
|
|
{Name: "default", Predicate: predicates.GeneralPredicates},
|
|
{Name: "ready", Predicate: IsNodeReadyAndSchedulablePredicate},
|
|
},
|
|
predicateMetadataProducer: func(_ *apiv1.Pod, _ map[string]*schedulernodeinfo.NodeInfo) predicates.PredicateMetadata {
|
|
return nil
|
|
},
|
|
}
|
|
}
|
|
|
|
// NewCustomTestPredicateChecker builds test version of PredicateChecker with additional predicates.
|
|
// Helps with benchmarking different ordering of predicates.
|
|
func NewCustomTestPredicateChecker(predicateInfos []PredicateInfo) *PredicateChecker {
|
|
return &PredicateChecker{
|
|
predicates: predicateInfos,
|
|
predicateMetadataProducer: func(_ *apiv1.Pod, _ map[string]*schedulernodeinfo.NodeInfo) predicates.PredicateMetadata {
|
|
return nil
|
|
},
|
|
}
|
|
}
|
|
|
|
// SetAffinityPredicateEnabled can be used to enable or disable checking MatchInterPodAffinity
|
|
// predicate. This will cause incorrect CA behavior if there is at least a single pod in
|
|
// cluster using affinity/antiaffinity. However, checking affinity predicate is extremely
|
|
// costly even if no pod is using it, so it may be worth disabling it in such situation.
|
|
func (p *PredicateChecker) SetAffinityPredicateEnabled(enable bool) {
|
|
p.enableAffinityPredicate = enable
|
|
}
|
|
|
|
// IsAffinityPredicateEnabled checks if affinity predicate is enabled.
|
|
func (p *PredicateChecker) IsAffinityPredicateEnabled() bool {
|
|
return p.enableAffinityPredicate
|
|
}
|
|
|
|
// GetPredicateMetadata precomputes some information useful for running predicates on a given pod in a given state
|
|
// of the cluster (represented by nodeInfos map). Passing the result of this function to CheckPredicates can significantly
|
|
// improve the performance of running predicates, especially MatchInterPodAffinity predicate. However, calculating
|
|
// predicateMetadata is also quite expensive, so it's not always the best option to run this method.
|
|
// Please refer to https://github.com/kubernetes/autoscaler/issues/257 for more details.
|
|
func (p *PredicateChecker) GetPredicateMetadata(pod *apiv1.Pod, nodeInfos map[string]*schedulernodeinfo.NodeInfo) predicates.PredicateMetadata {
|
|
// Skip precomputation if affinity predicate is disabled - it's not worth it performance-wise.
|
|
if !p.enableAffinityPredicate {
|
|
return nil
|
|
}
|
|
return p.predicateMetadataProducer(pod, nodeInfos)
|
|
}
|
|
|
|
// FitsAny checks if the given pod can be place on any of the given nodes.
|
|
func (p *PredicateChecker) FitsAny(pod *apiv1.Pod, nodeInfos map[string]*schedulernodeinfo.NodeInfo) (string, error) {
|
|
for name, nodeInfo := range nodeInfos {
|
|
// Be sure that the node is schedulable.
|
|
if nodeInfo.Node().Spec.Unschedulable {
|
|
continue
|
|
}
|
|
if err := p.CheckPredicates(pod, nil, nodeInfo); err == nil {
|
|
return name, nil
|
|
}
|
|
}
|
|
return "", fmt.Errorf("cannot put pod %s on any node", pod.Name)
|
|
}
|
|
|
|
// PredicateError implements error, preserving the original error information from scheduler predicate.
|
|
type PredicateError struct {
|
|
predicateName string
|
|
failureReasons []predicates.PredicateFailureReason
|
|
err error
|
|
|
|
reasons []string
|
|
message string
|
|
}
|
|
|
|
// Error returns a predefined error message.
|
|
func (pe *PredicateError) Error() string {
|
|
if pe.message != "" {
|
|
return pe.message
|
|
}
|
|
// Don't generate verbose message.
|
|
return "Predicates failed"
|
|
}
|
|
|
|
// VerboseError generates verbose error message if it isn't yet done.
|
|
// We're running a ton of predicates and more often than not we only care whether
|
|
// they pass or not and don't care for a reason. Turns out formatting nice error
|
|
// messages gets very expensive, so we only do it if explicitly requested.
|
|
func (pe *PredicateError) VerboseError() string {
|
|
if pe.message != "" {
|
|
return pe.message
|
|
}
|
|
// Generate verbose message.
|
|
if pe.err != nil {
|
|
pe.message = fmt.Sprintf("%s predicate error: %v", pe.predicateName, pe.err)
|
|
return pe.message
|
|
}
|
|
pe.message = fmt.Sprintf("%s predicate mismatch, reason: %s", pe.predicateName, strings.Join(pe.Reasons(), ", "))
|
|
return pe.message
|
|
}
|
|
|
|
// NewPredicateError creates a new predicate error from error and reasons.
|
|
func NewPredicateError(name string, err error, reasons []string, originalReasons []predicates.PredicateFailureReason) *PredicateError {
|
|
return &PredicateError{
|
|
predicateName: name,
|
|
err: err,
|
|
reasons: reasons,
|
|
failureReasons: originalReasons,
|
|
}
|
|
}
|
|
|
|
// Reasons returns original failure reasons from failed predicate as a slice of strings.
|
|
func (pe *PredicateError) Reasons() []string {
|
|
if pe.reasons != nil {
|
|
return pe.reasons
|
|
}
|
|
pe.reasons = make([]string, len(pe.failureReasons))
|
|
for i, reason := range pe.failureReasons {
|
|
pe.reasons[i] = reason.GetReason()
|
|
}
|
|
return pe.reasons
|
|
}
|
|
|
|
// OriginalReasons returns original failure reasons from failed predicate as a slice of PredicateFailureReason.
|
|
func (pe *PredicateError) OriginalReasons() []predicates.PredicateFailureReason {
|
|
return pe.failureReasons
|
|
}
|
|
|
|
// PredicateName returns the name of failed predicate.
|
|
func (pe *PredicateError) PredicateName() string {
|
|
return pe.predicateName
|
|
}
|
|
|
|
// CheckPredicates checks if the given pod can be placed on the given node.
|
|
// To improve performance predicateMetadata can be calculated using GetPredicateMetadata
|
|
// method and passed to CheckPredicates, however, this may lead to incorrect results if
|
|
// it was calculated using NodeInfo map representing different cluster state and the
|
|
// performance gains of CheckPredicates won't always offset the cost of GetPredicateMetadata.
|
|
// Alternatively you can pass nil as predicateMetadata.
|
|
func (p *PredicateChecker) CheckPredicates(pod *apiv1.Pod, predicateMetadata predicates.PredicateMetadata, nodeInfo *schedulernodeinfo.NodeInfo) *PredicateError {
|
|
for _, predInfo := range p.predicates {
|
|
// Skip affinity predicate if it has been disabled.
|
|
if !p.enableAffinityPredicate && predInfo.Name == affinityPredicateName {
|
|
continue
|
|
}
|
|
|
|
match, failureReasons, err := predInfo.Predicate(pod, predicateMetadata, nodeInfo)
|
|
|
|
if err != nil || !match {
|
|
return &PredicateError{
|
|
predicateName: predInfo.Name,
|
|
failureReasons: failureReasons,
|
|
err: err,
|
|
}
|
|
}
|
|
}
|
|
return nil
|
|
}
|