Merge pull request #2387 from RainbowMango/pr_resource_modeling_control_flag

Introduce `--enable-cluster-resource-modeling` to customize the resource modeling feature.

This commit is contained in commit ccc39b2cf5.
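The new flag defaults to `true` on both binaries touched below and can be switched off for deployments where per-cluster resource modeling is not needed. As a minimal sketch of how such a boolean flag behaves with spf13/pflag (not Karmada code; the flag set name and the hard-coded arguments are illustrative only):

```go
package main

import (
	"fmt"

	"github.com/spf13/pflag"
)

func main() {
	// Register a boolean flag the same way the diffs below do, defaulting to true.
	fs := pflag.NewFlagSet("demo", pflag.ExitOnError)
	enabled := fs.Bool("enable-cluster-resource-modeling", true,
		"Build resource modeling for each cluster by syncing Node and Pod resources.")

	// An operator opts out by passing an explicit false on the command line.
	_ = fs.Parse([]string{"--enable-cluster-resource-modeling=false"})
	fmt.Println("cluster resource modeling enabled:", *enabled)
}
```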
```diff
@@ -225,6 +225,7 @@ func setupControllers(mgr controllerruntime.Manager, opts *options.Options, stop
 			ClusterAPIBurst:               opts.ClusterAPIBurst,
 			ConcurrentWorkSyncs:           opts.ConcurrentWorkSyncs,
 			RateLimiterOptions:            opts.RateLimiterOpts,
+			EnableClusterResourceModeling: opts.EnableClusterResourceModeling,
 		},
 		StopChan:            stopChan,
 		ResourceInterpreter: resourceInterpreter,
```
```diff
@@ -262,6 +263,7 @@ func startClusterStatusController(ctx controllerscontext.Context) (bool, error)
 		ClusterFailureThreshold:       ctx.Opts.ClusterFailureThreshold,
 		ClusterCacheSyncTimeout:       ctx.Opts.ClusterCacheSyncTimeout,
 		RateLimiterOptions:            ctx.Opts.RateLimiterOptions,
+		EnableClusterResourceModeling: ctx.Opts.EnableClusterResourceModeling,
 	}
 	if err := clusterStatusController.SetupWithManager(ctx.Mgr); err != nil {
 		return false, err
```
```diff
@@ -107,6 +107,12 @@ type Options struct {
 	// ClusterRegion represents the region the cluster is located in.
 	ClusterRegion string
+
+	// EnableClusterResourceModeling indicates whether to enable cluster resource modeling.
+	// The resource modeling might be used by the scheduler to make scheduling decisions
+	// in the scenario of dynamic replica assignment based on cluster free resources.
+	// Disable it if it does not fit your cases, for better performance.
+	EnableClusterResourceModeling bool
 }
 
 // NewOptions builds default scheduler options.
```
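The comment above mentions "dynamic replica assignment based on cluster free resources". A conceptual sketch of what such an assignment could look like (this is not Karmada's scheduler code; `assignReplicas` and the purely proportional policy are assumptions for illustration):

```go
package main

import "fmt"

// assignReplicas splits a desired replica count across clusters in proportion
// to each cluster's reported free resources. Hypothetical helper, shown only
// to illustrate why the scheduler may want the resource models built below.
func assignReplicas(total int32, freeByCluster map[string]int64) map[string]int32 {
	assigned := make(map[string]int32, len(freeByCluster))
	var sum int64
	for _, free := range freeByCluster {
		sum += free
	}
	if sum == 0 {
		return assigned // no free capacity reported; nothing to assign
	}
	var used int32
	for name, free := range freeByCluster {
		n := int32(int64(total) * free / sum) // floor of the proportional share
		assigned[name] = n
		used += n
	}
	// Hand the rounding remainder to one cluster so the totals add up exactly.
	for name := range freeByCluster {
		assigned[name] += total - used
		break
	}
	return assigned
}

func main() {
	fmt.Println(assignReplicas(10, map[string]int64{"member1": 30, "member2": 10}))
	// e.g. map[member1:8 member2:2]; remainder placement is map-order dependent.
}
```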
```diff
@@ -174,6 +180,9 @@ func (o *Options) AddFlags(fs *pflag.FlagSet, allControllers []string) {
 	fs.StringVar(&o.MetricsBindAddress, "metrics-bind-address", ":8080", "The TCP address that the controller should bind to for serving Prometheus metrics (e.g. 127.0.0.1:8088, :8088).")
 	fs.StringVar(&o.ClusterProvider, "cluster-provider", "", "Provider of the joining cluster. The Karmada scheduler can use this information to spread workloads across providers for higher availability.")
 	fs.StringVar(&o.ClusterRegion, "cluster-region", "", "The region of the joining cluster. The Karmada scheduler can use this information to spread workloads across regions for higher availability.")
+	fs.BoolVar(&o.EnableClusterResourceModeling, "enable-cluster-resource-modeling", true, "Enable means the controller would build resource modeling for each cluster by syncing Node and Pod resources.\n"+
+		"The resource modeling might be used by the scheduler to make scheduling decisions in the scenario of dynamic replica assignment based on cluster free resources.\n"+
+		"Disable it if it does not fit your cases, for better performance.")
 	o.RateLimiterOpts.AddFlags(fs)
 	o.ProfileOpts.AddFlags(fs)
 }
```
```diff
@@ -273,6 +273,7 @@ func startClusterStatusController(ctx controllerscontext.Context) (enabled bool,
 		ClusterFailureThreshold:       opts.ClusterFailureThreshold,
 		ClusterCacheSyncTimeout:       opts.ClusterCacheSyncTimeout,
 		RateLimiterOptions:            ctx.Opts.RateLimiterOptions,
+		EnableClusterResourceModeling: ctx.Opts.EnableClusterResourceModeling,
 	}
 	if err := clusterStatusController.SetupWithManager(mgr); err != nil {
 		return false, err
```
```diff
@@ -562,6 +563,7 @@ func setupControllers(mgr controllerruntime.Manager, opts *options.Options, stop
 			EnableTaintManager:            opts.EnableTaintManager,
 			RateLimiterOptions:            opts.RateLimiterOpts,
 			GracefulEvictionTimeout:       opts.GracefulEvictionTimeout,
+			EnableClusterResourceModeling: opts.EnableClusterResourceModeling,
 		},
 		StopChan:         stopChan,
 		DynamicClientSet: dynamicClientSet,
```
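Taken together, the hunks above thread the option from the command line into the controller: flag → `Options` → `controllerscontext.Context` → `ClusterStatusController`, so the reconcile loop reads a plain struct field rather than the flag set. A trimmed-down sketch of that plumbing (hypothetical, simplified types; the real structs carry many more fields):

```go
package main

import "fmt"

// Stand-ins for the real option, context, and controller types.
type options struct{ EnableClusterResourceModeling bool }

type controllerContext struct{ Opts options }

type clusterStatusController struct{ EnableClusterResourceModeling bool }

// startClusterStatusController copies the option out of the shared context,
// mirroring the assignments added in the diffs above.
func startClusterStatusController(ctx controllerContext) *clusterStatusController {
	return &clusterStatusController{
		EnableClusterResourceModeling: ctx.Opts.EnableClusterResourceModeling,
	}
}

func main() {
	ctrl := startClusterStatusController(controllerContext{
		Opts: options{EnableClusterResourceModeling: false},
	})
	fmt.Println("modeling enabled:", ctrl.EnableClusterResourceModeling)
}
```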
```diff
@@ -120,6 +120,11 @@ type Options struct {
 	RateLimiterOpts ratelimiterflag.Options
 	ProfileOpts     profileflag.Options
+	// EnableClusterResourceModeling indicates whether to enable cluster resource modeling.
+	// The resource modeling might be used by the scheduler to make scheduling decisions
+	// in the scenario of dynamic replica assignment based on cluster free resources.
+	// Disable it if it does not fit your cases, for better performance.
+	EnableClusterResourceModeling bool
 }
 
 // NewOptions builds an empty options.
```
```diff
@@ -198,6 +203,9 @@ func (o *Options) AddFlags(flags *pflag.FlagSet, allControllers, disabledByDefau
 	flags.IntVar(&o.ConcurrentResourceTemplateSyncs, "concurrent-resource-template-syncs", 5, "The number of resource templates that are allowed to sync concurrently.")
 	flags.BoolVar(&o.EnableTaintManager, "enable-taint-manager", true, "If set to true, enables NoExecute taints and will evict all not-tolerating objects propagated to clusters tainted with this kind of taint.")
 	flags.DurationVar(&o.GracefulEvictionTimeout.Duration, "graceful-eviction-timeout", 10*time.Minute, "Specifies the timeout period the graceful-eviction-controller waits before the final removal, counted from when the workload (resource) was moved to the graceful eviction tasks.")
+	flags.BoolVar(&o.EnableClusterResourceModeling, "enable-cluster-resource-modeling", true, "Enable means the controller would build resource modeling for each cluster by syncing Node and Pod resources.\n"+
+		"The resource modeling might be used by the scheduler to make scheduling decisions in the scenario of dynamic replica assignment based on cluster free resources.\n"+
+		"Disable it if it does not fit your cases, for better performance.")
 
 	o.RateLimiterOpts.AddFlags(flags)
 	o.ProfileOpts.AddFlags(flags)
```
```diff
@@ -64,6 +64,11 @@ type Options struct {
 	// GracefulEvictionTimeout is the timeout period waiting for the graceful-eviction-controller to perform the final
 	// removal since the workload (resource) has been moved to the graceful eviction tasks.
 	GracefulEvictionTimeout metav1.Duration
+	// EnableClusterResourceModeling indicates whether to enable cluster resource modeling.
+	// The resource modeling might be used by the scheduler to make scheduling decisions
+	// in the scenario of dynamic replica assignment based on cluster free resources.
+	// Disable it if it does not fit your cases, for better performance.
+	EnableClusterResourceModeling bool
 }
 
 // Context defines the context object for controller.
```
```diff
@@ -91,6 +91,12 @@ type ClusterStatusController struct {
 	ClusterCacheSyncTimeout metav1.Duration
 	RateLimiterOptions      ratelimiterflag.Options
+
+	// EnableClusterResourceModeling indicates whether to enable cluster resource modeling.
+	// The resource modeling might be used by the scheduler to make scheduling decisions
+	// in the scenario of dynamic replica assignment based on cluster free resources.
+	// Disable it if it does not fit your cases, for better performance.
+	EnableClusterResourceModeling bool
 }
 
 // Reconcile syncs status of the given member cluster.
```
```diff
@@ -171,15 +177,6 @@ func (c *ClusterStatusController) syncClusterStatus(cluster *clusterv1alpha1.Clu
 	// skip collecting cluster status if not ready
 	if online && healthy && readyCondition.Status == metav1.ConditionTrue {
-		// get or create informer for pods and nodes in member cluster
-		clusterInformerManager, err := c.buildInformerForCluster(clusterClient)
-		if err != nil {
-			klog.Errorf("Failed to get or create informer for Cluster %s. Error: %v.", cluster.GetName(), err)
-			// in large-scale clusters, the timeout may occur.
-			// if clusterInformerManager fails to be built, it should be returned; otherwise, it may cause a nil pointer
-			return controllerruntime.Result{Requeue: true}, err
-		}
-
 		if cluster.Spec.SyncMode == clusterv1alpha1.Pull {
 			// init the lease controller for pull mode clusters
 			c.initLeaseController(cluster)
```

The informer construction removed here is re-added further down, gated by `EnableClusterResourceModeling`.
```diff
@@ -189,6 +186,7 @@ func (c *ClusterStatusController) syncClusterStatus(cluster *clusterv1alpha1.Clu
 		if err != nil {
 			klog.Errorf("Failed to get Kubernetes version for Cluster %s. Error: %v.", cluster.GetName(), err)
 		}
+		currentClusterStatus.KubernetesVersion = clusterVersion
 
 		// get the list of APIs installed in the member cluster
 		apiEnables, err := getAPIEnablements(clusterClient)
```
```diff
@@ -197,21 +195,36 @@ func (c *ClusterStatusController) syncClusterStatus(cluster *clusterv1alpha1.Clu
 		} else if err != nil {
 			klog.Warningf("Maybe get partial(%d) APIs installed in Cluster %s. Error: %v.", len(apiEnables), cluster.GetName(), err)
 		}
-
-		nodes, err := listNodes(clusterInformerManager)
-		if err != nil {
-			klog.Errorf("Failed to list nodes for Cluster %s. Error: %v.", cluster.GetName(), err)
-		}
-
-		pods, err := listPods(clusterInformerManager)
-		if err != nil {
-			klog.Errorf("Failed to list pods for Cluster %s. Error: %v.", cluster.GetName(), err)
-		}
-
-		currentClusterStatus.KubernetesVersion = clusterVersion
-		currentClusterStatus.APIEnablements = apiEnables
-		currentClusterStatus.NodeSummary = getNodeSummary(nodes)
-		currentClusterStatus.ResourceSummary = getResourceSummary(nodes, pods)
+		currentClusterStatus.APIEnablements = apiEnables
+
+		// The generic informer manager is actually used by 'execution-controller' and 'work-status-controller'.
+		// TODO(@RainbowMango): We should follow the rule that whoever uses it takes the responsibility to initialize it.
+		// We should move this logic to both `execution-controller` and `work-status-controller`.
+		// After that, the 'initializeGenericInformerManagerForCluster' function as well as 'c.GenericInformerManager'
+		// can be safely removed from the current controller.
+		c.initializeGenericInformerManagerForCluster(clusterClient)
+
+		if c.EnableClusterResourceModeling {
+			// get or create informer for pods and nodes in member cluster
+			clusterInformerManager, err := c.buildInformerForCluster(clusterClient)
+			if err != nil {
+				klog.Errorf("Failed to get or create informer for Cluster %s. Error: %v.", cluster.GetName(), err)
+				// in large-scale clusters, the timeout may occur.
+				// if clusterInformerManager fails to be built, it should be returned; otherwise, it may cause a nil pointer
+				return controllerruntime.Result{Requeue: true}, err
+			}
+			nodes, err := listNodes(clusterInformerManager)
+			if err != nil {
+				klog.Errorf("Failed to list nodes for Cluster %s. Error: %v.", cluster.GetName(), err)
+			}
+
+			pods, err := listPods(clusterInformerManager)
+			if err != nil {
+				klog.Errorf("Failed to list pods for Cluster %s. Error: %v.", cluster.GetName(), err)
+			}
+			currentClusterStatus.NodeSummary = getNodeSummary(nodes)
+			currentClusterStatus.ResourceSummary = getResourceSummary(nodes, pods)
+		}
 	}
 
 	setTransitionTime(currentClusterStatus.Conditions, readyCondition)
```
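The restructuring above is the heart of the change: building the Node/Pod informers and computing the summaries now happen only inside the `if c.EnableClusterResourceModeling` branch, so a disabled flag skips that work entirely and leaves the summaries unset in the cluster status. A minimal sketch of the resulting behavior (hypothetical stand-in types, not the real controller):

```go
package main

import "fmt"

// clusterStatus stands in for the real status type; the string fields stand
// in for the real NodeSummary and ResourceSummary types.
type clusterStatus struct {
	NodeSummary     string
	ResourceSummary string
}

// syncStatus mirrors the gate added above: the expensive Node/Pod sync only
// runs when resource modeling is enabled.
func syncStatus(enableModeling bool) clusterStatus {
	var status clusterStatus
	if enableModeling {
		// In the real controller this builds informers, lists Nodes and Pods,
		// and calls getNodeSummary / getResourceSummary.
		status.NodeSummary = "3 nodes"
		status.ResourceSummary = "12 CPU free"
	}
	return status
}

func main() {
	fmt.Printf("%+v\n", syncStatus(false)) // summaries stay empty when disabled
	fmt.Printf("%+v\n", syncStatus(true))
}
```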
```diff
@@ -256,22 +269,22 @@ func (c *ClusterStatusController) updateStatusIfNeeded(cluster *clusterv1alpha1.
 	return controllerruntime.Result{RequeueAfter: c.ClusterStatusUpdateFrequency.Duration}, nil
 }
 
+func (c *ClusterStatusController) initializeGenericInformerManagerForCluster(clusterClient *util.ClusterClient) {
+	if c.GenericInformerManager.IsManagerExist(clusterClient.ClusterName) {
+		return
+	}
+
+	dynamicClient, err := c.ClusterDynamicClientSetFunc(clusterClient.ClusterName, c.Client)
+	if err != nil {
+		klog.Errorf("Failed to build dynamic cluster client for cluster %s.", clusterClient.ClusterName)
+		return
+	}
+	c.GenericInformerManager.ForCluster(clusterClient.ClusterName, dynamicClient.DynamicClientSet, 0)
+}
+
 // buildInformerForCluster builds an informer manager for the cluster if it doesn't exist, then constructs informers
 // for node and pod and starts them. If the informer manager exists, it is returned.
 func (c *ClusterStatusController) buildInformerForCluster(clusterClient *util.ClusterClient) (typedmanager.SingleClusterInformerManager, error) {
-	// cluster-status-controller will initialize the generic informer manager,
-	// mainly because when the member cluster is joined, the dynamic informer required by the member cluster
-	// to distribute resources is prepared in advance;
-	// in that case execution-controller can distribute resources without waiting.
-	if c.GenericInformerManager.GetSingleClusterManager(clusterClient.ClusterName) == nil {
-		dynamicClient, err := c.ClusterDynamicClientSetFunc(clusterClient.ClusterName, c.Client)
-		if err != nil {
-			klog.Errorf("Failed to build dynamic cluster client for cluster %s.", clusterClient.ClusterName)
-			return nil, err
-		}
-		c.GenericInformerManager.ForCluster(clusterClient.ClusterName, dynamicClient.DynamicClientSet, 0)
-	}
-
 	singleClusterInformerManager := c.TypedInformerManager.GetSingleClusterManager(clusterClient.ClusterName)
 	if singleClusterInformerManager == nil {
 		singleClusterInformerManager = c.TypedInformerManager.ForCluster(clusterClient.ClusterName, clusterClient.KubeClient, 0)
```
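With the old inline block removed from `buildInformerForCluster`, generic informer initialization now lives in its own idempotent helper that the status sync calls unconditionally, keeping the dynamic informers needed by execution-controller warm even when resource modeling is off. A sketch of that get-or-create pattern (hypothetical registry type, not the real `GenericInformerManager` API):

```go
package main

import "fmt"

// managerRegistry stands in for the generic informer manager: it remembers
// which clusters already have a manager.
type managerRegistry struct{ managers map[string]bool }

// ensureManager initializes a manager for the cluster exactly once; repeated
// calls are no-ops, which is what makes the unconditional call above safe.
func (r *managerRegistry) ensureManager(cluster string) {
	if r.managers[cluster] {
		return // already initialized; nothing to do
	}
	fmt.Println("initializing generic informer manager for", cluster)
	r.managers[cluster] = true
}

func main() {
	r := &managerRegistry{managers: map[string]bool{}}
	r.ensureManager("member1")
	r.ensureManager("member1") // second call does nothing
}
```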