Merge pull request #2387 from RainbowMango/pr_resource_modeling_control_flag

Introduce `--enable-cluster-resource-modeling` to customize resource modeling feature
karmada-bot 2022-08-17 17:34:43 +08:00 committed by GitHub
commit ccc39b2cf5
6 changed files with 75 additions and 36 deletions
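
In short: the new flag defaults to true, preserving existing behavior. When set to false (e.g. `--enable-cluster-resource-modeling=false` on the agent or the controller manager), the cluster-status controller skips building node/pod informers for member clusters, leaving `NodeSummary` and `ResourceSummary` unset in cluster status.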


@@ -225,6 +225,7 @@ func setupControllers(mgr controllerruntime.Manager, opts *options.Options, stop
ClusterAPIBurst: opts.ClusterAPIBurst,
ConcurrentWorkSyncs: opts.ConcurrentWorkSyncs,
RateLimiterOptions: opts.RateLimiterOpts,
EnableClusterResourceModeling: opts.EnableClusterResourceModeling,
},
StopChan: stopChan,
ResourceInterpreter: resourceInterpreter,
@@ -262,6 +263,7 @@ func startClusterStatusController(ctx controllerscontext.Context) (bool, error)
ClusterFailureThreshold: ctx.Opts.ClusterFailureThreshold,
ClusterCacheSyncTimeout: ctx.Opts.ClusterCacheSyncTimeout,
RateLimiterOptions: ctx.Opts.RateLimiterOptions,
EnableClusterResourceModeling: ctx.Opts.EnableClusterResourceModeling,
}
if err := clusterStatusController.SetupWithManager(ctx.Mgr); err != nil {
return false, err


@@ -107,6 +107,12 @@ type Options struct {
// ClusterRegion represents the region the cluster is located in.
ClusterRegion string
// EnableClusterResourceModeling indicates whether to enable cluster resource modeling.
// The resource modeling might be used by the scheduler to make scheduling decisions
// in the scenario of dynamic replica assignment based on cluster free resources.
// Disable it for better performance if it does not fit your cases.
EnableClusterResourceModeling bool
}
// NewOptions builds a default Options.
@@ -174,6 +180,9 @@ func (o *Options) AddFlags(fs *pflag.FlagSet, allControllers []string) {
fs.StringVar(&o.MetricsBindAddress, "metrics-bind-address", ":8080", "The TCP address that the controller should bind to for serving Prometheus metrics (e.g. 127.0.0.1:8088, :8088)")
fs.StringVar(&o.ClusterProvider, "cluster-provider", "", "Provider of the joining cluster. The Karmada scheduler can use this information to spread workloads across providers for higher availability.")
fs.StringVar(&o.ClusterRegion, "cluster-region", "", "The region of the joining cluster. The Karmada scheduler can use this information to spread workloads across regions for higher availability.")
fs.BoolVar(&o.EnableClusterResourceModeling, "enable-cluster-resource-modeling", true, "Enable means the controller would build resource modeling for each cluster by syncing Node and Pod resources.\n"+
"The resource modeling might be used by the scheduler to make scheduling decisions in the scenario of dynamic replica assignment based on cluster free resources.\n"+
"Disable it for better performance if it does not fit your cases.")
o.RateLimiterOpts.AddFlags(fs)
o.ProfileOpts.AddFlags(fs)
}
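
The registration above is the entire user-facing surface of the feature. For context, here is a minimal self-contained sketch of the same pattern (a default-true pflag boolean gating optional work); the names and the demo flag set are illustrative, not taken from the Karmada tree:

package main

import (
	"fmt"

	"github.com/spf13/pflag"
)

// Options mirrors the shape used in the diff: a plain bool carried from the flag set.
type Options struct {
	EnableClusterResourceModeling bool
}

// AddFlags registers the boolean with a default of true, so existing
// deployments keep building resource models unless they explicitly opt out.
func (o *Options) AddFlags(fs *pflag.FlagSet) {
	fs.BoolVar(&o.EnableClusterResourceModeling, "enable-cluster-resource-modeling", true,
		"Build resource models for each cluster by syncing Node and Pod resources; disable for better performance if unneeded.")
}

func main() {
	opts := &Options{}
	fs := pflag.NewFlagSet("demo", pflag.ExitOnError)
	opts.AddFlags(fs)

	// Simulate an operator opting out on the command line.
	_ = fs.Parse([]string{"--enable-cluster-resource-modeling=false"})
	fmt.Println("modeling enabled:", opts.EnableClusterResourceModeling) // modeling enabled: false
}

Defaulting to true keeps the change backward compatible; only operators who care about the informer overhead need to act.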


@@ -273,6 +273,7 @@ func startClusterStatusController(ctx controllerscontext.Context) (enabled bool,
ClusterFailureThreshold: opts.ClusterFailureThreshold,
ClusterCacheSyncTimeout: opts.ClusterCacheSyncTimeout,
RateLimiterOptions: ctx.Opts.RateLimiterOptions,
EnableClusterResourceModeling: ctx.Opts.EnableClusterResourceModeling,
}
if err := clusterStatusController.SetupWithManager(mgr); err != nil {
return false, err
@@ -562,6 +563,7 @@ func setupControllers(mgr controllerruntime.Manager, opts *options.Options, stop
EnableTaintManager: opts.EnableTaintManager,
RateLimiterOptions: opts.RateLimiterOpts,
GracefulEvictionTimeout: opts.GracefulEvictionTimeout,
EnableClusterResourceModeling: opts.EnableClusterResourceModeling,
},
StopChan: stopChan,
DynamicClientSet: dynamicClientSet,


@@ -120,6 +120,11 @@ type Options struct {
RateLimiterOpts ratelimiterflag.Options
ProfileOpts profileflag.Options
// EnableClusterResourceModeling indicates whether to enable cluster resource modeling.
// The resource modeling might be used by the scheduler to make scheduling decisions
// in the scenario of dynamic replica assignment based on cluster free resources.
// Disable it for better performance if it does not fit your cases.
EnableClusterResourceModeling bool
}
// NewOptions builds an empty Options.
@@ -198,6 +203,9 @@ func (o *Options) AddFlags(flags *pflag.FlagSet, allControllers, disabledByDefau
flags.IntVar(&o.ConcurrentResourceTemplateSyncs, "concurrent-resource-template-syncs", 5, "The number of resource templates that are allowed to sync concurrently.")
flags.BoolVar(&o.EnableTaintManager, "enable-taint-manager", true, "If set to true, enables NoExecute taints and will evict all non-tolerating objects propagated to clusters tainted with this kind of taint.")
flags.DurationVar(&o.GracefulEvictionTimeout.Duration, "graceful-eviction-timeout", 10*time.Minute, "Specifies the timeout period for the graceful-eviction-controller to perform the final removal after the workload (resource) has been moved to the graceful eviction tasks.")
flags.BoolVar(&o.EnableClusterResourceModeling, "enable-cluster-resource-modeling", true, "Enable means the controller would build resource modeling for each cluster by syncing Node and Pod resources.\n"+
"The resource modeling might be used by the scheduler to make scheduling decisions in the scenario of dynamic replica assignment based on cluster free resources.\n"+
"Disable it for better performance if it does not fit your cases.")
o.RateLimiterOpts.AddFlags(flags)
o.ProfileOpts.AddFlags(flags)


@@ -64,6 +64,11 @@ type Options struct {
// GracefulEvictionTimeout is the timeout period for the graceful-eviction-controller to perform the final
// removal after the workload (resource) has been moved to the graceful eviction tasks.
GracefulEvictionTimeout metav1.Duration
// EnableClusterResourceModeling indicates whether to enable cluster resource modeling.
// The resource modeling might be used by the scheduler to make scheduling decisions
// in the scenario of dynamic replica assignment based on cluster free resources.
// Disable it for better performance if it does not fit your cases.
EnableClusterResourceModeling bool
}
// Context defines the context object for controller.


@@ -91,6 +91,12 @@ type ClusterStatusController struct {
ClusterCacheSyncTimeout metav1.Duration
RateLimiterOptions ratelimiterflag.Options
// EnableClusterResourceModeling indicates whether to enable cluster resource modeling.
// The resource modeling might be used by the scheduler to make scheduling decisions
// in the scenario of dynamic replica assignment based on cluster free resources.
// Disable it for better performance if it does not fit your cases.
EnableClusterResourceModeling bool
}
// Reconcile syncs status of the given member cluster.
@@ -171,15 +177,6 @@ func (c *ClusterStatusController) syncClusterStatus(cluster *clusterv1alpha1.Clu
// skip collecting cluster status if not ready
if online && healthy && readyCondition.Status == metav1.ConditionTrue {
// get or create informer for pods and nodes in member cluster
clusterInformerManager, err := c.buildInformerForCluster(clusterClient)
if err != nil {
klog.Errorf("Failed to get or create informer for Cluster %s. Error: %v.", cluster.GetName(), err)
// in large-scale clusters, a timeout may occur.
// if the clusterInformerManager fails to be built, the error should be returned; otherwise, a nil pointer dereference may follow
return controllerruntime.Result{Requeue: true}, err
}
if cluster.Spec.SyncMode == clusterv1alpha1.Pull {
// init the lease controller for pull mode clusters
c.initLeaseController(cluster)
@@ -189,6 +186,7 @@ func (c *ClusterStatusController) syncClusterStatus(cluster *clusterv1alpha1.Clu
if err != nil {
klog.Errorf("Failed to get Kubernetes version for Cluster %s. Error: %v.", cluster.GetName(), err)
}
currentClusterStatus.KubernetesVersion = clusterVersion
// get the list of APIs installed in the member cluster
apiEnables, err := getAPIEnablements(clusterClient)
@@ -197,21 +195,36 @@ func (c *ClusterStatusController) syncClusterStatus(cluster *clusterv1alpha1.Clu
} else if err != nil {
klog.Warningf("Maybe get partial(%d) APIs installed in Cluster %s. Error: %v.", len(apiEnables), cluster.GetName(), err)
}
nodes, err := listNodes(clusterInformerManager)
if err != nil {
klog.Errorf("Failed to list nodes for Cluster %s. Error: %v.", cluster.GetName(), err)
}
pods, err := listPods(clusterInformerManager)
if err != nil {
klog.Errorf("Failed to list pods for Cluster %s. Error: %v.", cluster.GetName(), err)
}
currentClusterStatus.KubernetesVersion = clusterVersion
currentClusterStatus.APIEnablements = apiEnables
currentClusterStatus.NodeSummary = getNodeSummary(nodes)
currentClusterStatus.ResourceSummary = getResourceSummary(nodes, pods)
// The generic informer manager is actually used by 'execution-controller' and 'work-status-controller'.
// TODO(@RainbowMango): We should follow the principle that whoever uses it takes responsibility for initializing it.
// We should move this logic to both `execution-controller` and `work-status-controller`.
// After that, the 'initializeGenericInformerManagerForCluster' function as well as 'c.GenericInformerManager'
// can be safely removed from the current controller.
c.initializeGenericInformerManagerForCluster(clusterClient)
if c.EnableClusterResourceModeling {
// get or create informer for pods and nodes in member cluster
clusterInformerManager, err := c.buildInformerForCluster(clusterClient)
if err != nil {
klog.Errorf("Failed to get or create informer for Cluster %s. Error: %v.", cluster.GetName(), err)
// in large-scale clusters, a timeout may occur.
// if the clusterInformerManager fails to be built, the error should be returned; otherwise, a nil pointer dereference may follow
return controllerruntime.Result{Requeue: true}, err
}
nodes, err := listNodes(clusterInformerManager)
if err != nil {
klog.Errorf("Failed to list nodes for Cluster %s. Error: %v.", cluster.GetName(), err)
}
pods, err := listPods(clusterInformerManager)
if err != nil {
klog.Errorf("Failed to list pods for Cluster %s. Error: %v.", cluster.GetName(), err)
}
currentClusterStatus.NodeSummary = getNodeSummary(nodes)
currentClusterStatus.ResourceSummary = getResourceSummary(nodes, pods)
}
}
setTransitionTime(currentClusterStatus.Conditions, readyCondition)
@@ -256,22 +269,22 @@ func (c *ClusterStatusController) updateStatusIfNeeded(cluster *clusterv1alpha1.
return controllerruntime.Result{RequeueAfter: c.ClusterStatusUpdateFrequency.Duration}, nil
}
func (c *ClusterStatusController) initializeGenericInformerManagerForCluster(clusterClient *util.ClusterClient) {
if c.GenericInformerManager.IsManagerExist(clusterClient.ClusterName) {
return
}
dynamicClient, err := c.ClusterDynamicClientSetFunc(clusterClient.ClusterName, c.Client)
if err != nil {
klog.Errorf("Failed to build dynamic cluster client for cluster %s.", clusterClient.ClusterName)
return
}
c.GenericInformerManager.ForCluster(clusterClient.ClusterName, dynamicClient.DynamicClientSet, 0)
}
// buildInformerForCluster builds an informer manager for the cluster if one doesn't exist, then constructs informers
// for nodes and pods and starts them. If the informer manager already exists, it is returned.
func (c *ClusterStatusController) buildInformerForCluster(clusterClient *util.ClusterClient) (typedmanager.SingleClusterInformerManager, error) {
// the cluster-status-controller initializes the generic informer manager,
// mainly because when the member cluster is joined, the dynamic informer required by the member cluster
// to distribute resources is prepared in advance;
// in that case, the execution-controller can distribute resources without waiting.
if c.GenericInformerManager.GetSingleClusterManager(clusterClient.ClusterName) == nil {
dynamicClient, err := c.ClusterDynamicClientSetFunc(clusterClient.ClusterName, c.Client)
if err != nil {
klog.Errorf("Failed to build dynamic cluster client for cluster %s.", clusterClient.ClusterName)
return nil, err
}
c.GenericInformerManager.ForCluster(clusterClient.ClusterName, dynamicClient.DynamicClientSet, 0)
}
singleClusterInformerManager := c.TypedInformerManager.GetSingleClusterManager(clusterClient.ClusterName)
if singleClusterInformerManager == nil {
singleClusterInformerManager = c.TypedInformerManager.ForCluster(clusterClient.ClusterName, clusterClient.KubeClient, 0)
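
A consequence of the gating introduced above: with `--enable-cluster-resource-modeling=false`, `syncClusterStatus` never populates `NodeSummary` or `ResourceSummary`, so consumers of cluster status must tolerate a nil summary. A minimal sketch of such a consumer, assuming the cluster API's `ResourceSummary` exposes `Allocatable`/`Allocated` resource lists (the `freeCPU` helper itself is hypothetical, not part of this PR):

package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"

	clusterv1alpha1 "github.com/karmada-io/karmada/pkg/apis/cluster/v1alpha1"
)

// freeCPU reports the cluster's unallocated CPU, or ok=false when no
// resource model is available (modeling disabled, or status not yet synced).
func freeCPU(cluster *clusterv1alpha1.Cluster) (resource.Quantity, bool) {
	summary := cluster.Status.ResourceSummary
	if summary == nil {
		// With --enable-cluster-resource-modeling=false the controller never sets this.
		return resource.Quantity{}, false
	}
	free := summary.Allocatable[corev1.ResourceCPU].DeepCopy()
	free.Sub(summary.Allocated[corev1.ResourceCPU])
	return free, true
}

func main() {
	cluster := &clusterv1alpha1.Cluster{} // ResourceSummary is nil here
	if _, ok := freeCPU(cluster); !ok {
		fmt.Println("no resource model available; fall back to static replica assignment")
	}
}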