Merge pull request #2387 from RainbowMango/pr_resource_modeling_control_flag

Introduce `--enable-cluster-resource-modeling` to customize resource modeling feature
karmada-bot 2022-08-17 17:34:43 +08:00 committed by GitHub
commit ccc39b2cf5
6 changed files with 75 additions and 36 deletions
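
In short: the new flag defaults to true, preserving existing behavior. When set to false (e.g. `--enable-cluster-resource-modeling=false` on the agent or the controller manager), the cluster-status controller skips building node/pod informers for member clusters, leaving `NodeSummary` and `ResourceSummary` unset in cluster status.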


@@ -225,6 +225,7 @@ func setupControllers(mgr controllerruntime.Manager, opts *options.Options, stop
ClusterAPIBurst: opts.ClusterAPIBurst,
ConcurrentWorkSyncs: opts.ConcurrentWorkSyncs,
RateLimiterOptions: opts.RateLimiterOpts,
EnableClusterResourceModeling: opts.EnableClusterResourceModeling,
},
StopChan: stopChan,
ResourceInterpreter: resourceInterpreter,
@@ -262,6 +263,7 @@ func startClusterStatusController(ctx controllerscontext.Context) (bool, error)
ClusterFailureThreshold: ctx.Opts.ClusterFailureThreshold,
ClusterCacheSyncTimeout: ctx.Opts.ClusterCacheSyncTimeout,
RateLimiterOptions: ctx.Opts.RateLimiterOptions,
EnableClusterResourceModeling: ctx.Opts.EnableClusterResourceModeling,
}
if err := clusterStatusController.SetupWithManager(ctx.Mgr); err != nil {
return false, err


@@ -107,6 +107,12 @@ type Options struct {
// ClusterRegion represents the region the cluster is located in.
ClusterRegion string
// EnableClusterResourceModeling indicates whether to enable cluster resource modeling.
// The resource modeling might be used by the scheduler to make scheduling decisions
// in the scenario of dynamic replica assignment based on cluster free resources.
// Disable it for better performance if it does not fit your cases.
EnableClusterResourceModeling bool
}
// NewOptions builds a default Options.
@@ -174,6 +180,9 @@ func (o *Options) AddFlags(fs *pflag.FlagSet, allControllers []string) {
fs.StringVar(&o.MetricsBindAddress, "metrics-bind-address", ":8080", "The TCP address that the controller should bind to for serving Prometheus metrics (e.g. 127.0.0.1:8088, :8088)")
fs.StringVar(&o.ClusterProvider, "cluster-provider", "", "Provider of the joining cluster. The Karmada scheduler can use this information to spread workloads across providers for higher availability.")
fs.StringVar(&o.ClusterRegion, "cluster-region", "", "The region of the joining cluster. The Karmada scheduler can use this information to spread workloads across regions for higher availability.")
fs.BoolVar(&o.EnableClusterResourceModeling, "enable-cluster-resource-modeling", true, "Enable means the controller would build resource modeling for each cluster by syncing Node and Pod resources.\n"+
"The resource modeling might be used by the scheduler to make scheduling decisions in the scenario of dynamic replica assignment based on cluster free resources.\n"+
"Disable it for better performance if it does not fit your cases.")
o.RateLimiterOpts.AddFlags(fs)
o.ProfileOpts.AddFlags(fs)
}
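
The registration above is the entire user-facing surface of the feature. For context, here is a minimal self-contained sketch of the same pattern (a default-true pflag boolean gating optional work); the names and the demo flag set are illustrative, not taken from the Karmada tree:

package main

import (
	"fmt"

	"github.com/spf13/pflag"
)

// Options mirrors the shape used in the diff: a plain bool carried from the flag set.
type Options struct {
	EnableClusterResourceModeling bool
}

// AddFlags registers the boolean with a default of true, so existing
// deployments keep building resource models unless they explicitly opt out.
func (o *Options) AddFlags(fs *pflag.FlagSet) {
	fs.BoolVar(&o.EnableClusterResourceModeling, "enable-cluster-resource-modeling", true,
		"Build resource models for each cluster by syncing Node and Pod resources; disable for better performance if unneeded.")
}

func main() {
	opts := &Options{}
	fs := pflag.NewFlagSet("demo", pflag.ExitOnError)
	opts.AddFlags(fs)

	// Simulate an operator opting out on the command line.
	_ = fs.Parse([]string{"--enable-cluster-resource-modeling=false"})
	fmt.Println("modeling enabled:", opts.EnableClusterResourceModeling) // modeling enabled: false
}

Defaulting to true keeps the change backward compatible; only operators who care about the informer overhead need to act.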


@@ -273,6 +273,7 @@ func startClusterStatusController(ctx controllerscontext.Context) (enabled bool,
ClusterFailureThreshold: opts.ClusterFailureThreshold,
ClusterCacheSyncTimeout: opts.ClusterCacheSyncTimeout,
RateLimiterOptions: ctx.Opts.RateLimiterOptions,
EnableClusterResourceModeling: ctx.Opts.EnableClusterResourceModeling,
}
if err := clusterStatusController.SetupWithManager(mgr); err != nil {
return false, err
@@ -562,6 +563,7 @@ func setupControllers(mgr controllerruntime.Manager, opts *options.Options, stop
EnableTaintManager: opts.EnableTaintManager,
RateLimiterOptions: opts.RateLimiterOpts,
GracefulEvictionTimeout: opts.GracefulEvictionTimeout,
EnableClusterResourceModeling: opts.EnableClusterResourceModeling,
},
StopChan: stopChan,
DynamicClientSet: dynamicClientSet,


@@ -120,6 +120,11 @@ type Options struct {
RateLimiterOpts ratelimiterflag.Options
ProfileOpts profileflag.Options
// EnableClusterResourceModeling indicates whether to enable cluster resource modeling.
// The resource modeling might be used by the scheduler to make scheduling decisions
// in the scenario of dynamic replica assignment based on cluster free resources.
// Disable it for better performance if it does not fit your cases.
EnableClusterResourceModeling bool
}
// NewOptions builds an empty Options.
@@ -198,6 +203,9 @@ func (o *Options) AddFlags(flags *pflag.FlagSet, allControllers, disabledByDefau
flags.IntVar(&o.ConcurrentResourceTemplateSyncs, "concurrent-resource-template-syncs", 5, "The number of resource templates that are allowed to sync concurrently.")
flags.BoolVar(&o.EnableTaintManager, "enable-taint-manager", true, "If set to true, enables NoExecute taints and will evict all non-tolerating objects propagated to clusters tainted with this kind of taint.")
flags.DurationVar(&o.GracefulEvictionTimeout.Duration, "graceful-eviction-timeout", 10*time.Minute, "Specifies the timeout period for the graceful-eviction-controller to perform the final removal after the workload (resource) has been moved to the graceful eviction tasks.")
flags.BoolVar(&o.EnableClusterResourceModeling, "enable-cluster-resource-modeling", true, "Enable means the controller would build resource modeling for each cluster by syncing Node and Pod resources.\n"+
"The resource modeling might be used by the scheduler to make scheduling decisions in the scenario of dynamic replica assignment based on cluster free resources.\n"+
"Disable it for better performance if it does not fit your cases.")
o.RateLimiterOpts.AddFlags(flags)
o.ProfileOpts.AddFlags(flags)


@@ -64,6 +64,11 @@ type Options struct {
// GracefulEvictionTimeout is the timeout period for the graceful-eviction-controller to perform the final
// removal after the workload (resource) has been moved to the graceful eviction tasks.
GracefulEvictionTimeout metav1.Duration
// EnableClusterResourceModeling indicates whether to enable cluster resource modeling.
// The resource modeling might be used by the scheduler to make scheduling decisions
// in the scenario of dynamic replica assignment based on cluster free resources.
// Disable it for better performance if it does not fit your cases.
EnableClusterResourceModeling bool
}
// Context defines the context object for controller.


@@ -91,6 +91,12 @@ type ClusterStatusController struct {
ClusterCacheSyncTimeout metav1.Duration
RateLimiterOptions ratelimiterflag.Options
// EnableClusterResourceModeling indicates whether to enable cluster resource modeling.
// The resource modeling might be used by the scheduler to make scheduling decisions
// in the scenario of dynamic replica assignment based on cluster free resources.
// Disable it for better performance if it does not fit your cases.
EnableClusterResourceModeling bool
}
// Reconcile syncs status of the given member cluster.
@@ -171,15 +177,6 @@ func (c *ClusterStatusController) syncClusterStatus(cluster *clusterv1alpha1.Clu
// skip collecting cluster status if not ready
if online && healthy && readyCondition.Status == metav1.ConditionTrue {
// get or create informer for pods and nodes in member cluster
clusterInformerManager, err := c.buildInformerForCluster(clusterClient)
if err != nil {
klog.Errorf("Failed to get or create informer for Cluster %s. Error: %v.", cluster.GetName(), err)
// in large-scale clusters, a timeout may occur.
// if the clusterInformerManager fails to be built, the error should be returned; otherwise, a nil pointer dereference may follow
return controllerruntime.Result{Requeue: true}, err
}
if cluster.Spec.SyncMode == clusterv1alpha1.Pull {
// init the lease controller for pull mode clusters
c.initLeaseController(cluster)
@@ -189,6 +186,7 @@ func (c *ClusterStatusController) syncClusterStatus(cluster *clusterv1alpha1.Clu
if err != nil {
klog.Errorf("Failed to get Kubernetes version for Cluster %s. Error: %v.", cluster.GetName(), err)
}
currentClusterStatus.KubernetesVersion = clusterVersion
// get the list of APIs installed in the member cluster
apiEnables, err := getAPIEnablements(clusterClient)
@@ -197,21 +195,36 @@ func (c *ClusterStatusController) syncClusterStatus(cluster *clusterv1alpha1.Clu
} else if err != nil {
klog.Warningf("Maybe get partial(%d) APIs installed in Cluster %s. Error: %v.", len(apiEnables), cluster.GetName(), err)
}
nodes, err := listNodes(clusterInformerManager)
if err != nil {
klog.Errorf("Failed to list nodes for Cluster %s. Error: %v.", cluster.GetName(), err)
}
pods, err := listPods(clusterInformerManager)
if err != nil {
klog.Errorf("Failed to list pods for Cluster %s. Error: %v.", cluster.GetName(), err)
}
currentClusterStatus.KubernetesVersion = clusterVersion
currentClusterStatus.APIEnablements = apiEnables
currentClusterStatus.NodeSummary = getNodeSummary(nodes)
currentClusterStatus.ResourceSummary = getResourceSummary(nodes, pods)
// The generic informer manager is actually used by 'execution-controller' and 'work-status-controller'.
// TODO(@RainbowMango): We should follow the principle that whoever uses it takes responsibility for initializing it.
// We should move this logic to both `execution-controller` and `work-status-controller`.
// After that, the 'initializeGenericInformerManagerForCluster' function as well as 'c.GenericInformerManager'
// can be safely removed from the current controller.
c.initializeGenericInformerManagerForCluster(clusterClient)
if c.EnableClusterResourceModeling {
// get or create informer for pods and nodes in member cluster
clusterInformerManager, err := c.buildInformerForCluster(clusterClient)
if err != nil {
klog.Errorf("Failed to get or create informer for Cluster %s. Error: %v.", cluster.GetName(), err)
// in large-scale clusters, a timeout may occur.
// if the clusterInformerManager fails to be built, the error should be returned; otherwise, a nil pointer dereference may follow
return controllerruntime.Result{Requeue: true}, err
}
nodes, err := listNodes(clusterInformerManager)
if err != nil {
klog.Errorf("Failed to list nodes for Cluster %s. Error: %v.", cluster.GetName(), err)
}
pods, err := listPods(clusterInformerManager)
if err != nil {
klog.Errorf("Failed to list pods for Cluster %s. Error: %v.", cluster.GetName(), err)
}
currentClusterStatus.NodeSummary = getNodeSummary(nodes)
currentClusterStatus.ResourceSummary = getResourceSummary(nodes, pods)
}
}
setTransitionTime(currentClusterStatus.Conditions, readyCondition)
@@ -256,22 +269,22 @@ func (c *ClusterStatusController) updateStatusIfNeeded(cluster *clusterv1alpha1.
return controllerruntime.Result{RequeueAfter: c.ClusterStatusUpdateFrequency.Duration}, nil
}
func (c *ClusterStatusController) initializeGenericInformerManagerForCluster(clusterClient *util.ClusterClient) {
if c.GenericInformerManager.IsManagerExist(clusterClient.ClusterName) {
return
}
dynamicClient, err := c.ClusterDynamicClientSetFunc(clusterClient.ClusterName, c.Client)
if err != nil {
klog.Errorf("Failed to build dynamic cluster client for cluster %s.", clusterClient.ClusterName)
return
}
c.GenericInformerManager.ForCluster(clusterClient.ClusterName, dynamicClient.DynamicClientSet, 0)
}
// buildInformerForCluster builds an informer manager for the cluster if one doesn't exist, then constructs informers
// for nodes and pods and starts them. If the informer manager already exists, it is returned.
func (c *ClusterStatusController) buildInformerForCluster(clusterClient *util.ClusterClient) (typedmanager.SingleClusterInformerManager, error) {
// the cluster-status-controller initializes the generic informer manager,
// mainly because when the member cluster is joined, the dynamic informer required by the member cluster
// to distribute resources is prepared in advance;
// in that case, the execution-controller can distribute resources without waiting.
if c.GenericInformerManager.GetSingleClusterManager(clusterClient.ClusterName) == nil {
dynamicClient, err := c.ClusterDynamicClientSetFunc(clusterClient.ClusterName, c.Client)
if err != nil {
klog.Errorf("Failed to build dynamic cluster client for cluster %s.", clusterClient.ClusterName)
return nil, err
}
c.GenericInformerManager.ForCluster(clusterClient.ClusterName, dynamicClient.DynamicClientSet, 0)
}
singleClusterInformerManager := c.TypedInformerManager.GetSingleClusterManager(clusterClient.ClusterName)
if singleClusterInformerManager == nil {
singleClusterInformerManager = c.TypedInformerManager.ForCluster(clusterClient.ClusterName, clusterClient.KubeClient, 0)
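
A consequence of the gating introduced above: with `--enable-cluster-resource-modeling=false`, `syncClusterStatus` never populates `NodeSummary` or `ResourceSummary`, so consumers of cluster status must tolerate a nil summary. A minimal sketch of such a consumer, assuming the cluster API's `ResourceSummary` exposes `Allocatable`/`Allocated` resource lists (the `freeCPU` helper itself is hypothetical, not part of this PR):

package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"

	clusterv1alpha1 "github.com/karmada-io/karmada/pkg/apis/cluster/v1alpha1"
)

// freeCPU reports the cluster's unallocated CPU, or ok=false when no
// resource model is available (modeling disabled, or status not yet synced).
func freeCPU(cluster *clusterv1alpha1.Cluster) (resource.Quantity, bool) {
	summary := cluster.Status.ResourceSummary
	if summary == nil {
		// With --enable-cluster-resource-modeling=false the controller never sets this.
		return resource.Quantity{}, false
	}
	free := summary.Allocatable[corev1.ResourceCPU].DeepCopy()
	free.Sub(summary.Allocated[corev1.ResourceCPU])
	return free, true
}

func main() {
	cluster := &clusterv1alpha1.Cluster{} // ResourceSummary is nil here
	if _, ok := freeCPU(cluster); !ok {
		fmt.Println("no resource model available; fall back to static replica assignment")
	}
}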