From b3ca4c2ee8c8be53f94fad0dd28dafcfba2169c5 Mon Sep 17 00:00:00 2001
From: jingxueli <jingxueli@trip.com>
Date: Sun, 22 May 2022 02:47:26 +0800
Subject: [PATCH 1/2] make buildInformerForCluster configurable

Signed-off-by: jingxueli <jingxueli@trip.com>
---
 cmd/agent/app/agent.go                              | 1 +
 cmd/agent/app/options/options.go                    | 5 +++++
 cmd/controller-manager/app/controllermanager.go     | 2 ++
 cmd/controller-manager/app/options/options.go       | 5 +++++
 pkg/controllers/context/context.go                  | 2 ++
 pkg/controllers/status/cluster_status_controller.go | 4 +++-
 6 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/cmd/agent/app/agent.go b/cmd/agent/app/agent.go
index 8f32ed3ab..b875c6bd8 100644
--- a/cmd/agent/app/agent.go
+++ b/cmd/agent/app/agent.go
@@ -262,6 +262,7 @@ func startClusterStatusController(ctx controllerscontext.Context) (bool, error)
 		ClusterFailureThreshold:           ctx.Opts.ClusterFailureThreshold,
 		ClusterCacheSyncTimeout:           ctx.Opts.ClusterCacheSyncTimeout,
 		RateLimiterOptions:                ctx.Opts.RateLimiterOptions,
+		EnableClusterResourceModeling:     ctx.Opts.EnableClusterResourceModeling,
 	}
 	if err := clusterStatusController.SetupWithManager(ctx.Mgr); err != nil {
 		return false, err
diff --git a/cmd/agent/app/options/options.go b/cmd/agent/app/options/options.go
index e8f6250e3..bcee5cefd 100644
--- a/cmd/agent/app/options/options.go
+++ b/cmd/agent/app/options/options.go
@@ -107,6 +107,8 @@ type Options struct {
 
 	// ClusterRegion represents the region of the cluster locate in.
 	ClusterRegion string
+
+	EnableClusterResourceModeling bool
 }
 
 // NewOptions builds an default scheduler options.
@@ -174,6 +176,9 @@ func (o *Options) AddFlags(fs *pflag.FlagSet, allControllers []string) {
 	fs.StringVar(&o.MetricsBindAddress, "metrics-bind-address", ":8080", "The TCP address that the controller should bind to for serving prometheus metrics(e.g. 127.0.0.1:8088, :8088)")
 	fs.StringVar(&o.ClusterProvider, "cluster-provider", "", "Provider of the joining cluster. The Karmada scheduler can use this information to spread workloads across providers for higher availability.")
 	fs.StringVar(&o.ClusterRegion, "cluster-region", "", "The region of the joining cluster. The Karmada scheduler can use this information to spread workloads across regions for higher availability.")
+	fs.BoolVar(&o.EnableClusterResourceModeling, "enable-cluster-resource-modeling", true, "Enable means controller would build resource modeling for each cluster by syncing Nodes and Pods resources.\n"+
+		"The resource modeling might be used by the scheduler to make scheduling decisions in scenario of dynamic replica assignment based on cluster free resources.\n"+
+		"Disable if it does not fit your cases for better performance.")
 	o.RateLimiterOpts.AddFlags(fs)
 	o.ProfileOpts.AddFlags(fs)
 }
diff --git a/cmd/controller-manager/app/controllermanager.go b/cmd/controller-manager/app/controllermanager.go
index 91f4b8437..a0eb4c3c4 100644
--- a/cmd/controller-manager/app/controllermanager.go
+++ b/cmd/controller-manager/app/controllermanager.go
@@ -273,6 +273,7 @@ func startClusterStatusController(ctx controllerscontext.Context) (enabled bool,
 		ClusterFailureThreshold:           opts.ClusterFailureThreshold,
 		ClusterCacheSyncTimeout:           opts.ClusterCacheSyncTimeout,
 		RateLimiterOptions:                ctx.Opts.RateLimiterOptions,
+		EnableClusterResourceModeling:     ctx.Opts.EnableClusterResourceModeling,
 	}
 	if err := clusterStatusController.SetupWithManager(mgr); err != nil {
 		return false, err
@@ -562,6 +563,7 @@ func setupControllers(mgr controllerruntime.Manager, opts *options.Options, stop
 			EnableTaintManager:                opts.EnableTaintManager,
 			RateLimiterOptions:                opts.RateLimiterOpts,
 			GracefulEvictionTimeout:           opts.GracefulEvictionTimeout,
+			EnableClusterResourceModeling:     opts.EnableClusterResourceModeling,
 		},
 		StopChan:                    stopChan,
 		DynamicClientSet:            dynamicClientSet,
diff --git a/cmd/controller-manager/app/options/options.go b/cmd/controller-manager/app/options/options.go
index aae903946..acd9aab51 100644
--- a/cmd/controller-manager/app/options/options.go
+++ b/cmd/controller-manager/app/options/options.go
@@ -120,6 +120,8 @@ type Options struct {
 
 	RateLimiterOpts ratelimiterflag.Options
 	ProfileOpts     profileflag.Options
+
+	EnableClusterResourceModeling bool
 }
 
 // NewOptions builds an empty options.
@@ -198,6 +200,9 @@ func (o *Options) AddFlags(flags *pflag.FlagSet, allControllers, disabledByDefau
 	flags.IntVar(&o.ConcurrentResourceTemplateSyncs, "concurrent-resource-template-syncs", 5, "The number of resource templates that are allowed to sync concurrently.")
 	flags.BoolVar(&o.EnableTaintManager, "enable-taint-manager", true, "If set to true enables NoExecute Taints and will evict all not-tolerating objects propagating on Clusters tainted with this kind of Taints.")
 	flags.DurationVar(&o.GracefulEvictionTimeout.Duration, "graceful-eviction-timeout", 10*time.Minute, "Specifies the timeout period waiting for the graceful-eviction-controller performs the final removal since the workload(resource) has been moved to the graceful eviction tasks.")
+	flags.BoolVar(&o.EnableClusterResourceModeling, "enable-cluster-resource-modeling", true, "Enable means controller would build resource modeling for each cluster by syncing Nodes and Pods resources.\n"+
+		"The resource modeling might be used by the scheduler to make scheduling decisions in scenario of dynamic replica assignment based on cluster free resources.\n"+
+		"Disable if it does not fit your cases for better performance.")
 
 	o.RateLimiterOpts.AddFlags(flags)
 	o.ProfileOpts.AddFlags(flags)
diff --git a/pkg/controllers/context/context.go b/pkg/controllers/context/context.go
index 8aad10ee2..90d2e83ff 100644
--- a/pkg/controllers/context/context.go
+++ b/pkg/controllers/context/context.go
@@ -64,6 +64,8 @@ type Options struct {
 	// GracefulEvictionTimeout is the timeout period waiting for the grace-eviction-controller performs the final
 	// removal since the workload(resource) has been moved to the graceful eviction tasks.
 	GracefulEvictionTimeout metav1.Duration
+
+	EnableClusterResourceModeling bool
 }
 
 // Context defines the context object for controller.
diff --git a/pkg/controllers/status/cluster_status_controller.go b/pkg/controllers/status/cluster_status_controller.go
index dc580f9ae..b6395f450 100644
--- a/pkg/controllers/status/cluster_status_controller.go
+++ b/pkg/controllers/status/cluster_status_controller.go
@@ -91,6 +91,8 @@ type ClusterStatusController struct {
 
 	ClusterCacheSyncTimeout metav1.Duration
 	RateLimiterOptions      ratelimiterflag.Options
+
+	EnableClusterResourceModeling bool
 }
 
 // Reconcile syncs status of the given member cluster.
@@ -170,7 +172,7 @@ func (c *ClusterStatusController) syncClusterStatus(cluster *clusterv1alpha1.Clu
 	}
 
 	// skip collecting cluster status if not ready
-	if online && healthy && readyCondition.Status == metav1.ConditionTrue {
+	if online && healthy && readyCondition.Status == metav1.ConditionTrue && c.EnableClusterResourceModeling {
 		// get or create informer for pods and nodes in member cluster
 		clusterInformerManager, err := c.buildInformerForCluster(clusterClient)
 		if err != nil {

From 95036ffcdabbb7b164b25978de35ccc110cf17a7 Mon Sep 17 00:00:00 2001
From: RainbowMango <qdurenhongcai@gmail.com>
Date: Tue, 16 Aug 2022 21:13:48 +0800
Subject: [PATCH 2/2] enable node pod list only when needed for better
 performance

Signed-off-by: RainbowMango <qdurenhongcai@gmail.com>
---
 cmd/agent/app/agent.go                        |  1 +
 cmd/agent/app/options/options.go              |  4 +
 cmd/controller-manager/app/options/options.go |  5 +-
 pkg/controllers/context/context.go            |  5 +-
 .../status/cluster_status_controller.go       | 85 +++++++++++--------
 5 files changed, 61 insertions(+), 39 deletions(-)

diff --git a/cmd/agent/app/agent.go b/cmd/agent/app/agent.go
index b875c6bd8..40b54ed38 100644
--- a/cmd/agent/app/agent.go
+++ b/cmd/agent/app/agent.go
@@ -225,6 +225,7 @@ func setupControllers(mgr controllerruntime.Manager, opts *options.Options, stop
 			ClusterAPIBurst:                   opts.ClusterAPIBurst,
 			ConcurrentWorkSyncs:               opts.ConcurrentWorkSyncs,
 			RateLimiterOptions:                opts.RateLimiterOpts,
+			EnableClusterResourceModeling:     opts.EnableClusterResourceModeling,
 		},
 		StopChan:            stopChan,
 		ResourceInterpreter: resourceInterpreter,
diff --git a/cmd/agent/app/options/options.go b/cmd/agent/app/options/options.go
index bcee5cefd..b9ffb891a 100644
--- a/cmd/agent/app/options/options.go
+++ b/cmd/agent/app/options/options.go
@@ -108,6 +108,10 @@ type Options struct {
 	// ClusterRegion represents the region of the cluster locate in.
 	ClusterRegion string
 
+	// EnableClusterResourceModeling indicates if enable cluster resource modeling.
+	// The resource modeling might be used by the scheduler to make scheduling decisions
+	// in scenario of dynamic replica assignment based on cluster free resources.
+	// Disable if it does not fit your cases for better performance.
 	EnableClusterResourceModeling bool
 }
 
diff --git a/cmd/controller-manager/app/options/options.go b/cmd/controller-manager/app/options/options.go
index acd9aab51..a29b218a4 100644
--- a/cmd/controller-manager/app/options/options.go
+++ b/cmd/controller-manager/app/options/options.go
@@ -120,7 +120,10 @@ type Options struct {
 
 	RateLimiterOpts ratelimiterflag.Options
 	ProfileOpts     profileflag.Options
-
+	// EnableClusterResourceModeling indicates if enable cluster resource modeling.
+	// The resource modeling might be used by the scheduler to make scheduling decisions
+	// in scenario of dynamic replica assignment based on cluster free resources.
+	// Disable if it does not fit your cases for better performance.
 	EnableClusterResourceModeling bool
 }
 
diff --git a/pkg/controllers/context/context.go b/pkg/controllers/context/context.go
index 90d2e83ff..52448e0c1 100644
--- a/pkg/controllers/context/context.go
+++ b/pkg/controllers/context/context.go
@@ -64,7 +64,10 @@ type Options struct {
 	// GracefulEvictionTimeout is the timeout period waiting for the grace-eviction-controller performs the final
 	// removal since the workload(resource) has been moved to the graceful eviction tasks.
 	GracefulEvictionTimeout metav1.Duration
-
+	// EnableClusterResourceModeling indicates if enable cluster resource modeling.
+	// The resource modeling might be used by the scheduler to make scheduling decisions
+	// in scenario of dynamic replica assignment based on cluster free resources.
+	// Disable if it does not fit your cases for better performance.
 	EnableClusterResourceModeling bool
 }
 
diff --git a/pkg/controllers/status/cluster_status_controller.go b/pkg/controllers/status/cluster_status_controller.go
index b6395f450..79a797c15 100644
--- a/pkg/controllers/status/cluster_status_controller.go
+++ b/pkg/controllers/status/cluster_status_controller.go
@@ -92,6 +92,10 @@ type ClusterStatusController struct {
 	ClusterCacheSyncTimeout metav1.Duration
 	RateLimiterOptions      ratelimiterflag.Options
 
+	// EnableClusterResourceModeling indicates if enable cluster resource modeling.
+	// The resource modeling might be used by the scheduler to make scheduling decisions
+	// in scenario of dynamic replica assignment based on cluster free resources.
+	// Disable if it does not fit your cases for better performance.
 	EnableClusterResourceModeling bool
 }
 
@@ -172,16 +176,7 @@ func (c *ClusterStatusController) syncClusterStatus(cluster *clusterv1alpha1.Clu
 	}
 
 	// skip collecting cluster status if not ready
-	if online && healthy && readyCondition.Status == metav1.ConditionTrue && c.EnableClusterResourceModeling {
-		// get or create informer for pods and nodes in member cluster
-		clusterInformerManager, err := c.buildInformerForCluster(clusterClient)
-		if err != nil {
-			klog.Errorf("Failed to get or create informer for Cluster %s. Error: %v.", cluster.GetName(), err)
-			// in large-scale clusters, the timeout may occur.
-			// if clusterInformerManager fails to be built, should be returned, otherwise, it may cause a nil pointer
-			return controllerruntime.Result{Requeue: true}, err
-		}
-
+	if online && healthy && readyCondition.Status == metav1.ConditionTrue {
 		if cluster.Spec.SyncMode == clusterv1alpha1.Pull {
 			// init the lease controller for pull mode clusters
 			c.initLeaseController(cluster)
@@ -191,6 +186,7 @@ func (c *ClusterStatusController) syncClusterStatus(cluster *clusterv1alpha1.Clu
 		if err != nil {
 			klog.Errorf("Failed to get Kubernetes version for Cluster %s. Error: %v.", cluster.GetName(), err)
 		}
+		currentClusterStatus.KubernetesVersion = clusterVersion
 
 		// get the list of APIs installed in the member cluster
 		apiEnables, err := getAPIEnablements(clusterClient)
@@ -199,21 +195,36 @@ func (c *ClusterStatusController) syncClusterStatus(cluster *clusterv1alpha1.Clu
 		} else if err != nil {
 			klog.Warningf("Maybe get partial(%d) APIs installed in Cluster %s. Error: %v.", len(apiEnables), cluster.GetName(), err)
 		}
-
-		nodes, err := listNodes(clusterInformerManager)
-		if err != nil {
-			klog.Errorf("Failed to list nodes for Cluster %s. Error: %v.", cluster.GetName(), err)
-		}
-
-		pods, err := listPods(clusterInformerManager)
-		if err != nil {
-			klog.Errorf("Failed to list pods for Cluster %s. Error: %v.", cluster.GetName(), err)
-		}
-
-		currentClusterStatus.KubernetesVersion = clusterVersion
 		currentClusterStatus.APIEnablements = apiEnables
-		currentClusterStatus.NodeSummary = getNodeSummary(nodes)
-		currentClusterStatus.ResourceSummary = getResourceSummary(nodes, pods)
+
+		// The generic informer manager actually used by 'execution-controller' and 'work-status-controller'.
+		// TODO(@RainbowMango): We should follow who-use who takes the responsibility to initialize it.
+		// 	We should move this logic to both `execution-controller` and `work-status-controller`.
+		//  After that the 'initializeGenericInformerManagerForCluster' function as well as 'c.GenericInformerManager'
+		//  can be safely removed from current controller.
+		c.initializeGenericInformerManagerForCluster(clusterClient)
+
+		if c.EnableClusterResourceModeling {
+			// get or create informer for pods and nodes in member cluster
+			clusterInformerManager, err := c.buildInformerForCluster(clusterClient)
+			if err != nil {
+				klog.Errorf("Failed to get or create informer for Cluster %s. Error: %v.", cluster.GetName(), err)
+				// in large-scale clusters, the timeout may occur.
+				// if clusterInformerManager fails to be built, should be returned, otherwise, it may cause a nil pointer
+				return controllerruntime.Result{Requeue: true}, err
+			}
+			nodes, err := listNodes(clusterInformerManager)
+			if err != nil {
+				klog.Errorf("Failed to list nodes for Cluster %s. Error: %v.", cluster.GetName(), err)
+			}
+
+			pods, err := listPods(clusterInformerManager)
+			if err != nil {
+				klog.Errorf("Failed to list pods for Cluster %s. Error: %v.", cluster.GetName(), err)
+			}
+			currentClusterStatus.NodeSummary = getNodeSummary(nodes)
+			currentClusterStatus.ResourceSummary = getResourceSummary(nodes, pods)
+		}
 	}
 
 	setTransitionTime(currentClusterStatus.Conditions, readyCondition)
@@ -258,22 +269,22 @@ func (c *ClusterStatusController) updateStatusIfNeeded(cluster *clusterv1alpha1.
 	return controllerruntime.Result{RequeueAfter: c.ClusterStatusUpdateFrequency.Duration}, nil
 }
 
+func (c *ClusterStatusController) initializeGenericInformerManagerForCluster(clusterClient *util.ClusterClient) {
+	if c.GenericInformerManager.IsManagerExist(clusterClient.ClusterName) {
+		return
+	}
+
+	dynamicClient, err := c.ClusterDynamicClientSetFunc(clusterClient.ClusterName, c.Client)
+	if err != nil {
+		klog.Errorf("Failed to build dynamic cluster client for cluster %s.", clusterClient.ClusterName)
+		return
+	}
+	c.GenericInformerManager.ForCluster(clusterClient.ClusterName, dynamicClient.DynamicClientSet, 0)
+}
+
 // buildInformerForCluster builds informer manager for cluster if it doesn't exist, then constructs informers for node
 // and pod and start it. If the informer manager exist, return it.
 func (c *ClusterStatusController) buildInformerForCluster(clusterClient *util.ClusterClient) (typedmanager.SingleClusterInformerManager, error) {
-	// cluster-status-controller will initialize the generic informer manager
-	// mainly because when the member cluster is joined, the dynamic informer required by the member cluster
-	// to distribute resources is prepared in advance
-	// in that case execution-controller can distribute resources without waiting.
-	if c.GenericInformerManager.GetSingleClusterManager(clusterClient.ClusterName) == nil {
-		dynamicClient, err := c.ClusterDynamicClientSetFunc(clusterClient.ClusterName, c.Client)
-		if err != nil {
-			klog.Errorf("Failed to build dynamic cluster client for cluster %s.", clusterClient.ClusterName)
-			return nil, err
-		}
-		c.GenericInformerManager.ForCluster(clusterClient.ClusterName, dynamicClient.DynamicClientSet, 0)
-	}
-
 	singleClusterInformerManager := c.TypedInformerManager.GetSingleClusterManager(clusterClient.ClusterName)
 	if singleClusterInformerManager == nil {
 		singleClusterInformerManager = c.TypedInformerManager.ForCluster(clusterClient.ClusterName, clusterClient.KubeClient, 0)