add a flag to allow event duplication
This change introduces a new command line flag, `--record-duplicated-events`, which lets a user record every event by bypassing the 5 minute event de-duplication window.
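Mechanically, the change boils down to choosing between the two event broadcaster constructors in client-go's `record` package. The snippet below is a condensed sketch of that idea, not the literal code (the real implementation is in the `CreateEventRecorder` hunk further down):

```go
package main

import (
	kube_record "k8s.io/client-go/tools/record"
)

// newBroadcaster sketches the core of the change: with the flag set, events are
// fed through a plain broadcaster, so nothing is aggregated or rate limited;
// without it, a correlating broadcaster de-duplicates similar events.
func newBroadcaster(recordDuplicatedEvents bool) kube_record.EventBroadcaster {
	if recordDuplicatedEvents {
		return kube_record.NewBroadcaster()
	}
	// The real code passes the autoscaler's own correlator options here; an
	// empty CorrelatorOptions struct simply falls back to client-go defaults.
	return kube_record.NewBroadcasterWithCorrelatorOptions(kube_record.CorrelatorOptions{})
}

func main() {
	_ = newBroadcaster(true)
}
```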
This commit is contained in:
parent 4a97d16399
commit 8c27f76933
@@ -28,6 +28,7 @@ this document:
 * [How to?](#how-to)
   * [I'm running cluster with nodes in multiple zones for HA purposes. Is that supported by Cluster Autoscaler?](#im-running-cluster-with-nodes-in-multiple-zones-for-ha-purposes-is-that-supported-by-cluster-autoscaler)
   * [How can I monitor Cluster Autoscaler?](#how-can-i-monitor-cluster-autoscaler)
+  * [How can I see all the events from Cluster Autoscaler?](#how-can-i-see-all-events-from-cluster-autoscaler)
   * [How can I scale my cluster to just 1 node?](#how-can-i-scale-my-cluster-to-just-1-node)
   * [How can I scale a node group to 0?](#how-can-i-scale-a-node-group-to-0)
   * [How can I prevent Cluster Autoscaler from scaling down a particular node?](#how-can-i-prevent-cluster-autoscaler-from-scaling-down-a-particular-node)
@@ -267,6 +268,16 @@ respectively under `/metrics` and `/health-check`.
 Metrics are provided in Prometheus format and their detailed description is
 available [here](https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/proposals/metrics.md).

+### How can I see all events from Cluster Autoscaler?
+
+By default, the Cluster Autoscaler will deduplicate similar events that occur within a 5 minute
+window. This is done to improve scalability performance where many similar events might be
+triggered in a short timespan, such as when there are too many unscheduled pods.
+
+In some cases, such as for debugging or when scalability of events is not an issue, you might
+want to see all the events coming from the Cluster Autoscaler. In these scenarios you should
+use the `--record-duplicated-events` command line flag.
+
 ### How can I scale my cluster to just 1 node?

 Prior to version 0.6, Cluster Autoscaler was not touching nodes that were running important
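To make the new FAQ entry concrete, here is a hedged illustration of what repeated events look like with and without the flag. Names such as `recorder`, the node name, and the event reason are hypothetical and not taken from this commit, and the sketch does not start a sink, so nothing is actually sent to a cluster:

```go
package main

import (
	"time"

	apiv1 "k8s.io/api/core/v1"
	"k8s.io/client-go/kubernetes/scheme"
	kube_record "k8s.io/client-go/tools/record"
)

func main() {
	// This sketch uses the plain broadcaster, i.e. the --record-duplicated-events
	// behaviour: every Event() call below would surface as its own recorded event.
	// With the default correlating broadcaster, client-go's EventCorrelator would
	// instead fold or drop the repeats that land inside the de-duplication window.
	// (In the real autoscaler the broadcaster is also started against the
	// cluster's event sink; that wiring is omitted here.)
	broadcaster := kube_record.NewBroadcaster()
	recorder := broadcaster.NewRecorder(scheme.Scheme, apiv1.EventSource{Component: "cluster-autoscaler"})

	node := &apiv1.Node{}
	node.Name = "example-node"
	for i := 0; i < 3; i++ {
		recorder.Event(node, apiv1.EventTypeNormal, "ScaleDownFailed", "example message repeated in a short timespan")
		time.Sleep(time.Second)
	}
}
```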
@@ -760,6 +771,7 @@ The following startup parameters are supported for cluster autoscaler:
 | `daemonset-eviction-for-occupied-nodes` | Whether DaemonSet pods will be gracefully terminated from non-empty nodes | true
 | `feature-gates` | A set of key=value pairs that describe feature gates for alpha/experimental features. | ""
 | `cordon-node-before-terminating` | Should CA cordon nodes before terminating during downscale process | false
+| `record-duplicated-events` | Enable the autoscaler to print duplicated events within a 5 minute window. | false

 # Troubleshooting:

@@ -183,4 +183,6 @@ type AutoscalingOptions struct {
 	MaxDrainParallelism int
 	// GceExpanderEphemeralStorageSupport is whether scale-up takes ephemeral storage resources into account.
 	GceExpanderEphemeralStorageSupport bool
+	// RecordDuplicatedEvents controls whether events should be duplicated within a 5 minute window.
+	RecordDuplicatedEvents bool
 }
@@ -115,7 +115,7 @@ func NewAutoscalingContext(
 func NewAutoscalingKubeClients(opts config.AutoscalingOptions, kubeClient, eventsKubeClient kube_client.Interface) *AutoscalingKubeClients {
 	listerRegistryStopChannel := make(chan struct{})
 	listerRegistry := kube_util.NewListerRegistryWithDefaultListers(kubeClient, listerRegistryStopChannel)
-	kubeEventRecorder := kube_util.CreateEventRecorder(eventsKubeClient)
+	kubeEventRecorder := kube_util.CreateEventRecorder(eventsKubeClient, opts.RecordDuplicatedEvents)
 	logRecorder, err := utils.NewStatusMapRecorder(kubeClient, opts.ConfigNamespace, kubeEventRecorder, opts.WriteStatusConfigMap, opts.StatusConfigMapName)
 	if err != nil {
 		klog.Error("Failed to initialize status configmap, unable to write status events")
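With the wiring above, callers only need to set the option on `AutoscalingOptions`. A minimal, hedged usage sketch follows; the import paths are assumed from the repository layout, and fake clientsets are used purely to keep the example self-contained (a real deployment builds its clients from a kubeconfig):

```go
package main

import (
	"k8s.io/autoscaler/cluster-autoscaler/config"
	ca_context "k8s.io/autoscaler/cluster-autoscaler/context"
	"k8s.io/client-go/kubernetes/fake"
)

func main() {
	// Hedged sketch: the new option travels inside AutoscalingOptions and flips
	// the event recorder that NewAutoscalingKubeClients builds.
	kubeClient := fake.NewSimpleClientset()
	eventsKubeClient := fake.NewSimpleClientset()

	opts := config.AutoscalingOptions{
		RecordDuplicatedEvents: true, // what --record-duplicated-events sets
	}
	clients := ca_context.NewAutoscalingKubeClients(opts, kubeClient, eventsKubeClient)
	_ = clients
}
```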
@@ -198,10 +198,10 @@ var (
 		"maxNodeGroupBackoffDuration is the maximum backoff duration for a NodeGroup after new nodes failed to start.")
 	nodeGroupBackoffResetTimeout = flag.Duration("node-group-backoff-reset-timeout", 3*time.Hour,
 		"nodeGroupBackoffResetTimeout is the time after last failed scale-up when the backoff duration is reset.")

 	maxScaleDownParallelismFlag = flag.Int("max-scale-down-parallelism", 10, "Maximum number of nodes (both empty and needing drain) that can be deleted in parallel.")
 	maxDrainParallelismFlag = flag.Int("max-drain-parallelism", 1, "Maximum number of nodes needing drain, that can be drained and deleted in parallel.")
 	gceExpanderEphemeralStorageSupport = flag.Bool("gce-expander-ephemeral-storage-support", false, "Whether scale-up takes ephemeral storage resources into account for GCE cloud provider")
+	recordDuplicatedEvents = flag.Bool("record-duplicated-events", false, "enable duplication of similar events within a 5 minute window.")
 )

 func createAutoscalingOptions() config.AutoscalingOptions {
@@ -288,6 +288,7 @@ func createAutoscalingOptions() config.AutoscalingOptions {
 		MaxScaleDownParallelism: *maxScaleDownParallelismFlag,
 		MaxDrainParallelism: *maxDrainParallelismFlag,
 		GceExpanderEphemeralStorageSupport: *gceExpanderEphemeralStorageSupport,
+		RecordDuplicatedEvents: *recordDuplicatedEvents,
 	}
 }

@@ -475,7 +476,7 @@ func main() {
 		kubeClient.CoordinationV1(),
 		resourcelock.ResourceLockConfig{
 			Identity: id,
-			EventRecorder: kube_util.CreateEventRecorder(kubeClient),
+			EventRecorder: kube_util.CreateEventRecorder(kubeClient, *recordDuplicatedEvents),
 		},
 	)
 	if err != nil {
@@ -200,7 +200,7 @@ func TestCleanAllToBeDeleted(t *testing.T) {
 	n2.Spec.Taints = []apiv1.Taint{{Key: ToBeDeletedTaint, Value: strconv.FormatInt(time.Now().Unix()-301, 10)}}

 	fakeClient := buildFakeClient(t, n1, n2)
-	fakeRecorder := kube_util.CreateEventRecorder(fakeClient)
+	fakeRecorder := kube_util.CreateEventRecorder(fakeClient, false)

 	assert.Equal(t, 1, len(getNode(t, fakeClient, "n2").Spec.Taints))

@@ -216,7 +216,7 @@ func TestCleanAllDeletionCandidates(t *testing.T) {
 	n2.Spec.Taints = []apiv1.Taint{{Key: DeletionCandidateTaint, Value: strconv.FormatInt(time.Now().Unix()-301, 10)}}

 	fakeClient := buildFakeClient(t, n1, n2)
-	fakeRecorder := kube_util.CreateEventRecorder(fakeClient)
+	fakeRecorder := kube_util.CreateEventRecorder(fakeClient, false)

 	assert.Equal(t, 1, len(getNode(t, fakeClient, "n2").Spec.Taints))

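The call-site updates above keep the existing tests on the default, de-duplicating path. A hedged sketch of an additional test that exercises both values of the new argument is shown below; the test name and assertions are illustrative and not part of this commit, and it assumes the test lives in the same package as `CreateEventRecorder`:

```go
// Hypothetical test sketch: CreateEventRecorder should return a usable recorder
// for both values of the new flag when given a fake clientset.
package kubernetes

import (
	"testing"

	"k8s.io/client-go/kubernetes/fake"
)

func TestCreateEventRecorderWithDuplicatedEvents(t *testing.T) {
	fakeClient := fake.NewSimpleClientset()

	for _, recordDuplicatedEvents := range []bool{false, true} {
		recorder := CreateEventRecorder(fakeClient, recordDuplicatedEvents)
		if recorder == nil {
			t.Errorf("expected a non-nil recorder for recordDuplicatedEvents=%v", recordDuplicatedEvents)
		}
	}
}
```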
@@ -39,8 +39,13 @@ const (
 )

 // CreateEventRecorder creates an event recorder to send custom events to Kubernetes to be recorded for targeted Kubernetes objects
-func CreateEventRecorder(kubeClient clientset.Interface) kube_record.EventRecorder {
-	eventBroadcaster := kube_record.NewBroadcasterWithCorrelatorOptions(getCorrelationOptions())
+func CreateEventRecorder(kubeClient clientset.Interface, recordDuplicatedEvents bool) kube_record.EventRecorder {
+	var eventBroadcaster kube_record.EventBroadcaster
+	if recordDuplicatedEvents {
+		eventBroadcaster = kube_record.NewBroadcaster()
+	} else {
+		eventBroadcaster = kube_record.NewBroadcasterWithCorrelatorOptions(getCorrelationOptions())
+	}
 	if _, isfake := kubeClient.(*fake.Clientset); !isfake {
 		actualSink := &v1core.EventSinkImpl{Interface: v1core.New(kubeClient.CoreV1().RESTClient()).Events("")}
 		// EventBroadcaster has a StartLogging() method but the throttling options from getCorrelationOptions() get applied only to
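For reference, the roughly 5 minute window quoted in the FAQ comes from the correlator options that the non-duplicating path keeps using. `getCorrelationOptions()` is not part of this diff; the sketch below is an assumption about the kind of configuration it returns (a spam-filter refill rate of about one event per key per 5 minutes), not its literal contents:

```go
package main

import (
	kube_record "k8s.io/client-go/tools/record"
)

// Assumed sketch of correlator options that yield a ~5 minute de-duplication
// window: the spam filter refills at one token per 300 seconds per event key,
// with a burst of 1, so repeats of the same event inside the window are dropped.
func exampleCorrelationOptions() kube_record.CorrelatorOptions {
	return kube_record.CorrelatorOptions{
		QPS:       1.0 / 300.0, // assumption: one event per key per 5 minutes
		BurstSize: 1,           // assumption: no extra burst before throttling
	}
}

func main() {
	_ = kube_record.NewBroadcasterWithCorrelatorOptions(exampleCorrelationOptions())
}
```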