add a flag to allow event duplication

This change introduces a new command-line flag,
`--record-duplicated-events`, which allows a user to bypass the
5 minute de-duplication window and record duplicated events.
Michael McCune 2022-05-26 15:16:30 -04:00
parent 4a97d16399
commit 8c27f76933
6 changed files with 27 additions and 7 deletions


@@ -28,6 +28,7 @@ this document:
* [How to?](#how-to)
* [I'm running cluster with nodes in multiple zones for HA purposes. Is that supported by Cluster Autoscaler?](#im-running-cluster-with-nodes-in-multiple-zones-for-ha-purposes-is-that-supported-by-cluster-autoscaler)
* [How can I monitor Cluster Autoscaler?](#how-can-i-monitor-cluster-autoscaler)
+* [How can I see all events from Cluster Autoscaler?](#how-can-i-see-all-events-from-cluster-autoscaler)
* [How can I scale my cluster to just 1 node?](#how-can-i-scale-my-cluster-to-just-1-node)
* [How can I scale a node group to 0?](#how-can-i-scale-a-node-group-to-0)
* [How can I prevent Cluster Autoscaler from scaling down a particular node?](#how-can-i-prevent-cluster-autoscaler-from-scaling-down-a-particular-node)
@@ -267,6 +268,16 @@ respectively under `/metrics` and `/health-check`.
Metrics are provided in Prometheus format and their detailed description is
available [here](https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/proposals/metrics.md).
+### How can I see all events from Cluster Autoscaler?
+By default, the Cluster Autoscaler will deduplicate similar events that occur within a 5 minute
+window. This is done to improve scalability when many similar events are triggered in a
+short timespan, such as when there are too many unscheduled pods.
+In some cases, such as when debugging or when event volume is not a concern, you might
+want to see all the events coming from the Cluster Autoscaler. In these scenarios you should
+use the `--record-duplicated-events` command-line flag.
### How can I scale my cluster to just 1 node?
Prior to version 0.6, Cluster Autoscaler was not touching nodes that were running important
@@ -760,6 +771,7 @@ The following startup parameters are supported for cluster autoscaler:
| `daemonset-eviction-for-occupied-nodes` | Whether DaemonSet pods will be gracefully terminated from non-empty nodes | true
| `feature-gates` | A set of key=value pairs that describe feature gates for alpha/experimental features. | ""
| `cordon-node-before-terminating` | Should CA cordon nodes before terminating during downscale process | false
+| `record-duplicated-events` | Enable the autoscaler to record duplicated events within a 5 minute window. | false
# Troubleshooting:
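
For illustration, the sketch below shows what the new flag effectively toggles. It is a simplified, hypothetical example built on client-go's `record` package, not Cluster Autoscaler's actual code (that lives in `CreateEventRecorder`, shown in the last file of this commit), and the correlator option values are assumptions rather than the autoscaler's real settings.

```go
// Simplified sketch of the behaviour behind --record-duplicated-events.
// The option values below are illustrative only; Cluster Autoscaler takes
// its real settings from getCorrelationOptions(), which is not part of this diff.
package sketch

import (
	kube_record "k8s.io/client-go/tools/record"
)

func newEventBroadcaster(recordDuplicatedEvents bool) kube_record.EventBroadcaster {
	if recordDuplicatedEvents {
		// No correlation: every event, including duplicates, is forwarded as-is.
		return kube_record.NewBroadcaster()
	}
	// Default: similar events within the correlation window are folded into a
	// single event whose count is incremented instead of being re-emitted.
	return kube_record.NewBroadcasterWithCorrelatorOptions(kube_record.CorrelatorOptions{
		MaxEvents:            1,   // assumed value for illustration
		MaxIntervalInSeconds: 300, // assumed: matches the 5 minute window described above
	})
}
```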


@@ -183,4 +183,6 @@ type AutoscalingOptions struct {
	MaxDrainParallelism int
	// GceExpanderEphemeralStorageSupport is whether scale-up takes ephemeral storage resources into account.
	GceExpanderEphemeralStorageSupport bool
+	// RecordDuplicatedEvents controls whether events should be duplicated within a 5 minute window.
+	RecordDuplicatedEvents bool
}


@@ -115,7 +115,7 @@ func NewAutoscalingContext(
func NewAutoscalingKubeClients(opts config.AutoscalingOptions, kubeClient, eventsKubeClient kube_client.Interface) *AutoscalingKubeClients {
	listerRegistryStopChannel := make(chan struct{})
	listerRegistry := kube_util.NewListerRegistryWithDefaultListers(kubeClient, listerRegistryStopChannel)
-	kubeEventRecorder := kube_util.CreateEventRecorder(eventsKubeClient)
+	kubeEventRecorder := kube_util.CreateEventRecorder(eventsKubeClient, opts.RecordDuplicatedEvents)
	logRecorder, err := utils.NewStatusMapRecorder(kubeClient, opts.ConfigNamespace, kubeEventRecorder, opts.WriteStatusConfigMap, opts.StatusConfigMapName)
	if err != nil {
		klog.Error("Failed to initialize status configmap, unable to write status events")


@@ -198,10 +198,10 @@ var (
		"maxNodeGroupBackoffDuration is the maximum backoff duration for a NodeGroup after new nodes failed to start.")
	nodeGroupBackoffResetTimeout = flag.Duration("node-group-backoff-reset-timeout", 3*time.Hour,
		"nodeGroupBackoffResetTimeout is the time after last failed scale-up when the backoff duration is reset.")
	maxScaleDownParallelismFlag = flag.Int("max-scale-down-parallelism", 10, "Maximum number of nodes (both empty and needing drain) that can be deleted in parallel.")
	maxDrainParallelismFlag = flag.Int("max-drain-parallelism", 1, "Maximum number of nodes needing drain, that can be drained and deleted in parallel.")
	gceExpanderEphemeralStorageSupport = flag.Bool("gce-expander-ephemeral-storage-support", false, "Whether scale-up takes ephemeral storage resources into account for GCE cloud provider")
+	recordDuplicatedEvents = flag.Bool("record-duplicated-events", false, "enable duplication of similar events within a 5 minute window.")
)
func createAutoscalingOptions() config.AutoscalingOptions {
@@ -288,6 +288,7 @@ func createAutoscalingOptions() config.AutoscalingOptions {
		MaxScaleDownParallelism: *maxScaleDownParallelismFlag,
		MaxDrainParallelism: *maxDrainParallelismFlag,
		GceExpanderEphemeralStorageSupport: *gceExpanderEphemeralStorageSupport,
+		RecordDuplicatedEvents: *recordDuplicatedEvents,
	}
}
@@ -475,7 +476,7 @@ func main() {
		kubeClient.CoordinationV1(),
		resourcelock.ResourceLockConfig{
			Identity: id,
-			EventRecorder: kube_util.CreateEventRecorder(kubeClient),
+			EventRecorder: kube_util.CreateEventRecorder(kubeClient, *recordDuplicatedEvents),
		},
	)
	if err != nil {


@@ -200,7 +200,7 @@ func TestCleanAllToBeDeleted(t *testing.T) {
	n2.Spec.Taints = []apiv1.Taint{{Key: ToBeDeletedTaint, Value: strconv.FormatInt(time.Now().Unix()-301, 10)}}
	fakeClient := buildFakeClient(t, n1, n2)
-	fakeRecorder := kube_util.CreateEventRecorder(fakeClient)
+	fakeRecorder := kube_util.CreateEventRecorder(fakeClient, false)
	assert.Equal(t, 1, len(getNode(t, fakeClient, "n2").Spec.Taints))
@@ -216,7 +216,7 @@ func TestCleanAllDeletionCandidates(t *testing.T) {
	n2.Spec.Taints = []apiv1.Taint{{Key: DeletionCandidateTaint, Value: strconv.FormatInt(time.Now().Unix()-301, 10)}}
	fakeClient := buildFakeClient(t, n1, n2)
-	fakeRecorder := kube_util.CreateEventRecorder(fakeClient)
+	fakeRecorder := kube_util.CreateEventRecorder(fakeClient, false)
	assert.Equal(t, 1, len(getNode(t, fakeClient, "n2").Spec.Taints))


@@ -39,8 +39,13 @@ const (
)
// CreateEventRecorder creates an event recorder to send custom events to Kubernetes to be recorded for targeted Kubernetes objects
-func CreateEventRecorder(kubeClient clientset.Interface) kube_record.EventRecorder {
-	eventBroadcaster := kube_record.NewBroadcasterWithCorrelatorOptions(getCorrelationOptions())
+func CreateEventRecorder(kubeClient clientset.Interface, recordDuplicatedEvents bool) kube_record.EventRecorder {
+	var eventBroadcaster kube_record.EventBroadcaster
+	if recordDuplicatedEvents {
+		eventBroadcaster = kube_record.NewBroadcaster()
+	} else {
+		eventBroadcaster = kube_record.NewBroadcasterWithCorrelatorOptions(getCorrelationOptions())
+	}
	if _, isfake := kubeClient.(*fake.Clientset); !isfake {
		actualSink := &v1core.EventSinkImpl{Interface: v1core.New(kubeClient.CoreV1().RESTClient()).Events("")}
		// EventBroadcaster has a StartLogging() method but the throttling options from getCorrelationOptions() get applied only to
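
For context, a minimal sketch of how a caller might use the recorder returned by the updated `CreateEventRecorder`. The import path and alias are assumed to match how the rest of the autoscaler imports this package (the import block is not part of this diff), and the event reason and message are made up for the example.

```go
package sketch

import (
	apiv1 "k8s.io/api/core/v1"
	clientset "k8s.io/client-go/kubernetes"

	kube_util "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes"
)

// emitScaleDownEvents records the same event twice. With recordDuplicatedEvents
// set to true both calls produce separate Event objects; with the default
// (false) the second occurrence is folded into the first event, whose count
// field is incremented instead.
func emitScaleDownEvents(kubeClient clientset.Interface, node *apiv1.Node, recordDuplicatedEvents bool) {
	recorder := kube_util.CreateEventRecorder(kubeClient, recordDuplicatedEvents)
	recorder.Eventf(node, apiv1.EventTypeNormal, "ScaleDown", "marking %s for removal", node.Name)
	recorder.Eventf(node, apiv1.EventTypeNormal, "ScaleDown", "marking %s for removal", node.Name)
}
```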