add a flag to allow event duplication

This change introduces a new command-line flag,
`--record-duplicated-events`, which allows a user to bypass the
5 minute de-duplication window and record duplicated events.
Michael McCune 2022-05-26 15:16:30 -04:00
parent 4a97d16399
commit 8c27f76933
6 changed files with 27 additions and 7 deletions


@@ -28,6 +28,7 @@ this document:
* [How to?](#how-to)
* [I'm running cluster with nodes in multiple zones for HA purposes. Is that supported by Cluster Autoscaler?](#im-running-cluster-with-nodes-in-multiple-zones-for-ha-purposes-is-that-supported-by-cluster-autoscaler)
* [How can I monitor Cluster Autoscaler?](#how-can-i-monitor-cluster-autoscaler)
+* [How can I see all events from Cluster Autoscaler?](#how-can-i-see-all-events-from-cluster-autoscaler)
* [How can I scale my cluster to just 1 node?](#how-can-i-scale-my-cluster-to-just-1-node)
* [How can I scale a node group to 0?](#how-can-i-scale-a-node-group-to-0)
* [How can I prevent Cluster Autoscaler from scaling down a particular node?](#how-can-i-prevent-cluster-autoscaler-from-scaling-down-a-particular-node)
@@ -267,6 +268,16 @@ respectively under `/metrics` and `/health-check`.
Metrics are provided in Prometheus format and their detailed description is
available [here](https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/proposals/metrics.md).
+### How can I see all events from Cluster Autoscaler?
+By default, the Cluster Autoscaler will deduplicate similar events that occur within a 5 minute
+window. This is done to improve scalability when many similar events are triggered in a
+short timespan, such as when there are too many unscheduled pods.
+In some cases, such as when debugging or when event volume is not a concern, you might
+want to see all the events coming from the Cluster Autoscaler. In these scenarios you should
+use the `--record-duplicated-events` command-line flag.
### How can I scale my cluster to just 1 node?
Prior to version 0.6, Cluster Autoscaler was not touching nodes that were running important
@@ -760,6 +771,7 @@ The following startup parameters are supported for cluster autoscaler:
| `daemonset-eviction-for-occupied-nodes` | Whether DaemonSet pods will be gracefully terminated from non-empty nodes | true
| `feature-gates` | A set of key=value pairs that describe feature gates for alpha/experimental features. | ""
| `cordon-node-before-terminating` | Should CA cordon nodes before terminating during downscale process | false
+| `record-duplicated-events` | Enable the autoscaler to record duplicated events within a 5 minute window. | false
# Troubleshooting:
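
For illustration, the sketch below shows what the new flag effectively toggles. It is a simplified, hypothetical example built on client-go's `record` package, not Cluster Autoscaler's actual code (that lives in `CreateEventRecorder`, shown in the last file of this commit), and the correlator option values are assumptions rather than the autoscaler's real settings.

```go
// Simplified sketch of the behaviour behind --record-duplicated-events.
// The option values below are illustrative only; Cluster Autoscaler takes
// its real settings from getCorrelationOptions(), which is not part of this diff.
package sketch

import (
	kube_record "k8s.io/client-go/tools/record"
)

func newEventBroadcaster(recordDuplicatedEvents bool) kube_record.EventBroadcaster {
	if recordDuplicatedEvents {
		// No correlation: every event, including duplicates, is forwarded as-is.
		return kube_record.NewBroadcaster()
	}
	// Default: similar events within the correlation window are folded into a
	// single event whose count is incremented instead of being re-emitted.
	return kube_record.NewBroadcasterWithCorrelatorOptions(kube_record.CorrelatorOptions{
		MaxEvents:            1,   // assumed value for illustration
		MaxIntervalInSeconds: 300, // assumed: matches the 5 minute window described above
	})
}
```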


@@ -183,4 +183,6 @@ type AutoscalingOptions struct {
	MaxDrainParallelism int
	// GceExpanderEphemeralStorageSupport is whether scale-up takes ephemeral storage resources into account.
	GceExpanderEphemeralStorageSupport bool
+	// RecordDuplicatedEvents controls whether events should be duplicated within a 5 minute window.
+	RecordDuplicatedEvents bool
}


@@ -115,7 +115,7 @@ func NewAutoscalingContext(
func NewAutoscalingKubeClients(opts config.AutoscalingOptions, kubeClient, eventsKubeClient kube_client.Interface) *AutoscalingKubeClients {
	listerRegistryStopChannel := make(chan struct{})
	listerRegistry := kube_util.NewListerRegistryWithDefaultListers(kubeClient, listerRegistryStopChannel)
-	kubeEventRecorder := kube_util.CreateEventRecorder(eventsKubeClient)
+	kubeEventRecorder := kube_util.CreateEventRecorder(eventsKubeClient, opts.RecordDuplicatedEvents)
	logRecorder, err := utils.NewStatusMapRecorder(kubeClient, opts.ConfigNamespace, kubeEventRecorder, opts.WriteStatusConfigMap, opts.StatusConfigMapName)
	if err != nil {
		klog.Error("Failed to initialize status configmap, unable to write status events")


@@ -198,10 +198,10 @@ var (
		"maxNodeGroupBackoffDuration is the maximum backoff duration for a NodeGroup after new nodes failed to start.")
	nodeGroupBackoffResetTimeout = flag.Duration("node-group-backoff-reset-timeout", 3*time.Hour,
		"nodeGroupBackoffResetTimeout is the time after last failed scale-up when the backoff duration is reset.")
	maxScaleDownParallelismFlag = flag.Int("max-scale-down-parallelism", 10, "Maximum number of nodes (both empty and needing drain) that can be deleted in parallel.")
	maxDrainParallelismFlag = flag.Int("max-drain-parallelism", 1, "Maximum number of nodes needing drain, that can be drained and deleted in parallel.")
	gceExpanderEphemeralStorageSupport = flag.Bool("gce-expander-ephemeral-storage-support", false, "Whether scale-up takes ephemeral storage resources into account for GCE cloud provider")
+	recordDuplicatedEvents = flag.Bool("record-duplicated-events", false, "enable duplication of similar events within a 5 minute window.")
)
func createAutoscalingOptions() config.AutoscalingOptions {
@@ -288,6 +288,7 @@ func createAutoscalingOptions() config.AutoscalingOptions {
		MaxScaleDownParallelism: *maxScaleDownParallelismFlag,
		MaxDrainParallelism: *maxDrainParallelismFlag,
		GceExpanderEphemeralStorageSupport: *gceExpanderEphemeralStorageSupport,
+		RecordDuplicatedEvents: *recordDuplicatedEvents,
	}
}
@@ -475,7 +476,7 @@ func main() {
		kubeClient.CoordinationV1(),
		resourcelock.ResourceLockConfig{
			Identity: id,
-			EventRecorder: kube_util.CreateEventRecorder(kubeClient),
+			EventRecorder: kube_util.CreateEventRecorder(kubeClient, *recordDuplicatedEvents),
		},
	)
	if err != nil {


@@ -200,7 +200,7 @@ func TestCleanAllToBeDeleted(t *testing.T) {
	n2.Spec.Taints = []apiv1.Taint{{Key: ToBeDeletedTaint, Value: strconv.FormatInt(time.Now().Unix()-301, 10)}}
	fakeClient := buildFakeClient(t, n1, n2)
-	fakeRecorder := kube_util.CreateEventRecorder(fakeClient)
+	fakeRecorder := kube_util.CreateEventRecorder(fakeClient, false)
	assert.Equal(t, 1, len(getNode(t, fakeClient, "n2").Spec.Taints))
@@ -216,7 +216,7 @@ func TestCleanAllDeletionCandidates(t *testing.T) {
	n2.Spec.Taints = []apiv1.Taint{{Key: DeletionCandidateTaint, Value: strconv.FormatInt(time.Now().Unix()-301, 10)}}
	fakeClient := buildFakeClient(t, n1, n2)
-	fakeRecorder := kube_util.CreateEventRecorder(fakeClient)
+	fakeRecorder := kube_util.CreateEventRecorder(fakeClient, false)
	assert.Equal(t, 1, len(getNode(t, fakeClient, "n2").Spec.Taints))


@@ -39,8 +39,13 @@ const (
)
// CreateEventRecorder creates an event recorder to send custom events to Kubernetes to be recorded for targeted Kubernetes objects
-func CreateEventRecorder(kubeClient clientset.Interface) kube_record.EventRecorder {
-	eventBroadcaster := kube_record.NewBroadcasterWithCorrelatorOptions(getCorrelationOptions())
+func CreateEventRecorder(kubeClient clientset.Interface, recordDuplicatedEvents bool) kube_record.EventRecorder {
+	var eventBroadcaster kube_record.EventBroadcaster
+	if recordDuplicatedEvents {
+		eventBroadcaster = kube_record.NewBroadcaster()
+	} else {
+		eventBroadcaster = kube_record.NewBroadcasterWithCorrelatorOptions(getCorrelationOptions())
+	}
	if _, isfake := kubeClient.(*fake.Clientset); !isfake {
		actualSink := &v1core.EventSinkImpl{Interface: v1core.New(kubeClient.CoreV1().RESTClient()).Events("")}
		// EventBroadcaster has a StartLogging() method but the throttling options from getCorrelationOptions() get applied only to
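
For context, a minimal sketch of how a caller might use the recorder returned by the updated `CreateEventRecorder`. The import path and alias are assumed to match how the rest of the autoscaler imports this package (the import block is not part of this diff), and the event reason and message are made up for the example.

```go
package sketch

import (
	apiv1 "k8s.io/api/core/v1"
	clientset "k8s.io/client-go/kubernetes"

	kube_util "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes"
)

// emitScaleDownEvents records the same event twice. With recordDuplicatedEvents
// set to true both calls produce separate Event objects; with the default
// (false) the second occurrence is folded into the first event, whose count
// field is incremented instead.
func emitScaleDownEvents(kubeClient clientset.Interface, node *apiv1.Node, recordDuplicatedEvents bool) {
	recorder := kube_util.CreateEventRecorder(kubeClient, recordDuplicatedEvents)
	recorder.Eventf(node, apiv1.EventTypeNormal, "ScaleDown", "marking %s for removal", node.Name)
	recorder.Eventf(node, apiv1.EventTypeNormal, "ScaleDown", "marking %s for removal", node.Name)
}
```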