Don't pile up successive full refreshes during AWS scaledowns

Forcing a full refresh on every DeleteNodes call causes slowdowns
and throttling on large clusters with many ASGs (and a lot of activity).

That function might be called several times in a row during scale-down
(once for each ASG having a node to be removed). Each time, the forced
refresh re-discovers all ASGs and all LaunchConfigurations, then re-lists all
instances from the discovered ASGs.

That immediate refresh isn't required anyway, as the cache's DeleteInstances
concrete implementation will decrement the nodegroup size, and we can
schedule a grouped refresh for the next loop iteration.
This commit is contained in:
Benjamin Pineau 2021-01-06 12:02:19 +01:00
parent 7761d70770
commit 037dc7367a
2 changed files with 5 additions and 4 deletions

View File

@ -469,7 +469,7 @@ func TestDeleteNodes(t *testing.T) {
err = asgs[0].DeleteNodes([]*apiv1.Node{node}) err = asgs[0].DeleteNodes([]*apiv1.Node{node})
assert.NoError(t, err) assert.NoError(t, err)
service.AssertNumberOfCalls(t, "TerminateInstanceInAutoScalingGroup", 1) service.AssertNumberOfCalls(t, "TerminateInstanceInAutoScalingGroup", 1)
service.AssertNumberOfCalls(t, "DescribeAutoScalingGroupsPages", 2) service.AssertNumberOfCalls(t, "DescribeAutoScalingGroupsPages", 1)
newSize, err := asgs[0].TargetSize() newSize, err := asgs[0].TargetSize()
assert.NoError(t, err) assert.NoError(t, err)
@ -516,7 +516,7 @@ func TestDeleteNodesWithPlaceholder(t *testing.T) {
err = asgs[0].DeleteNodes([]*apiv1.Node{node}) err = asgs[0].DeleteNodes([]*apiv1.Node{node})
assert.NoError(t, err) assert.NoError(t, err)
service.AssertNumberOfCalls(t, "SetDesiredCapacity", 1) service.AssertNumberOfCalls(t, "SetDesiredCapacity", 1)
service.AssertNumberOfCalls(t, "DescribeAutoScalingGroupsPages", 2) service.AssertNumberOfCalls(t, "DescribeAutoScalingGroupsPages", 1)
newSize, err := asgs[0].TargetSize() newSize, err := asgs[0].TargetSize()
assert.NoError(t, err) assert.NoError(t, err)

View File

@ -294,8 +294,9 @@ func (m *AwsManager) DeleteInstances(instances []*AwsInstanceRef) error {
if err := m.asgCache.DeleteInstances(instances); err != nil { if err := m.asgCache.DeleteInstances(instances); err != nil {
return err return err
} }
klog.V(2).Infof("Some ASG instances might have been deleted, forcing ASG list refresh") klog.V(2).Infof("DeleteInstances was called: scheduling an ASG list refresh for next main loop evaluation")
return m.forceRefresh() m.lastRefresh = time.Now().Add(-refreshInterval)
return nil
} }
// GetAsgNodes returns Asg nodes. // GetAsgNodes returns Asg nodes.