Merge pull request #5411 from alam0rt/fix-issue-where-terminating-instances-scale-down-asg-to-min
Track lifecycle and skip terminating already terminating instances
This commit is contained in:
commit
47af9cc434
|
|
@ -40,6 +40,7 @@ type asgCache struct {
|
||||||
asgToInstances map[AwsRef][]AwsInstanceRef
|
asgToInstances map[AwsRef][]AwsInstanceRef
|
||||||
instanceToAsg map[AwsInstanceRef]*asg
|
instanceToAsg map[AwsInstanceRef]*asg
|
||||||
instanceStatus map[AwsInstanceRef]*string
|
instanceStatus map[AwsInstanceRef]*string
|
||||||
|
instanceLifecycle map[AwsInstanceRef]*string
|
||||||
asgInstanceTypeCache *instanceTypeExpirationStore
|
asgInstanceTypeCache *instanceTypeExpirationStore
|
||||||
mutex sync.Mutex
|
mutex sync.Mutex
|
||||||
awsService *awsWrapper
|
awsService *awsWrapper
|
||||||
|
|
@ -83,6 +84,7 @@ func newASGCache(awsService *awsWrapper, explicitSpecs []string, autoDiscoverySp
|
||||||
asgToInstances: make(map[AwsRef][]AwsInstanceRef),
|
asgToInstances: make(map[AwsRef][]AwsInstanceRef),
|
||||||
instanceToAsg: make(map[AwsInstanceRef]*asg),
|
instanceToAsg: make(map[AwsInstanceRef]*asg),
|
||||||
instanceStatus: make(map[AwsInstanceRef]*string),
|
instanceStatus: make(map[AwsInstanceRef]*string),
|
||||||
|
instanceLifecycle: make(map[AwsInstanceRef]*string),
|
||||||
asgInstanceTypeCache: newAsgInstanceTypeCache(awsService),
|
asgInstanceTypeCache: newAsgInstanceTypeCache(awsService),
|
||||||
interrupt: make(chan struct{}),
|
interrupt: make(chan struct{}),
|
||||||
asgAutoDiscoverySpecs: autoDiscoverySpecs,
|
asgAutoDiscoverySpecs: autoDiscoverySpecs,
|
||||||
|
|
@ -239,6 +241,14 @@ func (m *asgCache) InstanceStatus(ref AwsInstanceRef) (*string, error) {
|
||||||
return nil, fmt.Errorf("could not find instance %v", ref)
|
return nil, fmt.Errorf("could not find instance %v", ref)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (m *asgCache) findInstanceLifecycle(ref AwsInstanceRef) (*string, error) {
|
||||||
|
if lifecycle, found := m.instanceLifecycle[ref]; found {
|
||||||
|
return lifecycle, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil, fmt.Errorf("could not find instance %v", ref)
|
||||||
|
}
|
||||||
|
|
||||||
func (m *asgCache) SetAsgSize(asg *asg, size int) error {
|
func (m *asgCache) SetAsgSize(asg *asg, size int) error {
|
||||||
m.mutex.Lock()
|
m.mutex.Lock()
|
||||||
defer m.mutex.Unlock()
|
defer m.mutex.Unlock()
|
||||||
|
|
@ -292,7 +302,6 @@ func (m *asgCache) DeleteInstances(instances []*AwsInstanceRef) error {
|
||||||
for i, instance := range instances {
|
for i, instance := range instances {
|
||||||
instanceIds[i] = instance.Name
|
instanceIds[i] = instance.Name
|
||||||
}
|
}
|
||||||
|
|
||||||
return fmt.Errorf("can't delete instances %s as they belong to at least two different ASGs (%s and %s)", strings.Join(instanceIds, ","), commonAsg.Name, asg.Name)
|
return fmt.Errorf("can't delete instances %s as they belong to at least two different ASGs (%s and %s)", strings.Join(instanceIds, ","), commonAsg.Name, asg.Name)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -305,6 +314,22 @@ func (m *asgCache) DeleteInstances(instances []*AwsInstanceRef) error {
|
||||||
"of deleting instance", instance.Name)
|
"of deleting instance", instance.Name)
|
||||||
m.decreaseAsgSizeByOneNoLock(commonAsg)
|
m.decreaseAsgSizeByOneNoLock(commonAsg)
|
||||||
} else {
|
} else {
|
||||||
|
// check if the instance is already terminating - if it is, don't bother terminating again
|
||||||
|
// as doing so causes unnecessary API calls and can cause the curSize cached value to decrement
|
||||||
|
// unnecessarily.
|
||||||
|
lifecycle, err := m.findInstanceLifecycle(*instance)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
if lifecycle != nil &&
|
||||||
|
*lifecycle == autoscaling.LifecycleStateTerminating ||
|
||||||
|
*lifecycle == autoscaling.LifecycleStateTerminatingWait ||
|
||||||
|
*lifecycle == autoscaling.LifecycleStateTerminatingProceed {
|
||||||
|
klog.V(2).Infof("instance %s is already terminating, will skip instead", instance.Name)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
params := &autoscaling.TerminateInstanceInAutoScalingGroupInput{
|
params := &autoscaling.TerminateInstanceInAutoScalingGroupInput{
|
||||||
InstanceId: aws.String(instance.Name),
|
InstanceId: aws.String(instance.Name),
|
||||||
ShouldDecrementDesiredCapacity: aws.Bool(true),
|
ShouldDecrementDesiredCapacity: aws.Bool(true),
|
||||||
|
|
@ -361,6 +386,7 @@ func (m *asgCache) regenerate() error {
|
||||||
newInstanceToAsgCache := make(map[AwsInstanceRef]*asg)
|
newInstanceToAsgCache := make(map[AwsInstanceRef]*asg)
|
||||||
newAsgToInstancesCache := make(map[AwsRef][]AwsInstanceRef)
|
newAsgToInstancesCache := make(map[AwsRef][]AwsInstanceRef)
|
||||||
newInstanceStatusMap := make(map[AwsInstanceRef]*string)
|
newInstanceStatusMap := make(map[AwsInstanceRef]*string)
|
||||||
|
newInstanceLifecycleMap := make(map[AwsInstanceRef]*string)
|
||||||
|
|
||||||
// Fetch details of all ASGs
|
// Fetch details of all ASGs
|
||||||
refreshNames := m.buildAsgNames()
|
refreshNames := m.buildAsgNames()
|
||||||
|
|
@ -402,6 +428,7 @@ func (m *asgCache) regenerate() error {
|
||||||
newInstanceToAsgCache[ref] = asg
|
newInstanceToAsgCache[ref] = asg
|
||||||
newAsgToInstancesCache[asg.AwsRef][i] = ref
|
newAsgToInstancesCache[asg.AwsRef][i] = ref
|
||||||
newInstanceStatusMap[ref] = instance.HealthStatus
|
newInstanceStatusMap[ref] = instance.HealthStatus
|
||||||
|
newInstanceLifecycleMap[ref] = instance.LifecycleState
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -431,6 +458,7 @@ func (m *asgCache) regenerate() error {
|
||||||
m.instanceToAsg = newInstanceToAsgCache
|
m.instanceToAsg = newInstanceToAsgCache
|
||||||
m.autoscalingOptions = newAutoscalingOptions
|
m.autoscalingOptions = newAutoscalingOptions
|
||||||
m.instanceStatus = newInstanceStatusMap
|
m.instanceStatus = newInstanceStatusMap
|
||||||
|
m.instanceLifecycle = newInstanceLifecycleMap
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -75,6 +75,7 @@ func testNamedDescribeAutoScalingGroupsOutput(groupName string, desiredCap int64
|
||||||
instances = append(instances, &autoscaling.Instance{
|
instances = append(instances, &autoscaling.Instance{
|
||||||
InstanceId: aws.String(id),
|
InstanceId: aws.String(id),
|
||||||
AvailabilityZone: aws.String("us-east-1a"),
|
AvailabilityZone: aws.String("us-east-1a"),
|
||||||
|
LifecycleState: aws.String(autoscaling.LifecycleStateInService),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
return &autoscaling.DescribeAutoScalingGroupsOutput{
|
return &autoscaling.DescribeAutoScalingGroupsOutput{
|
||||||
|
|
@ -91,6 +92,15 @@ func testNamedDescribeAutoScalingGroupsOutput(groupName string, desiredCap int64
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func testSetASGInstanceLifecycle(asg *autoscaling.DescribeAutoScalingGroupsOutput, lifecycleState string) *autoscaling.DescribeAutoScalingGroupsOutput {
|
||||||
|
for _, asg := range asg.AutoScalingGroups {
|
||||||
|
for _, instance := range asg.Instances {
|
||||||
|
instance.LifecycleState = aws.String(lifecycleState)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return asg
|
||||||
|
}
|
||||||
|
|
||||||
func testProvider(t *testing.T, m *AwsManager) *awsCloudProvider {
|
func testProvider(t *testing.T, m *AwsManager) *awsCloudProvider {
|
||||||
resourceLimiter := cloudprovider.NewResourceLimiter(
|
resourceLimiter := cloudprovider.NewResourceLimiter(
|
||||||
map[string]int64{cloudprovider.ResourceNameCores: 1, cloudprovider.ResourceNameMemory: 10000000},
|
map[string]int64{cloudprovider.ResourceNameCores: 1, cloudprovider.ResourceNameMemory: 10000000},
|
||||||
|
|
@ -440,6 +450,54 @@ func TestDeleteNodes(t *testing.T) {
|
||||||
assert.Equal(t, 1, newSize)
|
assert.Equal(t, 1, newSize)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestDeleteNodesTerminatingInstances(t *testing.T) {
|
||||||
|
a := &autoScalingMock{}
|
||||||
|
provider := testProvider(t, newTestAwsManagerWithAsgs(t, a, nil, []string{"1:5:test-asg"}))
|
||||||
|
asgs := provider.NodeGroups()
|
||||||
|
|
||||||
|
a.On("TerminateInstanceInAutoScalingGroup", &autoscaling.TerminateInstanceInAutoScalingGroupInput{
|
||||||
|
InstanceId: aws.String("test-instance-id"),
|
||||||
|
ShouldDecrementDesiredCapacity: aws.Bool(true),
|
||||||
|
}).Return(&autoscaling.TerminateInstanceInAutoScalingGroupOutput{
|
||||||
|
Activity: &autoscaling.Activity{Description: aws.String("Deleted instance")},
|
||||||
|
})
|
||||||
|
|
||||||
|
// Look up the current number of instances...
|
||||||
|
var expectedInstancesCount int64 = 2
|
||||||
|
a.On("DescribeAutoScalingGroupsPages",
|
||||||
|
&autoscaling.DescribeAutoScalingGroupsInput{
|
||||||
|
AutoScalingGroupNames: aws.StringSlice([]string{"test-asg"}),
|
||||||
|
MaxRecords: aws.Int64(maxRecordsReturnedByAPI),
|
||||||
|
},
|
||||||
|
mock.AnythingOfType("func(*autoscaling.DescribeAutoScalingGroupsOutput, bool) bool"),
|
||||||
|
).Run(func(args mock.Arguments) {
|
||||||
|
fn := args.Get(1).(func(*autoscaling.DescribeAutoScalingGroupsOutput, bool) bool)
|
||||||
|
fn(testSetASGInstanceLifecycle(testNamedDescribeAutoScalingGroupsOutput("test-asg", expectedInstancesCount, "test-instance-id", "second-test-instance-id"), autoscaling.LifecycleStateTerminatingWait), false)
|
||||||
|
// we expect the instance count to be 1 after the call to DeleteNodes
|
||||||
|
expectedInstancesCount = 1
|
||||||
|
}).Return(nil)
|
||||||
|
|
||||||
|
provider.Refresh()
|
||||||
|
|
||||||
|
initialSize, err := asgs[0].TargetSize()
|
||||||
|
assert.NoError(t, err)
|
||||||
|
assert.Equal(t, 2, initialSize)
|
||||||
|
|
||||||
|
node := &apiv1.Node{
|
||||||
|
Spec: apiv1.NodeSpec{
|
||||||
|
ProviderID: "aws:///us-east-1a/test-instance-id",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
err = asgs[0].DeleteNodes([]*apiv1.Node{node})
|
||||||
|
assert.NoError(t, err)
|
||||||
|
a.AssertNumberOfCalls(t, "TerminateInstanceInAutoScalingGroup", 0) // instances which are terminating don't need to be terminated again
|
||||||
|
a.AssertNumberOfCalls(t, "DescribeAutoScalingGroupsPages", 1)
|
||||||
|
|
||||||
|
newSize, err := asgs[0].TargetSize()
|
||||||
|
assert.NoError(t, err)
|
||||||
|
assert.Equal(t, 2, newSize)
|
||||||
|
}
|
||||||
|
|
||||||
func TestDeleteNodesWithPlaceholder(t *testing.T) {
|
func TestDeleteNodesWithPlaceholder(t *testing.T) {
|
||||||
a := &autoScalingMock{}
|
a := &autoScalingMock{}
|
||||||
provider := testProvider(t, newTestAwsManagerWithAsgs(t, a, nil, []string{"1:5:test-asg"}))
|
provider := testProvider(t, newTestAwsManagerWithAsgs(t, a, nil, []string{"1:5:test-asg"}))
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue