Merge pull request #8313 from johngmyers/surge

Option to surge during rolling update
Kubernetes Prow Robot 2020-03-04 10:21:47 -08:00 committed by GitHub
commit a5dabf58dc
33 changed files with 1067 additions and 104 deletions

View File

@@ -5,6 +5,7 @@ go_library(
srcs = [
"api.go",
"attach.go",
"ec2shim.go",
"group.go",
"launchconfigurations.go",
"tags.go",
@@ -16,6 +17,8 @@ go_library(
"//vendor/github.com/aws/aws-sdk-go/aws/request:go_default_library",
"//vendor/github.com/aws/aws-sdk-go/service/autoscaling:go_default_library",
"//vendor/github.com/aws/aws-sdk-go/service/autoscaling/autoscalingiface:go_default_library",
"//vendor/github.com/aws/aws-sdk-go/service/ec2:go_default_library",
"//vendor/github.com/aws/aws-sdk-go/service/ec2/ec2iface:go_default_library",
"//vendor/k8s.io/klog:go_default_library",
],
)

View File

@@ -0,0 +1,52 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package mockautoscaling
import (
"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/service/autoscaling"
"github.com/aws/aws-sdk-go/service/ec2"
"github.com/aws/aws-sdk-go/service/ec2/ec2iface"
)
type ec2Shim struct {
ec2iface.EC2API
mockAutoscaling *MockAutoscaling
}
func (m *MockAutoscaling) GetEC2Shim(e ec2iface.EC2API) ec2iface.EC2API {
return &ec2Shim{
EC2API: e,
mockAutoscaling: m,
}
}
func (e *ec2Shim) TerminateInstances(input *ec2.TerminateInstancesInput) (*ec2.TerminateInstancesOutput, error) {
if input.DryRun != nil && *input.DryRun {
return &ec2.TerminateInstancesOutput{}, nil
}
for _, id := range input.InstanceIds {
request := &autoscaling.TerminateInstanceInAutoScalingGroupInput{
InstanceId: id,
ShouldDecrementDesiredCapacity: aws.Bool(false),
}
if _, err := e.mockAutoscaling.TerminateInstanceInAutoScalingGroup(request); err != nil {
return nil, err
}
}
return &ec2.TerminateInstancesOutput{}, nil
}
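
For reference, getTestSetup in the rolling-update tests further down wires this shim in like so:

	mockAutoscaling := &mockautoscaling.MockAutoscaling{}
	mockcloud.MockAutoscaling = mockAutoscaling
	mockcloud.MockEC2 = mockAutoscaling.GetEC2Shim(mockcloud.MockEC2)

With that in place, EC2 TerminateInstances calls made by the rolling updater are routed to the mock autoscaling group, so terminations show up in the group's instance count.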

View File

@@ -2937,6 +2937,20 @@ spec:
description: RollingUpdate defines the default rolling-update settings
for instance groups
properties:
maxSurge:
anyOf:
- type: string
- type: integer
description: 'MaxSurge is the maximum number of extra nodes that
can be created during the update. The value can be an absolute
number (for example 5) or a percentage of desired machines (for
example 10%). The absolute number is calculated from a percentage
by rounding up. A value of 0 for both this and MaxUnavailable
disables rolling updates. Has no effect on instance groups with
role "Master". Defaults to 0. Example: when this is set to 30%,
the InstanceGroup can be scaled up immediately when the rolling
update starts, such that the total number of old and new nodes
do not exceed 130% of desired nodes.'
maxUnavailable:
anyOf:
- type: string
@@ -2945,12 +2959,13 @@ spec:
can be unavailable during the update. The value can be an absolute
number (for example 5) or a percentage of desired nodes (for example
10%). The absolute number is calculated from a percentage by rounding
down. A value of 0 for both this and MaxSurge disables rolling
updates. Defaults to 1 if MaxSurge is 0, otherwise defaults to
0. Example: when this is set to 30%, the InstanceGroup can be
scaled down to 70% of desired nodes immediately when the rolling
update starts. Once new nodes are ready, more old nodes can be
drained, ensuring that the total number of nodes available at
all times during the update is at least 70% of desired nodes.'
type: object
secretStore:
description: SecretStore is the VFS path to where secrets are stored

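A note on the rounding semantics above: maxSurge percentages round up while maxUnavailable percentages round down, via the same apimachinery helper kops uses elsewhere. A minimal standalone sketch (not part of this commit):

	package main

	import (
		"fmt"

		"k8s.io/apimachinery/pkg/util/intstr"
	)

	func main() {
		surge := intstr.FromString("31%")
		s, _ := intstr.GetValueFromIntOrPercent(&surge, 10, true) // round up
		fmt.Println(s) // 4 extra nodes allowed for 10 desired

		unavailable := intstr.FromString("31%")
		u, _ := intstr.GetValueFromIntOrPercent(&unavailable, 10, false) // round down
		fmt.Println(u) // 3 nodes may be unavailable
	}
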
View File

@@ -630,6 +630,20 @@ spec:
rollingUpdate:
description: RollingUpdate defines the rolling-update behavior
properties:
maxSurge:
anyOf:
- type: string
- type: integer
description: 'MaxSurge is the maximum number of extra nodes that
can be created during the update. The value can be an absolute
number (for example 5) or a percentage of desired machines (for
example 10%). The absolute number is calculated from a percentage
by rounding up. A value of 0 for both this and MaxUnavailable
disables rolling updates. Has no effect on instance groups with
role "Master". Defaults to 0. Example: when this is set to 30%,
the InstanceGroup can be scaled up immediately when the rolling
update starts, such that the total number of old and new nodes
do not exceed 130% of desired nodes.'
maxUnavailable:
anyOf:
- type: string
@@ -638,12 +652,13 @@ spec:
can be unavailable during the update. The value can be an absolute
number (for example 5) or a percentage of desired nodes (for example
10%). The absolute number is calculated from a percentage by rounding
down. A value of 0 for both this and MaxSurge disables rolling
updates. Defaults to 1 if MaxSurge is 0, otherwise defaults to
0. Example: when this is set to 30%, the InstanceGroup can be
scaled down to 70% of desired nodes immediately when the rolling
update starts. Once new nodes are ready, more old nodes can be
drained, ensuring that the total number of nodes available at
all times during the update is at least 70% of desired nodes.'
type: object
rootVolumeDeleteOnTermination:
description: 'RootVolumeDeleteOnTermination configures root volume retention

View File

@@ -684,8 +684,8 @@ type RollingUpdate struct {
// The value can be an absolute number (for example 5) or a percentage of desired
// nodes (for example 10%).
// The absolute number is calculated from a percentage by rounding down.
// A value of 0 for both this and MaxSurge disables rolling updates.
// Defaults to 1 if MaxSurge is 0, otherwise defaults to 0.
// Example: when this is set to 30%, the InstanceGroup can be scaled
// down to 70% of desired nodes immediately when the rolling update
// starts. Once new nodes are ready, more old nodes can be drained,
@@ -693,4 +693,18 @@ type RollingUpdate struct {
// during the update is at least 70% of desired nodes.
// +optional
MaxUnavailable *intstr.IntOrString `json:"maxUnavailable,omitempty"`
// MaxSurge is the maximum number of extra nodes that can be created
// during the update.
// The value can be an absolute number (for example 5) or a percentage of
// desired machines (for example 10%).
// The absolute number is calculated from a percentage by rounding up.
// A value of 0 for both this and MaxUnavailable disables rolling updates.
// Has no effect on instance groups with role "Master".
// Defaults to 0.
// Example: when this is set to 30%, the InstanceGroup can be scaled
// up immediately when the rolling update starts, such that the total
// number of old and new nodes do not exceed 130% of desired
// nodes.
// +optional
MaxSurge *intstr.IntOrString `json:"maxSurge,omitempty"`
}

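A usage sketch of the new field (illustrative values only, not from this commit; spec stands for a kops.ClusterSpec or kops.InstanceGroupSpec): surging one node while keeping availability at the full desired count.

	one := intstr.FromInt(1)
	zero := intstr.FromInt(0)
	spec.RollingUpdate = &kops.RollingUpdate{
		MaxSurge:       &one,  // create one replacement node before draining any
		MaxUnavailable: &zero, // never dip below the desired node count
	}
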
View File

@@ -566,8 +566,8 @@ type RollingUpdate struct {
// The value can be an absolute number (for example 5) or a percentage of desired
// nodes (for example 10%).
// The absolute number is calculated from a percentage by rounding down.
// A value of 0 for both this and MaxSurge disables rolling updates.
// Defaults to 1 if MaxSurge is 0, otherwise defaults to 0.
// Example: when this is set to 30%, the InstanceGroup can be scaled
// down to 70% of desired nodes immediately when the rolling update
// starts. Once new nodes are ready, more old nodes can be drained,
@@ -575,4 +575,18 @@ type RollingUpdate struct {
// during the update is at least 70% of desired nodes.
// +optional
MaxUnavailable *intstr.IntOrString `json:"maxUnavailable,omitempty"`
// MaxSurge is the maximum number of extra nodes that can be created
// during the update.
// The value can be an absolute number (for example 5) or a percentage of
// desired machines (for example 10%).
// The absolute number is calculated from a percentage by rounding up.
// A value of 0 for both this and MaxUnavailable disables rolling updates.
// Has no effect on instance groups with role "Master".
// Defaults to 0.
// Example: when this is set to 30%, the InstanceGroup can be scaled
// up immediately when the rolling update starts, such that the total
// number of old and new nodes do not exceed 130% of desired
// nodes.
// +optional
MaxSurge *intstr.IntOrString `json:"maxSurge,omitempty"`
}

View File

@@ -4720,6 +4720,7 @@ func Convert_kops_RBACAuthorizationSpec_To_v1alpha1_RBACAuthorizationSpec(in *ko
func autoConvert_v1alpha1_RollingUpdate_To_kops_RollingUpdate(in *RollingUpdate, out *kops.RollingUpdate, s conversion.Scope) error {
out.MaxUnavailable = in.MaxUnavailable
out.MaxSurge = in.MaxSurge
return nil
}
@@ -4730,6 +4731,7 @@ func Convert_v1alpha1_RollingUpdate_To_kops_RollingUpdate(in *RollingUpdate, out
func autoConvert_kops_RollingUpdate_To_v1alpha1_RollingUpdate(in *kops.RollingUpdate, out *RollingUpdate, s conversion.Scope) error {
out.MaxUnavailable = in.MaxUnavailable
out.MaxSurge = in.MaxSurge
return nil
}

View File

@@ -3303,6 +3303,11 @@ func (in *RollingUpdate) DeepCopyInto(out *RollingUpdate) {
*out = new(intstr.IntOrString)
**out = **in
}
if in.MaxSurge != nil {
in, out := &in.MaxSurge, &out.MaxSurge
*out = new(intstr.IntOrString)
**out = **in
}
return
}

View File

@@ -579,8 +579,8 @@ type RollingUpdate struct {
// The value can be an absolute number (for example 5) or a percentage of desired
// nodes (for example 10%).
// The absolute number is calculated from a percentage by rounding down.
// A value of 0 for both this and MaxSurge disables rolling updates.
// Defaults to 1 if MaxSurge is 0, otherwise defaults to 0.
// Example: when this is set to 30%, the InstanceGroup can be scaled
// down to 70% of desired nodes immediately when the rolling update
// starts. Once new nodes are ready, more old nodes can be drained,
@@ -588,4 +588,18 @@ type RollingUpdate struct {
// during the update is at least 70% of desired nodes.
// +optional
MaxUnavailable *intstr.IntOrString `json:"maxUnavailable,omitempty"`
// MaxSurge is the maximum number of extra nodes that can be created
// during the update.
// The value can be an absolute number (for example 5) or a percentage of
// desired machines (for example 10%).
// The absolute number is calculated from a percentage by rounding up.
// A value of 0 for both this and MaxUnavailable disables rolling updates.
// Has no effect on instance groups with role "Master".
// Defaults to 0.
// Example: when this is set to 30%, the InstanceGroup can be scaled
// up immediately when the rolling update starts, such that the total
// number of old and new nodes do not exceed 130% of desired
// nodes.
// +optional
MaxSurge *intstr.IntOrString `json:"maxSurge,omitempty"`
}

View File

@@ -4990,6 +4990,7 @@ func Convert_kops_RBACAuthorizationSpec_To_v1alpha2_RBACAuthorizationSpec(in *ko
func autoConvert_v1alpha2_RollingUpdate_To_kops_RollingUpdate(in *RollingUpdate, out *kops.RollingUpdate, s conversion.Scope) error {
out.MaxUnavailable = in.MaxUnavailable
out.MaxSurge = in.MaxSurge
return nil
}
@@ -5000,6 +5001,7 @@ func Convert_v1alpha2_RollingUpdate_To_kops_RollingUpdate(in *RollingUpdate, out
func autoConvert_kops_RollingUpdate_To_v1alpha2_RollingUpdate(in *kops.RollingUpdate, out *RollingUpdate, s conversion.Scope) error {
out.MaxUnavailable = in.MaxUnavailable
out.MaxSurge = in.MaxSurge
return nil
}

View File

@@ -3374,6 +3374,11 @@ func (in *RollingUpdate) DeepCopyInto(out *RollingUpdate) {
*out = new(intstr.IntOrString)
**out = **in
}
if in.MaxSurge != nil {
in, out := &in.MaxSurge, &out.MaxSurge
*out = new(intstr.IntOrString)
**out = **in
}
return
}

View File

@@ -118,7 +118,7 @@ func ValidateInstanceGroup(g *kops.InstanceGroup) field.ErrorList {
allErrs = append(allErrs, validateInstanceProfile(g.Spec.IAM, field.NewPath("spec", "iam"))...)
if g.Spec.RollingUpdate != nil {
allErrs = append(allErrs, validateRollingUpdate(g.Spec.RollingUpdate, field.NewPath("spec", "rollingUpdate"), g.Spec.Role == kops.InstanceGroupRoleMaster)...)
}
return allErrs

View File

@@ -123,7 +123,7 @@ func validateClusterSpec(spec *kops.ClusterSpec, fieldPath *field.Path) field.Er
}
if spec.RollingUpdate != nil {
allErrs = append(allErrs, validateRollingUpdate(spec.RollingUpdate, fieldPath.Child("rollingUpdate"), false)...)
}
return allErrs
@@ -597,7 +597,7 @@ func validateContainerRuntime(runtime *string, fldPath *field.Path) field.ErrorL
return allErrs
}
func validateRollingUpdate(rollingUpdate *kops.RollingUpdate, fldpath *field.Path, onMasterInstanceGroup bool) field.ErrorList {
allErrs := field.ErrorList{}
if rollingUpdate.MaxUnavailable != nil {
unavailable, err := intstr.GetValueFromIntOrPercent(rollingUpdate.MaxUnavailable, 1, false)
@@ -609,6 +609,18 @@ func validateRollingUpdate(rollingUpdate *kops.RollingUpdate, fldpath *field.Pat
allErrs = append(allErrs, field.Invalid(fldpath.Child("maxUnavailable"), rollingUpdate.MaxUnavailable, "Cannot be negative"))
}
}
if rollingUpdate.MaxSurge != nil {
surge, err := intstr.GetValueFromIntOrPercent(rollingUpdate.MaxSurge, 1000, true)
if err != nil {
allErrs = append(allErrs, field.Invalid(fldpath.Child("maxSurge"), rollingUpdate.MaxSurge,
fmt.Sprintf("Unable to parse: %v", err)))
}
if onMasterInstanceGroup && surge != 0 {
allErrs = append(allErrs, field.Forbidden(fldpath.Child("maxSurge"), "Cannot surge instance groups with role \"Master\""))
} else if surge < 0 {
allErrs = append(allErrs, field.Invalid(fldpath.Child("maxSurge"), rollingUpdate.MaxSurge, "Cannot be negative"))
}
}
return allErrs
}

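Presumably the surge check resolves percentages against a total of 1000 (where the maxUnavailable check uses 1) so that small negative percentages still produce a negative value and trip the negativity check; with roundUp and a total of 1, "-1%" would round to 0 and slip through. A standalone sketch of that reading (my interpretation, not part of the commit):

	surge := intstr.FromString("-1%")
	v, err := intstr.GetValueFromIntOrPercent(&surge, 1000, true)
	fmt.Println(v, err) // -10 <nil>, so the "Cannot be negative" branch fires
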
View File

@@ -405,6 +405,7 @@ func Test_Validate_Calico(t *testing.T) {
func Test_Validate_RollingUpdate(t *testing.T) {
grid := []struct {
Input kops.RollingUpdate
OnMasterIG bool
ExpectedErrors []string
}{
{
@@ -438,9 +439,94 @@ func Test_Validate_RollingUpdate(t *testing.T) {
},
ExpectedErrors: []string{"Invalid value::testField.maxUnavailable"},
},
{
Input: kops.RollingUpdate{
MaxSurge: intStr(intstr.FromInt(0)),
},
},
{
Input: kops.RollingUpdate{
MaxSurge: intStr(intstr.FromString("0%")),
},
},
{
Input: kops.RollingUpdate{
MaxSurge: intStr(intstr.FromInt(1)),
},
},
{
Input: kops.RollingUpdate{
MaxSurge: intStr(intstr.FromString("1%")),
},
},
{
Input: kops.RollingUpdate{
MaxSurge: intStr(intstr.FromString("nope")),
},
ExpectedErrors: []string{"Invalid value::testField.maxSurge"},
},
{
Input: kops.RollingUpdate{
MaxSurge: intStr(intstr.FromInt(-1)),
},
ExpectedErrors: []string{"Invalid value::testField.maxSurge"},
},
{
Input: kops.RollingUpdate{
MaxSurge: intStr(intstr.FromString("-1%")),
},
ExpectedErrors: []string{"Invalid value::testField.maxSurge"},
},
{
Input: kops.RollingUpdate{
MaxSurge: intStr(intstr.FromInt(0)),
},
OnMasterIG: true,
},
{
Input: kops.RollingUpdate{
MaxSurge: intStr(intstr.FromString("0%")),
},
OnMasterIG: true,
},
{
Input: kops.RollingUpdate{
MaxSurge: intStr(intstr.FromInt(1)),
},
OnMasterIG: true,
ExpectedErrors: []string{"Forbidden::testField.maxSurge"},
},
{
Input: kops.RollingUpdate{
MaxSurge: intStr(intstr.FromString("1%")),
},
OnMasterIG: true,
ExpectedErrors: []string{"Forbidden::testField.maxSurge"},
},
{
Input: kops.RollingUpdate{
MaxSurge: intStr(intstr.FromString("nope")),
},
OnMasterIG: true,
ExpectedErrors: []string{"Invalid value::testField.maxSurge"},
},
{
Input: kops.RollingUpdate{
MaxSurge: intStr(intstr.FromInt(-1)),
},
OnMasterIG: true,
ExpectedErrors: []string{"Forbidden::testField.maxSurge"},
},
{
Input: kops.RollingUpdate{
MaxSurge: intStr(intstr.FromString("-1%")),
},
OnMasterIG: true,
ExpectedErrors: []string{"Forbidden::testField.maxSurge"},
},
}
for _, g := range grid {
errs := validateRollingUpdate(&g.Input, field.NewPath("testField"), g.OnMasterIG)
testErrors(t, g.Input, errs, g.ExpectedErrors)
}
}

View File

@@ -3588,6 +3588,11 @@ func (in *RollingUpdate) DeepCopyInto(out *RollingUpdate) {
*out = new(intstr.IntOrString)
**out = **in
}
if in.MaxSurge != nil {
in, out := &in.MaxSurge, &out.MaxSurge
*out = new(intstr.IntOrString)
**out = **in
}
return
}

View File

@@ -47,6 +47,8 @@ type CloudInstanceGroupMember struct {
Node *v1.Node
// CloudInstanceGroup is the managing CloudInstanceGroup
CloudInstanceGroup *CloudInstanceGroup
// Detached is whether fi.Cloud.DetachInstance has been successfully called on the instance.
Detached bool
}
// NewCloudInstanceGroupMember creates a new CloudInstanceGroupMember
@@ -74,6 +76,28 @@ func (c *CloudInstanceGroup) NewCloudInstanceGroupMember(instanceId string, newG
return nil
}
// NewDetachedCloudInstanceGroupMember creates a new CloudInstanceGroupMember for a detached instance
func (c *CloudInstanceGroup) NewDetachedCloudInstanceGroupMember(instanceId string, nodeMap map[string]*v1.Node) error {
if instanceId == "" {
return fmt.Errorf("instance id for cloud instance member cannot be empty")
}
cm := &CloudInstanceGroupMember{
ID: instanceId,
CloudInstanceGroup: c,
Detached: true,
}
node := nodeMap[instanceId]
if node != nil {
cm.Node = node
} else {
klog.V(8).Infof("unable to find node for instance: %s", instanceId)
}
c.NeedUpdate = append(c.NeedUpdate, cm)
return nil
}
// Status returns a human-readable Status indicating whether an update is needed
func (c *CloudInstanceGroup) Status() string {
if len(c.NeedUpdate) == 0 {

View File

@@ -45,6 +45,8 @@ go_test(
"//vendor/github.com/aws/aws-sdk-go/aws:go_default_library",
"//vendor/github.com/aws/aws-sdk-go/service/autoscaling:go_default_library",
"//vendor/github.com/aws/aws-sdk-go/service/autoscaling/autoscalingiface:go_default_library",
"//vendor/github.com/aws/aws-sdk-go/service/ec2:go_default_library",
"//vendor/github.com/aws/aws-sdk-go/service/ec2/ec2iface:go_default_library",
"//vendor/github.com/stretchr/testify/assert:go_default_library",
"//vendor/k8s.io/api/core/v1:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",

View File

@@ -101,10 +101,7 @@ func promptInteractive(upgradedHostId, upgradedHostName string) (stopPrompting b
return stopPrompting, err
}
// RollingUpdate performs a rolling update on a list of instances.
func (r *RollingUpdateInstanceGroup) RollingUpdate(rollingUpdateData *RollingUpdateCluster, cluster *api.Cluster, isBastion bool, sleepAfterTerminate time.Duration, validationTimeout time.Duration) (err error) {
// we should not get here, but hey I am going to check.
@@ -152,17 +149,61 @@ func (r *RollingUpdateInstanceGroup) RollingUpdate(rollingUpdateData *RollingUpd
settings := resolveSettings(cluster, r.CloudGroup.InstanceGroup, numInstances)
runningDrains := 0
maxSurge := settings.MaxSurge.IntValue()
if maxSurge > len(update) {
maxSurge = len(update)
}
maxConcurrency := maxSurge + settings.MaxUnavailable.IntValue()
if maxConcurrency == 0 {
klog.Infof("Rolling updates for InstanceGroup %s are disabled", r.CloudGroup.InstanceGroup.Name)
return nil
}
if r.CloudGroup.InstanceGroup.Spec.Role == api.InstanceGroupRoleMaster && maxSurge != 0 {
// Masters are incapable of surging because they rely on registering themselves through
// the local apiserver. That apiserver depends on the local etcd, which relies on being
// joined to the etcd cluster.
maxSurge = 0
maxConcurrency = settings.MaxUnavailable.IntValue()
if maxConcurrency == 0 {
maxConcurrency = 1
}
}
if rollingUpdateData.Interactive {
if maxSurge > 1 {
maxSurge = 1
}
maxConcurrency = 1
}
update = prioritizeUpdate(update)
if maxSurge > 0 && !rollingUpdateData.CloudOnly {
for numSurge := 1; numSurge <= maxSurge; numSurge++ {
u := update[len(update)-numSurge]
if !u.Detached {
if err := r.detachInstance(u); err != nil {
return err
}
// If noneReady, wait until after one node is detached and its replacement validates
// before detaching more in case the current spec does not result in usable nodes.
if numSurge == maxSurge || noneReady {
// Wait for the minimum interval
klog.Infof("waiting for %v after detaching instance", sleepAfterTerminate)
time.Sleep(sleepAfterTerminate)
if err := r.maybeValidate(rollingUpdateData, validationTimeout, "detaching"); err != nil {
return err
}
noneReady = false
}
}
}
}
terminateChan := make(chan error, maxConcurrency)
for uIdx, u := range update {
@@ -183,7 +224,7 @@ func (r *RollingUpdateInstanceGroup) RollingUpdate(rollingUpdateData *RollingUpd
return waitForPendingBeforeReturningError(runningDrains, terminateChan, err)
}
err = r.maybeValidate(rollingUpdateData, validationTimeout, "removing")
if err != nil {
return waitForPendingBeforeReturningError(runningDrains, terminateChan, err)
}
@@ -229,7 +270,7 @@ func (r *RollingUpdateInstanceGroup) RollingUpdate(rollingUpdateData *RollingUpd
}
}
err = r.maybeValidate(rollingUpdateData, validationTimeout, "removing")
if err != nil {
return err
}
@@ -238,6 +279,25 @@ func (r *RollingUpdateInstanceGroup) RollingUpdate(rollingUpdateData *RollingUpd
return nil
}
func prioritizeUpdate(update []*cloudinstances.CloudInstanceGroupMember) []*cloudinstances.CloudInstanceGroupMember {
// The priorities are, in order:
// attached before detached
// TODO unhealthy before healthy
// NeedUpdate before Ready (preserve original order)
result := make([]*cloudinstances.CloudInstanceGroupMember, 0, len(update))
var detached []*cloudinstances.CloudInstanceGroupMember
for _, u := range update {
if u.Detached {
detached = append(detached, u)
} else {
result = append(result, u)
}
}
result = append(result, detached...)
return result
}
func waitForPendingBeforeReturningError(runningDrains int, terminateChan chan error, err error) error {
for runningDrains > 0 {
<-terminateChan
@@ -359,7 +419,7 @@ func (r *RollingUpdateInstanceGroup) drainTerminateAndWait(u *cloudinstances.Clo
return nil
}
func (r *RollingUpdateInstanceGroup) maybeValidate(rollingUpdateData *RollingUpdateCluster, validationTimeout time.Duration, operation string) error {
if rollingUpdateData.CloudOnly {
klog.Warningf("Not validating cluster as cloudonly flag is set.")
@@ -370,10 +430,10 @@ func (r *RollingUpdateInstanceGroup) maybeValidate(rollingUpdateData *RollingUpd
if rollingUpdateData.FailOnValidate {
klog.Errorf("Cluster did not validate within %s", validationTimeout)
return fmt.Errorf("error validating cluster after %s a node: %v", operation, err)
}
klog.Warningf("Cluster validation failed after %s instance, proceeding since fail-on-validate is set to false: %v", operation, err)
}
}
return nil
@@ -450,6 +510,30 @@ func (r *RollingUpdateInstanceGroup) validateCluster(rollingUpdateData *RollingU
}
// detachInstance detaches a Cloud Instance
func (r *RollingUpdateInstanceGroup) detachInstance(u *cloudinstances.CloudInstanceGroupMember) error {
id := u.ID
nodeName := ""
if u.Node != nil {
nodeName = u.Node.Name
}
if nodeName != "" {
klog.Infof("Detaching instance %q, node %q, in group %q.", id, nodeName, r.CloudGroup.HumanName)
} else {
klog.Infof("Detaching instance %q, in group %q.", id, r.CloudGroup.HumanName)
}
if err := r.Cloud.DetachInstance(u); err != nil {
if nodeName != "" {
return fmt.Errorf("error detaching instance %q, node %q: %v", id, nodeName, err)
} else {
return fmt.Errorf("error detaching instance %q: %v", id, err)
}
}
return nil
}
// DeleteInstance deletes a Cloud Instance.
func (r *RollingUpdateInstanceGroup) DeleteInstance(u *cloudinstances.CloudInstanceGroupMember) error {
id := u.ID

View File

@@ -26,6 +26,8 @@ import (
"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/service/autoscaling"
"github.com/aws/aws-sdk-go/service/autoscaling/autoscalingiface"
"github.com/aws/aws-sdk-go/service/ec2"
"github.com/aws/aws-sdk-go/service/ec2/ec2iface"
"github.com/stretchr/testify/assert"
v1 "k8s.io/api/core/v1"
v1meta "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -49,7 +51,9 @@ func getTestSetup() (*RollingUpdateCluster, *awsup.MockAWSCloud, *kopsapi.Cluste
k8sClient := fake.NewSimpleClientset()
mockcloud := awsup.BuildMockAWSCloud("us-east-1", "abc")
mockAutoscaling := &mockautoscaling.MockAutoscaling{}
mockcloud.MockAutoscaling = mockAutoscaling
mockcloud.MockEC2 = mockAutoscaling.GetEC2Shim(mockcloud.MockEC2)
cluster := &kopsapi.Cluster{}
cluster.Name = "test.k8s.local"
@@ -109,6 +113,7 @@ func makeGroup(groups map[string]*cloudinstances.CloudInstanceGroup, k8sClient k
fakeClient := k8sClient.(*fake.Clientset)
groups[name] = &cloudinstances.CloudInstanceGroup{
HumanName: name,
InstanceGroup: &kopsapi.InstanceGroup{
ObjectMeta: v1meta.ObjectMeta{
Name: name,
@@ -117,6 +122,7 @@ func makeGroup(groups map[string]*cloudinstances.CloudInstanceGroup, k8sClient k
Role: role,
},
},
Raw: &autoscaling.Group{AutoScalingGroupName: aws.String("asg-" + name)},
}
cloud.Autoscaling().CreateAutoScalingGroup(&autoscaling.CreateAutoScalingGroupInput{
AutoScalingGroupName: aws.String(name),
@@ -136,8 +142,9 @@ func makeGroup(groups map[string]*cloudinstances.CloudInstanceGroup, k8sClient k
_ = fakeClient.Tracker().Add(node)
}
member := cloudinstances.CloudInstanceGroupMember{
ID: id,
Node: node,
CloudInstanceGroup: groups[name],
}
if i < needUpdate {
groups[name].NeedUpdate = append(groups[name].NeedUpdate, &member)
@@ -605,6 +612,52 @@ func TestRollingUpdateTaintAllButOneNeedUpdate(t *testing.T) {
assertGroupInstanceCount(t, cloud, "node-1", 1)
}
func TestRollingUpdateMaxSurgeIgnoredForMaster(t *testing.T) {
c, cloud, cluster := getTestSetup()
two := intstr.FromInt(2)
cluster.Spec.RollingUpdate = &kopsapi.RollingUpdate{
MaxSurge: &two,
}
groups := make(map[string]*cloudinstances.CloudInstanceGroup)
makeGroup(groups, c.K8sClient, cloud, "master-1", kopsapi.InstanceGroupRoleMaster, 3, 2)
err := c.RollingUpdate(groups, cluster, &kopsapi.InstanceGroupList{})
assert.NoError(t, err, "rolling update")
cordoned := ""
tainted := map[string]bool{}
deleted := map[string]bool{}
for _, action := range c.K8sClient.(*fake.Clientset).Actions() {
switch a := action.(type) {
case testingclient.PatchAction:
if string(a.GetPatch()) == cordonPatch {
assertCordon(t, a)
assert.Equal(t, "", cordoned, "at most one node cordoned at a time")
assert.True(t, tainted[a.GetName()], "node", a.GetName(), "tainted")
cordoned = a.GetName()
} else {
assertTaint(t, a)
assert.Equal(t, "", cordoned, "not tainting while node cordoned")
assert.False(t, tainted[a.GetName()], "node", a.GetName(), "already tainted")
tainted[a.GetName()] = true
}
case testingclient.DeleteAction:
assert.Equal(t, "nodes", a.GetResource().Resource)
assert.Equal(t, cordoned, a.GetName(), "node was cordoned before delete")
assert.False(t, deleted[a.GetName()], "node", a.GetName(), "already deleted")
deleted[a.GetName()] = true
cordoned = ""
case testingclient.ListAction:
// Don't care
default:
t.Errorf("unexpected action %v", a)
}
}
assertGroupInstanceCount(t, cloud, "master-1", 1)
}
func TestRollingUpdateDisabled(t *testing.T) {
c, cloud, cluster := getTestSetup()
@@ -644,12 +697,26 @@ func TestRollingUpdateDisabledCloudonly(t *testing.T) {
// The concurrent update tests attempt to induce the following expected update sequence:
//
// (Only for surging "all need update" test, to verify the toe-dipping behavior)
// Request validate (8) -->
// <-- validated
// Detach instance -->
// Request validate (7) -->
// <-- validated
// Detach instance -->
// (end only for surging "all need update" tests)
// (Only for surging "all but one need update" test)
// Request validate (7) -->
// <-- validated
// Detach instance -->
// Detach instance -->
// (end only for surging "all but one need update" test)
// (Only for non-surging "all need update" tests, to verify the toe-dipping behavior)
// Request validate (7) -->
// <-- validated
// Request terminate 1 node (7) -->
// <-- 1 node terminated, 6 left
// (end only for non-surging "all need update" tests)
// Request validate (6) -->
// <-- validated
// Request terminate 2 nodes (6,5) -->
@@ -672,19 +739,27 @@ func TestRollingUpdateDisabledCloudonly(t *testing.T) {
// <-- validated
type concurrentTest struct {
ec2iface.EC2API
t *testing.T
mutex sync.Mutex
surge int
terminationRequestsLeft int
previousValidation int
validationChan chan bool
terminationChan chan bool
detached map[string]bool
}
func (c *concurrentTest) Validate() (*validation.ValidationCluster, error) {
c.mutex.Lock()
defer c.mutex.Unlock()
if len(c.detached) < c.surge {
assert.Greater(c.t, c.previousValidation, 7, "previous validation")
c.previousValidation--
return &validation.ValidationCluster{}, nil
}
terminationRequestsLeft := c.terminationRequestsLeft
switch terminationRequestsLeft {
case 7, 6, 0:
@@ -727,29 +802,40 @@ func (c *concurrentTest) Validate() (*validation.ValidationCluster, error) {
return &validation.ValidationCluster{}, nil
}
func (c *concurrentTest) TerminateInstances(input *ec2.TerminateInstancesInput) (*ec2.TerminateInstancesOutput, error) {
if input.DryRun != nil && *input.DryRun {
return &ec2.TerminateInstancesOutput{}, nil
}
c.mutex.Lock()
defer c.mutex.Unlock()
for _, id := range input.InstanceIds {
assert.Equal(c.t, c.surge, len(c.detached), "Number of detached instances")
if c.detached[*id] {
assert.LessOrEqual(c.t, c.terminationRequestsLeft, c.surge, "Deleting detached instances last")
}
terminationRequestsLeft := c.terminationRequestsLeft
c.terminationRequestsLeft--
switch terminationRequestsLeft {
case 7, 2, 1:
assert.Equal(c.t, terminationRequestsLeft, c.previousValidation, "previous validation")
case 6, 4:
assert.Equal(c.t, terminationRequestsLeft, c.previousValidation, "previous validation")
c.mutex.Unlock()
select {
case <-c.terminationChan:
case <-time.After(1 * time.Second):
c.t.Error("timed out reading from terminationChan")
}
c.mutex.Lock()
go c.delayThenWakeValidation()
case 5, 3:
assert.Equal(c.t, terminationRequestsLeft+1, c.previousValidation, "previous validation")
}
}
return c.EC2API.TerminateInstances(input)
}
func (c *concurrentTest) delayThenWakeValidation() {
@@ -767,28 +853,36 @@ func (c *concurrentTest) AssertComplete() {
assert.Equal(c.t, 0, c.previousValidation, "last validation")
}
func newConcurrentTest(t *testing.T, cloud *awsup.MockAWSCloud, numSurge int, allNeedUpdate bool) *concurrentTest {
test := concurrentTest{
EC2API: cloud.MockEC2,
t: t,
surge: numSurge,
terminationRequestsLeft: 6,
validationChan: make(chan bool),
terminationChan: make(chan bool),
detached: map[string]bool{},
}
if numSurge == 0 && allNeedUpdate {
test.terminationRequestsLeft = 7
}
if numSurge == 0 {
test.previousValidation = test.terminationRequestsLeft + 1
} else if allNeedUpdate {
test.previousValidation = 9
} else {
test.previousValidation = 8
}
return &test
}
func TestRollingUpdateMaxUnavailableAllNeedUpdate(t *testing.T) {
c, cloud, cluster := getTestSetup()
concurrentTest := newConcurrentTest(t, cloud, 0, true)
c.ValidateSuccessDuration = 0
c.ClusterValidator = concurrentTest
cloud.MockEC2 = concurrentTest
two := intstr.FromInt(2)
cluster.Spec.RollingUpdate = &kopsapi.RollingUpdate{
@@ -808,10 +902,10 @@ func TestRollingUpdateMaxUnavailableAllNeedUpdate(t *testing.T) {
func TestRollingUpdateMaxUnavailableAllButOneNeedUpdate(t *testing.T) {
c, cloud, cluster := getTestSetup()
concurrentTest := newConcurrentTest(t, cloud, 0, false)
c.ValidateSuccessDuration = 0
c.ClusterValidator = concurrentTest
cloud.MockEC2 = concurrentTest
two := intstr.FromInt(2)
cluster.Spec.RollingUpdate = &kopsapi.RollingUpdate{
@@ -830,10 +924,10 @@ func TestRollingUpdateMaxUnavailableAllButOneNeedUpdate(t *testing.T) {
func TestRollingUpdateMaxUnavailableAllNeedUpdateMaster(t *testing.T) {
c, cloud, cluster := getTestSetup()
concurrentTest := newConcurrentTest(t, cloud, 0, true)
c.ValidateSuccessDuration = 0
c.ClusterValidator = concurrentTest
cloud.MockEC2 = concurrentTest
two := intstr.FromInt(2)
cluster.Spec.RollingUpdate = &kopsapi.RollingUpdate{
@@ -850,6 +944,264 @@ func TestRollingUpdateMaxUnavailableAllNeedUpdateMaster(t *testing.T) {
concurrentTest.AssertComplete()
}
type concurrentTestAutoscaling struct {
autoscalingiface.AutoScalingAPI
ConcurrentTest *concurrentTest
}
func (m *concurrentTestAutoscaling) DetachInstances(input *autoscaling.DetachInstancesInput) (*autoscaling.DetachInstancesOutput, error) {
m.ConcurrentTest.mutex.Lock()
defer m.ConcurrentTest.mutex.Unlock()
assert.Equal(m.ConcurrentTest.t, "node-1", *input.AutoScalingGroupName)
assert.False(m.ConcurrentTest.t, *input.ShouldDecrementDesiredCapacity)
for _, id := range input.InstanceIds {
assert.Less(m.ConcurrentTest.t, len(m.ConcurrentTest.detached), m.ConcurrentTest.surge, "Number of detached instances")
assert.False(m.ConcurrentTest.t, m.ConcurrentTest.detached[*id], *id+" already detached")
m.ConcurrentTest.detached[*id] = true
}
return &autoscaling.DetachInstancesOutput{}, nil
}
type ec2IgnoreTags struct {
ec2iface.EC2API
}
// CreateTags ignores tagging of instances done by the AWS fi.Cloud implementation of DetachInstance()
func (e *ec2IgnoreTags) CreateTags(*ec2.CreateTagsInput) (*ec2.CreateTagsOutput, error) {
return &ec2.CreateTagsOutput{}, nil
}
func TestRollingUpdateMaxSurgeAllNeedUpdate(t *testing.T) {
c, cloud, cluster := getTestSetup()
concurrentTest := newConcurrentTest(t, cloud, 2, true)
c.ValidateSuccessDuration = 0
c.ClusterValidator = concurrentTest
cloud.MockAutoscaling = &concurrentTestAutoscaling{
AutoScalingAPI: cloud.MockAutoscaling,
ConcurrentTest: concurrentTest,
}
cloud.MockEC2 = &ec2IgnoreTags{EC2API: concurrentTest}
two := intstr.FromInt(2)
cluster.Spec.RollingUpdate = &kopsapi.RollingUpdate{
MaxSurge: &two,
}
groups := make(map[string]*cloudinstances.CloudInstanceGroup)
makeGroup(groups, c.K8sClient, cloud, "node-1", kopsapi.InstanceGroupRoleNode, 6, 6)
err := c.RollingUpdate(groups, cluster, &kopsapi.InstanceGroupList{})
assert.NoError(t, err, "rolling update")
assertGroupInstanceCount(t, cloud, "node-1", 0)
concurrentTest.AssertComplete()
}
func TestRollingUpdateMaxSurgeAllButOneNeedUpdate(t *testing.T) {
c, cloud, cluster := getTestSetup()
concurrentTest := newConcurrentTest(t, cloud, 2, false)
c.ValidateSuccessDuration = 0
c.ClusterValidator = concurrentTest
cloud.MockAutoscaling = &concurrentTestAutoscaling{
AutoScalingAPI: cloud.MockAutoscaling,
ConcurrentTest: concurrentTest,
}
cloud.MockEC2 = &ec2IgnoreTags{EC2API: concurrentTest}
two := intstr.FromInt(2)
cluster.Spec.RollingUpdate = &kopsapi.RollingUpdate{
MaxSurge: &two,
}
groups := make(map[string]*cloudinstances.CloudInstanceGroup)
makeGroup(groups, c.K8sClient, cloud, "node-1", kopsapi.InstanceGroupRoleNode, 7, 6)
err := c.RollingUpdate(groups, cluster, &kopsapi.InstanceGroupList{})
assert.NoError(t, err, "rolling update")
assertGroupInstanceCount(t, cloud, "node-1", 1)
concurrentTest.AssertComplete()
}
type countDetach struct {
autoscalingiface.AutoScalingAPI
Count int
}
func (c *countDetach) DetachInstances(input *autoscaling.DetachInstancesInput) (*autoscaling.DetachInstancesOutput, error) {
c.Count += len(input.InstanceIds)
return &autoscaling.DetachInstancesOutput{}, nil
}
func TestRollingUpdateMaxSurgeGreaterThanNeedUpdate(t *testing.T) {
c, cloud, cluster := getTestSetup()
countDetach := &countDetach{AutoScalingAPI: cloud.MockAutoscaling}
cloud.MockAutoscaling = countDetach
cloud.MockEC2 = &ec2IgnoreTags{EC2API: cloud.MockEC2}
ten := intstr.FromInt(10)
cluster.Spec.RollingUpdate = &kopsapi.RollingUpdate{
MaxSurge: &ten,
}
groups := make(map[string]*cloudinstances.CloudInstanceGroup)
makeGroup(groups, c.K8sClient, cloud, "node-1", kopsapi.InstanceGroupRoleNode, 3, 2)
err := c.RollingUpdate(groups, cluster, &kopsapi.InstanceGroupList{})
assert.NoError(t, err, "rolling update")
assertGroupInstanceCount(t, cloud, "node-1", 1)
assert.Equal(t, 2, countDetach.Count)
}
// Request validate (1) -->
// <-- validated
// Detach instance -->
// Request validate (2) -->
// <-- validated
// Detach instance -->
// Request validate (3) -->
// <-- validated
// Request terminate 3 nodes -->
// <-- 3 nodes terminated, 1 left
// Request validate (4) -->
// <-- validated
// Request terminate 1 node -->
// <-- 1 node terminated, 0 left
// Request validate (5) -->
// <-- validated
type alreadyDetachedTest struct {
ec2iface.EC2API
t *testing.T
mutex sync.Mutex
terminationRequestsLeft int
numValidations int
detached map[string]bool
}
func (t *alreadyDetachedTest) Validate() (*validation.ValidationCluster, error) {
t.mutex.Lock()
defer t.mutex.Unlock()
t.numValidations++
switch t.numValidations {
case 1, 2, 3:
assert.Equal(t.t, t.numValidations, len(t.detached), "number of detached instances")
case 4:
t.mutex.Unlock()
time.Sleep(20 * time.Millisecond) // NodeInterval plus some
t.mutex.Lock()
assert.Equal(t.t, 1, t.terminationRequestsLeft, "terminations left")
case 5:
assert.Equal(t.t, 0, t.terminationRequestsLeft, "terminations left")
case 6:
t.t.Error("unexpected sixth call to Validate")
}
return &validation.ValidationCluster{}, nil
}
func (t *alreadyDetachedTest) TerminateInstances(input *ec2.TerminateInstancesInput) (*ec2.TerminateInstancesOutput, error) {
if input.DryRun != nil && *input.DryRun {
return &ec2.TerminateInstancesOutput{}, nil
}
t.mutex.Lock()
defer t.mutex.Unlock()
for _, id := range input.InstanceIds {
assert.Equal(t.t, 3, len(t.detached), "Number of detached instances")
assert.GreaterOrEqual(t.t, t.numValidations, 3, "Number of previous validations")
if t.terminationRequestsLeft == 1 {
assert.True(t.t, t.detached[*id], "Last deleted instance %q was detached", *id)
}
t.terminationRequestsLeft--
}
return t.EC2API.TerminateInstances(input)
}
type alreadyDetachedTestAutoscaling struct {
autoscalingiface.AutoScalingAPI
AlreadyDetachedTest *alreadyDetachedTest
}
func (m *alreadyDetachedTestAutoscaling) DetachInstances(input *autoscaling.DetachInstancesInput) (*autoscaling.DetachInstancesOutput, error) {
m.AlreadyDetachedTest.mutex.Lock()
defer m.AlreadyDetachedTest.mutex.Unlock()
for _, id := range input.InstanceIds {
assert.Less(m.AlreadyDetachedTest.t, len(m.AlreadyDetachedTest.detached), 3, "Number of detached instances")
assert.False(m.AlreadyDetachedTest.t, m.AlreadyDetachedTest.detached[*id], *id+" already detached")
m.AlreadyDetachedTest.detached[*id] = true
}
return &autoscaling.DetachInstancesOutput{}, nil
}
func TestRollingUpdateMaxSurgeAllNeedUpdateOneAlreadyDetached(t *testing.T) {
c, cloud, cluster := getTestSetup()
alreadyDetachedTest := &alreadyDetachedTest{
EC2API: cloud.MockEC2,
t: t,
terminationRequestsLeft: 4,
detached: map[string]bool{},
}
c.ValidateSuccessDuration = 0
c.ClusterValidator = alreadyDetachedTest
cloud.MockAutoscaling = &alreadyDetachedTestAutoscaling{
AutoScalingAPI: cloud.MockAutoscaling,
AlreadyDetachedTest: alreadyDetachedTest,
}
cloud.MockEC2 = &ec2IgnoreTags{EC2API: alreadyDetachedTest}
three := intstr.FromInt(3)
cluster.Spec.RollingUpdate = &kopsapi.RollingUpdate{
MaxSurge: &three,
}
groups := make(map[string]*cloudinstances.CloudInstanceGroup)
makeGroup(groups, c.K8sClient, cloud, "node-1", kopsapi.InstanceGroupRoleNode, 4, 4)
alreadyDetachedTest.detached[groups["node-1"].NeedUpdate[3].ID] = true
groups["node-1"].NeedUpdate[3].Detached = true
err := c.RollingUpdate(groups, cluster, &kopsapi.InstanceGroupList{})
assert.NoError(t, err, "rolling update")
assertGroupInstanceCount(t, cloud, "node-1", 0)
assert.Equal(t, 5, alreadyDetachedTest.numValidations, "Number of validations")
}
func TestRollingUpdateMaxSurgeAllNeedUpdateMaxAlreadyDetached(t *testing.T) {
c, cloud, cluster := getTestSetup()
// Should behave the same as TestRollingUpdateMaxUnavailableAllNeedUpdate
concurrentTest := newConcurrentTest(t, cloud, 0, true)
c.ValidateSuccessDuration = 0
c.ClusterValidator = concurrentTest
cloud.MockEC2 = concurrentTest
two := intstr.FromInt(2)
cluster.Spec.RollingUpdate = &kopsapi.RollingUpdate{
MaxSurge: &two,
}
groups := make(map[string]*cloudinstances.CloudInstanceGroup)
makeGroup(groups, c.K8sClient, cloud, "node-1", kopsapi.InstanceGroupRoleNode, 7, 7)
groups["node-1"].NeedUpdate[1].Detached = true
groups["node-1"].NeedUpdate[3].Detached = true
// TODO verify those are the last two instances terminated
err := c.RollingUpdate(groups, cluster, &kopsapi.InstanceGroupList{})
assert.NoError(t, err, "rolling update")
assertGroupInstanceCount(t, cloud, "node-1", 0)
concurrentTest.AssertComplete()
}
func assertCordon(t *testing.T, action testingclient.PatchAction) {
assert.Equal(t, "nodes", action.GetResource().Resource)
assert.Equal(t, cordonPatch, string(action.GetPatch()))

View File

@@ -31,19 +31,32 @@ func resolveSettings(cluster *kops.Cluster, group *kops.InstanceGroup, numInstan
if rollingUpdate.MaxUnavailable == nil {
rollingUpdate.MaxUnavailable = def.MaxUnavailable
}
if rollingUpdate.MaxSurge == nil {
rollingUpdate.MaxSurge = def.MaxSurge
}
}
if rollingUpdate.MaxSurge == nil {
zero := intstr.FromInt(0)
rollingUpdate.MaxSurge = &zero
}
if rollingUpdate.MaxSurge.Type == intstr.String {
surge, _ := intstr.GetValueFromIntOrPercent(rollingUpdate.MaxSurge, numInstances, true)
surgeInt := intstr.FromInt(surge)
rollingUpdate.MaxSurge = &surgeInt
}
maxUnavailableDefault := intstr.FromInt(0)
if rollingUpdate.MaxSurge.Type == intstr.Int && rollingUpdate.MaxSurge.IntVal == 0 {
maxUnavailableDefault = intstr.FromInt(1)
}
if rollingUpdate.MaxUnavailable == nil {
rollingUpdate.MaxUnavailable = &maxUnavailableDefault
}
if rollingUpdate.MaxUnavailable.Type == intstr.String {
unavailable, _ := intstr.GetValueFromIntOrPercent(rollingUpdate.MaxUnavailable, numInstances, false)
if unavailable <= 0 {
// While we round down, percentages should resolve to a minimum of 1
unavailable = 1

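The net effect of the defaulting above: maxSurge defaults to 0, and maxUnavailable defaults to 1 only when the resolved maxSurge is 0, otherwise to 0, so a bare spec always permits progress through exactly one of the two mechanisms. A condensed sketch of that rule (simplified, not the kops code):

	// defaultMaxUnavailable mirrors the interplay in resolveSettings above.
	func defaultMaxUnavailable(resolvedMaxSurge int) int {
		if resolvedMaxSurge == 0 {
			return 1 // no surge capacity, so allow one node down at a time
		}
		return 0 // surge supplies replacement capacity before any node drains
	}
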
View File

@@ -37,6 +37,11 @@ func TestSettings(t *testing.T) {
defaultValue: intstr.FromInt(1),
nonDefaultValue: intstr.FromInt(2),
},
{
name: "MaxSurge",
defaultValue: intstr.FromInt(0),
nonDefaultValue: intstr.FromInt(2),
},
} {
t.Run(tc.name, func(t *testing.T) {
defaultCluster := &kops.RollingUpdate{}
@@ -133,21 +138,6 @@ func TestMaxUnavailable(t *testing.T) {
value: "100%",
expected: 10,
},
} {
t.Run(fmt.Sprintf("%s %d", tc.value, tc.numInstances), func(t *testing.T) {
value := intstr.Parse(tc.value)
@@ -165,3 +155,52 @@
})
}
}
func TestMaxSurge(t *testing.T) {
for _, tc := range []struct {
numInstances int
value string
expected int32
}{
{
numInstances: 1,
value: "0",
expected: 0,
},
{
numInstances: 1,
value: "0%",
expected: 0,
},
{
numInstances: 10,
value: "31%",
expected: 4,
},
{
numInstances: 10,
value: "100%",
expected: 10,
},
} {
t.Run(fmt.Sprintf("%s %d", tc.value, tc.numInstances), func(t *testing.T) {
value := intstr.Parse(tc.value)
rollingUpdate := kops.RollingUpdate{
MaxSurge: &value,
}
instanceGroup := kops.InstanceGroup{
Spec: kops.InstanceGroupSpec{
RollingUpdate: &rollingUpdate,
},
}
resolved := resolveSettings(&kops.Cluster{}, &instanceGroup, tc.numInstances)
assert.Equal(t, intstr.Int, resolved.MaxSurge.Type)
assert.Equal(t, tc.expected, resolved.MaxSurge.IntVal)
if tc.expected == 0 {
assert.Equal(t, int32(1), resolved.MaxUnavailable.IntVal, "MaxUnavailable default")
} else {
assert.Equal(t, int32(0), resolved.MaxUnavailable.IntVal, "MaxUnavailable default")
}
})
}
}


@@ -100,6 +100,12 @@ func (c *Cloud) DeleteInstance(i *cloudinstances.CloudInstanceGroupMember) error
 	return fmt.Errorf("digital ocean cloud provider does not support deleting cloud instances at this time")
 }
 
+// DetachInstance is not implemented yet. It needs to cause a cloud instance to no longer be counted against the group's size limits.
+func (c *Cloud) DetachInstance(i *cloudinstances.CloudInstanceGroupMember) error {
+	klog.V(8).Info("digitalocean cloud provider DetachInstance not implemented yet")
+	return fmt.Errorf("digital ocean cloud provider does not support surging")
+}
+
 // ProviderID returns the kops api identifier for DigitalOcean cloud provider
 func (c *Cloud) ProviderID() kops.CloudProviderID {
 	return kops.CloudProviderDO


@@ -193,6 +193,11 @@ func DeleteInstance(cloud Cloud, instance *cloudinstances.CloudInstanceGroupMember) error {
 	return fmt.Errorf("spotinst: unexpected instance group type, got: %T", group.Raw)
 }
 
+// DetachInstance is not implemented yet. It needs to cause a cloud instance to no longer be counted against the group's size limits.
+func DetachInstance(cloud Cloud, instance *cloudinstances.CloudInstanceGroupMember) error {
+	return fmt.Errorf("spotinst does not support surging")
+}
+
 // GetCloudGroups returns a list of InstanceGroups as CloudInstanceGroup objects.
 func GetCloudGroups(cloud Cloud, cluster *kops.Cluster, instanceGroups []*kops.InstanceGroup,
 	warnUnmatched bool, nodes []v1.Node) (map[string]*cloudinstances.CloudInstanceGroup, error) {


@@ -261,13 +261,20 @@ func (v *ValidationCluster) validateNodes(cloudGroups map[string]*cloudinstances.CloudInstanceGroup
 	var allMembers []*cloudinstances.CloudInstanceGroupMember
 	allMembers = append(allMembers, cloudGroup.Ready...)
 	allMembers = append(allMembers, cloudGroup.NeedUpdate...)
-	if len(allMembers) < cloudGroup.MinSize {
+
+	numNodes := 0
+	for _, m := range allMembers {
+		if !m.Detached {
+			numNodes++
+		}
+	}
+	if numNodes < cloudGroup.MinSize {
 		v.addError(&ValidationError{
 			Kind: "InstanceGroup",
 			Name: cloudGroup.InstanceGroup.Name,
 			Message: fmt.Sprintf("InstanceGroup %q did not have enough nodes %d vs %d",
 				cloudGroup.InstanceGroup.Name,
-				len(allMembers),
+				numNodes,
 				cloudGroup.MinSize),
 		})
 	}
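The swap from len(allMembers) to numNodes is the validation half of the surge feature: detached instances still appear in the group (the Cloud interface requires them in NeedUpdate) but must not count toward MinSize, otherwise a group kept afloat only by detached, soon-to-be-deleted nodes would validate as healthy. A stand-in sketch of the counting rule, using simplified types rather than the real cloudinstances structs:

package main

import "fmt"

// member is a simplified stand-in for cloudinstances.CloudInstanceGroupMember.
type member struct {
	id       string
	detached bool
}

// activeCount mirrors the validator's rule above: only instances that are not
// detached count toward the group's MinSize.
func activeCount(members []member) int {
	n := 0
	for _, m := range members {
		if !m.detached {
			n++
		}
	}
	return n
}

func main() {
	group := []member{{id: "i-00001"}, {id: "i-00002", detached: true}}
	minSize := 2
	fmt.Println(activeCount(group) < minSize) // true: one active node vs MinSize 2 fails validation
}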


@@ -164,6 +164,59 @@ func Test_ValidateNodesNotEnough(t *testing.T) {
 	}
 }
 
+func Test_ValidateDetachedNodesDontCount(t *testing.T) {
+	groups := make(map[string]*cloudinstances.CloudInstanceGroup)
+	groups["node-1"] = &cloudinstances.CloudInstanceGroup{
+		InstanceGroup: &kopsapi.InstanceGroup{
+			ObjectMeta: metav1.ObjectMeta{
+				Name: "node-1",
+			},
+			Spec: kopsapi.InstanceGroupSpec{
+				Role: kopsapi.InstanceGroupRoleNode,
+			},
+		},
+		MinSize: 2,
+		Ready: []*cloudinstances.CloudInstanceGroupMember{
+			{
+				ID: "i-00001",
+				Node: &v1.Node{
+					ObjectMeta: metav1.ObjectMeta{Name: "node-1a"},
+					Status: v1.NodeStatus{
+						Conditions: []v1.NodeCondition{
+							{Type: "Ready", Status: v1.ConditionTrue},
+						},
+					},
+				},
+			},
+		},
+		NeedUpdate: []*cloudinstances.CloudInstanceGroupMember{
+			{
+				ID: "i-00002",
+				Node: &v1.Node{
+					ObjectMeta: metav1.ObjectMeta{Name: "node-1b"},
+					Status: v1.NodeStatus{
+						Conditions: []v1.NodeCondition{
+							{Type: "Ready", Status: v1.ConditionTrue},
+						},
+					},
+				},
+				Detached: true,
+			},
+		},
+	}
+
+	v, err := testValidate(t, groups, nil)
+	require.NoError(t, err)
+	if !assert.Len(t, v.Failures, 1) ||
+		!assert.Equal(t, &ValidationError{
+			Kind: "InstanceGroup",
+			Name: "node-1",
+			Message: "InstanceGroup \"node-1\" did not have enough nodes 1 vs 2",
+		}, v.Failures[0]) {
+		printDebug(t, v)
+	}
+}
+
 func Test_ValidateNodeNotReady(t *testing.T) {
 	groups := make(map[string]*cloudinstances.CloudInstanceGroup)
 	groups["node-1"] = &cloudinstances.CloudInstanceGroup{


@@ -28,20 +28,24 @@ type Cloud interface {
 	DNS() (dnsprovider.Interface, error)
 
-	// FindVPCInfo looks up the specified VPC by id, returning info if found, otherwise (nil, nil)
+	// FindVPCInfo looks up the specified VPC by id, returning info if found, otherwise (nil, nil).
 	FindVPCInfo(id string) (*VPCInfo, error)
 
-	// DeleteInstance deletes a cloud instance
+	// DeleteInstance deletes a cloud instance.
 	DeleteInstance(instance *cloudinstances.CloudInstanceGroupMember) error
 
-	// DeleteGroup deletes the cloud resources that make up a CloudInstanceGroup, including the instances
+	// DeleteGroup deletes the cloud resources that make up a CloudInstanceGroup, including the instances.
 	DeleteGroup(group *cloudinstances.CloudInstanceGroup) error
 
-	// GetCloudGroups returns a map of cloud instances that back a kops cluster
+	// DetachInstance causes a cloud instance to no longer be counted against the group's size limits.
+	DetachInstance(instance *cloudinstances.CloudInstanceGroupMember) error
+
+	// GetCloudGroups returns a map of cloud instances that back a kops cluster.
+	// Detached instances must be returned in the NeedUpdate slice.
 	GetCloudGroups(cluster *kops.Cluster, instancegroups []*kops.InstanceGroup, warnUnmatched bool, nodes []v1.Node) (map[string]*cloudinstances.CloudInstanceGroup, error)
 
-	// Region returns the cloud region bound to the cloud instance
-	// If the region concept does not apply, returns ""
+	// Region returns the cloud region bound to the cloud instance.
+	// If the region concept does not apply, returns "".
 	Region() string
 }
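DetachInstance is the hook the surge feature hangs on: detaching keeps an instance running and registered with the cluster while freeing its group to launch a replacement. The PR's actual rolling-update orchestration lives elsewhere in the kops tree; the function below is a hypothetical caller written only to illustrate the contract (the name surgeBeforeUpdate and the maxSurge plumbing are inventions, not code from this commit):

package example

import (
	"k8s.io/kops/pkg/cloudinstances"
	"k8s.io/kops/upup/pkg/fi"
)

// surgeBeforeUpdate detaches up to maxSurge outdated instances so their group
// launches replacements while the old nodes keep serving traffic.
func surgeBeforeUpdate(cloud fi.Cloud, group *cloudinstances.CloudInstanceGroup, maxSurge int) error {
	detached := 0
	for _, member := range group.NeedUpdate {
		if detached >= maxSurge {
			break
		}
		if member.Detached {
			continue // surged on an earlier pass; still reported in NeedUpdate
		}
		if err := cloud.DetachInstance(member); err != nil {
			return err
		}
		detached++
	}
	return nil
}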


@@ -180,6 +180,10 @@ func (c *aliCloudImplementation) DeleteInstance(i *cloudinstances.CloudInstanceGroupMember) error {
 	return nil
 }
 
+func (c *aliCloudImplementation) DetachInstance(i *cloudinstances.CloudInstanceGroupMember) error {
+	return errors.New("aliCloud cloud provider does not support surging")
+}
+
 func (c *aliCloudImplementation) FindVPCInfo(id string) (*fi.VPCInfo, error) {
 	request := &ecs.DescribeVpcsArgs{
 		RegionId: common.Region(c.Region()),


@@ -85,6 +85,8 @@ const TagNameKopsRole = "kubernetes.io/kops/role"
 // TagNameClusterOwnershipPrefix is the AWS tag used for ownership
 const TagNameClusterOwnershipPrefix = "kubernetes.io/cluster/"
 
+const tagNameDetachedInstance = "kops.k8s.io/detached-from-asg"
+
 const (
 	WellKnownAccountAmazonLinux2 = "137112412989"
 	WellKnownAccountCentOS = "679593333241"
@@ -358,6 +360,23 @@ func deleteGroup(c AWSCloud, g *cloudinstances.CloudInstanceGroup) error {
 		launchTemplate = aws.StringValue(asg.LaunchTemplate.LaunchTemplateName)
 	}
 
+	// Delete detached instances
+	{
+		detached, err := findDetachedInstances(c, asg)
+		if err != nil {
+			return fmt.Errorf("error searching for detached instances for autoscaling group %q: %v", name, err)
+		}
+		if len(detached) > 0 {
+			klog.V(2).Infof("Deleting detached instances for autoscaling group %q", name)
+			req := &ec2.TerminateInstancesInput{
+				InstanceIds: detached,
+			}
+			if _, err := c.EC2().TerminateInstances(req); err != nil {
+				return fmt.Errorf("error deleting detached instances for autoscaling group %q: %v", name, err)
+			}
+		}
+	}
+
 	// Delete ASG
 	{
 		klog.V(2).Infof("Deleting autoscaling group %q", name)
@@ -418,12 +437,11 @@ func deleteInstance(c AWSCloud, i *cloudinstances.CloudInstanceGroupMember) error {
 		return fmt.Errorf("id was not set on CloudInstanceGroupMember: %v", i)
 	}
 
-	request := &autoscaling.TerminateInstanceInAutoScalingGroupInput{
-		InstanceId: aws.String(id),
-		ShouldDecrementDesiredCapacity: aws.Bool(false),
+	request := &ec2.TerminateInstancesInput{
+		InstanceIds: []*string{aws.String(id)},
 	}
-	if _, err := c.Autoscaling().TerminateInstanceInAutoScalingGroup(request); err != nil {
+	if _, err := c.EC2().TerminateInstances(request); err != nil {
 		return fmt.Errorf("error deleting instance %q: %v", id, err)
 	}
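deleteInstance now goes through the EC2 API rather than TerminateInstanceInAutoScalingGroup because a detached instance is no longer an ASG member the autoscaling call could reach; ec2.TerminateInstances removes attached and detached instances alike, and the mockautoscaling ec2Shim keeps group sizes in sync for tests. A small illustrative sketch of the underlying SDK call, with a placeholder instance id and DryRun set so nothing is actually terminated:

package main

import (
	"fmt"

	"github.com/aws/aws-sdk-go/aws"
	"github.com/aws/aws-sdk-go/aws/session"
	"github.com/aws/aws-sdk-go/service/ec2"
)

func main() {
	sess := session.Must(session.NewSession(aws.NewConfig().WithRegion("us-east-1")))
	svc := ec2.New(sess)

	// "i-0123456789abcdef0" is a placeholder, not a real instance.
	input := &ec2.TerminateInstancesInput{
		InstanceIds: []*string{aws.String("i-0123456789abcdef0")},
		DryRun:      aws.Bool(true), // the mock ec2Shim short-circuits on DryRun too
	}
	if _, err := svc.TerminateInstances(input); err != nil {
		// With DryRun set, a DryRunOperation error means the real call would succeed.
		fmt.Println("dry run result:", err)
	}
}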
@@ -432,7 +450,42 @@ func deleteInstance(c AWSCloud, i *cloudinstances.CloudInstanceGroupMember) error {
 	return nil
 }
 
-// TODO not used yet, as this requires a major refactor of rolling-update code, slowly but surely
+// DetachInstance causes an aws instance to no longer be counted against the ASG's size limits.
+func (c *awsCloudImplementation) DetachInstance(i *cloudinstances.CloudInstanceGroupMember) error {
+	if c.spotinst != nil {
+		return spotinst.DetachInstance(c.spotinst, i)
+	}
+
+	return detachInstance(c, i)
+}
+
+func detachInstance(c AWSCloud, i *cloudinstances.CloudInstanceGroupMember) error {
+	id := i.ID
+	if id == "" {
+		return fmt.Errorf("id was not set on CloudInstanceGroupMember: %v", i)
+	}
+
+	asg := i.CloudInstanceGroup.Raw.(*autoscaling.Group)
+	if err := c.CreateTags(id, map[string]string{tagNameDetachedInstance: *asg.AutoScalingGroupName}); err != nil {
+		return fmt.Errorf("error tagging instance %q: %v", id, err)
+	}
+
+	// TODO this also deregisters the instance from any ELB attached to the ASG. Do we care?
+	input := &autoscaling.DetachInstancesInput{
+		AutoScalingGroupName: aws.String(i.CloudInstanceGroup.HumanName),
+		InstanceIds: []*string{aws.String(id)},
+		ShouldDecrementDesiredCapacity: aws.Bool(false),
+	}
+	if _, err := c.Autoscaling().DetachInstances(input); err != nil {
+		return fmt.Errorf("error detaching instance %q: %v", id, err)
+	}
+
+	klog.V(8).Infof("detached aws ec2 instance %q", id)
+	return nil
+}
+
 // GetCloudGroups returns a groups of instances that back a kops instance groups
 func (c *awsCloudImplementation) GetCloudGroups(cluster *kops.Cluster, instancegroups []*kops.InstanceGroup, warnUnmatched bool, nodes []v1.Node) (map[string]*cloudinstances.CloudInstanceGroup, error) {
@@ -467,7 +520,7 @@ func getCloudGroups(c AWSCloud, cluster *kops.Cluster, instancegroups []*kops.InstanceGroup, warnUnmatched bool, nodes []v1.Node) (map[string]*cloudinstances.CloudInstanceGroup, error) {
 			continue
 		}
 
-		groups[instancegroup.ObjectMeta.Name], err = awsBuildCloudInstanceGroup(c, instancegroup, asg, nodeMap)
+		groups[instancegroup.ObjectMeta.Name], err = awsBuildCloudInstanceGroup(c, cluster, instancegroup, asg, nodeMap)
 		if err != nil {
 			return nil, fmt.Errorf("error getting cloud instance group %q: %v", instancegroup.ObjectMeta.Name, err)
 		}
@@ -648,12 +701,14 @@ func findInstanceLaunchConfiguration(i *autoscaling.Instance) string {
 	return ""
 }
 
-func awsBuildCloudInstanceGroup(c AWSCloud, ig *kops.InstanceGroup, g *autoscaling.Group, nodeMap map[string]*v1.Node) (*cloudinstances.CloudInstanceGroup, error) {
+func awsBuildCloudInstanceGroup(c AWSCloud, cluster *kops.Cluster, ig *kops.InstanceGroup, g *autoscaling.Group, nodeMap map[string]*v1.Node) (*cloudinstances.CloudInstanceGroup, error) {
 	newConfigName, err := findAutoscalingGroupLaunchConfiguration(c, g)
 	if err != nil {
 		return nil, err
 	}
 
+	instanceSeen := map[string]bool{}
 	cg := &cloudinstances.CloudInstanceGroup{
 		HumanName: aws.StringValue(g.AutoScalingGroupName),
 		InstanceGroup: ig,
@@ -668,6 +723,7 @@ func awsBuildCloudInstanceGroup(c AWSCloud, ig *kops.InstanceGroup, g *autoscaling.Group, nodeMap map[string]*v1.Node) (*cloudinstances.CloudInstanceGroup, error) {
 			klog.Warningf("ignoring instance with no instance id: %s in autoscaling group: %s", id, cg.HumanName)
 			continue
 		}
+		instanceSeen[id] = true
 
 		// @step: check if the instance is terminating
 		if aws.StringValue(i.LifecycleState) == autoscaling.LifecycleStateTerminating {
 			klog.Warningf("ignoring instance as it is terminating: %s in autoscaling group: %s", id, cg.HumanName)
@@ -680,9 +736,44 @@ func awsBuildCloudInstanceGroup(c AWSCloud, ig *kops.InstanceGroup, g *autoscaling.Group, nodeMap map[string]*v1.Node) (*cloudinstances.CloudInstanceGroup, error) {
 		}
 	}
 
+	detached, err := findDetachedInstances(c, g)
+	if err != nil {
+		return nil, fmt.Errorf("error searching for detached instances: %v", err)
+	}
+	for _, id := range detached {
+		if id != nil && *id != "" && !instanceSeen[*id] {
+			if err := cg.NewDetachedCloudInstanceGroupMember(*id, nodeMap); err != nil {
+				return nil, fmt.Errorf("error creating cloud instance group member: %v", err)
+			}
+			instanceSeen[*id] = true
+		}
+	}
+
 	return cg, nil
 }
 
+func findDetachedInstances(c AWSCloud, g *autoscaling.Group) ([]*string, error) {
+	req := &ec2.DescribeInstancesInput{
+		Filters: []*ec2.Filter{
+			NewEC2Filter("tag:"+tagNameDetachedInstance, aws.StringValue(g.AutoScalingGroupName)),
+			NewEC2Filter("instance-state-name", "pending", "running", "stopping", "stopped"),
+		},
+	}
+
+	result, err := c.EC2().DescribeInstances(req)
+	if err != nil {
+		return nil, err
+	}
+
+	var detached []*string
+	for _, r := range result.Reservations {
+		for _, i := range r.Instances {
+			detached = append(detached, i.InstanceId)
+		}
+	}
+	return detached, nil
+}
+
 func (c *awsCloudImplementation) Tags() map[string]string {
 	// Defensive copy
 	tags := make(map[string]string)
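The detached-instance lifecycle rides entirely on the kops.k8s.io/detached-from-asg tag: detachInstance writes it, findDetachedInstances queries it, and deleteGroup terminates whatever it finds. A standalone sketch of that round-trip using plain aws-sdk-go calls instead of kops' AWSCloud helpers (the instance id, group name, and default-region session are placeholders):

package main

import (
	"fmt"

	"github.com/aws/aws-sdk-go/aws"
	"github.com/aws/aws-sdk-go/aws/session"
	"github.com/aws/aws-sdk-go/service/ec2"
)

func main() {
	sess := session.Must(session.NewSession())
	svc := ec2.New(sess)

	// Tag an instance as detached from its group.
	_, err := svc.CreateTags(&ec2.CreateTagsInput{
		Resources: []*string{aws.String("i-0abcdef1234567890")},
		Tags: []*ec2.Tag{
			{Key: aws.String("kops.k8s.io/detached-from-asg"), Value: aws.String("nodes.example.com")},
		},
	})
	if err != nil {
		panic(err)
	}

	// Later, find every live instance carrying that tag, the same query
	// findDetachedInstances issues above.
	out, err := svc.DescribeInstances(&ec2.DescribeInstancesInput{
		Filters: []*ec2.Filter{
			{Name: aws.String("tag:kops.k8s.io/detached-from-asg"), Values: []*string{aws.String("nodes.example.com")}},
			{Name: aws.String("instance-state-name"), Values: aws.StringSlice([]string{"pending", "running", "stopping", "stopped"})},
		},
	})
	if err != nil {
		panic(err)
	}
	for _, r := range out.Reservations {
		for _, i := range r.Instances {
			fmt.Println(aws.StringValue(i.InstanceId))
		}
	}
}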


@@ -90,6 +90,10 @@ func (c *MockAWSCloud) DeleteInstance(i *cloudinstances.CloudInstanceGroupMember) error {
 	return deleteInstance(c, i)
 }
 
+func (c *MockAWSCloud) DetachInstance(i *cloudinstances.CloudInstanceGroupMember) error {
+	return detachInstance(c, i)
+}
+
 func (c *MockAWSCloud) GetCloudGroups(cluster *kops.Cluster, instancegroups []*kops.InstanceGroup, warnUnmatched bool, nodes []v1.Node) (map[string]*cloudinstances.CloudInstanceGroup, error) {
 	return getCloudGroups(c, cluster, instancegroups, warnUnmatched, nodes)
 }


@@ -68,6 +68,13 @@ func (c *Cloud) DeleteGroup(g *cloudinstances.CloudInstanceGroup) error {
 	return fmt.Errorf("baremetal cloud provider does not support deleting cloud groups at this time")
 }
 
+// DetachInstance is not implemented yet. It needs to cause a cloud instance to no longer be counted against the group's size limits.
+// Baremetal may not support this.
+func (c *Cloud) DetachInstance(i *cloudinstances.CloudInstanceGroupMember) error {
+	klog.V(8).Infof("baremetal cloud provider DetachInstance not implemented")
+	return fmt.Errorf("baremetal cloud provider does not support surging")
+}
+
 //DeleteInstance is not implemented yet, is func needs to delete a DO instance.
 //Baremetal may not support this.
 func (c *Cloud) DeleteInstance(instance *cloudinstances.CloudInstanceGroupMember) error {


@@ -61,6 +61,18 @@ func (c *mockGCECloud) DeleteInstance(i *cloudinstances.CloudInstanceGroupMember) error {
 	return recreateCloudInstanceGroupMember(c, i)
 }
 
+// DetachInstance is not implemented yet. It needs to cause a cloud instance to no longer be counted against the group's size limits.
+func (c *gceCloudImplementation) DetachInstance(i *cloudinstances.CloudInstanceGroupMember) error {
+	klog.V(8).Info("gce cloud provider DetachInstance not implemented yet")
+	return fmt.Errorf("gce cloud provider does not support surging")
+}
+
+// DetachInstance is not implemented yet. It needs to cause a cloud instance to no longer be counted against the group's size limits.
+func (c *mockGCECloud) DetachInstance(i *cloudinstances.CloudInstanceGroupMember) error {
+	klog.V(8).Info("gce cloud provider DetachInstance not implemented yet")
+	return fmt.Errorf("gce cloud provider does not support surging")
+}
+
 // recreateCloudInstanceGroupMember recreates the specified instances, managed by an InstanceGroupManager
 func recreateCloudInstanceGroupMember(c GCECloud, i *cloudinstances.CloudInstanceGroupMember) error {
 	mig := i.CloudInstanceGroup.Raw.(*compute.InstanceGroupManager)


@@ -110,6 +110,12 @@ func (c *openstackCloud) DeleteInstanceWithID(instanceID string) error {
 	return servers.Delete(c.novaClient, instanceID).ExtractErr()
 }
 
+// DetachInstance is not implemented yet. It needs to cause a cloud instance to no longer be counted against the group's size limits.
+func (c *openstackCloud) DetachInstance(i *cloudinstances.CloudInstanceGroupMember) error {
+	klog.V(8).Info("openstack cloud provider DetachInstance not implemented yet")
+	return fmt.Errorf("openstack cloud provider does not support surging")
+}
+
 func (c *openstackCloud) GetInstance(id string) (*servers.Server, error) {
 	var server *servers.Server


@@ -131,6 +131,12 @@ func (c *VSphereCloud) DeleteInstance(i *cloudinstances.CloudInstanceGroupMember) error {
 	return fmt.Errorf("vSphere cloud provider does not support deleting cloud instances at this time.")
 }
 
+// DetachInstance is not implemented yet. It needs to cause a cloud instance to no longer be counted against the group's size limits.
+func (c *VSphereCloud) DetachInstance(i *cloudinstances.CloudInstanceGroupMember) error {
+	klog.V(8).Info("vSphere cloud provider DetachInstance not implemented yet")
+	return fmt.Errorf("vSphere cloud provider does not support surging")
+}
+
 // DNS returns dnsprovider interface for this vSphere cloud.
 func (c *VSphereCloud) DNS() (dnsprovider.Interface, error) {
 	var provider dnsprovider.Interface