Merge pull request #8313 from johngmyers/surge

Option to surge during rolling update
Kubernetes Prow Robot 2020-03-04 10:21:47 -08:00 committed by GitHub
commit a5dabf58dc
33 changed files with 1067 additions and 104 deletions
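For orientation before the diffs: enabling surge is a one-field change on the RollingUpdate spec. A minimal sketch in Go, mirroring the tests in this PR (the surrounding Cluster value and import paths are assumed from the kops tree):

package example

import (
	"k8s.io/apimachinery/pkg/util/intstr"
	kopsapi "k8s.io/kops/pkg/apis/kops"
)

// enableSurge allows up to 30% extra nodes during a rolling update.
// With a non-zero maxSurge, maxUnavailable now defaults to 0, so the
// update creates replacements before draining old nodes.
func enableSurge(cluster *kopsapi.Cluster) {
	maxSurge := intstr.FromString("30%")
	cluster.Spec.RollingUpdate = &kopsapi.RollingUpdate{
		MaxSurge: &maxSurge,
	}
}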

View File

@ -5,6 +5,7 @@ go_library(
srcs = [
"api.go",
"attach.go",
"ec2shim.go",
"group.go",
"launchconfigurations.go",
"tags.go",
@ -16,6 +17,8 @@ go_library(
"//vendor/github.com/aws/aws-sdk-go/aws/request:go_default_library",
"//vendor/github.com/aws/aws-sdk-go/service/autoscaling:go_default_library",
"//vendor/github.com/aws/aws-sdk-go/service/autoscaling/autoscalingiface:go_default_library",
"//vendor/github.com/aws/aws-sdk-go/service/ec2:go_default_library",
"//vendor/github.com/aws/aws-sdk-go/service/ec2/ec2iface:go_default_library",
"//vendor/k8s.io/klog:go_default_library",
],
)

View File

@ -0,0 +1,52 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package mockautoscaling
import (
"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/service/autoscaling"
"github.com/aws/aws-sdk-go/service/ec2"
"github.com/aws/aws-sdk-go/service/ec2/ec2iface"
)
type ec2Shim struct {
ec2iface.EC2API
mockAutoscaling *MockAutoscaling
}
func (m *MockAutoscaling) GetEC2Shim(e ec2iface.EC2API) ec2iface.EC2API {
return &ec2Shim{
EC2API: e,
mockAutoscaling: m,
}
}
func (e *ec2Shim) TerminateInstances(input *ec2.TerminateInstancesInput) (*ec2.TerminateInstancesOutput, error) {
if input.DryRun != nil && *input.DryRun {
return &ec2.TerminateInstancesOutput{}, nil
}
for _, id := range input.InstanceIds {
request := &autoscaling.TerminateInstanceInAutoScalingGroupInput{
InstanceId: id,
ShouldDecrementDesiredCapacity: aws.Bool(false),
}
if _, err := e.mockAutoscaling.TerminateInstanceInAutoScalingGroup(request); err != nil {
return nil, err
}
}
return &ec2.TerminateInstancesOutput{}, nil
}

View File

@ -2937,6 +2937,20 @@ spec:
description: RollingUpdate defines the default rolling-update settings
for instance groups
properties:
maxSurge:
anyOf:
- type: string
- type: integer
description: 'MaxSurge is the maximum number of extra nodes that
can be created during the update. The value can be an absolute
number (for example 5) or a percentage of desired nodes (for
example 10%). The absolute number is calculated from a percentage
by rounding up. A value of 0 for both this and MaxUnavailable
disables rolling updates. Has no effect on instance groups with
role "Master". Defaults to 0. Example: when this is set to 30%,
the InstanceGroup can be scaled up immediately when the rolling
update starts, such that the total number of old and new nodes
does not exceed 130% of desired nodes.'
maxUnavailable:
anyOf:
- type: string
@ -2945,12 +2959,13 @@ spec:
can be unavailable during the update. The value can be an absolute
number (for example 5) or a percentage of desired nodes (for example
10%). The absolute number is calculated from a percentage by rounding
down. A value of 0 disables rolling updates. Defaults to 1. Example:
when this is set to 30%, the InstanceGroup can be scaled down
to 70% of desired nodes immediately when the rolling update starts.
Once new nodes are ready, more old nodes can be drained, ensuring
that the total number of nodes available at all times during the
update is at least 70% of desired nodes.'
down. A value of 0 for both this and MaxSurge disables rolling
updates. Defaults to 1 if MaxSurge is 0, otherwise defaults to
0. Example: when this is set to 30%, the InstanceGroup can be
scaled down to 70% of desired nodes immediately when the rolling
update starts. Once new nodes are ready, more old nodes can be
drained, ensuring that the total number of nodes available at
all times during the update is at least 70% of desired nodes.'
type: object
secretStore:
description: SecretStore is the VFS path to where secrets are stored
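
The asymmetric rounding in these descriptions (maxSurge rounds up, maxUnavailable rounds down) is just the roundUp flag of intstr.GetValueFromIntOrPercent, the helper the diffs below call. A small runnable sketch, with values chosen for illustration:

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/util/intstr"
)

func main() {
	desired := 10
	surge := intstr.FromString("31%")
	unavailable := intstr.FromString("31%")

	// maxSurge rounds up: 31% of 10 desired nodes -> 4 extra nodes.
	s, _ := intstr.GetValueFromIntOrPercent(&surge, desired, true)
	// maxUnavailable rounds down: 31% of 10 -> 3 nodes at once.
	u, _ := intstr.GetValueFromIntOrPercent(&unavailable, desired, false)
	fmt.Println(s, u) // 4 3
}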

View File

@ -630,6 +630,20 @@ spec:
rollingUpdate:
description: RollingUpdate defines the rolling-update behavior
properties:
maxSurge:
anyOf:
- type: string
- type: integer
description: 'MaxSurge is the maximum number of extra nodes that
can be created during the update. The value can be an absolute
number (for example 5) or a percentage of desired nodes (for
example 10%). The absolute number is calculated from a percentage
by rounding up. A value of 0 for both this and MaxUnavailable
disables rolling updates. Has no effect on instance groups with
role "Master". Defaults to 0. Example: when this is set to 30%,
the InstanceGroup can be scaled up immediately when the rolling
update starts, such that the total number of old and new nodes
does not exceed 130% of desired nodes.'
maxUnavailable:
anyOf:
- type: string
@ -638,12 +652,13 @@ spec:
can be unavailable during the update. The value can be an absolute
number (for example 5) or a percentage of desired nodes (for example
10%). The absolute number is calculated from a percentage by rounding
down. A value of 0 disables rolling updates. Defaults to 1. Example:
when this is set to 30%, the InstanceGroup can be scaled down
to 70% of desired nodes immediately when the rolling update starts.
Once new nodes are ready, more old nodes can be drained, ensuring
that the total number of nodes available at all times during the
update is at least 70% of desired nodes.'
down. A value of 0 for both this and MaxSurge disables rolling
updates. Defaults to 1 if MaxSurge is 0, otherwise defaults to
0. Example: when this is set to 30%, the InstanceGroup can be
scaled down to 70% of desired nodes immediately when the rolling
update starts. Once new nodes are ready, more old nodes can be
drained, ensuring that the total number of nodes available at
all times during the update is at least 70% of desired nodes.'
type: object
rootVolumeDeleteOnTermination:
description: 'RootVolumeDeleteOnTermination configures root volume retention

View File

@ -684,8 +684,8 @@ type RollingUpdate struct {
// The value can be an absolute number (for example 5) or a percentage of desired
// nodes (for example 10%).
// The absolute number is calculated from a percentage by rounding down.
// A value of 0 disables rolling updates.
// Defaults to 1.
// A value of 0 for both this and MaxSurge disables rolling updates.
// Defaults to 1 if MaxSurge is 0, otherwise defaults to 0.
// Example: when this is set to 30%, the InstanceGroup can be scaled
// down to 70% of desired nodes immediately when the rolling update
// starts. Once new nodes are ready, more old nodes can be drained,
@ -693,4 +693,18 @@ type RollingUpdate struct {
// during the update is at least 70% of desired nodes.
// +optional
MaxUnavailable *intstr.IntOrString `json:"maxUnavailable,omitempty"`
// MaxSurge is the maximum number of extra nodes that can be created
// during the update.
// The value can be an absolute number (for example 5) or a percentage of
// desired nodes (for example 10%).
// The absolute number is calculated from a percentage by rounding up.
// A value of 0 for both this and MaxUnavailable disables rolling updates.
// Has no effect on instance groups with role "Master".
// Defaults to 0.
// Example: when this is set to 30%, the InstanceGroup can be scaled
// up immediately when the rolling update starts, such that the total
// number of old and new nodes does not exceed 130% of desired
// nodes.
// +optional
MaxSurge *intstr.IntOrString `json:"maxSurge,omitempty"`
}

View File

@ -566,8 +566,8 @@ type RollingUpdate struct {
// The value can be an absolute number (for example 5) or a percentage of desired
// nodes (for example 10%).
// The absolute number is calculated from a percentage by rounding down.
// A value of 0 disables rolling updates.
// Defaults to 1.
// A value of 0 for both this and MaxSurge disables rolling updates.
// Defaults to 1 if MaxSurge is 0, otherwise defaults to 0.
// Example: when this is set to 30%, the InstanceGroup can be scaled
// down to 70% of desired nodes immediately when the rolling update
// starts. Once new nodes are ready, more old nodes can be drained,
@ -575,4 +575,18 @@ type RollingUpdate struct {
// during the update is at least 70% of desired nodes.
// +optional
MaxUnavailable *intstr.IntOrString `json:"maxUnavailable,omitempty"`
// MaxSurge is the maximum number of extra nodes that can be created
// during the update.
// The value can be an absolute number (for example 5) or a percentage of
// desired nodes (for example 10%).
// The absolute number is calculated from a percentage by rounding up.
// A value of 0 for both this and MaxUnavailable disables rolling updates.
// Has no effect on instance groups with role "Master".
// Defaults to 0.
// Example: when this is set to 30%, the InstanceGroup can be scaled
// up immediately when the rolling update starts, such that the total
// number of old and new nodes does not exceed 130% of desired
// nodes.
// +optional
MaxSurge *intstr.IntOrString `json:"maxSurge,omitempty"`
}

View File

@ -4720,6 +4720,7 @@ func Convert_kops_RBACAuthorizationSpec_To_v1alpha1_RBACAuthorizationSpec(in *ko
func autoConvert_v1alpha1_RollingUpdate_To_kops_RollingUpdate(in *RollingUpdate, out *kops.RollingUpdate, s conversion.Scope) error {
out.MaxUnavailable = in.MaxUnavailable
out.MaxSurge = in.MaxSurge
return nil
}
@ -4730,6 +4731,7 @@ func Convert_v1alpha1_RollingUpdate_To_kops_RollingUpdate(in *RollingUpdate, out
func autoConvert_kops_RollingUpdate_To_v1alpha1_RollingUpdate(in *kops.RollingUpdate, out *RollingUpdate, s conversion.Scope) error {
out.MaxUnavailable = in.MaxUnavailable
out.MaxSurge = in.MaxSurge
return nil
}

View File

@ -3303,6 +3303,11 @@ func (in *RollingUpdate) DeepCopyInto(out *RollingUpdate) {
*out = new(intstr.IntOrString)
**out = **in
}
if in.MaxSurge != nil {
in, out := &in.MaxSurge, &out.MaxSurge
*out = new(intstr.IntOrString)
**out = **in
}
return
}

View File

@ -579,8 +579,8 @@ type RollingUpdate struct {
// The value can be an absolute number (for example 5) or a percentage of desired
// nodes (for example 10%).
// The absolute number is calculated from a percentage by rounding down.
// A value of 0 disables rolling updates.
// Defaults to 1.
// A value of 0 for both this and MaxSurge disables rolling updates.
// Defaults to 1 if MaxSurge is 0, otherwise defaults to 0.
// Example: when this is set to 30%, the InstanceGroup can be scaled
// down to 70% of desired nodes immediately when the rolling update
// starts. Once new nodes are ready, more old nodes can be drained,
@ -588,4 +588,18 @@ type RollingUpdate struct {
// during the update is at least 70% of desired nodes.
// +optional
MaxUnavailable *intstr.IntOrString `json:"maxUnavailable,omitempty"`
// MaxSurge is the maximum number of extra nodes that can be created
// during the update.
// The value can be an absolute number (for example 5) or a percentage of
// desired nodes (for example 10%).
// The absolute number is calculated from a percentage by rounding up.
// A value of 0 for both this and MaxUnavailable disables rolling updates.
// Has no effect on instance groups with role "Master".
// Defaults to 0.
// Example: when this is set to 30%, the InstanceGroup can be scaled
// up immediately when the rolling update starts, such that the total
// number of old and new nodes does not exceed 130% of desired
// nodes.
// +optional
MaxSurge *intstr.IntOrString `json:"maxSurge,omitempty"`
}

View File

@ -4990,6 +4990,7 @@ func Convert_kops_RBACAuthorizationSpec_To_v1alpha2_RBACAuthorizationSpec(in *ko
func autoConvert_v1alpha2_RollingUpdate_To_kops_RollingUpdate(in *RollingUpdate, out *kops.RollingUpdate, s conversion.Scope) error {
out.MaxUnavailable = in.MaxUnavailable
out.MaxSurge = in.MaxSurge
return nil
}
@ -5000,6 +5001,7 @@ func Convert_v1alpha2_RollingUpdate_To_kops_RollingUpdate(in *RollingUpdate, out
func autoConvert_kops_RollingUpdate_To_v1alpha2_RollingUpdate(in *kops.RollingUpdate, out *RollingUpdate, s conversion.Scope) error {
out.MaxUnavailable = in.MaxUnavailable
out.MaxSurge = in.MaxSurge
return nil
}

View File

@ -3374,6 +3374,11 @@ func (in *RollingUpdate) DeepCopyInto(out *RollingUpdate) {
*out = new(intstr.IntOrString)
**out = **in
}
if in.MaxSurge != nil {
in, out := &in.MaxSurge, &out.MaxSurge
*out = new(intstr.IntOrString)
**out = **in
}
return
}

View File

@ -118,7 +118,7 @@ func ValidateInstanceGroup(g *kops.InstanceGroup) field.ErrorList {
allErrs = append(allErrs, validateInstanceProfile(g.Spec.IAM, field.NewPath("spec", "iam"))...)
if g.Spec.RollingUpdate != nil {
allErrs = append(allErrs, validateRollingUpdate(g.Spec.RollingUpdate, field.NewPath("spec", "rollingUpdate"))...)
allErrs = append(allErrs, validateRollingUpdate(g.Spec.RollingUpdate, field.NewPath("spec", "rollingUpdate"), g.Spec.Role == kops.InstanceGroupRoleMaster)...)
}
return allErrs

View File

@ -123,7 +123,7 @@ func validateClusterSpec(spec *kops.ClusterSpec, fieldPath *field.Path) field.Er
}
if spec.RollingUpdate != nil {
allErrs = append(allErrs, validateRollingUpdate(spec.RollingUpdate, fieldPath.Child("rollingUpdate"))...)
allErrs = append(allErrs, validateRollingUpdate(spec.RollingUpdate, fieldPath.Child("rollingUpdate"), false)...)
}
return allErrs
@ -597,7 +597,7 @@ func validateContainerRuntime(runtime *string, fldPath *field.Path) field.ErrorL
return allErrs
}
func validateRollingUpdate(rollingUpdate *kops.RollingUpdate, fldpath *field.Path) field.ErrorList {
func validateRollingUpdate(rollingUpdate *kops.RollingUpdate, fldpath *field.Path, onMasterInstanceGroup bool) field.ErrorList {
allErrs := field.ErrorList{}
if rollingUpdate.MaxUnavailable != nil {
unavailable, err := intstr.GetValueFromIntOrPercent(rollingUpdate.MaxUnavailable, 1, false)
@ -609,6 +609,18 @@ func validateRollingUpdate(rollingUpdate *kops.RollingUpdate, fldpath *field.Pat
allErrs = append(allErrs, field.Invalid(fldpath.Child("maxUnavailable"), rollingUpdate.MaxUnavailable, "Cannot be negative"))
}
}
if rollingUpdate.MaxSurge != nil {
surge, err := intstr.GetValueFromIntOrPercent(rollingUpdate.MaxSurge, 1000, true)
if err != nil {
allErrs = append(allErrs, field.Invalid(fldpath.Child("maxSurge"), rollingUpdate.MaxSurge,
fmt.Sprintf("Unable to parse: %v", err)))
}
if onMasterInstanceGroup && surge != 0 {
allErrs = append(allErrs, field.Forbidden(fldpath.Child("maxSurge"), "Cannot surge instance groups with role \"Master\""))
} else if surge < 0 {
allErrs = append(allErrs, field.Invalid(fldpath.Child("maxSurge"), rollingUpdate.MaxSurge, "Cannot be negative"))
}
}
return allErrs
}

View File

@ -405,6 +405,7 @@ func Test_Validate_Calico(t *testing.T) {
func Test_Validate_RollingUpdate(t *testing.T) {
grid := []struct {
Input kops.RollingUpdate
OnMasterIG bool
ExpectedErrors []string
}{
{
@ -438,9 +439,94 @@ func Test_Validate_RollingUpdate(t *testing.T) {
},
ExpectedErrors: []string{"Invalid value::testField.maxUnavailable"},
},
{
Input: kops.RollingUpdate{
MaxSurge: intStr(intstr.FromInt(0)),
},
},
{
Input: kops.RollingUpdate{
MaxSurge: intStr(intstr.FromString("0%")),
},
},
{
Input: kops.RollingUpdate{
MaxSurge: intStr(intstr.FromInt(1)),
},
},
{
Input: kops.RollingUpdate{
MaxSurge: intStr(intstr.FromString("1%")),
},
},
{
Input: kops.RollingUpdate{
MaxSurge: intStr(intstr.FromString("nope")),
},
ExpectedErrors: []string{"Invalid value::testField.maxSurge"},
},
{
Input: kops.RollingUpdate{
MaxSurge: intStr(intstr.FromInt(-1)),
},
ExpectedErrors: []string{"Invalid value::testField.maxSurge"},
},
{
Input: kops.RollingUpdate{
MaxSurge: intStr(intstr.FromString("-1%")),
},
ExpectedErrors: []string{"Invalid value::testField.maxSurge"},
},
{
Input: kops.RollingUpdate{
MaxSurge: intStr(intstr.FromInt(0)),
},
OnMasterIG: true,
},
{
Input: kops.RollingUpdate{
MaxSurge: intStr(intstr.FromString("0%")),
},
OnMasterIG: true,
},
{
Input: kops.RollingUpdate{
MaxSurge: intStr(intstr.FromInt(1)),
},
OnMasterIG: true,
ExpectedErrors: []string{"Forbidden::testField.maxSurge"},
},
{
Input: kops.RollingUpdate{
MaxSurge: intStr(intstr.FromString("1%")),
},
OnMasterIG: true,
ExpectedErrors: []string{"Forbidden::testField.maxSurge"},
},
{
Input: kops.RollingUpdate{
MaxSurge: intStr(intstr.FromString("nope")),
},
OnMasterIG: true,
ExpectedErrors: []string{"Invalid value::testField.maxSurge"},
},
{
Input: kops.RollingUpdate{
MaxSurge: intStr(intstr.FromInt(-1)),
},
OnMasterIG: true,
ExpectedErrors: []string{"Forbidden::testField.maxSurge"},
},
{
Input: kops.RollingUpdate{
MaxSurge: intStr(intstr.FromString("-1%")),
},
OnMasterIG: true,
ExpectedErrors: []string{"Forbidden::testField.maxSurge"},
},
}
for _, g := range grid {
errs := validateRollingUpdate(&g.Input, field.NewPath("testField"))
errs := validateRollingUpdate(&g.Input, field.NewPath("testField"), g.OnMasterIG)
testErrors(t, g.Input, errs, g.ExpectedErrors)
}
}

View File

@ -3588,6 +3588,11 @@ func (in *RollingUpdate) DeepCopyInto(out *RollingUpdate) {
*out = new(intstr.IntOrString)
**out = **in
}
if in.MaxSurge != nil {
in, out := &in.MaxSurge, &out.MaxSurge
*out = new(intstr.IntOrString)
**out = **in
}
return
}

View File

@ -47,6 +47,8 @@ type CloudInstanceGroupMember struct {
Node *v1.Node
// CloudInstanceGroup is the managing CloudInstanceGroup
CloudInstanceGroup *CloudInstanceGroup
// Detached is whether fi.Cloud.DetachInstance has been successfully called on the instance.
Detached bool
}
// NewCloudInstanceGroupMember creates a new CloudInstanceGroupMember
@ -74,6 +76,28 @@ func (c *CloudInstanceGroup) NewCloudInstanceGroupMember(instanceId string, newG
return nil
}
// NewDetachedCloudInstanceGroupMember creates a new CloudInstanceGroupMember for a detached instance
func (c *CloudInstanceGroup) NewDetachedCloudInstanceGroupMember(instanceId string, nodeMap map[string]*v1.Node) error {
if instanceId == "" {
return fmt.Errorf("instance id for cloud instance member cannot be empty")
}
cm := &CloudInstanceGroupMember{
ID: instanceId,
CloudInstanceGroup: c,
Detached: true,
}
node := nodeMap[instanceId]
if node != nil {
cm.Node = node
} else {
klog.V(8).Infof("unable to find node for instance: %s", instanceId)
}
c.NeedUpdate = append(c.NeedUpdate, cm)
return nil
}
// Status returns a human-readable Status indicating whether an update is needed
func (c *CloudInstanceGroup) Status() string {
if len(c.NeedUpdate) == 0 {

View File

@ -45,6 +45,8 @@ go_test(
"//vendor/github.com/aws/aws-sdk-go/aws:go_default_library",
"//vendor/github.com/aws/aws-sdk-go/service/autoscaling:go_default_library",
"//vendor/github.com/aws/aws-sdk-go/service/autoscaling/autoscalingiface:go_default_library",
"//vendor/github.com/aws/aws-sdk-go/service/ec2:go_default_library",
"//vendor/github.com/aws/aws-sdk-go/service/ec2/ec2iface:go_default_library",
"//vendor/github.com/stretchr/testify/assert:go_default_library",
"//vendor/k8s.io/api/core/v1:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",

View File

@ -101,10 +101,7 @@ func promptInteractive(upgradedHostId, upgradedHostName string) (stopPrompting b
return stopPrompting, err
}
// TODO: Temporarily increase size of ASG?
// TODO: Remove from ASG first so status is immediately updated?
// RollingUpdate performs a rolling update on a list of ec2 instances.
// RollingUpdate performs a rolling update on a list of instances.
func (r *RollingUpdateInstanceGroup) RollingUpdate(rollingUpdateData *RollingUpdateCluster, cluster *api.Cluster, isBastion bool, sleepAfterTerminate time.Duration, validationTimeout time.Duration) (err error) {
// we should not get here, but hey I am going to check.
@ -152,17 +149,61 @@ func (r *RollingUpdateInstanceGroup) RollingUpdate(rollingUpdateData *RollingUpd
settings := resolveSettings(cluster, r.CloudGroup.InstanceGroup, numInstances)
runningDrains := 0
maxConcurrency := settings.MaxUnavailable.IntValue()
maxSurge := settings.MaxSurge.IntValue()
if maxSurge > len(update) {
maxSurge = len(update)
}
maxConcurrency := maxSurge + settings.MaxUnavailable.IntValue()
if maxConcurrency == 0 {
klog.Infof("Rolling updates for InstanceGroup %s are disabled", r.CloudGroup.InstanceGroup.Name)
return nil
}
if r.CloudGroup.InstanceGroup.Spec.Role == api.InstanceGroupRoleMaster && maxSurge != 0 {
// Masters are incapable of surging because they rely on registering themselves through
// the local apiserver. That apiserver depends on the local etcd, which relies on being
// joined to the etcd cluster.
maxSurge = 0
maxConcurrency = settings.MaxUnavailable.IntValue()
if maxConcurrency == 0 {
maxConcurrency = 1
}
}
if rollingUpdateData.Interactive {
if maxSurge > 1 {
maxSurge = 1
}
maxConcurrency = 1
}
update = prioritizeUpdate(update)
if maxSurge > 0 && !rollingUpdateData.CloudOnly {
for numSurge := 1; numSurge <= maxSurge; numSurge++ {
u := update[len(update)-numSurge]
if !u.Detached {
if err := r.detachInstance(u); err != nil {
return err
}
// If noneReady, wait until after one node is detached and its replacement validates
// before detaching more in case the current spec does not result in usable nodes.
if numSurge == maxSurge || noneReady {
// Wait for the minimum interval
klog.Infof("waiting for %v after detaching instance", sleepAfterTerminate)
time.Sleep(sleepAfterTerminate)
if err := r.maybeValidate(rollingUpdateData, validationTimeout, "detaching"); err != nil {
return err
}
noneReady = false
}
}
}
}
terminateChan := make(chan error, maxConcurrency)
for uIdx, u := range update {
@ -183,7 +224,7 @@ func (r *RollingUpdateInstanceGroup) RollingUpdate(rollingUpdateData *RollingUpd
return waitForPendingBeforeReturningError(runningDrains, terminateChan, err)
}
err = r.maybeValidate(rollingUpdateData, validationTimeout)
err = r.maybeValidate(rollingUpdateData, validationTimeout, "removing")
if err != nil {
return waitForPendingBeforeReturningError(runningDrains, terminateChan, err)
}
@ -229,7 +270,7 @@ func (r *RollingUpdateInstanceGroup) RollingUpdate(rollingUpdateData *RollingUpd
}
}
err = r.maybeValidate(rollingUpdateData, validationTimeout)
err = r.maybeValidate(rollingUpdateData, validationTimeout, "removing")
if err != nil {
return err
}
@ -238,6 +279,25 @@ func (r *RollingUpdateInstanceGroup) RollingUpdate(rollingUpdateData *RollingUpd
return nil
}
func prioritizeUpdate(update []*cloudinstances.CloudInstanceGroupMember) []*cloudinstances.CloudInstanceGroupMember {
// The priorities are, in order:
// attached before detached
// TODO unhealthy before healthy
// NeedUpdate before Ready (preserve original order)
result := make([]*cloudinstances.CloudInstanceGroupMember, 0, len(update))
var detached []*cloudinstances.CloudInstanceGroupMember
for _, u := range update {
if u.Detached {
detached = append(detached, u)
} else {
result = append(result, u)
}
}
result = append(result, detached...)
return result
}
func waitForPendingBeforeReturningError(runningDrains int, terminateChan chan error, err error) error {
for runningDrains > 0 {
<-terminateChan
@ -359,7 +419,7 @@ func (r *RollingUpdateInstanceGroup) drainTerminateAndWait(u *cloudinstances.Clo
return nil
}
func (r *RollingUpdateInstanceGroup) maybeValidate(rollingUpdateData *RollingUpdateCluster, validationTimeout time.Duration) error {
func (r *RollingUpdateInstanceGroup) maybeValidate(rollingUpdateData *RollingUpdateCluster, validationTimeout time.Duration, operation string) error {
if rollingUpdateData.CloudOnly {
klog.Warningf("Not validating cluster as cloudonly flag is set.")
@ -370,10 +430,10 @@ func (r *RollingUpdateInstanceGroup) maybeValidate(rollingUpdateData *RollingUpd
if rollingUpdateData.FailOnValidate {
klog.Errorf("Cluster did not validate within %s", validationTimeout)
return fmt.Errorf("error validating cluster after removing a node: %v", err)
return fmt.Errorf("error validating cluster after %s a node: %v", operation, err)
}
klog.Warningf("Cluster validation failed after removing instance, proceeding since fail-on-validate is set to false: %v", err)
klog.Warningf("Cluster validation failed after %s instance, proceeding since fail-on-validate is set to false: %v", operation, err)
}
}
return nil
@ -450,6 +510,30 @@ func (r *RollingUpdateInstanceGroup) validateCluster(rollingUpdateData *RollingU
}
// detachInstance detaches a Cloud Instance
func (r *RollingUpdateInstanceGroup) detachInstance(u *cloudinstances.CloudInstanceGroupMember) error {
id := u.ID
nodeName := ""
if u.Node != nil {
nodeName = u.Node.Name
}
if nodeName != "" {
klog.Infof("Detaching instance %q, node %q, in group %q.", id, nodeName, r.CloudGroup.HumanName)
} else {
klog.Infof("Detaching instance %q, in group %q.", id, r.CloudGroup.HumanName)
}
if err := r.Cloud.DetachInstance(u); err != nil {
if nodeName != "" {
return fmt.Errorf("error detaching instance %q, node %q: %v", id, nodeName, err)
} else {
return fmt.Errorf("error detaching instance %q: %v", id, err)
}
}
return nil
}
// DeleteInstance deletes a Cloud Instance.
func (r *RollingUpdateInstanceGroup) DeleteInstance(u *cloudinstances.CloudInstanceGroupMember) error {
id := u.ID

View File

@ -26,6 +26,8 @@ import (
"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/service/autoscaling"
"github.com/aws/aws-sdk-go/service/autoscaling/autoscalingiface"
"github.com/aws/aws-sdk-go/service/ec2"
"github.com/aws/aws-sdk-go/service/ec2/ec2iface"
"github.com/stretchr/testify/assert"
v1 "k8s.io/api/core/v1"
v1meta "k8s.io/apimachinery/pkg/apis/meta/v1"
@ -49,7 +51,9 @@ func getTestSetup() (*RollingUpdateCluster, *awsup.MockAWSCloud, *kopsapi.Cluste
k8sClient := fake.NewSimpleClientset()
mockcloud := awsup.BuildMockAWSCloud("us-east-1", "abc")
mockcloud.MockAutoscaling = &mockautoscaling.MockAutoscaling{}
mockAutoscaling := &mockautoscaling.MockAutoscaling{}
mockcloud.MockAutoscaling = mockAutoscaling
mockcloud.MockEC2 = mockAutoscaling.GetEC2Shim(mockcloud.MockEC2)
cluster := &kopsapi.Cluster{}
cluster.Name = "test.k8s.local"
@ -109,6 +113,7 @@ func makeGroup(groups map[string]*cloudinstances.CloudInstanceGroup, k8sClient k
fakeClient := k8sClient.(*fake.Clientset)
groups[name] = &cloudinstances.CloudInstanceGroup{
HumanName: name,
InstanceGroup: &kopsapi.InstanceGroup{
ObjectMeta: v1meta.ObjectMeta{
Name: name,
@ -117,6 +122,7 @@ func makeGroup(groups map[string]*cloudinstances.CloudInstanceGroup, k8sClient k
Role: role,
},
},
Raw: &autoscaling.Group{AutoScalingGroupName: aws.String("asg-" + name)},
}
cloud.Autoscaling().CreateAutoScalingGroup(&autoscaling.CreateAutoScalingGroupInput{
AutoScalingGroupName: aws.String(name),
@ -136,8 +142,9 @@ func makeGroup(groups map[string]*cloudinstances.CloudInstanceGroup, k8sClient k
_ = fakeClient.Tracker().Add(node)
}
member := cloudinstances.CloudInstanceGroupMember{
ID: id,
Node: node,
ID: id,
Node: node,
CloudInstanceGroup: groups[name],
}
if i < needUpdate {
groups[name].NeedUpdate = append(groups[name].NeedUpdate, &member)
@ -605,6 +612,52 @@ func TestRollingUpdateTaintAllButOneNeedUpdate(t *testing.T) {
assertGroupInstanceCount(t, cloud, "node-1", 1)
}
func TestRollingUpdateMaxSurgeIgnoredForMaster(t *testing.T) {
c, cloud, cluster := getTestSetup()
two := intstr.FromInt(2)
cluster.Spec.RollingUpdate = &kopsapi.RollingUpdate{
MaxSurge: &two,
}
groups := make(map[string]*cloudinstances.CloudInstanceGroup)
makeGroup(groups, c.K8sClient, cloud, "master-1", kopsapi.InstanceGroupRoleMaster, 3, 2)
err := c.RollingUpdate(groups, cluster, &kopsapi.InstanceGroupList{})
assert.NoError(t, err, "rolling update")
cordoned := ""
tainted := map[string]bool{}
deleted := map[string]bool{}
for _, action := range c.K8sClient.(*fake.Clientset).Actions() {
switch a := action.(type) {
case testingclient.PatchAction:
if string(a.GetPatch()) == cordonPatch {
assertCordon(t, a)
assert.Equal(t, "", cordoned, "at most one node cordoned at a time")
assert.True(t, tainted[a.GetName()], "node", a.GetName(), "tainted")
cordoned = a.GetName()
} else {
assertTaint(t, a)
assert.Equal(t, "", cordoned, "not tainting while node cordoned")
assert.False(t, tainted[a.GetName()], "node", a.GetName(), "already tainted")
tainted[a.GetName()] = true
}
case testingclient.DeleteAction:
assert.Equal(t, "nodes", a.GetResource().Resource)
assert.Equal(t, cordoned, a.GetName(), "node was cordoned before delete")
assert.False(t, deleted[a.GetName()], "node", a.GetName(), "already deleted")
deleted[a.GetName()] = true
cordoned = ""
case testingclient.ListAction:
// Don't care
default:
t.Errorf("unexpected action %v", a)
}
}
assertGroupInstanceCount(t, cloud, "master-1", 1)
}
func TestRollingUpdateDisabled(t *testing.T) {
c, cloud, cluster := getTestSetup()
@ -644,12 +697,26 @@ func TestRollingUpdateDisabledCloudonly(t *testing.T) {
// The concurrent update tests attempt to induce the following expected update sequence:
//
// (Only for "all need update" tests, to verify the toe-dipping behavior)
// (Only for surging "all need update" test, to verify the toe-dipping behavior)
// Request validate (8) -->
// <-- validated
// Detach instance -->
// Request validate (7) -->
// <-- validated
// Detach instance -->
// (end only for surging "all need update" tests)
// (Only for surging "all but one need update" test)
// Request validate (7) -->
// <-- validated
// Detach instance -->
// Detach instance -->
// (end only for surging "all but one need update" test)
// (Only for non-surging "all need update" tests, to verify the toe-dipping behavior)
// Request validate (7) -->
// <-- validated
// Request terminate 1 node (7) -->
// <-- 1 node terminated, 6 left
// (end only for "all need update" tests)
// (end only for non-surging "all need update" tests)
// Request validate (6) -->
// <-- validated
// Request terminate 2 nodes (6,5) -->
@ -672,19 +739,27 @@ func TestRollingUpdateDisabledCloudonly(t *testing.T) {
// <-- validated
type concurrentTest struct {
autoscalingiface.AutoScalingAPI
ec2iface.EC2API
t *testing.T
mutex sync.Mutex
surge int
terminationRequestsLeft int
previousValidation int
validationChan chan bool
terminationChan chan bool
detached map[string]bool
}
func (c *concurrentTest) Validate() (*validation.ValidationCluster, error) {
c.mutex.Lock()
defer c.mutex.Unlock()
if len(c.detached) < c.surge {
assert.Greater(c.t, c.previousValidation, 7, "previous validation")
c.previousValidation--
return &validation.ValidationCluster{}, nil
}
terminationRequestsLeft := c.terminationRequestsLeft
switch terminationRequestsLeft {
case 7, 6, 0:
@ -727,29 +802,40 @@ func (c *concurrentTest) Validate() (*validation.ValidationCluster, error) {
return &validation.ValidationCluster{}, nil
}
func (c *concurrentTest) TerminateInstanceInAutoScalingGroup(input *autoscaling.TerminateInstanceInAutoScalingGroupInput) (*autoscaling.TerminateInstanceInAutoScalingGroupOutput, error) {
func (c *concurrentTest) TerminateInstances(input *ec2.TerminateInstancesInput) (*ec2.TerminateInstancesOutput, error) {
if input.DryRun != nil && *input.DryRun {
return &ec2.TerminateInstancesOutput{}, nil
}
c.mutex.Lock()
defer c.mutex.Unlock()
terminationRequestsLeft := c.terminationRequestsLeft
c.terminationRequestsLeft--
switch terminationRequestsLeft {
case 7, 2, 1:
assert.Equal(c.t, terminationRequestsLeft, c.previousValidation, "previous validation")
case 6, 4:
assert.Equal(c.t, terminationRequestsLeft, c.previousValidation, "previous validation")
c.mutex.Unlock()
select {
case <-c.terminationChan:
case <-time.After(1 * time.Second):
c.t.Error("timed out reading from terminationChan")
for _, id := range input.InstanceIds {
assert.Equal(c.t, c.surge, len(c.detached), "Number of detached instances")
if c.detached[*id] {
assert.LessOrEqual(c.t, c.terminationRequestsLeft, c.surge, "Deleting detached instances last")
}
terminationRequestsLeft := c.terminationRequestsLeft
c.terminationRequestsLeft--
switch terminationRequestsLeft {
case 7, 2, 1:
assert.Equal(c.t, terminationRequestsLeft, c.previousValidation, "previous validation")
case 6, 4:
assert.Equal(c.t, terminationRequestsLeft, c.previousValidation, "previous validation")
c.mutex.Unlock()
select {
case <-c.terminationChan:
case <-time.After(1 * time.Second):
c.t.Error("timed out reading from terminationChan")
}
c.mutex.Lock()
go c.delayThenWakeValidation()
case 5, 3:
assert.Equal(c.t, terminationRequestsLeft+1, c.previousValidation, "previous validation")
}
c.mutex.Lock()
go c.delayThenWakeValidation()
case 5, 3:
assert.Equal(c.t, terminationRequestsLeft+1, c.previousValidation, "previous validation")
}
return c.AutoScalingAPI.TerminateInstanceInAutoScalingGroup(input)
return c.EC2API.TerminateInstances(input)
}
func (c *concurrentTest) delayThenWakeValidation() {
@ -767,28 +853,36 @@ func (c *concurrentTest) AssertComplete() {
assert.Equal(c.t, 0, c.previousValidation, "last validation")
}
func newConcurrentTest(t *testing.T, cloud *awsup.MockAWSCloud, allNeedUpdate bool) *concurrentTest {
func newConcurrentTest(t *testing.T, cloud *awsup.MockAWSCloud, numSurge int, allNeedUpdate bool) *concurrentTest {
test := concurrentTest{
AutoScalingAPI: cloud.MockAutoscaling,
EC2API: cloud.MockEC2,
t: t,
surge: numSurge,
terminationRequestsLeft: 6,
validationChan: make(chan bool),
terminationChan: make(chan bool),
detached: map[string]bool{},
}
if allNeedUpdate {
if numSurge == 0 && allNeedUpdate {
test.terminationRequestsLeft = 7
}
test.previousValidation = test.terminationRequestsLeft + 1
if numSurge == 0 {
test.previousValidation = test.terminationRequestsLeft + 1
} else if allNeedUpdate {
test.previousValidation = 9
} else {
test.previousValidation = 8
}
return &test
}
func TestRollingUpdateMaxUnavailableAllNeedUpdate(t *testing.T) {
c, cloud, cluster := getTestSetup()
concurrentTest := newConcurrentTest(t, cloud, true)
concurrentTest := newConcurrentTest(t, cloud, 0, true)
c.ValidateSuccessDuration = 0
c.ClusterValidator = concurrentTest
cloud.MockAutoscaling = concurrentTest
cloud.MockEC2 = concurrentTest
two := intstr.FromInt(2)
cluster.Spec.RollingUpdate = &kopsapi.RollingUpdate{
@ -808,10 +902,10 @@ func TestRollingUpdateMaxUnavailableAllNeedUpdate(t *testing.T) {
func TestRollingUpdateMaxUnavailableAllButOneNeedUpdate(t *testing.T) {
c, cloud, cluster := getTestSetup()
concurrentTest := newConcurrentTest(t, cloud, false)
concurrentTest := newConcurrentTest(t, cloud, 0, false)
c.ValidateSuccessDuration = 0
c.ClusterValidator = concurrentTest
cloud.MockAutoscaling = concurrentTest
cloud.MockEC2 = concurrentTest
two := intstr.FromInt(2)
cluster.Spec.RollingUpdate = &kopsapi.RollingUpdate{
@ -830,10 +924,10 @@ func TestRollingUpdateMaxUnavailableAllButOneNeedUpdate(t *testing.T) {
func TestRollingUpdateMaxUnavailableAllNeedUpdateMaster(t *testing.T) {
c, cloud, cluster := getTestSetup()
concurrentTest := newConcurrentTest(t, cloud, true)
concurrentTest := newConcurrentTest(t, cloud, 0, true)
c.ValidateSuccessDuration = 0
c.ClusterValidator = concurrentTest
cloud.MockAutoscaling = concurrentTest
cloud.MockEC2 = concurrentTest
two := intstr.FromInt(2)
cluster.Spec.RollingUpdate = &kopsapi.RollingUpdate{
@ -850,6 +944,264 @@ func TestRollingUpdateMaxUnavailableAllNeedUpdateMaster(t *testing.T) {
concurrentTest.AssertComplete()
}
type concurrentTestAutoscaling struct {
autoscalingiface.AutoScalingAPI
ConcurrentTest *concurrentTest
}
func (m *concurrentTestAutoscaling) DetachInstances(input *autoscaling.DetachInstancesInput) (*autoscaling.DetachInstancesOutput, error) {
m.ConcurrentTest.mutex.Lock()
defer m.ConcurrentTest.mutex.Unlock()
assert.Equal(m.ConcurrentTest.t, "node-1", *input.AutoScalingGroupName)
assert.False(m.ConcurrentTest.t, *input.ShouldDecrementDesiredCapacity)
for _, id := range input.InstanceIds {
assert.Less(m.ConcurrentTest.t, len(m.ConcurrentTest.detached), m.ConcurrentTest.surge, "Number of detached instances")
assert.False(m.ConcurrentTest.t, m.ConcurrentTest.detached[*id], *id+" already detached")
m.ConcurrentTest.detached[*id] = true
}
return &autoscaling.DetachInstancesOutput{}, nil
}
type ec2IgnoreTags struct {
ec2iface.EC2API
}
// CreateTags ignores tagging of instances done by the AWS fi.Cloud implementation of DetachInstance()
func (e *ec2IgnoreTags) CreateTags(*ec2.CreateTagsInput) (*ec2.CreateTagsOutput, error) {
return &ec2.CreateTagsOutput{}, nil
}
func TestRollingUpdateMaxSurgeAllNeedUpdate(t *testing.T) {
c, cloud, cluster := getTestSetup()
concurrentTest := newConcurrentTest(t, cloud, 2, true)
c.ValidateSuccessDuration = 0
c.ClusterValidator = concurrentTest
cloud.MockAutoscaling = &concurrentTestAutoscaling{
AutoScalingAPI: cloud.MockAutoscaling,
ConcurrentTest: concurrentTest,
}
cloud.MockEC2 = &ec2IgnoreTags{EC2API: concurrentTest}
two := intstr.FromInt(2)
cluster.Spec.RollingUpdate = &kopsapi.RollingUpdate{
MaxSurge: &two,
}
groups := make(map[string]*cloudinstances.CloudInstanceGroup)
makeGroup(groups, c.K8sClient, cloud, "node-1", kopsapi.InstanceGroupRoleNode, 6, 6)
err := c.RollingUpdate(groups, cluster, &kopsapi.InstanceGroupList{})
assert.NoError(t, err, "rolling update")
assertGroupInstanceCount(t, cloud, "node-1", 0)
concurrentTest.AssertComplete()
}
func TestRollingUpdateMaxSurgeAllButOneNeedUpdate(t *testing.T) {
c, cloud, cluster := getTestSetup()
concurrentTest := newConcurrentTest(t, cloud, 2, false)
c.ValidateSuccessDuration = 0
c.ClusterValidator = concurrentTest
cloud.MockAutoscaling = &concurrentTestAutoscaling{
AutoScalingAPI: cloud.MockAutoscaling,
ConcurrentTest: concurrentTest,
}
cloud.MockEC2 = &ec2IgnoreTags{EC2API: concurrentTest}
two := intstr.FromInt(2)
cluster.Spec.RollingUpdate = &kopsapi.RollingUpdate{
MaxSurge: &two,
}
groups := make(map[string]*cloudinstances.CloudInstanceGroup)
makeGroup(groups, c.K8sClient, cloud, "node-1", kopsapi.InstanceGroupRoleNode, 7, 6)
err := c.RollingUpdate(groups, cluster, &kopsapi.InstanceGroupList{})
assert.NoError(t, err, "rolling update")
assertGroupInstanceCount(t, cloud, "node-1", 1)
concurrentTest.AssertComplete()
}
type countDetach struct {
autoscalingiface.AutoScalingAPI
Count int
}
func (c *countDetach) DetachInstances(input *autoscaling.DetachInstancesInput) (*autoscaling.DetachInstancesOutput, error) {
c.Count += len(input.InstanceIds)
return &autoscaling.DetachInstancesOutput{}, nil
}
func TestRollingUpdateMaxSurgeGreaterThanNeedUpdate(t *testing.T) {
c, cloud, cluster := getTestSetup()
countDetach := &countDetach{AutoScalingAPI: cloud.MockAutoscaling}
cloud.MockAutoscaling = countDetach
cloud.MockEC2 = &ec2IgnoreTags{EC2API: cloud.MockEC2}
ten := intstr.FromInt(10)
cluster.Spec.RollingUpdate = &kopsapi.RollingUpdate{
MaxSurge: &ten,
}
groups := make(map[string]*cloudinstances.CloudInstanceGroup)
makeGroup(groups, c.K8sClient, cloud, "node-1", kopsapi.InstanceGroupRoleNode, 3, 2)
err := c.RollingUpdate(groups, cluster, &kopsapi.InstanceGroupList{})
assert.NoError(t, err, "rolling update")
assertGroupInstanceCount(t, cloud, "node-1", 1)
assert.Equal(t, 2, countDetach.Count)
}
// Request validate (1) -->
// <-- validated
// Detach instance -->
// Request validate (2) -->
// <-- validated
// Detach instance -->
// Request validate (3) -->
// <-- validated
// Request terminate 3 nodes -->
// <-- 3 nodes terminated, 1 left
// Request validate (4) -->
// <-- validated
// Request terminate 1 node -->
// <-- 1 node terminated, 0 left
// Request validate (5) -->
// <-- validated
type alreadyDetachedTest struct {
ec2iface.EC2API
t *testing.T
mutex sync.Mutex
terminationRequestsLeft int
numValidations int
detached map[string]bool
}
func (t *alreadyDetachedTest) Validate() (*validation.ValidationCluster, error) {
t.mutex.Lock()
defer t.mutex.Unlock()
t.numValidations++
switch t.numValidations {
case 1, 2, 3:
assert.Equal(t.t, t.numValidations, len(t.detached), "number of detached instances")
case 4:
t.mutex.Unlock()
time.Sleep(20 * time.Millisecond) // NodeInterval plus some
t.mutex.Lock()
assert.Equal(t.t, 1, t.terminationRequestsLeft, "terminations left")
case 5:
assert.Equal(t.t, 0, t.terminationRequestsLeft, "terminations left")
case 6:
t.t.Error("unexpected sixth call to Validate")
}
return &validation.ValidationCluster{}, nil
}
func (t *alreadyDetachedTest) TerminateInstances(input *ec2.TerminateInstancesInput) (*ec2.TerminateInstancesOutput, error) {
if input.DryRun != nil && *input.DryRun {
return &ec2.TerminateInstancesOutput{}, nil
}
t.mutex.Lock()
defer t.mutex.Unlock()
for _, id := range input.InstanceIds {
assert.Equal(t.t, 3, len(t.detached), "Number of detached instances")
assert.GreaterOrEqual(t.t, t.numValidations, 3, "Number of previous validations")
if t.terminationRequestsLeft == 1 {
assert.True(t.t, t.detached[*id], "Last deleted instance %q was detached", *id)
}
t.terminationRequestsLeft--
}
return t.EC2API.TerminateInstances(input)
}
type alreadyDetachedTestAutoscaling struct {
autoscalingiface.AutoScalingAPI
AlreadyDetachedTest *alreadyDetachedTest
}
func (m *alreadyDetachedTestAutoscaling) DetachInstances(input *autoscaling.DetachInstancesInput) (*autoscaling.DetachInstancesOutput, error) {
m.AlreadyDetachedTest.mutex.Lock()
defer m.AlreadyDetachedTest.mutex.Unlock()
for _, id := range input.InstanceIds {
assert.Less(m.AlreadyDetachedTest.t, len(m.AlreadyDetachedTest.detached), 3, "Number of detached instances")
assert.False(m.AlreadyDetachedTest.t, m.AlreadyDetachedTest.detached[*id], *id+" already detached")
m.AlreadyDetachedTest.detached[*id] = true
}
return &autoscaling.DetachInstancesOutput{}, nil
}
func TestRollingUpdateMaxSurgeAllNeedUpdateOneAlreadyDetached(t *testing.T) {
c, cloud, cluster := getTestSetup()
alreadyDetachedTest := &alreadyDetachedTest{
EC2API: cloud.MockEC2,
t: t,
terminationRequestsLeft: 4,
detached: map[string]bool{},
}
c.ValidateSuccessDuration = 0
c.ClusterValidator = alreadyDetachedTest
cloud.MockAutoscaling = &alreadyDetachedTestAutoscaling{
AutoScalingAPI: cloud.MockAutoscaling,
AlreadyDetachedTest: alreadyDetachedTest,
}
cloud.MockEC2 = &ec2IgnoreTags{EC2API: alreadyDetachedTest}
three := intstr.FromInt(3)
cluster.Spec.RollingUpdate = &kopsapi.RollingUpdate{
MaxSurge: &three,
}
groups := make(map[string]*cloudinstances.CloudInstanceGroup)
makeGroup(groups, c.K8sClient, cloud, "node-1", kopsapi.InstanceGroupRoleNode, 4, 4)
alreadyDetachedTest.detached[groups["node-1"].NeedUpdate[3].ID] = true
groups["node-1"].NeedUpdate[3].Detached = true
err := c.RollingUpdate(groups, cluster, &kopsapi.InstanceGroupList{})
assert.NoError(t, err, "rolling update")
assertGroupInstanceCount(t, cloud, "node-1", 0)
assert.Equal(t, 5, alreadyDetachedTest.numValidations, "Number of validations")
}
func TestRollingUpdateMaxSurgeAllNeedUpdateMaxAlreadyDetached(t *testing.T) {
c, cloud, cluster := getTestSetup()
// Should behave the same as TestRollingUpdateMaxUnavailableAllNeedUpdate
concurrentTest := newConcurrentTest(t, cloud, 0, true)
c.ValidateSuccessDuration = 0
c.ClusterValidator = concurrentTest
cloud.MockEC2 = concurrentTest
two := intstr.FromInt(2)
cluster.Spec.RollingUpdate = &kopsapi.RollingUpdate{
MaxSurge: &two,
}
groups := make(map[string]*cloudinstances.CloudInstanceGroup)
makeGroup(groups, c.K8sClient, cloud, "node-1", kopsapi.InstanceGroupRoleNode, 7, 7)
groups["node-1"].NeedUpdate[1].Detached = true
groups["node-1"].NeedUpdate[3].Detached = true
// TODO verify those are the last two instances terminated
err := c.RollingUpdate(groups, cluster, &kopsapi.InstanceGroupList{})
assert.NoError(t, err, "rolling update")
assertGroupInstanceCount(t, cloud, "node-1", 0)
concurrentTest.AssertComplete()
}
func assertCordon(t *testing.T, action testingclient.PatchAction) {
assert.Equal(t, "nodes", action.GetResource().Resource)
assert.Equal(t, cordonPatch, string(action.GetPatch()))

View File

@ -31,19 +31,32 @@ func resolveSettings(cluster *kops.Cluster, group *kops.InstanceGroup, numInstan
if rollingUpdate.MaxUnavailable == nil {
rollingUpdate.MaxUnavailable = def.MaxUnavailable
}
if rollingUpdate.MaxSurge == nil {
rollingUpdate.MaxSurge = def.MaxSurge
}
}
if rollingUpdate.MaxUnavailable == nil || rollingUpdate.MaxUnavailable.IntVal < 0 {
one := intstr.FromInt(1)
rollingUpdate.MaxUnavailable = &one
if rollingUpdate.MaxSurge == nil {
zero := intstr.FromInt(0)
rollingUpdate.MaxSurge = &zero
}
if rollingUpdate.MaxSurge.Type == intstr.String {
surge, _ := intstr.GetValueFromIntOrPercent(rollingUpdate.MaxSurge, numInstances, true)
surgeInt := intstr.FromInt(surge)
rollingUpdate.MaxSurge = &surgeInt
}
maxUnavailableDefault := intstr.FromInt(0)
if rollingUpdate.MaxSurge.Type == intstr.Int && rollingUpdate.MaxSurge.IntVal == 0 {
maxUnavailableDefault = intstr.FromInt(1)
}
if rollingUpdate.MaxUnavailable == nil {
rollingUpdate.MaxUnavailable = &maxUnavailableDefault
}
if rollingUpdate.MaxUnavailable.Type == intstr.String {
unavailable, err := intstr.GetValueFromIntOrPercent(rollingUpdate.MaxUnavailable, numInstances, false)
if err != nil {
// If unparseable use the default value
unavailable = 1
}
unavailable, _ := intstr.GetValueFromIntOrPercent(rollingUpdate.MaxUnavailable, numInstances, false)
if unavailable <= 0 {
// While we round down, percentages should resolve to a minimum of 1
unavailable = 1
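
The test file that follows exercises these rules; as a compact summary, here is a sketch of the defaulting the hunk above implements (the function name and shape are illustrative, not the actual resolveSettings):

package example

import "k8s.io/apimachinery/pkg/util/intstr"

// resolveDefaults sketches the new rules: an unset maxSurge becomes 0,
// maxUnavailable defaults to 1 only when the resolved maxSurge is 0
// (otherwise 0), surge percentages round up, and unavailable percentages
// round down with a floor of 1. Negative values are left to validation.
func resolveDefaults(maxSurge, maxUnavailable *intstr.IntOrString, numInstances int) (int, int) {
	surge := 0
	if maxSurge != nil {
		surge, _ = intstr.GetValueFromIntOrPercent(maxSurge, numInstances, true)
	}
	if maxUnavailable == nil {
		if surge == 0 {
			return surge, 1
		}
		return surge, 0
	}
	unavailable, _ := intstr.GetValueFromIntOrPercent(maxUnavailable, numInstances, false)
	if unavailable <= 0 && maxUnavailable.Type == intstr.String {
		unavailable = 1 // percentages resolve to a minimum of 1
	}
	return surge, unavailable
}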

View File

@ -37,6 +37,11 @@ func TestSettings(t *testing.T) {
defaultValue: intstr.FromInt(1),
nonDefaultValue: intstr.FromInt(2),
},
{
name: "MaxSurge",
defaultValue: intstr.FromInt(0),
nonDefaultValue: intstr.FromInt(2),
},
} {
t.Run(tc.name, func(t *testing.T) {
defaultCluster := &kops.RollingUpdate{}
@ -133,21 +138,6 @@ func TestMaxUnavailable(t *testing.T) {
value: "100%",
expected: 10,
},
{
numInstances: 5,
value: "fnord",
expected: 1,
},
{
numInstances: 5,
value: "-3",
expected: 1,
},
{
numInstances: 5,
value: "-3%",
expected: 1,
},
} {
t.Run(fmt.Sprintf("%s %d", tc.value, tc.numInstances), func(t *testing.T) {
value := intstr.Parse(tc.value)
@ -165,3 +155,52 @@ func TestMaxUnavailable(t *testing.T) {
})
}
}
func TestMaxSurge(t *testing.T) {
for _, tc := range []struct {
numInstances int
value string
expected int32
}{
{
numInstances: 1,
value: "0",
expected: 0,
},
{
numInstances: 1,
value: "0%",
expected: 0,
},
{
numInstances: 10,
value: "31%",
expected: 4,
},
{
numInstances: 10,
value: "100%",
expected: 10,
},
} {
t.Run(fmt.Sprintf("%s %d", tc.value, tc.numInstances), func(t *testing.T) {
value := intstr.Parse(tc.value)
rollingUpdate := kops.RollingUpdate{
MaxSurge: &value,
}
instanceGroup := kops.InstanceGroup{
Spec: kops.InstanceGroupSpec{
RollingUpdate: &rollingUpdate,
},
}
resolved := resolveSettings(&kops.Cluster{}, &instanceGroup, tc.numInstances)
assert.Equal(t, intstr.Int, resolved.MaxSurge.Type)
assert.Equal(t, tc.expected, resolved.MaxSurge.IntVal)
if tc.expected == 0 {
assert.Equal(t, int32(1), resolved.MaxUnavailable.IntVal, "MaxUnavailable default")
} else {
assert.Equal(t, int32(0), resolved.MaxUnavailable.IntVal, "MaxUnavailable default")
}
})
}
}

View File

@ -100,6 +100,12 @@ func (c *Cloud) DeleteInstance(i *cloudinstances.CloudInstanceGroupMember) error
return fmt.Errorf("digital ocean cloud provider does not support deleting cloud instances at this time")
}
// DetachInstance is not implemented yet. It needs to cause a cloud instance to no longer be counted against the group's size limits.
func (c *Cloud) DetachInstance(i *cloudinstances.CloudInstanceGroupMember) error {
klog.V(8).Info("digitalocean cloud provider DetachInstance not implemented yet")
return fmt.Errorf("digital ocean cloud provider does not support surging")
}
// ProviderID returns the kops api identifier for DigitalOcean cloud provider
func (c *Cloud) ProviderID() kops.CloudProviderID {
return kops.CloudProviderDO

View File

@ -193,6 +193,11 @@ func DeleteInstance(cloud Cloud, instance *cloudinstances.CloudInstanceGroupMemb
return fmt.Errorf("spotinst: unexpected instance group type, got: %T", group.Raw)
}
// DetachInstance is not implemented yet. It needs to cause a cloud instance to no longer be counted against the group's size limits.
func DetachInstance(cloud Cloud, instance *cloudinstances.CloudInstanceGroupMember) error {
return fmt.Errorf("spotinst does not support surging")
}
// GetCloudGroups returns a list of InstanceGroups as CloudInstanceGroup objects.
func GetCloudGroups(cloud Cloud, cluster *kops.Cluster, instanceGroups []*kops.InstanceGroup,
warnUnmatched bool, nodes []v1.Node) (map[string]*cloudinstances.CloudInstanceGroup, error) {

View File

@ -261,13 +261,20 @@ func (v *ValidationCluster) validateNodes(cloudGroups map[string]*cloudinstances
var allMembers []*cloudinstances.CloudInstanceGroupMember
allMembers = append(allMembers, cloudGroup.Ready...)
allMembers = append(allMembers, cloudGroup.NeedUpdate...)
if len(allMembers) < cloudGroup.MinSize {
numNodes := 0
for _, m := range allMembers {
if !m.Detached {
numNodes++
}
}
if numNodes < cloudGroup.MinSize {
v.addError(&ValidationError{
Kind: "InstanceGroup",
Name: cloudGroup.InstanceGroup.Name,
Message: fmt.Sprintf("InstanceGroup %q did not have enough nodes %d vs %d",
cloudGroup.InstanceGroup.Name,
len(allMembers),
numNodes,
cloudGroup.MinSize),
})
}

View File

@ -164,6 +164,59 @@ func Test_ValidateNodesNotEnough(t *testing.T) {
}
}
func Test_ValidateDetachedNodesDontCount(t *testing.T) {
groups := make(map[string]*cloudinstances.CloudInstanceGroup)
groups["node-1"] = &cloudinstances.CloudInstanceGroup{
InstanceGroup: &kopsapi.InstanceGroup{
ObjectMeta: metav1.ObjectMeta{
Name: "node-1",
},
Spec: kopsapi.InstanceGroupSpec{
Role: kopsapi.InstanceGroupRoleNode,
},
},
MinSize: 2,
Ready: []*cloudinstances.CloudInstanceGroupMember{
{
ID: "i-00001",
Node: &v1.Node{
ObjectMeta: metav1.ObjectMeta{Name: "node-1a"},
Status: v1.NodeStatus{
Conditions: []v1.NodeCondition{
{Type: "Ready", Status: v1.ConditionTrue},
},
},
},
},
},
NeedUpdate: []*cloudinstances.CloudInstanceGroupMember{
{
ID: "i-00002",
Node: &v1.Node{
ObjectMeta: metav1.ObjectMeta{Name: "node-1b"},
Status: v1.NodeStatus{
Conditions: []v1.NodeCondition{
{Type: "Ready", Status: v1.ConditionTrue},
},
},
},
Detached: true,
},
},
}
v, err := testValidate(t, groups, nil)
require.NoError(t, err)
if !assert.Len(t, v.Failures, 1) ||
!assert.Equal(t, &ValidationError{
Kind: "InstanceGroup",
Name: "node-1",
Message: "InstanceGroup \"node-1\" did not have enough nodes 1 vs 2",
}, v.Failures[0]) {
printDebug(t, v)
}
}
func Test_ValidateNodeNotReady(t *testing.T) {
groups := make(map[string]*cloudinstances.CloudInstanceGroup)
groups["node-1"] = &cloudinstances.CloudInstanceGroup{

View File

@ -28,20 +28,24 @@ type Cloud interface {
DNS() (dnsprovider.Interface, error)
// FindVPCInfo looks up the specified VPC by id, returning info if found, otherwise (nil, nil)
// FindVPCInfo looks up the specified VPC by id, returning info if found, otherwise (nil, nil).
FindVPCInfo(id string) (*VPCInfo, error)
// DeleteInstance deletes a cloud instance
// DeleteInstance deletes a cloud instance.
DeleteInstance(instance *cloudinstances.CloudInstanceGroupMember) error
// DeleteGroup deletes the cloud resources that make up a CloudInstanceGroup, including the instances
// DeleteGroup deletes the cloud resources that make up a CloudInstanceGroup, including the instances.
DeleteGroup(group *cloudinstances.CloudInstanceGroup) error
// GetCloudGroups returns a map of cloud instances that back a kops cluster
// DetachInstance causes a cloud instance to no longer be counted against the group's size limits.
DetachInstance(instance *cloudinstances.CloudInstanceGroupMember) error
// GetCloudGroups returns a map of cloud instances that back a kops cluster.
// Detached instances must be returned in the NeedUpdate slice.
GetCloudGroups(cluster *kops.Cluster, instancegroups []*kops.InstanceGroup, warnUnmatched bool, nodes []v1.Node) (map[string]*cloudinstances.CloudInstanceGroup, error)
// Region returns the cloud region bound to the cloud instance
// If the region concept does not apply, returns ""
// Region returns the cloud region bound to the cloud instance.
// If the region concept does not apply, returns "".
Region() string
}
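
The provider files below all follow the same pattern: clouds without detach support return an error from DetachInstance, which surfaces as a rolling-update failure if maxSurge is set on such a provider. A minimal sketch of that stub (the type name is hypothetical):

package example

import (
	"fmt"

	"k8s.io/kops/pkg/cloudinstances"
)

type unsupportedCloud struct{}

// DetachInstance refuses, so a surging rolling update on this provider
// aborts with this error; only maxSurge of 0 works here.
func (c *unsupportedCloud) DetachInstance(i *cloudinstances.CloudInstanceGroupMember) error {
	return fmt.Errorf("this cloud provider does not support surging")
}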

View File

@ -180,6 +180,10 @@ func (c *aliCloudImplementation) DeleteInstance(i *cloudinstances.CloudInstanceG
return nil
}
func (c *aliCloudImplementation) DetachInstance(i *cloudinstances.CloudInstanceGroupMember) error {
return errors.New("aliCloud cloud provider does not support surging")
}
func (c *aliCloudImplementation) FindVPCInfo(id string) (*fi.VPCInfo, error) {
request := &ecs.DescribeVpcsArgs{
RegionId: common.Region(c.Region()),

View File

@ -85,6 +85,8 @@ const TagNameKopsRole = "kubernetes.io/kops/role"
// TagNameClusterOwnershipPrefix is the AWS tag used for ownership
const TagNameClusterOwnershipPrefix = "kubernetes.io/cluster/"
const tagNameDetachedInstance = "kops.k8s.io/detached-from-asg"
const (
WellKnownAccountAmazonLinux2 = "137112412989"
WellKnownAccountCentOS = "679593333241"
@ -358,6 +360,23 @@ func deleteGroup(c AWSCloud, g *cloudinstances.CloudInstanceGroup) error {
launchTemplate = aws.StringValue(asg.LaunchTemplate.LaunchTemplateName)
}
// Delete detached instances
{
detached, err := findDetachedInstances(c, asg)
if err != nil {
return fmt.Errorf("error searching for detached instances for autoscaling group %q: %v", name, err)
}
if len(detached) > 0 {
klog.V(2).Infof("Deleting detached instances for autoscaling group %q", name)
req := &ec2.TerminateInstancesInput{
InstanceIds: detached,
}
if _, err := c.EC2().TerminateInstances(req); err != nil {
return fmt.Errorf("error deleting detached instances for autoscaling group %q: %v", name, err)
}
}
}
// Delete ASG
{
klog.V(2).Infof("Deleting autoscaling group %q", name)
@ -418,12 +437,11 @@ func deleteInstance(c AWSCloud, i *cloudinstances.CloudInstanceGroupMember) erro
return fmt.Errorf("id was not set on CloudInstanceGroupMember: %v", i)
}
request := &autoscaling.TerminateInstanceInAutoScalingGroupInput{
InstanceId: aws.String(id),
ShouldDecrementDesiredCapacity: aws.Bool(false),
request := &ec2.TerminateInstancesInput{
InstanceIds: []*string{aws.String(id)},
}
if _, err := c.Autoscaling().TerminateInstanceInAutoScalingGroup(request); err != nil {
if _, err := c.EC2().TerminateInstances(request); err != nil {
return fmt.Errorf("error deleting instance %q: %v", id, err)
}
@ -432,7 +450,42 @@ func deleteInstance(c AWSCloud, i *cloudinstances.CloudInstanceGroupMember) erro
return nil
}
// TODO not used yet, as this requires a major refactor of rolling-update code, slowly but surely
// DetachInstance causes an aws instance to no longer be counted against the ASG's size limits.
func (c *awsCloudImplementation) DetachInstance(i *cloudinstances.CloudInstanceGroupMember) error {
if c.spotinst != nil {
return spotinst.DetachInstance(c.spotinst, i)
}
return detachInstance(c, i)
}
func detachInstance(c AWSCloud, i *cloudinstances.CloudInstanceGroupMember) error {
id := i.ID
if id == "" {
return fmt.Errorf("id was not set on CloudInstanceGroupMember: %v", i)
}
asg := i.CloudInstanceGroup.Raw.(*autoscaling.Group)
if err := c.CreateTags(id, map[string]string{tagNameDetachedInstance: *asg.AutoScalingGroupName}); err != nil {
return fmt.Errorf("error tagging instance %q: %v", id, err)
}
// TODO this also deregisters the instance from any ELB attached to the ASG. Do we care?
input := &autoscaling.DetachInstancesInput{
AutoScalingGroupName: aws.String(i.CloudInstanceGroup.HumanName),
InstanceIds: []*string{aws.String(id)},
ShouldDecrementDesiredCapacity: aws.Bool(false),
}
if _, err := c.Autoscaling().DetachInstances(input); err != nil {
return fmt.Errorf("error detaching instance %q: %v", id, err)
}
klog.V(8).Infof("detached aws ec2 instance %q", id)
return nil
}
// GetCloudGroups returns a groups of instances that back a kops instance groups
func (c *awsCloudImplementation) GetCloudGroups(cluster *kops.Cluster, instancegroups []*kops.InstanceGroup, warnUnmatched bool, nodes []v1.Node) (map[string]*cloudinstances.CloudInstanceGroup, error) {
@ -467,7 +520,7 @@ func getCloudGroups(c AWSCloud, cluster *kops.Cluster, instancegroups []*kops.In
continue
}
groups[instancegroup.ObjectMeta.Name], err = awsBuildCloudInstanceGroup(c, instancegroup, asg, nodeMap)
groups[instancegroup.ObjectMeta.Name], err = awsBuildCloudInstanceGroup(c, cluster, instancegroup, asg, nodeMap)
if err != nil {
return nil, fmt.Errorf("error getting cloud instance group %q: %v", instancegroup.ObjectMeta.Name, err)
}
@ -648,12 +701,14 @@ func findInstanceLaunchConfiguration(i *autoscaling.Instance) string {
return ""
}
func awsBuildCloudInstanceGroup(c AWSCloud, ig *kops.InstanceGroup, g *autoscaling.Group, nodeMap map[string]*v1.Node) (*cloudinstances.CloudInstanceGroup, error) {
func awsBuildCloudInstanceGroup(c AWSCloud, cluster *kops.Cluster, ig *kops.InstanceGroup, g *autoscaling.Group, nodeMap map[string]*v1.Node) (*cloudinstances.CloudInstanceGroup, error) {
newConfigName, err := findAutoscalingGroupLaunchConfiguration(c, g)
if err != nil {
return nil, err
}
instanceSeen := map[string]bool{}
cg := &cloudinstances.CloudInstanceGroup{
HumanName: aws.StringValue(g.AutoScalingGroupName),
InstanceGroup: ig,
@ -668,6 +723,7 @@ func awsBuildCloudInstanceGroup(c AWSCloud, ig *kops.InstanceGroup, g *autoscali
klog.Warningf("ignoring instance with no instance id: %s in autoscaling group: %s", id, cg.HumanName)
continue
}
instanceSeen[id] = true
// @step: check if the instance is terminating
if aws.StringValue(i.LifecycleState) == autoscaling.LifecycleStateTerminating {
klog.Warningf("ignoring instance as it is terminating: %s in autoscaling group: %s", id, cg.HumanName)
@ -680,9 +736,44 @@ func awsBuildCloudInstanceGroup(c AWSCloud, ig *kops.InstanceGroup, g *autoscali
}
}
detached, err := findDetachedInstances(c, g)
if err != nil {
return nil, fmt.Errorf("error searching for detached instances: %v", err)
}
for _, id := range detached {
if id != nil && *id != "" && !instanceSeen[*id] {
if err := cg.NewDetachedCloudInstanceGroupMember(*id, nodeMap); err != nil {
return nil, fmt.Errorf("error creating cloud instance group member: %v", err)
}
instanceSeen[*id] = true
}
}
return cg, nil
}
func findDetachedInstances(c AWSCloud, g *autoscaling.Group) ([]*string, error) {
req := &ec2.DescribeInstancesInput{
Filters: []*ec2.Filter{
NewEC2Filter("tag:"+tagNameDetachedInstance, aws.StringValue(g.AutoScalingGroupName)),
NewEC2Filter("instance-state-name", "pending", "running", "stopping", "stopped"),
},
}
result, err := c.EC2().DescribeInstances(req)
if err != nil {
return nil, err
}
var detached []*string
for _, r := range result.Reservations {
for _, i := range r.Instances {
detached = append(detached, i.InstanceId)
}
}
return detached, nil
}
func (c *awsCloudImplementation) Tags() map[string]string {
// Defensive copy
tags := make(map[string]string)

View File

@ -90,6 +90,10 @@ func (c *MockAWSCloud) DeleteInstance(i *cloudinstances.CloudInstanceGroupMember
return deleteInstance(c, i)
}
func (c *MockAWSCloud) DetachInstance(i *cloudinstances.CloudInstanceGroupMember) error {
return detachInstance(c, i)
}
func (c *MockAWSCloud) GetCloudGroups(cluster *kops.Cluster, instancegroups []*kops.InstanceGroup, warnUnmatched bool, nodes []v1.Node) (map[string]*cloudinstances.CloudInstanceGroup, error) {
return getCloudGroups(c, cluster, instancegroups, warnUnmatched, nodes)
}

View File

@ -68,6 +68,13 @@ func (c *Cloud) DeleteGroup(g *cloudinstances.CloudInstanceGroup) error {
return fmt.Errorf("baremetal cloud provider does not support deleting cloud groups at this time")
}
// DetachInstance is not implemented yet. It needs to cause a cloud instance to no longer be counted against the group's size limits.
// Baremetal may not support this.
func (c *Cloud) DetachInstance(i *cloudinstances.CloudInstanceGroupMember) error {
klog.V(8).Infof("baremetal cloud provider DetachInstance not implemented")
return fmt.Errorf("baremetal cloud provider does not support surging")
}
// DeleteInstance is not implemented yet; this func needs to delete a baremetal instance.
// Baremetal may not support this.
func (c *Cloud) DeleteInstance(instance *cloudinstances.CloudInstanceGroupMember) error {

View File

@ -61,6 +61,18 @@ func (c *mockGCECloud) DeleteInstance(i *cloudinstances.CloudInstanceGroupMember
return recreateCloudInstanceGroupMember(c, i)
}
// DetachInstance is not implemented yet. It needs to cause a cloud instance to no longer be counted against the group's size limits.
func (c *gceCloudImplementation) DetachInstance(i *cloudinstances.CloudInstanceGroupMember) error {
klog.V(8).Info("gce cloud provider DetachInstance not implemented yet")
return fmt.Errorf("gce cloud provider does not support surging")
}
// DetachInstance is not implemented yet. It needs to cause a cloud instance to no longer be counted against the group's size limits.
func (c *mockGCECloud) DetachInstance(i *cloudinstances.CloudInstanceGroupMember) error {
klog.V(8).Info("gce cloud provider DetachInstance not implemented yet")
return fmt.Errorf("gce cloud provider does not support surging")
}
// recreateCloudInstanceGroupMember recreates the specified instances, managed by an InstanceGroupManager
func recreateCloudInstanceGroupMember(c GCECloud, i *cloudinstances.CloudInstanceGroupMember) error {
mig := i.CloudInstanceGroup.Raw.(*compute.InstanceGroupManager)

View File

@ -110,6 +110,12 @@ func (c *openstackCloud) DeleteInstanceWithID(instanceID string) error {
return servers.Delete(c.novaClient, instanceID).ExtractErr()
}
// DetachInstance is not implemented yet. It needs to cause a cloud instance to no longer be counted against the group's size limits.
func (c *openstackCloud) DetachInstance(i *cloudinstances.CloudInstanceGroupMember) error {
klog.V(8).Info("openstack cloud provider DetachInstance not implemented yet")
return fmt.Errorf("openstack cloud provider does not support surging")
}
func (c *openstackCloud) GetInstance(id string) (*servers.Server, error) {
var server *servers.Server

View File

@ -131,6 +131,12 @@ func (c *VSphereCloud) DeleteInstance(i *cloudinstances.CloudInstanceGroupMember
return fmt.Errorf("vSphere cloud provider does not support deleting cloud instances at this time.")
}
// DetachInstance is not implemented yet. It needs to cause a cloud instance to no longer be counted against the group's size limits.
func (c *VSphereCloud) DetachInstance(i *cloudinstances.CloudInstanceGroupMember) error {
klog.V(8).Info("vSphere cloud provider DetachInstance not implemented yet")
return fmt.Errorf("vSphere cloud provider does not support surging")
}
// DNS returns dnsprovider interface for this vSphere cloud.
func (c *VSphereCloud) DNS() (dnsprovider.Interface, error) {
var provider dnsprovider.Interface