mirror of https://github.com/kubernetes/kops.git
aws: Graceful handling of EC2 detach errors
Sometimes, we observe the following error during a rolling update:

    error detaching instance "i-XXXX", node "ip-10-X-X-X.ec2.internal": error detaching instance "i-XXXX": ValidationError: The instance i-XXXX is not part of Auto Scaling group XXXXX

The sequence of events that leads to this problem is the following:

- A new ASG object is built from the launch template.
- Existing instances are added to it.
- An existing instance is ignored because it is already terminating:

    W0205 08:01:32.593377 191 aws_cloud.go:791] ignoring instance as it is terminating: i-XXXX in autoscaling group: XXXX

- Due to maxSurge, the rolling update then tries to detach the terminating instance from the autoscaling group, which fails.

As such, in case of EC2 ASG detach failures we can simply try to detach the next node instead of aborting the whole update operation.
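For context on where the ValidationError originates: detaching an instance ultimately goes through the EC2 Auto Scaling DetachInstances API. The following is a minimal sketch of such a call with aws-sdk-go; it is simplified for illustration and is not the actual kops wiring, which goes through its own cloud abstraction:

package main

import (
	"fmt"

	"github.com/aws/aws-sdk-go/aws"
	"github.com/aws/aws-sdk-go/aws/session"
	"github.com/aws/aws-sdk-go/service/autoscaling"
)

func main() {
	sess := session.Must(session.NewSession())
	svc := autoscaling.New(sess)

	// DetachInstances returns a ValidationError if the instance has
	// already left the group, e.g. because it is terminating.
	_, err := svc.DetachInstances(&autoscaling.DetachInstancesInput{
		AutoScalingGroupName: aws.String("XXXXX"),
		InstanceIds:          []*string{aws.String("i-XXXX")},
		// Keeping desired capacity unchanged causes the ASG to launch
		// a replacement instance, which is the point of surging.
		ShouldDecrementDesiredCapacity: aws.Bool(false),
	})
	if err != nil {
		fmt.Printf("error detaching instance %q: %v\n", "i-XXXX", err)
	}
}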
This commit is contained in:
parent
89fdd16c4d
commit
0a49650c70
@@ -134,11 +134,15 @@ func (c *RollingUpdateCluster) rollingUpdateInstanceGroup(group *cloudinstances.
 	update = prioritizeUpdate(update)
 
 	if maxSurge > 0 && !c.CloudOnly {
+		skippedNodes := 0
 		for numSurge := 1; numSurge <= maxSurge; numSurge++ {
-			u := update[len(update)-numSurge]
+			u := update[len(update)-numSurge-skippedNodes]
 			if u.Status != cloudinstances.CloudInstanceStatusDetached {
 				if err := c.detachInstance(u); err != nil {
-					return err
+					// If detaching a node fails, we simply proceed to the next one instead of
+					// bubbling up the error.
+					skippedNodes++
+					numSurge--
 				}
 
 				// If noneReady, wait until after one node is detached and its replacement validates
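To see the loop mechanics in isolation, here is a self-contained sketch. The instance type and detach function are stand-ins invented for illustration (not kops APIs), and the bounds guard is added here for safety; the surrounding kops code constrains maxSurge separately:

package main

import (
	"errors"
	"fmt"
)

// instance is a stand-in for kops's cloudinstances.CloudInstance,
// reduced to what this sketch needs.
type instance struct {
	id          string
	terminating bool
}

// detach stands in for c.detachInstance; it fails for instances that
// are already terminating, mirroring the ASG ValidationError.
func detach(u instance) error {
	if u.terminating {
		return errors.New("ValidationError: instance is not part of the Auto Scaling group")
	}
	fmt.Println("detached", u.id)
	return nil
}

func main() {
	update := []instance{
		{id: "i-aaa"},
		{id: "i-bbb", terminating: true},
		{id: "i-ccc"},
	}
	maxSurge := 2

	// Same shape as the patched loop: on a detach failure, record the
	// skip and decrement numSurge so the next candidate from the end of
	// the list is tried in place of the failed one.
	skippedNodes := 0
	for numSurge := 1; numSurge <= maxSurge; numSurge++ {
		idx := len(update) - numSurge - skippedNodes
		if idx < 0 {
			break // bounds guard for this sketch: no candidates left
		}
		u := update[idx]
		if err := detach(u); err != nil {
			fmt.Println("skipping node after detach failure:", err)
			skippedNodes++
			numSurge--
		}
	}
}

Run against this data, the loop detaches i-ccc, skips the terminating i-bbb, and still reaches the full surge of two by detaching i-aaa instead of aborting.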