rolling-update: don't deregister our only apiserver

If we do, we can't drain the node afterwards.  We are also going to
have dropped connections in this case anyway.
justinsb 2022-01-30 13:18:09 -05:00
parent 280a4a94ad
commit 4b2f773748
3 changed files with 50 additions and 3 deletions
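For orientation before the diffs: the heart of the change is a check in RunRollingUpdateCluster that sums the minimum sizes of every instance group with the APIServer or Master role and, when that total is at most one, disables load-balancer deregistration of control-plane nodes before draining. A standalone sketch of that decision, using simplified stand-in types rather than the real kopsapi structures, could look like this:

package main

import "fmt"

// InstanceGroupRole is a simplified stand-in for kopsapi.InstanceGroupRole.
type InstanceGroupRole string

const (
	RoleAPIServer InstanceGroupRole = "APIServer"
	RoleMaster    InstanceGroupRole = "Master"
	RoleNode      InstanceGroupRole = "Node"
)

// InstanceGroup is a simplified stand-in for kopsapi.InstanceGroup.
type InstanceGroup struct {
	Role    InstanceGroupRole
	MinSize *int32
}

// canDeregisterControlPlane reports whether control-plane instances may be
// deregistered from load balancers before draining: only when the cluster
// has more than one apiserver/control-plane instance in total.
func canDeregisterControlPlane(groups []InstanceGroup) bool {
	countByRole := make(map[InstanceGroupRole]int32)
	for _, ig := range groups {
		minSize := int32(1) // a group without an explicit MinSize still counts as one instance
		if ig.MinSize != nil {
			minSize = *ig.MinSize
		}
		countByRole[ig.Role] += minSize
	}
	return countByRole[RoleAPIServer]+countByRole[RoleMaster] > 1
}

func main() {
	one := int32(1)
	single := []InstanceGroup{{Role: RoleMaster, MinSize: &one}, {Role: RoleNode}}
	fmt.Println(canDeregisterControlPlane(single)) // false: a lone apiserver stays registered
}

The threshold mirrors the first file below: with only one apiserver, deregistering it would cut off the very API connection the subsequent drain depends on.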


@@ -140,6 +140,9 @@ type RollingUpdateOptions struct {
 	// InstanceGroupRoles is the list of roles we should rolling-update
 	// if not specified, all instance groups will be updated
 	InstanceGroupRoles []string
+
+	// TODO: Move more/all above options to RollingUpdateOptions
+	instancegroups.RollingUpdateOptions
 }
 
 func (o *RollingUpdateOptions) InitDefaults() {
@@ -159,6 +162,8 @@ func (o *RollingUpdateOptions) InitDefaults() {
 	o.ValidateCount = 2
 	o.DrainTimeout = 15 * time.Minute
+
+	o.RollingUpdateOptions.InitDefaults()
 }
 
 func NewCmdRollingUpdateCluster(f *util.Factory, out io.Writer) *cobra.Command {
@@ -262,9 +267,21 @@ func RunRollingUpdateCluster(ctx context.Context, f *util.Factory, out io.Writer
 		return err
 	}
 
+	countByRole := make(map[kopsapi.InstanceGroupRole]int32)
 	var instanceGroups []*kopsapi.InstanceGroup
 	for i := range list.Items {
-		instanceGroups = append(instanceGroups, &list.Items[i])
+		instanceGroup := &list.Items[i]
+		instanceGroups = append(instanceGroups, instanceGroup)
+		minSize := int32(1)
+		if instanceGroup.Spec.MinSize != nil {
+			minSize = *instanceGroup.Spec.MinSize
+		}
+		countByRole[instanceGroup.Spec.Role] = countByRole[instanceGroup.Spec.Role] + minSize
+	}
+
+	if countByRole[kopsapi.InstanceGroupRoleAPIServer]+countByRole[kopsapi.InstanceGroupRoleMaster] <= 1 {
+		fmt.Fprintf(out, "Detected single-control-plane cluster; won't detach before draining\n")
+		options.DeregisterControlPlaneNodes = false
 	}
 
 	warnUnmatched := true
@@ -346,6 +363,9 @@ func RunRollingUpdateCluster(ctx context.Context, f *util.Factory, out io.Writer
 		// TODO should we expose this to the UI?
 		ValidateTickDuration:    30 * time.Second,
 		ValidateSuccessDuration: 10 * time.Second,
+
+		// TODO: Move more of the passthrough options here, instead of duplicating them.
+		Options: options.RollingUpdateOptions,
 	}
 
 	err = d.AdjustNeedUpdate(groups)


@@ -647,8 +647,22 @@ func (c *RollingUpdateCluster) drainNode(u *cloudinstances.CloudInstance) error
 		return fmt.Errorf("error excluding node from load balancer: %v", err)
 	}
 
-	if err := c.Cloud.DeregisterInstance(u); err != nil {
-		return fmt.Errorf("error deregistering instance %q, node %q: %v", u.ID, u.Node.Name, err)
+	shouldDeregister := true
+	if !c.Options.DeregisterControlPlaneNodes {
+		if u.CloudInstanceGroup != nil && u.CloudInstanceGroup.InstanceGroup != nil {
+			role := u.CloudInstanceGroup.InstanceGroup.Spec.Role
+			switch role {
+			case api.InstanceGroupRoleAPIServer, api.InstanceGroupRoleMaster:
+				klog.Infof("skipping deregistration of instance %q, as part of instancegroup with role %q", u.ID, role)
+				shouldDeregister = false
+			}
+		}
+	}
+
+	if shouldDeregister {
+		if err := c.Cloud.DeregisterInstance(u); err != nil {
+			return fmt.Errorf("error deregistering instance %q, node %q: %w", u.ID, u.Node.Name, err)
+		}
 	}
 
 	if err := drain.RunNodeDrain(helper, u.Node.Name); err != nil {
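The guard above only skips DeregisterInstance for instances whose instance group carries the APIServer or Master role, and only when DeregisterControlPlaneNodes has been switched off; regular nodes are still detached from the load balancer before draining. A minimal self-contained sketch of that predicate, again with simplified stand-in types rather than the real cloudinstances types:

package main

import "fmt"

// Role is a simplified stand-in for kops' api.InstanceGroupRole.
type Role string

const (
	RoleAPIServer Role = "APIServer"
	RoleMaster    Role = "Master"
	RoleNode      Role = "Node"
)

// shouldDeregister mirrors the guard added to drainNode: deregister unless the
// instance belongs to an apiserver/control-plane group and the
// DeregisterControlPlaneNodes option has been disabled.
func shouldDeregister(role Role, deregisterControlPlaneNodes bool) bool {
	if deregisterControlPlaneNodes {
		return true
	}
	switch role {
	case RoleAPIServer, RoleMaster:
		return false
	}
	return true
}

func main() {
	fmt.Println(shouldDeregister(RoleMaster, false)) // false: keep the only apiserver registered
	fmt.Println(shouldDeregister(RoleNode, false))   // true: ordinary nodes are still deregistered
}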


@@ -80,6 +80,19 @@ type RollingUpdateCluster struct {
 	// DrainTimeout is the maximum amount of time to wait while draining a node.
 	DrainTimeout time.Duration
+
+	// Options holds user-specified options
+	Options RollingUpdateOptions
+}
+
+type RollingUpdateOptions struct {
+	// DeregisterControlPlaneNodes controls whether we deregister control plane instances from load balancers etc. before draining/terminating.
+	// When a cluster only has a single apiserver, we don't want to do this, as we can't drain after deregistering it.
+	DeregisterControlPlaneNodes bool
+}
+
+func (o *RollingUpdateOptions) InitDefaults() {
+	o.DeregisterControlPlaneNodes = true
 }
 
 // AdjustNeedUpdate adjusts the set of instances that need updating, using factors outside those known by the cloud implementation