diff --git a/cmd/kops/rolling-update_cluster.go b/cmd/kops/rolling-update_cluster.go index d53b0476da..ee1e0832f2 100644 --- a/cmd/kops/rolling-update_cluster.go +++ b/cmd/kops/rolling-update_cluster.go @@ -140,6 +140,9 @@ type RollingUpdateOptions struct { // InstanceGroupRoles is the list of roles we should rolling-update // if not specified, all instance groups will be updated InstanceGroupRoles []string + + // TODO: Move more/all above options to RollingUpdateOptions + instancegroups.RollingUpdateOptions } func (o *RollingUpdateOptions) InitDefaults() { @@ -159,6 +162,8 @@ func (o *RollingUpdateOptions) InitDefaults() { o.ValidateCount = 2 o.DrainTimeout = 15 * time.Minute + + o.RollingUpdateOptions.InitDefaults() } func NewCmdRollingUpdateCluster(f *util.Factory, out io.Writer) *cobra.Command { @@ -262,9 +267,21 @@ func RunRollingUpdateCluster(ctx context.Context, f *util.Factory, out io.Writer return err } + countByRole := make(map[kopsapi.InstanceGroupRole]int32) var instanceGroups []*kopsapi.InstanceGroup for i := range list.Items { - instanceGroups = append(instanceGroups, &list.Items[i]) + instanceGroup := &list.Items[i] + instanceGroups = append(instanceGroups, instanceGroup) + + minSize := int32(1) + if instanceGroup.Spec.MinSize != nil { + minSize = *instanceGroup.Spec.MinSize + } + countByRole[instanceGroup.Spec.Role] = countByRole[instanceGroup.Spec.Role] + minSize + } + if countByRole[kopsapi.InstanceGroupRoleAPIServer]+countByRole[kopsapi.InstanceGroupRoleMaster] <= 1 { + fmt.Fprintf(out, "Detected single-control-plane cluster; won't detach before draining\n") + options.DeregisterControlPlaneNodes = false } warnUnmatched := true @@ -346,6 +363,9 @@ func RunRollingUpdateCluster(ctx context.Context, f *util.Factory, out io.Writer // TODO should we expose this to the UI? ValidateTickDuration: 30 * time.Second, ValidateSuccessDuration: 10 * time.Second, + + // TODO: Move more of the passthrough options here, instead of duplicating them. 
+ Options: options.RollingUpdateOptions, } err = d.AdjustNeedUpdate(groups) diff --git a/pkg/instancegroups/instancegroups.go b/pkg/instancegroups/instancegroups.go index 5ec3c95666..9de86c20f7 100644 --- a/pkg/instancegroups/instancegroups.go +++ b/pkg/instancegroups/instancegroups.go @@ -647,8 +647,22 @@ func (c *RollingUpdateCluster) drainNode(u *cloudinstances.CloudInstance) error return fmt.Errorf("error excluding node from load balancer: %v", err) } - if err := c.Cloud.DeregisterInstance(u); err != nil { - return fmt.Errorf("error deregistering instance %q, node %q: %v", u.ID, u.Node.Name, err) + shouldDeregister := true + if !c.Options.DeregisterControlPlaneNodes { + if u.CloudInstanceGroup != nil && u.CloudInstanceGroup.InstanceGroup != nil { + role := u.CloudInstanceGroup.InstanceGroup.Spec.Role + switch role { + case api.InstanceGroupRoleAPIServer, api.InstanceGroupRoleMaster: + klog.Infof("skipping deregistration of instance %q, as part of instancegroup with role %q", u.ID, role) + shouldDeregister = false + } + } + } + + if shouldDeregister { + if err := c.Cloud.DeregisterInstance(u); err != nil { + return fmt.Errorf("error deregistering instance %q, node %q: %w", u.ID, u.Node.Name, err) + } } if err := drain.RunNodeDrain(helper, u.Node.Name); err != nil { diff --git a/pkg/instancegroups/rollingupdate.go b/pkg/instancegroups/rollingupdate.go index 079bbf4a10..ed522d32b2 100644 --- a/pkg/instancegroups/rollingupdate.go +++ b/pkg/instancegroups/rollingupdate.go @@ -80,6 +80,19 @@ type RollingUpdateCluster struct { // DrainTimeout is the maximum amount of time to wait while draining a node. DrainTimeout time.Duration + + // Options holds user-specified options + Options RollingUpdateOptions +} + +type RollingUpdateOptions struct { + // DeregisterControlPlaneNodes controls if we deregister control plane instances from load balancers etc before draining/terminating. 
+ // When a cluster only has a single apiserver, we don't want to do this, as we can't drain after deregistering it. + DeregisterControlPlaneNodes bool +} + +func (o *RollingUpdateOptions) InitDefaults() { + o.DeregisterControlPlaneNodes = true } // AdjustNeedUpdate adjusts the set of instances that need updating, using factors outside those known by the cloud implementation