mirror of https://github.com/kubernetes/kops.git
More PR review changes, improving drain, and fixing a couple bugs
parent a848f577a2
commit da763ea1e5
@@ -107,8 +107,8 @@ and validates the cluster.`,
 	cmd.Flags().DurationVar(&options.BastionInterval, "bastion-interval", options.BastionInterval, "Time to wait between restarting bastions")
 	cmd.Flags().StringSliceVar(&options.InstanceGroups, "instance-group", options.InstanceGroups, "List of instance groups to update (defaults to all if not specified)")
 
-	cmd.Flags().BoolVar(&options.FailOnDrainError, "fail-on-drain-error", false, "The rolling-update will fail if draining a node fails. Enable with KOPS_FEATURE_FLAGS='+DrainAndValidateRollingUpdate'")
-	cmd.Flags().BoolVar(&options.FailOnValidate, "fail-on-validate", true, "The rolling-update will fail if the cluster fails to validate. Enable with KOPS_FEATURE_FLAGS='+DrainAndValidateRollingUpdate'")
+	cmd.Flags().BoolVar(&options.FailOnDrainError, "fail-on-drain-error", true, "The rolling-update will fail if draining a node fails. Enable with KOPS_FEATURE_FLAGS='+DrainAndValidateRollingUpdate'")
+	cmd.Flags().BoolVar(&options.FailOnValidate, "fail-on-validate-error", true, "The rolling-update will fail if the cluster fails to validate. Enable with KOPS_FEATURE_FLAGS='+DrainAndValidateRollingUpdate'")
 	cmd.Flags().IntVar(&options.ValidateRetries, "validate-retries", 8, "The number of times that a node will be validated. Between validation kops sleeps the master-interval/2 or node-interval/2 duration. Enable with KOPS_FEATURE_FLAGS='+DrainAndValidateRollingUpdate'")
 
 	cmd.Run = func(cmd *cobra.Command, args []string) {

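Taken together with the feature-flag change below, the drain-and-validate behaviour stays opt-in, so an illustrative invocation of the updated command would look roughly like this (only the flag names and the KOPS_FEATURE_FLAGS value are taken from this diff; the rest of the command line is an assumption):

	KOPS_FEATURE_FLAGS='+DrainAndValidateRollingUpdate' kops rolling-update cluster --fail-on-drain-error=true --fail-on-validate-error=true --validate-retries=8
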
@@ -39,7 +39,7 @@ func Bool(b bool) *bool {
 var DNSPreCreate = New("DNSPreCreate", Bool(true))
 
 // DrainAndValidateRollingUpdate if set will use new rolling update code that will drain and validate.
-var DrainAndValidateRollingUpdate = New("DrainAndValidateRollingUpdate", Bool(true))
+var DrainAndValidateRollingUpdate = New("DrainAndValidateRollingUpdate", Bool(false))
 
 // VPCSkipEnableDNSSupport if set will make that a VPC does not need DNSSupport enabled.
 var VPCSkipEnableDNSSupport = New("VPCSkipEnableDNSSupport", Bool(false))

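Flipping the default to Bool(false) makes the new code path opt-in: users must set KOPS_FEATURE_FLAGS='+DrainAndValidateRollingUpdate', as the flag help text above says. A minimal sketch of how the flag is consumed (the branch bodies are placeholders; the real call sites appear in the rolling-update hunks below):

	if featureflag.DrainAndValidateRollingUpdate.Enabled() {
		// drain the node and validate the cluster before replacing the next instance
	} else {
		// fall back to the existing rolling-update behaviour
	}
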
@@ -19,6 +19,10 @@ package kutil
 // Based off of drain in kubectl
 // https://github.com/kubernetes/kubernetes/blob/master/pkg/kubectl/cmd/drain.go
 
+// TODO: Figure out how and if we are going to handle petsets.
+// TODO: Test more and determine if we mark to ignore DS, but they still are removed if they have local storage.
+// TODO: Probably write a unit test for upstream for the above statement?
+
 import (
 	"errors"
 	"fmt"

@@ -151,7 +155,29 @@ func (o *DrainOptions) DrainTheNode(nodeName string) (err error) {
 		return fmt.Errorf("drain failed %v, %s", err, nodeName)
 	}
 
-	return nil
+	defaultPodDrainDuration := 90 * time.Second // Default pod grace period + 30s
+
+	// Wait for the pods to get off of the node
+	time.Sleep(defaultPodDrainDuration)
+
+	// Wait two more times for the pods to leave and then bail
+	for i := 0; i <= 2; i++ {
+
+		pods, err := o.getPodsForDeletion()
+		if err != nil {
+			return fmt.Errorf("unable to retrieve the pod for a drain, so the drain failed %v, %s", err, nodeName)
+		}
+		if len(pods) != 0 {
+			glog.Infof("Node not drained, and waiting longer, so the drain failed: %v.", err)
+			time.Sleep(defaultPodDrainDuration)
+		} else {
+			glog.Infof("Node drained.")
+			return nil
+		}
+
+	}
+
+	return fmt.Errorf("the pods did not fully leave the node, so the drain failed %v, %s", err, nodeName)
 }
 
 // SetupDrain populates some fields from the factory, grabs command line

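For comparison, the fixed sleep-plus-two-retries above could also be written as a bounded poll. A minimal sketch, assuming the vendored Kubernetes wait helpers are available at k8s.io/apimachinery/pkg/util/wait; the method name waitForPodsGone is hypothetical, while DrainOptions, getPodsForDeletion and the 90-second budget come from the hunk above. This is an alternative shape, not what the commit does:

	import (
		"fmt"
		"time"

		"k8s.io/apimachinery/pkg/util/wait"
	)

	// waitForPodsGone polls until getPodsForDeletion reports no pods left on the
	// node: it checks immediately, then every 90 seconds, for up to 270 seconds,
	// mirroring the sleep-plus-two-retries budget in the hunk above.
	func (o *DrainOptions) waitForPodsGone(nodeName string) error {
		err := wait.PollImmediate(90*time.Second, 270*time.Second, func() (bool, error) {
			pods, err := o.getPodsForDeletion()
			if err != nil {
				// Returning a non-nil error aborts the poll immediately.
				return false, fmt.Errorf("unable to retrieve pods while draining %s: %v", nodeName, err)
			}
			return len(pods) == 0, nil
		})
		if err != nil {
			return fmt.Errorf("pods did not fully leave node %s: %v", nodeName, err)
		}
		return nil
	}
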
@@ -259,6 +285,8 @@ func (o *DrainOptions) getController(sr *api.SerializedReference) (interface{},
 		return o.client.Batch().Jobs(sr.Reference.Namespace).Get(sr.Reference.Name, metav1.GetOptions{})
 	case "ReplicaSet":
 		return o.client.Extensions().ReplicaSets(sr.Reference.Namespace).Get(sr.Reference.Name, metav1.GetOptions{})
+	// TODO: do we want to support PetSet upgrades
+	// FIXME: test
 	case "PetSet":
 		return "PetSet", nil
 	case "StatefulSet":

|||
|
|
@ -316,10 +316,17 @@ func (n *CloudInstanceGroup) RollingUpdate(rollingUpdateData *RollingUpdateClust
|
|||
glog.V(3).Info("Not validating the cluster as instance is a bastion.")
|
||||
} else if rollingUpdateData.CloudOnly {
|
||||
glog.V(3).Info("Not validating cluster as validation is turned off via the cloud-only flag.")
|
||||
} else if rollingUpdateData.FailOnValidate && featureflag.DrainAndValidateRollingUpdate.Enabled() {
|
||||
} else if featureflag.DrainAndValidateRollingUpdate.Enabled() {
|
||||
if err = n.ValidateCluster(rollingUpdateData, instanceGroupList); err != nil {
|
||||
glog.Errorf("Error validating cluster: %s.", err)
|
||||
return err
|
||||
|
||||
glog.Warningf("Error validating cluster %q: %v.", rollingUpdateData.ClusterName, err)
|
||||
|
||||
if rollingUpdateData.FailOnValidate {
|
||||
glog.Errorf("Error validating cluster: %v.", err)
|
||||
return err
|
||||
}
|
||||
|
||||
glog.Warningf("Cluster validation, proceeding since fail-on-validate is set to false")
|
||||
}
|
||||
}
|
||||
|
||||
|
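In effect, when the feature flag is enabled validation now always runs, and FailOnValidate only controls whether a validation error aborts the rolling update or is merely logged; previously FailOnValidate gated whether validation ran at all.
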
|
@ -344,9 +351,11 @@ func (n *CloudInstanceGroup) RollingUpdate(rollingUpdateData *RollingUpdateClust
|
|||
|
||||
} else if featureflag.DrainAndValidateRollingUpdate.Enabled() {
|
||||
|
||||
glog.Infof("Draining the node: %s.", u.Node.Name)
|
||||
glog.Infof("Draining the node: %q.", u.Node.Name)
|
||||
|
||||
if err = n.DrainNode(u, rollingUpdateData); err == nil {
|
||||
// FIXME: This seems to be happening a bit quickly.
|
||||
// FIXME: We may need to wait till all of the pods are drained
|
||||
if err = n.DrainNode(u, rollingUpdateData); err != nil {
|
||||
glog.Errorf("Error draining node %q, instance id %q: %v", u.Node.Name, instanceId, err)
|
||||
return err
|
||||
}
|
||||
|
|