More PR review changes, improving drain, and fixing a couple bugs

This commit is contained in:
chrislovecnm 2017-02-17 08:08:01 -07:00
parent a848f577a2
commit da763ea1e5
4 changed files with 46 additions and 9 deletions


@ -107,8 +107,8 @@ and validates the cluster.`,
cmd.Flags().DurationVar(&options.BastionInterval, "bastion-interval", options.BastionInterval, "Time to wait between restarting bastions")
cmd.Flags().StringSliceVar(&options.InstanceGroups, "instance-group", options.InstanceGroups, "List of instance groups to update (defaults to all if not specified)")
cmd.Flags().BoolVar(&options.FailOnDrainError, "fail-on-drain-error", false, "The rolling-update will fail if draining a node fails. Enable with KOPS_FEATURE_FLAGS='+DrainAndValidateRollingUpdate'")
cmd.Flags().BoolVar(&options.FailOnValidate, "fail-on-validate", true, "The rolling-update will fail if the cluster fails to validate. Enable with KOPS_FEATURE_FLAGS='+DrainAndValidateRollingUpdate'")
cmd.Flags().BoolVar(&options.FailOnDrainError, "fail-on-drain-error", true, "The rolling-update will fail if draining a node fails. Enable with KOPS_FEATURE_FLAGS='+DrainAndValidateRollingUpdate'")
cmd.Flags().BoolVar(&options.FailOnValidate, "fail-on-validate-error", true, "The rolling-update will fail if the cluster fails to validate. Enable with KOPS_FEATURE_FLAGS='+DrainAndValidateRollingUpdate'")
cmd.Flags().IntVar(&options.ValidateRetries, "validate-retries", 8, "The number of times that the cluster will be validated. Between validation attempts kops sleeps for master-interval/2 or node-interval/2. Enable with KOPS_FEATURE_FLAGS='+DrainAndValidateRollingUpdate'")
cmd.Run = func(cmd *cobra.Command, args []string) {
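The --validate-retries help text above describes a sleep-and-retry loop that lives elsewhere in the tree. A minimal, hypothetical sketch of the behaviour it implies follows; validateCluster, rollingUpdateOptions, and NodeInterval are illustrative names rather than the kops implementation, and only ValidateRetries and the interval flags are taken from the hunk above.

// Hypothetical sketch of the retry behaviour described by --validate-retries.
package main

import (
	"errors"
	"fmt"
	"time"
)

type rollingUpdateOptions struct {
	ValidateRetries int           // --validate-retries
	NodeInterval    time.Duration // --node-interval; masters would use master-interval
}

// validateCluster is a placeholder for the real cluster validation call.
func validateCluster() error { return errors.New("cluster not yet valid") }

// validateWithRetries re-validates up to ValidateRetries times, sleeping
// half of the relevant interval between attempts, as the help text describes.
func validateWithRetries(options rollingUpdateOptions) error {
	var lastErr error
	for i := 0; i < options.ValidateRetries; i++ {
		if lastErr = validateCluster(); lastErr == nil {
			return nil
		}
		time.Sleep(options.NodeInterval / 2)
	}
	return fmt.Errorf("cluster failed to validate after %d attempts: %v", options.ValidateRetries, lastErr)
}

func main() {
	fmt.Println(validateWithRetries(rollingUpdateOptions{ValidateRetries: 2, NodeInterval: 20 * time.Millisecond}))
}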


@ -39,7 +39,7 @@ func Bool(b bool) *bool {
var DNSPreCreate = New("DNSPreCreate", Bool(true))
// DrainAndValidateRollingUpdate if set will use new rolling update code that will drain and validate.
var DrainAndValidateRollingUpdate = New("DrainAndValidateRollingUpdate", Bool(true))
var DrainAndValidateRollingUpdate = New("DrainAndValidateRollingUpdate", Bool(false))
// VPCSkipEnableDNSSupport if set will make that a VPC does not need DNSSupport enabled.
var VPCSkipEnableDNSSupport = New("VPCSkipEnableDNSSupport", Bool(false))
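The default flipped above matters because the flag can also be toggled at runtime through the KOPS_FEATURE_FLAGS environment variable referenced in the command help. A self-contained sketch of that pattern, assuming the behaviour rather than quoting the real kops featureflag package, is:

// Illustrative stand-in for kops' featureflag package; the real New takes a
// *bool default (see Bool() above) and may differ in other details.
package main

import (
	"fmt"
	"os"
	"strings"
)

// FeatureFlag pairs a name with an on/off value that can be overridden
// from the environment.
type FeatureFlag struct {
	Key     string
	enabled bool
}

// Enabled reports whether the flag is currently on.
func (f *FeatureFlag) Enabled() bool { return f.enabled }

// New builds a flag with a compiled-in default, then applies any "+Name" or
// "-Name" token found in the comma-separated KOPS_FEATURE_FLAGS variable.
func New(key string, def bool) *FeatureFlag {
	f := &FeatureFlag{Key: key, enabled: def}
	for _, tok := range strings.Split(os.Getenv("KOPS_FEATURE_FLAGS"), ",") {
		switch strings.TrimSpace(tok) {
		case "+" + key:
			f.enabled = true
		case "-" + key:
			f.enabled = false
		}
	}
	return f
}

func main() {
	// With KOPS_FEATURE_FLAGS='+DrainAndValidateRollingUpdate' this prints true
	// even though the compiled-in default above is now false.
	drainAndValidate := New("DrainAndValidateRollingUpdate", false)
	fmt.Println(drainAndValidate.Enabled())
}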


@ -19,6 +19,10 @@ package kutil
// Based off of drain in kubectl
// https://github.com/kubernetes/kubernetes/blob/master/pkg/kubectl/cmd/drain.go
// TODO: Figure out how and if we are going to handle petsets.
// TODO: Test more and determine whether DaemonSets we mark to ignore are still removed when they have local storage.
// TODO: Probably write an upstream unit test covering the above.
import (
"errors"
"fmt"
@ -151,7 +155,29 @@ func (o *DrainOptions) DrainTheNode(nodeName string) (err error) {
return fmt.Errorf("drain failed %v, %s", err, nodeName)
}
return nil
defaultPodDrainDuration := 90 * time.Second // Default pod grace period + 30s
// Wait for the pods to get off of the node
time.Sleep(defaultPodDrainDuration)
// Check up to three more times whether the pods have left, then bail
for i := 0; i <= 2; i++ {
pods, err := o.getPodsForDeletion()
if err != nil {
return fmt.Errorf("unable to retrieve the pod for a drain, so the drain failed %v, %s", err, nodeName)
}
if len(pods) != 0 {
glog.Infof("Node not drained, and waiting longer, so the drain failed: %v.", err)
time.Sleep(defaultPodDrainDuration)
} else {
glog.Infof("Node drained.")
return nil
}
}
return fmt.Errorf("the pods did not fully leave the node, so the drain failed %v, %s", err, nodeName)
}
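The loop above relies on fixed sleeps. A deadline-based poll is a common alternative; a hedged sketch that reuses o.getPodsForDeletion and defaultPodDrainDuration from this function, purely illustrative and not part of the commit, would be:

// Poll for the node's pods to disappear until a deadline instead of a fixed
// number of sleeps (illustrative alternative only).
deadline := time.Now().Add(3 * defaultPodDrainDuration)
for time.Now().Before(deadline) {
	pods, err := o.getPodsForDeletion()
	if err != nil {
		return fmt.Errorf("unable to retrieve pods during drain of node %q: %v", nodeName, err)
	}
	if len(pods) == 0 {
		return nil
	}
	time.Sleep(10 * time.Second)
}
return fmt.Errorf("pods did not leave node %q before the drain deadline", nodeName)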
// SetupDrain populates some fields from the factory, grabs command line
@ -259,6 +285,8 @@ func (o *DrainOptions) getController(sr *api.SerializedReference) (interface{},
return o.client.Batch().Jobs(sr.Reference.Namespace).Get(sr.Reference.Name, metav1.GetOptions{})
case "ReplicaSet":
return o.client.Extensions().ReplicaSets(sr.Reference.Namespace).Get(sr.Reference.Name, metav1.GetOptions{})
// TODO: Do we want to support PetSet upgrades?
// FIXME: test
case "PetSet":
return "PetSet", nil
case "StatefulSet":


@ -316,10 +316,17 @@ func (n *CloudInstanceGroup) RollingUpdate(rollingUpdateData *RollingUpdateClust
glog.V(3).Info("Not validating the cluster as instance is a bastion.")
} else if rollingUpdateData.CloudOnly {
glog.V(3).Info("Not validating cluster as validation is turned off via the cloud-only flag.")
} else if rollingUpdateData.FailOnValidate && featureflag.DrainAndValidateRollingUpdate.Enabled() {
} else if featureflag.DrainAndValidateRollingUpdate.Enabled() {
if err = n.ValidateCluster(rollingUpdateData, instanceGroupList); err != nil {
glog.Errorf("Error validating cluster: %s.", err)
return err
glog.Warningf("Error validating cluster %q: %v.", rollingUpdateData.ClusterName, err)
if rollingUpdateData.FailOnValidate {
glog.Errorf("Error validating cluster: %v.", err)
return err
}
glog.Warningf("Cluster validation, proceeding since fail-on-validate is set to false")
}
}
@ -344,9 +351,11 @@ func (n *CloudInstanceGroup) RollingUpdate(rollingUpdateData *RollingUpdateClust
} else if featureflag.DrainAndValidateRollingUpdate.Enabled() {
glog.Infof("Draining the node: %s.", u.Node.Name)
glog.Infof("Draining the node: %q.", u.Node.Name)
if err = n.DrainNode(u, rollingUpdateData); err == nil {
// FIXME: This seems to be happening a bit quickly.
// FIXME: We may need to wait till all of the pods are drained
if err = n.DrainNode(u, rollingUpdateData); err != nil {
glog.Errorf("Error draining node %q, instance id %q: %v", u.Node.Name, instanceId, err)
return err
}
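Taken together, the per-node flow this commit lands on when DrainAndValidateRollingUpdate is enabled is roughly: drain the node, replace the instance, then validate the cluster, with fail-on-drain-error and fail-on-validate-error deciding whether an error aborts the rolling update. A simplified outline, using the same names as the hunks above but not the actual control flow of CloudInstanceGroup.RollingUpdate:

// Simplified outline only; bastion handling and other details omitted.
if featureflag.DrainAndValidateRollingUpdate.Enabled() {
	if err := n.DrainNode(u, rollingUpdateData); err != nil {
		if rollingUpdateData.FailOnDrainError {
			return fmt.Errorf("failed to drain node %q: %v", u.Node.Name, err)
		}
		glog.Warningf("Ignoring error draining node %q: %v", u.Node.Name, err)
	}
}
// ... terminate and replace the instance here ...
if featureflag.DrainAndValidateRollingUpdate.Enabled() && !rollingUpdateData.CloudOnly {
	if err := n.ValidateCluster(rollingUpdateData, instanceGroupList); err != nil {
		if rollingUpdateData.FailOnValidate {
			return err
		}
		glog.Warningf("Cluster failed to validate, proceeding: %v", err)
	}
}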