More PR review changes, improving drain, and fixing a couple bugs

This commit is contained in:
chrislovecnm 2017-02-17 08:08:01 -07:00
parent a848f577a2
commit da763ea1e5
4 changed files with 46 additions and 9 deletions


@ -107,8 +107,8 @@ and validates the cluster.`,
cmd.Flags().DurationVar(&options.BastionInterval, "bastion-interval", options.BastionInterval, "Time to wait between restarting bastions")
cmd.Flags().StringSliceVar(&options.InstanceGroups, "instance-group", options.InstanceGroups, "List of instance groups to update (defaults to all if not specified)")
cmd.Flags().BoolVar(&options.FailOnDrainError, "fail-on-drain-error", false, "The rolling-update will fail if draining a node fails. Enable with KOPS_FEATURE_FLAGS='+DrainAndValidateRollingUpdate'")
cmd.Flags().BoolVar(&options.FailOnValidate, "fail-on-validate", true, "The rolling-update will fail if the cluster fails to validate. Enable with KOPS_FEATURE_FLAGS='+DrainAndValidateRollingUpdate'")
cmd.Flags().BoolVar(&options.FailOnDrainError, "fail-on-drain-error", true, "The rolling-update will fail if draining a node fails. Enable with KOPS_FEATURE_FLAGS='+DrainAndValidateRollingUpdate'")
cmd.Flags().BoolVar(&options.FailOnValidate, "fail-on-validate-error", true, "The rolling-update will fail if the cluster fails to validate. Enable with KOPS_FEATURE_FLAGS='+DrainAndValidateRollingUpdate'")
cmd.Flags().IntVar(&options.ValidateRetries, "validate-retries", 8, "The number of times that the cluster will be validated. Between validation attempts kops sleeps for master-interval/2 or node-interval/2. Enable with KOPS_FEATURE_FLAGS='+DrainAndValidateRollingUpdate'")
cmd.Run = func(cmd *cobra.Command, args []string) {
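The --validate-retries help text above describes a sleep-and-retry loop that lives elsewhere in the tree. A minimal, hypothetical sketch of the behaviour it implies follows; validateCluster, rollingUpdateOptions, and NodeInterval are illustrative names rather than the kops implementation, and only ValidateRetries and the interval flags are taken from the hunk above.

// Hypothetical sketch of the retry behaviour described by --validate-retries.
package main

import (
	"errors"
	"fmt"
	"time"
)

type rollingUpdateOptions struct {
	ValidateRetries int           // --validate-retries
	NodeInterval    time.Duration // --node-interval; masters would use master-interval
}

// validateCluster is a placeholder for the real cluster validation call.
func validateCluster() error { return errors.New("cluster not yet valid") }

// validateWithRetries re-validates up to ValidateRetries times, sleeping
// half of the relevant interval between attempts, as the help text describes.
func validateWithRetries(options rollingUpdateOptions) error {
	var lastErr error
	for i := 0; i < options.ValidateRetries; i++ {
		if lastErr = validateCluster(); lastErr == nil {
			return nil
		}
		time.Sleep(options.NodeInterval / 2)
	}
	return fmt.Errorf("cluster failed to validate after %d attempts: %v", options.ValidateRetries, lastErr)
}

func main() {
	fmt.Println(validateWithRetries(rollingUpdateOptions{ValidateRetries: 2, NodeInterval: 20 * time.Millisecond}))
}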


@ -39,7 +39,7 @@ func Bool(b bool) *bool {
var DNSPreCreate = New("DNSPreCreate", Bool(true))
// DrainAndValidateRollingUpdate if set will use new rolling update code that will drain and validate.
var DrainAndValidateRollingUpdate = New("DrainAndValidateRollingUpdate", Bool(true))
var DrainAndValidateRollingUpdate = New("DrainAndValidateRollingUpdate", Bool(false))
// VPCSkipEnableDNSSupport if set will make that a VPC does not need DNSSupport enabled.
var VPCSkipEnableDNSSupport = New("VPCSkipEnableDNSSupport", Bool(false))
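The default flipped above matters because the flag can also be toggled at runtime through the KOPS_FEATURE_FLAGS environment variable referenced in the command help. A self-contained sketch of that pattern, assuming the behaviour rather than quoting the real kops featureflag package, is:

// Illustrative stand-in for kops' featureflag package; the real New takes a
// *bool default (see Bool() above) and may differ in other details.
package main

import (
	"fmt"
	"os"
	"strings"
)

// FeatureFlag pairs a name with an on/off value that can be overridden
// from the environment.
type FeatureFlag struct {
	Key     string
	enabled bool
}

// Enabled reports whether the flag is currently on.
func (f *FeatureFlag) Enabled() bool { return f.enabled }

// New builds a flag with a compiled-in default, then applies any "+Name" or
// "-Name" token found in the comma-separated KOPS_FEATURE_FLAGS variable.
func New(key string, def bool) *FeatureFlag {
	f := &FeatureFlag{Key: key, enabled: def}
	for _, tok := range strings.Split(os.Getenv("KOPS_FEATURE_FLAGS"), ",") {
		switch strings.TrimSpace(tok) {
		case "+" + key:
			f.enabled = true
		case "-" + key:
			f.enabled = false
		}
	}
	return f
}

func main() {
	// With KOPS_FEATURE_FLAGS='+DrainAndValidateRollingUpdate' this prints true
	// even though the compiled-in default above is now false.
	drainAndValidate := New("DrainAndValidateRollingUpdate", false)
	fmt.Println(drainAndValidate.Enabled())
}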


@ -19,6 +19,10 @@ package kutil
// Based off of drain in kubectl
// https://github.com/kubernetes/kubernetes/blob/master/pkg/kubectl/cmd/drain.go
// TODO: Figure out how and if we are going to handle petsets.
// TODO: Test more and determine whether DaemonSets we mark to ignore are still removed when they have local storage.
// TODO: Probably write an upstream unit test covering the above.
import (
"errors"
"fmt"
@ -151,7 +155,29 @@ func (o *DrainOptions) DrainTheNode(nodeName string) (err error) {
return fmt.Errorf("drain failed %v, %s", err, nodeName)
}
return nil
defaultPodDrainDuration := 90 * time.Second // Default pod grace period + 30s
// Wait for the pods to get off of the node
time.Sleep(defaultPodDrainDuration)
// Check up to three more times whether the pods have left, then bail
for i := 0; i <= 2; i++ {
pods, err := o.getPodsForDeletion()
if err != nil {
return fmt.Errorf("unable to retrieve the pod for a drain, so the drain failed %v, %s", err, nodeName)
}
if len(pods) != 0 {
glog.Infof("Node not drained, and waiting longer, so the drain failed: %v.", err)
time.Sleep(defaultPodDrainDuration)
} else {
glog.Infof("Node drained.")
return nil
}
}
return fmt.Errorf("the pods did not fully leave the node, so the drain failed %v, %s", err, nodeName)
}
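The loop above relies on fixed sleeps. A deadline-based poll is a common alternative; a hedged sketch that reuses o.getPodsForDeletion and defaultPodDrainDuration from this function, purely illustrative and not part of the commit, would be:

// Poll for the node's pods to disappear until a deadline instead of a fixed
// number of sleeps (illustrative alternative only).
deadline := time.Now().Add(3 * defaultPodDrainDuration)
for time.Now().Before(deadline) {
	pods, err := o.getPodsForDeletion()
	if err != nil {
		return fmt.Errorf("unable to retrieve pods during drain of node %q: %v", nodeName, err)
	}
	if len(pods) == 0 {
		return nil
	}
	time.Sleep(10 * time.Second)
}
return fmt.Errorf("pods did not leave node %q before the drain deadline", nodeName)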
// SetupDrain populates some fields from the factory, grabs command line
@ -259,6 +285,8 @@ func (o *DrainOptions) getController(sr *api.SerializedReference) (interface{},
return o.client.Batch().Jobs(sr.Reference.Namespace).Get(sr.Reference.Name, metav1.GetOptions{})
case "ReplicaSet":
return o.client.Extensions().ReplicaSets(sr.Reference.Namespace).Get(sr.Reference.Name, metav1.GetOptions{})
// TODO: Do we want to support PetSet upgrades?
// FIXME: test
case "PetSet":
return "PetSet", nil
case "StatefulSet":


@ -316,10 +316,17 @@ func (n *CloudInstanceGroup) RollingUpdate(rollingUpdateData *RollingUpdateClust
glog.V(3).Info("Not validating the cluster as instance is a bastion.")
} else if rollingUpdateData.CloudOnly {
glog.V(3).Info("Not validating cluster as validation is turned off via the cloud-only flag.")
} else if rollingUpdateData.FailOnValidate && featureflag.DrainAndValidateRollingUpdate.Enabled() {
} else if featureflag.DrainAndValidateRollingUpdate.Enabled() {
if err = n.ValidateCluster(rollingUpdateData, instanceGroupList); err != nil {
glog.Errorf("Error validating cluster: %s.", err)
return err
glog.Warningf("Error validating cluster %q: %v.", rollingUpdateData.ClusterName, err)
if rollingUpdateData.FailOnValidate {
glog.Errorf("Error validating cluster: %v.", err)
return err
}
glog.Warningf("Cluster validation, proceeding since fail-on-validate is set to false")
}
}
@ -344,9 +351,11 @@ func (n *CloudInstanceGroup) RollingUpdate(rollingUpdateData *RollingUpdateClust
} else if featureflag.DrainAndValidateRollingUpdate.Enabled() {
glog.Infof("Draining the node: %s.", u.Node.Name)
glog.Infof("Draining the node: %q.", u.Node.Name)
if err = n.DrainNode(u, rollingUpdateData); err == nil {
// FIXME: This seems to be happening a bit quickly.
// FIXME: We may need to wait till all of the pods are drained
if err = n.DrainNode(u, rollingUpdateData); err != nil {
glog.Errorf("Error draining node %q, instance id %q: %v", u.Node.Name, instanceId, err)
return err
}
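Taken together, the per-node flow this commit lands on when DrainAndValidateRollingUpdate is enabled is roughly: drain the node, replace the instance, then validate the cluster, with fail-on-drain-error and fail-on-validate-error deciding whether an error aborts the rolling update. A simplified outline, using the same names as the hunks above but not the actual control flow of CloudInstanceGroup.RollingUpdate:

// Simplified outline only; bastion handling and other details omitted.
if featureflag.DrainAndValidateRollingUpdate.Enabled() {
	if err := n.DrainNode(u, rollingUpdateData); err != nil {
		if rollingUpdateData.FailOnDrainError {
			return fmt.Errorf("failed to drain node %q: %v", u.Node.Name, err)
		}
		glog.Warningf("Ignoring error draining node %q: %v", u.Node.Name, err)
	}
}
// ... terminate and replace the instance here ...
if featureflag.DrainAndValidateRollingUpdate.Enabled() && !rollingUpdateData.CloudOnly {
	if err := n.ValidateCluster(rollingUpdateData, instanceGroupList); err != nil {
		if rollingUpdateData.FailOnValidate {
			return err
		}
		glog.Warningf("Cluster failed to validate, proceeding: %v", err)
	}
}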