reusing the node and master duration for validation periods

chrislovecnm 2017-09-23 18:07:43 -06:00
parent acb5e8b5a6
commit ec2f0dfdf3
3 changed files with 67 additions and 42 deletions


@@ -45,32 +45,50 @@ var (
rollingupdate_long = pretty.LongDesc(i18n.T(`
This command updates a kubernetes cluster to match the cloud, and kops specifications.
To perform rolling update, you need to update the cloud resources first with "kops update cluster"
To perform rolling update, you need to update the cloud resources first with the command
` + pretty.Bash("kops update cluster") + `.
Note: terraform users will need run the following commands all from the same directory "kops update cluster --target=terraform" then "terraform plan" then "terraform apply"
prior to running "kops rolling-update cluster"
If rolling-update does not report that the cluster needs to be rolled, you can force the cluster to be
rolled with the force flag. Rolling update drains and validates the cluster by default. A cluster is
deemed validated when all required nodes are running, and all pods in the kube-system namespace are operational.
When a node is deleted, rolling-update sleeps for the interval configured for that node type, and then tries
for the same period of time for the cluster to be validated. For instance, setting --master-interval=3m causes
rolling-update to wait for 3m after a master is rolled, and then up to another 3m for the cluster to stabilize
and pass validation.
Use ` + pretty.Bash("export KOPS_FEATURE_FLAGS=\"+DrainAndValidateRollingUpdate\"") + ` to enable beta code that drains the nodes
and validates the cluster. New flags for Drain and Validation operations will be shown when
the environment variable is set.`))
Note: terraform users will need to run the following commands all from the same directory
` + pretty.Bash("kops update cluster --target=terraform") + ` then
` + pretty.Bash("terraform plan") + ` then ` + pretty.Bash("terraform apply") +
` prior to running ` + pretty.Bash("kops rolling-update cluster") + `.`))
rollingupdate_example = templates.Examples(i18n.T(`
# Roll the currently selected kops cluster
# Preview a rolling-update
kops rolling-update cluster
# Roll the currently selected kops cluster with defaults.
# Nodes will be drained and the cluster will be validated between node replacements
kops rolling-update cluster --yes
# Roll the k8s-cluster.example.com kops cluster
# use the new drain and validate functionality
export KOPS_FEATURE_FLAGS="+DrainAndValidateRollingUpdate"
# do not fail if the cluster does not validate
# wait 8 min after a node is rolled, and up to another
# 8 min for the cluster to validate.
kops rolling-update cluster k8s-cluster.example.com --yes \
--fail-on-validate-error="false" \
--master-interval=8m \
--node-interval=8m
# Roll the k8s-cluster.example.com kops cluster
# do not validate the cluster because the --cloudonly flag is set.
# Force the entire cluster to roll, even if rolling update
# reports that the cluster does not need to be rolled.
kops rolling-update cluster k8s-cluster.example.com --yes \
--cloudonly \
--force
# Roll the k8s-cluster.example.com kops cluster
# only roll the node instancegroup
# use the new drain and validate functionality
export KOPS_FEATURE_FLAGS="+DrainAndValidateRollingUpdate"
kops rolling-update cluster k8s-cluster.example.com --yes \
--fail-on-validate-error="false" \
--node-interval 8m \
@@ -98,8 +116,6 @@ type RollingUpdateOptions struct {
DrainInterval time.Duration
ValidateRetries int
MasterInterval time.Duration
NodeInterval time.Duration
BastionInterval time.Duration
@@ -119,11 +135,9 @@ func (o *RollingUpdateOptions) InitDefaults() {
o.FailOnValidate = true
o.MasterInterval = 5 * time.Minute
o.NodeInterval = 2 * time.Minute
o.NodeInterval = 4 * time.Minute
o.BastionInterval = 5 * time.Minute
o.ValidateRetries = 8
o.DrainInterval = 90 * time.Second
}
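The NodeInterval default moves from 2m to 4m in this same hunk, which tracks with the new scheme: the interval now also caps validation, whereas the old scheme slept interval/2 between up to ValidateRetries+1 attempts (the `i <= rollingUpdateData.ValidateRetries` loop visible in the second file below). A rough worst-case comparison of the two budgets, as a standalone Go sketch rather than kops code:

package main

import (
	"fmt"
	"time"
)

func main() {
	// Old scheme: up to ValidateRetries+1 failed attempts, sleeping
	// interval/2 after each one (the loop runs i = 0..ValidateRetries).
	oldInterval := 2 * time.Minute
	validateRetries := 8
	oldWorst := time.Duration(validateRetries+1) * (oldInterval / 2)
	// New scheme: validation is bounded by the interval itself.
	newBudget := 4 * time.Minute
	fmt.Println(oldWorst, newBudget) // 9m0s 4m0s
}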
@@ -152,8 +166,6 @@ func NewCmdRollingUpdateCluster(f *util.Factory, out io.Writer) *cobra.Command {
if featureflag.DrainAndValidateRollingUpdate.Enabled() {
cmd.Flags().BoolVar(&options.FailOnDrainError, "fail-on-drain-error", true, "The rolling-update will fail if draining a node fails.")
cmd.Flags().BoolVar(&options.FailOnValidate, "fail-on-validate-error", true, "The rolling-update will fail if the cluster fails to validate.")
cmd.Flags().IntVar(&options.ValidateRetries, "validate-retries", options.ValidateRetries, "The number of times that a node will be validated. Between validation kops sleeps the master-interval/2 or node-interval/2 duration.")
cmd.Flags().DurationVar(&options.DrainInterval, "drain-interval", options.DrainInterval, "The duration that a rolling-update will wait after the node is drained.")
}
cmd.Run = func(cmd *cobra.Command, args []string) {
@@ -202,10 +214,6 @@ func RunRollingUpdateCluster(f *util.Factory, out io.Writer, options *RollingUpd
return fmt.Errorf("cannot load kubecfg settings for %q: %v", contextName, err)
}
if options.ValidateRetries <= 0 {
return fmt.Errorf("validate-retries flag cannot be 0 or smaller")
}
var nodes []v1.Node
var k8sClient kubernetes.Interface
if !options.CloudOnly {
@@ -339,7 +347,7 @@ func RunRollingUpdateCluster(f *util.Factory, out io.Writer, options *RollingUpd
}
if featureflag.DrainAndValidateRollingUpdate.Enabled() {
glog.V(2).Infof("New rolling update with drain and validate enabled.")
glog.V(2).Infof("Rolling update with drain and validate enabled.")
}
d := &instancegroups.RollingUpdateCluster{
MasterInterval: options.MasterInterval,
@@ -352,7 +360,6 @@ func RunRollingUpdateCluster(f *util.Factory, out io.Writer, options *RollingUpd
FailOnValidate: options.FailOnValidate,
CloudOnly: options.CloudOnly,
ClusterName: options.ClusterName,
ValidateRetries: options.ValidateRetries,
DrainInterval: options.DrainInterval,
}
return d.RollingUpdate(groups, list)
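The net effect of this first file's changes is that one flag now buys two waits. A minimal standalone sketch of that behavior — rollNode and its parameters are hypothetical names for illustration; the real flow goes through RollingUpdateCluster and CloudInstanceGroup:

package main

import (
	"fmt"
	"time"
)

// rollNode imitates one step of the rolling update: sleep the interval to
// let the replacement node come up, then spend at most the same interval
// polling for a healthy cluster.
func rollNode(interval, tick time.Duration, validate func() bool) error {
	time.Sleep(interval)
	deadline := time.Now().Add(interval)
	for {
		if validate() {
			return nil
		}
		if time.Now().Add(tick).After(deadline) {
			return fmt.Errorf("cluster did not validate within %v", interval)
		}
		time.Sleep(tick)
	}
}

func main() {
	calls := 0
	err := rollNode(300*time.Millisecond, 100*time.Millisecond, func() bool {
		calls++
		return calls >= 3 // pretend the cluster goes healthy on the third poll
	})
	fmt.Println(err, calls) // <nil> 3
}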


@@ -297,9 +297,10 @@ func (n *CloudInstanceGroup) RollingUpdate(rollingUpdateData *RollingUpdateClust
glog.Infof("Validating the cluster.")
if err = n.ValidateClusterWithRetries(rollingUpdateData, instanceGroupList, t); err != nil {
if err = n.ValidateClusterWithDuration(rollingUpdateData, instanceGroupList, t); err != nil {
if rollingUpdateData.FailOnValidate {
glog.Errorf("Cluster did not validate within the set duration of %q, you can retry, and maybe extend the duration", t)
return fmt.Errorf("error validating cluster after removing a node: %v", err)
}
@@ -311,25 +312,43 @@ func (n *CloudInstanceGroup) RollingUpdate(rollingUpdateData *RollingUpdateClust
return nil
}
// ValidateClusterWithRetries runs our validation methods on the K8s Cluster x times and then fails.
func (n *CloudInstanceGroup) ValidateClusterWithRetries(rollingUpdateData *RollingUpdateCluster, instanceGroupList *api.InstanceGroupList, t time.Duration) (err error) {
// TODO - We are going to need to improve Validate to allow for more than one node, not master
// TODO - going down at a time.
for i := 0; i <= rollingUpdateData.ValidateRetries; i++ {
if _, err = validation.ValidateCluster(rollingUpdateData.ClusterName, instanceGroupList, rollingUpdateData.K8sClient); err != nil {
glog.Infof("Cluster did not validate, and waiting longer: %v.", err)
time.Sleep(t / 2)
} else {
glog.Infof("Cluster validated.")
return nil
}
// ValidateClusterWithDuration runs validation.ValidateCluster until either we get a positive result or the timeout expires
func (n *CloudInstanceGroup) ValidateClusterWithDuration(rollingUpdateData *RollingUpdateCluster, instanceGroupList *api.InstanceGroupList, duration time.Duration) error {
// TODO should we expose this to the UI?
tickDuration := 30 * time.Second
// Try to validate the cluster at least once; this handles durations that are shorter
// than our tick time
if n.tryValidateCluster(rollingUpdateData, instanceGroupList, duration, tickDuration) {
return nil
}
// for loop is done, and did not end when the cluster validated
return fmt.Errorf("cluster validation failed: %v", err)
timeout := time.After(duration)
tick := time.Tick(tickDuration)
// Keep trying until we either time out or the cluster validates
for {
select {
case <-timeout:
// Timed out; fail with a timeout error
return fmt.Errorf("cluster did not validate within a duration of %q", duration)
case <-tick:
// Got a tick, validate cluster
if n.tryValidateCluster(rollingUpdateData, instanceGroupList, duration, tickDuration) {
return nil
}
// ValidateCluster did not succeed yet, so wait for the next tick;
// control loops back around to the select in the for loop
}
}
}
func (n *CloudInstanceGroup) tryValidateCluster(rollingUpdateData *RollingUpdateCluster, instanceGroupList *api.InstanceGroupList, duration time.Duration, tickDuration time.Duration) bool {
if _, err := validation.ValidateCluster(rollingUpdateData.ClusterName, instanceGroupList, rollingUpdateData.K8sClient); err != nil {
glog.Infof("Cluster did not validate, will try again in %q util duration %q expires: %v.", tickDuration, duration, err)
return false
} else {
glog.Infof("Cluster validated.")
return true
}
}
// ValidateCluster runs our validation methods on the K8s Cluster.
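The select loop above is the standard timeout-plus-tick polling idiom. A self-contained sketch of the same shape — pollUntil is a hypothetical name, not kops code. One deliberate deviation: it uses time.NewTicker instead of time.Tick, because time.Tick (as used in the diff) never releases its underlying ticker; that leak is harmless in a bounded rolling update but worth knowing about:

package main

import (
	"errors"
	"fmt"
	"time"
)

// pollUntil retries check every tick until it succeeds or timeout elapses,
// mirroring the shape of ValidateClusterWithDuration.
func pollUntil(timeout, tick time.Duration, check func() bool) error {
	// Check once up front so timeouts shorter than one tick still get an
	// attempt, just as the commit does before entering its loop.
	if check() {
		return nil
	}
	deadline := time.After(timeout)
	ticker := time.NewTicker(tick)
	defer ticker.Stop() // the one behavioral difference from time.Tick
	for {
		select {
		case <-deadline:
			return errors.New("timed out waiting for check to pass")
		case <-ticker.C:
			if check() {
				return nil
			}
			// not yet; fall through to the next tick
		}
	}
}

func main() {
	start := time.Now()
	err := pollUntil(3*time.Second, 500*time.Millisecond, func() bool {
		return time.Since(start) > time.Second
	})
	fmt.Println(err) // <nil> once the check starts passing
}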


@@ -44,7 +44,6 @@ type RollingUpdateCluster struct {
FailOnValidate bool
CloudOnly bool
ClusterName string
ValidateRetries int
DrainInterval time.Duration
}
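For any code that constructs the struct directly, migrating past this commit is just a matter of deleting the field. A compilable sketch with a local stand-in for the struct — the real one lives in the kops instancegroups package and carries more fields than shown here:

package main

import (
	"fmt"
	"time"
)

// RollingUpdateCluster here mirrors only the fields visible in the diff.
type RollingUpdateCluster struct {
	MasterInterval time.Duration
	NodeInterval   time.Duration
	DrainInterval  time.Duration
	// ValidateRetries int // removed: the intervals above bound validation now
}

func main() {
	d := &RollingUpdateCluster{
		MasterInterval: 5 * time.Minute,
		NodeInterval:   4 * time.Minute, // doubles as the node validation timeout
		DrainInterval:  90 * time.Second,
	}
	fmt.Printf("per-node worst case: %v up + %v validating\n",
		d.NodeInterval, d.NodeInterval)
}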