mirror of https://github.com/kubernetes/kops.git

reusing the node and master duration for validation periods

parent acb5e8b5a6
commit ec2f0dfdf3
@@ -45,32 +45,50 @@ var (
 	rollingupdate_long = pretty.LongDesc(i18n.T(`
 	This command updates a kubernetes cluster to match the cloud and kops specifications.
 
-	To perform rolling update, you need to update the cloud resources first with "kops update cluster"
+	To perform a rolling update, you need to update the cloud resources first with the command
+	` + pretty.Bash("kops update cluster") + `.
 
-	Note: terraform users will need run the following commands all from the same directory "kops update cluster --target=terraform" then "terraform plan" then "terraform apply"
-	prior to running "kops rolling-update cluster"
+	If rolling-update does not report that the cluster needs to be rolled, you can force the cluster to be
+	rolled with the force flag. Rolling update drains and validates the cluster by default. A cluster is
+	deemed validated when all required nodes are running and all pods in the kube-system namespace are operational.
+	When a node is deleted, rolling-update sleeps the interval for the node type, and then tries for the same period
+	of time for the cluster to be validated. For instance, setting --master-interval=3m causes rolling-update
+	to wait for 3m after a master is rolled, and another 3m for the cluster to stabilize and pass
+	validation.
 
-	Use ` + pretty.Bash("export KOPS_FEATURE_FLAGS=\"+DrainAndValidateRollingUpdate\"") + ` to use beta code that drains the nodes
-	and validates the cluster. New flags for Drain and Validation operations will be shown when
-	the environment variable is set.`))
+	Note: terraform users will need to run the following commands all from the same directory
+	` + pretty.Bash("kops update cluster --target=terraform") + ` then
+	` + pretty.Bash("terraform plan") + ` then ` + pretty.Bash("terraform apply") +
+	` prior to running ` + pretty.Bash("kops rolling-update cluster") + `.`))
 
 	rollingupdate_example = templates.Examples(i18n.T(`
-	# Roll the currently selected kops cluster
+	# Preview a rolling-update
 	kops rolling-update cluster
 
+	# Roll the currently selected kops cluster with defaults.
+	# Nodes will be drained and the cluster will be validated between node replacements.
+	kops rolling-update cluster --yes
 
 	# Roll the k8s-cluster.example.com kops cluster
 	# use the new drain and validate functionality
 	export KOPS_FEATURE_FLAGS="+DrainAndValidateRollingUpdate"
 	# do not fail if the cluster does not validate
 	# wait 8 min to create a new node, and at least 8 min
 	# for the cluster to validate
 	kops rolling-update cluster k8s-cluster.example.com --yes \
 	  --fail-on-validate-error="false" \
 	  --master-interval=8m \
 	  --node-interval=8m
 
 	# Roll the k8s-cluster.example.com kops cluster
 	# do not validate the cluster because of the cloudonly flag.
 	# Force the entire cluster to roll, even if rolling update
 	# reports that the cluster does not need to be rolled.
 	kops rolling-update cluster k8s-cluster.example.com --yes \
 	  --cloudonly \
 	  --force
 
+	# Roll the k8s-cluster.example.com kops cluster
+	# only roll the node instancegroup
+	# use the new drain and validate functionality
+	export KOPS_FEATURE_FLAGS="+DrainAndValidateRollingUpdate"
+	kops rolling-update cluster k8s-cluster.example.com --yes \
+	  --fail-on-validate-error="false" \
+	  --node-interval 8m \
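The interval paragraph above is the heart of this commit: one flag value now does double duty as the post-roll wait and as the validation deadline. Below is a standalone sketch of that behavior; rollThenValidate and all of its parameters are invented for illustration and are not the kops API.

// Sketch only: one interval doing double duty, per the help text above.
package main

import (
	"errors"
	"fmt"
	"time"
)

func rollThenValidate(interval, tick time.Duration, validate func() error) error {
	time.Sleep(interval) // e.g. --master-interval=3m: wait 3m after the roll

	deadline := time.After(interval) // ...then allow the same 3m for validation
	ticker := time.NewTicker(tick)
	defer ticker.Stop()
	for {
		if err := validate(); err == nil {
			return nil
		}
		select {
		case <-deadline:
			return fmt.Errorf("cluster did not validate within %v", interval)
		case <-ticker.C:
			// poll again on the next tick
		}
	}
}

func main() {
	tries := 0
	err := rollThenValidate(100*time.Millisecond, 20*time.Millisecond, func() error {
		tries++
		if tries < 3 {
			return errors.New("kube-system pods not ready")
		}
		return nil
	})
	fmt.Println(err) // <nil> once the fake validation passes on the third try
}

The same shape appears in ValidateClusterWithDuration later in this diff, with validation.ValidateCluster in place of the stub.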
|
@@ -98,8 +116,6 @@ type RollingUpdateOptions struct {
 
 	DrainInterval time.Duration
 
-	ValidateRetries int
-
 	MasterInterval  time.Duration
 	NodeInterval    time.Duration
 	BastionInterval time.Duration
@@ -119,11 +135,9 @@ func (o *RollingUpdateOptions) InitDefaults() {
 	o.FailOnValidate = true
 
 	o.MasterInterval = 5 * time.Minute
-	o.NodeInterval = 2 * time.Minute
+	o.NodeInterval = 4 * time.Minute
 	o.BastionInterval = 5 * time.Minute
 
-	o.ValidateRetries = 8
-
 	o.DrainInterval = 90 * time.Second
 }
@@ -152,8 +166,6 @@ func NewCmdRollingUpdateCluster(f *util.Factory, out io.Writer) *cobra.Command {
 	if featureflag.DrainAndValidateRollingUpdate.Enabled() {
 		cmd.Flags().BoolVar(&options.FailOnDrainError, "fail-on-drain-error", true, "The rolling-update will fail if draining a node fails.")
 		cmd.Flags().BoolVar(&options.FailOnValidate, "fail-on-validate-error", true, "The rolling-update will fail if the cluster fails to validate.")
-		cmd.Flags().IntVar(&options.ValidateRetries, "validate-retries", options.ValidateRetries, "The number of times that a node will be validated. Between validation kops sleeps the master-interval/2 or node-interval/2 duration.")
 		cmd.Flags().DurationVar(&options.DrainInterval, "drain-interval", options.DrainInterval, "The duration that a rolling-update will wait after the node is drained.")
 	}
 
 	cmd.Run = func(cmd *cobra.Command, args []string) {
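The removed validate-retries scheme slept half the master or node interval between attempts; what remains are plain cobra duration flags. As a minimal illustration of how a DurationVar flag parses Go duration syntax such as 8m or 90s, here is a hypothetical, self-contained demo command (not kops):

package main

import (
	"fmt"
	"time"

	"github.com/spf13/cobra"
)

func main() {
	var nodeInterval time.Duration
	cmd := &cobra.Command{
		Use: "demo",
		Run: func(cmd *cobra.Command, args []string) {
			// The removed retry scheme slept interval/2 between validations;
			// the new scheme uses the full interval as a validation deadline.
			fmt.Println("interval:", nodeInterval, "old retry sleep:", nodeInterval/2)
		},
	}
	// Accepts Go duration syntax on the CLI, e.g. --node-interval=8m or 90s.
	cmd.Flags().DurationVar(&nodeInterval, "node-interval", 4*time.Minute, "wait after a node is rolled")
	if err := cmd.Execute(); err != nil {
		fmt.Println(err)
	}
}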
|
@@ -202,10 +214,6 @@ func RunRollingUpdateCluster(f *util.Factory, out io.Writer, options *RollingUpd
 		return fmt.Errorf("cannot load kubecfg settings for %q: %v", contextName, err)
 	}
 
-	if options.ValidateRetries <= 0 {
-		return fmt.Errorf("validate-retries flag cannot be 0 or smaller")
-	}
-
 	var nodes []v1.Node
 	var k8sClient kubernetes.Interface
 	if !options.CloudOnly {
@@ -339,7 +347,7 @@ func RunRollingUpdateCluster(f *util.Factory, out io.Writer, options *RollingUpd
 	}
 
 	if featureflag.DrainAndValidateRollingUpdate.Enabled() {
-		glog.V(2).Infof("New rolling update with drain and validate enabled.")
+		glog.V(2).Infof("Rolling update with drain and validate enabled.")
 	}
 	d := &instancegroups.RollingUpdateCluster{
 		MasterInterval: options.MasterInterval,

@@ -352,7 +360,6 @@ func RunRollingUpdateCluster(f *util.Factory, out io.Writer, options *RollingUpd
 		FailOnValidate: options.FailOnValidate,
 		CloudOnly:      options.CloudOnly,
 		ClusterName:    options.ClusterName,
-		ValidateRetries: options.ValidateRetries,
 		DrainInterval:  options.DrainInterval,
 	}
 	return d.RollingUpdate(groups, list)
@@ -297,9 +297,10 @@ func (n *CloudInstanceGroup) RollingUpdate(rollingUpdateData *RollingUpdateClust
 
 	glog.Infof("Validating the cluster.")
 
-	if err = n.ValidateClusterWithRetries(rollingUpdateData, instanceGroupList, t); err != nil {
+	if err = n.ValidateClusterWithDuration(rollingUpdateData, instanceGroupList, t); err != nil {
 
 		if rollingUpdateData.FailOnValidate {
+			glog.Errorf("Cluster did not validate within the set duration of %q; you can retry, and possibly extend the duration.", t)
 			return fmt.Errorf("error validating cluster after removing a node: %v", err)
 		}
@@ -311,25 +312,43 @@ func (n *CloudInstanceGroup) RollingUpdate(rollingUpdateData *RollingUpdateClust
 	return nil
 }
 
-// ValidateClusterWithRetries runs our validation methods on the K8s Cluster x times and then fails.
-func (n *CloudInstanceGroup) ValidateClusterWithRetries(rollingUpdateData *RollingUpdateCluster, instanceGroupList *api.InstanceGroupList, t time.Duration) (err error) {
-
-	// TODO - We are going to need to improve Validate to allow for more than one node, not master
-	// TODO - going down at a time.
-	for i := 0; i <= rollingUpdateData.ValidateRetries; i++ {
-
-		if _, err = validation.ValidateCluster(rollingUpdateData.ClusterName, instanceGroupList, rollingUpdateData.K8sClient); err != nil {
-			glog.Infof("Cluster did not validate, and waiting longer: %v.", err)
-			time.Sleep(t / 2)
-		} else {
-			glog.Infof("Cluster validated.")
-			return nil
-		}
-	}
-
-	// for loop is done, and did not end when the cluster validated
-	return fmt.Errorf("cluster validation failed: %v", err)
-}
+// ValidateClusterWithDuration runs validation.ValidateCluster until either we get a positive result or the timeout expires.
+func (n *CloudInstanceGroup) ValidateClusterWithDuration(rollingUpdateData *RollingUpdateCluster, instanceGroupList *api.InstanceGroupList, duration time.Duration) error {
+	// TODO: should we expose this to the UI?
+	tickDuration := 30 * time.Second
+	// Try to validate the cluster at least once; this handles durations that are
+	// lower than our tick time.
+	if n.tryValidateCluster(rollingUpdateData, instanceGroupList, duration, tickDuration) {
+		return nil
+	}
+
+	timeout := time.After(duration)
+	tick := time.Tick(tickDuration)
+	// Keep trying until we time out or get a positive result.
+	for {
+		select {
+		case <-timeout:
+			// Timed out; fail with a timeout error.
+			return fmt.Errorf("cluster did not validate within a duration of %q", duration)
+		case <-tick:
+			// Got a tick; validate the cluster.
+			if n.tryValidateCluster(rollingUpdateData, instanceGroupList, duration, tickDuration) {
+				return nil
+			}
+			// ValidateCluster didn't succeed yet, so loop and try again.
+		}
+	}
+}
+
+func (n *CloudInstanceGroup) tryValidateCluster(rollingUpdateData *RollingUpdateCluster, instanceGroupList *api.InstanceGroupList, duration time.Duration, tickDuration time.Duration) bool {
+	if _, err := validation.ValidateCluster(rollingUpdateData.ClusterName, instanceGroupList, rollingUpdateData.K8sClient); err != nil {
+		glog.Infof("Cluster did not validate, will try again in %q until duration %q expires: %v.", tickDuration, duration, err)
+		return false
+	} else {
+		glog.Infof("Cluster validated.")
+		return true
+	}
+}
 
 // ValidateCluster runs our validation methods on the K8s Cluster.
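One design note on the new loop: time.Tick never stops the Ticker it creates, so the ticker lives until the process exits. That is acceptable for a long-running roll, but worth knowing. Here is a self-contained sketch of the same wait-and-poll shape built on time.NewTicker, which can be stopped; waitForValidation and its parameters are invented for illustration:

package main

import (
	"fmt"
	"time"
)

func waitForValidation(duration, tickDuration time.Duration, tryValidate func() bool) error {
	// Validate at least once, handling durations shorter than one tick.
	if tryValidate() {
		return nil
	}
	timeout := time.After(duration)
	ticker := time.NewTicker(tickDuration)
	defer ticker.Stop() // released once we validate or time out, unlike time.Tick
	for {
		select {
		case <-timeout:
			return fmt.Errorf("cluster did not validate within a duration of %q", duration)
		case <-ticker.C:
			if tryValidate() {
				return nil
			}
		}
	}
}

func main() {
	tries := 0
	err := waitForValidation(time.Second, 10*time.Millisecond, func() bool {
		tries++
		return tries >= 3 // pretend the cluster settles on the third check
	})
	fmt.Println(err) // <nil>
}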
@@ -44,7 +44,6 @@ type RollingUpdateCluster struct {
 	FailOnValidate bool
 	CloudOnly      bool
 	ClusterName    string
-	ValidateRetries int
 	DrainInterval  time.Duration
 }