validate cluster n times in rolling update

This commit is contained in:
Jesse Haka 2020-04-08 00:34:15 +03:00
parent fde8ceaaa8
commit e1e79790ef
5 changed files with 32 additions and 26 deletions

View File

@ -121,6 +121,9 @@ type RollingUpdateOptions struct {
// ValidationTimeout is the timeout for validation to succeed after the drain and pause // ValidationTimeout is the timeout for validation to succeed after the drain and pause
ValidationTimeout time.Duration ValidationTimeout time.Duration
// ValidateTimes is the number of times that a cluster needs to be validated successfully after a single node update
ValidateTimes int32
// MasterInterval is the minimum time to wait after stopping a master node. This does not include drain and validate time. // MasterInterval is the minimum time to wait after stopping a master node. This does not include drain and validate time.
MasterInterval time.Duration MasterInterval time.Duration
@ -158,6 +161,7 @@ func (o *RollingUpdateOptions) InitDefaults() {
o.PostDrainDelay = 5 * time.Second o.PostDrainDelay = 5 * time.Second
o.ValidationTimeout = 15 * time.Minute o.ValidationTimeout = 15 * time.Minute
o.ValidateTimes = 2
} }
func NewCmdRollingUpdateCluster(f *util.Factory, out io.Writer) *cobra.Command { func NewCmdRollingUpdateCluster(f *util.Factory, out io.Writer) *cobra.Command {
@ -177,6 +181,7 @@ func NewCmdRollingUpdateCluster(f *util.Factory, out io.Writer) *cobra.Command {
cmd.Flags().BoolVar(&options.CloudOnly, "cloudonly", options.CloudOnly, "Perform rolling update without confirming progress with k8s") cmd.Flags().BoolVar(&options.CloudOnly, "cloudonly", options.CloudOnly, "Perform rolling update without confirming progress with k8s")
cmd.Flags().DurationVar(&options.ValidationTimeout, "validation-timeout", options.ValidationTimeout, "Maximum time to wait for a cluster to validate") cmd.Flags().DurationVar(&options.ValidationTimeout, "validation-timeout", options.ValidationTimeout, "Maximum time to wait for a cluster to validate")
cmd.Flags().Int32Var(&options.ValidateTimes, "validate-times", options.ValidateTimes, "Amount of times that a cluster needs to be validated after single node update")
cmd.Flags().DurationVar(&options.MasterInterval, "master-interval", options.MasterInterval, "Time to wait between restarting masters") cmd.Flags().DurationVar(&options.MasterInterval, "master-interval", options.MasterInterval, "Time to wait between restarting masters")
cmd.Flags().DurationVar(&options.NodeInterval, "node-interval", options.NodeInterval, "Time to wait between restarting nodes") cmd.Flags().DurationVar(&options.NodeInterval, "node-interval", options.NodeInterval, "Time to wait between restarting nodes")
cmd.Flags().DurationVar(&options.BastionInterval, "bastion-interval", options.BastionInterval, "Time to wait between restarting bastions") cmd.Flags().DurationVar(&options.BastionInterval, "bastion-interval", options.BastionInterval, "Time to wait between restarting bastions")
@ -333,9 +338,10 @@ func RunRollingUpdateCluster(f *util.Factory, out io.Writer, options *RollingUpd
ClusterName: options.ClusterName, ClusterName: options.ClusterName,
PostDrainDelay: options.PostDrainDelay, PostDrainDelay: options.PostDrainDelay,
ValidationTimeout: options.ValidationTimeout, ValidationTimeout: options.ValidationTimeout,
ValidateTimes: options.ValidateTimes,
ValidateSucceeded: 0,
// TODO should we expose this to the UI? // TODO should we expose this to the UI?
ValidateTickDuration: 30 * time.Second, ValidateTickDuration: 30 * time.Second,
ValidateSuccessDuration: 10 * time.Second,
} }
err = d.AdjustNeedUpdate(groups, cluster, list) err = d.AdjustNeedUpdate(groups, cluster, list)

View File

@ -80,6 +80,7 @@ kops rolling-update cluster [flags]
--master-interval duration Time to wait between restarting masters (default 15s) --master-interval duration Time to wait between restarting masters (default 15s)
--node-interval duration Time to wait between restarting nodes (default 15s) --node-interval duration Time to wait between restarting nodes (default 15s)
--post-drain-delay duration Time to wait after draining each node (default 5s) --post-drain-delay duration Time to wait after draining each node (default 5s)
--validate-times int32 Amount of times that a cluster needs to be validated after single node update (default 2)
--validation-timeout duration Maximum time to wait for a cluster to validate (default 15m0s) --validation-timeout duration Maximum time to wait for a cluster to validate (default 15m0s)
-y, --yes Perform rolling update immediately, without --yes rolling-update executes a dry-run -y, --yes Perform rolling update immediately, without --yes rolling-update executes a dry-run
``` ```

View File

@ -430,10 +430,12 @@ func (c *RollingUpdateCluster) validateClusterWithDuration(duration time.Duratio
func (c *RollingUpdateCluster) tryValidateCluster(duration time.Duration) bool { func (c *RollingUpdateCluster) tryValidateCluster(duration time.Duration) bool {
result, err := c.ClusterValidator.Validate() result, err := c.ClusterValidator.Validate()
if err == nil && len(result.Failures) == 0 && c.ValidateSuccessDuration > 0 { if err == nil && len(result.Failures) == 0 && c.ValidateTimes > 0 {
klog.Infof("Cluster validated; revalidating in %s to make sure it does not flap.", c.ValidateSuccessDuration) c.ValidateSucceeded++
time.Sleep(c.ValidateSuccessDuration) if c.ValidateSucceeded < c.ValidateTimes {
result, err = c.ClusterValidator.Validate() klog.Infof("Cluster validated; revalidating %d time(s) to make sure it does not flap.", c.ValidateTimes-c.ValidateSucceeded)
return false
}
} }
if err != nil { if err != nil {

View File

@ -64,9 +64,11 @@ type RollingUpdateCluster struct {
// ValidateTickDuration is the amount of time to wait between cluster validation attempts // ValidateTickDuration is the amount of time to wait between cluster validation attempts
ValidateTickDuration time.Duration ValidateTickDuration time.Duration
// ValidateSuccessDuration is the amount of time a cluster must continue to validate successfully // ValidateTimes is the number of times that a cluster needs to be validated successfully after a single node update
// before updating the next node ValidateTimes int32
ValidateSuccessDuration time.Duration
// ValidateSucceeded is the number of times that cluster validation has already succeeded
ValidateSucceeded int32
} }
// AdjustNeedUpdate adjusts the set of instances that need updating, using factors outside those known by the cloud implementation // AdjustNeedUpdate adjusts the set of instances that need updating, using factors outside those known by the cloud implementation

View File

@ -59,16 +59,17 @@ func getTestSetup() (*RollingUpdateCluster, *awsup.MockAWSCloud, *kopsapi.Cluste
cluster.Name = "test.k8s.local" cluster.Name = "test.k8s.local"
c := &RollingUpdateCluster{ c := &RollingUpdateCluster{
Cloud: mockcloud, Cloud: mockcloud,
MasterInterval: 1 * time.Millisecond, MasterInterval: 1 * time.Millisecond,
NodeInterval: 1 * time.Millisecond, NodeInterval: 1 * time.Millisecond,
BastionInterval: 1 * time.Millisecond, BastionInterval: 1 * time.Millisecond,
Force: false, Force: false,
K8sClient: k8sClient, K8sClient: k8sClient,
ClusterValidator: &successfulClusterValidator{}, ClusterValidator: &successfulClusterValidator{},
FailOnValidate: true, FailOnValidate: true,
ValidateTickDuration: 1 * time.Millisecond, ValidateTickDuration: 1 * time.Millisecond,
ValidateSuccessDuration: 5 * time.Millisecond, ValidateTimes: 1,
ValidateSucceeded: 0,
} }
return c, mockcloud, cluster return c, mockcloud, cluster
@ -511,6 +512,7 @@ func (v *flappingClusterValidator) Validate() (*validation.ValidationCluster, er
func TestRollingUpdateFlappingValidation(t *testing.T) { func TestRollingUpdateFlappingValidation(t *testing.T) {
c, cloud, cluster := getTestSetup() c, cloud, cluster := getTestSetup()
c.ValidateTimes = 3
// This should only take a few milliseconds, // This should only take a few milliseconds,
// but we have to pad to allow for random delays (e.g. GC) // but we have to pad to allow for random delays (e.g. GC)
@ -977,7 +979,6 @@ func TestRollingUpdateMaxUnavailableAllNeedUpdate(t *testing.T) {
c, cloud, cluster := getTestSetup() c, cloud, cluster := getTestSetup()
concurrentTest := newConcurrentTest(t, cloud, 0, true) concurrentTest := newConcurrentTest(t, cloud, 0, true)
c.ValidateSuccessDuration = 0
c.ClusterValidator = concurrentTest c.ClusterValidator = concurrentTest
cloud.MockEC2 = concurrentTest cloud.MockEC2 = concurrentTest
@ -1000,7 +1001,6 @@ func TestRollingUpdateMaxUnavailableAllButOneNeedUpdate(t *testing.T) {
c, cloud, cluster := getTestSetup() c, cloud, cluster := getTestSetup()
concurrentTest := newConcurrentTest(t, cloud, 0, false) concurrentTest := newConcurrentTest(t, cloud, 0, false)
c.ValidateSuccessDuration = 0
c.ClusterValidator = concurrentTest c.ClusterValidator = concurrentTest
cloud.MockEC2 = concurrentTest cloud.MockEC2 = concurrentTest
@ -1022,7 +1022,6 @@ func TestRollingUpdateMaxUnavailableAllNeedUpdateMaster(t *testing.T) {
c, cloud, cluster := getTestSetup() c, cloud, cluster := getTestSetup()
concurrentTest := newConcurrentTest(t, cloud, 0, true) concurrentTest := newConcurrentTest(t, cloud, 0, true)
c.ValidateSuccessDuration = 0
c.ClusterValidator = concurrentTest c.ClusterValidator = concurrentTest
cloud.MockEC2 = concurrentTest cloud.MockEC2 = concurrentTest
@ -1074,7 +1073,6 @@ func TestRollingUpdateMaxSurgeAllNeedUpdate(t *testing.T) {
c, cloud, cluster := getTestSetup() c, cloud, cluster := getTestSetup()
concurrentTest := newConcurrentTest(t, cloud, 2, true) concurrentTest := newConcurrentTest(t, cloud, 2, true)
c.ValidateSuccessDuration = 0
c.ClusterValidator = concurrentTest c.ClusterValidator = concurrentTest
cloud.MockAutoscaling = &concurrentTestAutoscaling{ cloud.MockAutoscaling = &concurrentTestAutoscaling{
AutoScalingAPI: cloud.MockAutoscaling, AutoScalingAPI: cloud.MockAutoscaling,
@ -1101,7 +1099,6 @@ func TestRollingUpdateMaxSurgeAllButOneNeedUpdate(t *testing.T) {
c, cloud, cluster := getTestSetup() c, cloud, cluster := getTestSetup()
concurrentTest := newConcurrentTest(t, cloud, 2, false) concurrentTest := newConcurrentTest(t, cloud, 2, false)
c.ValidateSuccessDuration = 0
c.ClusterValidator = concurrentTest c.ClusterValidator = concurrentTest
cloud.MockAutoscaling = &concurrentTestAutoscaling{ cloud.MockAutoscaling = &concurrentTestAutoscaling{
AutoScalingAPI: cloud.MockAutoscaling, AutoScalingAPI: cloud.MockAutoscaling,
@ -1248,7 +1245,6 @@ func TestRollingUpdateMaxSurgeAllNeedUpdateOneAlreadyDetached(t *testing.T) {
detached: map[string]bool{}, detached: map[string]bool{},
} }
c.ValidateSuccessDuration = 0
c.ClusterValidator = alreadyDetachedTest c.ClusterValidator = alreadyDetachedTest
cloud.MockAutoscaling = &alreadyDetachedTestAutoscaling{ cloud.MockAutoscaling = &alreadyDetachedTestAutoscaling{
AutoScalingAPI: cloud.MockAutoscaling, AutoScalingAPI: cloud.MockAutoscaling,
@ -1277,7 +1273,6 @@ func TestRollingUpdateMaxSurgeAllNeedUpdateMaxAlreadyDetached(t *testing.T) {
// Should behave the same as TestRollingUpdateMaxUnavailableAllNeedUpdate // Should behave the same as TestRollingUpdateMaxUnavailableAllNeedUpdate
concurrentTest := newConcurrentTest(t, cloud, 0, true) concurrentTest := newConcurrentTest(t, cloud, 0, true)
c.ValidateSuccessDuration = 0
c.ClusterValidator = concurrentTest c.ClusterValidator = concurrentTest
cloud.MockEC2 = concurrentTest cloud.MockEC2 = concurrentTest