From 49f2a0e10a992c1522ae8e7e19c2642cb0d67069 Mon Sep 17 00:00:00 2001 From: Bharath Vedartham Date: Sat, 31 Oct 2020 19:16:42 +0530 Subject: [PATCH 1/4] validate_cluster: Add InstanceGroup field to ValidationError struct The InstanceGroup field in the ValidationError struct is an optional field meant to indicate the InstanceGroup which reported the failure. This field either holds a pointer to the instance group which caused the validation error, or can be nil, which indicates that we were unable to determine the instance group to which the failure should be attributed. This field is mainly used to decide whether a failure is worth waiting for when validating a particular instance group. --- pkg/validation/validate_cluster.go | 42 +++++++++++++++++++----------- 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/pkg/validation/validate_cluster.go b/pkg/validation/validate_cluster.go index ba4131eda2..9ceccbbcaf 100644 --- a/pkg/validation/validate_cluster.go +++ b/pkg/validation/validate_cluster.go @@ -25,6 +25,7 @@ import ( "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/tools/pager" + "k8s.io/kops/pkg/apis/kops" "k8s.io/kops/upup/pkg/fi" v1 "k8s.io/api/core/v1" @@ -32,7 +33,6 @@ import ( "k8s.io/client-go/kubernetes" "k8s.io/client-go/tools/clientcmd" "k8s.io/klog/v2" - "k8s.io/kops/pkg/apis/kops" "k8s.io/kops/pkg/cloudinstances" "k8s.io/kops/pkg/dns" ) @@ -49,6 +49,8 @@ type ValidationError struct { Kind string `json:"type,omitempty"` Name string `json:"name,omitempty"` Message string `json:"message,omitempty"` + // The InstanceGroup field is used to indicate which instance group this validation error is coming from + InstanceGroup *kops.InstanceGroup `json:"instanceGroup,omitempty"` } type ClusterValidator interface { @@ -172,7 +174,7 @@ func (v *clusterValidatorImpl) Validate() (*ValidationCluster, error) { return nil, fmt.Errorf("cannot get component status for %q: %v", clusterName, err) } - if err := validation.collectPodFailures(ctx, v.k8sClient, readyNodes); err != nil { + if err := validation.collectPodFailures(ctx, v.k8sClient, readyNodes, v.instanceGroups); err != nil { return nil, fmt.Errorf("cannot get pod health for %q: %v", clusterName, err) } @@ -185,6 +187,7 @@ func (v *ValidationCluster) collectComponentFailures(ctx context.Context, client return fmt.Errorf("error listing ComponentStatuses: %v", err) } + // TODO: Add logic to figure out the InstanceGroup given a component for _, component := range componentList.Items { for _, condition := range component.Conditions { if condition.Status != v1.ConditionTrue { @@ -205,9 +208,11 @@ var masterStaticPods = []string{ "kube-scheduler", } -func (v *ValidationCluster) collectPodFailures(ctx context.Context, client kubernetes.Interface, nodes []v1.Node) error { +func (v *ValidationCluster) collectPodFailures(ctx context.Context, client kubernetes.Interface, nodes []v1.Node, + groups []*kops.InstanceGroup) error { masterWithoutPod := map[string]map[string]bool{} nodeByAddress := map[string]string{} + for _, node := range nodes { labels := node.GetLabels() if labels != nil && labels["kubernetes.io/role"] == "master" { @@ -238,6 +243,8 @@ func (v *ValidationCluster) collectPodFailures(ctx context.Context, client kuber if pod.Status.Phase == v1.PodSucceeded { return nil } + + // TODO: Add logic to figure out the InstanceGroup given a pod if pod.Status.Phase == v1.PodPending { v.addError(&ValidationError{ Kind: "Pod", @@ -311,6 +318,7 @@ func (v *ValidationCluster) validateNodes(cloudGroups 
map[string]*cloudinstances cloudGroup.InstanceGroup.Name, numNodes, cloudGroup.TargetSize), + InstanceGroup: cloudGroup.InstanceGroup, }) } @@ -326,9 +334,10 @@ func (v *ValidationCluster) validateNodes(cloudGroups map[string]*cloudinstances if nodeExpectedToJoin { v.addError(&ValidationError{ - Kind: "Machine", - Name: member.ID, - Message: fmt.Sprintf("machine %q has not yet joined cluster", member.ID), + Kind: "Machine", + Name: member.ID, + Message: fmt.Sprintf("machine %q has not yet joined cluster", member.ID), + InstanceGroup: cloudGroup.InstanceGroup, }) } continue @@ -358,9 +367,10 @@ func (v *ValidationCluster) validateNodes(cloudGroups map[string]*cloudinstances if n.Role == "master" { if !ready { v.addError(&ValidationError{ - Kind: "Node", - Name: node.Name, - Message: fmt.Sprintf("master %q is not ready", node.Name), + Kind: "Node", + Name: node.Name, + Message: fmt.Sprintf("master %q is not ready", node.Name), + InstanceGroup: cloudGroup.InstanceGroup, }) } @@ -368,9 +378,10 @@ func (v *ValidationCluster) validateNodes(cloudGroups map[string]*cloudinstances } else if n.Role == "node" { if !ready { v.addError(&ValidationError{ - Kind: "Node", - Name: node.Name, - Message: fmt.Sprintf("node %q is not ready", node.Name), + Kind: "Node", + Name: node.Name, + Message: fmt.Sprintf("node %q is not ready", node.Name), + InstanceGroup: cloudGroup.InstanceGroup, }) } @@ -384,9 +395,10 @@ func (v *ValidationCluster) validateNodes(cloudGroups map[string]*cloudinstances for _, ig := range groups { if !groupsSeen[ig.Name] { v.addError(&ValidationError{ - Kind: "InstanceGroup", - Name: ig.Name, - Message: fmt.Sprintf("InstanceGroup %q is missing from the cloud provider", ig.Name), + Kind: "InstanceGroup", + Name: ig.Name, + Message: fmt.Sprintf("InstanceGroup %q is missing from the cloud provider", ig.Name), + InstanceGroup: ig, }) } } From f99c04fafadc72e422d51fdeae114882d9589f38 Mon Sep 17 00:00:00 2001 From: Bharath Vedartham Date: Sat, 31 Oct 2020 19:16:54 +0530 Subject: [PATCH 2/4] validate_cluster_test: Update validate_cluster_tests This commit fixes the unit tests for validate_cluster to reflect the addition of the new InstanceGroup field in struct ValidationError --- pkg/validation/validate_cluster_test.go | 50 ++++++++++++++----------- 1 file changed, 29 insertions(+), 21 deletions(-) diff --git a/pkg/validation/validate_cluster_test.go b/pkg/validation/validate_cluster_test.go index 223060526c..6b64a1b5d6 100644 --- a/pkg/validation/validate_cluster_test.go +++ b/pkg/validation/validate_cluster_test.go @@ -132,6 +132,7 @@ func Test_ValidateCloudGroupMissing(t *testing.T) { cluster := &kopsapi.Cluster{ ObjectMeta: metav1.ObjectMeta{Name: "testcluster.k8s.local"}, } + instanceGroups := []kopsapi.InstanceGroup{ { ObjectMeta: metav1.ObjectMeta{ @@ -151,9 +152,10 @@ func Test_ValidateCloudGroupMissing(t *testing.T) { require.NoError(t, err) if !assert.Len(t, v.Failures, 1) || !assert.Equal(t, &ValidationError{ - Kind: "InstanceGroup", - Name: "node-1", - Message: "InstanceGroup \"node-1\" is missing from the cloud provider", + Kind: "InstanceGroup", + Name: "node-1", + Message: "InstanceGroup \"node-1\" is missing from the cloud provider", + InstanceGroup: &instanceGroups[0], }, v.Failures[0]) { printDebug(t, v) } @@ -204,9 +206,10 @@ func Test_ValidateNodesNotEnough(t *testing.T) { require.NoError(t, err) if !assert.Len(t, v.Failures, 1) || !assert.Equal(t, &ValidationError{ - Kind: "InstanceGroup", - Name: "node-1", - Message: "InstanceGroup \"node-1\" did not have enough nodes 2 vs 
3", + Kind: "InstanceGroup", + Name: "node-1", + Message: "InstanceGroup \"node-1\" did not have enough nodes 2 vs 3", + InstanceGroup: groups["node-1"].InstanceGroup, }, v.Failures[0]) { printDebug(t, v) } @@ -258,9 +261,10 @@ func Test_ValidateDetachedNodesDontCount(t *testing.T) { require.NoError(t, err) if !assert.Len(t, v.Failures, 1) || !assert.Equal(t, &ValidationError{ - Kind: "InstanceGroup", - Name: "node-1", - Message: "InstanceGroup \"node-1\" did not have enough nodes 1 vs 2", + Kind: "InstanceGroup", + Name: "node-1", + Message: "InstanceGroup \"node-1\" did not have enough nodes 1 vs 2", + InstanceGroup: groups["node-1"].InstanceGroup, }, v.Failures[0]) { printDebug(t, v) } @@ -311,9 +315,10 @@ func Test_ValidateNodeNotReady(t *testing.T) { require.NoError(t, err) if !assert.Len(t, v.Failures, 1) || !assert.Equal(t, &ValidationError{ - Kind: "Node", - Name: "node-1b", - Message: "node \"node-1b\" is not ready", + Kind: "Node", + Name: "node-1b", + Message: "node \"node-1b\" is not ready", + InstanceGroup: groups["node-1"].InstanceGroup, }, v.Failures[0]) { printDebug(t, v) } @@ -364,9 +369,10 @@ func Test_ValidateMastersNotEnough(t *testing.T) { require.NoError(t, err) if !assert.Len(t, v.Failures, 1) || !assert.Equal(t, &ValidationError{ - Kind: "InstanceGroup", - Name: "master-1", - Message: "InstanceGroup \"master-1\" did not have enough nodes 2 vs 3", + Kind: "InstanceGroup", + Name: "master-1", + Message: "InstanceGroup \"master-1\" did not have enough nodes 2 vs 3", + InstanceGroup: groups["node-1"].InstanceGroup, }, v.Failures[0]) { printDebug(t, v) } @@ -417,9 +423,10 @@ func Test_ValidateMasterNotReady(t *testing.T) { require.NoError(t, err) if !assert.Len(t, v.Failures, 1) || !assert.Equal(t, &ValidationError{ - Kind: "Node", - Name: "master-1b", - Message: "master \"master-1b\" is not ready", + Kind: "Node", + Name: "master-1b", + Message: "master \"master-1b\" is not ready", + InstanceGroup: groups["node-1"].InstanceGroup, }, v.Failures[0]) { printDebug(t, v) } @@ -504,9 +511,10 @@ func Test_ValidateMasterStaticPods(t *testing.T) { var podList []map[string]string expectedFailures := []*ValidationError{ { - Kind: "Node", - Name: "master-1c", - Message: "master \"master-1c\" is not ready", + Kind: "Node", + Name: "master-1c", + Message: "master \"master-1c\" is not ready", + InstanceGroup: groups["node-1"].InstanceGroup, }, } From 7067f5f47af41e176a3710f11a6abf31194e0d2c Mon Sep 17 00:00:00 2001 From: Bharath Vedartham Date: Sat, 31 Oct 2020 19:17:24 +0530 Subject: [PATCH 3/4] instancegroups: Ignore validation errors in unrelated instance groups When unrelated instance groups produce validation errors, validation of the instance group being updated fails and the rolling update is forced to wait before it can continue. This can be avoided, as failures in other node instance groups usually don't affect the instance group being updated in any way. --- pkg/instancegroups/instancegroups.go | 42 +++++++++++++++++++------- 1 file changed, 32 insertions(+), 10 deletions(-) diff --git a/pkg/instancegroups/instancegroups.go b/pkg/instancegroups/instancegroups.go index dcf065ba89..976075ed0e 100644 --- a/pkg/instancegroups/instancegroups.go +++ b/pkg/instancegroups/instancegroups.go @@ -36,6 +36,7 @@ import ( "k8s.io/klog/v2" api "k8s.io/kops/pkg/apis/kops" "k8s.io/kops/pkg/cloudinstances" + "k8s.io/kops/pkg/validation" "k8s.io/kubectl/pkg/drain" ) @@ -92,7 +93,7 @@ func (c *RollingUpdateCluster) rollingUpdateInstanceGroup(group *cloudinstances.
if isBastion { klog.V(3).Info("Not validating the cluster as instance is a bastion.") - } else if err = c.maybeValidate("", 1); err != nil { + } else if err = c.maybeValidate("", 1, group); err != nil { return err } @@ -147,7 +148,7 @@ func (c *RollingUpdateCluster) rollingUpdateInstanceGroup(group *cloudinstances. klog.Infof("waiting for %v after detaching instance", sleepAfterTerminate) time.Sleep(sleepAfterTerminate) - if err := c.maybeValidate(" after detaching instance", c.ValidateCount); err != nil { + if err := c.maybeValidate(" after detaching instance", c.ValidateCount, group); err != nil { return err } noneReady = false @@ -181,7 +182,7 @@ func (c *RollingUpdateCluster) rollingUpdateInstanceGroup(group *cloudinstances. return waitForPendingBeforeReturningError(runningDrains, terminateChan, err) } - err = c.maybeValidate(" after terminating instance", c.ValidateCount) + err = c.maybeValidate(" after terminating instance", c.ValidateCount, group) if err != nil { return waitForPendingBeforeReturningError(runningDrains, terminateChan, err) } @@ -227,7 +228,7 @@ func (c *RollingUpdateCluster) rollingUpdateInstanceGroup(group *cloudinstances. } } - err = c.maybeValidate(" after terminating instance", c.ValidateCount) + err = c.maybeValidate(" after terminating instance", c.ValidateCount, group) if err != nil { return err } @@ -410,14 +411,14 @@ func (c *RollingUpdateCluster) reconcileInstanceGroup() error { } -func (c *RollingUpdateCluster) maybeValidate(operation string, validateCount int) error { +func (c *RollingUpdateCluster) maybeValidate(operation string, validateCount int, group *cloudinstances.CloudInstanceGroup) error { if c.CloudOnly { klog.Warningf("Not validating cluster as cloudonly flag is set.") } else { klog.Info("Validating the cluster.") - if err := c.validateClusterWithTimeout(validateCount); err != nil { + if err := c.validateClusterWithTimeout(validateCount, group); err != nil { if c.FailOnValidate { klog.Errorf("Cluster did not validate within %s", c.ValidationTimeout) @@ -431,7 +432,7 @@ func (c *RollingUpdateCluster) maybeValidate(operation string, validateCount int } // validateClusterWithTimeout runs validation.ValidateCluster until either we get positive result or the timeout expires -func (c *RollingUpdateCluster) validateClusterWithTimeout(validateCount int) error { +func (c *RollingUpdateCluster) validateClusterWithTimeout(validateCount int, group *cloudinstances.CloudInstanceGroup) error { ctx, cancel := context.WithTimeout(context.Background(), c.ValidationTimeout) defer cancel() @@ -445,7 +446,7 @@ func (c *RollingUpdateCluster) validateClusterWithTimeout(validateCount int) err for { // Note that we validate at least once before checking the timeout, in case the cluster is healthy with a short timeout result, err := c.ClusterValidator.Validate() - if err == nil && len(result.Failures) == 0 { + if err == nil && !hasFailureRelevantToGroup(result.Failures, group) { successCount++ if successCount >= validateCount { klog.Info("Cluster validated.") @@ -477,7 +478,7 @@ func (c *RollingUpdateCluster) validateClusterWithTimeout(validateCount int) err // Reset the success count; we want N consecutive successful validations successCount = 0 - // Wait before retrying + // Wait before retrying in some cases // TODO: Should we check if we have enough time left before the deadline? 
time.Sleep(c.ValidateTickDuration) } @@ -485,6 +486,27 @@ func (c *RollingUpdateCluster) validateClusterWithTimeout(validateCount int) err return fmt.Errorf("cluster did not validate within a duration of %q", c.ValidationTimeout) } +// hasFailureRelevantToGroup checks if the validation failures returned after cluster validation are relevant to the +// instance group whose rolling update is in progress +func hasFailureRelevantToGroup(failures []*validation.ValidationError, group *cloudinstances.CloudInstanceGroup) bool { + // Ignore non-critical validation errors in other instance groups, such as below-target-size errors + for _, failure := range failures { + // Determining InstanceGroups for certain resources, like Pods and ComponentStatuses, is not straightforward. + // Until we are able to determine the InstanceGroups for these resources without ambiguity, the + // InstanceGroup field of the ValidationErrors for these resources will be nil + if failure.InstanceGroup == nil { + return true + } + + // if there is a failure in the same instance group or a failure with cluster-wide impact + if (failure.InstanceGroup.IsMaster()) || (failure.InstanceGroup == group.InstanceGroup) { + return true + } + } + + return false +} + // detachInstance detaches a Cloud Instance func (c *RollingUpdateCluster) detachInstance(u *cloudinstances.CloudInstance) error { id := u.ID @@ -608,7 +630,7 @@ func (c *RollingUpdateCluster) UpdateSingleInstance(cloudMember *cloudinstances. if err != nil { return fmt.Errorf("failed to detach instance: %v", err) } - if err := c.maybeValidate(" after detaching instance", c.ValidateCount); err != nil { + if err := c.maybeValidate(" after detaching instance", c.ValidateCount, cloudMember.CloudInstanceGroup); err != nil { return err } } From 1e18a5d34481fab619a09d16c85c2247c2b06f70 Mon Sep 17 00:00:00 2001 From: Bharath Vedartham Date: Sat, 31 Oct 2020 19:17:45 +0530 Subject: [PATCH 4/4] rollingupdate_test: add tests for rolling update The tests create a cluster with two node instance groups, one master instance group, and one bastion instance group. Only one node instance group requires rolling update. instanceGroupNodeSpecificErrorClusterValidator mocks a validation failure in a given instance group. A rolling update should not fail if the cluster validator reports an error in an unrelated instance group. 
--- pkg/instancegroups/rollingupdate_test.go | 108 +++++++++++++++++++++++ 1 file changed, 108 insertions(+) diff --git a/pkg/instancegroups/rollingupdate_test.go b/pkg/instancegroups/rollingupdate_test.go index cafc42f30b..441ec51a53 100644 --- a/pkg/instancegroups/rollingupdate_test.go +++ b/pkg/instancegroups/rollingupdate_test.go @@ -105,6 +105,24 @@ func (*erroringClusterValidator) Validate() (*validation.ValidationCluster, erro return nil, errors.New("testing validation error") } +// instanceGroupNodeSpecificErrorClusterValidator simulates failures in a specific node group in the map of instance groups +type instanceGroupNodeSpecificErrorClusterValidator struct { + InstanceGroup *kopsapi.InstanceGroup +} + +func (igErrorValidator *instanceGroupNodeSpecificErrorClusterValidator) Validate() (*validation.ValidationCluster, error) { + return &validation.ValidationCluster{ + Failures: []*validation.ValidationError{ + { + Kind: "testing", + Name: "testing failure", + Message: "testing failure", + InstanceGroup: igErrorValidator.InstanceGroup, + }, + }, + }, nil +} + type assertNotCalledClusterValidator struct { T *testing.T } @@ -458,6 +476,96 @@ func TestRollingUpdateClusterErrorsValidationAfterOneMaster(t *testing.T) { assertGroupInstanceCount(t, cloud, "bastion-1", 0) } +func TestRollingUpdateNonRelatedInstanceGroupFailure(t *testing.T) { + + c, cloud := getTestSetup() + + groups := make(map[string]*cloudinstances.CloudInstanceGroup) + makeGroup(groups, c.K8sClient, cloud, "node-1", kopsapi.InstanceGroupRoleNode, 3, 3) + makeGroup(groups, c.K8sClient, cloud, "node-2", kopsapi.InstanceGroupRoleNode, 3, 0) + makeGroup(groups, c.K8sClient, cloud, "master-1", kopsapi.InstanceGroupRoleMaster, 2, 0) + makeGroup(groups, c.K8sClient, cloud, "bastion-1", kopsapi.InstanceGroupRoleBastion, 1, 0) + + c.ClusterValidator = &instanceGroupNodeSpecificErrorClusterValidator{ + InstanceGroup: groups["node-2"].InstanceGroup, + } + + err := c.RollingUpdate(groups, &kopsapi.InstanceGroupList{}) + assert.NoError(t, err, "rolling update") + + assertGroupInstanceCount(t, cloud, "node-1", 0) + assertGroupInstanceCount(t, cloud, "node-2", 3) + assertGroupInstanceCount(t, cloud, "master-1", 2) + assertGroupInstanceCount(t, cloud, "bastion-1", 1) +} + +func TestRollingUpdateRelatedInstanceGroupFailure(t *testing.T) { + + c, cloud := getTestSetup() + + groups := make(map[string]*cloudinstances.CloudInstanceGroup) + makeGroup(groups, c.K8sClient, cloud, "node-1", kopsapi.InstanceGroupRoleNode, 3, 3) + makeGroup(groups, c.K8sClient, cloud, "node-2", kopsapi.InstanceGroupRoleNode, 3, 0) + makeGroup(groups, c.K8sClient, cloud, "master-1", kopsapi.InstanceGroupRoleMaster, 2, 0) + makeGroup(groups, c.K8sClient, cloud, "bastion-1", kopsapi.InstanceGroupRoleBastion, 1, 0) + + c.ClusterValidator = &instanceGroupNodeSpecificErrorClusterValidator{ + InstanceGroup: groups["node-1"].InstanceGroup, + } + + err := c.RollingUpdate(groups, &kopsapi.InstanceGroupList{}) + assert.Error(t, err, "rolling update") + + assertGroupInstanceCount(t, cloud, "node-1", 3) + assertGroupInstanceCount(t, cloud, "node-2", 3) + assertGroupInstanceCount(t, cloud, "master-1", 2) + assertGroupInstanceCount(t, cloud, "bastion-1", 1) +} + +func TestRollingUpdateMasterGroupFailure(t *testing.T) { + c, cloud := getTestSetup() + + groups := make(map[string]*cloudinstances.CloudInstanceGroup) + makeGroup(groups, c.K8sClient, cloud, "node-1", kopsapi.InstanceGroupRoleNode, 3, 3) + makeGroup(groups, c.K8sClient, cloud, "node-2", kopsapi.InstanceGroupRoleNode, 
3, 0) + makeGroup(groups, c.K8sClient, cloud, "master-1", kopsapi.InstanceGroupRoleMaster, 2, 0) + makeGroup(groups, c.K8sClient, cloud, "bastion-1", kopsapi.InstanceGroupRoleBastion, 1, 0) + + c.ClusterValidator = &instanceGroupNodeSpecificErrorClusterValidator{ + InstanceGroup: groups["master-1"].InstanceGroup, + } + + err := c.RollingUpdate(groups, &kopsapi.InstanceGroupList{}) + assert.Error(t, err, "rolling update") + + assertGroupInstanceCount(t, cloud, "node-1", 3) + assertGroupInstanceCount(t, cloud, "node-2", 3) + assertGroupInstanceCount(t, cloud, "master-1", 2) + assertGroupInstanceCount(t, cloud, "bastion-1", 1) +} + +func TestRollingUpdateValidationErrorInstanceGroupNil(t *testing.T) { + c, cloud := getTestSetup() + + groups := make(map[string]*cloudinstances.CloudInstanceGroup) + makeGroup(groups, c.K8sClient, cloud, "node-1", kopsapi.InstanceGroupRoleNode, 3, 3) + makeGroup(groups, c.K8sClient, cloud, "node-2", kopsapi.InstanceGroupRoleNode, 3, 0) + makeGroup(groups, c.K8sClient, cloud, "master-1", kopsapi.InstanceGroupRoleMaster, 2, 0) + makeGroup(groups, c.K8sClient, cloud, "bastion-1", kopsapi.InstanceGroupRoleBastion, 1, 0) + + c.ClusterValidator = &instanceGroupNodeSpecificErrorClusterValidator{ + InstanceGroup: nil, + } + + err := c.RollingUpdate(groups, &kopsapi.InstanceGroupList{}) + assert.Error(t, err, "rolling update") + + assertGroupInstanceCount(t, cloud, "node-1", 3) + assertGroupInstanceCount(t, cloud, "node-2", 3) + assertGroupInstanceCount(t, cloud, "master-1", 2) + assertGroupInstanceCount(t, cloud, "bastion-1", 1) +} + func TestRollingUpdateClusterFailsValidationAfterOneNode(t *testing.T) { c, cloud := getTestSetup()