diff --git a/pkg/instancegroups/instancegroups.go b/pkg/instancegroups/instancegroups.go index 7faae56546..1a3a8518a8 100644 --- a/pkg/instancegroups/instancegroups.go +++ b/pkg/instancegroups/instancegroups.go @@ -36,6 +36,7 @@ import ( "k8s.io/klog/v2" api "k8s.io/kops/pkg/apis/kops" "k8s.io/kops/pkg/cloudinstances" + "k8s.io/kops/pkg/validation" "k8s.io/kubectl/pkg/drain" ) @@ -92,7 +93,7 @@ func (c *RollingUpdateCluster) rollingUpdateInstanceGroup(group *cloudinstances. if isBastion { klog.V(3).Info("Not validating the cluster as instance is a bastion.") - } else if err = c.maybeValidate("", 1); err != nil { + } else if err = c.maybeValidate("", 1, group); err != nil { return err } @@ -147,7 +148,7 @@ func (c *RollingUpdateCluster) rollingUpdateInstanceGroup(group *cloudinstances. klog.Infof("waiting for %v after detaching instance", sleepAfterTerminate) time.Sleep(sleepAfterTerminate) - if err := c.maybeValidate(" after detaching instance", c.ValidateCount); err != nil { + if err := c.maybeValidate(" after detaching instance", c.ValidateCount, group); err != nil { return err } noneReady = false @@ -181,7 +182,7 @@ func (c *RollingUpdateCluster) rollingUpdateInstanceGroup(group *cloudinstances. return waitForPendingBeforeReturningError(runningDrains, terminateChan, err) } - err = c.maybeValidate(" after terminating instance", c.ValidateCount) + err = c.maybeValidate(" after terminating instance", c.ValidateCount, group) if err != nil { return waitForPendingBeforeReturningError(runningDrains, terminateChan, err) } @@ -227,7 +228,7 @@ func (c *RollingUpdateCluster) rollingUpdateInstanceGroup(group *cloudinstances. } } - err = c.maybeValidate(" after terminating instance", c.ValidateCount) + err = c.maybeValidate(" after terminating instance", c.ValidateCount, group) if err != nil { return err } @@ -410,14 +411,14 @@ func (c *RollingUpdateCluster) reconcileInstanceGroup() error { } -func (c *RollingUpdateCluster) maybeValidate(operation string, validateCount int) error { +func (c *RollingUpdateCluster) maybeValidate(operation string, validateCount int, group *cloudinstances.CloudInstanceGroup) error { if c.CloudOnly { klog.Warningf("Not validating cluster as cloudonly flag is set.") } else { klog.Info("Validating the cluster.") - if err := c.validateClusterWithTimeout(validateCount); err != nil { + if err := c.validateClusterWithTimeout(validateCount, group); err != nil { if c.FailOnValidate { klog.Errorf("Cluster did not validate within %s", c.ValidationTimeout) @@ -431,7 +432,7 @@ func (c *RollingUpdateCluster) maybeValidate(operation string, validateCount int } // validateClusterWithTimeout runs validation.ValidateCluster until either we get positive result or the timeout expires -func (c *RollingUpdateCluster) validateClusterWithTimeout(validateCount int) error { +func (c *RollingUpdateCluster) validateClusterWithTimeout(validateCount int, group *cloudinstances.CloudInstanceGroup) error { ctx, cancel := context.WithTimeout(context.Background(), c.ValidationTimeout) defer cancel() @@ -445,7 +446,7 @@ func (c *RollingUpdateCluster) validateClusterWithTimeout(validateCount int) err for { // Note that we validate at least once before checking the timeout, in case the cluster is healthy with a short timeout result, err := c.ClusterValidator.Validate() - if err == nil && len(result.Failures) == 0 { + if err == nil && !hasFailureRelevantToGroup(result.Failures, group) { successCount++ if successCount >= validateCount { klog.Info("Cluster validated.") @@ -477,7 +478,7 @@ func (c 
*RollingUpdateCluster) validateClusterWithTimeout(validateCount int) err
 		// Reset the success count; we want N consecutive successful validations
 		successCount = 0
 
-		// Wait before retrying
+		// Wait before retrying the validation
 		// TODO: Should we check if we have enough time left before the deadline?
 		time.Sleep(c.ValidateTickDuration)
 	}
@@ -485,6 +486,27 @@ func (c *RollingUpdateCluster) validateClusterWithTimeout(validateCount int) err
 	return fmt.Errorf("cluster did not validate within a duration of %q", c.ValidationTimeout)
 }
 
+// hasFailureRelevantToGroup checks whether any of the validation failures returned by cluster
+// validation are relevant to the instance group whose rolling update is in progress.
+func hasFailureRelevantToGroup(failures []*validation.ValidationError, group *cloudinstances.CloudInstanceGroup) bool {
+	// Ignore non-critical validation errors in other instance groups, such as below-target-size errors
+	for _, failure := range failures {
+		// Determining the InstanceGroup for certain resources, such as Pods and ComponentStatuses, is not straightforward.
+		// Until we can determine the InstanceGroup for these resources without ambiguity, the
+		// InstanceGroup field of their ValidationErrors will be nil.
+		if failure.InstanceGroup == nil {
+			return true
+		}
+
+		// A failure in a master instance group has cluster-wide impact; a failure in the group being updated is always relevant.
+		if (failure.InstanceGroup.IsMaster()) || (failure.InstanceGroup == group.InstanceGroup) {
+			return true
+		}
+	}
+
+	return false
+}
+
 // detachInstance detaches a Cloud Instance
 func (c *RollingUpdateCluster) detachInstance(u *cloudinstances.CloudInstance) error {
 	id := u.ID
@@ -608,7 +630,7 @@ func (c *RollingUpdateCluster) UpdateSingleInstance(cloudMember *cloudinstances.
 		if err != nil {
 			return fmt.Errorf("failed to detach instance: %v", err)
 		}
-		if err := c.maybeValidate(" after detaching instance", c.ValidateCount); err != nil {
+		if err := c.maybeValidate(" after detaching instance", c.ValidateCount, cloudMember.CloudInstanceGroup); err != nil {
 			return err
 		}
 	}
diff --git a/pkg/instancegroups/rollingupdate_test.go b/pkg/instancegroups/rollingupdate_test.go
index cafc42f30b..441ec51a53 100644
--- a/pkg/instancegroups/rollingupdate_test.go
+++ b/pkg/instancegroups/rollingupdate_test.go
@@ -105,6 +105,24 @@ func (*erroringClusterValidator) Validate() (*validation.ValidationCluster, erro
 	return nil, errors.New("testing validation error")
 }
 
+// instanceGroupNodeSpecificErrorClusterValidator reports a validation failure attributed to a specific instance group
+type instanceGroupNodeSpecificErrorClusterValidator struct {
+	InstanceGroup *kopsapi.InstanceGroup
+}
+
+func (igErrorValidator *instanceGroupNodeSpecificErrorClusterValidator) Validate() (*validation.ValidationCluster, error) {
+	return &validation.ValidationCluster{
+		Failures: []*validation.ValidationError{
+			{
+				Kind:          "testing",
+				Name:          "testing failure",
+				Message:       "testing failure",
+				InstanceGroup: igErrorValidator.InstanceGroup,
+			},
+		},
+	}, nil
+}
+
 type assertNotCalledClusterValidator struct {
 	T *testing.T
 }
@@ -458,6 +476,96 @@ func TestRollingUpdateClusterErrorsValidationAfterOneMaster(t *testing.T) {
 	assertGroupInstanceCount(t, cloud, "bastion-1", 0)
 }
 
+func TestRollingUpdateNonRelatedInstanceGroupFailure(t *testing.T) {
+
+	c, cloud := getTestSetup()
+
+	groups := make(map[string]*cloudinstances.CloudInstanceGroup)
+	makeGroup(groups, c.K8sClient, cloud, "node-1", kopsapi.InstanceGroupRoleNode, 3, 3)
+	makeGroup(groups, c.K8sClient, cloud, "node-2", 
kopsapi.InstanceGroupRoleNode, 3, 0) + makeGroup(groups, c.K8sClient, cloud, "master-1", kopsapi.InstanceGroupRoleMaster, 2, 0) + makeGroup(groups, c.K8sClient, cloud, "bastion-1", kopsapi.InstanceGroupRoleBastion, 1, 0) + + c.ClusterValidator = &instanceGroupNodeSpecificErrorClusterValidator{ + InstanceGroup: groups["node-2"].InstanceGroup, + } + + err := c.RollingUpdate(groups, &kopsapi.InstanceGroupList{}) + assert.NoError(t, err, "rolling update") + + assertGroupInstanceCount(t, cloud, "node-1", 0) + assertGroupInstanceCount(t, cloud, "node-2", 3) + assertGroupInstanceCount(t, cloud, "master-1", 2) + assertGroupInstanceCount(t, cloud, "bastion-1", 1) +} + +func TestRollingUpdateRelatedInstanceGroupFailure(t *testing.T) { + + c, cloud := getTestSetup() + + groups := make(map[string]*cloudinstances.CloudInstanceGroup) + makeGroup(groups, c.K8sClient, cloud, "node-1", kopsapi.InstanceGroupRoleNode, 3, 3) + makeGroup(groups, c.K8sClient, cloud, "node-2", kopsapi.InstanceGroupRoleNode, 3, 0) + makeGroup(groups, c.K8sClient, cloud, "master-1", kopsapi.InstanceGroupRoleMaster, 2, 0) + makeGroup(groups, c.K8sClient, cloud, "bastion-1", kopsapi.InstanceGroupRoleBastion, 1, 0) + + c.ClusterValidator = &instanceGroupNodeSpecificErrorClusterValidator{ + InstanceGroup: groups["node-1"].InstanceGroup, + } + + err := c.RollingUpdate(groups, &kopsapi.InstanceGroupList{}) + assert.Error(t, err, "rolling update") + + assertGroupInstanceCount(t, cloud, "node-1", 3) + assertGroupInstanceCount(t, cloud, "node-2", 3) + assertGroupInstanceCount(t, cloud, "master-1", 2) + assertGroupInstanceCount(t, cloud, "bastion-1", 1) +} + +func TestRollingUpdateMasterGroupFailure(t *testing.T) { + c, cloud := getTestSetup() + + groups := make(map[string]*cloudinstances.CloudInstanceGroup) + makeGroup(groups, c.K8sClient, cloud, "node-1", kopsapi.InstanceGroupRoleNode, 3, 3) + makeGroup(groups, c.K8sClient, cloud, "node-2", kopsapi.InstanceGroupRoleNode, 3, 0) + makeGroup(groups, c.K8sClient, cloud, "master-1", kopsapi.InstanceGroupRoleMaster, 2, 0) + makeGroup(groups, c.K8sClient, cloud, "bastion-1", kopsapi.InstanceGroupRoleBastion, 1, 0) + + c.ClusterValidator = &instanceGroupNodeSpecificErrorClusterValidator{ + InstanceGroup: groups["master-1"].InstanceGroup, + } + + err := c.RollingUpdate(groups, &kopsapi.InstanceGroupList{}) + assert.Error(t, err, "rolling update") + + assertGroupInstanceCount(t, cloud, "node-1", 3) + assertGroupInstanceCount(t, cloud, "node-2", 3) + assertGroupInstanceCount(t, cloud, "master-1", 2) + assertGroupInstanceCount(t, cloud, "bastion-1", 1) +} + +func TestRollingUpdateValidationErrorInstanceGroupNil(t *testing.T) { + c, cloud := getTestSetup() + + groups := make(map[string]*cloudinstances.CloudInstanceGroup) + makeGroup(groups, c.K8sClient, cloud, "node-1", kopsapi.InstanceGroupRoleNode, 3, 3) + makeGroup(groups, c.K8sClient, cloud, "node-2", kopsapi.InstanceGroupRoleNode, 3, 0) + makeGroup(groups, c.K8sClient, cloud, "master-1", kopsapi.InstanceGroupRoleMaster, 2, 0) + makeGroup(groups, c.K8sClient, cloud, "bastion-1", kopsapi.InstanceGroupRoleBastion, 1, 0) + + c.ClusterValidator = &instanceGroupNodeSpecificErrorClusterValidator{ + InstanceGroup: nil, + } + + err := c.RollingUpdate(groups, &kopsapi.InstanceGroupList{}) + assert.Error(t, err, "rolling update") + + assertGroupInstanceCount(t, cloud, "node-1", 3) + assertGroupInstanceCount(t, cloud, "node-2", 3) + assertGroupInstanceCount(t, cloud, "master-1", 2) + assertGroupInstanceCount(t, cloud, "bastion-1", 1) +} + func 
TestRollingUpdateClusterFailsValidationAfterOneNode(t *testing.T) { c, cloud := getTestSetup() diff --git a/pkg/validation/validate_cluster.go b/pkg/validation/validate_cluster.go index ba4131eda2..9ceccbbcaf 100644 --- a/pkg/validation/validate_cluster.go +++ b/pkg/validation/validate_cluster.go @@ -25,6 +25,7 @@ import ( "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/tools/pager" + "k8s.io/kops/pkg/apis/kops" "k8s.io/kops/upup/pkg/fi" v1 "k8s.io/api/core/v1" @@ -32,7 +33,6 @@ import ( "k8s.io/client-go/kubernetes" "k8s.io/client-go/tools/clientcmd" "k8s.io/klog/v2" - "k8s.io/kops/pkg/apis/kops" "k8s.io/kops/pkg/cloudinstances" "k8s.io/kops/pkg/dns" ) @@ -49,6 +49,8 @@ type ValidationError struct { Kind string `json:"type,omitempty"` Name string `json:"name,omitempty"` Message string `json:"message,omitempty"` + // The InstanceGroup field is used to indicate which instance group this validation error is coming from + InstanceGroup *kops.InstanceGroup `json:"instanceGroup,omitempty"` } type ClusterValidator interface { @@ -172,7 +174,7 @@ func (v *clusterValidatorImpl) Validate() (*ValidationCluster, error) { return nil, fmt.Errorf("cannot get component status for %q: %v", clusterName, err) } - if err := validation.collectPodFailures(ctx, v.k8sClient, readyNodes); err != nil { + if err := validation.collectPodFailures(ctx, v.k8sClient, readyNodes, v.instanceGroups); err != nil { return nil, fmt.Errorf("cannot get pod health for %q: %v", clusterName, err) } @@ -185,6 +187,7 @@ func (v *ValidationCluster) collectComponentFailures(ctx context.Context, client return fmt.Errorf("error listing ComponentStatuses: %v", err) } + // TODO: Add logic to figure out the InstanceGroup given a component for _, component := range componentList.Items { for _, condition := range component.Conditions { if condition.Status != v1.ConditionTrue { @@ -205,9 +208,11 @@ var masterStaticPods = []string{ "kube-scheduler", } -func (v *ValidationCluster) collectPodFailures(ctx context.Context, client kubernetes.Interface, nodes []v1.Node) error { +func (v *ValidationCluster) collectPodFailures(ctx context.Context, client kubernetes.Interface, nodes []v1.Node, + groups []*kops.InstanceGroup) error { masterWithoutPod := map[string]map[string]bool{} nodeByAddress := map[string]string{} + for _, node := range nodes { labels := node.GetLabels() if labels != nil && labels["kubernetes.io/role"] == "master" { @@ -238,6 +243,8 @@ func (v *ValidationCluster) collectPodFailures(ctx context.Context, client kuber if pod.Status.Phase == v1.PodSucceeded { return nil } + + // TODO: Add logic to figure out the InstanceGroup given a pod if pod.Status.Phase == v1.PodPending { v.addError(&ValidationError{ Kind: "Pod", @@ -311,6 +318,7 @@ func (v *ValidationCluster) validateNodes(cloudGroups map[string]*cloudinstances cloudGroup.InstanceGroup.Name, numNodes, cloudGroup.TargetSize), + InstanceGroup: cloudGroup.InstanceGroup, }) } @@ -326,9 +334,10 @@ func (v *ValidationCluster) validateNodes(cloudGroups map[string]*cloudinstances if nodeExpectedToJoin { v.addError(&ValidationError{ - Kind: "Machine", - Name: member.ID, - Message: fmt.Sprintf("machine %q has not yet joined cluster", member.ID), + Kind: "Machine", + Name: member.ID, + Message: fmt.Sprintf("machine %q has not yet joined cluster", member.ID), + InstanceGroup: cloudGroup.InstanceGroup, }) } continue @@ -358,9 +367,10 @@ func (v *ValidationCluster) validateNodes(cloudGroups map[string]*cloudinstances if n.Role == "master" { if !ready { v.addError(&ValidationError{ - Kind: 
"Node", - Name: node.Name, - Message: fmt.Sprintf("master %q is not ready", node.Name), + Kind: "Node", + Name: node.Name, + Message: fmt.Sprintf("master %q is not ready", node.Name), + InstanceGroup: cloudGroup.InstanceGroup, }) } @@ -368,9 +378,10 @@ func (v *ValidationCluster) validateNodes(cloudGroups map[string]*cloudinstances } else if n.Role == "node" { if !ready { v.addError(&ValidationError{ - Kind: "Node", - Name: node.Name, - Message: fmt.Sprintf("node %q is not ready", node.Name), + Kind: "Node", + Name: node.Name, + Message: fmt.Sprintf("node %q is not ready", node.Name), + InstanceGroup: cloudGroup.InstanceGroup, }) } @@ -384,9 +395,10 @@ func (v *ValidationCluster) validateNodes(cloudGroups map[string]*cloudinstances for _, ig := range groups { if !groupsSeen[ig.Name] { v.addError(&ValidationError{ - Kind: "InstanceGroup", - Name: ig.Name, - Message: fmt.Sprintf("InstanceGroup %q is missing from the cloud provider", ig.Name), + Kind: "InstanceGroup", + Name: ig.Name, + Message: fmt.Sprintf("InstanceGroup %q is missing from the cloud provider", ig.Name), + InstanceGroup: ig, }) } } diff --git a/pkg/validation/validate_cluster_test.go b/pkg/validation/validate_cluster_test.go index 223060526c..6b64a1b5d6 100644 --- a/pkg/validation/validate_cluster_test.go +++ b/pkg/validation/validate_cluster_test.go @@ -132,6 +132,7 @@ func Test_ValidateCloudGroupMissing(t *testing.T) { cluster := &kopsapi.Cluster{ ObjectMeta: metav1.ObjectMeta{Name: "testcluster.k8s.local"}, } + instanceGroups := []kopsapi.InstanceGroup{ { ObjectMeta: metav1.ObjectMeta{ @@ -151,9 +152,10 @@ func Test_ValidateCloudGroupMissing(t *testing.T) { require.NoError(t, err) if !assert.Len(t, v.Failures, 1) || !assert.Equal(t, &ValidationError{ - Kind: "InstanceGroup", - Name: "node-1", - Message: "InstanceGroup \"node-1\" is missing from the cloud provider", + Kind: "InstanceGroup", + Name: "node-1", + Message: "InstanceGroup \"node-1\" is missing from the cloud provider", + InstanceGroup: &instanceGroups[0], }, v.Failures[0]) { printDebug(t, v) } @@ -204,9 +206,10 @@ func Test_ValidateNodesNotEnough(t *testing.T) { require.NoError(t, err) if !assert.Len(t, v.Failures, 1) || !assert.Equal(t, &ValidationError{ - Kind: "InstanceGroup", - Name: "node-1", - Message: "InstanceGroup \"node-1\" did not have enough nodes 2 vs 3", + Kind: "InstanceGroup", + Name: "node-1", + Message: "InstanceGroup \"node-1\" did not have enough nodes 2 vs 3", + InstanceGroup: groups["node-1"].InstanceGroup, }, v.Failures[0]) { printDebug(t, v) } @@ -258,9 +261,10 @@ func Test_ValidateDetachedNodesDontCount(t *testing.T) { require.NoError(t, err) if !assert.Len(t, v.Failures, 1) || !assert.Equal(t, &ValidationError{ - Kind: "InstanceGroup", - Name: "node-1", - Message: "InstanceGroup \"node-1\" did not have enough nodes 1 vs 2", + Kind: "InstanceGroup", + Name: "node-1", + Message: "InstanceGroup \"node-1\" did not have enough nodes 1 vs 2", + InstanceGroup: groups["node-1"].InstanceGroup, }, v.Failures[0]) { printDebug(t, v) } @@ -311,9 +315,10 @@ func Test_ValidateNodeNotReady(t *testing.T) { require.NoError(t, err) if !assert.Len(t, v.Failures, 1) || !assert.Equal(t, &ValidationError{ - Kind: "Node", - Name: "node-1b", - Message: "node \"node-1b\" is not ready", + Kind: "Node", + Name: "node-1b", + Message: "node \"node-1b\" is not ready", + InstanceGroup: groups["node-1"].InstanceGroup, }, v.Failures[0]) { printDebug(t, v) } @@ -364,9 +369,10 @@ func Test_ValidateMastersNotEnough(t *testing.T) { require.NoError(t, err) if !assert.Len(t, 
v.Failures, 1) || !assert.Equal(t, &ValidationError{ - Kind: "InstanceGroup", - Name: "master-1", - Message: "InstanceGroup \"master-1\" did not have enough nodes 2 vs 3", + Kind: "InstanceGroup", + Name: "master-1", + Message: "InstanceGroup \"master-1\" did not have enough nodes 2 vs 3", + InstanceGroup: groups["node-1"].InstanceGroup, }, v.Failures[0]) { printDebug(t, v) } @@ -417,9 +423,10 @@ func Test_ValidateMasterNotReady(t *testing.T) { require.NoError(t, err) if !assert.Len(t, v.Failures, 1) || !assert.Equal(t, &ValidationError{ - Kind: "Node", - Name: "master-1b", - Message: "master \"master-1b\" is not ready", + Kind: "Node", + Name: "master-1b", + Message: "master \"master-1b\" is not ready", + InstanceGroup: groups["node-1"].InstanceGroup, }, v.Failures[0]) { printDebug(t, v) } @@ -504,9 +511,10 @@ func Test_ValidateMasterStaticPods(t *testing.T) { var podList []map[string]string expectedFailures := []*ValidationError{ { - Kind: "Node", - Name: "master-1c", - Message: "master \"master-1c\" is not ready", + Kind: "Node", + Name: "master-1c", + Message: "master \"master-1c\" is not ready", + InstanceGroup: groups["node-1"].InstanceGroup, }, }
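Note: the filtering rule introduced by hasFailureRelevantToGroup can be illustrated with the small, self-contained sketch below. The types used here are simplified stand-ins for the kops types touched in this patch (validation.ValidationError, cloudinstances.CloudInstanceGroup, kops.InstanceGroup), not the real API. The decision logic mirrors the function added in instancegroups.go: a failure blocks the rolling update when its instance group is unknown (nil), belongs to a master group, or belongs to the group currently being updated.

package main

import "fmt"

// Simplified stand-ins for the kops types referenced in the patch (assumed shapes, not the real API).
type instanceGroup struct {
	Name string
	Role string // "Master", "Node", ...
}

func (ig *instanceGroup) isMaster() bool { return ig.Role == "Master" }

type validationError struct {
	Kind, Name, Message string
	InstanceGroup       *instanceGroup // nil when the owning group cannot be determined
}

type cloudInstanceGroup struct {
	InstanceGroup *instanceGroup
}

// hasFailureRelevantToGroup mirrors the logic added in instancegroups.go: a failure matters if its
// group is unknown, is a master group, or is the group being rolled.
func hasFailureRelevantToGroup(failures []*validationError, group *cloudInstanceGroup) bool {
	for _, failure := range failures {
		if failure.InstanceGroup == nil {
			return true
		}
		if failure.InstanceGroup.isMaster() || failure.InstanceGroup == group.InstanceGroup {
			return true
		}
	}
	return false
}

func main() {
	node1 := &instanceGroup{Name: "node-1", Role: "Node"}
	node2 := &instanceGroup{Name: "node-2", Role: "Node"}
	master1 := &instanceGroup{Name: "master-1", Role: "Master"}
	updating := &cloudInstanceGroup{InstanceGroup: node1}

	// A failure in an unrelated node group does not block the update of node-1.
	fmt.Println(hasFailureRelevantToGroup([]*validationError{{InstanceGroup: node2}}, updating)) // false
	// A failure in the group being updated, in a master group, or with an unknown group does.
	fmt.Println(hasFailureRelevantToGroup([]*validationError{{InstanceGroup: node1}}, updating))   // true
	fmt.Println(hasFailureRelevantToGroup([]*validationError{{InstanceGroup: master1}}, updating)) // true
	fmt.Println(hasFailureRelevantToGroup([]*validationError{{InstanceGroup: nil}}, updating))     // true
}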