Merge pull request #10065 from bharath-123/feature/instancegroup-specific-validation

Avoid waiting on validation during rolling update for inapplicable instance groups
This commit is contained in:
Kubernetes Prow Robot 2020-11-05 22:38:50 -08:00 committed by GitHub
commit 7b26ec4b6d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 196 additions and 46 deletions

View File

@ -36,6 +36,7 @@ import (
"k8s.io/klog/v2"
api "k8s.io/kops/pkg/apis/kops"
"k8s.io/kops/pkg/cloudinstances"
"k8s.io/kops/pkg/validation"
"k8s.io/kubectl/pkg/drain"
)
@ -92,7 +93,7 @@ func (c *RollingUpdateCluster) rollingUpdateInstanceGroup(group *cloudinstances.
if isBastion {
klog.V(3).Info("Not validating the cluster as instance is a bastion.")
} else if err = c.maybeValidate("", 1); err != nil {
} else if err = c.maybeValidate("", 1, group); err != nil {
return err
}
@ -147,7 +148,7 @@ func (c *RollingUpdateCluster) rollingUpdateInstanceGroup(group *cloudinstances.
klog.Infof("waiting for %v after detaching instance", sleepAfterTerminate)
time.Sleep(sleepAfterTerminate)
if err := c.maybeValidate(" after detaching instance", c.ValidateCount); err != nil {
if err := c.maybeValidate(" after detaching instance", c.ValidateCount, group); err != nil {
return err
}
noneReady = false
@ -181,7 +182,7 @@ func (c *RollingUpdateCluster) rollingUpdateInstanceGroup(group *cloudinstances.
return waitForPendingBeforeReturningError(runningDrains, terminateChan, err)
}
err = c.maybeValidate(" after terminating instance", c.ValidateCount)
err = c.maybeValidate(" after terminating instance", c.ValidateCount, group)
if err != nil {
return waitForPendingBeforeReturningError(runningDrains, terminateChan, err)
}
@ -227,7 +228,7 @@ func (c *RollingUpdateCluster) rollingUpdateInstanceGroup(group *cloudinstances.
}
}
err = c.maybeValidate(" after terminating instance", c.ValidateCount)
err = c.maybeValidate(" after terminating instance", c.ValidateCount, group)
if err != nil {
return err
}
@ -410,14 +411,14 @@ func (c *RollingUpdateCluster) reconcileInstanceGroup() error {
}
func (c *RollingUpdateCluster) maybeValidate(operation string, validateCount int) error {
func (c *RollingUpdateCluster) maybeValidate(operation string, validateCount int, group *cloudinstances.CloudInstanceGroup) error {
if c.CloudOnly {
klog.Warningf("Not validating cluster as cloudonly flag is set.")
} else {
klog.Info("Validating the cluster.")
if err := c.validateClusterWithTimeout(validateCount); err != nil {
if err := c.validateClusterWithTimeout(validateCount, group); err != nil {
if c.FailOnValidate {
klog.Errorf("Cluster did not validate within %s", c.ValidationTimeout)
@ -431,7 +432,7 @@ func (c *RollingUpdateCluster) maybeValidate(operation string, validateCount int
}
// validateClusterWithTimeout runs validation.ValidateCluster until either we get positive result or the timeout expires
func (c *RollingUpdateCluster) validateClusterWithTimeout(validateCount int) error {
func (c *RollingUpdateCluster) validateClusterWithTimeout(validateCount int, group *cloudinstances.CloudInstanceGroup) error {
ctx, cancel := context.WithTimeout(context.Background(), c.ValidationTimeout)
defer cancel()
@ -445,7 +446,7 @@ func (c *RollingUpdateCluster) validateClusterWithTimeout(validateCount int) err
for {
// Note that we validate at least once before checking the timeout, in case the cluster is healthy with a short timeout
result, err := c.ClusterValidator.Validate()
if err == nil && len(result.Failures) == 0 {
if err == nil && !hasFailureRelevantToGroup(result.Failures, group) {
successCount++
if successCount >= validateCount {
klog.Info("Cluster validated.")
@ -477,7 +478,7 @@ func (c *RollingUpdateCluster) validateClusterWithTimeout(validateCount int) err
// Reset the success count; we want N consecutive successful validations
successCount = 0
// Wait before retrying
// Wait before retrying in some cases
// TODO: Should we check if we have enough time left before the deadline?
time.Sleep(c.ValidateTickDuration)
}
@ -485,6 +486,27 @@ func (c *RollingUpdateCluster) validateClusterWithTimeout(validateCount int) err
return fmt.Errorf("cluster did not validate within a duration of %q", c.ValidationTimeout)
}
// hasFailureRelevantToGroup reports whether any of the given validation
// failures should block the rolling update of the given instance group.
// Non-critical failures in other instance groups (such as "below target
// size" errors) are ignored so they do not stall an unrelated update.
func hasFailureRelevantToGroup(failures []*validation.ValidationError, group *cloudinstances.CloudInstanceGroup) bool {
	for _, failure := range failures {
		// Determining the InstanceGroup for certain resources, such as Pods and
		// ComponentStatuses, is not straightforward. Until we can determine the
		// InstanceGroup for these resources without ambiguity, their failures
		// carry a nil InstanceGroup and are conservatively treated as relevant.
		if failure.InstanceGroup == nil {
			return true
		}

		// A failing master has cluster-wide impact, so it is always relevant.
		if failure.InstanceGroup.IsMaster() {
			return true
		}

		// A failure in the group currently being rolled is relevant. Compare by
		// name rather than pointer identity so that a failure reported with a
		// distinct (but equivalent) InstanceGroup object is still recognized.
		if failure.InstanceGroup.Name == group.InstanceGroup.Name {
			return true
		}
	}

	return false
}
// detachInstance detaches a Cloud Instance
func (c *RollingUpdateCluster) detachInstance(u *cloudinstances.CloudInstance) error {
id := u.ID
@ -608,7 +630,7 @@ func (c *RollingUpdateCluster) UpdateSingleInstance(cloudMember *cloudinstances.
if err != nil {
return fmt.Errorf("failed to detach instance: %v", err)
}
if err := c.maybeValidate(" after detaching instance", c.ValidateCount); err != nil {
if err := c.maybeValidate(" after detaching instance", c.ValidateCount, cloudMember.CloudInstanceGroup); err != nil {
return err
}
}

View File

@ -105,6 +105,24 @@ func (*erroringClusterValidator) Validate() (*validation.ValidationCluster, erro
return nil, errors.New("testing validation error")
}
// instanceGroupNodeSpecificErrorClusterValidator is a test double that always
// reports exactly one validation failure attributed to the configured
// InstanceGroup, simulating a problem confined to that group.
type instanceGroupNodeSpecificErrorClusterValidator struct {
	InstanceGroup *kopsapi.InstanceGroup
}

// Validate returns a single failure scoped to v.InstanceGroup and no error.
func (v *instanceGroupNodeSpecificErrorClusterValidator) Validate() (*validation.ValidationCluster, error) {
	failure := &validation.ValidationError{
		Kind:          "testing",
		Name:          "testing failure",
		Message:       "testing failure",
		InstanceGroup: v.InstanceGroup,
	}
	return &validation.ValidationCluster{
		Failures: []*validation.ValidationError{failure},
	}, nil
}
type assertNotCalledClusterValidator struct {
T *testing.T
}
@ -458,6 +476,96 @@ func TestRollingUpdateClusterErrorsValidationAfterOneMaster(t *testing.T) {
assertGroupInstanceCount(t, cloud, "bastion-1", 0)
}
// TestRollingUpdateNonRelatedInstanceGroupFailure verifies that a validation
// failure scoped to a different node instance group ("node-2") does not block
// the rolling update of the group being rolled ("node-1").
func TestRollingUpdateNonRelatedInstanceGroupFailure(t *testing.T) {
	c, cloud := getTestSetup()

	groups := map[string]*cloudinstances.CloudInstanceGroup{}
	makeGroup(groups, c.K8sClient, cloud, "node-1", kopsapi.InstanceGroupRoleNode, 3, 3)
	makeGroup(groups, c.K8sClient, cloud, "node-2", kopsapi.InstanceGroupRoleNode, 3, 0)
	makeGroup(groups, c.K8sClient, cloud, "master-1", kopsapi.InstanceGroupRoleMaster, 2, 0)
	makeGroup(groups, c.K8sClient, cloud, "bastion-1", kopsapi.InstanceGroupRoleBastion, 1, 0)

	// Fail validation only for node-2, which is not the group being rolled.
	c.ClusterValidator = &instanceGroupNodeSpecificErrorClusterValidator{
		InstanceGroup: groups["node-2"].InstanceGroup,
	}

	err := c.RollingUpdate(groups, &kopsapi.InstanceGroupList{})
	assert.NoError(t, err, "rolling update")

	// node-1's outdated instances were replaced; all other groups are untouched.
	assertGroupInstanceCount(t, cloud, "node-1", 0)
	assertGroupInstanceCount(t, cloud, "node-2", 3)
	assertGroupInstanceCount(t, cloud, "master-1", 2)
	assertGroupInstanceCount(t, cloud, "bastion-1", 1)
}
// TestRollingUpdateRelatedInstanceGroupFailure verifies that a validation
// failure scoped to the instance group currently being rolled ("node-1")
// causes the rolling update to fail and leaves the instances in place.
func TestRollingUpdateRelatedInstanceGroupFailure(t *testing.T) {
	c, cloud := getTestSetup()

	groups := map[string]*cloudinstances.CloudInstanceGroup{}
	makeGroup(groups, c.K8sClient, cloud, "node-1", kopsapi.InstanceGroupRoleNode, 3, 3)
	makeGroup(groups, c.K8sClient, cloud, "node-2", kopsapi.InstanceGroupRoleNode, 3, 0)
	makeGroup(groups, c.K8sClient, cloud, "master-1", kopsapi.InstanceGroupRoleMaster, 2, 0)
	makeGroup(groups, c.K8sClient, cloud, "bastion-1", kopsapi.InstanceGroupRoleBastion, 1, 0)

	// Fail validation for node-1 — the very group being rolled.
	c.ClusterValidator = &instanceGroupNodeSpecificErrorClusterValidator{
		InstanceGroup: groups["node-1"].InstanceGroup,
	}

	err := c.RollingUpdate(groups, &kopsapi.InstanceGroupList{})
	assert.Error(t, err, "rolling update")

	// The failure is relevant, so no group loses instances.
	assertGroupInstanceCount(t, cloud, "node-1", 3)
	assertGroupInstanceCount(t, cloud, "node-2", 3)
	assertGroupInstanceCount(t, cloud, "master-1", 2)
	assertGroupInstanceCount(t, cloud, "bastion-1", 1)
}
// TestRollingUpdateMasterGroupFailure verifies that a validation failure in a
// master instance group is treated as cluster-wide and fails the rolling
// update even though a node group ("node-1") is the one being rolled.
func TestRollingUpdateMasterGroupFailure(t *testing.T) {
	c, cloud := getTestSetup()

	groups := map[string]*cloudinstances.CloudInstanceGroup{}
	makeGroup(groups, c.K8sClient, cloud, "node-1", kopsapi.InstanceGroupRoleNode, 3, 3)
	makeGroup(groups, c.K8sClient, cloud, "node-2", kopsapi.InstanceGroupRoleNode, 3, 0)
	makeGroup(groups, c.K8sClient, cloud, "master-1", kopsapi.InstanceGroupRoleMaster, 2, 0)
	makeGroup(groups, c.K8sClient, cloud, "bastion-1", kopsapi.InstanceGroupRoleBastion, 1, 0)

	// Fail validation for the master group: cluster-wide impact.
	c.ClusterValidator = &instanceGroupNodeSpecificErrorClusterValidator{
		InstanceGroup: groups["master-1"].InstanceGroup,
	}

	err := c.RollingUpdate(groups, &kopsapi.InstanceGroupList{})
	assert.Error(t, err, "rolling update")

	// The failure is relevant everywhere, so no group loses instances.
	assertGroupInstanceCount(t, cloud, "node-1", 3)
	assertGroupInstanceCount(t, cloud, "node-2", 3)
	assertGroupInstanceCount(t, cloud, "master-1", 2)
	assertGroupInstanceCount(t, cloud, "bastion-1", 1)
}
// TestRollingUpdateValidationErrorInstanceGroupNil verifies that a validation
// failure with a nil InstanceGroup (i.e. one that cannot be attributed to a
// specific group) is conservatively treated as relevant and fails the update.
func TestRollingUpdateValidationErrorInstanceGroupNil(t *testing.T) {
	c, cloud := getTestSetup()

	groups := map[string]*cloudinstances.CloudInstanceGroup{}
	makeGroup(groups, c.K8sClient, cloud, "node-1", kopsapi.InstanceGroupRoleNode, 3, 3)
	makeGroup(groups, c.K8sClient, cloud, "node-2", kopsapi.InstanceGroupRoleNode, 3, 0)
	makeGroup(groups, c.K8sClient, cloud, "master-1", kopsapi.InstanceGroupRoleMaster, 2, 0)
	makeGroup(groups, c.K8sClient, cloud, "bastion-1", kopsapi.InstanceGroupRoleBastion, 1, 0)

	// A failure that cannot be attributed to any instance group.
	c.ClusterValidator = &instanceGroupNodeSpecificErrorClusterValidator{
		InstanceGroup: nil,
	}

	err := c.RollingUpdate(groups, &kopsapi.InstanceGroupList{})
	assert.Error(t, err, "rolling update")

	// The failure is treated as cluster-wide, so no group loses instances.
	assertGroupInstanceCount(t, cloud, "node-1", 3)
	assertGroupInstanceCount(t, cloud, "node-2", 3)
	assertGroupInstanceCount(t, cloud, "master-1", 2)
	assertGroupInstanceCount(t, cloud, "bastion-1", 1)
}
func TestRollingUpdateClusterFailsValidationAfterOneNode(t *testing.T) {
c, cloud := getTestSetup()

View File

@ -25,6 +25,7 @@ import (
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/tools/pager"
"k8s.io/kops/pkg/apis/kops"
"k8s.io/kops/upup/pkg/fi"
v1 "k8s.io/api/core/v1"
@ -32,7 +33,6 @@ import (
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/tools/clientcmd"
"k8s.io/klog/v2"
"k8s.io/kops/pkg/apis/kops"
"k8s.io/kops/pkg/cloudinstances"
"k8s.io/kops/pkg/dns"
)
@ -49,6 +49,8 @@ type ValidationError struct {
Kind string `json:"type,omitempty"`
Name string `json:"name,omitempty"`
Message string `json:"message,omitempty"`
// The InstanceGroup field is used to indicate which instance group this validation error is coming from
InstanceGroup *kops.InstanceGroup `json:"instanceGroup,omitempty"`
}
type ClusterValidator interface {
@ -172,7 +174,7 @@ func (v *clusterValidatorImpl) Validate() (*ValidationCluster, error) {
return nil, fmt.Errorf("cannot get component status for %q: %v", clusterName, err)
}
if err := validation.collectPodFailures(ctx, v.k8sClient, readyNodes); err != nil {
if err := validation.collectPodFailures(ctx, v.k8sClient, readyNodes, v.instanceGroups); err != nil {
return nil, fmt.Errorf("cannot get pod health for %q: %v", clusterName, err)
}
@ -185,6 +187,7 @@ func (v *ValidationCluster) collectComponentFailures(ctx context.Context, client
return fmt.Errorf("error listing ComponentStatuses: %v", err)
}
// TODO: Add logic to figure out the InstanceGroup given a component
for _, component := range componentList.Items {
for _, condition := range component.Conditions {
if condition.Status != v1.ConditionTrue {
@ -205,9 +208,11 @@ var masterStaticPods = []string{
"kube-scheduler",
}
func (v *ValidationCluster) collectPodFailures(ctx context.Context, client kubernetes.Interface, nodes []v1.Node) error {
func (v *ValidationCluster) collectPodFailures(ctx context.Context, client kubernetes.Interface, nodes []v1.Node,
groups []*kops.InstanceGroup) error {
masterWithoutPod := map[string]map[string]bool{}
nodeByAddress := map[string]string{}
for _, node := range nodes {
labels := node.GetLabels()
if labels != nil && labels["kubernetes.io/role"] == "master" {
@ -238,6 +243,8 @@ func (v *ValidationCluster) collectPodFailures(ctx context.Context, client kuber
if pod.Status.Phase == v1.PodSucceeded {
return nil
}
// TODO: Add logic to figure out the InstanceGroup given a pod
if pod.Status.Phase == v1.PodPending {
v.addError(&ValidationError{
Kind: "Pod",
@ -311,6 +318,7 @@ func (v *ValidationCluster) validateNodes(cloudGroups map[string]*cloudinstances
cloudGroup.InstanceGroup.Name,
numNodes,
cloudGroup.TargetSize),
InstanceGroup: cloudGroup.InstanceGroup,
})
}
@ -326,9 +334,10 @@ func (v *ValidationCluster) validateNodes(cloudGroups map[string]*cloudinstances
if nodeExpectedToJoin {
v.addError(&ValidationError{
Kind: "Machine",
Name: member.ID,
Message: fmt.Sprintf("machine %q has not yet joined cluster", member.ID),
Kind: "Machine",
Name: member.ID,
Message: fmt.Sprintf("machine %q has not yet joined cluster", member.ID),
InstanceGroup: cloudGroup.InstanceGroup,
})
}
continue
@ -358,9 +367,10 @@ func (v *ValidationCluster) validateNodes(cloudGroups map[string]*cloudinstances
if n.Role == "master" {
if !ready {
v.addError(&ValidationError{
Kind: "Node",
Name: node.Name,
Message: fmt.Sprintf("master %q is not ready", node.Name),
Kind: "Node",
Name: node.Name,
Message: fmt.Sprintf("master %q is not ready", node.Name),
InstanceGroup: cloudGroup.InstanceGroup,
})
}
@ -368,9 +378,10 @@ func (v *ValidationCluster) validateNodes(cloudGroups map[string]*cloudinstances
} else if n.Role == "node" {
if !ready {
v.addError(&ValidationError{
Kind: "Node",
Name: node.Name,
Message: fmt.Sprintf("node %q is not ready", node.Name),
Kind: "Node",
Name: node.Name,
Message: fmt.Sprintf("node %q is not ready", node.Name),
InstanceGroup: cloudGroup.InstanceGroup,
})
}
@ -384,9 +395,10 @@ func (v *ValidationCluster) validateNodes(cloudGroups map[string]*cloudinstances
for _, ig := range groups {
if !groupsSeen[ig.Name] {
v.addError(&ValidationError{
Kind: "InstanceGroup",
Name: ig.Name,
Message: fmt.Sprintf("InstanceGroup %q is missing from the cloud provider", ig.Name),
Kind: "InstanceGroup",
Name: ig.Name,
Message: fmt.Sprintf("InstanceGroup %q is missing from the cloud provider", ig.Name),
InstanceGroup: ig,
})
}
}

View File

@ -132,6 +132,7 @@ func Test_ValidateCloudGroupMissing(t *testing.T) {
cluster := &kopsapi.Cluster{
ObjectMeta: metav1.ObjectMeta{Name: "testcluster.k8s.local"},
}
instanceGroups := []kopsapi.InstanceGroup{
{
ObjectMeta: metav1.ObjectMeta{
@ -151,9 +152,10 @@ func Test_ValidateCloudGroupMissing(t *testing.T) {
require.NoError(t, err)
if !assert.Len(t, v.Failures, 1) ||
!assert.Equal(t, &ValidationError{
Kind: "InstanceGroup",
Name: "node-1",
Message: "InstanceGroup \"node-1\" is missing from the cloud provider",
Kind: "InstanceGroup",
Name: "node-1",
Message: "InstanceGroup \"node-1\" is missing from the cloud provider",
InstanceGroup: &instanceGroups[0],
}, v.Failures[0]) {
printDebug(t, v)
}
@ -204,9 +206,10 @@ func Test_ValidateNodesNotEnough(t *testing.T) {
require.NoError(t, err)
if !assert.Len(t, v.Failures, 1) ||
!assert.Equal(t, &ValidationError{
Kind: "InstanceGroup",
Name: "node-1",
Message: "InstanceGroup \"node-1\" did not have enough nodes 2 vs 3",
Kind: "InstanceGroup",
Name: "node-1",
Message: "InstanceGroup \"node-1\" did not have enough nodes 2 vs 3",
InstanceGroup: groups["node-1"].InstanceGroup,
}, v.Failures[0]) {
printDebug(t, v)
}
@ -258,9 +261,10 @@ func Test_ValidateDetachedNodesDontCount(t *testing.T) {
require.NoError(t, err)
if !assert.Len(t, v.Failures, 1) ||
!assert.Equal(t, &ValidationError{
Kind: "InstanceGroup",
Name: "node-1",
Message: "InstanceGroup \"node-1\" did not have enough nodes 1 vs 2",
Kind: "InstanceGroup",
Name: "node-1",
Message: "InstanceGroup \"node-1\" did not have enough nodes 1 vs 2",
InstanceGroup: groups["node-1"].InstanceGroup,
}, v.Failures[0]) {
printDebug(t, v)
}
@ -311,9 +315,10 @@ func Test_ValidateNodeNotReady(t *testing.T) {
require.NoError(t, err)
if !assert.Len(t, v.Failures, 1) ||
!assert.Equal(t, &ValidationError{
Kind: "Node",
Name: "node-1b",
Message: "node \"node-1b\" is not ready",
Kind: "Node",
Name: "node-1b",
Message: "node \"node-1b\" is not ready",
InstanceGroup: groups["node-1"].InstanceGroup,
}, v.Failures[0]) {
printDebug(t, v)
}
@ -364,9 +369,10 @@ func Test_ValidateMastersNotEnough(t *testing.T) {
require.NoError(t, err)
if !assert.Len(t, v.Failures, 1) ||
!assert.Equal(t, &ValidationError{
Kind: "InstanceGroup",
Name: "master-1",
Message: "InstanceGroup \"master-1\" did not have enough nodes 2 vs 3",
Kind: "InstanceGroup",
Name: "master-1",
Message: "InstanceGroup \"master-1\" did not have enough nodes 2 vs 3",
InstanceGroup: groups["node-1"].InstanceGroup,
}, v.Failures[0]) {
printDebug(t, v)
}
@ -417,9 +423,10 @@ func Test_ValidateMasterNotReady(t *testing.T) {
require.NoError(t, err)
if !assert.Len(t, v.Failures, 1) ||
!assert.Equal(t, &ValidationError{
Kind: "Node",
Name: "master-1b",
Message: "master \"master-1b\" is not ready",
Kind: "Node",
Name: "master-1b",
Message: "master \"master-1b\" is not ready",
InstanceGroup: groups["node-1"].InstanceGroup,
}, v.Failures[0]) {
printDebug(t, v)
}
@ -504,9 +511,10 @@ func Test_ValidateMasterStaticPods(t *testing.T) {
var podList []map[string]string
expectedFailures := []*ValidationError{
{
Kind: "Node",
Name: "master-1c",
Message: "master \"master-1c\" is not ready",
Kind: "Node",
Name: "master-1c",
Message: "master \"master-1c\" is not ready",
InstanceGroup: groups["node-1"].InstanceGroup,
},
}