reconcile: wait for apiserver to response before trying rolling-update

The rolling-update requires the apiserver (when called without --cloudonly),
so reconcile should wait for apiserver to start responding.

Implement this by reusing "validate cluster", but filtering to only the instance groups
and pods that we expect to be online.
This commit is contained in:
justinsb 2025-01-13 17:47:48 -05:00
parent 2b133b2503
commit f2d4eeb104
9 changed files with 125 additions and 51 deletions

View File

@ -258,7 +258,7 @@ func RunDeleteInstance(ctx context.Context, f *util.Factory, out io.Writer, opti
var clusterValidator validation.ClusterValidator var clusterValidator validation.ClusterValidator
if !options.CloudOnly { if !options.CloudOnly {
clusterValidator, err = validation.NewClusterValidator(cluster, cloud, list, restConfig, k8sClient) clusterValidator, err = validation.NewClusterValidator(cluster, cloud, list, nil, nil, restConfig, k8sClient)
if err != nil { if err != nil {
return fmt.Errorf("cannot create cluster validator: %v", err) return fmt.Errorf("cannot create cluster validator: %v", err)
} }

View File

@ -20,8 +20,10 @@ import (
"context" "context"
"fmt" "fmt"
"io" "io"
"time"
"github.com/spf13/cobra" "github.com/spf13/cobra"
v1 "k8s.io/api/core/v1"
"k8s.io/kops/cmd/kops/util" "k8s.io/kops/cmd/kops/util"
"k8s.io/kops/pkg/apis/kops" "k8s.io/kops/pkg/apis/kops"
"k8s.io/kops/pkg/commands/commandutils" "k8s.io/kops/pkg/commands/commandutils"
@ -134,6 +136,30 @@ func RunReconcileCluster(ctx context.Context, f *util.Factory, out io.Writer, c
} }
} }
// Particularly for a new cluster, we need to wait for the control plane to be answering requests
// before we can do a rolling update.
fmt.Fprintf(out, "Waiting for the kubernetes API to be served\n")
{
opt := &ValidateClusterOptions{}
opt.InitDefaults()
opt.ClusterName = c.ClusterName
opt.wait = 10 * time.Minute
// filter the instance group to only include the control plane
opt.filterInstanceGroups = func(ig *kops.InstanceGroup) bool {
return ig.Spec.Role == kops.InstanceGroupRoleAPIServer || ig.Spec.Role == kops.InstanceGroupRoleControlPlane
}
// Ignore all pods, we just want to check the control plane is responding
opt.filterPodsForValidation = func(pod *v1.Pod) bool {
return false
}
if _, err := RunValidateCluster(ctx, f, out, opt); err != nil {
return fmt.Errorf("waiting for kubernetes API to be served: %w", err)
}
}
fmt.Fprintf(out, "Doing rolling-update for control plane\n") fmt.Fprintf(out, "Doing rolling-update for control plane\n")
{ {
opt := &RollingUpdateOptions{} opt := &RollingUpdateOptions{}

View File

@ -453,7 +453,7 @@ func RunRollingUpdateCluster(ctx context.Context, f *util.Factory, out io.Writer
return fmt.Errorf("getting rest config: %w", err) return fmt.Errorf("getting rest config: %w", err)
} }
clusterValidator, err = validation.NewClusterValidator(cluster, cloud, list, restConfig, k8sClient) clusterValidator, err = validation.NewClusterValidator(cluster, cloud, list, nil, nil, restConfig, k8sClient)
if err != nil { if err != nil {
return fmt.Errorf("cannot create cluster validator: %v", err) return fmt.Errorf("cannot create cluster validator: %v", err)
} }

View File

@ -36,6 +36,7 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/klog/v2" "k8s.io/klog/v2"
"k8s.io/kops/cmd/kops/util" "k8s.io/kops/cmd/kops/util"
"k8s.io/kops/pkg/apis/kops"
kopsapi "k8s.io/kops/pkg/apis/kops" kopsapi "k8s.io/kops/pkg/apis/kops"
"k8s.io/kops/pkg/validation" "k8s.io/kops/pkg/validation"
"k8s.io/kops/util/pkg/tables" "k8s.io/kops/util/pkg/tables"
@ -61,12 +62,19 @@ var (
) )
type ValidateClusterOptions struct { type ValidateClusterOptions struct {
ClusterName string ClusterName string
output string InstanceGroupRoles []kops.InstanceGroupRole
wait time.Duration output string
count int wait time.Duration
interval time.Duration count int
kubeconfig string interval time.Duration
kubeconfig string
// filterInstanceGroups is a function that returns true if the instance group should be validated
filterInstanceGroups func(ig *kops.InstanceGroup) bool
// filterPodsForValidation is a function that returns true if the pod should be validated
filterPodsForValidation func(pod *v1.Pod) bool
} }
func (o *ValidateClusterOptions) InitDefaults() { func (o *ValidateClusterOptions) InitDefaults() {
@ -164,7 +172,7 @@ func RunValidateCluster(ctx context.Context, f *util.Factory, out io.Writer, opt
timeout := time.Now().Add(options.wait) timeout := time.Now().Add(options.wait)
validator, err := validation.NewClusterValidator(cluster, cloud, list, restConfig, k8sClient) validator, err := validation.NewClusterValidator(cluster, cloud, list, options.filterInstanceGroups, options.filterPodsForValidation, restConfig, k8sClient)
if err != nil { if err != nil {
return nil, fmt.Errorf("unexpected error creating validatior: %v", err) return nil, fmt.Errorf("unexpected error creating validatior: %v", err)
} }
@ -175,7 +183,7 @@ func RunValidateCluster(ctx context.Context, f *util.Factory, out io.Writer, opt
return nil, fmt.Errorf("wait time exceeded during validation") return nil, fmt.Errorf("wait time exceeded during validation")
} }
result, err := validator.Validate() result, err := validator.Validate(ctx)
if err != nil { if err != nil {
consecutive = 0 consecutive = 0
if options.wait > 0 { if options.wait > 0 {

View File

@ -537,7 +537,7 @@ func (c *RollingUpdateCluster) validateClusterWithTimeout(validateCount int, gro
for { for {
// Note that we validate at least once before checking the timeout, in case the cluster is healthy with a short timeout // Note that we validate at least once before checking the timeout, in case the cluster is healthy with a short timeout
result, err := c.ClusterValidator.Validate() result, err := c.ClusterValidator.Validate(ctx)
if err == nil && !hasFailureRelevantToGroup(result.Failures, group) { if err == nil && !hasFailureRelevantToGroup(result.Failures, group) {
successCount++ successCount++
if successCount >= validateCount { if successCount >= validateCount {

View File

@ -84,13 +84,13 @@ func getTestSetup() (*RollingUpdateCluster, *awsup.MockAWSCloud) {
type successfulClusterValidator struct{} type successfulClusterValidator struct{}
func (*successfulClusterValidator) Validate() (*validation.ValidationCluster, error) { func (*successfulClusterValidator) Validate(ctx context.Context) (*validation.ValidationCluster, error) {
return &validation.ValidationCluster{}, nil return &validation.ValidationCluster{}, nil
} }
type failingClusterValidator struct{} type failingClusterValidator struct{}
func (*failingClusterValidator) Validate() (*validation.ValidationCluster, error) { func (*failingClusterValidator) Validate(ctx context.Context) (*validation.ValidationCluster, error) {
return &validation.ValidationCluster{ return &validation.ValidationCluster{
Failures: []*validation.ValidationError{ Failures: []*validation.ValidationError{
{ {
@ -104,7 +104,7 @@ func (*failingClusterValidator) Validate() (*validation.ValidationCluster, error
type erroringClusterValidator struct{} type erroringClusterValidator struct{}
func (*erroringClusterValidator) Validate() (*validation.ValidationCluster, error) { func (*erroringClusterValidator) Validate(ctx context.Context) (*validation.ValidationCluster, error) {
return nil, errors.New("testing validation error") return nil, errors.New("testing validation error")
} }
@ -113,7 +113,7 @@ type instanceGroupNodeSpecificErrorClusterValidator struct {
InstanceGroup *kopsapi.InstanceGroup InstanceGroup *kopsapi.InstanceGroup
} }
func (igErrorValidator *instanceGroupNodeSpecificErrorClusterValidator) Validate() (*validation.ValidationCluster, error) { func (igErrorValidator *instanceGroupNodeSpecificErrorClusterValidator) Validate(ctx context.Context) (*validation.ValidationCluster, error) {
return &validation.ValidationCluster{ return &validation.ValidationCluster{
Failures: []*validation.ValidationError{ Failures: []*validation.ValidationError{
{ {
@ -130,7 +130,7 @@ type assertNotCalledClusterValidator struct {
T *testing.T T *testing.T
} }
func (v *assertNotCalledClusterValidator) Validate() (*validation.ValidationCluster, error) { func (v *assertNotCalledClusterValidator) Validate(ctx context.Context) (*validation.ValidationCluster, error) {
v.T.Fatal("validator called unexpectedly") v.T.Fatal("validator called unexpectedly")
return nil, errors.New("validator called unexpectedly") return nil, errors.New("validator called unexpectedly")
} }
@ -425,8 +425,7 @@ type failAfterOneNodeClusterValidator struct {
ReturnError bool ReturnError bool
} }
func (v *failAfterOneNodeClusterValidator) Validate() (*validation.ValidationCluster, error) { func (v *failAfterOneNodeClusterValidator) Validate(ctx context.Context) (*validation.ValidationCluster, error) {
ctx := context.TODO()
asgGroups, _ := v.Cloud.Autoscaling().DescribeAutoScalingGroups(ctx, &autoscaling.DescribeAutoScalingGroupsInput{ asgGroups, _ := v.Cloud.Autoscaling().DescribeAutoScalingGroups(ctx, &autoscaling.DescribeAutoScalingGroupsInput{
AutoScalingGroupNames: []string{v.Group}, AutoScalingGroupNames: []string{v.Group},
}) })
@ -648,8 +647,7 @@ type flappingClusterValidator struct {
invocationCount int invocationCount int
} }
func (v *flappingClusterValidator) Validate() (*validation.ValidationCluster, error) { func (v *flappingClusterValidator) Validate(ctx context.Context) (*validation.ValidationCluster, error) {
ctx := context.TODO()
asgGroups, _ := v.Cloud.Autoscaling().DescribeAutoScalingGroups(ctx, &autoscaling.DescribeAutoScalingGroupsInput{ asgGroups, _ := v.Cloud.Autoscaling().DescribeAutoScalingGroups(ctx, &autoscaling.DescribeAutoScalingGroupsInput{
AutoScalingGroupNames: []string{"master-1"}, AutoScalingGroupNames: []string{"master-1"},
}) })
@ -706,7 +704,7 @@ type failThreeTimesClusterValidator struct {
invocationCount int invocationCount int
} }
func (v *failThreeTimesClusterValidator) Validate() (*validation.ValidationCluster, error) { func (v *failThreeTimesClusterValidator) Validate(ctx context.Context) (*validation.ValidationCluster, error) {
v.invocationCount++ v.invocationCount++
if v.invocationCount <= 3 { if v.invocationCount <= 3 {
return &validation.ValidationCluster{ return &validation.ValidationCluster{
@ -1060,7 +1058,7 @@ type concurrentTest struct {
detached map[string]bool detached map[string]bool
} }
func (c *concurrentTest) Validate() (*validation.ValidationCluster, error) { func (c *concurrentTest) Validate(ctx context.Context) (*validation.ValidationCluster, error) {
c.mutex.Lock() c.mutex.Lock()
defer c.mutex.Unlock() defer c.mutex.Unlock()
@ -1441,7 +1439,7 @@ type alreadyDetachedTest struct {
detached map[string]bool detached map[string]bool
} }
func (t *alreadyDetachedTest) Validate() (*validation.ValidationCluster, error) { func (t *alreadyDetachedTest) Validate(ctx context.Context) (*validation.ValidationCluster, error) {
t.mutex.Lock() t.mutex.Lock()
defer t.mutex.Unlock() defer t.mutex.Unlock()

View File

@ -75,7 +75,7 @@ type countingValidator struct {
numValidations int numValidations int
} }
func (c *countingValidator) Validate() (*validation.ValidationCluster, error) { func (c *countingValidator) Validate(ctx context.Context) (*validation.ValidationCluster, error) {
c.numValidations++ c.numValidations++
return &validation.ValidationCluster{}, nil return &validation.ValidationCluster{}, nil
} }

View File

@ -56,15 +56,23 @@ type ValidationError struct {
type ClusterValidator interface { type ClusterValidator interface {
// Validate validates a k8s cluster // Validate validates a k8s cluster
Validate() (*ValidationCluster, error) Validate(ctx context.Context) (*ValidationCluster, error)
} }
type clusterValidatorImpl struct { type clusterValidatorImpl struct {
cluster *kops.Cluster cluster *kops.Cluster
cloud fi.Cloud cloud fi.Cloud
instanceGroups []*kops.InstanceGroup restConfig *rest.Config
restConfig *rest.Config k8sClient kubernetes.Interface
k8sClient kubernetes.Interface
// allInstanceGroups is the list of all instance groups in the cluster
allInstanceGroups []*kops.InstanceGroup
// filterInstanceGroups is a function that returns true if the instance group should be validated
filterInstanceGroups func(ig *kops.InstanceGroup) bool
// filterPodsForValidation is a function that returns true if the pod should be validated
filterPodsForValidation func(pod *v1.Pod) bool
} }
func (v *ValidationCluster) addError(failure *ValidationError) { func (v *ValidationCluster) addError(failure *ValidationError) {
@ -101,30 +109,44 @@ func hasPlaceHolderIP(host string) (string, error) {
return "", nil return "", nil
} }
func NewClusterValidator(cluster *kops.Cluster, cloud fi.Cloud, instanceGroupList *kops.InstanceGroupList, restConfig *rest.Config, k8sClient kubernetes.Interface) (ClusterValidator, error) { func NewClusterValidator(cluster *kops.Cluster, cloud fi.Cloud, instanceGroupList *kops.InstanceGroupList, filterInstanceGroups func(ig *kops.InstanceGroup) bool, filterPodsForValidation func(pod *v1.Pod) bool, restConfig *rest.Config, k8sClient kubernetes.Interface) (ClusterValidator, error) {
var instanceGroups []*kops.InstanceGroup var allInstanceGroups []*kops.InstanceGroup
for i := range instanceGroupList.Items { for i := range instanceGroupList.Items {
ig := &instanceGroupList.Items[i] ig := &instanceGroupList.Items[i]
instanceGroups = append(instanceGroups, ig) allInstanceGroups = append(allInstanceGroups, ig)
} }
if len(instanceGroups) == 0 { if len(allInstanceGroups) == 0 {
return nil, fmt.Errorf("no InstanceGroup objects found") return nil, fmt.Errorf("no InstanceGroup objects found")
} }
// If no filter is provided, validate all instance groups
if filterInstanceGroups == nil {
filterInstanceGroups = func(ig *kops.InstanceGroup) bool {
return true
}
}
// If no filter is provided, validate all pods
if filterPodsForValidation == nil {
filterPodsForValidation = func(pod *v1.Pod) bool {
return true
}
}
return &clusterValidatorImpl{ return &clusterValidatorImpl{
cluster: cluster, cluster: cluster,
cloud: cloud, cloud: cloud,
instanceGroups: instanceGroups, allInstanceGroups: allInstanceGroups,
restConfig: restConfig, restConfig: restConfig,
k8sClient: k8sClient, k8sClient: k8sClient,
filterInstanceGroups: filterInstanceGroups,
filterPodsForValidation: filterPodsForValidation,
}, nil }, nil
} }
func (v *clusterValidatorImpl) Validate() (*ValidationCluster, error) { func (v *clusterValidatorImpl) Validate(ctx context.Context) (*ValidationCluster, error) {
ctx := context.TODO()
validation := &ValidationCluster{} validation := &ValidationCluster{}
// Do not use if we are running gossip or without dns // Do not use if we are running gossip or without dns
@ -161,13 +183,14 @@ func (v *clusterValidatorImpl) Validate() (*ValidationCluster, error) {
} }
warnUnmatched := false warnUnmatched := false
cloudGroups, err := v.cloud.GetCloudGroups(v.cluster, v.instanceGroups, warnUnmatched, nodeList.Items) cloudGroups, err := v.cloud.GetCloudGroups(v.cluster, v.allInstanceGroups, warnUnmatched, nodeList.Items)
if err != nil { if err != nil {
return nil, err return nil, err
} }
readyNodes, nodeInstanceGroupMapping := validation.validateNodes(cloudGroups, v.instanceGroups)
if err := validation.collectPodFailures(ctx, v.k8sClient, readyNodes, nodeInstanceGroupMapping); err != nil { readyNodes, nodeInstanceGroupMapping := validation.validateNodes(cloudGroups, v.allInstanceGroups, v.filterInstanceGroups)
if err := validation.collectPodFailures(ctx, v.k8sClient, readyNodes, nodeInstanceGroupMapping, v.filterPodsForValidation); err != nil {
return nil, fmt.Errorf("cannot get pod health for %q: %v", v.cluster.Name, err) return nil, fmt.Errorf("cannot get pod health for %q: %v", v.cluster.Name, err)
} }
@ -181,7 +204,7 @@ var masterStaticPods = []string{
} }
func (v *ValidationCluster) collectPodFailures(ctx context.Context, client kubernetes.Interface, nodes []v1.Node, func (v *ValidationCluster) collectPodFailures(ctx context.Context, client kubernetes.Interface, nodes []v1.Node,
nodeInstanceGroupMapping map[string]*kops.InstanceGroup, nodeInstanceGroupMapping map[string]*kops.InstanceGroup, podValidationFilter func(pod *v1.Pod) bool,
) error { ) error {
masterWithoutPod := map[string]map[string]bool{} masterWithoutPod := map[string]map[string]bool{}
nodeByAddress := map[string]string{} nodeByAddress := map[string]string{}
@ -210,10 +233,16 @@ func (v *ValidationCluster) collectPodFailures(ctx context.Context, client kuber
delete(masterWithoutPod[nodeByAddress[pod.Status.HostIP]], app) delete(masterWithoutPod[nodeByAddress[pod.Status.HostIP]], app)
} }
// Ignore pods that we don't want to validate
if !podValidationFilter(pod) {
return nil
}
priority := pod.Spec.PriorityClassName priority := pod.Spec.PriorityClassName
if priority != "system-cluster-critical" && priority != "system-node-critical" { if priority != "system-cluster-critical" && priority != "system-node-critical" {
return nil return nil
} }
if pod.Status.Phase == v1.PodSucceeded { if pod.Status.Phase == v1.PodSucceeded {
return nil return nil
} }
@ -275,12 +304,16 @@ func (v *ValidationCluster) collectPodFailures(ctx context.Context, client kuber
return nil return nil
} }
func (v *ValidationCluster) validateNodes(cloudGroups map[string]*cloudinstances.CloudInstanceGroup, groups []*kops.InstanceGroup) ([]v1.Node, map[string]*kops.InstanceGroup) { func (v *ValidationCluster) validateNodes(cloudGroups map[string]*cloudinstances.CloudInstanceGroup, groups []*kops.InstanceGroup, shouldValidateInstanceGroup func(ig *kops.InstanceGroup) bool) ([]v1.Node, map[string]*kops.InstanceGroup) {
var readyNodes []v1.Node var readyNodes []v1.Node
groupsSeen := map[string]bool{} groupsSeen := map[string]bool{}
nodeInstanceGroupMapping := map[string]*kops.InstanceGroup{} nodeInstanceGroupMapping := map[string]*kops.InstanceGroup{}
for _, cloudGroup := range cloudGroups { for _, cloudGroup := range cloudGroups {
if cloudGroup.InstanceGroup != nil && !shouldValidateInstanceGroup(cloudGroup.InstanceGroup) {
continue
}
var allMembers []*cloudinstances.CloudInstance var allMembers []*cloudinstances.CloudInstance
allMembers = append(allMembers, cloudGroup.Ready...) allMembers = append(allMembers, cloudGroup.Ready...)
allMembers = append(allMembers, cloudGroup.NeedUpdate...) allMembers = append(allMembers, cloudGroup.NeedUpdate...)
@ -372,6 +405,10 @@ func (v *ValidationCluster) validateNodes(cloudGroups map[string]*cloudinstances
} }
for _, ig := range groups { for _, ig := range groups {
if !shouldValidateInstanceGroup(ig) {
continue
}
if !groupsSeen[ig.Name] { if !groupsSeen[ig.Name] {
v.addError(&ValidationError{ v.addError(&ValidationError{
Kind: "InstanceGroup", Kind: "InstanceGroup",

View File

@ -17,6 +17,7 @@ limitations under the License.
package validation package validation
import ( import (
"context"
"fmt" "fmt"
"testing" "testing"
@ -70,6 +71,8 @@ func (c *MockCloud) GetCloudGroups(cluster *kopsapi.Cluster, instancegroups []*k
} }
func testValidate(t *testing.T, groups map[string]*cloudinstances.CloudInstanceGroup, objects []runtime.Object) (*ValidationCluster, error) { func testValidate(t *testing.T, groups map[string]*cloudinstances.CloudInstanceGroup, objects []runtime.Object) (*ValidationCluster, error) {
ctx := context.TODO()
cluster := &kopsapi.Cluster{ cluster := &kopsapi.Cluster{
ObjectMeta: metav1.ObjectMeta{Name: "testcluster.k8s.local"}, ObjectMeta: metav1.ObjectMeta{Name: "testcluster.k8s.local"},
Spec: kopsapi.ClusterSpec{ Spec: kopsapi.ClusterSpec{
@ -130,14 +133,16 @@ func testValidate(t *testing.T, groups map[string]*cloudinstances.CloudInstanceG
restConfig := &rest.Config{ restConfig := &rest.Config{
Host: "https://api.testcluster.k8s.local", Host: "https://api.testcluster.k8s.local",
} }
validator, err := NewClusterValidator(cluster, mockcloud, &kopsapi.InstanceGroupList{Items: instanceGroups}, restConfig, fake.NewSimpleClientset(objects...)) validator, err := NewClusterValidator(cluster, mockcloud, &kopsapi.InstanceGroupList{Items: instanceGroups}, nil, nil, restConfig, fake.NewSimpleClientset(objects...))
if err != nil { if err != nil {
return nil, err return nil, err
} }
return validator.Validate() return validator.Validate(ctx)
} }
func Test_ValidateCloudGroupMissing(t *testing.T) { func Test_ValidateCloudGroupMissing(t *testing.T) {
ctx := context.TODO()
cluster := &kopsapi.Cluster{ cluster := &kopsapi.Cluster{
ObjectMeta: metav1.ObjectMeta{Name: "testcluster.k8s.local"}, ObjectMeta: metav1.ObjectMeta{Name: "testcluster.k8s.local"},
Spec: kopsapi.ClusterSpec{ Spec: kopsapi.ClusterSpec{
@ -163,9 +168,9 @@ func Test_ValidateCloudGroupMissing(t *testing.T) {
restConfig := &rest.Config{ restConfig := &rest.Config{
Host: "https://api.testcluster.k8s.local", Host: "https://api.testcluster.k8s.local",
} }
validator, err := NewClusterValidator(cluster, mockcloud, &kopsapi.InstanceGroupList{Items: instanceGroups}, restConfig, fake.NewSimpleClientset()) validator, err := NewClusterValidator(cluster, mockcloud, &kopsapi.InstanceGroupList{Items: instanceGroups}, nil, nil, restConfig, fake.NewSimpleClientset())
require.NoError(t, err) require.NoError(t, err)
v, err := validator.Validate() v, err := validator.Validate(ctx)
require.NoError(t, err) require.NoError(t, err)
if !assert.Len(t, v.Failures, 1) || if !assert.Len(t, v.Failures, 1) ||
!assert.Equal(t, &ValidationError{ !assert.Equal(t, &ValidationError{