Fix: E2E failures in CI

Removed:
- Metrics and pod log collection. crust-gather already collects logs
  for all resources.

Fixed:
- MachineDeployment checks for running machines. MachineSets used to be
  picked at random, because MachineSets that belong to the same
  MachineDeployment are indistinguishable by their labels. This caused
  flakes: the old MachineSet was expected to scale, while the new one
  actually did. The check now compares each MachineDeployment's
  spec.replicas against its status.readyReplicas directly (see the
  sketch below).
- Increased ClusterClass apply timeouts, as CAPD webhooks may take
  longer to stand up.
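
A minimal sketch of the deployment-level check this moves to, polling
the MachineDeployment itself instead of guessing which MachineSet is
the new one. The helper name and wiring are illustrative, not the exact
test code; it assumes CAPI's clusterv1 types and a controller-runtime
client:

    import (
        "context"
        "fmt"

        clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
        "sigs.k8s.io/controller-runtime/pkg/client"
    )

    // machineDeploymentsRolledOut reports whether every MachineDeployment
    // has all of its desired replicas ready. During a rollout the old and
    // new MachineSets carry the same labels, so picking a MachineSet by
    // label is ambiguous; the deployment-level status is not.
    func machineDeploymentsRolledOut(ctx context.Context, c client.Client, mds []*clusterv1.MachineDeployment) error {
        for _, md := range mds {
            current := &clusterv1.MachineDeployment{}
            if err := c.Get(ctx, client.ObjectKeyFromObject(md), current); err != nil {
                return fmt.Errorf("failed to get MachineDeployment %s: %w", md.Name, err)
            }

            // Guard the pointer before dereferencing for the comparison
            // and the error message.
            if current.Spec.Replicas == nil {
                return fmt.Errorf("%s has no spec.replicas set yet", md.Name)
            }

            if *current.Spec.Replicas != current.Status.ReadyReplicas {
                return fmt.Errorf("%s not rolled out: want %d ready, have %d",
                    md.Name, *current.Spec.Replicas, current.Status.ReadyReplicas)
            }
        }

        return nil
    }

In the suite a check like this runs inside Gomega's Eventually with the
configured intervals, the same retry pattern the ClusterClass apply now
uses to ride out slow CAPD webhooks.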

Signed-off-by: Danil-Grigorev <danil.grigorev@suse.com>
Danil-Grigorev 2024-09-11 18:24:33 +02:00
parent 967b2da50d
commit 4578c2377c
6 changed files with 157 additions and 54 deletions


@@ -32,9 +32,11 @@ import (
 	apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
+	"sigs.k8s.io/cluster-api/cmd/clusterctl/client/config"
 	"sigs.k8s.io/cluster-api/test/framework"
 	"sigs.k8s.io/cluster-api/test/framework/clusterctl"
 	"sigs.k8s.io/cluster-api/util"
+	"sigs.k8s.io/controller-runtime/pkg/log"
 	"sigs.k8s.io/yaml"
 )
@@ -55,16 +57,11 @@ func Byf(format string, a ...interface{}) {
 	By(fmt.Sprintf(format, a...))
 }
 
-func setupSpecNamespace(ctx context.Context, specName string, clusterProxy framework.ClusterProxy, artifactFolder string) (*corev1.Namespace, context.CancelFunc) {
+func setupSpecNamespace(ctx context.Context, specName string, clusterProxy framework.ClusterProxy, _ string) (*corev1.Namespace, context.CancelFunc) {
 	Byf("Creating a namespace for hosting the %q test spec", specName)
-	namespace, cancelWatches := framework.CreateNamespaceAndWatchEvents(ctx, framework.CreateNamespaceAndWatchEventsInput{
-		Creator:   clusterProxy.GetClient(),
-		ClientSet: clusterProxy.GetClientSet(),
-		Name:      fmt.Sprintf("%s-%s", specName, util.RandomString(6)),
-		LogFolder: filepath.Join(artifactFolder, "clusters", clusterProxy.GetName()),
-	})
-
-	return namespace, cancelWatches
+	_, cancelWatches := context.WithCancel(ctx)
+
+	return framework.CreateNamespace(ctx, framework.CreateNamespaceInput{Creator: clusterProxy.GetClient(), Name: fmt.Sprintf("%s-%s", specName, util.RandomString(6))}, "40s", "10s"), cancelWatches
 }
 
 func cleanupInstallation(ctx context.Context, clusterctlLogFolder, clusterctlConfigPath string, proxy framework.ClusterProxy) func() {
@@ -191,3 +188,103 @@ func localLoadE2EConfig(configPath string) *clusterctl.E2EConfig {
 
 	return config
 }
+
+// UpgradeManagementCluster upgrades providers in a management cluster using clusterctl.
+func UpgradeManagementCluster(ctx context.Context, input clusterctl.UpgradeManagementClusterAndWaitInput) {
+	Expect(ctx).NotTo(BeNil(), "ctx is required for UpgradeManagementCluster")
+	Expect(input.ClusterProxy).ToNot(BeNil(), "Invalid argument. input.ClusterProxy can't be nil when calling UpgradeManagementCluster")
+	Expect(input.ClusterctlConfigPath).To(BeAnExistingFile(), "Invalid argument. input.ClusterctlConfigPath must be an existing file when calling UpgradeManagementCluster")
+
+	// Check if the user wants a custom upgrade
+	isCustomUpgrade := input.CoreProvider != "" ||
+		len(input.BootstrapProviders) > 0 ||
+		len(input.ControlPlaneProviders) > 0 ||
+		len(input.InfrastructureProviders) > 0 ||
+		len(input.IPAMProviders) > 0 ||
+		len(input.RuntimeExtensionProviders) > 0 ||
+		len(input.AddonProviders) > 0
+
+	Expect((input.Contract != "" && !isCustomUpgrade) || (input.Contract == "" && isCustomUpgrade)).To(BeTrue(), `Invalid argument. Either the input.Contract parameter or at least one of the following providers has to be set:
+		input.CoreProvider, input.BootstrapProviders, input.ControlPlaneProviders, input.InfrastructureProviders, input.IPAMProviders, input.RuntimeExtensionProviders, input.AddonProviders`)
+
+	Expect(os.MkdirAll(input.LogFolder, 0750)).To(Succeed(), "Invalid argument. input.LogFolder can't be created for UpgradeManagementClusterAndWait")
+
+	upgradeInput := clusterctl.UpgradeInput{
+		ClusterctlConfigPath:      input.ClusterctlConfigPath,
+		ClusterctlVariables:       input.ClusterctlVariables,
+		ClusterName:               input.ClusterProxy.GetName(),
+		KubeconfigPath:            input.ClusterProxy.GetKubeconfigPath(),
+		Contract:                  input.Contract,
+		CoreProvider:              input.CoreProvider,
+		BootstrapProviders:        input.BootstrapProviders,
+		ControlPlaneProviders:     input.ControlPlaneProviders,
+		InfrastructureProviders:   input.InfrastructureProviders,
+		IPAMProviders:             input.IPAMProviders,
+		RuntimeExtensionProviders: input.RuntimeExtensionProviders,
+		AddonProviders:            input.AddonProviders,
+		LogFolder:                 input.LogFolder,
+	}
+
+	clusterctl.Upgrade(ctx, upgradeInput)
+
+	// We have to skip collecting metrics, as it causes failures in CI
+}
+
+// InitManagementCluster initializes a management cluster using clusterctl.
+func InitManagementCluster(ctx context.Context, input clusterctl.InitManagementClusterAndWatchControllerLogsInput, intervals ...interface{}) {
+	Expect(ctx).NotTo(BeNil(), "ctx is required for InitManagementCluster")
+	Expect(input.ClusterProxy).ToNot(BeNil(), "Invalid argument. input.ClusterProxy can't be nil when calling InitManagementCluster")
+	Expect(input.ClusterctlConfigPath).To(BeAnExistingFile(), "Invalid argument. input.ClusterctlConfigPath must be an existing file when calling InitManagementCluster")
+	Expect(input.InfrastructureProviders).ToNot(BeEmpty(), "Invalid argument. input.InfrastructureProviders can't be empty when calling InitManagementCluster")
+	Expect(os.MkdirAll(input.LogFolder, 0750)).To(Succeed(), "Invalid argument. input.LogFolder can't be created for InitManagementCluster")
+
+	logger := log.FromContext(ctx)
+
+	if input.CoreProvider == "" {
+		input.CoreProvider = config.ClusterAPIProviderName
+	}
+
+	if len(input.BootstrapProviders) == 0 {
+		input.BootstrapProviders = []string{config.KubeadmBootstrapProviderName}
+	}
+
+	if len(input.ControlPlaneProviders) == 0 {
+		input.ControlPlaneProviders = []string{config.KubeadmControlPlaneProviderName}
+	}
+
+	client := input.ClusterProxy.GetClient()
+	controllersDeployments := framework.GetControllerDeployments(ctx, framework.GetControllerDeploymentsInput{
+		Lister: client,
+	})
+	if len(controllersDeployments) == 0 {
+		initInput := clusterctl.InitInput{
+			// pass reference to the management cluster hosting this test
+			KubeconfigPath: input.ClusterProxy.GetKubeconfigPath(),
+			// pass the clusterctl config file that points to the local provider repository created for this test
+			ClusterctlConfigPath: input.ClusterctlConfigPath,
+			// setup the desired list of providers for a single-tenant management cluster
+			CoreProvider:              input.CoreProvider,
+			BootstrapProviders:        input.BootstrapProviders,
+			ControlPlaneProviders:     input.ControlPlaneProviders,
+			InfrastructureProviders:   input.InfrastructureProviders,
+			IPAMProviders:             input.IPAMProviders,
+			RuntimeExtensionProviders: input.RuntimeExtensionProviders,
+			AddonProviders:            input.AddonProviders,
+			// setup clusterctl logs folder
+			LogFolder: input.LogFolder,
+		}
+
+		clusterctl.Init(ctx, initInput)
+	}
+
+	logger.Info("Waiting for provider controllers to be running")
+
+	controllersDeployments = framework.GetControllerDeployments(ctx, framework.GetControllerDeploymentsInput{
+		Lister: client,
+	})
+	Expect(controllersDeployments).ToNot(BeEmpty(), "The list of controller deployments should not be empty")
+
+	for _, deployment := range controllersDeployments {
+		framework.WaitForDeploymentsAvailable(ctx, framework.WaitForDeploymentsAvailableInput{
+			Getter:     client,
+			Deployment: deployment,
+		}, intervals...)
+	}
+}


@@ -102,7 +102,9 @@ var _ = Describe("Workload cluster creation", func() {
 			}
 		})
 		Expect(err).ToNot(HaveOccurred())
 
-		Expect(bootstrapClusterProxy.Apply(ctx, []byte(clusterClassConfig))).To(Succeed(), "Failed to apply ClusterClass definition")
+		Eventually(func() error {
+			return bootstrapClusterProxy.Apply(ctx, []byte(clusterClassConfig))
+		}, e2eConfig.GetIntervals(specName, "wait-cluster")...).Should(Succeed(), "Failed to apply ClusterClass definition")
 
 		By("Create a Docker Cluster from topology")


@@ -236,7 +236,7 @@ func setupBootstrapCluster(config *clusterctl.E2EConfig, scheme *runtime.Scheme,
 
 // initBootstrapCluster initializes a bootstrap cluster with the latest minor version.
 func initBootstrapCluster(bootstrapClusterProxy framework.ClusterProxy, config *clusterctl.E2EConfig, clusterctlConfig, artifactFolder string) {
-	clusterctl.InitManagementClusterAndWatchControllerLogs(context.TODO(), clusterctl.InitManagementClusterAndWatchControllerLogsInput{
+	InitManagementCluster(context.TODO(), clusterctl.InitManagementClusterAndWatchControllerLogsInput{
 		ClusterProxy:            bootstrapClusterProxy,
 		ClusterctlConfigPath:    clusterctlConfig,
 		InfrastructureProviders: config.InfrastructureProviders(),
@@ -245,13 +245,14 @@ func initBootstrapCluster(bootstrapClusterProxy framework.ClusterProxy, config *
 		BootstrapProviders:      []string{"rke2-bootstrap"},
 		ControlPlaneProviders:   []string{"rke2-control-plane"},
 		LogFolder:               filepath.Join(artifactFolder, "clusters", bootstrapClusterProxy.GetName()),
+		DisableMetricsCollection: true,
 	}, config.GetIntervals(bootstrapClusterProxy.GetName(), "wait-controllers")...)
 }
 
 // initUpgradableBootstrapCluster initializes a bootstrap cluster with the latest minor version N-1 and is used to perform an upgrade to the latest version.
 // Make sure to update the version in the providers list to the latest minor version N-1.
 func initUpgradableBootstrapCluster(bootstrapClusterProxy framework.ClusterProxy, config *clusterctl.E2EConfig, clusterctlConfig, artifactFolder string) {
-	clusterctl.InitManagementClusterAndWatchControllerLogs(context.TODO(), clusterctl.InitManagementClusterAndWatchControllerLogsInput{
+	InitManagementCluster(context.TODO(), clusterctl.InitManagementClusterAndWatchControllerLogsInput{
 		ClusterProxy:            bootstrapClusterProxy,
 		ClusterctlConfigPath:    clusterctlConfig,
 		InfrastructureProviders: config.InfrastructureProviders(),
@@ -260,6 +261,7 @@ func initUpgradableBootstrapCluster(bootstrapClusterProxy framework.ClusterProxy
 		BootstrapProviders:      []string{"rke2-bootstrap:v0.6.0"},
 		ControlPlaneProviders:   []string{"rke2-control-plane:v0.6.0"},
 		LogFolder:               filepath.Join(artifactFolder, "clusters", bootstrapClusterProxy.GetName()),
+		DisableMetricsCollection: true,
 	}, config.GetIntervals(bootstrapClusterProxy.GetName(), "wait-controllers")...)
 }


@@ -151,7 +151,7 @@ var _ = Describe("Workload cluster creation", func() {
 			}, result)
 
 			WaitForClusterToUpgrade(ctx, WaitForClusterToUpgradeInput{
-				Lister:              bootstrapClusterProxy.GetClient(),
+				Reader:              bootstrapClusterProxy.GetClient(),
 				ControlPlane:        result.ControlPlane,
 				MachineDeployments:  result.MachineDeployments,
 				VersionAfterUpgrade: e2eConfig.GetVariable(KubernetesVersionUpgradeTo),


@@ -115,13 +115,13 @@ var _ = Describe("Workload cluster creation", func() {
 			}, e2eConfig.GetIntervals(specName, "wait-control-plane")...)
 
 			By("Upgrading to latest bootstrap/controlplane provider version")
-			clusterctl.UpgradeManagementClusterAndWait(ctx, clusterctl.UpgradeManagementClusterAndWaitInput{
+			UpgradeManagementCluster(ctx, clusterctl.UpgradeManagementClusterAndWaitInput{
 				ClusterProxy:          bootstrapClusterProxy,
 				ClusterctlConfigPath:  clusterctlConfigPath,
 				BootstrapProviders:    []string{"rke2-bootstrap:v0.7.99"},
 				ControlPlaneProviders: []string{"rke2-control-plane:v0.7.99"},
 				LogFolder:             clusterctlLogFolder,
 			}, e2eConfig.GetIntervals(specName, "wait-controllers")...)
 		})
 
 		WaitForControlPlaneToBeReady(ctx, WaitForControlPlaneToBeReadyInput{
 			Getter: bootstrapClusterProxy.GetClient(),
@@ -174,7 +174,7 @@ var _ = Describe("Workload cluster creation", func() {
 			}, result)
 
 			WaitForClusterToUpgrade(ctx, WaitForClusterToUpgradeInput{
-				Lister:              bootstrapClusterProxy.GetClient(),
+				Reader:              bootstrapClusterProxy.GetClient(),
 				ControlPlane:        result.ControlPlane,
 				MachineDeployments:  result.MachineDeployments,
 				VersionAfterUpgrade: e2eConfig.GetVariable(KubernetesVersionUpgradeTo),


@@ -29,7 +29,6 @@ import (
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
 	"github.com/pkg/errors"
-	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/types"
 	"k8s.io/klog/v2"
@@ -138,14 +137,6 @@ func ApplyClusterTemplateAndWait(ctx context.Context, input ApplyClusterTemplate
 	})
 	Expect(workloadClusterTemplate).ToNot(BeNil(), "Failed to get the cluster template")
 
-	// Ensure we have a Cluster for dump and cleanup steps in AfterEach even if ApplyClusterTemplateAndWait fails.
-	result.Cluster = &clusterv1.Cluster{
-		ObjectMeta: metav1.ObjectMeta{
-			Name:      input.ConfigCluster.ClusterName,
-			Namespace: input.ConfigCluster.Namespace,
-		},
-	}
-
 	ApplyCustomClusterTemplateAndWait(ctx, ApplyCustomClusterTemplateAndWaitInput{
 		ClusterProxy:       input.ClusterProxy,
 		CustomTemplateYAML: workloadClusterTemplate,
@@ -174,19 +165,10 @@ func ApplyCustomClusterTemplateAndWait(ctx context.Context, input ApplyCustomClu
 	Byf("Creating the workload cluster with name %q from the provided yaml", input.ClusterName)
 
-	// Ensure we have a Cluster for dump and cleanup steps in AfterEach even if ApplyClusterTemplateAndWait fails.
-	result.Cluster = &clusterv1.Cluster{
-		ObjectMeta: metav1.ObjectMeta{
-			Name:      input.ClusterName,
-			Namespace: input.Namespace,
-		},
-	}
-
 	Byf("Applying the cluster template yaml of cluster %s", klog.KRef(input.Namespace, input.ClusterName))
 	Eventually(func() error {
 		return input.ClusterProxy.Apply(ctx, input.CustomTemplateYAML, input.Args...)
-		// return input.ClusterProxy.CreateOrUpdate(ctx, input.CustomTemplateYAML, input.CreateOrUpdateOpts...)
-	}, 1*time.Minute).Should(Succeed(), "Failed to apply the cluster template")
+	}, input.WaitForClusterIntervals...).Should(Succeed(), "Failed to apply the cluster template")
 
 	// Once we applied the cluster template we can run PreWaitForCluster.
 	// Note: This can e.g. be used to verify the BeforeClusterCreate lifecycle hook is executed
@@ -218,7 +200,7 @@ func ApplyCustomClusterTemplateAndWait(ctx context.Context, input ApplyCustomClu
 	input.WaitForControlPlaneMachinesReady(ctx, input, result)
 
 	Byf("Waiting for the machine deployments of cluster %s to be provisioned", klog.KRef(input.Namespace, input.ClusterName))
-	result.MachineDeployments = framework.DiscoveryAndWaitForMachineDeployments(ctx, framework.DiscoveryAndWaitForMachineDeploymentsInput{
+	result.MachineDeployments = DiscoveryAndWaitForMachineDeployments(ctx, framework.DiscoveryAndWaitForMachineDeploymentsInput{
 		Lister:  input.ClusterProxy.GetClient(),
 		Cluster: result.Cluster,
 	}, input.WaitForMachineDeployments...)
@@ -285,7 +267,7 @@ func DiscoveryAndWaitForRKE2ControlPlaneInitialized(ctx context.Context, input D
 			Namespace: input.Cluster.Namespace,
 		})
 		g.Expect(controlPlane).ToNot(BeNil())
-	}, "10s", "1s").Should(Succeed(), "Couldn't get the control plane for the cluster %s", klog.KObj(input.Cluster))
+	}, "2m", "1s").Should(Succeed(), "Couldn't get the control plane for the cluster %s", klog.KObj(input.Cluster))
 
 	return controlPlane
 }
@@ -445,7 +427,7 @@ func WaitForMachineConditions(ctx context.Context, input WaitForMachineCondition
 
 // WaitForClusterToUpgradeInput is the input for WaitForClusterToUpgrade.
 type WaitForClusterToUpgradeInput struct {
-	Lister              framework.Lister
+	Reader              framework.GetLister
 	ControlPlane        *controlplanev1.RKE2ControlPlane
 	MachineDeployments  []*clusterv1.MachineDeployment
 	VersionAfterUpgrade string
@@ -455,32 +437,52 @@ type WaitForClusterToUpgradeInput struct {
 func WaitForClusterToUpgrade(ctx context.Context, input WaitForClusterToUpgradeInput, intervals ...interface{}) {
 	By("Waiting for machines to update")
 
-	var totalMachineCount int32
-	totalMachineCount = *input.ControlPlane.Spec.Replicas
-	for _, md := range input.MachineDeployments {
-		totalMachineCount += *md.Spec.Replicas
-	}
-
-	Eventually(func() (bool, error) {
-		machineList := &clusterv1.MachineList{}
-		if err := input.Lister.List(ctx, machineList); err != nil {
-			return false, fmt.Errorf("failed to list machines: %w", err)
-		}
-
-		if len(machineList.Items) != int(totalMachineCount) { // not all replicas are created
-			return false, nil
-		}
+	Eventually(func() error {
+		cp := input.ControlPlane.DeepCopy()
+		if err := input.Reader.Get(ctx, client.ObjectKeyFromObject(input.ControlPlane), cp); err != nil {
+			return fmt.Errorf("failed to get control plane: %w", err)
+		}
+
+		updatedDeployments := []*clusterv1.MachineDeployment{}
+
+		for _, md := range input.MachineDeployments {
+			copy := &clusterv1.MachineDeployment{}
+			if err := input.Reader.Get(ctx, client.ObjectKeyFromObject(md), copy); client.IgnoreNotFound(err) != nil {
+				return fmt.Errorf("failed to get updated machine deployment: %w", err)
+			}
+
+			updatedDeployments = append(updatedDeployments, copy)
+		}
+
+		machineList := &clusterv1.MachineList{}
+		if err := input.Reader.List(ctx, machineList); err != nil {
+			return fmt.Errorf("failed to list machines: %w", err)
+		}
 
 		for _, machine := range machineList.Items {
 			expectedVersion := input.VersionAfterUpgrade + "+rke2r1"
-			if machine.Spec.Version != nil && *machine.Spec.Version != expectedVersion {
-				return false, nil
+			if machine.Spec.Version == nil || *machine.Spec.Version != expectedVersion {
+				return fmt.Errorf("Expected machine version to match %s, got %v", expectedVersion, machine.Spec.Version)
 			}
 		}
 
-		return true, nil
-	}, intervals...).Should(BeTrue(), framework.PrettyPrint(input.ControlPlane))
+		ready := cp.Status.ReadyReplicas == cp.Status.Replicas
+		if !ready {
+			return fmt.Errorf("Control plane is not ready: %d ready from %d", cp.Status.ReadyReplicas, cp.Status.Replicas)
+		}
+
+		expected := cp.Spec.Replicas != nil && *cp.Spec.Replicas == cp.Status.Replicas
+		if !expected {
+			return fmt.Errorf("Control plane is not scaled: %d replicas from %d", cp.Spec.Replicas, cp.Status.Replicas)
+		}
+
+		for _, md := range updatedDeployments {
+			if md.Spec.Replicas == nil || *md.Spec.Replicas != md.Status.ReadyReplicas {
+				return fmt.Errorf("Not all machine deployments are updated yet expected %v!=%d", md.Spec.Replicas, md.Status.ReadyReplicas)
+			}
+		}
+
+		return nil
+	}, intervals...).Should(Succeed())
 }
 
 // setDefaults sets the default values for ApplyCustomClusterTemplateAndWaitInput if not set.