karmada/test/e2e/failover_test.go

344 lines
12 KiB
Go

package e2e
import (
"context"
"fmt"
"github.com/onsi/ginkgo/v2"
"github.com/onsi/gomega"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/rand"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/klog/v2"
"k8s.io/utils/pointer"
"sigs.k8s.io/controller-runtime/pkg/client"
clusterv1alpha1 "github.com/karmada-io/karmada/pkg/apis/cluster/v1alpha1"
policyv1alpha1 "github.com/karmada-io/karmada/pkg/apis/policy/v1alpha1"
controllercluster "github.com/karmada-io/karmada/pkg/controllers/cluster"
"github.com/karmada-io/karmada/pkg/util"
"github.com/karmada-io/karmada/pkg/util/helper"
"github.com/karmada-io/karmada/test/e2e/framework"
testhelper "github.com/karmada-io/karmada/test/helper"
)
// failover testing is used to test the rescheduling situation when some initially scheduled clusters fail
var _ = framework.SerialDescribe("failover testing", func() {
ginkgo.Context("Deployment propagation testing", func() {
var policyNamespace, policyName string
var deploymentNamespace, deploymentName string
var deployment *appsv1.Deployment
var maxGroups, minGroups, numOfFailedClusters int
var policy *policyv1alpha1.PropagationPolicy
ginkgo.BeforeEach(func() {
policyNamespace = testNamespace
policyName = deploymentNamePrefix + rand.String(RandomStrLength)
deploymentNamespace = testNamespace
deploymentName = policyName
deployment = testhelper.NewDeployment(deploymentNamespace, deploymentName)
maxGroups = 1
minGroups = 1
numOfFailedClusters = 1
// set MaxGroups=MinGroups=1, label is location=CHN.
policy = testhelper.NewPropagationPolicy(policyNamespace, policyName, []policyv1alpha1.ResourceSelector{
{
APIVersion: deployment.APIVersion,
Kind: deployment.Kind,
Name: deployment.Name,
},
}, policyv1alpha1.Placement{
ClusterAffinity: &policyv1alpha1.ClusterAffinity{
LabelSelector: &metav1.LabelSelector{
// only test push mode clusters
// because pull mode clusters cannot be disabled by changing APIEndpoint
MatchLabels: pushModeClusterLabels,
},
},
ClusterTolerations: []corev1.Toleration{
*helper.NewNotReadyToleration(2),
*helper.NewUnreachableToleration(2),
},
SpreadConstraints: []policyv1alpha1.SpreadConstraint{
{
SpreadByField: policyv1alpha1.SpreadByFieldCluster,
MaxGroups: maxGroups,
MinGroups: minGroups,
},
},
})
})
ginkgo.BeforeEach(func() {
framework.CreatePropagationPolicy(karmadaClient, policy)
framework.CreateDeployment(kubeClient, deployment)
ginkgo.DeferCleanup(func() {
framework.RemoveDeployment(kubeClient, deployment.Namespace, deployment.Name)
framework.RemovePropagationPolicy(karmadaClient, policy.Namespace, policy.Name)
})
})
ginkgo.It("deployment failover testing", func() {
var disabledClusters []string
targetClusterNames := framework.ExtractTargetClustersFrom(controlPlaneClient, deployment)
ginkgo.By("set one cluster condition status to false", func() {
temp := numOfFailedClusters
for _, targetClusterName := range targetClusterNames {
if temp > 0 {
klog.Infof("Set cluster %s to disable.", targetClusterName)
err := disableCluster(controlPlaneClient, targetClusterName)
gomega.Expect(err).ShouldNot(gomega.HaveOccurred())
// wait for the current cluster status changing to false
framework.WaitClusterFitWith(controlPlaneClient, targetClusterName, func(cluster *clusterv1alpha1.Cluster) bool {
return helper.TaintExists(cluster.Spec.Taints, controllercluster.NotReadyTaintTemplate)
})
disabledClusters = append(disabledClusters, targetClusterName)
temp--
}
}
})
ginkgo.By("check whether deployment of failed cluster is rescheduled to other available cluster", func() {
gomega.Eventually(func() int {
targetClusterNames = framework.ExtractTargetClustersFrom(controlPlaneClient, deployment)
for _, targetClusterName := range targetClusterNames {
// the target cluster should be overwritten to another available cluster
if !testhelper.IsExclude(targetClusterName, disabledClusters) {
return 0
}
}
return len(targetClusterNames)
}, pollTimeout, pollInterval).Should(gomega.Equal(minGroups))
})
ginkgo.By("recover not ready cluster", func() {
for _, disabledCluster := range disabledClusters {
fmt.Printf("cluster %s is waiting for recovering\n", disabledCluster)
originalAPIEndpoint := getClusterAPIEndpoint(disabledCluster)
err := recoverCluster(controlPlaneClient, disabledCluster, originalAPIEndpoint)
gomega.Expect(err).ShouldNot(gomega.HaveOccurred())
// wait for the disabled cluster recovered
err = wait.PollImmediate(pollInterval, pollTimeout, func() (done bool, err error) {
currentCluster, err := util.GetCluster(controlPlaneClient, disabledCluster)
if err != nil {
return false, err
}
if !helper.TaintExists(currentCluster.Spec.Taints, controllercluster.NotReadyTaintTemplate) {
fmt.Printf("cluster %s recovered\n", disabledCluster)
return true, nil
}
return false, nil
})
gomega.Expect(err).ShouldNot(gomega.HaveOccurred())
}
})
})
})
ginkgo.Context("Taint cluster testing", func() {
var policyNamespace, policyName string
var deploymentNamespace, deploymentName string
var deployment *appsv1.Deployment
var taint corev1.Taint
var maxGroups, minGroups, numOfFailedClusters int
var policy *policyv1alpha1.PropagationPolicy
maxGroups = 1
minGroups = 1
numOfFailedClusters = 1
ginkgo.BeforeEach(func() {
policyNamespace = testNamespace
policyName = deploymentNamePrefix + rand.String(RandomStrLength)
deploymentNamespace = testNamespace
deploymentName = policyName
deployment = testhelper.NewDeployment(deploymentNamespace, deploymentName)
policy = testhelper.NewPropagationPolicy(policyNamespace, policyName, []policyv1alpha1.ResourceSelector{
{
APIVersion: deployment.APIVersion,
Kind: deployment.Kind,
Name: deployment.Name,
},
}, policyv1alpha1.Placement{
ClusterAffinity: &policyv1alpha1.ClusterAffinity{
ClusterNames: framework.ClusterNames(),
},
ClusterTolerations: []corev1.Toleration{
{
Key: "fail-test",
Effect: corev1.TaintEffectNoExecute,
Operator: corev1.TolerationOpExists,
TolerationSeconds: pointer.Int64(3),
},
},
SpreadConstraints: []policyv1alpha1.SpreadConstraint{
{
SpreadByField: policyv1alpha1.SpreadByFieldCluster,
MaxGroups: maxGroups,
MinGroups: minGroups,
},
},
})
taint = corev1.Taint{
Key: "fail-test",
Effect: corev1.TaintEffectNoExecute,
}
})
ginkgo.BeforeEach(func() {
framework.CreatePropagationPolicy(karmadaClient, policy)
framework.CreateDeployment(kubeClient, deployment)
ginkgo.DeferCleanup(func() {
framework.RemoveDeployment(kubeClient, deployment.Namespace, deployment.Name)
framework.RemovePropagationPolicy(karmadaClient, policy.Namespace, policy.Name)
})
})
ginkgo.It("taint cluster", func() {
var disabledClusters []string
targetClusterNames := framework.ExtractTargetClustersFrom(controlPlaneClient, deployment)
ginkgo.By("taint one cluster", func() {
temp := numOfFailedClusters
for _, targetClusterName := range targetClusterNames {
if temp > 0 {
klog.Infof("Taint one cluster(%s).", targetClusterName)
err := taintCluster(controlPlaneClient, targetClusterName, taint)
gomega.Expect(err).ShouldNot(gomega.HaveOccurred())
disabledClusters = append(disabledClusters, targetClusterName)
temp--
}
}
})
ginkgo.By("check whether deployment of failed cluster is rescheduled to other available cluster", func() {
gomega.Eventually(func() int {
targetClusterNames = framework.ExtractTargetClustersFrom(controlPlaneClient, deployment)
for _, targetClusterName := range targetClusterNames {
// the target cluster should be overwritten to another available cluster
if !testhelper.IsExclude(targetClusterName, disabledClusters) {
return 0
}
}
return len(targetClusterNames)
}, pollTimeout, pollInterval).Should(gomega.Equal(minGroups))
})
ginkgo.By("recover not ready cluster", func() {
for _, disabledCluster := range disabledClusters {
fmt.Printf("cluster %s is waiting for recovering\n", disabledCluster)
err := recoverTaintedCluster(controlPlaneClient, disabledCluster, taint)
gomega.Expect(err).ShouldNot(gomega.HaveOccurred())
}
})
})
})
})
// disableCluster will set wrong API endpoint of current cluster
func disableCluster(c client.Client, clusterName string) error {
err := wait.PollImmediate(pollInterval, pollTimeout, func() (done bool, err error) {
clusterObj := &clusterv1alpha1.Cluster{}
if err := c.Get(context.TODO(), client.ObjectKey{Name: clusterName}, clusterObj); err != nil {
if apierrors.IsConflict(err) {
return false, nil
}
return false, err
}
// set the APIEndpoint of matched cluster to a wrong value
unavailableAPIEndpoint := "https://172.19.1.3:6443"
clusterObj.Spec.APIEndpoint = unavailableAPIEndpoint
if err := c.Update(context.TODO(), clusterObj); err != nil {
if apierrors.IsConflict(err) {
return false, nil
}
return false, err
}
return true, nil
})
return err
}
// taintCluster will taint cluster
func taintCluster(c client.Client, clusterName string, taint corev1.Taint) error {
err := wait.PollImmediate(pollInterval, pollTimeout, func() (done bool, err error) {
clusterObj := &clusterv1alpha1.Cluster{}
if err := c.Get(context.TODO(), client.ObjectKey{Name: clusterName}, clusterObj); err != nil {
if apierrors.IsConflict(err) {
return false, nil
}
return false, err
}
clusterObj.Spec.Taints = append(clusterObj.Spec.Taints, taint)
if err := c.Update(context.TODO(), clusterObj); err != nil {
if apierrors.IsConflict(err) {
return false, nil
}
return false, err
}
return true, nil
})
return err
}
// recoverTaintedCluster will recover the taint of the disabled cluster
func recoverTaintedCluster(c client.Client, clusterName string, taint corev1.Taint) error {
err := wait.PollImmediate(pollInterval, pollTimeout, func() (done bool, err error) {
clusterObj := &clusterv1alpha1.Cluster{}
if err := c.Get(context.TODO(), client.ObjectKey{Name: clusterName}, clusterObj); err != nil {
if apierrors.IsConflict(err) {
return false, nil
}
return false, err
}
if err := helper.UpdateClusterControllerTaint(context.TODO(), c, nil, []*corev1.Taint{&taint}, clusterObj); err != nil {
if apierrors.IsConflict(err) {
return false, nil
}
return false, err
}
return true, nil
})
return err
}
// recoverCluster will recover API endpoint of the disable cluster
func recoverCluster(c client.Client, clusterName string, originalAPIEndpoint string) error {
err := wait.PollImmediate(pollInterval, pollTimeout, func() (done bool, err error) {
clusterObj := &clusterv1alpha1.Cluster{}
if err := c.Get(context.TODO(), client.ObjectKey{Name: clusterName}, clusterObj); err != nil {
return false, err
}
clusterObj.Spec.APIEndpoint = originalAPIEndpoint
if err := c.Update(context.TODO(), clusterObj); err != nil {
if apierrors.IsConflict(err) {
return false, nil
}
return false, err
}
fmt.Printf("recovered API endpoint is %s\n", clusterObj.Spec.APIEndpoint)
return true, nil
})
return err
}
// get the API endpoint of a specific cluster
func getClusterAPIEndpoint(clusterName string) (apiEndpoint string) {
for _, cluster := range framework.Clusters() {
if cluster.Name == clusterName {
apiEndpoint = cluster.Spec.APIEndpoint
fmt.Printf("original API endpoint of the cluster %s is %s\n", clusterName, apiEndpoint)
}
}
return apiEndpoint
}