Integration tests: Warn (instead of erroring) upon pod restarts (#4623)

* Integration tests: Warn (instead of erroring) upon pod restarts. Fixes #4595. Don't fail the integration tests whenever a pod is detected to have restarted just once; for now we just log the restart and create a warning annotation for it.
2020-06-18 06:08:05 -05:00 · 2020-06-18 06:08:05 -05:00 · c8c5980d63
parent b176fbeb6d
commit c8c5980d63
12 changed files with 125 additions and 28 deletions
--- a/test/edges/edges_test.go
+++ b/test/edges/edges_test.go
@ -83,8 +83,12 @@ func TestDirectEdges(t *testing.T) {
 	}

 	if err := TestHelper.CheckPods(testNamespace, "terminus", 1); err != nil {
+		if rce, ok := err.(*testutil.RestartCountError); ok {
+			testutil.AnnotatedWarn(t, "CheckPods timed-out", rce)
+		} else {
 			testutil.AnnotatedError(t, "CheckPods timed-out", err)
 		}
+	}

 	if err := TestHelper.CheckDeployment(testNamespace, "terminus", 1); err != nil {
 		testutil.AnnotatedErrorf(t, "CheckDeployment timed-out", "Error validating deployment [%s]:\n%s", "terminus", err)
@ -121,8 +125,12 @@ func TestDirectEdges(t *testing.T) {
 	}

 	if err := TestHelper.CheckPods(testNamespace, "slow-cooker", 1); err != nil {
+		if rce, ok := err.(*testutil.RestartCountError); ok {
+			testutil.AnnotatedWarn(t, "CheckPods timed-out", rce)
+		} else {
 			testutil.AnnotatedError(t, "CheckPods timed-out", err)
 		}
+	}

 	if err := TestHelper.CheckDeployment(testNamespace, "slow-cooker", 1); err != nil {
 		testutil.AnnotatedErrorf(t, "CheckDeployment timed-out", "error validating deployment [%s]:\n%s", "terminus", err)
--- a/test/egress/egress_test.go
+++ b/test/egress/egress_test.go
@ -41,7 +41,11 @@ func TestEgressHttp(t *testing.T) {

 	err = TestHelper.CheckPods(prefixedNs, "egress-test", 1)
 	if err != nil {
-		testutil.AnnotatedFatal(t, "CheckPods timed-out", err)
+		if rce, ok := err.(*testutil.RestartCountError); ok {
+			testutil.AnnotatedWarn(t, "CheckPods timed-out", rce)
+		} else {
+			testutil.AnnotatedError(t, "CheckPods timed-out", err)
+		}
 	}

 	testCase := func(url, methodToUse string) {
--- a/test/externalissuer/external_issuer_test.go
+++ b/test/externalissuer/external_issuer_test.go
@ -53,12 +53,20 @@ func verifyInstallApp(t *testing.T) {
 	}

 	if err := TestHelper.CheckPods(prefixedNs, TestAppBackendDeploymentName, 1); err != nil {
+		if rce, ok := err.(*testutil.RestartCountError); ok {
+			testutil.AnnotatedWarn(t, "CheckPods timed-out", rce)
+		} else {
 			testutil.AnnotatedError(t, "CheckPods timed-out", err)
 		}
+	}

 	if err := TestHelper.CheckPods(prefixedNs, "slow-cooker", 1); err != nil {
+		if rce, ok := err.(*testutil.RestartCountError); ok {
+			testutil.AnnotatedWarn(t, "CheckPods timed-out", rce)
+		} else {
 			testutil.AnnotatedError(t, "CheckPods timed-out", err)
 		}
+	}
 }

 func checkAppWoks(t *testing.T, timeout time.Duration) error {
--- a/test/get/get_test.go
+++ b/test/get/get_test.go
@ -78,8 +78,11 @@ func TestCliGet(t *testing.T) {
 	// wait for pods to start
 	for deploy, replicas := range deployReplicas {
 		if err := TestHelper.CheckPods(prefixedNs, deploy, replicas); err != nil {
-			testutil.AnnotatedError(t, "CheckPods timed-out",
-				fmt.Errorf("Error validating pods for deploy [%s]:\n%s", deploy, err))
+			if rce, ok := err.(*testutil.RestartCountError); ok {
+				testutil.AnnotatedWarn(t, "CheckPods timed-out", rce)
+			} else {
+				testutil.AnnotatedError(t, "CheckPods timed-out", err)
+			}
 		}
 	}

--- a/test/install_test.go
+++ b/test/install_test.go
@ -125,8 +125,12 @@ func TestUpgradeTestAppWorksBeforeUpgrade(t *testing.T) {
 		testAppNamespace := TestHelper.GetTestNamespace("upgrade-test")
 		for _, deploy := range []string{"emoji", "voting", "web"} {
 			if err := TestHelper.CheckPods(testAppNamespace, deploy, 1); err != nil {
+				if rce, ok := err.(*testutil.RestartCountError); ok {
+					testutil.AnnotatedWarn(t, "CheckPods timed-out", rce)
+				} else {
 					testutil.AnnotatedError(t, "CheckPods timed-out", err)
 				}
+			}

 			if err := TestHelper.CheckDeployment(testAppNamespace, deploy, 1); err != nil {
 				testutil.AnnotatedErrorf(t, "CheckDeployment timed-out", "Error validating deployment [%s]:\n%s", deploy, err)
--- a/test/serviceprofiles/serviceprofiles_test.go
+++ b/test/serviceprofiles/serviceprofiles_test.go
@ -52,8 +52,12 @@ func TestServiceProfiles(t *testing.T) {
 	// wait for deployments to start
 	for _, deploy := range []string{"t1", "t2", "t3", "gateway"} {
 		if err := TestHelper.CheckPods(testNamespace, deploy, 1); err != nil {
+			if rce, ok := err.(*testutil.RestartCountError); ok {
+				testutil.AnnotatedWarn(t, "CheckPods timed-out", rce)
+			} else {
 				testutil.AnnotatedError(t, "CheckPods timed-out", err)
 			}
+		}

 		if err := TestHelper.CheckDeployment(testNamespace, deploy, 1); err != nil {
 			testutil.AnnotatedErrorf(t, "CheckDeployment timed-out", "Error validating deployment [%s]:\n%s", deploy, err)
--- a/test/tap/tap_test.go
+++ b/test/tap/tap_test.go
@ -99,8 +99,12 @@ func TestCliTap(t *testing.T) {
 	// wait for deployments to start
 	for _, deploy := range []string{"t1", "t2", "t3", "gateway"} {
 		if err := TestHelper.CheckPods(prefixedNs, deploy, 1); err != nil {
+			if rce, ok := err.(*testutil.RestartCountError); ok {
+				testutil.AnnotatedWarn(t, "CheckPods timed-out", rce)
+			} else {
 				testutil.AnnotatedError(t, "CheckPods timed-out", err)
 			}
+		}

 		if err := TestHelper.CheckDeployment(prefixedNs, deploy, 1); err != nil {
 			testutil.AnnotatedErrorf(t, "CheckDeployment timed-out", "Error validating deployment [%s]:\n%s", deploy, err)
--- a/test/tracing/tracing_test.go
+++ b/test/tracing/tracing_test.go
@ -126,8 +126,12 @@ func TestTracing(t *testing.T) {
 		tracingNs:   "jaeger",
 	} {
 		if err := TestHelper.CheckPods(ns, deploy, 1); err != nil {
+			if rce, ok := err.(*testutil.RestartCountError); ok {
+				testutil.AnnotatedWarn(t, "CheckPods timed-out", rce)
+			} else {
 				testutil.AnnotatedError(t, "CheckPods timed-out", err)
 			}
+		}

 		if err := TestHelper.CheckDeployment(ns, deploy, 1); err != nil {
 			testutil.AnnotatedErrorf(t, "CheckDeployment timed-out", "Error validating deployment [%s]:\n%s", deploy, err)
--- a/test/trafficsplit/trafficsplit_test.go
+++ b/test/trafficsplit/trafficsplit_test.go
@ -168,8 +168,12 @@ func TestTrafficSplitCli(t *testing.T) {
 	// wait for deployments to start
 	for _, deploy := range []string{"backend", "failing", "slow-cooker"} {
 		if err := TestHelper.CheckPods(prefixedNs, deploy, 1); err != nil {
+			if rce, ok := err.(*testutil.RestartCountError); ok {
+				testutil.AnnotatedWarn(t, "CheckPods timed-out", rce)
+			} else {
 				testutil.AnnotatedError(t, "CheckPods timed-out", err)
 			}
+		}

 		if err := TestHelper.CheckDeployment(prefixedNs, deploy, 1); err != nil {
 			testutil.AnnotatedErrorf(t, "CheckDeployment timed-out", "Error validating deployment [%s]:\n%s", deploy, err)
--- a/test/uninstall/uninstall_test.go
+++ b/test/uninstall/uninstall_test.go
@ -51,8 +51,11 @@ func TestResourcesPostInstall(t *testing.T) {
 	// Tests Pods and Deployments
 	for deploy, spec := range testutil.LinkerdDeployReplicas {
 		if err := TestHelper.CheckPods(TestHelper.GetLinkerdNamespace(), deploy, spec.Replicas); err != nil {
-			testutil.AnnotatedFatal(t, "CheckPods timed-out",
-				fmt.Errorf("Error validating pods for deploy [%s]:\n%s", deploy, err))
+			if rce, ok := err.(*testutil.RestartCountError); ok {
+				testutil.AnnotatedWarn(t, "CheckPods timed-out", rce)
+			} else {
+				testutil.AnnotatedError(t, "CheckPods timed-out", err)
+			}
 		}
 		if err := TestHelper.CheckDeployment(TestHelper.GetLinkerdNamespace(), deploy, spec.Replicas); err != nil {
 			testutil.AnnotatedFatalf(t, "CheckDeployment timed-out", "Error validating deployment [%s]:\n%s", deploy, err)
--- a/testutil/annotations.go
+++ b/testutil/annotations.go
@ -13,9 +13,26 @@ const (
 	rootPath = "/linkerd2/"
 )

-func echoAnnotation(t *testing.T, args ...interface{}) {
+type level int
+
+const (
+	err level = iota
+	warn
+)
+
+func (l level) String() string {
+	switch l {
+	case err:
+		return "error"
+	case warn:
+		return "warning"
+	}
+	panic(fmt.Sprintf("invalid level: %d", l))
+}
+
+func echoAnnotation(t *testing.T, l level, args ...interface{}) {
 	if _, ok := os.LookupEnv(envFlag); ok {
-		_, fileName, fileLine, ok := runtime.Caller(2)
+		_, fileName, fileLine, ok := runtime.Caller(3)
 		if !ok {
 			panic("Couldn't recover runtime info")
 		}
@ -26,17 +43,25 @@ func echoAnnotation(t *testing.T, args ...interface{}) {
 		testName := parts[0]
 		for _, arg := range args {
 			msg := fmt.Sprintf("%s - %s", testName, arg)
-			fmt.Printf("::error file=%s,line=%d::%s\n", fileName, fileLine, msg)
+			fmt.Printf("::%s file=%s,line=%d::%s\n", l, fileName, fileLine, msg)
 		}
 	}
 }

+func echoAnnotationErr(t *testing.T, args ...interface{}) {
+	echoAnnotation(t, err, args...)
+}
+
+func echoAnnotationWarn(t *testing.T, args ...interface{}) {
+	echoAnnotation(t, warn, args...)
+}
+
 // Error is a wrapper around t.Error()
 // args are passed to t.Error(args) and each arg will be sent to stdout formatted
 // as a Github annotation when the envFlag environment variable is set
 func Error(t *testing.T, args ...interface{}) {
 	t.Helper()
-	echoAnnotation(t, args...)
+	echoAnnotationErr(t, args...)
 	t.Error(args...)
 }

@ -44,7 +69,7 @@ func Error(t *testing.T, args ...interface{}) {
 // will be used as the Github annotation
 func AnnotatedError(t *testing.T, msg string, args ...interface{}) {
 	t.Helper()
-	echoAnnotation(t, msg)
+	echoAnnotationErr(t, msg)
 	t.Error(args...)
 }

@ -54,7 +79,7 @@ func AnnotatedError(t *testing.T, msg string, args ...interface{}) {
 // environment variable is set
 func Errorf(t *testing.T, format string, args ...interface{}) {
 	t.Helper()
-	echoAnnotation(t, fmt.Sprintf(format, args...))
+	echoAnnotationErr(t, fmt.Sprintf(format, args...))
 	t.Errorf(format, args...)
 }

@ -62,7 +87,7 @@ func Errorf(t *testing.T, format string, args ...interface{}) {
 // will be used as the Github annotation
 func AnnotatedErrorf(t *testing.T, msg, format string, args ...interface{}) {
 	t.Helper()
-	echoAnnotation(t, msg)
+	echoAnnotationErr(t, msg)
 	t.Errorf(format, args...)
 }

@ -71,7 +96,7 @@ func AnnotatedErrorf(t *testing.T, msg, format string, args ...interface{}) {
 // as a Github annotation when the envFlag environment variable is set
 func Fatal(t *testing.T, args ...interface{}) {
 	t.Helper()
-	echoAnnotation(t, args)
+	echoAnnotationErr(t, args)
 	t.Fatal(args...)
 }

@ -79,7 +104,7 @@ func Fatal(t *testing.T, args ...interface{}) {
 // will be used as the Github annotation
 func AnnotatedFatal(t *testing.T, msg string, args ...interface{}) {
 	t.Helper()
-	echoAnnotation(t, msg)
+	echoAnnotationErr(t, msg)
 	t.Fatal(args...)
 }

@ -89,7 +114,7 @@ func AnnotatedFatal(t *testing.T, msg string, args ...interface{}) {
 // environment variable is set
 func Fatalf(t *testing.T, format string, args ...interface{}) {
 	t.Helper()
-	echoAnnotation(t, fmt.Sprintf(format, args...))
+	echoAnnotationErr(t, fmt.Sprintf(format, args...))
 	t.Fatalf(format, args...)
 }

@ -97,6 +122,14 @@ func Fatalf(t *testing.T, format string, args ...interface{}) {
 // will be used as the Github annotation
 func AnnotatedFatalf(t *testing.T, msg, format string, args ...interface{}) {
 	t.Helper()
-	echoAnnotation(t, msg)
+	echoAnnotationErr(t, msg)
 	t.Fatalf(format, args...)
 }
+
+// AnnotatedWarn is a wrapper around t.Log() but it also admits a msg string that
+// will be used as the Github warning annotation
+func AnnotatedWarn(t *testing.T, msg string, args ...interface{}) {
+	t.Helper()
+	echoAnnotationWarn(t, msg)
+	t.Log(args...)
+}
--- a/testutil/kubernetes_helper.go
+++ b/testutil/kubernetes_helper.go
@ -1,6 +1,7 @@
 package testutil

 import (
+	"errors"
 	"fmt"
 	"os/exec"
 	"regexp"
@ -26,6 +27,19 @@ type KubernetesHelper struct {
 	retryFor   func(time.Duration, func() error) error
 }

+// RestartCountError is returned by CheckPods() whenever a container (or init container) in a checked pod has restarted exactly once.
+// Consumers should log this type of error instead of failing the test.
+// This is to alleviate CI flakiness stemming from a containerd bug.
+// See https://github.com/kubernetes/kubernetes/issues/89064
+// See https://github.com/containerd/containerd/issues/4068
+type RestartCountError struct {
+	msg string
+}
+
+func (e *RestartCountError) Error() string {
+	return e.msg
+}
+
 // NewKubernetesHelper creates a new instance of KubernetesHelper.
 func NewKubernetesHelper(k8sContext string, retryFor func(time.Duration, func() error) error) (*KubernetesHelper, error) {
 	rules := clientcmd.NewDefaultClientConfigLoadingRules()
@ -208,9 +222,13 @@ func (h *KubernetesHelper) CheckPods(namespace string, deploymentName string, re

 	for _, pod := range checkedPods {
 		for _, status := range append(pod.Status.ContainerStatuses, pod.Status.InitContainerStatuses...) {
-			if status.RestartCount != 0 {
-				return fmt.Errorf("Container [%s] in pod [%s] in namespace [%s] has restart count [%d]",
+			errStr := fmt.Sprintf("Container [%s] in pod [%s] in namespace [%s] has restart count [%d]",
 				status.Name, pod.Name, pod.Namespace, status.RestartCount)
+			if status.RestartCount == 1 {
+				return &RestartCountError{errStr}
+			}
+			if status.RestartCount > 1 {
+				return errors.New(errStr)
 			}
 		}
 	}