RA: fix suberror identifier bug in CAA recheck. (#4259)

In the RA's recheckCAA function we loop through a list of *core.Authorizations, dispatching each to a Go routine that checks CAA for the authz and writes an error to a results channel. Later, we iterate the same *core.Authorization list and read errors from the channel. If we get a non-nil error, then the current iteration's *core.Authorization is used as the identifier for the suberror created with the non-nil error. This is a flawed approach and relies on the scheduling of recheck goroutines matching the iteration of the authorizations. When the goroutines write error results to the channel in an order that doesn't match the loop over the authorizations the RA will construct a suberror with the wrong identifier. This manifests as making the TestRecheckCAAFail unit test appear flaky, because it specifically checks the expected identifiers in the returned subproblems. The fix involves writing both the checked authorization and the error result to the results channel. Later instead of iterating the authorizations we just read the correct number of results from the channel and use the attached authorization from the result when constructing a suberror. Resolves #4248 Take away lessons: Write unit tests and always verify expected values! Always investigate flaky unit tests! Sometimes there's a real bug and not just a subpar test :-)
2019-06-12 12:13:47 -04:00 · 2019-06-12 12:13:47 -04:00 · daf311f41b
parent 098a761c02
commit daf311f41b
2 changed files with 31 additions and 13 deletions
--- a/ra/ra.go
+++ b/ra/ra.go
@ -854,7 +854,12 @@ func (ra *RegistrationAuthorityImpl) checkAuthorizationsCAA(
 func (ra *RegistrationAuthorityImpl) recheckCAA(ctx context.Context, authzs []*core.Authorization) error {
 	ra.stats.Inc("recheck_caa", 1)
 	ra.stats.Inc("recheck_caa_authzs", int64(len(authzs)))
-	ch := make(chan error, len(authzs))
+
+	type authzCAAResult struct {
+		authz *core.Authorization
+		err   error
+	}
+	ch := make(chan authzCAAResult, len(authzs))
 	for _, authz := range authzs {
 		go func(authz *core.Authorization) {
 			name := authz.Identifier.Value
@ -870,10 +875,12 @@ func (ra *RegistrationAuthorityImpl) recheckCAA(ctx context.Context, authzs []*c
 				}
 			}
 			if method == "" {
-				ch <- berrors.InternalServerError(
-					"Internal error determining validation method for authorization ID %v (%v)",
-					authz.ID, name,
-				)
+				ch <- authzCAAResult{
+					authz: authz,
+					err: berrors.InternalServerError(
+						"Internal error determining validation method for authorization ID %v (%v)",
+						authz.ID, name),
+				}
 				return
 			}

@ -891,16 +898,22 @@ func (ra *RegistrationAuthorityImpl) recheckCAA(ctx context.Context, authzs []*c
 			} else if resp.Problem != nil {
 				err = berrors.CAAError(*resp.Problem.Detail)
 			}
-			ch <- err
+			ch <- authzCAAResult{
+				authz: authz,
+				err:   err,
+			}
 		}(authz)
 	}
 	var subErrors []berrors.SubBoulderError
-	for _, authz := range authzs {
-		err := <-ch
-		if err != nil {
+	// Read a recheckResult for each authz from the results channel
+	for i := 0; i < len(authzs); i++ {
+		recheckResult := <-ch
+		// If the result had a CAA boulder error, construct a suberror with the
+		// identifier from the authorization that was checked.
+		if err := recheckResult.err; err != nil {
 			if bErr, _ := err.(*berrors.BoulderError); berrors.Is(err, berrors.CAA) {
 				subErrors = append(subErrors, berrors.SubBoulderError{
-					Identifier:   authz.Identifier,
+					Identifier:   recheckResult.authz.Identifier,
 					BoulderError: bErr})
 			} else {
 				return err
--- a/ra/ra_test.go
+++ b/ra/ra_test.go
@ -17,6 +17,7 @@ import (
 	"net"
 	"net/url"
 	"os"
+	"regexp"
 	"sort"
 	"strconv"
 	"strings"
@ -2078,9 +2079,13 @@ func TestRecheckCAAFail(t *testing.T) {
 	// There should be two sub errors
 	test.AssertEquals(t, len(berr.SubErrors), 2)

-	// The top level error should reference the first failing identifier
-	if !strings.Contains(berr.Detail, `Rechecking CAA for "a.com" and 1 more identifiers`) {
-		t.Errorf("expected error to ref first failed identiifer, got %q", err)
+	// We don't know whether the asynchronous a.com or c.com CAA recheck will fail
+	// first. Whichever does will be mentioned in the top level problem detail.
+	expectedDetailRegex := regexp.MustCompile(
+		`Rechecking CAA for "(?:a\.com|c\.com)" and 1 more identifiers failed. Refer to sub-problems for more information`,
+	)
+	if !expectedDetailRegex.MatchString(berr.Detail) {
+		t.Errorf("expected suberror detail to match expected regex, got %q", err)
 	}

 	// There should be a sub error for both a.com and c.com with the correct type