Add MPICFullResults feature flag to turn off VA early return (#8046)

Add a new "MPICFullResults" feature flag. When this flag is enabled in
the VA, it will wait for all Remote VAs to return their results for both
Domain Control Validation and CAA checking, rather than short-circuiting
as soon as it has seen enough results to know whether corroboration will
or will not be achieved.

We make this change because waiting for these to return honestly doesn't
take that long, because we do validation (although not CAA rechecking)
asynchronously, and because it improves the quality of our MPIC quorum
summary logs (so we don't always say only 3/4 concurred because the
fourth was cancelled).

Fixes https://github.com/letsencrypt/boulder/issues/7809
This commit is contained in:
Aaron Gable 2025-03-11 10:49:05 -05:00 committed by GitHub
parent ad651d4a3d
commit dc14caf907
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 66 additions and 49 deletions

View File

@ -80,6 +80,11 @@ type Config struct {
// functionality (valid authz reuse) while letting us simplify our code by
// removing pending authz reuse.
NoPendingAuthzReuse bool
// MPICFullResults causes the VA to wait for all remote (MPIC) results, rather
// than cancelling outstanding requests after enough successes or failures for
// the result to be determined.
MPICFullResults bool
}
var fMu = new(sync.RWMutex)

View File

@ -38,7 +38,8 @@
}
},
"features": {
"DOH": true
"DOH": true,
"MPICFullResults": true
},
"remoteVAs": [
{

View File

@ -25,6 +25,7 @@ import (
"github.com/letsencrypt/boulder/core"
corepb "github.com/letsencrypt/boulder/core/proto"
berrors "github.com/letsencrypt/boulder/errors"
"github.com/letsencrypt/boulder/features"
bgrpc "github.com/letsencrypt/boulder/grpc"
"github.com/letsencrypt/boulder/identifier"
blog "github.com/letsencrypt/boulder/log"
@ -609,14 +610,16 @@ func (va *ValidationAuthorityImpl) doRemoteOperation(ctx context.Context, op rem
firstProb = currProb
}
// To respond faster, if we get enough successes or too many failures, we cancel remaining RPCs.
// Finish the loop to collect remaining responses into `failed` so we can rely on having a response
// for every request we made.
if len(passed) >= required && len(passedRIRs) >= requiredRIRs {
cancel()
}
if len(failed) > va.maxRemoteFailures {
cancel()
if !features.Get().MPICFullResults {
// To respond faster, if we get enough successes or too many failures, we cancel remaining RPCs.
// Finish the loop to collect remaining responses into `failed` so we can rely on having a response
// for every request we made.
if len(passed) >= required && len(passedRIRs) >= requiredRIRs {
cancel()
}
if len(failed) > va.maxRemoteFailures {
cancel()
}
}
// Once all the VAs have returned a result, break the loop.

View File

@ -12,6 +12,7 @@ import (
"net/http/httptest"
"net/netip"
"os"
"strconv"
"strings"
"sync"
"syscall"
@ -689,7 +690,8 @@ func TestMultiVA(t *testing.T) {
}
func TestMultiVAEarlyReturn(t *testing.T) {
t.Parallel()
// TODO(#7809): Make this test parallel when it no longer manipulates the
// MPICFullResults feature flag.
testCases := []struct {
name string
@ -768,53 +770,59 @@ func TestMultiVAEarlyReturn(t *testing.T) {
},
}
for _, tc := range testCases {
t.Run(fmt.Sprintf(tc.name), func(t *testing.T) {
t.Parallel()
for _, mpicFullResults := range []bool{false, true} {
for _, tc := range testCases {
t.Run(fmt.Sprintf("%s_(MPICFullResults: %s)", tc.name, strconv.FormatBool(mpicFullResults)), func(t *testing.T) {
// TODO(#7809): Make this test parallel when it no longer manipulates the
// MPICFullResults feature flag.
// Configure one test server per test case so that all tests can run in parallel.
ms := httpMultiSrv(t, expectedToken, map[string]bool{pass: true, fail: false})
defer ms.Close()
// Configure one test server per test case so that all tests can run in parallel.
ms := httpMultiSrv(t, expectedToken, map[string]bool{pass: true, fail: false})
defer ms.Close()
localVA, _ := setupWithRemotes(ms.Server, pass, tc.remoteConfs, nil)
localVA, _ := setupWithRemotes(ms.Server, pass, tc.remoteConfs, nil)
// Perform all validations
start := time.Now()
req := createValidationRequest(identifier.NewDNS("localhost"), core.ChallengeTypeHTTP01)
res, _ := localVA.DoDCV(ctx, req)
features.Set(features.Config{MPICFullResults: mpicFullResults})
defer features.Reset()
if tc.wantCorroboration {
if res.Problem != nil {
t.Errorf("expected corroboration, but got prob %s", res.Problem)
// Perform all validations
start := time.Now()
req := createValidationRequest(identifier.NewDNS("localhost"), core.ChallengeTypeHTTP01)
res, _ := localVA.DoDCV(ctx, req)
if tc.wantCorroboration {
if res.Problem != nil {
t.Errorf("expected corroboration, but got prob %s", res.Problem)
}
} else {
if res.Problem == nil {
t.Error("expected prob from PerformValidation, got nil")
}
}
} else {
if res.Problem == nil {
t.Error("expected prob from PerformValidation, got nil")
}
}
elapsed := time.Since(start).Round(time.Millisecond).Milliseconds()
elapsed := time.Since(start).Round(time.Millisecond).Milliseconds()
if tc.wantEarlyReturn {
// The slow UA should sleep for `slowRemoteSleepMillis`. But the first remote
// VA should fail quickly and the early-return code should cause the overall
// overall validation to return a prob quickly (i.e. in less than half of
// `slowRemoteSleepMillis`).
if elapsed > slowRemoteSleepMillis/2 {
t.Errorf(
"Expected an early return from PerformValidation in < %d ms, took %d ms",
slowRemoteSleepMillis/2, elapsed)
if tc.wantEarlyReturn && !mpicFullResults {
// The slow UA should sleep for `slowRemoteSleepMillis`. But the first remote
// VA should fail quickly and the early-return code should cause the overall
// overall validation to return a prob quickly (i.e. in less than half of
// `slowRemoteSleepMillis`).
if elapsed > slowRemoteSleepMillis/2 {
t.Errorf(
"Expected an early return from PerformValidation in < %d ms, took %d ms",
slowRemoteSleepMillis/2, elapsed)
}
} else {
// The VA will have to wait for all of the results, because the fast
// results aren't sufficient to determine (non)corroboration.
if elapsed < slowRemoteSleepMillis {
t.Errorf(
"Expected a slow return from PerformValidation in >= %d ms, took %d ms",
slowRemoteSleepMillis, elapsed)
}
}
} else {
// The VA will have to wait for all of the results, because the fast
// results aren't sufficient to determine (non)corroboration.
if elapsed < slowRemoteSleepMillis {
t.Errorf(
"Expected a slow return from PerformValidation in >= %d ms, took %d ms",
slowRemoteSleepMillis, elapsed)
}
}
})
})
}
}
}