Add MPICFullResults feature flag to turn off VA early return (#8046)
Add a new "MPICFullResults" feature flag. When this flag is enabled in the VA, it will wait for all Remote VAs to return their results for both Domain Control Validation and CAA checking, rather than short-circuiting as soon as it has seen enough results to know whether corroboration will or will not be achieved. We make this change because waiting for these to return honestly doesn't take that long, because we do validation (although not CAA rechecking) asynchronously, and because it improves the quality of our MPIC quorum summary logs (so we don't always say only 3/4 concurred because the fourth was cancelled). Fixes https://github.com/letsencrypt/boulder/issues/7809
This commit is contained in:
		
							parent
							
								
									ad651d4a3d
								
							
						
					
					
						commit
						dc14caf907
					
				|  | @ -80,6 +80,11 @@ type Config struct { | |||
| 	// functionality (valid authz reuse) while letting us simplify our code by
 | ||||
| 	// removing pending authz reuse.
 | ||||
| 	NoPendingAuthzReuse bool | ||||
| 
 | ||||
| 	// MPICFullResults causes the VA to wait for all remote (MPIC) results, rather
 | ||||
| 	// than cancelling outstanding requests after enough successes or failures for
 | ||||
| 	// the result to be determined.
 | ||||
| 	MPICFullResults bool | ||||
| } | ||||
| 
 | ||||
| var fMu = new(sync.RWMutex) | ||||
|  |  | |||
|  | @ -38,7 +38,8 @@ | |||
| 			} | ||||
| 		}, | ||||
| 		"features": { | ||||
| 			"DOH": true | ||||
| 			"DOH": true, | ||||
| 			"MPICFullResults": true | ||||
| 		}, | ||||
| 		"remoteVAs": [ | ||||
| 			{ | ||||
|  |  | |||
							
								
								
									
										19
									
								
								va/va.go
								
								
								
								
							
							
						
						
									
										19
									
								
								va/va.go
								
								
								
								
							|  | @ -25,6 +25,7 @@ import ( | |||
| 	"github.com/letsencrypt/boulder/core" | ||||
| 	corepb "github.com/letsencrypt/boulder/core/proto" | ||||
| 	berrors "github.com/letsencrypt/boulder/errors" | ||||
| 	"github.com/letsencrypt/boulder/features" | ||||
| 	bgrpc "github.com/letsencrypt/boulder/grpc" | ||||
| 	"github.com/letsencrypt/boulder/identifier" | ||||
| 	blog "github.com/letsencrypt/boulder/log" | ||||
|  | @ -609,14 +610,16 @@ func (va *ValidationAuthorityImpl) doRemoteOperation(ctx context.Context, op rem | |||
| 			firstProb = currProb | ||||
| 		} | ||||
| 
 | ||||
| 		// To respond faster, if we get enough successes or too many failures, we cancel remaining RPCs.
 | ||||
| 		// Finish the loop to collect remaining responses into `failed` so we can rely on having a response
 | ||||
| 		// for every request we made.
 | ||||
| 		if len(passed) >= required && len(passedRIRs) >= requiredRIRs { | ||||
| 			cancel() | ||||
| 		} | ||||
| 		if len(failed) > va.maxRemoteFailures { | ||||
| 			cancel() | ||||
| 		if !features.Get().MPICFullResults { | ||||
| 			// To respond faster, if we get enough successes or too many failures, we cancel remaining RPCs.
 | ||||
| 			// Finish the loop to collect remaining responses into `failed` so we can rely on having a response
 | ||||
| 			// for every request we made.
 | ||||
| 			if len(passed) >= required && len(passedRIRs) >= requiredRIRs { | ||||
| 				cancel() | ||||
| 			} | ||||
| 			if len(failed) > va.maxRemoteFailures { | ||||
| 				cancel() | ||||
| 			} | ||||
| 		} | ||||
| 
 | ||||
| 		// Once all the VAs have returned a result, break the loop.
 | ||||
|  |  | |||
|  | @ -12,6 +12,7 @@ import ( | |||
| 	"net/http/httptest" | ||||
| 	"net/netip" | ||||
| 	"os" | ||||
| 	"strconv" | ||||
| 	"strings" | ||||
| 	"sync" | ||||
| 	"syscall" | ||||
|  | @ -689,7 +690,8 @@ func TestMultiVA(t *testing.T) { | |||
| } | ||||
| 
 | ||||
| func TestMultiVAEarlyReturn(t *testing.T) { | ||||
| 	t.Parallel() | ||||
| 	// TODO(#7809): Make this test parallel when it no longer manipulates the
 | ||||
| 	// MPICFullResults feature flag.
 | ||||
| 
 | ||||
| 	testCases := []struct { | ||||
| 		name              string | ||||
|  | @ -768,53 +770,59 @@ func TestMultiVAEarlyReturn(t *testing.T) { | |||
| 		}, | ||||
| 	} | ||||
| 
 | ||||
| 	for _, tc := range testCases { | ||||
| 		t.Run(fmt.Sprintf(tc.name), func(t *testing.T) { | ||||
| 			t.Parallel() | ||||
| 	for _, mpicFullResults := range []bool{false, true} { | ||||
| 		for _, tc := range testCases { | ||||
| 			t.Run(fmt.Sprintf("%s_(MPICFullResults: %s)", tc.name, strconv.FormatBool(mpicFullResults)), func(t *testing.T) { | ||||
| 				// TODO(#7809): Make this test parallel when it no longer manipulates the
 | ||||
| 				// MPICFullResults feature flag.
 | ||||
| 
 | ||||
| 			// Configure one test server per test case so that all tests can run in parallel.
 | ||||
| 			ms := httpMultiSrv(t, expectedToken, map[string]bool{pass: true, fail: false}) | ||||
| 			defer ms.Close() | ||||
| 				// Configure one test server per test case so that all tests can run in parallel.
 | ||||
| 				ms := httpMultiSrv(t, expectedToken, map[string]bool{pass: true, fail: false}) | ||||
| 				defer ms.Close() | ||||
| 
 | ||||
| 			localVA, _ := setupWithRemotes(ms.Server, pass, tc.remoteConfs, nil) | ||||
| 				localVA, _ := setupWithRemotes(ms.Server, pass, tc.remoteConfs, nil) | ||||
| 
 | ||||
| 			// Perform all validations
 | ||||
| 			start := time.Now() | ||||
| 			req := createValidationRequest(identifier.NewDNS("localhost"), core.ChallengeTypeHTTP01) | ||||
| 			res, _ := localVA.DoDCV(ctx, req) | ||||
| 				features.Set(features.Config{MPICFullResults: mpicFullResults}) | ||||
| 				defer features.Reset() | ||||
| 
 | ||||
| 			if tc.wantCorroboration { | ||||
| 				if res.Problem != nil { | ||||
| 					t.Errorf("expected corroboration, but got prob %s", res.Problem) | ||||
| 				// Perform all validations
 | ||||
| 				start := time.Now() | ||||
| 				req := createValidationRequest(identifier.NewDNS("localhost"), core.ChallengeTypeHTTP01) | ||||
| 				res, _ := localVA.DoDCV(ctx, req) | ||||
| 
 | ||||
| 				if tc.wantCorroboration { | ||||
| 					if res.Problem != nil { | ||||
| 						t.Errorf("expected corroboration, but got prob %s", res.Problem) | ||||
| 					} | ||||
| 				} else { | ||||
| 					if res.Problem == nil { | ||||
| 						t.Error("expected prob from PerformValidation, got nil") | ||||
| 					} | ||||
| 				} | ||||
| 			} else { | ||||
| 				if res.Problem == nil { | ||||
| 					t.Error("expected prob from PerformValidation, got nil") | ||||
| 				} | ||||
| 			} | ||||
| 
 | ||||
| 			elapsed := time.Since(start).Round(time.Millisecond).Milliseconds() | ||||
| 				elapsed := time.Since(start).Round(time.Millisecond).Milliseconds() | ||||
| 
 | ||||
| 			if tc.wantEarlyReturn { | ||||
| 				// The slow UA should sleep for `slowRemoteSleepMillis`. But the first remote
 | ||||
| 				// VA should fail quickly and the early-return code should cause the overall
 | ||||
| 				// overall validation to return a prob quickly (i.e. in less than half of
 | ||||
| 				// `slowRemoteSleepMillis`).
 | ||||
| 				if elapsed > slowRemoteSleepMillis/2 { | ||||
| 					t.Errorf( | ||||
| 						"Expected an early return from PerformValidation in < %d ms, took %d ms", | ||||
| 						slowRemoteSleepMillis/2, elapsed) | ||||
| 				if tc.wantEarlyReturn && !mpicFullResults { | ||||
| 					// The slow UA should sleep for `slowRemoteSleepMillis`. But the first remote
 | ||||
| 					// VA should fail quickly and the early-return code should cause the overall
 | ||||
| 					// overall validation to return a prob quickly (i.e. in less than half of
 | ||||
| 					// `slowRemoteSleepMillis`).
 | ||||
| 					if elapsed > slowRemoteSleepMillis/2 { | ||||
| 						t.Errorf( | ||||
| 							"Expected an early return from PerformValidation in < %d ms, took %d ms", | ||||
| 							slowRemoteSleepMillis/2, elapsed) | ||||
| 					} | ||||
| 				} else { | ||||
| 					// The VA will have to wait for all of the results, because the fast
 | ||||
| 					// results aren't sufficient to determine (non)corroboration.
 | ||||
| 					if elapsed < slowRemoteSleepMillis { | ||||
| 						t.Errorf( | ||||
| 							"Expected a slow return from PerformValidation in >= %d ms, took %d ms", | ||||
| 							slowRemoteSleepMillis, elapsed) | ||||
| 					} | ||||
| 				} | ||||
| 			} else { | ||||
| 				// The VA will have to wait for all of the results, because the fast
 | ||||
| 				// results aren't sufficient to determine (non)corroboration.
 | ||||
| 				if elapsed < slowRemoteSleepMillis { | ||||
| 					t.Errorf( | ||||
| 						"Expected a slow return from PerformValidation in >= %d ms, took %d ms", | ||||
| 						slowRemoteSleepMillis, elapsed) | ||||
| 				} | ||||
| 			} | ||||
| 		}) | ||||
| 			}) | ||||
| 		} | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue