VA: Consolidate multiple metrics into one histogram (#7816)

- Add a new histogram, validationLatency
- Add a VA.observeLatency for observing validation latency
- Refactor to ensure this metric can be observed exclusively within
VA.PerformValidation and VA.IsCAAValid.
- Replace validationTime, localValidationTime, remoteValidationTime,
remoteValidationFailures, caaCheckTime, localCAACheckTime,
remoteCAACheckTime, and remoteCAACheckFailures with validationLatency
This commit is contained in:
Samantha Frank 2024-11-15 15:51:39 -05:00 committed by GitHub
parent f9fb688427
commit 3baac6f6df
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 185 additions and 165 deletions

View File

@ -11,7 +11,6 @@ import (
"time"
"github.com/miekg/dns"
"github.com/prometheus/client_golang/prometheus"
"github.com/letsencrypt/boulder/bdns"
"github.com/letsencrypt/boulder/canceled"
@ -42,19 +41,46 @@ func (va *ValidationAuthorityImpl) IsCAAValid(ctx context.Context, req *vapb.IsC
Requester: req.AccountURIID,
Hostname: req.Domain,
}
checkStartTime := va.clk.Now()
validationMethod := core.AcmeChallenge(req.ValidationMethod)
if !validationMethod.IsValid() {
challType := core.AcmeChallenge(req.ValidationMethod)
if !challType.IsValid() {
return nil, berrors.InternalServerError("unrecognized validation method %q", req.ValidationMethod)
}
acmeID := identifier.NewDNS(req.Domain)
params := &caaParams{
accountURIID: req.AccountURIID,
validationMethod: validationMethod,
validationMethod: challType,
}
var prob *probs.ProblemDetails
var internalErr error
var localLatency time.Duration
start := va.clk.Now()
defer func() {
probType := ""
outcome := fail
if prob != nil {
// CAA check failed.
probType = string(prob.Type)
logEvent.Error = prob.Error()
} else {
// CAA check passed.
outcome = pass
}
// Observe local check latency (primary|remote).
va.observeLatency(opCAA, va.perspective, string(challType), probType, outcome, localLatency)
if va.isPrimaryVA() {
// Observe total check latency (primary+remote).
va.observeLatency(opCAA, allPerspectives, string(challType), probType, outcome, va.clk.Since(start))
}
// Log the total check latency.
logEvent.ValidationLatency = va.clk.Since(start).Round(time.Millisecond).Seconds()
va.log.AuditObject("CAA check result", logEvent)
}()
var remoteCAAResults chan *remoteVAResult
if features.Get().EnforceMultiCAA {
if remoteVACount := len(va.remoteVAs); remoteVACount > 0 {
@ -63,16 +89,10 @@ func (va *ValidationAuthorityImpl) IsCAAValid(ctx context.Context, req *vapb.IsC
}
}
checkResult := "success"
err := va.checkCAA(ctx, acmeID, params)
localCheckLatency := time.Since(checkStartTime)
var prob *probs.ProblemDetails
if err != nil {
prob = detailedError(err)
logEvent.Error = prob.Error()
logEvent.InternalError = err.Error()
internalErr = va.checkCAA(ctx, acmeID, params)
if internalErr != nil {
prob = detailedError(internalErr)
prob.Detail = fmt.Sprintf("While processing CAA for %s: %s", req.Domain, prob.Detail)
checkResult = "failure"
} else if remoteCAAResults != nil {
if !features.Get().EnforceMultiCAA && features.Get().MultiCAAFullResults {
// If we're not going to enforce multi CAA but we are logging the
@ -82,40 +102,24 @@ func (va *ValidationAuthorityImpl) IsCAAValid(ctx context.Context, req *vapb.IsC
_ = va.processRemoteCAAResults(
req.Domain,
req.AccountURIID,
string(validationMethod),
string(challType),
remoteCAAResults)
}()
} else if features.Get().EnforceMultiCAA {
remoteProb := va.processRemoteCAAResults(
req.Domain,
req.AccountURIID,
string(validationMethod),
string(challType),
remoteCAAResults)
// If the remote result was a non-nil problem then fail the CAA check
if remoteProb != nil {
prob = remoteProb
// We only set .Error here, not InternalError, because the remote VA doesn't send
// us the internal error. But that's okay, because it got logged at the remote VA.
logEvent.Error = remoteProb.Error()
checkResult = "failure"
va.log.Infof("CAA check failed due to remote failures: identifier=%v err=%s",
req.Domain, remoteProb)
va.metrics.remoteCAACheckFailures.Inc()
}
}
}
checkLatency := time.Since(checkStartTime)
logEvent.ValidationLatency = checkLatency.Round(time.Millisecond).Seconds()
va.metrics.localCAACheckTime.With(prometheus.Labels{
"result": checkResult,
}).Observe(localCheckLatency.Seconds())
va.metrics.caaCheckTime.With(prometheus.Labels{
"result": checkResult,
}).Observe(checkLatency.Seconds())
va.log.AuditObject("CAA check result", logEvent)
if prob != nil {
// The ProblemDetails will be serialized through gRPC, which requires UTF-8.
@ -154,15 +158,6 @@ func (va *ValidationAuthorityImpl) processRemoteCAAResults(
challengeType string,
remoteResultsChan <-chan *remoteVAResult) *probs.ProblemDetails {
state := "failure"
start := va.clk.Now()
defer func() {
va.metrics.remoteCAACheckTime.With(prometheus.Labels{
"result": state,
}).Observe(va.clk.Since(start).Seconds())
}()
required := len(va.remoteVAs) - va.maxRemoteFailures
good := 0
bad := 0
@ -190,7 +185,6 @@ func (va *ValidationAuthorityImpl) processRemoteCAAResults(
// success or failure threshold is met.
if !features.Get().MultiCAAFullResults {
if good >= required {
state = "success"
return nil
} else if bad > va.maxRemoteFailures {
modifiedProblem := *result.Problem
@ -217,7 +211,6 @@ func (va *ValidationAuthorityImpl) processRemoteCAAResults(
// Based on the threshold of good/bad return nil or a problem.
if good >= required {
state = "success"
return nil
} else if bad > va.maxRemoteFailures {
modifiedProblem := *firstProb

View File

@ -9,6 +9,7 @@ import (
"testing"
"github.com/miekg/dns"
"github.com/prometheus/client_golang/prometheus"
"github.com/letsencrypt/boulder/bdns"
"github.com/letsencrypt/boulder/core"
@ -676,6 +677,7 @@ func TestMultiCAARechecking(t *testing.T) {
expectedProbSubstring string
expectedProbType probs.ProblemType
expectedDiffLogSubstring string
expectedLabels prometheus.Labels
localDNSClient bdns.Client
}{
{
@ -687,6 +689,13 @@ func TestMultiCAARechecking(t *testing.T) {
{remoteVA, remoteUA},
{remoteVA, remoteUA},
},
expectedLabels: prometheus.Labels{
"operation": opCAA,
"perspective": allPerspectives,
"challenge_type": string(core.ChallengeTypeDNS01),
"problem_type": "",
"result": pass,
},
},
{
name: "broken localVA, RVAs functional, no CAA records",
@ -699,6 +708,13 @@ func TestMultiCAARechecking(t *testing.T) {
{remoteVA, remoteUA},
{remoteVA, remoteUA},
},
expectedLabels: prometheus.Labels{
"operation": opCAA,
"perspective": allPerspectives,
"challenge_type": string(core.ChallengeTypeDNS01),
"problem_type": string(probs.DNSProblem),
"result": fail,
},
},
{
name: "functional localVA, 1 broken RVA, no CAA records",
@ -712,6 +728,13 @@ func TestMultiCAARechecking(t *testing.T) {
{remoteVA, remoteUA},
{remoteVA, remoteUA},
},
expectedLabels: prometheus.Labels{
"operation": opCAA,
"perspective": allPerspectives,
"challenge_type": string(core.ChallengeTypeDNS01),
"problem_type": string(probs.DNSProblem),
"result": fail,
},
},
{
name: "functional localVA, all broken RVAs, no CAA records",
@ -725,6 +748,13 @@ func TestMultiCAARechecking(t *testing.T) {
{brokenVA, brokenUA},
{brokenVA, brokenUA},
},
expectedLabels: prometheus.Labels{
"operation": opCAA,
"perspective": allPerspectives,
"challenge_type": string(core.ChallengeTypeDNS01),
"problem_type": string(probs.DNSProblem),
"result": fail,
},
},
{
name: "all VAs functional, CAA issue type present",
@ -735,6 +765,13 @@ func TestMultiCAARechecking(t *testing.T) {
{remoteVA, remoteUA},
{remoteVA, remoteUA},
},
expectedLabels: prometheus.Labels{
"operation": opCAA,
"perspective": allPerspectives,
"challenge_type": string(core.ChallengeTypeDNS01),
"problem_type": "",
"result": pass,
},
},
{
name: "functional localVA, 1 broken RVA, CAA issue type present",
@ -748,6 +785,13 @@ func TestMultiCAARechecking(t *testing.T) {
{remoteVA, remoteUA},
{remoteVA, remoteUA},
},
expectedLabels: prometheus.Labels{
"operation": opCAA,
"perspective": allPerspectives,
"challenge_type": string(core.ChallengeTypeDNS01),
"problem_type": string(probs.DNSProblem),
"result": fail,
},
},
{
name: "functional localVA, all broken RVAs, CAA issue type present",
@ -761,6 +805,13 @@ func TestMultiCAARechecking(t *testing.T) {
{brokenVA, brokenUA},
{brokenVA, brokenUA},
},
expectedLabels: prometheus.Labels{
"operation": opCAA,
"perspective": allPerspectives,
"challenge_type": string(core.ChallengeTypeDNS01),
"problem_type": string(probs.DNSProblem),
"result": fail,
},
},
{
// The localVA kicks off the background goroutines before doing its
@ -954,6 +1005,10 @@ func TestMultiCAARechecking(t *testing.T) {
} else {
test.AssertEquals(t, len(gotAnyRemoteFailures), 0)
}
if tc.expectedLabels != nil {
test.AssertMetricWithLabelsEquals(t, va.metrics.validationLatency, tc.expectedLabels, 1)
}
})
}
}

View File

@ -28,10 +28,12 @@ func TestDNSValidationEmpty(t *testing.T) {
test.AssertEquals(t, res.Problems.ProblemType, "unauthorized")
test.AssertEquals(t, res.Problems.Detail, "No TXT record found at _acme-challenge.empty-txts.com")
test.AssertMetricWithLabelsEquals(t, va.metrics.validationTime, prometheus.Labels{
"type": "dns-01",
"result": "invalid",
"problem_type": "unauthorized",
test.AssertMetricWithLabelsEquals(t, va.metrics.validationLatency, prometheus.Labels{
"operation": opChallAndCAA,
"perspective": PrimaryPerspective,
"challenge_type": string(core.ChallengeTypeDNS01),
"problem_type": string(probs.UnauthorizedProblem),
"result": fail,
}, 1)
}

175
va/va.go
View File

@ -31,7 +31,16 @@ import (
vapb "github.com/letsencrypt/boulder/va/proto"
)
const PrimaryPerspective = "Primary"
const (
PrimaryPerspective = "Primary"
allPerspectives = "all"
opChallAndCAA = "challenge+caa"
opCAA = "caa"
pass = "pass"
fail = "fail"
)
var (
// badTLSHeader contains the string 'HTTP /' which is returned when
@ -83,14 +92,14 @@ type RemoteVA struct {
}
type vaMetrics struct {
validationTime *prometheus.HistogramVec
localValidationTime *prometheus.HistogramVec
remoteValidationTime *prometheus.HistogramVec
remoteValidationFailures prometheus.Counter
caaCheckTime *prometheus.HistogramVec
localCAACheckTime *prometheus.HistogramVec
remoteCAACheckTime *prometheus.HistogramVec
remoteCAACheckFailures prometheus.Counter
// validationLatency is a histogram of the latency to perform validations
// from the primary and remote VA perspectives. It's labelled by:
// - operation: VA.ValidateChallenge or VA.CheckCAA as [challenge|caa|challenge+caa]
// - perspective: ValidationAuthorityImpl.perspective
// - challenge_type: core.Challenge.Type
// - problem_type: probs.ProblemType
// - result: the result of the validation as [pass|fail]
validationLatency *prometheus.HistogramVec
prospectiveRemoteCAACheckFailures prometheus.Counter
tlsALPNOIDCounter *prometheus.CounterVec
http01Fallbacks prometheus.Counter
@ -100,66 +109,15 @@ type vaMetrics struct {
}
func initMetrics(stats prometheus.Registerer) *vaMetrics {
validationTime := prometheus.NewHistogramVec(
validationLatency := prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "validation_time",
Help: "Total time taken to validate a challenge and aggregate results",
Name: "validation_latency",
Help: "Histogram of the latency to perform validations from the primary and remote VA perspectives",
Buckets: metrics.InternetFacingBuckets,
},
[]string{"type", "result", "problem_type"})
stats.MustRegister(validationTime)
localValidationTime := prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "local_validation_time",
Help: "Time taken to locally validate a challenge",
Buckets: metrics.InternetFacingBuckets,
},
[]string{"type", "result"})
stats.MustRegister(localValidationTime)
remoteValidationTime := prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "remote_validation_time",
Help: "Time taken to remotely validate a challenge",
Buckets: metrics.InternetFacingBuckets,
},
[]string{"type"})
stats.MustRegister(remoteValidationTime)
remoteValidationFailures := prometheus.NewCounter(
prometheus.CounterOpts{
Name: "remote_validation_failures",
Help: "Number of validations failed due to remote VAs returning failure when consensus is enforced",
})
stats.MustRegister(remoteValidationFailures)
caaCheckTime := prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "caa_check_time",
Help: "Total time taken to check CAA records and aggregate results",
Buckets: metrics.InternetFacingBuckets,
},
[]string{"result"})
stats.MustRegister(caaCheckTime)
localCAACheckTime := prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "caa_check_time_local",
Help: "Time taken to locally check CAA records",
Buckets: metrics.InternetFacingBuckets,
},
[]string{"result"})
stats.MustRegister(localCAACheckTime)
remoteCAACheckTime := prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "caa_check_time_remote",
Help: "Time taken to remotely check CAA records",
Buckets: metrics.InternetFacingBuckets,
},
[]string{"result"})
stats.MustRegister(remoteCAACheckTime)
remoteCAACheckFailures := prometheus.NewCounter(
prometheus.CounterOpts{
Name: "remote_caa_check_failures",
Help: "Number of CAA checks failed due to remote VAs returning failure when consensus is enforced",
})
stats.MustRegister(remoteCAACheckFailures)
[]string{"operation", "perspective", "challenge_type", "problem_type", "result"},
)
stats.MustRegister(validationLatency)
prospectiveRemoteCAACheckFailures := prometheus.NewCounter(
prometheus.CounterOpts{
Name: "prospective_remote_caa_check_failures",
@ -198,14 +156,7 @@ func initMetrics(stats prometheus.Registerer) *vaMetrics {
stats.MustRegister(ipv4FallbackCounter)
return &vaMetrics{
validationTime: validationTime,
remoteValidationTime: remoteValidationTime,
localValidationTime: localValidationTime,
remoteValidationFailures: remoteValidationFailures,
caaCheckTime: caaCheckTime,
localCAACheckTime: localCAACheckTime,
remoteCAACheckTime: remoteCAACheckTime,
remoteCAACheckFailures: remoteCAACheckFailures,
validationLatency: validationLatency,
prospectiveRemoteCAACheckFailures: prospectiveRemoteCAACheckFailures,
tlsALPNOIDCounter: tlsALPNOIDCounter,
http01Fallbacks: http01Fallbacks,
@ -426,6 +377,11 @@ func detailedError(err error) *probs.ProblemDetails {
return probs.Connection("Error getting validation data")
}
// isPrimaryVA returns true if the VA is the primary validation perspective.
func (va *ValidationAuthorityImpl) isPrimaryVA() bool {
return va.perspective == PrimaryPerspective
}
// validateChallenge simply passes through to the appropriate validation method
// depending on the challenge type.
func (va *ValidationAuthorityImpl) validateChallenge(
@ -449,6 +405,25 @@ func (va *ValidationAuthorityImpl) validateChallenge(
return nil, berrors.MalformedError("invalid challenge type %s", kind)
}
// observeLatency records entries in the validationLatency histogram of the
// latency to perform validations from the primary and remote VA perspectives.
// The labels are:
// - operation: VA.ValidateChallenge or VA.CheckCAA as [challenge|caa]
// - perspective: [ValidationAuthorityImpl.perspective|all]
// - challenge_type: core.Challenge.Type
// - problem_type: probs.ProblemType
// - result: the result of the validation as [pass|fail]
func (va *ValidationAuthorityImpl) observeLatency(op, perspective, challType, probType, result string, latency time.Duration) {
labels := prometheus.Labels{
"operation": op,
"perspective": perspective,
"challenge_type": challType,
"problem_type": probType,
"result": result,
}
va.metrics.validationLatency.With(labels).Observe(latency.Seconds())
}
// performRemoteValidation coordinates the whole process of kicking off and
// collecting results from calls to remote VAs' PerformValidation function. It
// returns a problem if too many remote perspectives failed to corroborate
@ -463,13 +438,6 @@ func (va *ValidationAuthorityImpl) performRemoteValidation(
return nil
}
start := va.clk.Now()
defer func() {
va.metrics.remoteValidationTime.With(prometheus.Labels{
"type": req.Challenge.Type,
}).Observe(va.clk.Since(start).Seconds())
}()
type response struct {
addr string
result *vapb.ValidationResult
@ -490,8 +458,6 @@ func (va *ValidationAuthorityImpl) performRemoteValidation(
required := remoteVACount - va.maxRemoteFailures
var passed []string
// failed contains a list of perspectives that failed to validate the domain
// or the addresses of remote VAs that failed to respond.
var failed []string
var firstProb *probs.ProblemDetails
@ -534,7 +500,6 @@ func (va *ValidationAuthorityImpl) performRemoteValidation(
}
if len(failed) > va.maxRemoteFailures {
// Too many failed responses to reach quorum.
va.metrics.remoteValidationFailures.Inc()
firstProb.Detail = fmt.Sprintf("During secondary validation: %s", firstProb.Detail)
return firstProb
}
@ -649,12 +614,12 @@ func (va *ValidationAuthorityImpl) PerformValidation(ctx context.Context, req *v
return nil, berrors.InternalServerError("Incomplete validation request")
}
challenge, err := bgrpc.PBToChallenge(req.Challenge)
chall, err := bgrpc.PBToChallenge(req.Challenge)
if err != nil {
return nil, errors.New("challenge failed to deserialize")
}
err = challenge.CheckPending()
err = chall.CheckPending()
if err != nil {
return nil, berrors.MalformedError("challenge failed consistency check: %s", err)
}
@ -664,36 +629,34 @@ func (va *ValidationAuthorityImpl) PerformValidation(ctx context.Context, req *v
// `prob`, or this will fail.
var prob *probs.ProblemDetails
var localLatency time.Duration
vStart := va.clk.Now()
start := va.clk.Now()
logEvent := verificationRequestEvent{
ID: req.Authz.Id,
Requester: req.Authz.RegID,
Hostname: req.DnsName,
Challenge: challenge,
Challenge: chall,
}
defer func() {
problemType := ""
probType := ""
outcome := fail
if prob != nil {
problemType = string(prob.Type)
probType = string(prob.Type)
logEvent.Error = prob.Error()
logEvent.Challenge.Error = prob
logEvent.Challenge.Status = core.StatusInvalid
} else {
logEvent.Challenge.Status = core.StatusValid
outcome = pass
}
// Observe local validation latency (primary|remote).
va.observeLatency(opChallAndCAA, va.perspective, string(chall.Type), probType, outcome, localLatency)
if va.isPrimaryVA() {
// Observe total validation latency (primary+remote).
va.observeLatency(opChallAndCAA, allPerspectives, string(chall.Type), probType, outcome, va.clk.Since(start))
}
va.metrics.localValidationTime.With(prometheus.Labels{
"type": string(logEvent.Challenge.Type),
"result": string(logEvent.Challenge.Status),
}).Observe(localLatency.Seconds())
va.metrics.validationTime.With(prometheus.Labels{
"type": string(logEvent.Challenge.Type),
"result": string(logEvent.Challenge.Status),
"problem_type": problemType,
}).Observe(time.Since(vStart).Seconds())
logEvent.ValidationLatency = time.Since(vStart).Round(time.Millisecond).Seconds()
// Log the total validation latency.
logEvent.ValidationLatency = time.Since(start).Round(time.Millisecond).Seconds()
va.log.AuditObject("Validation result", logEvent)
}()
@ -705,10 +668,12 @@ func (va *ValidationAuthorityImpl) PerformValidation(ctx context.Context, req *v
ctx,
identifier.NewDNS(req.DnsName),
req.Authz.RegID,
challenge.Type,
challenge.Token,
chall.Type,
chall.Token,
req.ExpectedKeyAuthorization)
localLatency = time.Since(vStart)
// Stop the clock for local validation latency.
localLatency = va.clk.Since(start)
// Check for malformed ValidationRecords
logEvent.Challenge.ValidationRecord = records

View File

@ -257,11 +257,12 @@ func TestPerformValidationInvalid(t *testing.T) {
req := createValidationRequest("foo.com", core.ChallengeTypeDNS01)
res, _ := va.PerformValidation(context.Background(), req)
test.Assert(t, res.Problems != nil, "validation succeeded")
test.AssertMetricWithLabelsEquals(t, va.metrics.validationTime, prometheus.Labels{
"type": "dns-01",
"result": "invalid",
"problem_type": "unauthorized",
test.AssertMetricWithLabelsEquals(t, va.metrics.validationLatency, prometheus.Labels{
"operation": opChallAndCAA,
"perspective": PrimaryPerspective,
"challenge_type": string(core.ChallengeTypeDNS01),
"problem_type": string(probs.UnauthorizedProblem),
"result": fail,
}, 1)
}
@ -286,10 +287,12 @@ func TestPerformValidationValid(t *testing.T) {
res, _ := va.PerformValidation(context.Background(), req)
test.Assert(t, res.Problems == nil, fmt.Sprintf("validation failed: %#v", res.Problems))
test.AssertMetricWithLabelsEquals(t, va.metrics.validationTime, prometheus.Labels{
"type": "dns-01",
"result": "valid",
"problem_type": "",
test.AssertMetricWithLabelsEquals(t, va.metrics.validationLatency, prometheus.Labels{
"operation": opChallAndCAA,
"perspective": PrimaryPerspective,
"challenge_type": string(core.ChallengeTypeDNS01),
"problem_type": "",
"result": pass,
}, 1)
resultLog := mockLog.GetAllMatching(`Validation result`)
if len(resultLog) != 1 {
@ -311,10 +314,12 @@ func TestPerformValidationWildcard(t *testing.T) {
res, _ := va.PerformValidation(context.Background(), req)
test.Assert(t, res.Problems == nil, fmt.Sprintf("validation failed: %#v", res.Problems))
test.AssertMetricWithLabelsEquals(t, va.metrics.validationTime, prometheus.Labels{
"type": "dns-01",
"result": "valid",
"problem_type": "",
test.AssertMetricWithLabelsEquals(t, va.metrics.validationLatency, prometheus.Labels{
"operation": opChallAndCAA,
"perspective": PrimaryPerspective,
"challenge_type": string(core.ChallengeTypeDNS01),
"problem_type": "",
"result": pass,
}, 1)
resultLog := mockLog.GetAllMatching(`Validation result`)
if len(resultLog) != 1 {