From 162458ff528a0ec01ae7e0fa1a9c3089dd05bd6c Mon Sep 17 00:00:00 2001 From: Samantha Date: Mon, 25 Sep 2023 12:10:59 -0400 Subject: [PATCH] RA: Use the same key/id as the override (#7098) Fix the overrideUsageGauge implementation added in #7076 that resulted in metric cardinality explosion. --- ra/ra.go | 38 ++++++++++++++++++-------------------- ra/ra_test.go | 16 ++++++++-------- ratelimit/rate-limits.go | 22 ++++++++++++---------- 3 files changed, 38 insertions(+), 38 deletions(-) diff --git a/ra/ra.go b/ra/ra.go index 8000e6ad0..1b757a4f9 100644 --- a/ra/ra.go +++ b/ra/ra.go @@ -385,11 +385,11 @@ func (ra *RegistrationAuthorityImpl) checkRegistrationIPLimit(ctx context.Contex return err } - threshold, isOverridden := limit.GetThreshold(ip.String(), noRegistrationID) - if isOverridden { + threshold, overrideKey := limit.GetThreshold(ip.String(), noRegistrationID) + if overrideKey != "" { // We do not support overrides for the NewRegistrationsPerIPRange limit. utilization := float64(count.Count) / float64(threshold) - ra.rlOverrideUsageGauge.WithLabelValues(ratelimit.RegistrationsPerIP, ip.String()).Set(utilization) + ra.rlOverrideUsageGauge.WithLabelValues(ratelimit.RegistrationsPerIP, overrideKey).Set(utilization) } if count.Count >= threshold { @@ -582,7 +582,7 @@ func (ra *RegistrationAuthorityImpl) validateContacts(contacts []string) error { func (ra *RegistrationAuthorityImpl) checkPendingAuthorizationLimit(ctx context.Context, regID int64, limit ratelimit.RateLimitPolicy) error { // This rate limit's threshold can only be overridden on a per-regID basis, // not based on any other key. - threshold, isOverridden := limit.GetThreshold("", regID) + threshold, overrideKey := limit.GetThreshold("", regID) if threshold == -1 { return nil } @@ -592,9 +592,9 @@ func (ra *RegistrationAuthorityImpl) checkPendingAuthorizationLimit(ctx context. if err != nil { return err } - if isOverridden { + if overrideKey != "" { utilization := float64(countPB.Count) / float64(threshold) - ra.rlOverrideUsageGauge.WithLabelValues(ratelimit.PendingAuthorizationsPerAccount, strconv.FormatInt(regID, 10)).Set(utilization) + ra.rlOverrideUsageGauge.WithLabelValues(ratelimit.PendingAuthorizationsPerAccount, overrideKey).Set(utilization) } if countPB.Count >= threshold { ra.log.Infof("Rate limit exceeded, PendingAuthorizationsByRegID, regID: %d", regID) @@ -642,10 +642,10 @@ func (ra *RegistrationAuthorityImpl) checkInvalidAuthorizationLimit(ctx context. // Most rate limits have a key for overrides, but there is no meaningful key // here. noKey := "" - threshold, isOverridden := limit.GetThreshold(noKey, regID) - if isOverridden { + threshold, overrideKey := limit.GetThreshold(noKey, regID) + if overrideKey != "" { utilization := float64(count.Count) / float64(threshold) - ra.rlOverrideUsageGauge.WithLabelValues(ratelimit.InvalidAuthorizationsPerAccount, strconv.FormatInt(regID, 10)).Set(utilization) + ra.rlOverrideUsageGauge.WithLabelValues(ratelimit.InvalidAuthorizationsPerAccount, overrideKey).Set(utilization) } if count.Count >= threshold { ra.log.Infof("Rate limit exceeded, InvalidAuthorizationsByRegID, regID: %d", regID) @@ -671,10 +671,10 @@ func (ra *RegistrationAuthorityImpl) checkNewOrdersPerAccountLimit(ctx context.C } // There is no meaningful override key to use for this rate limit noKey := "" - threshold, isOverridden := limit.GetThreshold(noKey, acctID) - if isOverridden { + threshold, overrideKey := limit.GetThreshold(noKey, acctID) + if overrideKey != "" { utilization := float64(count.Count) / float64(threshold) - ra.rlOverrideUsageGauge.WithLabelValues(ratelimit.NewOrdersPerAccount, strconv.FormatInt(acctID, 10)).Set(utilization) + ra.rlOverrideUsageGauge.WithLabelValues(ratelimit.NewOrdersPerAccount, overrideKey).Set(utilization) } if count.Count >= threshold { @@ -1413,11 +1413,10 @@ func (ra *RegistrationAuthorityImpl) enforceNameCounts(ctx context.Context, name // over the names slice input to ensure the order of badNames will // return the badNames in the same order they were input. for _, name := range names { - threshold, isOverridden := limit.GetThreshold(name, regID) - if isOverridden { - clientId := fmt.Sprintf("%d:%s", regID, name) + threshold, overrideKey := limit.GetThreshold(name, regID) + if overrideKey != "" { utilization := float64(response.Counts[name]) / float64(threshold) - ra.rlOverrideUsageGauge.WithLabelValues(ratelimit.CertificatesPerName, clientId).Set(utilization) + ra.rlOverrideUsageGauge.WithLabelValues(ratelimit.CertificatesPerName, overrideKey).Set(utilization) } if response.Counts[name] >= threshold { badNames = append(badNames, name) @@ -1470,7 +1469,7 @@ func (ra *RegistrationAuthorityImpl) checkCertificatesPerNameLimit(ctx context.C func (ra *RegistrationAuthorityImpl) checkCertificatesPerFQDNSetLimit(ctx context.Context, names []string, limit ratelimit.RateLimitPolicy, regID int64) error { names = core.UniqueLowerNames(names) - threshold, isOverridden := limit.GetThreshold(strings.Join(names, ","), regID) + threshold, overrideKey := limit.GetThreshold(strings.Join(names, ","), regID) if threshold <= 0 { // No limit configured. return nil @@ -1484,10 +1483,9 @@ func (ra *RegistrationAuthorityImpl) checkCertificatesPerFQDNSetLimit(ctx contex return fmt.Errorf("checking duplicate certificate limit for %q: %s", names, err) } - if isOverridden { - clientId := fmt.Sprintf("%d:%s", regID, strings.Join(names, ",")) + if overrideKey != "" { utilization := float64(len(prevIssuances.TimestampsNS)) / float64(threshold) - ra.rlOverrideUsageGauge.WithLabelValues(ratelimit.CertificatesPerFQDNSet, clientId).Set(utilization) + ra.rlOverrideUsageGauge.WithLabelValues(ratelimit.CertificatesPerFQDNSet, overrideKey).Set(utilization) } if int64(len(prevIssuances.TimestampsNS)) < threshold { diff --git a/ra/ra_test.go b/ra/ra_test.go index a25a87bc5..4c1217a01 100644 --- a/ra/ra_test.go +++ b/ra/ra_test.go @@ -1235,9 +1235,9 @@ func TestCheckCertificatesPerNameLimit(t *testing.T) { err = ra.checkCertificatesPerNameLimit(ctx, []string{"www.example.com", "example.com", "good-example.com"}, rlp, 99) test.AssertError(t, err, "incorrectly failed to rate limit example.com") test.AssertErrorIs(t, err, berrors.RateLimit) - // There are no overrides for "99:example.com", so the override usage gauge + // There are no overrides for "example.com", so the override usage gauge // should contain 0 entries with labels matching it. - test.AssertMetricWithLabelsEquals(t, ra.rlOverrideUsageGauge, prometheus.Labels{"limit": ratelimit.CertificatesPerName, "override_key": "99:example.com"}, 0) + test.AssertMetricWithLabelsEquals(t, ra.rlOverrideUsageGauge, prometheus.Labels{"limit": ratelimit.CertificatesPerName, "override_key": "example.com"}, 0) // Verify it has no sub errors as there is only one bad name test.AssertEquals(t, err.Error(), "too many certificates already issued for \"example.com\". Retry after 1970-01-01T23:00:00Z: see https://letsencrypt.org/docs/rate-limits/") var bErr *berrors.BoulderError @@ -1265,10 +1265,10 @@ func TestCheckCertificatesPerNameLimit(t *testing.T) { mockSA.nameCounts.Counts["bigissuer.com"] = 50 err = ra.checkCertificatesPerNameLimit(ctx, []string{"www.example.com", "subdomain.bigissuer.com"}, rlp, 99) test.AssertNotError(t, err, "incorrectly rate limited bigissuer") - // "99:bigissuer.com" has an override of 100 and they've issued 50. We do + // "bigissuer.com" has an override of 100 and they've issued 50. We do // not count issuance that has yet to occur, so we expect to see 50% // utilization. - test.AssertMetricWithLabelsEquals(t, ra.rlOverrideUsageGauge, prometheus.Labels{"limit": ratelimit.CertificatesPerName, "override_key": "99:bigissuer.com"}, .5) + test.AssertMetricWithLabelsEquals(t, ra.rlOverrideUsageGauge, prometheus.Labels{"limit": ratelimit.CertificatesPerName, "override_key": "bigissuer.com"}, .5) // Two base domains, one above its override mockSA.nameCounts.Counts["example.com"] = 10 @@ -1276,18 +1276,18 @@ func TestCheckCertificatesPerNameLimit(t *testing.T) { err = ra.checkCertificatesPerNameLimit(ctx, []string{"www.example.com", "subdomain.bigissuer.com"}, rlp, 99) test.AssertError(t, err, "incorrectly failed to rate limit bigissuer") test.AssertErrorIs(t, err, berrors.RateLimit) - // "99:bigissuer.com" has an override of 100 and they've issued 100. So we + // "bigissuer.com" has an override of 100 and they've issued 100. So we // expect to see 100% utilization. - test.AssertMetricWithLabelsEquals(t, ra.rlOverrideUsageGauge, prometheus.Labels{"limit": ratelimit.CertificatesPerName, "override_key": "99:bigissuer.com"}, 1) + test.AssertMetricWithLabelsEquals(t, ra.rlOverrideUsageGauge, prometheus.Labels{"limit": ratelimit.CertificatesPerName, "override_key": "bigissuer.com"}, 1) // One base domain, above its override (which is below threshold) mockSA.nameCounts.Counts["smallissuer.co.uk"] = 1 err = ra.checkCertificatesPerNameLimit(ctx, []string{"www.smallissuer.co.uk"}, rlp, 99) test.AssertError(t, err, "incorrectly failed to rate limit smallissuer") test.AssertErrorIs(t, err, berrors.RateLimit) - // "99:smallissuer.co.uk" has an override of 1 and they've issued 1. So we + // "smallissuer.co.uk" has an override of 1 and they've issued 1. So we // expect to see 100% utilization. - test.AssertMetricWithLabelsEquals(t, ra.rlOverrideUsageGauge, prometheus.Labels{"limit": ratelimit.CertificatesPerName, "override_key": "99:smallissuer.co.uk"}, 1) + test.AssertMetricWithLabelsEquals(t, ra.rlOverrideUsageGauge, prometheus.Labels{"limit": ratelimit.CertificatesPerName, "override_key": "smallissuer.co.uk"}, 1) } // TestCheckExactCertificateLimit tests that the duplicate certificate limit diff --git a/ratelimit/rate-limits.go b/ratelimit/rate-limits.go index 0de83a1e3..0d52801d5 100644 --- a/ratelimit/rate-limits.go +++ b/ratelimit/rate-limits.go @@ -1,6 +1,7 @@ package ratelimit import ( + "strconv" "sync" "time" @@ -223,32 +224,33 @@ func (rlp *RateLimitPolicy) Enabled() bool { return rlp.Threshold != 0 } -// GetThreshold returns the threshold for this rate limit and true if that -// threshold is an override for the default limit, false otherwise. The -// threshold returned takes into account any overrides for `key` or `regID`. If -// both `key` and `regID` have an override the largest of the two will be used. -func (rlp *RateLimitPolicy) GetThreshold(key string, regID int64) (int64, bool) { +// GetThreshold returns the threshold for this rate limit and the override +// Id/Key if that threshold is the result of an override for the default limit, +// empty-string otherwise. The threshold returned takes into account any +// overrides for `key` or `regID`. If both `key` and `regID` have an override +// the largest of the two will be used. +func (rlp *RateLimitPolicy) GetThreshold(key string, regID int64) (int64, string) { regOverride, regOverrideExists := rlp.RegistrationOverrides[regID] keyOverride, keyOverrideExists := rlp.Overrides[key] if regOverrideExists && !keyOverrideExists { // If there is a regOverride and no keyOverride use the regOverride - return regOverride, true + return regOverride, strconv.FormatInt(regID, 10) } else if !regOverrideExists && keyOverrideExists { // If there is a keyOverride and no regOverride use the keyOverride - return keyOverride, true + return keyOverride, key } else if regOverrideExists && keyOverrideExists { // If there is both a regOverride and a keyOverride use whichever is larger. if regOverride > keyOverride { - return regOverride, true + return regOverride, strconv.FormatInt(regID, 10) } else { - return keyOverride, true + return keyOverride, key } } // Otherwise there was no regOverride and no keyOverride, use the base // Threshold - return rlp.Threshold, false + return rlp.Threshold, "" } // WindowBegin returns the time that a RateLimitPolicy's window begins, given a