From 077a4e2dc48f64689af0f36809758446f79e4173 Mon Sep 17 00:00:00 2001 From: Samantha Date: Wed, 23 Aug 2023 13:40:23 -0400 Subject: [PATCH] ratelimits: Export override utilization metrics (#7044) Fixes #7036 --- ratelimits/limit.go | 5 +++++ ratelimits/limiter.go | 18 +++++++++++++++++- ratelimits/limiter_test.go | 20 ++++++++++++++++---- 3 files changed, 38 insertions(+), 5 deletions(-) diff --git a/ratelimits/limit.go b/ratelimits/limit.go index b1d685c7e..a7f07bf7f 100644 --- a/ratelimits/limit.go +++ b/ratelimits/limit.go @@ -34,6 +34,10 @@ type limit struct { // bucket to go from empty to full (burst * (period / count)). This is // precomputed to avoid doing the same calculation on every request. burstOffset int64 + + // isOverride is true if this limit is an override limit, false if it is a + // default limit. + isOverride bool } func precomputeLimit(l limit) limit { @@ -131,6 +135,7 @@ func loadAndParseOverrideLimits(path string) (limits, error) { fqdnSet := core.HashNames(domains) id = fmt.Sprintf("%s:%s", regId, fqdnSet) } + v.isOverride = true parsed[bucketKey(name, id)] = precomputeLimit(v) } return parsed, nil diff --git a/ratelimits/limiter.go b/ratelimits/limiter.go index 9c10c9673..8666e8791 100644 --- a/ratelimits/limiter.go +++ b/ratelimits/limiter.go @@ -7,6 +7,7 @@ import ( "time" "github.com/jmhodges/clock" + "github.com/prometheus/client_golang/prometheus" ) // ErrInvalidCost indicates that the cost specified was <= 0. @@ -34,13 +35,15 @@ type Limiter struct { // source is used to store buckets. It must be safe for concurrent use. source source clk clock.Clock + + overrideUsageGauge *prometheus.GaugeVec } // NewLimiter returns a new *Limiter. The provided source must be safe for // concurrent use. The defaults and overrides paths are expected to be paths to // YAML files that contain the default and override limits, respectively. The // overrides file is optional, all other arguments are required. -func NewLimiter(clk clock.Clock, source source, defaults, overrides string) (*Limiter, error) { +func NewLimiter(clk clock.Clock, source source, defaults, overrides string, stats prometheus.Registerer) (*Limiter, error) { limiter := &Limiter{source: source, clk: clk} var err error @@ -60,6 +63,12 @@ func NewLimiter(clk clock.Clock, source source, defaults, overrides string) (*Li return nil, err } + limiter.overrideUsageGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "ratelimits_override_usage", + Help: "Proportion of override limit used, by limit name and client id.", + }, []string{"limit_name", "client_id"}) + stats.MustRegister(limiter.overrideUsageGauge) + return limiter, nil } @@ -160,6 +169,13 @@ func (l *Limiter) Spend(ctx context.Context, name Name, id string, cost int64) ( d := maybeSpend(l.clk, limit, tat, cost) + if limit.isOverride { + // Calculate the current utilization of the override limit for the + // specified client id. + utilization := float64(limit.Burst-d.Remaining) / float64(limit.Burst) + l.overrideUsageGauge.WithLabelValues(nameToString[name], id).Set(utilization) + } + if !d.Allowed { return d, nil } diff --git a/ratelimits/limiter_test.go b/ratelimits/limiter_test.go index 9eed1128f..b0cce5826 100644 --- a/ratelimits/limiter_test.go +++ b/ratelimits/limiter_test.go @@ -8,7 +8,9 @@ import ( "time" "github.com/jmhodges/clock" + "github.com/letsencrypt/boulder/metrics" "github.com/letsencrypt/boulder/test" + "github.com/prometheus/client_golang/prometheus" ) // tenZeroZeroTwo is overridden in 'testdata/working_override.yml' to have @@ -19,7 +21,7 @@ const tenZeroZeroTwo = "10.0.0.2" // - 'NewRegistrationsPerIPAddress' burst: 20 count: 20 period: 1s // - 'NewRegistrationsPerIPAddress:10.0.0.2' burst: 40 count: 40 period: 1s func newTestLimiter(t *testing.T, s source, clk clock.FakeClock) *Limiter { - l, err := NewLimiter(clk, s, "testdata/working_default.yml", "testdata/working_override.yml") + l, err := NewLimiter(clk, s, "testdata/working_default.yml", "testdata/working_override.yml", metrics.NoopRegisterer) test.AssertNotError(t, err, "should not error") return l } @@ -44,16 +46,16 @@ func setup(t *testing.T) (context.Context, map[string]*Limiter, clock.FakeClock, func Test_Limiter_WithBadLimitsPath(t *testing.T) { t.Parallel() - _, err := NewLimiter(clock.NewFake(), newInmem(), "testdata/does-not-exist.yml", "") + _, err := NewLimiter(clock.NewFake(), newInmem(), "testdata/does-not-exist.yml", "", metrics.NoopRegisterer) test.AssertError(t, err, "should error") - _, err = NewLimiter(clock.NewFake(), newInmem(), "testdata/defaults.yml", "testdata/does-not-exist.yml") + _, err = NewLimiter(clock.NewFake(), newInmem(), "testdata/defaults.yml", "testdata/does-not-exist.yml", metrics.NoopRegisterer) test.AssertError(t, err, "should error") } func Test_Limiter_getLimitNoExist(t *testing.T) { t.Parallel() - l, err := NewLimiter(clock.NewFake(), newInmem(), "testdata/working_default.yml", "") + l, err := NewLimiter(clock.NewFake(), newInmem(), "testdata/working_default.yml", "", metrics.NoopRegisterer) test.AssertNotError(t, err, "should not error") _, err = l.getLimit(Name(9999), "") test.AssertError(t, err, "should error") @@ -76,6 +78,11 @@ func Test_Limiter_CheckWithLimitOverrides(t *testing.T) { testCtx, limiters, clk, _ := setup(t) for name, l := range limiters { t.Run(name, func(t *testing.T) { + // Verify our overrideUsageGauge is being set correctly. 0.0 == 0% of + // the bucket has been consumed. + test.AssertMetricWithLabelsEquals(t, l.overrideUsageGauge, prometheus.Labels{ + "limit_name": nameToString[NewRegistrationsPerIPAddress], "client_id": tenZeroZeroTwo}, 0) + // Attempt to check a spend of 41 requests (a cost > the limit burst // capacity), this should fail with a specific error. _, err := l.Check(testCtx, NewRegistrationsPerIPAddress, tenZeroZeroTwo, 41) @@ -98,6 +105,11 @@ func Test_Limiter_CheckWithLimitOverrides(t *testing.T) { test.AssertEquals(t, d.Remaining, int64(0)) test.AssertEquals(t, d.ResetIn, time.Second) + // Verify our overrideUsageGauge is being set correctly. 1.0 == 100% of + // the bucket has been consumed. + test.AssertMetricWithLabelsEquals(t, l.overrideUsageGauge, prometheus.Labels{ + "limit_name": nameToString[NewRegistrationsPerIPAddress], "client_id": tenZeroZeroTwo}, 1.0) + // Verify our RetryIn is correct. 1 second == 1000 milliseconds and // 1000/40 = 25 milliseconds per request. test.AssertEquals(t, d.RetryIn, time.Millisecond*25)