ratelimits: Add a feature-flag which makes key-value implementation authoritative (#7666)

- Add feature flag `UseKvLimitsForNewOrder`
- Add feature flag `UseKvLimitsForNewAccount`
- Flush all Redis shards before running integration or unit tests, this
avoids false positives between local testing runs

Fixes #7664
Blocked by #7676
This commit is contained in:
Samantha Frank 2024-08-22 15:56:30 -04:00 committed by GitHub
parent c7a04e8e22
commit c9be034c00
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
12 changed files with 204 additions and 102 deletions

View File

@ -104,6 +104,19 @@ type Config struct {
// returned to the Subscriber indicating that the order cannot be processed
// until the paused identifiers are unpaused and the order is resubmitted.
CheckIdentifiersPaused bool
// UseKvLimitsForNewOrder when enabled, causes the key-value rate limiter to
// be the authoritative source of rate limiting information for new-order
// callers and disables the legacy rate limiting checks.
//
// Note: this flag does not disable writes to the certificatesPerName or
// fqdnSets tables at Finalize time.
UseKvLimitsForNewOrder bool
// UseKvLimitsForNewAccount when enabled, causes the key-value rate limiter
// to be the authoritative source of rate limiting information for
// new-account callers and disables the legacy rate limiting checks.
UseKvLimitsForNewAccount bool
}
var fMu = new(sync.RWMutex)

View File

@ -491,9 +491,12 @@ func (ra *RegistrationAuthorityImpl) NewRegistration(ctx context.Context, reques
if err != nil {
return nil, berrors.InternalServerError("failed to unmarshal ip address: %s", err.Error())
}
err = ra.checkRegistrationLimits(ctx, ipAddr)
if err != nil {
return nil, err
if !features.Get().UseKvLimitsForNewAccount {
err = ra.checkRegistrationLimits(ctx, ipAddr)
if err != nil {
return nil, err
}
}
// Check that contacts conform to our expectations.
@ -2007,12 +2010,6 @@ func (ra *RegistrationAuthorityImpl) PerformValidation(
if prob != nil {
challenge.Status = core.StatusInvalid
challenge.Error = prob
// TODO(#5545): Spending can be async until key-value rate limits
// are authoritative. This saves us from adding latency to each
// request. Goroutines spun out below will respect a context
// deadline set by the ratelimits package and cannot be prematurely
// canceled by the requester.
go ra.countFailedValidation(vaCtx, authz.RegistrationID, authz.Identifier.Value)
} else {
challenge.Status = core.StatusValid
@ -2585,7 +2582,7 @@ func (ra *RegistrationAuthorityImpl) NewOrder(ctx context.Context, req *rapb.New
}
// Renewal orders, indicated by ARI, are exempt from NewOrder rate limits.
if !req.IsARIRenewal {
if !req.IsARIRenewal && !features.Get().UseKvLimitsForNewOrder {
// Check if there is rate limit space for issuing a certificate.
err = ra.checkNewOrderLimits(ctx, newOrder.DnsNames, newOrder.RegistrationID, req.IsRenewal)
if err != nil {
@ -2664,7 +2661,7 @@ func (ra *RegistrationAuthorityImpl) NewOrder(ctx context.Context, req *rapb.New
}
// Renewal orders, indicated by ARI, are exempt from NewOrder rate limits.
if len(missingAuthzIdents) > 0 && !req.IsARIRenewal {
if len(missingAuthzIdents) > 0 && !req.IsARIRenewal && !features.Get().UseKvLimitsForNewOrder {
pendingAuthzLimits := ra.rlPolicies.PendingAuthorizationsPerAccount()
if pendingAuthzLimits.Enabled() {
// The order isn't fully authorized we need to check that the client

10
test.sh
View File

@ -19,6 +19,14 @@ UNIT_PACKAGES=()
UNIT_FLAGS=()
FILTER=()
#
# Cleanup Functions
#
function flush_redis() {
go run ./test/boulder-tools/flushredis/main.go
}
#
# Print Functions
#
@ -225,6 +233,7 @@ fi
STAGE="unit"
if [[ "${RUN[@]}" =~ "$STAGE" ]] ; then
print_heading "Running Unit Tests"
flush_redis
run_unit_tests
fi
@ -234,6 +243,7 @@ fi
STAGE="integration"
if [[ "${RUN[@]}" =~ "$STAGE" ]] ; then
print_heading "Running Integration Tests"
flush_redis
python3 test/integration-test.py --chisel --gotest "${FILTER[@]}"
fi

View File

@ -0,0 +1,56 @@
package main
import (
"context"
"fmt"
"os"
"github.com/letsencrypt/boulder/cmd"
blog "github.com/letsencrypt/boulder/log"
"github.com/letsencrypt/boulder/metrics"
bredis "github.com/letsencrypt/boulder/redis"
"github.com/redis/go-redis/v9"
)
func main() {
rc := bredis.Config{
Username: "unittest-rw",
TLS: cmd.TLSConfig{
CACertFile: "test/certs/ipki/minica.pem",
CertFile: "test/certs/ipki/localhost/cert.pem",
KeyFile: "test/certs/ipki/localhost/key.pem",
},
Lookups: []cmd.ServiceDomain{
{
Service: "redisratelimits",
Domain: "service.consul",
},
},
LookupDNSAuthority: "consul.service.consul",
}
rc.PasswordConfig = cmd.PasswordConfig{
PasswordFile: "test/secrets/ratelimits_redis_password",
}
stats := metrics.NoopRegisterer
log := blog.NewMock()
ring, err := bredis.NewRingFromConfig(rc, stats, log)
if err != nil {
fmt.Printf("while constructing ring client: %v\n", err)
os.Exit(1)
}
err = ring.ForEachShard(context.Background(), func(ctx context.Context, shard *redis.Client) error {
cmd := shard.FlushAll(ctx)
_, err := cmd.Result()
if err != nil {
return err
}
return nil
})
if err != nil {
fmt.Printf("while flushing redis shards: %v\n", err)
os.Exit(1)
}
}

View File

@ -130,7 +130,9 @@
},
"features": {
"AsyncFinalize": true,
"CheckRenewalExemptionAtWFE": true
"CheckRenewalExemptionAtWFE": true,
"UseKvLimitsForNewOrder": true,
"UseKvLimitsForNewAccount": true
},
"ctLogs": {
"stagger": "500ms",

View File

@ -5,6 +5,10 @@
ids:
- id: 127.0.0.1
comment: localhost
- id: 10.77.77.77
comment: test
- id: 10.88.88.88
comment: test
- CertificatesPerDomain:
burst: 1
count: 1

View File

@ -130,7 +130,9 @@
"ServeRenewalInfo": true,
"TrackReplacementCertificatesARI": true,
"CheckRenewalExemptionAtWFE": true,
"CheckIdentifiersPaused": true
"CheckIdentifiersPaused": true,
"UseKvLimitsForNewOrder": true,
"UseKvLimitsForNewAccount": true
},
"certProfiles": {
"legacy": "The normal profile you know and love",

View File

@ -188,6 +188,14 @@ func httpSpan(endpoint string, children ...expectedSpans) expectedSpans {
}
}
func redisPipelineSpan(op, service string, children ...expectedSpans) expectedSpans {
return expectedSpans{
Operation: "redis.pipeline " + op,
Service: service,
Children: children,
}
}
// TestTraces tests that all the expected spans are present and properly connected
func TestTraces(t *testing.T) {
t.Parallel()
@ -210,17 +218,19 @@ func TestTraces(t *testing.T) {
{Operation: "/acme/new-nonce", Service: wfe, Children: []expectedSpans{
rpcSpan("nonce.NonceService/Nonce", wfe, "nonce-service")}},
httpSpan("/acme/new-acct",
redisPipelineSpan("get", wfe),
redisPipelineSpan("set", wfe),
rpcSpan("sa.StorageAuthorityReadOnly/KeyBlocked", wfe, sa),
rpcSpan("sa.StorageAuthorityReadOnly/GetRegistrationByKey", wfe, sa),
rpcSpan("ra.RegistrationAuthority/NewRegistration", wfe, ra,
rpcSpan("sa.StorageAuthority/KeyBlocked", ra, sa),
rpcSpan("sa.StorageAuthority/CountRegistrationsByIP", ra, sa),
rpcSpan("sa.StorageAuthority/NewRegistration", ra, sa))),
httpSpan("/acme/new-order",
rpcSpan("sa.StorageAuthorityReadOnly/GetRegistration", wfe, sa),
redisPipelineSpan("get", wfe),
redisPipelineSpan("set", wfe),
rpcSpan("ra.RegistrationAuthority/NewOrder", wfe, ra,
rpcSpan("sa.StorageAuthority/GetOrderForNames", ra, sa),
// 8 ra -> sa rate limit spans omitted here
rpcSpan("sa.StorageAuthority/NewOrderAndAuthzs", ra, sa))),
httpSpan("/acme/authz-v3/",
rpcSpan("sa.StorageAuthorityReadOnly/GetAuthorization2", wfe, sa)),
@ -236,8 +246,10 @@ func TestTraces(t *testing.T) {
rpcSpan("sa.StorageAuthority/GetValidOrderAuthorizations2", ra, sa),
rpcSpan("sa.StorageAuthority/SetOrderProcessing", ra, sa),
rpcSpan("ca.CertificateAuthority/IssuePrecertificate", ra, ca),
redisPipelineSpan("get", ra),
rpcSpan("Publisher/SubmitToSingleCTWithResult", ra, "boulder-publisher"),
rpcSpan("ca.CertificateAuthority/IssueCertificateForPrecertificate", ra, ca),
redisPipelineSpan("set", ra),
rpcSpan("sa.StorageAuthority/FinalizeOrder", ra, sa))),
httpSpan("/acme/order/", rpcSpan("sa.StorageAuthorityReadOnly/GetOrder", wfe, sa)),
httpSpan("/acme/cert/", rpcSpan("sa.StorageAuthorityReadOnly/GetCertificate", wfe, sa)),

View File

@ -3,19 +3,13 @@
package integration
import (
"context"
"crypto/rand"
"encoding/hex"
"fmt"
"os"
"strings"
"testing"
"github.com/jmhodges/clock"
"github.com/letsencrypt/boulder/cmd"
berrors "github.com/letsencrypt/boulder/errors"
blog "github.com/letsencrypt/boulder/log"
"github.com/letsencrypt/boulder/metrics"
"github.com/letsencrypt/boulder/ratelimits"
bredis "github.com/letsencrypt/boulder/redis"
"github.com/letsencrypt/boulder/test"
)
@ -23,6 +17,7 @@ func TestDuplicateFQDNRateLimit(t *testing.T) {
t.Parallel()
domain := random_domain()
// The global rate limit for a duplicate certificates is 2 per 3 hours.
_, err := authAndIssue(nil, nil, []string{domain}, true)
test.AssertNotError(t, err, "Failed to issue first certificate")
@ -33,45 +28,44 @@ func TestDuplicateFQDNRateLimit(t *testing.T) {
test.AssertError(t, err, "Somehow managed to issue third certificate")
if strings.Contains(os.Getenv("BOULDER_CONFIG_DIR"), "test/config-next") {
// Setup rate limiting.
rc := bredis.Config{
Username: "unittest-rw",
TLS: cmd.TLSConfig{
CACertFile: "test/certs/ipki/minica.pem",
CertFile: "test/certs/ipki/localhost/cert.pem",
KeyFile: "test/certs/ipki/localhost/key.pem",
},
Lookups: []cmd.ServiceDomain{
{
Service: "redisratelimits",
Domain: "service.consul",
},
},
LookupDNSAuthority: "consul.service.consul",
}
rc.PasswordConfig = cmd.PasswordConfig{
PasswordFile: "test/secrets/ratelimits_redis_password",
}
fc := clock.New()
stats := metrics.NoopRegisterer
log := blog.NewMock()
ring, err := bredis.NewRingFromConfig(rc, stats, log)
test.AssertNotError(t, err, "making redis ring client")
source := ratelimits.NewRedisSource(ring.Ring, fc, stats)
test.AssertNotNil(t, source, "source should not be nil")
limiter, err := ratelimits.NewLimiter(fc, source, stats)
test.AssertNotError(t, err, "making limiter")
txnBuilder, err := ratelimits.NewTransactionBuilder("test/config-next/wfe2-ratelimit-defaults.yml", "")
test.AssertNotError(t, err, "making transaction composer")
// Check that the CertificatesPerFQDNSet limit is reached.
txns, err := txnBuilder.NewOrderLimitTransactions(1, []string{domain}, false)
test.AssertNotError(t, err, "making transaction")
decision, err := limiter.BatchSpend(context.Background(), txns)
test.AssertNotError(t, err, "checking transaction")
err = decision.Result(fc.Now())
test.AssertErrorIs(t, err, berrors.RateLimit)
// Error should be served from key-value rate limits implementation.
test.AssertContains(t, err.Error(), "too many certificates (2) already issued for this exact set of domains in the last 3h0m0s")
} else {
// Error should be served from legacy rate limits implementation.
test.AssertContains(t, err.Error(), "too many certificates (2) already issued for this exact set of domains in the last 3 hours")
}
}
func TestCertificatesPerDomain(t *testing.T) {
t.Parallel()
randomDomain := random_domain()
randomSubDomain := func() string {
var bytes [3]byte
rand.Read(bytes[:])
return fmt.Sprintf("%s.%s", hex.EncodeToString(bytes[:]), randomDomain)
}
firstSubDomain := randomSubDomain()
_, err := authAndIssue(nil, nil, []string{firstSubDomain}, true)
test.AssertNotError(t, err, "Failed to issue first certificate")
_, err = authAndIssue(nil, nil, []string{randomSubDomain()}, true)
test.AssertNotError(t, err, "Failed to issue second certificate")
_, err = authAndIssue(nil, nil, []string{randomSubDomain()}, true)
test.AssertError(t, err, "Somehow managed to issue third certificate")
if strings.Contains(os.Getenv("BOULDER_CONFIG_DIR"), "test/config-next") {
// Error should be served from key-value rate limits implementation.
test.AssertContains(t, err.Error(), fmt.Sprintf("too many certificates (2) already issued for %q in the last 2160h0m0s", randomDomain))
} else {
// Error should be served from legacy rate limits implementation.
test.AssertContains(t, err.Error(), fmt.Sprintf("too many certificates already issued for %q", randomDomain))
}
// Issue a certificate for the first subdomain, which should succeed because
// it's a renewal.
_, err = authAndIssue(nil, nil, []string{firstSubDomain}, true)
test.AssertNotError(t, err, "Failed to issue renewal certificate")
}

View File

@ -9,7 +9,6 @@ rename-command BGREWRITEAOF ""
rename-command BGSAVE ""
rename-command CONFIG ""
rename-command DEBUG ""
rename-command FLUSHALL ""
rename-command FLUSHDB ""
rename-command KEYS ""
rename-command PEXPIRE ""

View File

@ -1559,12 +1559,14 @@ def test_renewal_exemption():
chisel2.expect_problem("urn:ietf:params:acme:error:rateLimited",
lambda: chisel2.auth_and_issue(["mail." + base_domain]))
# TODO(#5545)
# - Phase 2: Once the new rate limits are authoritative in config-next, ensure
# that this test only runs in config.
# - Phase 3: Once the new rate limits are authoritative in config, remove this
# test entirely.
# TODO(#5545) Remove this test once key-value rate limits are authoritative in
# production.
def test_certificates_per_name():
if CONFIG_NEXT:
# This test is replaced by TestCertificatesPerDomain in the Go
# integration tests because key-value rate limits does not support
# override limits of 0.
return
chisel2.expect_problem("urn:ietf:params:acme:error:rateLimited",
lambda: chisel2.auth_and_issue([random_domain() + ".lim.it"]))

View File

@ -642,36 +642,33 @@ func link(url, relation string) string {
// function is returned that can be called to refund the quota if the account
// creation fails, the func will be nil if any error was encountered during the
// check.
//
// TODO(#5545): For now we're simply exercising the new rate limiter codepath.
// This should eventually return a berrors.RateLimit error containing the retry
// after duration among other information available in the ratelimits.Decision.
func (wfe *WebFrontEndImpl) checkNewAccountLimits(ctx context.Context, ip net.IP) func() {
func (wfe *WebFrontEndImpl) checkNewAccountLimits(ctx context.Context, ip net.IP) (func(), error) {
if wfe.limiter == nil && wfe.txnBuilder == nil {
// Key-value rate limiting is disabled.
return nil
return nil, nil
}
txns, err := wfe.txnBuilder.NewAccountLimitTransactions(ip)
if err != nil {
// TODO(#5545): Once key-value rate limits are authoritative this
// log line should be removed in favor of returning the error.
wfe.log.Infof("building new account limit transactions: %v", err)
return nil
return nil, fmt.Errorf("building new account limit transactions: %w", err)
}
_, err = wfe.limiter.BatchSpend(ctx, txns)
d, err := wfe.limiter.BatchSpend(ctx, txns)
if err != nil {
wfe.log.Warningf("checking newAccount limits: %s", err)
return nil
return nil, fmt.Errorf("spending new account limits: %w", err)
}
err = d.Result(wfe.clk.Now())
if err != nil {
return nil, err
}
return func() {
_, err := wfe.limiter.BatchRefund(ctx, txns)
if err != nil {
wfe.log.Warningf("refunding newAccount limits: %s", err)
wfe.log.Warningf("refunding new account limits: %s", err)
}
}
}, nil
}
// NewAccount is used by clients to submit a new account
@ -796,7 +793,16 @@ func (wfe *WebFrontEndImpl) NewAccount(
InitialIP: ipBytes,
}
refundLimits := wfe.checkNewAccountLimits(ctx, ip)
refundLimits, err := wfe.checkNewAccountLimits(ctx, ip)
if err != nil {
if errors.Is(err, berrors.RateLimit) {
if features.Get().UseKvLimitsForNewAccount {
wfe.sendError(response, logEvent, probs.RateLimited(err.Error()), err)
return
}
}
wfe.log.Warningf(err.Error())
}
var newRegistrationSuccessful bool
var errIsRateLimit bool
@ -2026,37 +2032,33 @@ func (wfe *WebFrontEndImpl) orderToOrderJSON(request *http.Request, order *corep
// encountered during the check, it is logged but not returned. A refund
// function is returned that can be used to refund the quota if the order is not
// created, the func will be nil if any error was encountered during the check.
//
// TODO(#5545): For now we're simply exercising the new rate limiter codepath.
// This should eventually return a berrors.RateLimit error containing the retry
// after duration among other information available in the ratelimits.Decision.
func (wfe *WebFrontEndImpl) checkNewOrderLimits(ctx context.Context, regId int64, names []string, isRenewal bool) func() {
func (wfe *WebFrontEndImpl) checkNewOrderLimits(ctx context.Context, regId int64, names []string, isRenewal bool) (func(), error) {
if wfe.limiter == nil && wfe.txnBuilder == nil {
// Key-value rate limiting is disabled.
return nil
return nil, nil
}
txns, err := wfe.txnBuilder.NewOrderLimitTransactions(regId, names, isRenewal)
if err != nil {
wfe.log.Warningf("building new order limit transactions: %v", err)
return nil
return nil, fmt.Errorf("building new order limit transactions: %w", err)
}
_, err = wfe.limiter.BatchSpend(ctx, txns)
d, err := wfe.limiter.BatchSpend(ctx, txns)
if err != nil {
if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
return nil
}
wfe.log.Warningf("checking newOrder limits: %s", err)
return nil
return nil, fmt.Errorf("spending new order limits: %w", err)
}
err = d.Result(wfe.clk.Now())
if err != nil {
return nil, err
}
return func() {
_, err := wfe.limiter.BatchRefund(ctx, txns)
if err != nil {
wfe.log.Warningf("refunding newOrder limits: %s", err)
wfe.log.Warningf("refunding new order limits: %s", err)
}
}
}, nil
}
// orderMatchesReplacement checks if the order matches the provided certificate
@ -2367,7 +2369,16 @@ func (wfe *WebFrontEndImpl) NewOrder(
return
}
refundLimits := wfe.checkNewOrderLimits(ctx, acct.ID, names, isRenewal)
refundLimits, err := wfe.checkNewOrderLimits(ctx, acct.ID, names, isRenewal)
if err != nil {
if errors.Is(err, berrors.RateLimit) {
if features.Get().UseKvLimitsForNewOrder {
wfe.sendError(response, logEvent, probs.RateLimited(err.Error()), err)
return
}
}
wfe.log.Warningf(err.Error())
}
var newOrderSuccessful bool
var errIsRateLimit bool