bad-key-revoker: Add delay to mitigate race condition (#8301)

Add a `MaxExpectedReplicationLag` parameter to `bad-key-revoker`. Wait
that interval before searching for certificates to revoke.

The interval is set to only 100ms in both `test/config` and
`test/config-next` so that integration tests don't require long sleeps.
The default value within BKR is, and the production value should be,
higher.

Part of #5686
This commit is contained in:
James Renken 2025-07-21 14:18:19 -07:00 committed by GitHub
parent a3c1e62049
commit 04ae9ebcda
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 109 additions and 54 deletions

View File

@ -46,16 +46,17 @@ type revoker interface {
}
type badKeyRevoker struct {
dbMap *db.WrappedMap
maxRevocations int
serialBatchSize int
raClient revoker
logger blog.Logger
clk clock.Clock
backoffIntervalBase time.Duration
backoffIntervalMax time.Duration
backoffFactor float64
backoffTicker int
dbMap *db.WrappedMap
maxRevocations int
serialBatchSize int
raClient revoker
logger blog.Logger
clk clock.Clock
backoffIntervalBase time.Duration
backoffIntervalMax time.Duration
backoffFactor float64
backoffTicker int
maxExpectedReplicationLag time.Duration
}
// uncheckedBlockedKey represents a row in the blockedKeys table
@ -76,8 +77,10 @@ func (bkr *badKeyRevoker) countUncheckedKeys(ctx context.Context) (int, error) {
&count,
`SELECT COUNT(*)
FROM (SELECT 1 FROM blockedKeys
WHERE extantCertificatesChecked = false
WHERE extantCertificatesChecked = false AND added < ? - INTERVAL ? SECOND
LIMIT ?) AS a`,
bkr.clk.Now(),
bkr.maxExpectedReplicationLag.Seconds(),
blockedKeysGaugeLimit,
)
return count, err
@ -90,8 +93,10 @@ func (bkr *badKeyRevoker) selectUncheckedKey(ctx context.Context) (uncheckedBloc
&row,
`SELECT keyHash, revokedBy
FROM blockedKeys
WHERE extantCertificatesChecked = false
WHERE extantCertificatesChecked = false AND added < ? - INTERVAL ? SECOND
LIMIT 1`,
bkr.clk.Now(),
bkr.maxExpectedReplicationLag.Seconds(),
)
return row, err
}
@ -275,6 +280,7 @@ type Config struct {
// is higher than MaximumRevocations bad-key-revoker will error out and refuse to
// progress until this is addressed.
MaximumRevocations int `validate:"gte=0"`
// FindCertificatesBatchSize specifies the maximum number of serials to select from the
// keyHashToSerial table at once
FindCertificatesBatchSize int `validate:"required"`
@ -288,6 +294,13 @@ type Config struct {
// algorithm will wait before retrying in the event of error
// or no work to do.
BackoffIntervalMax config.Duration `validate:"-"`
// MaxExpectedReplicationLag specifies the minimum duration
// bad-key-revoker should wait before searching for certificates
// matching a blockedKeys row. This should be just slightly greater than
// the database's maximum replication lag, and always well under 24
// hours.
MaxExpectedReplicationLag config.Duration `validate:"-"`
}
Syslog cmd.SyslogConfig
@ -330,15 +343,16 @@ func main() {
rac := rapb.NewRegistrationAuthorityClient(conn)
bkr := &badKeyRevoker{
dbMap: dbMap,
maxRevocations: config.BadKeyRevoker.MaximumRevocations,
serialBatchSize: config.BadKeyRevoker.FindCertificatesBatchSize,
raClient: rac,
logger: logger,
clk: clk,
backoffIntervalMax: config.BadKeyRevoker.BackoffIntervalMax.Duration,
backoffIntervalBase: config.BadKeyRevoker.Interval.Duration,
backoffFactor: 1.3,
dbMap: dbMap,
maxRevocations: config.BadKeyRevoker.MaximumRevocations,
serialBatchSize: config.BadKeyRevoker.FindCertificatesBatchSize,
raClient: rac,
logger: logger,
clk: clk,
backoffIntervalMax: config.BadKeyRevoker.BackoffIntervalMax.Duration,
backoffIntervalBase: config.BadKeyRevoker.Interval.Duration,
backoffFactor: 1.3,
maxExpectedReplicationLag: config.BadKeyRevoker.MaxExpectedReplicationLag.Duration,
}
// If `BackoffIntervalMax` was not set via the config, set it to 60
@ -354,6 +368,14 @@ func main() {
bkr.backoffIntervalBase = time.Second
}
// If `MaxExpectedReplicationLag` was not set via the config, then set
// `bkr.maxExpectedReplicationLag` to a default 22 seconds. This is based on
// ProxySQL's max_replication_lag for bad-key-revoker (10s), times two, plus
// two seconds.
if bkr.maxExpectedReplicationLag == 0 {
bkr.maxExpectedReplicationLag = time.Second * 22
}
// Run bad-key-revoker in a loop. Backoff if no work or errors.
for {
noWork, err := bkr.invoke(context.Background())

View File

@ -45,6 +45,12 @@ func insertBlockedRow(t *testing.T, dbMap *db.WrappedMap, fc clock.Clock, hash [
test.AssertNotError(t, err, "failed to add test row")
}
func fcBeforeRepLag(clk clock.Clock, bkr *badKeyRevoker) clock.FakeClock {
fc := clock.NewFake()
fc.Set(clk.Now().Add(-bkr.maxExpectedReplicationLag - time.Second))
return fc
}
func TestSelectUncheckedRows(t *testing.T) {
ctx := context.Background()
@ -55,12 +61,15 @@ func TestSelectUncheckedRows(t *testing.T) {
fc := clock.NewFake()
bkr := &badKeyRevoker{
dbMap: dbMap,
logger: blog.NewMock(),
clk: fc,
dbMap: dbMap,
logger: blog.NewMock(),
clk: fc,
maxExpectedReplicationLag: time.Second * 22,
}
hashA, hashB, hashC := randHash(t), randHash(t), randHash(t)
// insert a blocked key that's marked as already checked
insertBlockedRow(t, dbMap, fc, hashA, 1, true)
count, err := bkr.countUncheckedKeys(ctx)
test.AssertNotError(t, err, "countUncheckedKeys failed")
@ -68,11 +77,14 @@ func TestSelectUncheckedRows(t *testing.T) {
_, err = bkr.selectUncheckedKey(ctx)
test.AssertError(t, err, "selectUncheckedKey didn't fail with no rows to process")
test.Assert(t, db.IsNoRows(err), "returned error is not sql.ErrNoRows")
insertBlockedRow(t, dbMap, fc, hashB, 1, false)
// insert a blocked key that's due to be checked
insertBlockedRow(t, dbMap, fcBeforeRepLag(fc, bkr), hashB, 1, false)
// insert a freshly blocked key, so it's not yet due to be checked
insertBlockedRow(t, dbMap, fc, hashC, 1, false)
count, err = bkr.countUncheckedKeys(ctx)
test.AssertNotError(t, err, "countUncheckedKeys failed")
test.AssertEquals(t, count, 2)
test.AssertEquals(t, count, 1)
row, err := bkr.selectUncheckedKey(ctx)
test.AssertNotError(t, err, "selectUncheckKey failed")
test.AssertByteEquals(t, row.KeyHash, hashB)
@ -191,7 +203,13 @@ func TestFindUnrevokedNoRows(t *testing.T) {
)
test.AssertNotError(t, err, "failed to insert test keyHashToSerial row")
bkr := &badKeyRevoker{dbMap: dbMap, serialBatchSize: 1, maxRevocations: 10, clk: fc}
bkr := &badKeyRevoker{
dbMap: dbMap,
serialBatchSize: 1,
maxRevocations: 10,
clk: fc,
maxExpectedReplicationLag: time.Second * 22,
}
_, err = bkr.findUnrevoked(ctx, uncheckedBlockedKey{KeyHash: hashA})
test.Assert(t, db.IsNoRows(err), "expected NoRows error")
}
@ -207,7 +225,13 @@ func TestFindUnrevoked(t *testing.T) {
regID := insertRegistration(t, dbMap, fc)
bkr := &badKeyRevoker{dbMap: dbMap, serialBatchSize: 1, maxRevocations: 10, clk: fc}
bkr := &badKeyRevoker{
dbMap: dbMap,
serialBatchSize: 1,
maxRevocations: 10,
clk: fc,
maxExpectedReplicationLag: time.Second * 22,
}
hashA := randHash(t)
// insert valid, unexpired
@ -251,7 +275,11 @@ func TestRevokeCerts(t *testing.T) {
fc := clock.NewFake()
mr := &mockRevoker{}
bkr := &badKeyRevoker{dbMap: dbMap, raClient: mr, clk: fc}
bkr := &badKeyRevoker{
dbMap: dbMap,
raClient: mr,
clk: fc,
}
err = bkr.revokeCerts([]unrevokedCertificate{
{ID: 0, Serial: "ff"},
@ -269,11 +297,20 @@ func TestCertificateAbsent(t *testing.T) {
defer test.ResetBoulderTestDatabase(t)()
fc := clock.NewFake()
bkr := &badKeyRevoker{
dbMap: dbMap,
maxRevocations: 1,
serialBatchSize: 1,
raClient: &mockRevoker{},
logger: blog.NewMock(),
clk: fc,
maxExpectedReplicationLag: time.Second * 22,
}
// populate DB with all the test data
regIDA := insertRegistration(t, dbMap, fc)
hashA := randHash(t)
insertBlockedRow(t, dbMap, fc, hashA, regIDA, false)
insertBlockedRow(t, dbMap, fcBeforeRepLag(fc, bkr), hashA, regIDA, false)
// Add an entry to keyHashToSerial but not to certificateStatus or certificate
// status, and expect an error.
@ -286,14 +323,6 @@ func TestCertificateAbsent(t *testing.T) {
)
test.AssertNotError(t, err, "failed to insert test keyHashToSerial row")
bkr := &badKeyRevoker{
dbMap: dbMap,
maxRevocations: 1,
serialBatchSize: 1,
raClient: &mockRevoker{},
logger: blog.NewMock(),
clk: fc,
}
_, err = bkr.invoke(ctx)
test.AssertError(t, err, "expected error when row in keyHashToSerial didn't have a matching cert")
}
@ -309,12 +338,13 @@ func TestInvoke(t *testing.T) {
mr := &mockRevoker{}
bkr := &badKeyRevoker{
dbMap: dbMap,
maxRevocations: 10,
serialBatchSize: 1,
raClient: mr,
logger: blog.NewMock(),
clk: fc,
dbMap: dbMap,
maxRevocations: 10,
serialBatchSize: 1,
raClient: mr,
logger: blog.NewMock(),
clk: fc,
maxExpectedReplicationLag: time.Second * 22,
}
// populate DB with all the test data
@ -323,7 +353,7 @@ func TestInvoke(t *testing.T) {
regIDC := insertRegistration(t, dbMap, fc)
regIDD := insertRegistration(t, dbMap, fc)
hashA := randHash(t)
insertBlockedRow(t, dbMap, fc, hashA, regIDC, false)
insertBlockedRow(t, dbMap, fcBeforeRepLag(fc, bkr), hashA, regIDC, false)
insertGoodCert(t, dbMap, fc, hashA, "ff", regIDA)
insertGoodCert(t, dbMap, fc, hashA, "ee", regIDB)
insertGoodCert(t, dbMap, fc, hashA, "dd", regIDC)
@ -344,7 +374,7 @@ func TestInvoke(t *testing.T) {
// add a row with no associated valid certificates
hashB := randHash(t)
insertBlockedRow(t, dbMap, fc, hashB, regIDC, false)
insertBlockedRow(t, dbMap, fcBeforeRepLag(fc, bkr), hashB, regIDC, false)
insertCert(t, dbMap, fc, hashB, "bb", regIDA, Expired, Revoked)
noWork, err = bkr.invoke(ctx)
@ -375,11 +405,12 @@ func TestInvokeRevokerHasNoExtantCerts(t *testing.T) {
mr := &mockRevoker{}
bkr := &badKeyRevoker{dbMap: dbMap,
maxRevocations: 10,
serialBatchSize: 1,
raClient: mr,
logger: blog.NewMock(),
clk: fc,
maxRevocations: 10,
serialBatchSize: 1,
raClient: mr,
logger: blog.NewMock(),
clk: fc,
maxExpectedReplicationLag: time.Second * 22,
}
// populate DB with all the test data
@ -389,7 +420,7 @@ func TestInvokeRevokerHasNoExtantCerts(t *testing.T) {
hashA := randHash(t)
insertBlockedRow(t, dbMap, fc, hashA, regIDA, false)
insertBlockedRow(t, dbMap, fcBeforeRepLag(fc, bkr), hashA, regIDA, false)
insertGoodCert(t, dbMap, fc, hashA, "ee", regIDB)
insertGoodCert(t, dbMap, fc, hashA, "dd", regIDB)

View File

@ -22,7 +22,8 @@
"maximumRevocations": 15,
"findCertificatesBatchSize": 10,
"interval": "50ms",
"backoffIntervalMax": "2s"
"backoffIntervalMax": "2s",
"maxExpectedReplicationLag": "100ms"
},
"syslog": {
"stdoutlevel": 4,

View File

@ -23,7 +23,8 @@
"maximumRevocations": 15,
"findCertificatesBatchSize": 10,
"interval": "50ms",
"backoffIntervalMax": "2s"
"backoffIntervalMax": "2s",
"maxExpectedReplicationLag": "100ms"
},
"syslog": {
"stdoutlevel": 4,