SA: Improve concurrency robustness of CRL leasing transactions (#8030)

In a few places within the SA, we use explicit transactions to wrap
read-then-update style operations. Because we set the transaction
isolation level on a per-session basis, these transactions do not in
fact change their isolation level, and therefore generally remain at the
default isolation level of REPEATABLE READ.

Unfortunately, we cannot resolve this simply by converting the SELECT
statements into SELECT...FOR UPDATE statements: although this would fix
the issue by making those queries into locking statements, it also
triggers what appears to be an InnoDB bug when many transactions all
attempt to select-then-insert into a table with both a primary key and a
separate unique key, as the crlShards table has. This causes the
integration tests in GitHub Actions, which run with an empty database
and therefore use the needToInsert codepath instead of the update
codepath, to consistently flake.

Instead, resolve the issue by having the UPDATE statements specify that
the value of the leasedUntil column is still the same as was read by the
initial SELECT. Although two crl-updaters may still attempt these
transactions concurrently, the UPDATE statements will still be fully
sequenced, and the latter one will fail.

Part of https://github.com/letsencrypt/boulder/issues/8031
This commit is contained in:
Aaron Gable 2025-03-03 17:29:57 -06:00 committed by GitHub
parent e6c812a3db
commit 28b49a82d4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 32 additions and 19 deletions

View File

@ -1091,7 +1091,6 @@ func (ssa *SQLStorageAuthority) leaseOldestCRLShard(ctx context.Context, req *sa
// Determine which shard index we want to lease.
var shardIdx int
var needToInsert bool
if len(shards) < (int(req.MaxShardIdx + 1 - req.MinShardIdx)) {
// Some expected shards are missing (i.e. never-before-produced), so we
// pick one at random.
@ -1107,7 +1106,17 @@ func (ssa *SQLStorageAuthority) leaseOldestCRLShard(ctx context.Context, req *sa
shardIdx = idx
break
}
needToInsert = true
_, err = tx.ExecContext(ctx,
`INSERT INTO crlShards (issuerID, idx, leasedUntil)
VALUES (?, ?, ?)`,
req.IssuerNameID,
shardIdx,
req.Until.AsTime(),
)
if err != nil {
return -1, fmt.Errorf("inserting selected shard: %w", err)
}
} else {
// We got all the shards we expect, so we pick the oldest unleased shard.
var oldest *crlShardModel
@ -1125,34 +1134,29 @@ func (ssa *SQLStorageAuthority) leaseOldestCRLShard(ctx context.Context, req *sa
return -1, fmt.Errorf("issuer %d has no unleased shards in range %d-%d", req.IssuerNameID, req.MinShardIdx, req.MaxShardIdx)
}
shardIdx = oldest.Idx
needToInsert = false
}
if needToInsert {
_, err = tx.ExecContext(ctx,
`INSERT INTO crlShards (issuerID, idx, leasedUntil)
VALUES (?, ?, ?)`,
req.IssuerNameID,
shardIdx,
req.Until.AsTime(),
)
if err != nil {
return -1, fmt.Errorf("inserting selected shard: %w", err)
}
} else {
_, err = tx.ExecContext(ctx,
res, err := tx.ExecContext(ctx,
`UPDATE crlShards
SET leasedUntil = ?
WHERE issuerID = ?
AND idx = ?
AND leasedUntil = ?
LIMIT 1`,
req.Until.AsTime(),
req.IssuerNameID,
shardIdx,
oldest.LeasedUntil,
)
if err != nil {
return -1, fmt.Errorf("updating selected shard: %w", err)
}
rowsAffected, err := res.RowsAffected()
if err != nil {
return -1, fmt.Errorf("confirming update of selected shard: %w", err)
}
if rowsAffected != 1 {
return -1, errors.New("failed to lease shard")
}
}
return shardIdx, err
@ -1208,19 +1212,28 @@ func (ssa *SQLStorageAuthority) leaseSpecificCRLShard(ctx context.Context, req *
return nil, fmt.Errorf("inserting selected shard: %w", err)
}
} else {
_, err = tx.ExecContext(ctx,
res, err := tx.ExecContext(ctx,
`UPDATE crlShards
SET leasedUntil = ?
WHERE issuerID = ?
AND idx = ?
AND leasedUntil = ?
LIMIT 1`,
req.Until.AsTime(),
req.IssuerNameID,
req.MinShardIdx,
shardModel.LeasedUntil,
)
if err != nil {
return nil, fmt.Errorf("updating selected shard: %w", err)
}
rowsAffected, err := res.RowsAffected()
if err != nil {
return -1, fmt.Errorf("confirming update of selected shard: %w", err)
}
if rowsAffected != 1 {
return -1, errors.New("failed to lease shard")
}
}
return nil, nil

View File

@ -102,7 +102,7 @@ func TestCRLPipeline(t *testing.T) {
resp.Body.Close()
}
func TestTemporalAndExplicitShardingCoexist(t *testing.T) {
func TestCRLTemporalAndExplicitShardingCoexist(t *testing.T) {
db, err := sql.Open("mysql", vars.DBConnSAIntegrationFullPerms)
if err != nil {
t.Fatalf("sql.Open: %s", err)