SA: Improve concurrency robustness of CRL leasing transactions (#8030)
In a few places within the SA, we use explicit transactions to wrap read-then-update style operations. Because we set the transaction isolation level on a per-session basis, these transactions do not in fact change their isolation level, and therefore generally remain at the default isolation level of REPEATABLE READ. Unfortunately, we cannot resolve this simply by converting the SELECT statements into SELECT...FOR UPDATE statements: although this would fix the issue by making those queries into locking statements, it also triggers what appears to be an InnoDB bug when many transactions all attempt to select-then-insert into a table with both a primary key and a separate unique key, as the crlShards table has. This causes the integration tests in GitHub Actions, which run with an empty database and therefore use the needToInsert codepath instead of the update codepath, to consistently flake. Instead, resolve the issue by having the UPDATE statements specify that the value of the leasedUntil column is still the same as was read by the initial SELECT. Although two crl-updaters may still attempt these transactions concurrently, the UPDATE statements will still be fully sequenced, and the latter one will fail. Part of https://github.com/letsencrypt/boulder/issues/8031
This commit is contained in:
parent
e6c812a3db
commit
28b49a82d4
49
sa/sa.go
49
sa/sa.go
|
@ -1091,7 +1091,6 @@ func (ssa *SQLStorageAuthority) leaseOldestCRLShard(ctx context.Context, req *sa
|
|||
|
||||
// Determine which shard index we want to lease.
|
||||
var shardIdx int
|
||||
var needToInsert bool
|
||||
if len(shards) < (int(req.MaxShardIdx + 1 - req.MinShardIdx)) {
|
||||
// Some expected shards are missing (i.e. never-before-produced), so we
|
||||
// pick one at random.
|
||||
|
@ -1107,7 +1106,17 @@ func (ssa *SQLStorageAuthority) leaseOldestCRLShard(ctx context.Context, req *sa
|
|||
shardIdx = idx
|
||||
break
|
||||
}
|
||||
needToInsert = true
|
||||
|
||||
_, err = tx.ExecContext(ctx,
|
||||
`INSERT INTO crlShards (issuerID, idx, leasedUntil)
|
||||
VALUES (?, ?, ?)`,
|
||||
req.IssuerNameID,
|
||||
shardIdx,
|
||||
req.Until.AsTime(),
|
||||
)
|
||||
if err != nil {
|
||||
return -1, fmt.Errorf("inserting selected shard: %w", err)
|
||||
}
|
||||
} else {
|
||||
// We got all the shards we expect, so we pick the oldest unleased shard.
|
||||
var oldest *crlShardModel
|
||||
|
@ -1125,34 +1134,29 @@ func (ssa *SQLStorageAuthority) leaseOldestCRLShard(ctx context.Context, req *sa
|
|||
return -1, fmt.Errorf("issuer %d has no unleased shards in range %d-%d", req.IssuerNameID, req.MinShardIdx, req.MaxShardIdx)
|
||||
}
|
||||
shardIdx = oldest.Idx
|
||||
needToInsert = false
|
||||
}
|
||||
|
||||
if needToInsert {
|
||||
_, err = tx.ExecContext(ctx,
|
||||
`INSERT INTO crlShards (issuerID, idx, leasedUntil)
|
||||
VALUES (?, ?, ?)`,
|
||||
req.IssuerNameID,
|
||||
shardIdx,
|
||||
req.Until.AsTime(),
|
||||
)
|
||||
if err != nil {
|
||||
return -1, fmt.Errorf("inserting selected shard: %w", err)
|
||||
}
|
||||
} else {
|
||||
_, err = tx.ExecContext(ctx,
|
||||
res, err := tx.ExecContext(ctx,
|
||||
`UPDATE crlShards
|
||||
SET leasedUntil = ?
|
||||
WHERE issuerID = ?
|
||||
AND idx = ?
|
||||
AND leasedUntil = ?
|
||||
LIMIT 1`,
|
||||
req.Until.AsTime(),
|
||||
req.IssuerNameID,
|
||||
shardIdx,
|
||||
oldest.LeasedUntil,
|
||||
)
|
||||
if err != nil {
|
||||
return -1, fmt.Errorf("updating selected shard: %w", err)
|
||||
}
|
||||
rowsAffected, err := res.RowsAffected()
|
||||
if err != nil {
|
||||
return -1, fmt.Errorf("confirming update of selected shard: %w", err)
|
||||
}
|
||||
if rowsAffected != 1 {
|
||||
return -1, errors.New("failed to lease shard")
|
||||
}
|
||||
}
|
||||
|
||||
return shardIdx, err
|
||||
|
@ -1208,19 +1212,28 @@ func (ssa *SQLStorageAuthority) leaseSpecificCRLShard(ctx context.Context, req *
|
|||
return nil, fmt.Errorf("inserting selected shard: %w", err)
|
||||
}
|
||||
} else {
|
||||
_, err = tx.ExecContext(ctx,
|
||||
res, err := tx.ExecContext(ctx,
|
||||
`UPDATE crlShards
|
||||
SET leasedUntil = ?
|
||||
WHERE issuerID = ?
|
||||
AND idx = ?
|
||||
AND leasedUntil = ?
|
||||
LIMIT 1`,
|
||||
req.Until.AsTime(),
|
||||
req.IssuerNameID,
|
||||
req.MinShardIdx,
|
||||
shardModel.LeasedUntil,
|
||||
)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("updating selected shard: %w", err)
|
||||
}
|
||||
rowsAffected, err := res.RowsAffected()
|
||||
if err != nil {
|
||||
return -1, fmt.Errorf("confirming update of selected shard: %w", err)
|
||||
}
|
||||
if rowsAffected != 1 {
|
||||
return -1, errors.New("failed to lease shard")
|
||||
}
|
||||
}
|
||||
|
||||
return nil, nil
|
||||
|
|
|
@ -102,7 +102,7 @@ func TestCRLPipeline(t *testing.T) {
|
|||
resp.Body.Close()
|
||||
}
|
||||
|
||||
func TestTemporalAndExplicitShardingCoexist(t *testing.T) {
|
||||
func TestCRLTemporalAndExplicitShardingCoexist(t *testing.T) {
|
||||
db, err := sql.Open("mysql", vars.DBConnSAIntegrationFullPerms)
|
||||
if err != nil {
|
||||
t.Fatalf("sql.Open: %s", err)
|
||||
|
|
Loading…
Reference in New Issue