crl-updater: retry failed shards (#6907)

Add per-shard exponential backoff and retry to crl-updater. Each
individual CRL shard will be attempted up to MaxAttempts (default 1)
times, with exponential backoff between attempts starting at 1 second
and capped at 1 minute.
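
Roughly, the sleep before each attempt follows a jittered, capped
exponential schedule. The sketch below is only an illustration of that
schedule (backoffSketch is a made-up name for this note; the real logic
is core.RetryBackoff, which may differ in detail):

package main

import (
    "fmt"
    "math"
    "math/rand"
    "time"
)

// backoffSketch: attempt 0 sleeps not at all; later attempts grow
// exponentially from base, are capped at max, and get roughly +/-20%
// jitter so retries don't fall into lockstep. (Illustration only.)
func backoffSketch(attempt int, base, max time.Duration, factor float64) time.Duration {
    if attempt == 0 {
        return 0
    }
    d := float64(base) * math.Pow(factor, float64(attempt-1))
    if d > float64(max) {
        d = float64(max)
    }
    jitter := 0.8 + 0.4*rand.Float64() // uniform in [0.8, 1.2]
    return time.Duration(d * jitter)
}

func main() {
    // crl-updater's parameters: base 1s, max 1min, factor 2.
    for i := 0; i < 5; i++ {
        fmt.Printf("attempt %d: sleep ~%v\n", i, backoffSketch(i, time.Second, time.Minute, 2))
    }
}

With those parameters a persistently failing shard sleeps roughly 1s,
2s, 4s, 8s, ... between successive attempts, capped at one minute.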

This can effectively reduce the parallelism of crl-updater: while a
goroutine is sleeping between attempts of a failing shard, it is not
doing work on another shard. This is a desirable feature, since it means
that crl-updater gently reduces the total load it places on the network
and database when shards start to fail.

Setting this new config parameter is tracked in IN-9140.
Fixes https://github.com/letsencrypt/boulder/issues/6895
Aaron Gable 2023-05-22 12:59:09 -07:00 committed by GitHub
parent 3990a08328
commit fe523f142d
5 changed files with 100 additions and 6 deletions


@@ -87,6 +87,12 @@ type Config struct {
// by this updater.
MaxParallelism int `validate:"min=0"`
// MaxAttempts controls how many times the updater will attempt to generate
// a single CRL shard. A higher number increases the likelihood of a fully
// successful run, but also increases the worst-case runtime and db/network
// load of said run. The default is 1.
MaxAttempts int `validate:"omitempty,min=1"`
Features map[string]bool
}
@@ -158,6 +164,7 @@ func main() {
c.CRLUpdater.UpdatePeriod.Duration,
c.CRLUpdater.UpdateOffset.Duration,
c.CRLUpdater.MaxParallelism,
c.CRLUpdater.MaxAttempts,
sac,
cac,
csc,


@@ -189,12 +189,17 @@ func TestRetryBackoff(t *testing.T) {
base := time.Minute
max := 10 * time.Minute
backoff := RetryBackoff(0, base, max, factor)
assertBetween(float64(backoff), 0, 0)
expected := base
backoff := RetryBackoff(1, base, max, factor)
backoff = RetryBackoff(1, base, max, factor)
assertBetween(float64(backoff), float64(expected)*0.8, float64(expected)*1.2)
expected = time.Second * 90
backoff = RetryBackoff(2, base, max, factor)
assertBetween(float64(backoff), float64(expected)*0.8, float64(expected)*1.2)
expected = time.Minute * 10
// should be truncated
backoff = RetryBackoff(7, base, max, factor)
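
(For reference: these expected values follow a schedule of roughly
base × factor^(n−1) for attempt n. With base = 1 minute and the factor
of 1.5 implied by the 90s value, attempt 2 is 1.5 × 60s = 90s, and
attempt 7 would be 1.5^6 × 60s ≈ 11.4 minutes, which exceeds the
10 minute max and is therefore truncated. The ×0.8/×1.2 bounds in
assertBetween allow for RetryBackoff's built-in jitter.)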


@@ -17,6 +17,7 @@ import (
"google.golang.org/protobuf/types/known/emptypb"
capb "github.com/letsencrypt/boulder/ca/proto"
"github.com/letsencrypt/boulder/core"
"github.com/letsencrypt/boulder/core/proto"
"github.com/letsencrypt/boulder/crl"
cspb "github.com/letsencrypt/boulder/crl/storer/proto"
@@ -33,6 +34,7 @@ type crlUpdater struct {
updatePeriod time.Duration
updateOffset time.Duration
maxParallelism int
maxAttempts int
sa sapb.StorageAuthorityReadOnlyClient
ca capb.CRLGeneratorClient
@@ -53,6 +55,7 @@ func NewUpdater(
updatePeriod time.Duration,
updateOffset time.Duration,
maxParallelism int,
maxAttempts int,
sa sapb.StorageAuthorityReadOnlyClient,
ca capb.CRLGeneratorClient,
cs cspb.CRLStorerClient,
@@ -85,6 +88,10 @@ func NewUpdater(
maxParallelism = 1
}
if maxAttempts <= 0 {
maxAttempts = 1
}
tickHistogram := prometheus.NewHistogramVec(prometheus.HistogramOpts{
Name: "crl_updater_ticks",
Help: "A histogram of crl-updater tick latencies labeled by issuer and result",
@@ -106,6 +113,7 @@ func NewUpdater(
updatePeriod,
updateOffset,
maxParallelism,
maxAttempts,
sa,
ca,
cs,
@@ -244,7 +252,7 @@ func (cu *crlUpdater) tickIssuer(ctx context.Context, atTime time.Time, issuerNa
default:
out <- shardResult{
shardIdx: idx,
err: cu.tickShard(ctx, atTime, issuerNameID, idx, shardMap[idx]),
err: cu.tickShardWithRetry(ctx, atTime, issuerNameID, idx, shardMap[idx]),
}
}
}
@@ -279,6 +287,30 @@ func (cu *crlUpdater) tickIssuer(ctx context.Context, atTime time.Time, issuerNa
return nil
}
// tickShardWithRetry calls tickShard repeatedly (with exponential backoff
// between attempts) until it succeeds or the max number of attempts is reached.
func (cu *crlUpdater) tickShardWithRetry(ctx context.Context, atTime time.Time, issuerNameID issuance.IssuerNameID, shardIdx int, chunks []chunk) error {
crlID := crl.Id(issuerNameID, crl.Number(atTime), shardIdx)
var err error
for i := 0; i < cu.maxAttempts; i++ {
// core.RetryBackoff always returns 0 when its first argument is zero.
sleepTime := core.RetryBackoff(i, time.Second, time.Minute, 2)
if i != 0 {
cu.log.Errf(
"Generating CRL failed, will retry in %vs: id=[%s] err=[%s]",
sleepTime.Seconds(), crlID, err)
}
cu.clk.Sleep(sleepTime)
err = cu.tickShard(ctx, atTime, issuerNameID, shardIdx, chunks)
if err == nil {
break
}
}
return err
}
// tickShard processes a single shard. It computes the shard's boundaries, gets
// the list of revoked certs in that shard from the SA, gets the CA to sign the
// resulting CRL, and gets the crl-storer to upload it. It returns an error if


@@ -146,7 +146,7 @@ func TestTickShard(t *testing.T) {
cu, err := NewUpdater(
[]*issuance.Certificate{e1, r3},
2, 18*time.Hour, 24*time.Hour,
6*time.Hour, 1*time.Minute, 1,
6*time.Hour, 1*time.Minute, 1, 1,
&fakeSAC{grcc: fakeGRCC{}, maxNotAfter: clk.Now().Add(90 * 24 * time.Hour)},
&fakeCGC{gcc: fakeGCC{}},
&fakeCSC{ucc: fakeUCC{}},
@@ -222,6 +222,55 @@ func TestTickShard(t *testing.T) {
cu.updatedCounter.Reset()
}
func TestTickShardWithRetry(t *testing.T) {
e1, err := issuance.LoadCertificate("../../test/hierarchy/int-e1.cert.pem")
test.AssertNotError(t, err, "loading test issuer")
r3, err := issuance.LoadCertificate("../../test/hierarchy/int-r3.cert.pem")
test.AssertNotError(t, err, "loading test issuer")
sentinelErr := errors.New("oops")
clk := clock.NewFake()
clk.Set(time.Date(2020, time.January, 1, 0, 0, 0, 0, time.UTC))
// Build an updater that will always fail when it talks to the SA.
cu, err := NewUpdater(
[]*issuance.Certificate{e1, r3},
2, 18*time.Hour, 24*time.Hour,
6*time.Hour, 1*time.Minute, 1, 1,
&fakeSAC{grcc: fakeGRCC{err: sentinelErr}, maxNotAfter: clk.Now().Add(90 * 24 * time.Hour)},
&fakeCGC{gcc: fakeGCC{}},
&fakeCSC{ucc: fakeUCC{}},
metrics.NoopRegisterer, blog.NewMock(), clk,
)
test.AssertNotError(t, err, "building test crlUpdater")
testChunks := []chunk{
{clk.Now(), clk.Now().Add(18 * time.Hour), 0},
}
// Ensure that having MaxAttempts set to 1 results in the clock not moving
// forward at all.
startTime := cu.clk.Now()
err = cu.tickShardWithRetry(context.Background(), cu.clk.Now(), e1.NameID(), 0, testChunks)
test.AssertError(t, err, "database error")
test.AssertErrorIs(t, err, sentinelErr)
test.AssertEquals(t, cu.clk.Now(), startTime)
// Ensure that having MaxAttempts set to 5 results in the clock moving forward
// by 1+2+4+8=15 seconds. The core.RetryBackoff system has 20% jitter built
// in, so we have to be approximate.
cu.maxAttempts = 5
startTime = cu.clk.Now()
err = cu.tickShardWithRetry(context.Background(), cu.clk.Now(), e1.NameID(), 0, testChunks)
test.AssertError(t, err, "database error")
test.AssertErrorIs(t, err, sentinelErr)
t.Logf("start: %v", startTime)
t.Logf("now: %v", cu.clk.Now())
test.Assert(t, startTime.Add(15*0.8*time.Second).Before(cu.clk.Now()), "retries didn't sleep enough")
test.Assert(t, startTime.Add(15*1.2*time.Second).After(cu.clk.Now()), "retries slept too much")
}
func TestTickIssuer(t *testing.T) {
e1, err := issuance.LoadCertificate("../../test/hierarchy/int-e1.cert.pem")
test.AssertNotError(t, err, "loading test issuer")
@@ -234,7 +283,7 @@ func TestTickIssuer(t *testing.T) {
cu, err := NewUpdater(
[]*issuance.Certificate{e1, r3},
2, 18*time.Hour, 24*time.Hour,
6*time.Hour, 1*time.Minute, 1,
6*time.Hour, 1*time.Minute, 1, 1,
&fakeSAC{grcc: fakeGRCC{err: errors.New("db no worky")}, maxNotAfter: clk.Now().Add(90 * 24 * time.Hour)},
&fakeCGC{gcc: fakeGCC{}},
&fakeCSC{ucc: fakeUCC{}},
@@ -270,7 +319,7 @@ func TestTick(t *testing.T) {
cu, err := NewUpdater(
[]*issuance.Certificate{e1, r3},
2, 18*time.Hour, 24*time.Hour,
6*time.Hour, 1*time.Minute, 1,
6*time.Hour, 1*time.Minute, 1, 1,
&fakeSAC{grcc: fakeGRCC{err: errors.New("db no worky")}, maxNotAfter: clk.Now().Add(90 * 24 * time.Hour)},
&fakeCGC{gcc: fakeGCC{}},
&fakeCSC{ucc: fakeUCC{}},


@@ -46,7 +46,8 @@
"lookbackPeriod": "24h",
"updatePeriod": "6h",
"updateOffset": "9120s",
"maxParallelism": 10
"maxParallelism": 10,
"maxAttempts": 5
},
"syslog": {
"stdoutlevel": 6,