crl-updater: retry failed shards (#6907)

Add per-shard exponential backoff and retry to crl-updater. Each
individual CRL shard will be attempted up to MaxAttempts (default 1)
times, with exponential backoff between attempts starting at 1 second
and capped at 1 minute.
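
Roughly, the sleep before each attempt follows a jittered, capped
exponential schedule. The sketch below is only an illustration of that
schedule (backoffSketch is a made-up name for this note; the real logic
is core.RetryBackoff, which may differ in detail):

package main

import (
    "fmt"
    "math"
    "math/rand"
    "time"
)

// backoffSketch: attempt 0 sleeps not at all; later attempts grow
// exponentially from base, are capped at max, and get roughly +/-20%
// jitter so retries don't fall into lockstep. (Illustration only.)
func backoffSketch(attempt int, base, max time.Duration, factor float64) time.Duration {
    if attempt == 0 {
        return 0
    }
    d := float64(base) * math.Pow(factor, float64(attempt-1))
    if d > float64(max) {
        d = float64(max)
    }
    jitter := 0.8 + 0.4*rand.Float64() // uniform in [0.8, 1.2]
    return time.Duration(d * jitter)
}

func main() {
    // crl-updater's parameters: base 1s, max 1min, factor 2.
    for i := 0; i < 5; i++ {
        fmt.Printf("attempt %d: sleep ~%v\n", i, backoffSketch(i, time.Second, time.Minute, 2))
    }
}

With those parameters a persistently failing shard sleeps roughly 1s,
2s, 4s, 8s, ... between successive attempts, capped at one minute.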

This can effectively reduce the parallelism of crl-updater: while a
goroutine is sleeping between attempts of a failing shard, it is not
doing work on another shard. This is a desirable feature, since it means
that crl-updater gently reduces the total load it places on the network
and database when shards start to fail.

Setting this new config parameter is tracked in IN-9140.
Fixes https://github.com/letsencrypt/boulder/issues/6895
Aaron Gable 2023-05-22 12:59:09 -07:00 committed by GitHub
parent 3990a08328
commit fe523f142d
5 changed files with 100 additions and 6 deletions


@@ -87,6 +87,12 @@ type Config struct {
// by this updater.
MaxParallelism int `validate:"min=0"`
// MaxAttempts controls how many times the updater will attempt to generate
// a single CRL shard. A higher number increases the likelihood of a fully
// successful run, but also increases the worst-case runtime and db/network
// load of said run. The default is 1.
MaxAttempts int `validate:"omitempty,min=1"`
Features map[string]bool
}
@@ -158,6 +164,7 @@ func main() {
c.CRLUpdater.UpdatePeriod.Duration,
c.CRLUpdater.UpdateOffset.Duration,
c.CRLUpdater.MaxParallelism,
c.CRLUpdater.MaxAttempts,
sac,
cac,
csc,


@@ -189,12 +189,17 @@ func TestRetryBackoff(t *testing.T) {
base := time.Minute
max := 10 * time.Minute
backoff := RetryBackoff(0, base, max, factor)
assertBetween(float64(backoff), 0, 0)
expected := base
backoff := RetryBackoff(1, base, max, factor)
backoff = RetryBackoff(1, base, max, factor)
assertBetween(float64(backoff), float64(expected)*0.8, float64(expected)*1.2)
expected = time.Second * 90
backoff = RetryBackoff(2, base, max, factor)
assertBetween(float64(backoff), float64(expected)*0.8, float64(expected)*1.2)
expected = time.Minute * 10
// should be truncated
backoff = RetryBackoff(7, base, max, factor)
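
(For reference: these expected values follow a schedule of roughly
base × factor^(n−1) for attempt n. With base = 1 minute and the factor
of 1.5 implied by the 90s value, attempt 2 is 1.5 × 60s = 90s, and
attempt 7 would be 1.5^6 × 60s ≈ 11.4 minutes, which exceeds the
10 minute max and is therefore truncated. The ×0.8/×1.2 bounds in
assertBetween allow for RetryBackoff's built-in jitter.)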


@@ -17,6 +17,7 @@ import (
"google.golang.org/protobuf/types/known/emptypb"
capb "github.com/letsencrypt/boulder/ca/proto"
"github.com/letsencrypt/boulder/core"
"github.com/letsencrypt/boulder/core/proto"
"github.com/letsencrypt/boulder/crl"
cspb "github.com/letsencrypt/boulder/crl/storer/proto"
@@ -33,6 +34,7 @@ type crlUpdater struct {
updatePeriod time.Duration
updateOffset time.Duration
maxParallelism int
maxAttempts int
sa sapb.StorageAuthorityReadOnlyClient
ca capb.CRLGeneratorClient
@@ -53,6 +55,7 @@ func NewUpdater(
updatePeriod time.Duration,
updateOffset time.Duration,
maxParallelism int,
maxAttempts int,
sa sapb.StorageAuthorityReadOnlyClient,
ca capb.CRLGeneratorClient,
cs cspb.CRLStorerClient,
@@ -85,6 +88,10 @@ func NewUpdater(
maxParallelism = 1
}
if maxAttempts <= 0 {
maxAttempts = 1
}
tickHistogram := prometheus.NewHistogramVec(prometheus.HistogramOpts{
Name: "crl_updater_ticks",
Help: "A histogram of crl-updater tick latencies labeled by issuer and result",
@@ -106,6 +113,7 @@ func NewUpdater(
updatePeriod,
updateOffset,
maxParallelism,
maxAttempts,
sa,
ca,
cs,
@@ -244,7 +252,7 @@ func (cu *crlUpdater) tickIssuer(ctx context.Context, atTime time.Time, issuerNa
default:
out <- shardResult{
shardIdx: idx,
err: cu.tickShard(ctx, atTime, issuerNameID, idx, shardMap[idx]),
err: cu.tickShardWithRetry(ctx, atTime, issuerNameID, idx, shardMap[idx]),
}
}
}
@@ -279,6 +287,30 @@ func (cu *crlUpdater) tickIssuer(ctx context.Context, atTime time.Time, issuerNa
return nil
}
// tickShardWithRetry calls tickShard repeatedly (with exponential backoff
// between attempts) until it succeeds or the max number of attempts is reached.
func (cu *crlUpdater) tickShardWithRetry(ctx context.Context, atTime time.Time, issuerNameID issuance.IssuerNameID, shardIdx int, chunks []chunk) error {
crlID := crl.Id(issuerNameID, crl.Number(atTime), shardIdx)
var err error
for i := 0; i < cu.maxAttempts; i++ {
// core.RetryBackoff always returns 0 when its first argument is zero.
sleepTime := core.RetryBackoff(i, time.Second, time.Minute, 2)
if i != 0 {
cu.log.Errf(
"Generating CRL failed, will retry in %vs: id=[%s] err=[%s]",
sleepTime.Seconds(), crlID, err)
}
cu.clk.Sleep(sleepTime)
err = cu.tickShard(ctx, atTime, issuerNameID, shardIdx, chunks)
if err == nil {
break
}
}
return err
}
// tickShard processes a single shard. It computes the shard's boundaries, gets
// the list of revoked certs in that shard from the SA, gets the CA to sign the
// resulting CRL, and gets the crl-storer to upload it. It returns an error if


@@ -146,7 +146,7 @@ func TestTickShard(t *testing.T) {
cu, err := NewUpdater(
[]*issuance.Certificate{e1, r3},
2, 18*time.Hour, 24*time.Hour,
6*time.Hour, 1*time.Minute, 1,
6*time.Hour, 1*time.Minute, 1, 1,
&fakeSAC{grcc: fakeGRCC{}, maxNotAfter: clk.Now().Add(90 * 24 * time.Hour)},
&fakeCGC{gcc: fakeGCC{}},
&fakeCSC{ucc: fakeUCC{}},
@@ -222,6 +222,55 @@ func TestTickShard(t *testing.T) {
cu.updatedCounter.Reset()
}
func TestTickShardWithRetry(t *testing.T) {
e1, err := issuance.LoadCertificate("../../test/hierarchy/int-e1.cert.pem")
test.AssertNotError(t, err, "loading test issuer")
r3, err := issuance.LoadCertificate("../../test/hierarchy/int-r3.cert.pem")
test.AssertNotError(t, err, "loading test issuer")
sentinelErr := errors.New("oops")
clk := clock.NewFake()
clk.Set(time.Date(2020, time.January, 1, 0, 0, 0, 0, time.UTC))
// Build an updater that will always fail when it talks to the SA.
cu, err := NewUpdater(
[]*issuance.Certificate{e1, r3},
2, 18*time.Hour, 24*time.Hour,
6*time.Hour, 1*time.Minute, 1, 1,
&fakeSAC{grcc: fakeGRCC{err: sentinelErr}, maxNotAfter: clk.Now().Add(90 * 24 * time.Hour)},
&fakeCGC{gcc: fakeGCC{}},
&fakeCSC{ucc: fakeUCC{}},
metrics.NoopRegisterer, blog.NewMock(), clk,
)
test.AssertNotError(t, err, "building test crlUpdater")
testChunks := []chunk{
{clk.Now(), clk.Now().Add(18 * time.Hour), 0},
}
// Ensure that having MaxAttempts set to 1 results in the clock not moving
// forward at all.
startTime := cu.clk.Now()
err = cu.tickShardWithRetry(context.Background(), cu.clk.Now(), e1.NameID(), 0, testChunks)
test.AssertError(t, err, "database error")
test.AssertErrorIs(t, err, sentinelErr)
test.AssertEquals(t, cu.clk.Now(), startTime)
// Ensure that having MaxAttempts set to 5 results in the clock moving forward
// by 1+2+4+8=15 seconds. The core.RetryBackoff system has 20% jitter built
// in, so we have to be approximate.
cu.maxAttempts = 5
startTime = cu.clk.Now()
err = cu.tickShardWithRetry(context.Background(), cu.clk.Now(), e1.NameID(), 0, testChunks)
test.AssertError(t, err, "database error")
test.AssertErrorIs(t, err, sentinelErr)
t.Logf("start: %v", startTime)
t.Logf("now: %v", cu.clk.Now())
test.Assert(t, startTime.Add(15*0.8*time.Second).Before(cu.clk.Now()), "retries didn't sleep enough")
test.Assert(t, startTime.Add(15*1.2*time.Second).After(cu.clk.Now()), "retries slept too much")
}
func TestTickIssuer(t *testing.T) {
e1, err := issuance.LoadCertificate("../../test/hierarchy/int-e1.cert.pem")
test.AssertNotError(t, err, "loading test issuer")
@@ -234,7 +283,7 @@ func TestTickIssuer(t *testing.T) {
cu, err := NewUpdater(
[]*issuance.Certificate{e1, r3},
2, 18*time.Hour, 24*time.Hour,
6*time.Hour, 1*time.Minute, 1,
6*time.Hour, 1*time.Minute, 1, 1,
&fakeSAC{grcc: fakeGRCC{err: errors.New("db no worky")}, maxNotAfter: clk.Now().Add(90 * 24 * time.Hour)},
&fakeCGC{gcc: fakeGCC{}},
&fakeCSC{ucc: fakeUCC{}},
@@ -270,7 +319,7 @@ func TestTick(t *testing.T) {
cu, err := NewUpdater(
[]*issuance.Certificate{e1, r3},
2, 18*time.Hour, 24*time.Hour,
6*time.Hour, 1*time.Minute, 1,
6*time.Hour, 1*time.Minute, 1, 1,
&fakeSAC{grcc: fakeGRCC{err: errors.New("db no worky")}, maxNotAfter: clk.Now().Add(90 * 24 * time.Hour)},
&fakeCGC{gcc: fakeGCC{}},
&fakeCSC{ucc: fakeUCC{}},


@@ -46,7 +46,8 @@
"lookbackPeriod": "24h",
"updatePeriod": "6h",
"updateOffset": "9120s",
"maxParallelism": 10
"maxParallelism": 10,
"maxAttempts": 5
},
"syslog": {
"stdoutlevel": 6,