crl-updater: retry failed shards (#6907)
Add per-shard exponential backoff and retry to crl-updater. Each individual CRL shard will be retried up to MaxAttempts (default 1) times, with exponential backoff starting at 1 second and maxing out at 1 minute between attempts.

This can effectively reduce the parallelism of crl-updater: while a goroutine is sleeping between attempts on a failing shard, it is not doing work on another shard. This is a desirable feature, since it means that crl-updater gently reduces the total load it places on the network and database when shards start to fail.

Setting this new config parameter is tracked in IN-9140.

Fixes https://github.com/letsencrypt/boulder/issues/6895
parent 3990a08328
commit fe523f142d
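For readers unfamiliar with core.RetryBackoff, here is a rough, non-authoritative sketch of the sleep schedule the commit message describes, inferred from the tests in this commit (attempt 0 sleeps 0; attempt i sleeps about base*factor^(i-1) with +/-20% jitter, capped at max). The real implementation lives in boulder's core package and may differ in detail; retryBackoff below is an illustrative stand-in, not the actual function.

package main

import (
	"fmt"
	"math"
	"math/rand"
	"time"
)

// retryBackoff mimics the behavior this commit relies on: zero delay for the
// first attempt, then an exponentially growing, jittered delay capped at max.
func retryBackoff(attempt int, base, max time.Duration, factor float64) time.Duration {
	if attempt == 0 {
		return 0
	}
	// base * factor^(attempt-1), e.g. 1s, 2s, 4s, ... for base=1s, factor=2.
	backoff := float64(base) * math.Pow(factor, float64(attempt-1))
	// +/-20% jitter so failing shards don't retry in lockstep.
	backoff *= 0.8 + 0.4*rand.Float64()
	if backoff > float64(max) {
		backoff = float64(max)
	}
	return time.Duration(backoff)
}

func main() {
	// The parameters crl-updater uses per the commit message: 1s base, 1m cap.
	for i := 0; i < 5; i++ {
		fmt.Printf("attempt %d: sleep ~%v\n", i, retryBackoff(i, time.Second, time.Minute, 2))
	}
}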
@@ -87,6 +87,12 @@ type Config struct {
     // by this updater.
     MaxParallelism int `validate:"min=0"`
 
+    // MaxAttempts controls how many times the updater will attempt to generate
+    // a single CRL shard. A higher number increases the likelihood of a fully
+    // successful run, but also increases the worst-case runtime and db/network
+    // load of said run. The default is 1.
+    MaxAttempts int `validate:"omitempty,min=1"`
+
     Features map[string]bool
 }
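A note on the new validation tag (assuming the usual go-playground/validator semantics for these tags): omitempty means a zero-valued MaxAttempts skips the min=1 check entirely, and NewUpdater then separately clamps any value of zero or below to 1 (see the @@ -85,6 +88,10 @@ hunk below), so leaving the field unset behaves the same as setting it to 1.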
@@ -158,6 +164,7 @@ func main() {
         c.CRLUpdater.UpdatePeriod.Duration,
         c.CRLUpdater.UpdateOffset.Duration,
         c.CRLUpdater.MaxParallelism,
+        c.CRLUpdater.MaxAttempts,
         sac,
         cac,
         csc,
@@ -189,12 +189,17 @@ func TestRetryBackoff(t *testing.T) {
     base := time.Minute
     max := 10 * time.Minute
 
+    backoff := RetryBackoff(0, base, max, factor)
+    assertBetween(float64(backoff), 0, 0)
+
     expected := base
-    backoff := RetryBackoff(1, base, max, factor)
+    backoff = RetryBackoff(1, base, max, factor)
     assertBetween(float64(backoff), float64(expected)*0.8, float64(expected)*1.2)
 
     expected = time.Second * 90
     backoff = RetryBackoff(2, base, max, factor)
     assertBetween(float64(backoff), float64(expected)*0.8, float64(expected)*1.2)
 
     expected = time.Minute * 10
     // should be truncated
     backoff = RetryBackoff(7, base, max, factor)
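For context on the expected values in this test (assuming the factor variable, elided from this hunk, is 1.5): RetryBackoff(2, 1m, 10m, 1.5) is roughly 1m x 1.5 = 90s before jitter, while RetryBackoff(7, ...) would be roughly 1m x 1.5^6, about 11.4 minutes, which exceeds the 10-minute max and is truncated to it. The 0.8/1.2 bounds in the assertions account for the +/-20% jitter.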
@@ -17,6 +17,7 @@ import (
     "google.golang.org/protobuf/types/known/emptypb"
 
     capb "github.com/letsencrypt/boulder/ca/proto"
+    "github.com/letsencrypt/boulder/core"
     "github.com/letsencrypt/boulder/core/proto"
     "github.com/letsencrypt/boulder/crl"
     cspb "github.com/letsencrypt/boulder/crl/storer/proto"
@@ -33,6 +34,7 @@ type crlUpdater struct {
     updatePeriod   time.Duration
     updateOffset   time.Duration
     maxParallelism int
+    maxAttempts    int
 
     sa sapb.StorageAuthorityReadOnlyClient
     ca capb.CRLGeneratorClient
@@ -53,6 +55,7 @@ func NewUpdater(
     updatePeriod time.Duration,
     updateOffset time.Duration,
     maxParallelism int,
+    maxAttempts int,
     sa sapb.StorageAuthorityReadOnlyClient,
     ca capb.CRLGeneratorClient,
     cs cspb.CRLStorerClient,
@@ -85,6 +88,10 @@ func NewUpdater(
         maxParallelism = 1
     }
 
+    if maxAttempts <= 0 {
+        maxAttempts = 1
+    }
+
     tickHistogram := prometheus.NewHistogramVec(prometheus.HistogramOpts{
         Name: "crl_updater_ticks",
         Help: "A histogram of crl-updater tick latencies labeled by issuer and result",
@@ -106,6 +113,7 @@ func NewUpdater(
         updatePeriod,
         updateOffset,
         maxParallelism,
+        maxAttempts,
         sa,
         ca,
         cs,
@@ -244,7 +252,7 @@ func (cu *crlUpdater) tickIssuer(ctx context.Context, atTime time.Time, issuerNa
         default:
             out <- shardResult{
                 shardIdx: idx,
-                err:      cu.tickShard(ctx, atTime, issuerNameID, idx, shardMap[idx]),
+                err:      cu.tickShardWithRetry(ctx, atTime, issuerNameID, idx, shardMap[idx]),
             }
         }
     }
@@ -279,6 +287,30 @@ func (cu *crlUpdater) tickIssuer(ctx context.Context, atTime time.Time, issuerNa
     return nil
 }
 
+// tickShardWithRetry calls tickShard repeatedly (with exponential backoff
+// between attempts) until it succeeds or the max number of attempts is reached.
+func (cu *crlUpdater) tickShardWithRetry(ctx context.Context, atTime time.Time, issuerNameID issuance.IssuerNameID, shardIdx int, chunks []chunk) error {
+    crlID := crl.Id(issuerNameID, crl.Number(atTime), shardIdx)
+
+    var err error
+    for i := 0; i < cu.maxAttempts; i++ {
+        // core.RetryBackoff always returns 0 when its first argument is zero.
+        sleepTime := core.RetryBackoff(i, time.Second, time.Minute, 2)
+        if i != 0 {
+            cu.log.Errf(
+                "Generating CRL failed, will retry in %vs: id=[%s] err=[%s]",
+                sleepTime.Seconds(), crlID, err)
+        }
+        cu.clk.Sleep(sleepTime)
+
+        err = cu.tickShard(ctx, atTime, issuerNameID, shardIdx, chunks)
+        if err == nil {
+            break
+        }
+    }
+    return err
+}
+
 // tickShard processes a single shard. It computes the shard's boundaries, gets
 // the list of revoked certs in that shard from the SA, gets the CA to sign the
 // resulting CRL, and gets the crl-storer to upload it. It returns an error if
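To make the retry loop's cost concrete, here is a rough, non-authoritative sketch of the worst-case extra delay a persistently failing shard adds to its goroutine, using the parameters hard-coded above (base 1s, max 1m, factor 2) and ignoring the jitter core.RetryBackoff applies. worstCaseDelay is a hypothetical helper written for illustration, not part of this change.

package main

import (
	"fmt"
	"time"
)

// worstCaseDelay sums the sleeps the retry loop performs when every attempt
// fails: attempt 0 sleeps 0, attempt i sleeps roughly 1s * 2^(i-1), capped at
// one minute. Jitter (+/-20%) is ignored here for simplicity.
func worstCaseDelay(maxAttempts int) time.Duration {
	var total time.Duration
	for i := 1; i < maxAttempts; i++ {
		sleep := time.Second << (i - 1) // 1s, 2s, 4s, 8s, ...
		if sleep > time.Minute {
			sleep = time.Minute
		}
		total += sleep
	}
	return total
}

func main() {
	// With maxAttempts=5 this prints 15s, matching the 1+2+4+8 second
	// expectation in TestTickShardWithRetry below.
	fmt.Println(worstCaseDelay(5))
}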
@@ -146,7 +146,7 @@ func TestTickShard(t *testing.T) {
     cu, err := NewUpdater(
         []*issuance.Certificate{e1, r3},
         2, 18*time.Hour, 24*time.Hour,
-        6*time.Hour, 1*time.Minute, 1,
+        6*time.Hour, 1*time.Minute, 1, 1,
         &fakeSAC{grcc: fakeGRCC{}, maxNotAfter: clk.Now().Add(90 * 24 * time.Hour)},
         &fakeCGC{gcc: fakeGCC{}},
         &fakeCSC{ucc: fakeUCC{}},
@@ -222,6 +222,55 @@ func TestTickShard(t *testing.T) {
     cu.updatedCounter.Reset()
 }
 
+func TestTickShardWithRetry(t *testing.T) {
+    e1, err := issuance.LoadCertificate("../../test/hierarchy/int-e1.cert.pem")
+    test.AssertNotError(t, err, "loading test issuer")
+    r3, err := issuance.LoadCertificate("../../test/hierarchy/int-r3.cert.pem")
+    test.AssertNotError(t, err, "loading test issuer")
+
+    sentinelErr := errors.New("oops")
+
+    clk := clock.NewFake()
+    clk.Set(time.Date(2020, time.January, 1, 0, 0, 0, 0, time.UTC))
+
+    // Build an updater that will always fail when it talks to the SA.
+    cu, err := NewUpdater(
+        []*issuance.Certificate{e1, r3},
+        2, 18*time.Hour, 24*time.Hour,
+        6*time.Hour, 1*time.Minute, 1, 1,
+        &fakeSAC{grcc: fakeGRCC{err: sentinelErr}, maxNotAfter: clk.Now().Add(90 * 24 * time.Hour)},
+        &fakeCGC{gcc: fakeGCC{}},
+        &fakeCSC{ucc: fakeUCC{}},
+        metrics.NoopRegisterer, blog.NewMock(), clk,
+    )
+    test.AssertNotError(t, err, "building test crlUpdater")
+
+    testChunks := []chunk{
+        {clk.Now(), clk.Now().Add(18 * time.Hour), 0},
+    }
+
+    // Ensure that having MaxAttempts set to 1 results in the clock not moving
+    // forward at all.
+    startTime := cu.clk.Now()
+    err = cu.tickShardWithRetry(context.Background(), cu.clk.Now(), e1.NameID(), 0, testChunks)
+    test.AssertError(t, err, "database error")
+    test.AssertErrorIs(t, err, sentinelErr)
+    test.AssertEquals(t, cu.clk.Now(), startTime)
+
+    // Ensure that having MaxAttempts set to 5 results in the clock moving forward
+    // by 1+2+4+8=15 seconds. The core.RetryBackoff system has 20% jitter built
+    // in, so we have to be approximate.
+    cu.maxAttempts = 5
+    startTime = cu.clk.Now()
+    err = cu.tickShardWithRetry(context.Background(), cu.clk.Now(), e1.NameID(), 0, testChunks)
+    test.AssertError(t, err, "database error")
+    test.AssertErrorIs(t, err, sentinelErr)
+    t.Logf("start: %v", startTime)
+    t.Logf("now: %v", cu.clk.Now())
+    test.Assert(t, startTime.Add(15*0.8*time.Second).Before(cu.clk.Now()), "retries didn't sleep enough")
+    test.Assert(t, startTime.Add(15*1.2*time.Second).After(cu.clk.Now()), "retries slept too much")
+}
+
 func TestTickIssuer(t *testing.T) {
     e1, err := issuance.LoadCertificate("../../test/hierarchy/int-e1.cert.pem")
     test.AssertNotError(t, err, "loading test issuer")
@@ -234,7 +283,7 @@ func TestTickIssuer(t *testing.T) {
     cu, err := NewUpdater(
         []*issuance.Certificate{e1, r3},
         2, 18*time.Hour, 24*time.Hour,
-        6*time.Hour, 1*time.Minute, 1,
+        6*time.Hour, 1*time.Minute, 1, 1,
         &fakeSAC{grcc: fakeGRCC{err: errors.New("db no worky")}, maxNotAfter: clk.Now().Add(90 * 24 * time.Hour)},
         &fakeCGC{gcc: fakeGCC{}},
         &fakeCSC{ucc: fakeUCC{}},
@@ -270,7 +319,7 @@ func TestTick(t *testing.T) {
     cu, err := NewUpdater(
         []*issuance.Certificate{e1, r3},
         2, 18*time.Hour, 24*time.Hour,
-        6*time.Hour, 1*time.Minute, 1,
+        6*time.Hour, 1*time.Minute, 1, 1,
         &fakeSAC{grcc: fakeGRCC{err: errors.New("db no worky")}, maxNotAfter: clk.Now().Add(90 * 24 * time.Hour)},
         &fakeCGC{gcc: fakeGCC{}},
         &fakeCSC{ucc: fakeUCC{}},
@@ -46,7 +46,8 @@
         "lookbackPeriod": "24h",
         "updatePeriod": "6h",
         "updateOffset": "9120s",
-        "maxParallelism": 10
+        "maxParallelism": 10,
+        "maxAttempts": 5
     },
     "syslog": {
         "stdoutlevel": 6,