Backoff OCSP Updater on HSM failure
If a ServiceUnavailableError is returned from GenerateOCSP, back off before retrying the call so as not to overwhelm the CA with calls that are likely to fail immediately.
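To get a feel for the schedule this produces, the sketch below is a standalone copy of the `core.RetryBackoff` function added in this commit (renamed `retryBackoff`; the one-minute tick window is an assumed example, while factor 1.2 and max 30m come from the sample config changed below). It simply prints the sleeps a loop would perform while the HSM stays down.

```go
package main

import (
	"fmt"
	mrand "math/rand"
	"time"
)

// retryBackoff is a local copy of core.RetryBackoff (added in this commit),
// reproduced here only to print a schedule: capped exponential growth with
// +/-20% jitter so concurrent loops don't retry in lockstep.
func retryBackoff(retries int, base, max time.Duration, factor float64) time.Duration {
	if retries == 0 {
		return 0
	}
	backoff, fMax := float64(base), float64(max)
	for backoff < fMax && retries > 1 {
		backoff *= factor
		retries--
	}
	if backoff > fMax {
		backoff = fMax
	}
	backoff *= 0.8 + 0.4*mrand.Float64() // jitter
	return time.Duration(backoff)
}

func main() {
	// Assumed one-minute tick window; factor and max match the sample config.
	for failures := 1; failures <= 25; failures += 6 {
		fmt.Printf("failures=%2d sleep~%s\n",
			failures, retryBackoff(failures, time.Minute, 30*time.Minute, 1.2))
	}
}
```

With those values the sleep pins at the 30m cap (plus up to 20% jitter) after roughly twenty consecutive failures.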
parent c75c1f26fa
commit 661476f40e
```diff
@@ -85,21 +85,27 @@ func newUpdater(
 	// Setup loops
 	updater.loops = []*looper{
 		&looper{
-			clk:       clk,
-			stats:     stats,
-			batchSize: config.NewCertificateBatchSize,
-			tickDur:   config.NewCertificateWindow.Duration,
-			tickFunc:  updater.newCertificateTick,
-			name:      "NewCertificates",
+			clk:                  clk,
+			stats:                stats,
+			batchSize:            config.NewCertificateBatchSize,
+			tickDur:              config.NewCertificateWindow.Duration,
+			tickFunc:             updater.newCertificateTick,
+			name:                 "NewCertificates",
+			failureBackoffFactor: config.SignFailureBackoffFactor,
+			failureBackoffMax:    config.SignFailureBackoffMax.Duration,
 		},
 		&looper{
-			clk:       clk,
-			stats:     stats,
-			batchSize: config.OldOCSPBatchSize,
-			tickDur:   config.OldOCSPWindow.Duration,
-			tickFunc:  updater.oldOCSPResponsesTick,
-			name:      "OldOCSPResponses",
+			clk:                  clk,
+			stats:                stats,
+			batchSize:            config.OldOCSPBatchSize,
+			tickDur:              config.OldOCSPWindow.Duration,
+			tickFunc:             updater.oldOCSPResponsesTick,
+			name:                 "OldOCSPResponses",
+			failureBackoffFactor: config.SignFailureBackoffFactor,
+			failureBackoffMax:    config.SignFailureBackoffMax.Duration,
 		},
+		// The missing SCT loop doesn't need to know about failureBackoffFactor or
+		// failureBackoffMax as it doesn't make any calls to the CA
 		&looper{
 			clk:   clk,
 			stats: stats,
```
```diff
@@ -112,12 +118,14 @@ func newUpdater(
 	if config.RevokedCertificateBatchSize != 0 &&
 		config.RevokedCertificateWindow.Duration != 0 {
 		updater.loops = append(updater.loops, &looper{
-			clk:       clk,
-			stats:     stats,
-			batchSize: config.RevokedCertificateBatchSize,
-			tickDur:   config.RevokedCertificateWindow.Duration,
-			tickFunc:  updater.revokedCertificatesTick,
-			name:      "RevokedCertificates",
+			clk:                  clk,
+			stats:                stats,
+			batchSize:            config.RevokedCertificateBatchSize,
+			tickDur:              config.RevokedCertificateWindow.Duration,
+			tickFunc:             updater.revokedCertificatesTick,
+			name:                 "RevokedCertificates",
+			failureBackoffFactor: config.SignFailureBackoffFactor,
+			failureBackoffMax:    config.SignFailureBackoffMax.Duration,
 		})
 	}
 
```
```diff
@@ -248,17 +256,17 @@ func (updater *OCSPUpdater) storeResponse(status *core.CertificateStatus) error
 
 // newCertificateTick checks for certificates issued since the last tick and
 // generates and stores OCSP responses for these certs
-func (updater *OCSPUpdater) newCertificateTick(batchSize int) {
+func (updater *OCSPUpdater) newCertificateTick(batchSize int) error {
 	// Check for anything issued between now and previous tick and generate first
 	// OCSP responses
 	statuses, err := updater.getCertificatesWithMissingResponses(batchSize)
 	if err != nil {
 		updater.stats.Inc("OCSP.Errors.FindMissingResponses", 1, 1.0)
 		updater.log.AuditErr(fmt.Errorf("Failed to find certificates with missing OCSP responses: %s", err))
-		return
+		return err
 	}
 
-	updater.generateOCSPResponses(statuses)
+	return updater.generateOCSPResponses(statuses)
 }
 
 func (updater *OCSPUpdater) findRevokedCertificatesToUpdate(batchSize int) ([]core.CertificateStatus, error) {
```
```diff
@@ -277,12 +285,12 @@ func (updater *OCSPUpdater) findRevokedCertificatesToUpdate(batchSize int) ([]co
 	return statuses, err
 }
 
-func (updater *OCSPUpdater) revokedCertificatesTick(batchSize int) {
+func (updater *OCSPUpdater) revokedCertificatesTick(batchSize int) error {
 	statuses, err := updater.findRevokedCertificatesToUpdate(batchSize)
 	if err != nil {
 		updater.stats.Inc("OCSP.Errors.FindRevokedCertificates", 1, 1.0)
 		updater.log.AuditErr(fmt.Errorf("Failed to find revoked certificates: %s", err))
-		return
+		return err
 	}
 
 	for _, status := range statuses {
```
```diff
@@ -290,7 +298,7 @@ func (updater *OCSPUpdater) revokedCertificatesTick(batchSize int) {
 		if err != nil {
 			updater.log.AuditErr(fmt.Errorf("Failed to generate revoked OCSP response: %s", err))
 			updater.stats.Inc("OCSP.Errors.RevokedResponseGeneration", 1, 1.0)
-			continue
+			return err
 		}
 		err = updater.storeResponse(meta)
 		if err != nil {
```
```diff
@@ -299,15 +307,16 @@ func (updater *OCSPUpdater) revokedCertificatesTick(batchSize int) {
 			continue
 		}
 	}
+	return nil
 }
 
-func (updater *OCSPUpdater) generateOCSPResponses(statuses []core.CertificateStatus) {
+func (updater *OCSPUpdater) generateOCSPResponses(statuses []core.CertificateStatus) error {
 	for _, status := range statuses {
 		meta, err := updater.generateResponse(status)
 		if err != nil {
 			updater.log.AuditErr(fmt.Errorf("Failed to generate OCSP response: %s", err))
 			updater.stats.Inc("OCSP.Errors.ResponseGeneration", 1, 1.0)
-			continue
+			return err
 		}
 		updater.stats.Inc("OCSP.GeneratedResponses", 1, 1.0)
 		err = updater.storeResponse(meta)
```
```diff
@@ -318,21 +327,21 @@ func (updater *OCSPUpdater) generateOCSPResponses(statuses []core.CertificateSta
 		}
 		updater.stats.Inc("OCSP.StoredResponses", 1, 1.0)
 	}
-	return
+	return nil
 }
 
 // oldOCSPResponsesTick looks for certificates with stale OCSP responses and
 // generates/stores new ones
-func (updater *OCSPUpdater) oldOCSPResponsesTick(batchSize int) {
+func (updater *OCSPUpdater) oldOCSPResponsesTick(batchSize int) error {
 	now := time.Now()
 	statuses, err := updater.findStaleOCSPResponses(now.Add(-updater.ocspMinTimeToExpiry), batchSize)
 	if err != nil {
 		updater.stats.Inc("OCSP.Errors.FindStaleResponses", 1, 1.0)
 		updater.log.AuditErr(fmt.Errorf("Failed to find stale OCSP responses: %s", err))
-		return
+		return err
 	}
 
-	updater.generateOCSPResponses(statuses)
+	return updater.generateOCSPResponses(statuses)
 }
 
 func (updater *OCSPUpdater) getSerialsIssuedSince(since time.Time, batchSize int) ([]string, error) {
```
```diff
@@ -366,13 +375,13 @@ func (updater *OCSPUpdater) getNumberOfReceipts(serial string) (int, error) {
 
 // missingReceiptsTick looks for certificates without the correct number of SCT
 // receipts and retrieves them
-func (updater *OCSPUpdater) missingReceiptsTick(batchSize int) {
+func (updater *OCSPUpdater) missingReceiptsTick(batchSize int) error {
 	now := updater.clk.Now()
 	since := now.Add(-updater.oldestIssuedSCT)
 	serials, err := updater.getSerialsIssuedSince(since, batchSize)
 	if err != nil {
 		updater.log.AuditErr(fmt.Errorf("Failed to get certificate serials: %s", err))
-		return
+		return err
 	}
 
 	for _, serial := range serials {
```
```diff
@@ -396,16 +405,50 @@ func (updater *OCSPUpdater) missingReceiptsTick(batchSize int) {
 			continue
 		}
 	}
+
+	return nil
 }
 
 type looper struct {
-	clk       clock.Clock
-	stats     statsd.Statter
-	batchSize int
-	tickDur   time.Duration
-	tickFunc  func(int)
-	name      string
+	clk                  clock.Clock
+	stats                statsd.Statter
+	batchSize            int
+	tickDur              time.Duration
+	tickFunc             func(int) error
+	name                 string
+	failureBackoffFactor float64
+	failureBackoffMax    time.Duration
+	failures             int
+}
+
+func (l *looper) tick() {
+	tickStart := l.clk.Now()
+	err := l.tickFunc(l.batchSize)
+	l.stats.TimingDuration(fmt.Sprintf("OCSP.%s.TickDuration", l.name), time.Since(tickStart), 1.0)
+	l.stats.Inc(fmt.Sprintf("OCSP.%s.Ticks", l.name), 1, 1.0)
+	tickEnd := tickStart.Add(time.Since(tickStart))
+	expectedTickEnd := tickStart.Add(l.tickDur)
+	if tickEnd.After(expectedTickEnd) {
+		l.stats.Inc(fmt.Sprintf("OCSP.%s.LongTicks", l.name), 1, 1.0)
+	}
+
+	// After we have all the stats stuff out of the way let's check if the tick
+	// function failed, if the reason is the HSM is dead increase the length of
+	// sleepDur using the exponentially increasing duration returned by core.RetryBackoff.
+	sleepDur := expectedTickEnd.Sub(tickEnd)
+	if err != nil {
+		l.stats.Inc(fmt.Sprintf("OCSP.%s.FailedTicks", l.name), 1, 1.0)
+		if _, ok := err.(core.ServiceUnavailableError); ok && (l.failureBackoffFactor > 0 && l.failureBackoffMax > 0) {
+			l.failures++
+			sleepDur = core.RetryBackoff(l.failures, l.tickDur, l.failureBackoffMax, l.failureBackoffFactor)
+		}
+	} else if l.failures > 0 {
+		// If the tick was successful and previously there were failures reset
+		// counter to 0
+		l.failures = 0
+	}
+
+	// Sleep for the remaining tick period or for the backoff time
+	l.clk.Sleep(sleepDur)
 }
 
 func (l *looper) loop() error {
@@ -413,18 +456,7 @@ func (l *looper) loop() error {
 		return fmt.Errorf("Both batch size and tick duration are required, not running '%s' loop", l.name)
 	}
 	for {
-		tickStart := l.clk.Now()
-		l.tickFunc(l.batchSize)
-		l.stats.TimingDuration(fmt.Sprintf("OCSP.%s.TickDuration", l.name), time.Since(tickStart), 1.0)
-		l.stats.Inc(fmt.Sprintf("OCSP.%s.Ticks", l.name), 1, 1.0)
-		tickEnd := tickStart.Add(time.Since(tickStart))
-		expectedTickEnd := tickStart.Add(l.tickDur)
-		if tickEnd.After(expectedTickEnd) {
-			l.stats.Inc(fmt.Sprintf("OCSP.%s.LongTicks", l.name), 1, 1.0)
-		}
-		// Sleep for the remaining tick period (if this is a negative number sleep
-		// will not do anything and carry on)
-		l.clk.Sleep(expectedTickEnd.Sub(tickEnd))
+		l.tick()
 	}
 }
```
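Extracting the loop body into `tick()` is what makes the backoff unit-testable with a fake clock (see the test below). Note that only a `core.ServiceUnavailableError` — and only when both backoff knobs are set — switches the sleep from the remaining tick window to the backoff duration; other errors are counted as failed ticks but keep the normal cadence. A minimal sketch of that guard, using an illustrative stand-in type and a hypothetical `shouldBackOff` helper (the real type assertion lives inline in `tick()`):

```go
package main

import (
	"fmt"
	"time"
)

// Illustrative stand-in for core.ServiceUnavailableError, a string-backed error
// type; the test below constructs the real one as core.ServiceUnavailableError("sad HSM").
type ServiceUnavailableError string

func (e ServiceUnavailableError) Error() string { return string(e) }

// shouldBackOff mirrors the guard in tick(): extend the sleep only for a
// ServiceUnavailableError, and only when both backoff knobs are configured.
func shouldBackOff(err error, factor float64, max time.Duration) bool {
	_, ok := err.(ServiceUnavailableError)
	return ok && factor > 0 && max > 0
}

func main() {
	fmt.Println(shouldBackOff(ServiceUnavailableError("sad HSM"), 1.5, 10*time.Minute)) // true: back off
	fmt.Println(shouldBackOff(fmt.Errorf("db timeout"), 1.5, 10*time.Minute))           // false: normal cadence
	fmt.Println(shouldBackOff(ServiceUnavailableError("sad HSM"), 0, 0))                // false: knobs unset
}
```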
```diff
@@ -321,3 +321,44 @@ func TestStoreResponseGuard(t *testing.T) {
 	test.AssertNotError(t, err, "Failed to get certificate status")
 	test.AssertEquals(t, len(changedStatus.OCSPResponse), 3)
 }
+
+func TestLoopTickBackoff(t *testing.T) {
+	fc := clock.NewFake()
+	stats, _ := statsd.NewNoopClient(nil)
+	l := looper{
+		clk:                  fc,
+		stats:                stats,
+		failureBackoffFactor: 1.5,
+		failureBackoffMax:    10 * time.Minute,
+		tickDur:              time.Minute,
+		tickFunc:             func(_ int) error { return core.ServiceUnavailableError("sad HSM") },
+	}
+
+	start := l.clk.Now()
+	l.tick()
+	// Expected to sleep for 1m
+	backoff := float64(60000000000)
+	maxJittered := backoff * 1.2
+	test.AssertBetween(t, l.clk.Now().Sub(start).Nanoseconds(), int64(backoff), int64(maxJittered))
+
+	start = l.clk.Now()
+	l.tick()
+	// Expected to sleep for 1m30s
+	backoff = 90000000000
+	maxJittered = backoff * 1.2
+	test.AssertBetween(t, l.clk.Now().Sub(start).Nanoseconds(), int64(backoff), int64(maxJittered))
+
+	l.failures = 6
+	start = l.clk.Now()
+	l.tick()
+	// Expected to sleep for 11m23.4375s, should be truncated to 10m
+	backoff = 600000000000
+	maxJittered = backoff * 1.2
+	test.AssertBetween(t, l.clk.Now().Sub(start).Nanoseconds(), int64(backoff), int64(maxJittered))
+
+	l.tickFunc = func(_ int) error { return nil }
+	start = l.clk.Now()
+	l.tick()
+	test.AssertEquals(t, l.failures, 0)
+	test.AssertEquals(t, l.clk.Now(), start)
+}
```
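The expected sleeps fall out of the backoff arithmetic: `tick()` increments `failures` before calling `RetryBackoff`, and `RetryBackoff` multiplies the base by the factor retries−1 times. So the first failed tick sleeps the plain tickDur of 1m (failures=1, no multiplication), the second sleeps 1m × 1.5 = 1m30s, and presetting failures to 6 makes the third call use retries=7, giving 60s × 1.5^6 = 683.4375s = 11m23.4375s, which failureBackoffMax truncates to 10m. Jitter can stretch each sleep by up to a factor of 1.2, hence the AssertBetween bounds; the final successful tick resets `failures` to 0 and performs no backoff sleep, leaving the fake clock where it started.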
```diff
@@ -302,6 +302,9 @@ type OCSPUpdaterConfig struct {
 	OCSPMinTimeToExpiry ConfigDuration
 	OldestIssuedSCT     ConfigDuration
 
+	SignFailureBackoffFactor float64
+	SignFailureBackoffMax    ConfigDuration
+
 	// DebugAddr is the address to run the /debug handlers on.
 	DebugAddr string
 }
```
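Both fields are effectively optional: `tick()` only switches to the backoff schedule when `failureBackoffFactor` and `failureBackoffMax` are both non-zero, so loopers constructed without them — like the missing-SCT loop, which never calls the CA — keep their fixed per-tick cadence.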
core/util.go (+27)
```diff
@@ -26,9 +26,11 @@ import (
 	"io"
 	"io/ioutil"
 	"math/big"
+	mrand "math/rand"
 	"net/url"
 	"regexp"
 	"strings"
+	"time"
 
 	jose "github.com/letsencrypt/boulder/Godeps/_workspace/src/github.com/letsencrypt/go-jose"
 	blog "github.com/letsencrypt/boulder/log"
```
```diff
@@ -536,3 +538,28 @@ func LoadCert(filename string) (cert *x509.Certificate, err error) {
 	cert, err = x509.ParseCertificate(block.Bytes)
 	return
 }
+
+// retryJitter is used to prevent bunched retried queries from falling into lockstep
+const retryJitter = 0.2
+
+// RetryBackoff calculates a backoff time based on number of retries, will always
+// add jitter so requests that start in unison won't fall into lockstep. Because of
+// this the returned duration can always be larger than the maximum by a factor of
+// retryJitter. Adapted from https://github.com/grpc/grpc-go/blob/master/rpc_util.go#L311
+func RetryBackoff(retries int, base, max time.Duration, factor float64) time.Duration {
+	if retries == 0 {
+		return 0
+	}
+	backoff, fMax := float64(base), float64(max)
+	for backoff < fMax && retries > 1 {
+		backoff *= factor
+		retries--
+	}
+	if backoff > fMax {
+		backoff = fMax
+	}
+	// Randomize backoff delays so that if a cluster of requests start at
+	// the same time, they won't operate in lockstep.
+	backoff *= (1 - retryJitter) + 2*retryJitter*mrand.Float64()
+	return time.Duration(backoff)
+}
```
```diff
@@ -173,6 +173,8 @@
 		"revokedCertificateBatchSize": 1000,
 		"ocspMinTimeToExpiry": "72h",
 		"oldestIssuedSCT": "72h",
+		"signFailureBackoffFactor": 1.2,
+		"signFailureBackoffMax": "30m",
 		"debugAddr": "localhost:8006"
 	},
 
```
```diff
@@ -133,3 +133,10 @@ func AssertSeverity(t *testing.T, data string, severity int) {
 	expected := fmt.Sprintf("\"severity\":%d", severity)
 	AssertContains(t, data, expected)
 }
+
+// AssertBetween determines if a is between b and c
+func AssertBetween(t *testing.T, a, b, c int64) {
+	if a < b || a > c {
+		t.Fatalf("%d is not between %d and %d", a, b, c)
+	}
+}
```