Backoff OCSP Updater on HSM failure

If a ServiceUnavailableError is returned from GenerateOCSP backoff before
attempting to retry the call as to not to overwhelm the CA with calls that
may instantly fail.
This commit is contained in:
Roland Shoemaker 2015-10-26 14:06:32 -07:00
parent c75c1f26fa
commit 661476f40e
6 changed files with 163 additions and 51 deletions

View File

@ -91,6 +91,8 @@ func newUpdater(
tickDur: config.NewCertificateWindow.Duration,
tickFunc: updater.newCertificateTick,
name: "NewCertificates",
failureBackoffFactor: config.SignFailureBackoffFactor,
failureBackoffMax: config.SignFailureBackoffMax.Duration,
},
&looper{
clk: clk,
@ -99,7 +101,11 @@ func newUpdater(
tickDur: config.OldOCSPWindow.Duration,
tickFunc: updater.oldOCSPResponsesTick,
name: "OldOCSPResponses",
failureBackoffFactor: config.SignFailureBackoffFactor,
failureBackoffMax: config.SignFailureBackoffMax.Duration,
},
// The missing SCT loop doesn't need to know about failureBackoffFactor or
// failureBackoffMax as it doesn't make any calls to the CA
&looper{
clk: clk,
stats: stats,
@ -118,6 +124,8 @@ func newUpdater(
tickDur: config.RevokedCertificateWindow.Duration,
tickFunc: updater.revokedCertificatesTick,
name: "RevokedCertificates",
failureBackoffFactor: config.SignFailureBackoffFactor,
failureBackoffMax: config.SignFailureBackoffMax.Duration,
})
}
@ -248,17 +256,17 @@ func (updater *OCSPUpdater) storeResponse(status *core.CertificateStatus) error
// newCertificateTick checks for certificates issued since the last tick and
// generates and stores OCSP responses for these certs
func (updater *OCSPUpdater) newCertificateTick(batchSize int) {
func (updater *OCSPUpdater) newCertificateTick(batchSize int) error {
// Check for anything issued between now and previous tick and generate first
// OCSP responses
statuses, err := updater.getCertificatesWithMissingResponses(batchSize)
if err != nil {
updater.stats.Inc("OCSP.Errors.FindMissingResponses", 1, 1.0)
updater.log.AuditErr(fmt.Errorf("Failed to find certificates with missing OCSP responses: %s", err))
return
return err
}
updater.generateOCSPResponses(statuses)
return updater.generateOCSPResponses(statuses)
}
func (updater *OCSPUpdater) findRevokedCertificatesToUpdate(batchSize int) ([]core.CertificateStatus, error) {
@ -277,12 +285,12 @@ func (updater *OCSPUpdater) findRevokedCertificatesToUpdate(batchSize int) ([]co
return statuses, err
}
func (updater *OCSPUpdater) revokedCertificatesTick(batchSize int) {
func (updater *OCSPUpdater) revokedCertificatesTick(batchSize int) error {
statuses, err := updater.findRevokedCertificatesToUpdate(batchSize)
if err != nil {
updater.stats.Inc("OCSP.Errors.FindRevokedCertificates", 1, 1.0)
updater.log.AuditErr(fmt.Errorf("Failed to find revoked certificates: %s", err))
return
return err
}
for _, status := range statuses {
@ -290,7 +298,7 @@ func (updater *OCSPUpdater) revokedCertificatesTick(batchSize int) {
if err != nil {
updater.log.AuditErr(fmt.Errorf("Failed to generate revoked OCSP response: %s", err))
updater.stats.Inc("OCSP.Errors.RevokedResponseGeneration", 1, 1.0)
continue
return err
}
err = updater.storeResponse(meta)
if err != nil {
@ -299,15 +307,16 @@ func (updater *OCSPUpdater) revokedCertificatesTick(batchSize int) {
continue
}
}
return nil
}
func (updater *OCSPUpdater) generateOCSPResponses(statuses []core.CertificateStatus) {
func (updater *OCSPUpdater) generateOCSPResponses(statuses []core.CertificateStatus) error {
for _, status := range statuses {
meta, err := updater.generateResponse(status)
if err != nil {
updater.log.AuditErr(fmt.Errorf("Failed to generate OCSP response: %s", err))
updater.stats.Inc("OCSP.Errors.ResponseGeneration", 1, 1.0)
continue
return err
}
updater.stats.Inc("OCSP.GeneratedResponses", 1, 1.0)
err = updater.storeResponse(meta)
@ -318,21 +327,21 @@ func (updater *OCSPUpdater) generateOCSPResponses(statuses []core.CertificateSta
}
updater.stats.Inc("OCSP.StoredResponses", 1, 1.0)
}
return
return nil
}
// oldOCSPResponsesTick looks for certificates with stale OCSP responses and
// generates/stores new ones
func (updater *OCSPUpdater) oldOCSPResponsesTick(batchSize int) {
func (updater *OCSPUpdater) oldOCSPResponsesTick(batchSize int) error {
now := time.Now()
statuses, err := updater.findStaleOCSPResponses(now.Add(-updater.ocspMinTimeToExpiry), batchSize)
if err != nil {
updater.stats.Inc("OCSP.Errors.FindStaleResponses", 1, 1.0)
updater.log.AuditErr(fmt.Errorf("Failed to find stale OCSP responses: %s", err))
return
return err
}
updater.generateOCSPResponses(statuses)
return updater.generateOCSPResponses(statuses)
}
func (updater *OCSPUpdater) getSerialsIssuedSince(since time.Time, batchSize int) ([]string, error) {
@ -366,13 +375,13 @@ func (updater *OCSPUpdater) getNumberOfReceipts(serial string) (int, error) {
// missingReceiptsTick looks for certificates without the correct number of SCT
// receipts and retrieves them
func (updater *OCSPUpdater) missingReceiptsTick(batchSize int) {
func (updater *OCSPUpdater) missingReceiptsTick(batchSize int) error {
now := updater.clk.Now()
since := now.Add(-updater.oldestIssuedSCT)
serials, err := updater.getSerialsIssuedSince(since, batchSize)
if err != nil {
updater.log.AuditErr(fmt.Errorf("Failed to get certificate serials: %s", err))
return
return err
}
for _, serial := range serials {
@ -396,7 +405,7 @@ func (updater *OCSPUpdater) missingReceiptsTick(batchSize int) {
continue
}
}
return nil
}
type looper struct {
@ -404,17 +413,16 @@ type looper struct {
stats statsd.Statter
batchSize int
tickDur time.Duration
tickFunc func(int)
tickFunc func(int) error
name string
failureBackoffFactor float64
failureBackoffMax time.Duration
failures int
}
func (l *looper) loop() error {
if l.batchSize == 0 || l.tickDur == 0 {
return fmt.Errorf("Both batch size and tick duration are required, not running '%s' loop", l.name)
}
for {
func (l *looper) tick() {
tickStart := l.clk.Now()
l.tickFunc(l.batchSize)
err := l.tickFunc(l.batchSize)
l.stats.TimingDuration(fmt.Sprintf("OCSP.%s.TickDuration", l.name), time.Since(tickStart), 1.0)
l.stats.Inc(fmt.Sprintf("OCSP.%s.Ticks", l.name), 1, 1.0)
tickEnd := tickStart.Add(time.Since(tickStart))
@ -422,9 +430,33 @@ func (l *looper) loop() error {
if tickEnd.After(expectedTickEnd) {
l.stats.Inc(fmt.Sprintf("OCSP.%s.LongTicks", l.name), 1, 1.0)
}
// Sleep for the remaining tick period (if this is a negative number sleep
// will not do anything and carry on)
l.clk.Sleep(expectedTickEnd.Sub(tickEnd))
// After we have all the stats stuff out of the way let's check if the tick
// function failed, if the reason is the HSM is dead increase the length of
// sleepDur using the exponentially increasing duration returned by core.RetryBackoff.
sleepDur := expectedTickEnd.Sub(tickEnd)
if err != nil {
l.stats.Inc(fmt.Sprintf("OCSP.%s.FailedTicks", l.name), 1, 1.0)
if _, ok := err.(core.ServiceUnavailableError); ok && (l.failureBackoffFactor > 0 && l.failureBackoffMax > 0) {
l.failures++
sleepDur = core.RetryBackoff(l.failures, l.tickDur, l.failureBackoffMax, l.failureBackoffFactor)
}
} else if l.failures > 0 {
// If the tick was successful and previously there were failures reset
// counter to 0
l.failures = 0
}
// Sleep for the remaining tick period or for the backoff time
l.clk.Sleep(sleepDur)
}
func (l *looper) loop() error {
if l.batchSize == 0 || l.tickDur == 0 {
return fmt.Errorf("Both batch size and tick duration are required, not running '%s' loop", l.name)
}
for {
l.tick()
}
}

View File

@ -321,3 +321,44 @@ func TestStoreResponseGuard(t *testing.T) {
test.AssertNotError(t, err, "Failed to get certificate status")
test.AssertEquals(t, len(changedStatus.OCSPResponse), 3)
}
func TestLoopTickBackoff(t *testing.T) {
fc := clock.NewFake()
stats, _ := statsd.NewNoopClient(nil)
l := looper{
clk: fc,
stats: stats,
failureBackoffFactor: 1.5,
failureBackoffMax: 10 * time.Minute,
tickDur: time.Minute,
tickFunc: func(_ int) error { return core.ServiceUnavailableError("sad HSM") },
}
start := l.clk.Now()
l.tick()
// Expected to sleep for 1m
backoff := float64(60000000000)
maxJittered := backoff * 1.2
test.AssertBetween(t, l.clk.Now().Sub(start).Nanoseconds(), int64(backoff), int64(maxJittered))
start = l.clk.Now()
l.tick()
// Expected to sleep for 1m30s
backoff = 90000000000
maxJittered = backoff * 1.2
test.AssertBetween(t, l.clk.Now().Sub(start).Nanoseconds(), int64(backoff), int64(maxJittered))
l.failures = 6
start = l.clk.Now()
l.tick()
// Expected to sleep for 11m23.4375s, should be truncated to 10m
backoff = 600000000000
maxJittered = backoff * 1.2
test.AssertBetween(t, l.clk.Now().Sub(start).Nanoseconds(), int64(backoff), int64(maxJittered))
l.tickFunc = func(_ int) error { return nil }
start = l.clk.Now()
l.tick()
test.AssertEquals(t, l.failures, 0)
test.AssertEquals(t, l.clk.Now(), start)
}

View File

@ -302,6 +302,9 @@ type OCSPUpdaterConfig struct {
OCSPMinTimeToExpiry ConfigDuration
OldestIssuedSCT ConfigDuration
SignFailureBackoffFactor float64
SignFailureBackoffMax ConfigDuration
// DebugAddr is the address to run the /debug handlers on.
DebugAddr string
}

View File

@ -26,9 +26,11 @@ import (
"io"
"io/ioutil"
"math/big"
mrand "math/rand"
"net/url"
"regexp"
"strings"
"time"
jose "github.com/letsencrypt/boulder/Godeps/_workspace/src/github.com/letsencrypt/go-jose"
blog "github.com/letsencrypt/boulder/log"
@ -536,3 +538,28 @@ func LoadCert(filename string) (cert *x509.Certificate, err error) {
cert, err = x509.ParseCertificate(block.Bytes)
return
}
// retryJitter is used to prevent bunched retried queries from falling into lockstep
const retryJitter = 0.2
// RetryBackoff calculates a backoff time based on number of retries, will always
// add jitter so requests that start in unison won't fall into lockstep. Because of
// this the returned duration can always be larger than the maximum by a factor of
// retryJitter. Adapted from https://github.com/grpc/grpc-go/blob/master/rpc_util.go#L311
func RetryBackoff(retries int, base, max time.Duration, factor float64) time.Duration {
if retries == 0 {
return 0
}
backoff, fMax := float64(base), float64(max)
for backoff < fMax && retries > 1 {
backoff *= factor
retries--
}
if backoff > fMax {
backoff = fMax
}
// Randomize backoff delays so that if a cluster of requests start at
// the same time, they won't operate in lockstep.
backoff *= (1 - retryJitter) + 2*retryJitter*mrand.Float64()
return time.Duration(backoff)
}

View File

@ -173,6 +173,8 @@
"revokedCertificateBatchSize": 1000,
"ocspMinTimeToExpiry": "72h",
"oldestIssuedSCT": "72h",
"signFailureBackoffFactor": 1.2,
"signFailureBackoffMax": "30m",
"debugAddr": "localhost:8006"
},

View File

@ -133,3 +133,10 @@ func AssertSeverity(t *testing.T, data string, severity int) {
expected := fmt.Sprintf("\"severity\":%d", severity)
AssertContains(t, data, expected)
}
// AssertBetween determines if a is between b and c
func AssertBetween(t *testing.T, a, b, c int64) {
if a < b || a > c {
t.Fatalf("%d is not between %d and %d", a, b, c)
}
}