Backoff OCSP Updater on HSM failure
If a ServiceUnavailableError is returned from GenerateOCSP, back off before retrying the call so as not to overwhelm the CA with requests that are likely to fail immediately.
This commit is contained in:
parent c75c1f26fa
commit 661476f40e
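To make the backoff concrete before the diff itself, here is a minimal, self-contained sketch of the calculation this commit introduces. It mirrors core.RetryBackoff from the core/util.go hunk below; the package layout and printed output are illustrative only, and the 1m tick, 1.5 factor, and 10m cap are the values used in TestLoopTickBackoff. Consecutive HSM failures sleep roughly 1m, 1m30s, 2m15s, and so on until the cap, with each value jittered by up to 20%.

// Illustrative sketch only: mirrors the RetryBackoff logic added in core/util.go below.
package main

import (
	"fmt"
	"math/rand"
	"time"
)

const retryJitter = 0.2 // same 20% jitter factor as core.retryJitter

func retryBackoff(retries int, base, max time.Duration, factor float64) time.Duration {
	if retries == 0 {
		return 0
	}
	backoff, fMax := float64(base), float64(max)
	for backoff < fMax && retries > 1 {
		backoff *= factor
		retries--
	}
	if backoff > fMax {
		backoff = fMax
	}
	// Jitter keeps many updaters' retries from falling into lockstep.
	backoff *= (1 - retryJitter) + 2*retryJitter*rand.Float64()
	return time.Duration(backoff)
}

func main() {
	// With the test's settings (1m tick, factor 1.5, 10m cap) the sleep grows
	// 1m, 1m30s, 2m15s, ... and is capped at 10m, plus up to 20% jitter.
	for failures := 1; failures <= 7; failures++ {
		fmt.Printf("failure %d: sleep ~%s\n", failures, retryBackoff(failures, time.Minute, 10*time.Minute, 1.5))
	}
}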
@@ -91,6 +91,8 @@ func newUpdater(
 			tickDur:  config.NewCertificateWindow.Duration,
 			tickFunc: updater.newCertificateTick,
 			name:     "NewCertificates",
+			failureBackoffFactor: config.SignFailureBackoffFactor,
+			failureBackoffMax:    config.SignFailureBackoffMax.Duration,
 		},
 		&looper{
 			clk: clk,
@@ -99,7 +101,11 @@ func newUpdater(
 			tickDur:  config.OldOCSPWindow.Duration,
 			tickFunc: updater.oldOCSPResponsesTick,
 			name:     "OldOCSPResponses",
+			failureBackoffFactor: config.SignFailureBackoffFactor,
+			failureBackoffMax:    config.SignFailureBackoffMax.Duration,
 		},
+		// The missing SCT loop doesn't need to know about failureBackoffFactor or
+		// failureBackoffMax as it doesn't make any calls to the CA
 		&looper{
 			clk:   clk,
 			stats: stats,
@@ -118,6 +124,8 @@ func newUpdater(
 			tickDur:  config.RevokedCertificateWindow.Duration,
 			tickFunc: updater.revokedCertificatesTick,
 			name:     "RevokedCertificates",
+			failureBackoffFactor: config.SignFailureBackoffFactor,
+			failureBackoffMax:    config.SignFailureBackoffMax.Duration,
 		})
 }
@@ -248,17 +256,17 @@ func (updater *OCSPUpdater) storeResponse(status *core.CertificateStatus) error
 
 // newCertificateTick checks for certificates issued since the last tick and
 // generates and stores OCSP responses for these certs
-func (updater *OCSPUpdater) newCertificateTick(batchSize int) {
+func (updater *OCSPUpdater) newCertificateTick(batchSize int) error {
 	// Check for anything issued between now and previous tick and generate first
 	// OCSP responses
 	statuses, err := updater.getCertificatesWithMissingResponses(batchSize)
 	if err != nil {
 		updater.stats.Inc("OCSP.Errors.FindMissingResponses", 1, 1.0)
 		updater.log.AuditErr(fmt.Errorf("Failed to find certificates with missing OCSP responses: %s", err))
-		return
+		return err
 	}
 
-	updater.generateOCSPResponses(statuses)
+	return updater.generateOCSPResponses(statuses)
 }
 
 func (updater *OCSPUpdater) findRevokedCertificatesToUpdate(batchSize int) ([]core.CertificateStatus, error) {
@@ -277,12 +285,12 @@ func (updater *OCSPUpdater) findRevokedCertificatesToUpdate(batchSize int) ([]core.CertificateStatus, error) {
 	return statuses, err
 }
 
-func (updater *OCSPUpdater) revokedCertificatesTick(batchSize int) {
+func (updater *OCSPUpdater) revokedCertificatesTick(batchSize int) error {
 	statuses, err := updater.findRevokedCertificatesToUpdate(batchSize)
 	if err != nil {
 		updater.stats.Inc("OCSP.Errors.FindRevokedCertificates", 1, 1.0)
 		updater.log.AuditErr(fmt.Errorf("Failed to find revoked certificates: %s", err))
-		return
+		return err
 	}
 
 	for _, status := range statuses {
@@ -290,7 +298,7 @@ func (updater *OCSPUpdater) revokedCertificatesTick(batchSize int) {
 		if err != nil {
 			updater.log.AuditErr(fmt.Errorf("Failed to generate revoked OCSP response: %s", err))
 			updater.stats.Inc("OCSP.Errors.RevokedResponseGeneration", 1, 1.0)
-			continue
+			return err
 		}
 		err = updater.storeResponse(meta)
 		if err != nil {
@@ -299,15 +307,16 @@ func (updater *OCSPUpdater) revokedCertificatesTick(batchSize int) {
 			continue
 		}
 	}
+	return nil
 }
 
-func (updater *OCSPUpdater) generateOCSPResponses(statuses []core.CertificateStatus) {
+func (updater *OCSPUpdater) generateOCSPResponses(statuses []core.CertificateStatus) error {
 	for _, status := range statuses {
 		meta, err := updater.generateResponse(status)
 		if err != nil {
 			updater.log.AuditErr(fmt.Errorf("Failed to generate OCSP response: %s", err))
 			updater.stats.Inc("OCSP.Errors.ResponseGeneration", 1, 1.0)
-			continue
+			return err
 		}
 		updater.stats.Inc("OCSP.GeneratedResponses", 1, 1.0)
 		err = updater.storeResponse(meta)
@@ -318,21 +327,21 @@ func (updater *OCSPUpdater) generateOCSPResponses(statuses []core.CertificateStatus) {
 		}
 		updater.stats.Inc("OCSP.StoredResponses", 1, 1.0)
 	}
-	return
+	return nil
 }
 
 // oldOCSPResponsesTick looks for certificates with stale OCSP responses and
 // generates/stores new ones
-func (updater *OCSPUpdater) oldOCSPResponsesTick(batchSize int) {
+func (updater *OCSPUpdater) oldOCSPResponsesTick(batchSize int) error {
 	now := time.Now()
 	statuses, err := updater.findStaleOCSPResponses(now.Add(-updater.ocspMinTimeToExpiry), batchSize)
 	if err != nil {
 		updater.stats.Inc("OCSP.Errors.FindStaleResponses", 1, 1.0)
 		updater.log.AuditErr(fmt.Errorf("Failed to find stale OCSP responses: %s", err))
-		return
+		return err
 	}
 
-	updater.generateOCSPResponses(statuses)
+	return updater.generateOCSPResponses(statuses)
 }
 
 func (updater *OCSPUpdater) getSerialsIssuedSince(since time.Time, batchSize int) ([]string, error) {
@@ -366,13 +375,13 @@ func (updater *OCSPUpdater) getNumberOfReceipts(serial string) (int, error) {
 
 // missingReceiptsTick looks for certificates without the correct number of SCT
 // receipts and retrieves them
-func (updater *OCSPUpdater) missingReceiptsTick(batchSize int) {
+func (updater *OCSPUpdater) missingReceiptsTick(batchSize int) error {
 	now := updater.clk.Now()
 	since := now.Add(-updater.oldestIssuedSCT)
 	serials, err := updater.getSerialsIssuedSince(since, batchSize)
 	if err != nil {
 		updater.log.AuditErr(fmt.Errorf("Failed to get certificate serials: %s", err))
-		return
+		return err
 	}
 
 	for _, serial := range serials {
@@ -396,7 +405,7 @@ func (updater *OCSPUpdater) missingReceiptsTick(batchSize int) {
 			continue
 		}
 	}
+	return nil
 }
 
 type looper struct {
@@ -404,17 +413,16 @@ type looper struct {
 	stats     statsd.Statter
 	batchSize int
 	tickDur   time.Duration
-	tickFunc  func(int)
+	tickFunc  func(int) error
 	name      string
+	failureBackoffFactor float64
+	failureBackoffMax    time.Duration
+	failures             int
 }
 
-func (l *looper) loop() error {
-	if l.batchSize == 0 || l.tickDur == 0 {
-		return fmt.Errorf("Both batch size and tick duration are required, not running '%s' loop", l.name)
-	}
-	for {
+func (l *looper) tick() {
 	tickStart := l.clk.Now()
-	l.tickFunc(l.batchSize)
+	err := l.tickFunc(l.batchSize)
 	l.stats.TimingDuration(fmt.Sprintf("OCSP.%s.TickDuration", l.name), time.Since(tickStart), 1.0)
 	l.stats.Inc(fmt.Sprintf("OCSP.%s.Ticks", l.name), 1, 1.0)
 	tickEnd := tickStart.Add(time.Since(tickStart))
@@ -422,9 +430,33 @@ func (l *looper) loop() error {
 	if tickEnd.After(expectedTickEnd) {
 		l.stats.Inc(fmt.Sprintf("OCSP.%s.LongTicks", l.name), 1, 1.0)
 	}
-	// Sleep for the remaining tick period (if this is a negative number sleep
-	// will not do anything and carry on)
-	l.clk.Sleep(expectedTickEnd.Sub(tickEnd))
+	// After we have all the stats stuff out of the way let's check if the tick
+	// function failed, if the reason is the HSM is dead increase the length of
+	// sleepDur using the exponentially increasing duration returned by core.RetryBackoff.
+	sleepDur := expectedTickEnd.Sub(tickEnd)
+	if err != nil {
+		l.stats.Inc(fmt.Sprintf("OCSP.%s.FailedTicks", l.name), 1, 1.0)
+		if _, ok := err.(core.ServiceUnavailableError); ok && (l.failureBackoffFactor > 0 && l.failureBackoffMax > 0) {
+			l.failures++
+			sleepDur = core.RetryBackoff(l.failures, l.tickDur, l.failureBackoffMax, l.failureBackoffFactor)
+		}
+	} else if l.failures > 0 {
+		// If the tick was successful and previously there were failures reset
+		// counter to 0
+		l.failures = 0
+	}
+
+	// Sleep for the remaining tick period or for the backoff time
+	l.clk.Sleep(sleepDur)
+}
+
+func (l *looper) loop() error {
+	if l.batchSize == 0 || l.tickDur == 0 {
+		return fmt.Errorf("Both batch size and tick duration are required, not running '%s' loop", l.name)
+	}
+	for {
+		l.tick()
 	}
 }
@@ -321,3 +321,44 @@ func TestStoreResponseGuard(t *testing.T) {
 	test.AssertNotError(t, err, "Failed to get certificate status")
 	test.AssertEquals(t, len(changedStatus.OCSPResponse), 3)
 }
+
+func TestLoopTickBackoff(t *testing.T) {
+	fc := clock.NewFake()
+	stats, _ := statsd.NewNoopClient(nil)
+	l := looper{
+		clk:                  fc,
+		stats:                stats,
+		failureBackoffFactor: 1.5,
+		failureBackoffMax:    10 * time.Minute,
+		tickDur:              time.Minute,
+		tickFunc:             func(_ int) error { return core.ServiceUnavailableError("sad HSM") },
+	}
+
+	start := l.clk.Now()
+	l.tick()
+	// Expected to sleep for 1m
+	backoff := float64(60000000000)
+	maxJittered := backoff * 1.2
+	test.AssertBetween(t, l.clk.Now().Sub(start).Nanoseconds(), int64(backoff), int64(maxJittered))
+
+	start = l.clk.Now()
+	l.tick()
+	// Expected to sleep for 1m30s
+	backoff = 90000000000
+	maxJittered = backoff * 1.2
+	test.AssertBetween(t, l.clk.Now().Sub(start).Nanoseconds(), int64(backoff), int64(maxJittered))
+
+	l.failures = 6
+	start = l.clk.Now()
+	l.tick()
+	// Expected to sleep for 11m23.4375s, should be truncated to 10m
+	backoff = 600000000000
+	maxJittered = backoff * 1.2
+	test.AssertBetween(t, l.clk.Now().Sub(start).Nanoseconds(), int64(backoff), int64(maxJittered))
+
+	l.tickFunc = func(_ int) error { return nil }
+	start = l.clk.Now()
+	l.tick()
+	test.AssertEquals(t, l.failures, 0)
+	test.AssertEquals(t, l.clk.Now(), start)
+}
@@ -302,6 +302,9 @@ type OCSPUpdaterConfig struct {
 	OCSPMinTimeToExpiry ConfigDuration
 	OldestIssuedSCT     ConfigDuration
 
+	SignFailureBackoffFactor float64
+	SignFailureBackoffMax    ConfigDuration
+
 	// DebugAddr is the address to run the /debug handlers on.
 	DebugAddr string
 }
core/util.go
@@ -26,9 +26,11 @@ import (
 	"io"
 	"io/ioutil"
 	"math/big"
+	mrand "math/rand"
 	"net/url"
 	"regexp"
 	"strings"
+	"time"
 
 	jose "github.com/letsencrypt/boulder/Godeps/_workspace/src/github.com/letsencrypt/go-jose"
 	blog "github.com/letsencrypt/boulder/log"
@@ -536,3 +538,28 @@ func LoadCert(filename string) (cert *x509.Certificate, err error) {
 	cert, err = x509.ParseCertificate(block.Bytes)
 	return
 }
+
+// retryJitter is used to prevent bunched retried queries from falling into lockstep
+const retryJitter = 0.2
+
+// RetryBackoff calculates a backoff time based on number of retries, will always
+// add jitter so requests that start in unison won't fall into lockstep. Because of
+// this the returned duration can always be larger than the maximum by a factor of
+// retryJitter. Adapted from https://github.com/grpc/grpc-go/blob/master/rpc_util.go#L311
+func RetryBackoff(retries int, base, max time.Duration, factor float64) time.Duration {
+	if retries == 0 {
+		return 0
+	}
+	backoff, fMax := float64(base), float64(max)
+	for backoff < fMax && retries > 1 {
+		backoff *= factor
+		retries--
+	}
+	if backoff > fMax {
+		backoff = fMax
+	}
+	// Randomize backoff delays so that if a cluster of requests start at
+	// the same time, they won't operate in lockstep.
+	backoff *= (1 - retryJitter) + 2*retryJitter*mrand.Float64()
+	return time.Duration(backoff)
+}
@@ -173,6 +173,8 @@
 		"revokedCertificateBatchSize": 1000,
 		"ocspMinTimeToExpiry": "72h",
 		"oldestIssuedSCT": "72h",
+		"signFailureBackoffFactor": 1.2,
+		"signFailureBackoffMax": "30m",
 		"debugAddr": "localhost:8006"
 	},
@@ -133,3 +133,10 @@ func AssertSeverity(t *testing.T, data string, severity int) {
 	expected := fmt.Sprintf("\"severity\":%d", severity)
 	AssertContains(t, data, expected)
 }
+
+// AssertBetween determines if a is between b and c
+func AssertBetween(t *testing.T, a, b, c int64) {
+	if a < b || a > c {
+		t.Fatalf("%d is not between %d and %d", a, b, c)
+	}
+}