Backoff OCSP Updater on HSM failure
If a ServiceUnavailableError is returned from GenerateOCSP backoff before attempting to retry the call as to not to overwhelm the CA with calls that may instantly fail.
This commit is contained in:
		
							parent
							
								
									c75c1f26fa
								
							
						
					
					
						commit
						661476f40e
					
				|  | @ -91,6 +91,8 @@ func newUpdater( | |||
| 			tickDur:              config.NewCertificateWindow.Duration, | ||||
| 			tickFunc:             updater.newCertificateTick, | ||||
| 			name:                 "NewCertificates", | ||||
| 			failureBackoffFactor: config.SignFailureBackoffFactor, | ||||
| 			failureBackoffMax:    config.SignFailureBackoffMax.Duration, | ||||
| 		}, | ||||
| 		&looper{ | ||||
| 			clk:                  clk, | ||||
|  | @ -99,7 +101,11 @@ func newUpdater( | |||
| 			tickDur:              config.OldOCSPWindow.Duration, | ||||
| 			tickFunc:             updater.oldOCSPResponsesTick, | ||||
| 			name:                 "OldOCSPResponses", | ||||
| 			failureBackoffFactor: config.SignFailureBackoffFactor, | ||||
| 			failureBackoffMax:    config.SignFailureBackoffMax.Duration, | ||||
| 		}, | ||||
| 		// The missing SCT loop doesn't need to know about failureBackoffFactor or
 | ||||
| 		// failureBackoffMax as it doesn't make any calls to the CA
 | ||||
| 		&looper{ | ||||
| 			clk:       clk, | ||||
| 			stats:     stats, | ||||
|  | @ -118,6 +124,8 @@ func newUpdater( | |||
| 			tickDur:              config.RevokedCertificateWindow.Duration, | ||||
| 			tickFunc:             updater.revokedCertificatesTick, | ||||
| 			name:                 "RevokedCertificates", | ||||
| 			failureBackoffFactor: config.SignFailureBackoffFactor, | ||||
| 			failureBackoffMax:    config.SignFailureBackoffMax.Duration, | ||||
| 		}) | ||||
| 	} | ||||
| 
 | ||||
|  | @ -248,17 +256,17 @@ func (updater *OCSPUpdater) storeResponse(status *core.CertificateStatus) error | |||
| 
 | ||||
| // newCertificateTick checks for certificates issued since the last tick and
 | ||||
| // generates and stores OCSP responses for these certs
 | ||||
| func (updater *OCSPUpdater) newCertificateTick(batchSize int) { | ||||
| func (updater *OCSPUpdater) newCertificateTick(batchSize int) error { | ||||
| 	// Check for anything issued between now and previous tick and generate first
 | ||||
| 	// OCSP responses
 | ||||
| 	statuses, err := updater.getCertificatesWithMissingResponses(batchSize) | ||||
| 	if err != nil { | ||||
| 		updater.stats.Inc("OCSP.Errors.FindMissingResponses", 1, 1.0) | ||||
| 		updater.log.AuditErr(fmt.Errorf("Failed to find certificates with missing OCSP responses: %s", err)) | ||||
| 		return | ||||
| 		return err | ||||
| 	} | ||||
| 
 | ||||
| 	updater.generateOCSPResponses(statuses) | ||||
| 	return updater.generateOCSPResponses(statuses) | ||||
| } | ||||
| 
 | ||||
| func (updater *OCSPUpdater) findRevokedCertificatesToUpdate(batchSize int) ([]core.CertificateStatus, error) { | ||||
|  | @ -277,12 +285,12 @@ func (updater *OCSPUpdater) findRevokedCertificatesToUpdate(batchSize int) ([]co | |||
| 	return statuses, err | ||||
| } | ||||
| 
 | ||||
| func (updater *OCSPUpdater) revokedCertificatesTick(batchSize int) { | ||||
| func (updater *OCSPUpdater) revokedCertificatesTick(batchSize int) error { | ||||
| 	statuses, err := updater.findRevokedCertificatesToUpdate(batchSize) | ||||
| 	if err != nil { | ||||
| 		updater.stats.Inc("OCSP.Errors.FindRevokedCertificates", 1, 1.0) | ||||
| 		updater.log.AuditErr(fmt.Errorf("Failed to find revoked certificates: %s", err)) | ||||
| 		return | ||||
| 		return err | ||||
| 	} | ||||
| 
 | ||||
| 	for _, status := range statuses { | ||||
|  | @ -290,7 +298,7 @@ func (updater *OCSPUpdater) revokedCertificatesTick(batchSize int) { | |||
| 		if err != nil { | ||||
| 			updater.log.AuditErr(fmt.Errorf("Failed to generate revoked OCSP response: %s", err)) | ||||
| 			updater.stats.Inc("OCSP.Errors.RevokedResponseGeneration", 1, 1.0) | ||||
| 			continue | ||||
| 			return err | ||||
| 		} | ||||
| 		err = updater.storeResponse(meta) | ||||
| 		if err != nil { | ||||
|  | @ -299,15 +307,16 @@ func (updater *OCSPUpdater) revokedCertificatesTick(batchSize int) { | |||
| 			continue | ||||
| 		} | ||||
| 	} | ||||
| 	return nil | ||||
| } | ||||
| 
 | ||||
| func (updater *OCSPUpdater) generateOCSPResponses(statuses []core.CertificateStatus) { | ||||
| func (updater *OCSPUpdater) generateOCSPResponses(statuses []core.CertificateStatus) error { | ||||
| 	for _, status := range statuses { | ||||
| 		meta, err := updater.generateResponse(status) | ||||
| 		if err != nil { | ||||
| 			updater.log.AuditErr(fmt.Errorf("Failed to generate OCSP response: %s", err)) | ||||
| 			updater.stats.Inc("OCSP.Errors.ResponseGeneration", 1, 1.0) | ||||
| 			continue | ||||
| 			return err | ||||
| 		} | ||||
| 		updater.stats.Inc("OCSP.GeneratedResponses", 1, 1.0) | ||||
| 		err = updater.storeResponse(meta) | ||||
|  | @ -318,21 +327,21 @@ func (updater *OCSPUpdater) generateOCSPResponses(statuses []core.CertificateSta | |||
| 		} | ||||
| 		updater.stats.Inc("OCSP.StoredResponses", 1, 1.0) | ||||
| 	} | ||||
| 	return | ||||
| 	return nil | ||||
| } | ||||
| 
 | ||||
| // oldOCSPResponsesTick looks for certificates with stale OCSP responses and
 | ||||
| // generates/stores new ones
 | ||||
| func (updater *OCSPUpdater) oldOCSPResponsesTick(batchSize int) { | ||||
| func (updater *OCSPUpdater) oldOCSPResponsesTick(batchSize int) error { | ||||
| 	now := time.Now() | ||||
| 	statuses, err := updater.findStaleOCSPResponses(now.Add(-updater.ocspMinTimeToExpiry), batchSize) | ||||
| 	if err != nil { | ||||
| 		updater.stats.Inc("OCSP.Errors.FindStaleResponses", 1, 1.0) | ||||
| 		updater.log.AuditErr(fmt.Errorf("Failed to find stale OCSP responses: %s", err)) | ||||
| 		return | ||||
| 		return err | ||||
| 	} | ||||
| 
 | ||||
| 	updater.generateOCSPResponses(statuses) | ||||
| 	return updater.generateOCSPResponses(statuses) | ||||
| } | ||||
| 
 | ||||
| func (updater *OCSPUpdater) getSerialsIssuedSince(since time.Time, batchSize int) ([]string, error) { | ||||
|  | @ -366,13 +375,13 @@ func (updater *OCSPUpdater) getNumberOfReceipts(serial string) (int, error) { | |||
| 
 | ||||
| // missingReceiptsTick looks for certificates without the correct number of SCT
 | ||||
| // receipts and retrieves them
 | ||||
| func (updater *OCSPUpdater) missingReceiptsTick(batchSize int) { | ||||
| func (updater *OCSPUpdater) missingReceiptsTick(batchSize int) error { | ||||
| 	now := updater.clk.Now() | ||||
| 	since := now.Add(-updater.oldestIssuedSCT) | ||||
| 	serials, err := updater.getSerialsIssuedSince(since, batchSize) | ||||
| 	if err != nil { | ||||
| 		updater.log.AuditErr(fmt.Errorf("Failed to get certificate serials: %s", err)) | ||||
| 		return | ||||
| 		return err | ||||
| 	} | ||||
| 
 | ||||
| 	for _, serial := range serials { | ||||
|  | @ -396,7 +405,7 @@ func (updater *OCSPUpdater) missingReceiptsTick(batchSize int) { | |||
| 			continue | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	return nil | ||||
| } | ||||
| 
 | ||||
| type looper struct { | ||||
|  | @ -404,17 +413,16 @@ type looper struct { | |||
| 	stats                statsd.Statter | ||||
| 	batchSize            int | ||||
| 	tickDur              time.Duration | ||||
| 	tickFunc  func(int) | ||||
| 	tickFunc             func(int) error | ||||
| 	name                 string | ||||
| 	failureBackoffFactor float64 | ||||
| 	failureBackoffMax    time.Duration | ||||
| 	failures             int | ||||
| } | ||||
| 
 | ||||
| func (l *looper) loop() error { | ||||
| 	if l.batchSize == 0 || l.tickDur == 0 { | ||||
| 		return fmt.Errorf("Both batch size and tick duration are required, not running '%s' loop", l.name) | ||||
| 	} | ||||
| 	for { | ||||
| func (l *looper) tick() { | ||||
| 	tickStart := l.clk.Now() | ||||
| 		l.tickFunc(l.batchSize) | ||||
| 	err := l.tickFunc(l.batchSize) | ||||
| 	l.stats.TimingDuration(fmt.Sprintf("OCSP.%s.TickDuration", l.name), time.Since(tickStart), 1.0) | ||||
| 	l.stats.Inc(fmt.Sprintf("OCSP.%s.Ticks", l.name), 1, 1.0) | ||||
| 	tickEnd := tickStart.Add(time.Since(tickStart)) | ||||
|  | @ -422,9 +430,33 @@ func (l *looper) loop() error { | |||
| 	if tickEnd.After(expectedTickEnd) { | ||||
| 		l.stats.Inc(fmt.Sprintf("OCSP.%s.LongTicks", l.name), 1, 1.0) | ||||
| 	} | ||||
| 		// Sleep for the remaining tick period (if this is a negative number sleep
 | ||||
| 		// will not do anything and carry on)
 | ||||
| 		l.clk.Sleep(expectedTickEnd.Sub(tickEnd)) | ||||
| 
 | ||||
| 	// After we have all the stats stuff out of the way let's check if the tick
 | ||||
| 	// function failed, if the reason is the HSM is dead increase the length of
 | ||||
| 	// sleepDur using the exponentially increasing duration returned by core.RetryBackoff.
 | ||||
| 	sleepDur := expectedTickEnd.Sub(tickEnd) | ||||
| 	if err != nil { | ||||
| 		l.stats.Inc(fmt.Sprintf("OCSP.%s.FailedTicks", l.name), 1, 1.0) | ||||
| 		if _, ok := err.(core.ServiceUnavailableError); ok && (l.failureBackoffFactor > 0 && l.failureBackoffMax > 0) { | ||||
| 			l.failures++ | ||||
| 			sleepDur = core.RetryBackoff(l.failures, l.tickDur, l.failureBackoffMax, l.failureBackoffFactor) | ||||
| 		} | ||||
| 	} else if l.failures > 0 { | ||||
| 		// If the tick was successful and previously there were failures reset
 | ||||
| 		// counter to 0
 | ||||
| 		l.failures = 0 | ||||
| 	} | ||||
| 
 | ||||
| 	// Sleep for the remaining tick period or for the backoff time
 | ||||
| 	l.clk.Sleep(sleepDur) | ||||
| } | ||||
| 
 | ||||
| func (l *looper) loop() error { | ||||
| 	if l.batchSize == 0 || l.tickDur == 0 { | ||||
| 		return fmt.Errorf("Both batch size and tick duration are required, not running '%s' loop", l.name) | ||||
| 	} | ||||
| 	for { | ||||
| 		l.tick() | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
|  |  | |||
|  | @ -321,3 +321,44 @@ func TestStoreResponseGuard(t *testing.T) { | |||
| 	test.AssertNotError(t, err, "Failed to get certificate status") | ||||
| 	test.AssertEquals(t, len(changedStatus.OCSPResponse), 3) | ||||
| } | ||||
| 
 | ||||
| func TestLoopTickBackoff(t *testing.T) { | ||||
| 	fc := clock.NewFake() | ||||
| 	stats, _ := statsd.NewNoopClient(nil) | ||||
| 	l := looper{ | ||||
| 		clk:                  fc, | ||||
| 		stats:                stats, | ||||
| 		failureBackoffFactor: 1.5, | ||||
| 		failureBackoffMax:    10 * time.Minute, | ||||
| 		tickDur:              time.Minute, | ||||
| 		tickFunc:             func(_ int) error { return core.ServiceUnavailableError("sad HSM") }, | ||||
| 	} | ||||
| 
 | ||||
| 	start := l.clk.Now() | ||||
| 	l.tick() | ||||
| 	// Expected to sleep for 1m
 | ||||
| 	backoff := float64(60000000000) | ||||
| 	maxJittered := backoff * 1.2 | ||||
| 	test.AssertBetween(t, l.clk.Now().Sub(start).Nanoseconds(), int64(backoff), int64(maxJittered)) | ||||
| 
 | ||||
| 	start = l.clk.Now() | ||||
| 	l.tick() | ||||
| 	// Expected to sleep for 1m30s
 | ||||
| 	backoff = 90000000000 | ||||
| 	maxJittered = backoff * 1.2 | ||||
| 	test.AssertBetween(t, l.clk.Now().Sub(start).Nanoseconds(), int64(backoff), int64(maxJittered)) | ||||
| 
 | ||||
| 	l.failures = 6 | ||||
| 	start = l.clk.Now() | ||||
| 	l.tick() | ||||
| 	// Expected to sleep for 11m23.4375s, should be truncated to 10m
 | ||||
| 	backoff = 600000000000 | ||||
| 	maxJittered = backoff * 1.2 | ||||
| 	test.AssertBetween(t, l.clk.Now().Sub(start).Nanoseconds(), int64(backoff), int64(maxJittered)) | ||||
| 
 | ||||
| 	l.tickFunc = func(_ int) error { return nil } | ||||
| 	start = l.clk.Now() | ||||
| 	l.tick() | ||||
| 	test.AssertEquals(t, l.failures, 0) | ||||
| 	test.AssertEquals(t, l.clk.Now(), start) | ||||
| } | ||||
|  |  | |||
|  | @ -302,6 +302,9 @@ type OCSPUpdaterConfig struct { | |||
| 	OCSPMinTimeToExpiry ConfigDuration | ||||
| 	OldestIssuedSCT     ConfigDuration | ||||
| 
 | ||||
| 	SignFailureBackoffFactor float64 | ||||
| 	SignFailureBackoffMax    ConfigDuration | ||||
| 
 | ||||
| 	// DebugAddr is the address to run the /debug handlers on.
 | ||||
| 	DebugAddr string | ||||
| } | ||||
|  |  | |||
							
								
								
									
										27
									
								
								core/util.go
								
								
								
								
							
							
						
						
									
										27
									
								
								core/util.go
								
								
								
								
							|  | @ -26,9 +26,11 @@ import ( | |||
| 	"io" | ||||
| 	"io/ioutil" | ||||
| 	"math/big" | ||||
| 	mrand "math/rand" | ||||
| 	"net/url" | ||||
| 	"regexp" | ||||
| 	"strings" | ||||
| 	"time" | ||||
| 
 | ||||
| 	jose "github.com/letsencrypt/boulder/Godeps/_workspace/src/github.com/letsencrypt/go-jose" | ||||
| 	blog "github.com/letsencrypt/boulder/log" | ||||
|  | @ -536,3 +538,28 @@ func LoadCert(filename string) (cert *x509.Certificate, err error) { | |||
| 	cert, err = x509.ParseCertificate(block.Bytes) | ||||
| 	return | ||||
| } | ||||
| 
 | ||||
| // retryJitter is used to prevent bunched retried queries from falling into lockstep
 | ||||
| const retryJitter = 0.2 | ||||
| 
 | ||||
| // RetryBackoff calculates a backoff time based on number of retries, will always
 | ||||
| // add jitter so requests that start in unison won't fall into lockstep. Because of
 | ||||
| // this the returned duration can always be larger than the maximum by a factor of
 | ||||
| // retryJitter. Adapted from https://github.com/grpc/grpc-go/blob/master/rpc_util.go#L311
 | ||||
| func RetryBackoff(retries int, base, max time.Duration, factor float64) time.Duration { | ||||
| 	if retries == 0 { | ||||
| 		return 0 | ||||
| 	} | ||||
| 	backoff, fMax := float64(base), float64(max) | ||||
| 	for backoff < fMax && retries > 1 { | ||||
| 		backoff *= factor | ||||
| 		retries-- | ||||
| 	} | ||||
| 	if backoff > fMax { | ||||
| 		backoff = fMax | ||||
| 	} | ||||
| 	// Randomize backoff delays so that if a cluster of requests start at
 | ||||
| 	// the same time, they won't operate in lockstep.
 | ||||
| 	backoff *= (1 - retryJitter) + 2*retryJitter*mrand.Float64() | ||||
| 	return time.Duration(backoff) | ||||
| } | ||||
|  |  | |||
|  | @ -173,6 +173,8 @@ | |||
|     "revokedCertificateBatchSize": 1000, | ||||
|     "ocspMinTimeToExpiry": "72h", | ||||
|     "oldestIssuedSCT": "72h", | ||||
|     "signFailureBackoffFactor": 1.2, | ||||
|     "signFailureBackoffMax": "30m", | ||||
|     "debugAddr": "localhost:8006" | ||||
|   }, | ||||
| 
 | ||||
|  |  | |||
|  | @ -133,3 +133,10 @@ func AssertSeverity(t *testing.T, data string, severity int) { | |||
| 	expected := fmt.Sprintf("\"severity\":%d", severity) | ||||
| 	AssertContains(t, data, expected) | ||||
| } | ||||
| 
 | ||||
| // AssertBetween determines if a is between b and c
 | ||||
| func AssertBetween(t *testing.T, a, b, c int64) { | ||||
| 	if a < b || a > c { | ||||
| 		t.Fatalf("%d is not between %d and %d", a, b, c) | ||||
| 	} | ||||
| } | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue