rocsp: improve stats (#6257)

For multiSource, split out checkSecondary's metrics into their own
counter. Treat NotFound as a separate error type (so we can more
clearly distinguish the half-hourly pattern of fetches for expired
certificates).

In redisSource, add a histogram for the ages of responses fetched from
cache (regardless of whether they are served or not). This parallels
ocsp_respond_ages in ocsp/responder.go, but may show ages beyond the
compliance limit, even under normal operations, because it is checked
before signAndServe is called.
This commit is contained in:
Jacob Hoffman-Andrews 2022-07-28 21:49:38 -07:00 committed by GitHub
parent 745eef159b
commit e7bf6383d8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 57 additions and 27 deletions

View File

@ -13,11 +13,12 @@ import (
) )
type multiSource struct { type multiSource struct {
primary Source primary Source
secondary Source secondary Source
expectedFreshness time.Duration expectedFreshness time.Duration
counter *prometheus.CounterVec counter *prometheus.CounterVec
log blog.Logger checkSecondaryCounter *prometheus.CounterVec
log blog.Logger
} }
// NewMultiSource creates a source that combines a primary and a secondary source. // NewMultiSource creates a source that combines a primary and a secondary source.
@ -37,18 +38,26 @@ func NewMultiSource(primary, secondary Source, expectedFreshness time.Duration,
if primary == nil || secondary == nil { if primary == nil || secondary == nil {
return nil, errors.New("must provide both primary and secondary sources") return nil, errors.New("must provide both primary and secondary sources")
} }
counter := prometheus.NewCounterVec(prometheus.CounterOpts{ counter := prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "ocsp_multiplex_responses", Name: "ocsp_multiplex_responses",
Help: "Count of OCSP requests/responses by action taken by the multiSource", Help: "Count of OCSP requests/responses by action taken by the multiSource",
}, []string{"result"}) }, []string{"result"})
stats.MustRegister(counter) stats.MustRegister(counter)
checkSecondaryCounter := prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "ocsp_multiplex_check_secondary",
Help: "Count of OCSP requests/responses by action taken by the multiSource",
}, []string{"result"})
stats.MustRegister(checkSecondaryCounter)
return &multiSource{ return &multiSource{
primary: primary, primary: primary,
secondary: secondary, secondary: secondary,
expectedFreshness: expectedFreshness, expectedFreshness: expectedFreshness,
counter: counter, counter: counter,
log: log, checkSecondaryCounter: checkSecondaryCounter,
log: log,
}, nil }, nil
} }
@ -91,7 +100,11 @@ func (src *multiSource) Response(ctx context.Context, req *ocsp.Request) (*Respo
// check the secondary's status against the (more reliable) primary's // check the secondary's status against the (more reliable) primary's
// status. // status.
if r.err != nil { if r.err != nil {
src.counter.WithLabelValues("primary_error").Inc() if errors.Is(r.err, ErrNotFound) {
src.counter.WithLabelValues("primary_not_found").Inc()
} else {
src.counter.WithLabelValues("primary_error").Inc()
}
return nil, r.err return nil, r.err
} }
primaryResponse = r.resp primaryResponse = r.resp
@ -156,14 +169,14 @@ func (src *multiSource) checkSecondary(primaryResponse *Response, secondaryChan
if secondaryResult.err != nil { if secondaryResult.err != nil {
if errors.Is(secondaryResult.err, rocsp.ErrRedisNotFound) { if errors.Is(secondaryResult.err, rocsp.ErrRedisNotFound) {
// This case will happen for several hours after first issuance. // This case will happen for several hours after first issuance.
src.counter.WithLabelValues("primary_good_secondary_not_found").Inc() src.checkSecondaryCounter.WithLabelValues("not_found").Inc()
} else { } else {
src.counter.WithLabelValues("primary_good_secondary_error").Inc() src.checkSecondaryCounter.WithLabelValues("error").Inc()
} }
} }
src.counter.WithLabelValues("primary_good_secondary_good").Inc() src.checkSecondaryCounter.WithLabelValues("good").Inc()
default: default:
src.counter.WithLabelValues("primary_good_secondary_slow").Inc() src.checkSecondaryCounter.WithLabelValues("slow").Inc()
} }
} }

View File

@ -34,11 +34,12 @@ type rocspClient interface {
} }
type redisSource struct { type redisSource struct {
client rocspClient client rocspClient
signer responder.Source signer responder.Source
counter *prometheus.CounterVec counter *prometheus.CounterVec
clk clock.Clock cachedResponseAges prometheus.Histogram
liveSigningPeriod time.Duration clk clock.Clock
liveSigningPeriod time.Duration
// Note: this logger is not currently used, as all audit log events are from // Note: this logger is not currently used, as all audit log events are from
// the dbSource right now, but it should and will be used in the future. // the dbSource right now, but it should and will be used in the future.
log blog.Logger log blog.Logger
@ -60,17 +61,31 @@ func NewRedisSource(
}, []string{"result"}) }, []string{"result"})
stats.MustRegister(counter) stats.MustRegister(counter)
// Set up 12-hour-wide buckets, measured in seconds.
buckets := make([]float64, 14)
for i := range buckets {
buckets[i] = 43200 * float64(i)
}
cachedResponseAges := prometheus.NewHistogram(prometheus.HistogramOpts{
Name: "ocsp_redis_cached_response_ages",
Help: "How old are the cached OCSP responses when we successfully retrieve them.",
Buckets: buckets,
})
stats.MustRegister(cachedResponseAges)
var rocspReader rocspClient var rocspReader rocspClient
if client != nil { if client != nil {
rocspReader = client rocspReader = client
} }
return &redisSource{ return &redisSource{
client: rocspReader, client: rocspReader,
signer: signer, signer: signer,
counter: counter, counter: counter,
liveSigningPeriod: liveSigningPeriod, cachedResponseAges: cachedResponseAges,
clk: clk, liveSigningPeriod: liveSigningPeriod,
log: log, clk: clk,
log: log,
}, nil }, nil
} }
@ -108,7 +123,9 @@ func (src *redisSource) Response(ctx context.Context, req *ocsp.Request) (*respo
} }
func (src *redisSource) isStale(resp *ocsp.Response) bool { func (src *redisSource) isStale(resp *ocsp.Response) bool {
return src.clk.Since(resp.ThisUpdate) > src.liveSigningPeriod age := src.clk.Since(resp.ThisUpdate)
src.cachedResponseAges.Observe(age.Seconds())
return age > src.liveSigningPeriod
} }
func (src *redisSource) signAndSave(ctx context.Context, req *ocsp.Request, cause string) (*responder.Response, error) { func (src *redisSource) signAndSave(ctx context.Context, req *ocsp.Request, cause string) (*responder.Response, error) {