rocsp: improve stats (#6257)

For multiSource, split out checkSecondary's metrics into their own
counter. Treat NotFound as a separate error type (so we can more
clearly distinguish the half-hourly pattern of fetches for expired
certificates).

In redisSource, add a histogram for the ages of responses fetched from
cache (regardless of whether they are served or not). This parallels
ocsp_respond_ages in ocsp/responder.go, but may show ages beyond the
compliance limit, even under normal operations, because it is checked
before signAndServe is called.
This commit is contained in:
Jacob Hoffman-Andrews 2022-07-28 21:49:38 -07:00 committed by GitHub
parent 745eef159b
commit e7bf6383d8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 57 additions and 27 deletions

View File

@ -13,11 +13,12 @@ import (
)
type multiSource struct {
primary Source
secondary Source
expectedFreshness time.Duration
counter *prometheus.CounterVec
log blog.Logger
primary Source
secondary Source
expectedFreshness time.Duration
counter *prometheus.CounterVec
checkSecondaryCounter *prometheus.CounterVec
log blog.Logger
}
// NewMultiSource creates a source that combines a primary and a secondary source.
@ -37,18 +38,26 @@ func NewMultiSource(primary, secondary Source, expectedFreshness time.Duration,
if primary == nil || secondary == nil {
return nil, errors.New("must provide both primary and secondary sources")
}
counter := prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "ocsp_multiplex_responses",
Help: "Count of OCSP requests/responses by action taken by the multiSource",
}, []string{"result"})
stats.MustRegister(counter)
checkSecondaryCounter := prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "ocsp_multiplex_check_secondary",
Help: "Count of OCSP requests/responses by action taken by the multiSource",
}, []string{"result"})
stats.MustRegister(checkSecondaryCounter)
return &multiSource{
primary: primary,
secondary: secondary,
expectedFreshness: expectedFreshness,
counter: counter,
log: log,
primary: primary,
secondary: secondary,
expectedFreshness: expectedFreshness,
counter: counter,
checkSecondaryCounter: checkSecondaryCounter,
log: log,
}, nil
}
@ -91,7 +100,11 @@ func (src *multiSource) Response(ctx context.Context, req *ocsp.Request) (*Respo
// check the secondary's status against the (more reliable) primary's
// status.
if r.err != nil {
src.counter.WithLabelValues("primary_error").Inc()
if errors.Is(r.err, ErrNotFound) {
src.counter.WithLabelValues("primary_not_found").Inc()
} else {
src.counter.WithLabelValues("primary_error").Inc()
}
return nil, r.err
}
primaryResponse = r.resp
@ -156,14 +169,14 @@ func (src *multiSource) checkSecondary(primaryResponse *Response, secondaryChan
if secondaryResult.err != nil {
if errors.Is(secondaryResult.err, rocsp.ErrRedisNotFound) {
// This case will happen for several hours after first issuance.
src.counter.WithLabelValues("primary_good_secondary_not_found").Inc()
src.checkSecondaryCounter.WithLabelValues("not_found").Inc()
} else {
src.counter.WithLabelValues("primary_good_secondary_error").Inc()
src.checkSecondaryCounter.WithLabelValues("error").Inc()
}
}
src.counter.WithLabelValues("primary_good_secondary_good").Inc()
src.checkSecondaryCounter.WithLabelValues("good").Inc()
default:
src.counter.WithLabelValues("primary_good_secondary_slow").Inc()
src.checkSecondaryCounter.WithLabelValues("slow").Inc()
}
}

View File

@ -34,11 +34,12 @@ type rocspClient interface {
}
type redisSource struct {
client rocspClient
signer responder.Source
counter *prometheus.CounterVec
clk clock.Clock
liveSigningPeriod time.Duration
client rocspClient
signer responder.Source
counter *prometheus.CounterVec
cachedResponseAges prometheus.Histogram
clk clock.Clock
liveSigningPeriod time.Duration
// Note: this logger is not currently used, as all audit log events are from
// the dbSource right now, but it should and will be used in the future.
log blog.Logger
@ -60,17 +61,31 @@ func NewRedisSource(
}, []string{"result"})
stats.MustRegister(counter)
// Set up 12-hour-wide buckets, measured in seconds.
buckets := make([]float64, 14)
for i := range buckets {
buckets[i] = 43200 * float64(i)
}
cachedResponseAges := prometheus.NewHistogram(prometheus.HistogramOpts{
Name: "ocsp_redis_cached_response_ages",
Help: "How old are the cached OCSP responses when we successfully retrieve them.",
Buckets: buckets,
})
stats.MustRegister(cachedResponseAges)
var rocspReader rocspClient
if client != nil {
rocspReader = client
}
return &redisSource{
client: rocspReader,
signer: signer,
counter: counter,
liveSigningPeriod: liveSigningPeriod,
clk: clk,
log: log,
client: rocspReader,
signer: signer,
counter: counter,
cachedResponseAges: cachedResponseAges,
liveSigningPeriod: liveSigningPeriod,
clk: clk,
log: log,
}, nil
}
@ -108,7 +123,9 @@ func (src *redisSource) Response(ctx context.Context, req *ocsp.Request) (*respo
}
func (src *redisSource) isStale(resp *ocsp.Response) bool {
return src.clk.Since(resp.ThisUpdate) > src.liveSigningPeriod
age := src.clk.Since(resp.ThisUpdate)
src.cachedResponseAges.Observe(age.Seconds())
return age > src.liveSigningPeriod
}
func (src *redisSource) signAndSave(ctx context.Context, req *ocsp.Request, cause string) (*responder.Response, error) {