Use Prometheus stats for VA, WFE, and OCSP Responder (#2628)

Rename HTTPMonitor to MeasuredHandler.
Remove inflight stat (we didn't use it).
Add timing stat by method, endpoint, and status code.
The timing stat subsumes the "rate" stat, so remove that.
WFE now wraps in MeasuredHandler, instead of relying on its cmd/main.go.
Remove FBAdapter stats.
MeasuredHandler tracks stats by method, status code, and endpoint.

In VA, add a Prometheus histogram for validation timing.
This commit is contained in:
Jacob Hoffman-Andrews 2017-04-03 17:03:04 -07:00 committed by Roland Bracewell Shoemaker
parent ca3a2e0e3c
commit 4b665e35a6
8 changed files with 219 additions and 90 deletions

View File

@ -137,14 +137,11 @@ func main() {
// Set up paths
wfe.BaseURL = c.Common.BaseURL
h := wfe.Handler()
httpMonitor := metrics.NewHTTPMonitor(scope, h)
logger.Info(fmt.Sprintf("Server running, listening on %s...\n", c.WFE.ListenAddress))
srv := &http.Server{
Addr: c.WFE.ListenAddress,
Handler: httpMonitor,
Handler: wfe.Handler(),
}
go cmd.DebugServer(c.WFE.DebugAddr)
@ -153,7 +150,6 @@ func main() {
hd := &httpdown.HTTP{
StopTimeout: c.WFE.ShutdownStopTimeout.Duration,
KillTimeout: c.WFE.ShutdownKillTimeout.Duration,
Stats: metrics.NewFBAdapter(scope, clock.Default()),
}
hdSrv, err := hd.ListenAndServe(srv)
cmd.FailOnError(err, "Error starting HTTP server")
@ -167,7 +163,7 @@ func main() {
logger.Info(fmt.Sprintf("TLS Server running, listening on %s...\n", c.WFE.TLSListenAddress))
TLSSrv := &http.Server{
Addr: c.WFE.TLSListenAddress,
Handler: httpMonitor,
Handler: wfe.Handler(),
TLSConfig: tlsConfig,
}
hdTLSSrv, err = hd.ListenAndServe(TLSSrv)

View File

@ -23,6 +23,7 @@ import (
"github.com/letsencrypt/boulder/features"
blog "github.com/letsencrypt/boulder/log"
"github.com/letsencrypt/boulder/metrics"
"github.com/letsencrypt/boulder/metrics/measured_http"
"github.com/letsencrypt/boulder/sa"
)
@ -223,7 +224,6 @@ as generated by Boulder's single-ocsp command.
hd := &httpdown.HTTP{
StopTimeout: stopTimeout,
KillTimeout: killTimeout,
Stats: metrics.NewFBAdapter(scope, clock.Default()),
}
hdSrv, err := hd.ListenAndServe(srv)
cmd.FailOnError(err, "Error starting HTTP server")
@ -244,5 +244,5 @@ func mux(scope metrics.Scope, responderPath string, source cfocsp.Source) http.H
}
m.ServeHTTP(w, r)
})
return metrics.NewHTTPMonitor(scope, h)
return measured_http.New(h, clock.Default())
}

View File

@ -0,0 +1,93 @@
package measured_http
import (
"fmt"
"net/http"
"regexp"
"strings"
"github.com/jmhodges/clock"
"github.com/prometheus/client_golang/prometheus"
)
var (
responseTime = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "response_time",
Help: "Time taken to respond to a request",
},
[]string{"endpoint", "method", "code"})
)
func init() {
prometheus.MustRegister(responseTime)
}
// responseWriterWithStatus satisfies http.ResponseWriter, but keeps track of the
// status code for gathering stats.
type responseWriterWithStatus struct {
http.ResponseWriter
code int
}
// WriteHeader stores a status code for generating stats.
func (r *responseWriterWithStatus) WriteHeader(code int) {
r.code = code
r.ResponseWriter.WriteHeader(code)
}
// MeasuredHandler wraps an http.Handler and records prometheus stats
type MeasuredHandler struct {
http.Handler
clk clock.Clock
// Normally this is always responseTime, but we override it for testing.
stat *prometheus.HistogramVec
}
func New(h http.Handler, clk clock.Clock) *MeasuredHandler {
return &MeasuredHandler{
Handler: h,
clk: clk,
stat: responseTime,
}
}
var endpointShortNameRegexp = regexp.MustCompile("^[a-z-]*$")
// endpointFromPath turns a request path into a value suitable for a Prometheus
// label value, by eliminating path components that vary widely (like user ids
// and authorization ids). It uses a simple heuristic: Remove everything after
// the first path component that doesn't match lowercase a-z, plus hyphen. This
// happens to work very well with Boulder's WFE.
func endpointFromPath(path string) string {
// Remove any query
path = strings.Split(path, "?")[0]
components := strings.Split(path, "/")
var i int
var v string
for i, v = range components {
matched := endpointShortNameRegexp.MatchString(v)
if !matched {
return strings.Join(components[:i], "/")
}
}
return path
}
func (h *MeasuredHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
begin := h.clk.Now()
rwws := &responseWriterWithStatus{w, 0}
// Copy in case handlers down the chain use StripPrefix, which modifies
// URL path.
endpoint := endpointFromPath(r.URL.Path)
defer func() {
h.stat.With(prometheus.Labels{
"endpoint": endpoint,
"method": r.Method,
"code": fmt.Sprintf("%d", rwws.code),
}).Observe(h.clk.Since(begin).Seconds())
}()
h.Handler.ServeHTTP(rwws, r)
}

View File

@ -0,0 +1,97 @@
package measured_http
import (
"net/http"
"net/http/httptest"
"net/url"
"testing"
"time"
"github.com/jmhodges/clock"
"github.com/prometheus/client_golang/prometheus"
io_prometheus_client "github.com/prometheus/client_model/go"
)
func TestEndpointFromPath(t *testing.T) {
tc := []struct {
input, expected string
}{
{"/", "/"},
{"/acme", "/acme"},
{"/acme/new-authz", "/acme/new-authz"},
{"/acme/new-authz/", "/acme/new-authz/"},
{"/acme/authz/1234", "/acme/authz"},
{"/acme/authz/aGVsbG8K/1234", "/acme/authz"},
{"/directory?foo=bar", "/directory"},
}
for _, c := range tc {
output := endpointFromPath(c.input)
if output != c.expected {
t.Errorf("endpointFromPath(%q) = %q (expected %q)",
c.input, output, c.expected)
}
}
}
type sleepyHandler struct {
clk clock.FakeClock
}
func (h sleepyHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
h.clk.Sleep(999 * time.Second)
w.WriteHeader(302)
}
func TestMeasuring(t *testing.T) {
clk := clock.NewFake()
// Create a local histogram stat with the same labels as the real one, but
// don't register it; we will collect its data here in the test to verify it.
stat := prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "fake",
Help: "fake",
},
[]string{"endpoint", "method", "code"})
mh := MeasuredHandler{
Handler: sleepyHandler{clk},
clk: clk,
stat: stat,
}
mh.ServeHTTP(httptest.NewRecorder(), &http.Request{
URL: &url.URL{Path: "/foo"},
Method: "GET",
})
ch := make(chan prometheus.Metric, 10)
stat.Collect(ch)
m := <-ch
var iom io_prometheus_client.Metric
_ = m.Write(&iom)
hist := iom.Histogram
if *hist.SampleCount != 1 {
t.Errorf("SampleCount = %d (expected 1)", *hist.SampleCount)
}
if *hist.SampleSum != 999 {
t.Errorf("SampleSum = %g (expected 999)", *hist.SampleSum)
}
expectedLabels := map[string]string{
"endpoint": "/foo",
"method": "GET",
"code": "302",
}
for _, labelPair := range iom.Label {
if expectedLabels[*labelPair.Name] == "" {
t.Errorf("Unexpected label %s", *labelPair.Name)
} else if expectedLabels[*labelPair.Name] != *labelPair.Value {
t.Errorf("labels[%q] = %q (expected %q)", *labelPair.Name, *labelPair.Value,
expectedLabels[*labelPair.Name])
}
delete(expectedLabels, *labelPair.Name)
}
if len(expectedLabels) != 0 {
t.Errorf("Some labels were expected, but not observed: %v", expectedLabels)
}
}

View File

@ -2,91 +2,12 @@ package metrics
import (
"fmt"
"net/http"
"os"
"sync/atomic"
"time"
"github.com/cactus/go-statsd-client/statsd"
"github.com/jmhodges/clock"
)
// HTTPMonitor stores some server state
type HTTPMonitor struct {
stats Scope
handler http.Handler
connectionsInFlight int64
}
// NewHTTPMonitor returns a new initialized HTTPMonitor
func NewHTTPMonitor(stats Scope, handler http.Handler) *HTTPMonitor {
return &HTTPMonitor{
stats: stats,
handler: handler,
connectionsInFlight: 0,
}
}
func (h *HTTPMonitor) ServeHTTP(w http.ResponseWriter, r *http.Request) {
h.stats.Inc("HTTP.Rate", 1)
inFlight := atomic.AddInt64(&h.connectionsInFlight, 1)
h.stats.Gauge("HTTP.OpenConnections", inFlight)
h.handler.ServeHTTP(w, r)
inFlight = atomic.AddInt64(&h.connectionsInFlight, -1)
h.stats.Gauge("HTTP.ConnectionsInFlight", inFlight)
}
// FBAdapter provides a facebookgo/stats client interface that sends metrics via
// a StatsD client
type FBAdapter struct {
stats Scope
clk clock.Clock
}
// NewFBAdapter returns a new adapter
func NewFBAdapter(stats Scope, clock clock.Clock) FBAdapter {
return FBAdapter{stats: stats, clk: clock}
}
// BumpAvg is essentially statsd.Statter.Gauge
func (fba FBAdapter) BumpAvg(key string, val float64) {
fba.stats.Gauge(key, int64(val))
}
// BumpSum is essentially statsd.Statter.Inc (httpdown only ever uses positive
// deltas)
func (fba FBAdapter) BumpSum(key string, val float64) {
fba.stats.Inc(key, int64(val))
}
type btHolder struct {
key string
stats Scope
started time.Time
}
func (bth btHolder) End() {
bth.stats.TimingDuration(bth.key, time.Since(bth.started))
}
// BumpTime is essentially a (much better) statsd.Statter.TimingDuration
func (fba FBAdapter) BumpTime(key string) interface {
End()
} {
return btHolder{
key: key,
started: fba.clk.Now(),
stats: fba.stats,
}
}
// BumpHistogram isn't used by facebookgo/httpdown
func (fba FBAdapter) BumpHistogram(_ string, _ float64) {
return
}
// Statter implements the statsd.Statter interface but
// appends the name of the host the process is running on
// to the end of every stat name

View File

@ -21,6 +21,7 @@ import (
"github.com/jmhodges/clock"
"github.com/miekg/dns"
"github.com/prometheus/client_golang/prometheus"
"golang.org/x/net/context"
"github.com/letsencrypt/boulder/bdns"
@ -41,6 +42,19 @@ const (
maxResponseSize = 128
)
var (
validationTime = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "valiation_time",
Help: "Time taken to validate a challenge",
},
[]string{"type", "result"})
)
func init() {
prometheus.MustRegister(validationTime)
}
var validationTimeout = time.Second * 5
// ValidationAuthorityImpl represents a VA
@ -677,6 +691,10 @@ func (va *ValidationAuthorityImpl) PerformValidation(ctx context.Context, domain
logEvent.Challenge = challenge
validationTime.With(prometheus.Labels{
"type": string(challenge.Type),
"result": string(challenge.Status),
}).Observe(time.Since(vStart).Seconds())
va.stats.TimingDuration(fmt.Sprintf("Validations.%s.%s", challenge.Type, challenge.Status), time.Since(vStart))
va.log.AuditObject("Validation result", logEvent)

View File

@ -28,6 +28,7 @@ type requestEvent struct {
ResponseNonce string `json:",omitempty"`
UserAgent string `json:",omitempty"`
Extra map[string]interface{} `json:",omitempty"`
Code int
}
func (e *requestEvent) AddError(msg string, args ...interface{}) {

View File

@ -27,12 +27,16 @@ import (
"github.com/letsencrypt/boulder/goodkey"
blog "github.com/letsencrypt/boulder/log"
"github.com/letsencrypt/boulder/metrics"
"github.com/letsencrypt/boulder/metrics/measured_http"
"github.com/letsencrypt/boulder/nonce"
"github.com/letsencrypt/boulder/probs"
"github.com/letsencrypt/boulder/revocation"
)
// Paths are the ACME-spec identified URL path-segments for various methods
// Paths are the ACME-spec identified URL path-segments for various methods.
// NOTE: In metrics/measured_http we make the assumption that these are all
// lowercase plus hyphens. If you violate that assumption you should update
// measured_http.
const (
directoryPath = "/directory"
newRegPath = "/acme/new-reg"
@ -299,7 +303,7 @@ func (wfe *WebFrontEndImpl) Handler() http.Handler {
clk: clock.Default(),
wfe: wfeHandlerFunc(wfe.Index),
})
return m
return measured_http.New(m, wfe.clk)
}
// Method implementations
@ -579,7 +583,6 @@ func (wfe *WebFrontEndImpl) sendError(response http.ResponseWriter, logEvent *re
response.WriteHeader(code)
response.Write(problemDoc)
wfe.stats.Inc(fmt.Sprintf("HTTP.ErrorCodes.%d", code), 1)
problemSegments := strings.Split(string(prob.Type), ":")
if len(problemSegments) > 0 {
wfe.stats.Inc(fmt.Sprintf("HTTP.ProblemTypes.%s", problemSegments[len(problemSegments)-1]), 1)