Use Prometheus stats for VA, WFE, and OCSP Responder (#2628)
Rename HTTPMonitor to MeasuredHandler. Remove inflight stat (we didn't use it). Add timing stat by method, endpoint, and status code. The timing stat subsumes the "rate" stat, so remove that. WFE now wraps in MeasuredHandler, instead of relying on its cmd/main.go. Remove FBAdapter stats. MeasuredHandler tracks stats by method, status code, and endpoint. In VA, add a Prometheus histogram for validation timing.
This commit is contained in:
parent
ca3a2e0e3c
commit
4b665e35a6
|
@ -137,14 +137,11 @@ func main() {
|
|||
|
||||
// Set up paths
|
||||
wfe.BaseURL = c.Common.BaseURL
|
||||
h := wfe.Handler()
|
||||
|
||||
httpMonitor := metrics.NewHTTPMonitor(scope, h)
|
||||
|
||||
logger.Info(fmt.Sprintf("Server running, listening on %s...\n", c.WFE.ListenAddress))
|
||||
srv := &http.Server{
|
||||
Addr: c.WFE.ListenAddress,
|
||||
Handler: httpMonitor,
|
||||
Handler: wfe.Handler(),
|
||||
}
|
||||
|
||||
go cmd.DebugServer(c.WFE.DebugAddr)
|
||||
|
@ -153,7 +150,6 @@ func main() {
|
|||
hd := &httpdown.HTTP{
|
||||
StopTimeout: c.WFE.ShutdownStopTimeout.Duration,
|
||||
KillTimeout: c.WFE.ShutdownKillTimeout.Duration,
|
||||
Stats: metrics.NewFBAdapter(scope, clock.Default()),
|
||||
}
|
||||
hdSrv, err := hd.ListenAndServe(srv)
|
||||
cmd.FailOnError(err, "Error starting HTTP server")
|
||||
|
@ -167,7 +163,7 @@ func main() {
|
|||
logger.Info(fmt.Sprintf("TLS Server running, listening on %s...\n", c.WFE.TLSListenAddress))
|
||||
TLSSrv := &http.Server{
|
||||
Addr: c.WFE.TLSListenAddress,
|
||||
Handler: httpMonitor,
|
||||
Handler: wfe.Handler(),
|
||||
TLSConfig: tlsConfig,
|
||||
}
|
||||
hdTLSSrv, err = hd.ListenAndServe(TLSSrv)
|
||||
|
|
|
@ -23,6 +23,7 @@ import (
|
|||
"github.com/letsencrypt/boulder/features"
|
||||
blog "github.com/letsencrypt/boulder/log"
|
||||
"github.com/letsencrypt/boulder/metrics"
|
||||
"github.com/letsencrypt/boulder/metrics/measured_http"
|
||||
"github.com/letsencrypt/boulder/sa"
|
||||
)
|
||||
|
||||
|
@ -223,7 +224,6 @@ as generated by Boulder's single-ocsp command.
|
|||
hd := &httpdown.HTTP{
|
||||
StopTimeout: stopTimeout,
|
||||
KillTimeout: killTimeout,
|
||||
Stats: metrics.NewFBAdapter(scope, clock.Default()),
|
||||
}
|
||||
hdSrv, err := hd.ListenAndServe(srv)
|
||||
cmd.FailOnError(err, "Error starting HTTP server")
|
||||
|
@ -244,5 +244,5 @@ func mux(scope metrics.Scope, responderPath string, source cfocsp.Source) http.H
|
|||
}
|
||||
m.ServeHTTP(w, r)
|
||||
})
|
||||
return metrics.NewHTTPMonitor(scope, h)
|
||||
return measured_http.New(h, clock.Default())
|
||||
}
|
||||
|
|
|
@ -0,0 +1,93 @@
|
|||
package measured_http
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"net/http"
|
||||
"regexp"
|
||||
"strings"
|
||||
|
||||
"github.com/jmhodges/clock"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
)
|
||||
|
||||
var (
|
||||
responseTime = prometheus.NewHistogramVec(
|
||||
prometheus.HistogramOpts{
|
||||
Name: "response_time",
|
||||
Help: "Time taken to respond to a request",
|
||||
},
|
||||
[]string{"endpoint", "method", "code"})
|
||||
)
|
||||
|
||||
func init() {
|
||||
prometheus.MustRegister(responseTime)
|
||||
}
|
||||
|
||||
// responseWriterWithStatus satisfies http.ResponseWriter, but keeps track of the
|
||||
// status code for gathering stats.
|
||||
type responseWriterWithStatus struct {
|
||||
http.ResponseWriter
|
||||
code int
|
||||
}
|
||||
|
||||
// WriteHeader stores a status code for generating stats.
|
||||
func (r *responseWriterWithStatus) WriteHeader(code int) {
|
||||
r.code = code
|
||||
r.ResponseWriter.WriteHeader(code)
|
||||
}
|
||||
|
||||
// MeasuredHandler wraps an http.Handler and records prometheus stats
|
||||
type MeasuredHandler struct {
|
||||
http.Handler
|
||||
clk clock.Clock
|
||||
// Normally this is always responseTime, but we override it for testing.
|
||||
stat *prometheus.HistogramVec
|
||||
}
|
||||
|
||||
func New(h http.Handler, clk clock.Clock) *MeasuredHandler {
|
||||
return &MeasuredHandler{
|
||||
Handler: h,
|
||||
clk: clk,
|
||||
stat: responseTime,
|
||||
}
|
||||
}
|
||||
|
||||
var endpointShortNameRegexp = regexp.MustCompile("^[a-z-]*$")
|
||||
|
||||
// endpointFromPath turns a request path into a value suitable for a Prometheus
|
||||
// label value, by eliminating path components that vary widely (like user ids
|
||||
// and authorization ids). It uses a simple heuristic: Remove everything after
|
||||
// the first path component that doesn't match lowercase a-z, plus hyphen. This
|
||||
// happens to work very well with Boulder's WFE.
|
||||
func endpointFromPath(path string) string {
|
||||
// Remove any query
|
||||
path = strings.Split(path, "?")[0]
|
||||
components := strings.Split(path, "/")
|
||||
var i int
|
||||
var v string
|
||||
for i, v = range components {
|
||||
matched := endpointShortNameRegexp.MatchString(v)
|
||||
if !matched {
|
||||
return strings.Join(components[:i], "/")
|
||||
}
|
||||
}
|
||||
return path
|
||||
}
|
||||
|
||||
func (h *MeasuredHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
|
||||
begin := h.clk.Now()
|
||||
rwws := &responseWriterWithStatus{w, 0}
|
||||
// Copy in case handlers down the chain use StripPrefix, which modifies
|
||||
// URL path.
|
||||
endpoint := endpointFromPath(r.URL.Path)
|
||||
|
||||
defer func() {
|
||||
h.stat.With(prometheus.Labels{
|
||||
"endpoint": endpoint,
|
||||
"method": r.Method,
|
||||
"code": fmt.Sprintf("%d", rwws.code),
|
||||
}).Observe(h.clk.Since(begin).Seconds())
|
||||
}()
|
||||
|
||||
h.Handler.ServeHTTP(rwws, r)
|
||||
}
|
|
@ -0,0 +1,97 @@
|
|||
package measured_http
|
||||
|
||||
import (
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"net/url"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/jmhodges/clock"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
io_prometheus_client "github.com/prometheus/client_model/go"
|
||||
)
|
||||
|
||||
func TestEndpointFromPath(t *testing.T) {
|
||||
tc := []struct {
|
||||
input, expected string
|
||||
}{
|
||||
{"/", "/"},
|
||||
{"/acme", "/acme"},
|
||||
{"/acme/new-authz", "/acme/new-authz"},
|
||||
{"/acme/new-authz/", "/acme/new-authz/"},
|
||||
{"/acme/authz/1234", "/acme/authz"},
|
||||
{"/acme/authz/aGVsbG8K/1234", "/acme/authz"},
|
||||
{"/directory?foo=bar", "/directory"},
|
||||
}
|
||||
for _, c := range tc {
|
||||
output := endpointFromPath(c.input)
|
||||
if output != c.expected {
|
||||
t.Errorf("endpointFromPath(%q) = %q (expected %q)",
|
||||
c.input, output, c.expected)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
type sleepyHandler struct {
|
||||
clk clock.FakeClock
|
||||
}
|
||||
|
||||
func (h sleepyHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
|
||||
h.clk.Sleep(999 * time.Second)
|
||||
w.WriteHeader(302)
|
||||
}
|
||||
|
||||
func TestMeasuring(t *testing.T) {
|
||||
clk := clock.NewFake()
|
||||
|
||||
// Create a local histogram stat with the same labels as the real one, but
|
||||
// don't register it; we will collect its data here in the test to verify it.
|
||||
stat := prometheus.NewHistogramVec(
|
||||
prometheus.HistogramOpts{
|
||||
Name: "fake",
|
||||
Help: "fake",
|
||||
},
|
||||
[]string{"endpoint", "method", "code"})
|
||||
|
||||
mh := MeasuredHandler{
|
||||
Handler: sleepyHandler{clk},
|
||||
clk: clk,
|
||||
stat: stat,
|
||||
}
|
||||
mh.ServeHTTP(httptest.NewRecorder(), &http.Request{
|
||||
URL: &url.URL{Path: "/foo"},
|
||||
Method: "GET",
|
||||
})
|
||||
ch := make(chan prometheus.Metric, 10)
|
||||
stat.Collect(ch)
|
||||
m := <-ch
|
||||
var iom io_prometheus_client.Metric
|
||||
_ = m.Write(&iom)
|
||||
|
||||
hist := iom.Histogram
|
||||
if *hist.SampleCount != 1 {
|
||||
t.Errorf("SampleCount = %d (expected 1)", *hist.SampleCount)
|
||||
}
|
||||
if *hist.SampleSum != 999 {
|
||||
t.Errorf("SampleSum = %g (expected 999)", *hist.SampleSum)
|
||||
}
|
||||
|
||||
expectedLabels := map[string]string{
|
||||
"endpoint": "/foo",
|
||||
"method": "GET",
|
||||
"code": "302",
|
||||
}
|
||||
for _, labelPair := range iom.Label {
|
||||
if expectedLabels[*labelPair.Name] == "" {
|
||||
t.Errorf("Unexpected label %s", *labelPair.Name)
|
||||
} else if expectedLabels[*labelPair.Name] != *labelPair.Value {
|
||||
t.Errorf("labels[%q] = %q (expected %q)", *labelPair.Name, *labelPair.Value,
|
||||
expectedLabels[*labelPair.Name])
|
||||
}
|
||||
delete(expectedLabels, *labelPair.Name)
|
||||
}
|
||||
if len(expectedLabels) != 0 {
|
||||
t.Errorf("Some labels were expected, but not observed: %v", expectedLabels)
|
||||
}
|
||||
}
|
|
@ -2,91 +2,12 @@ package metrics
|
|||
|
||||
import (
|
||||
"fmt"
|
||||
"net/http"
|
||||
"os"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"github.com/cactus/go-statsd-client/statsd"
|
||||
"github.com/jmhodges/clock"
|
||||
)
|
||||
|
||||
// HTTPMonitor stores some server state
|
||||
type HTTPMonitor struct {
|
||||
stats Scope
|
||||
handler http.Handler
|
||||
connectionsInFlight int64
|
||||
}
|
||||
|
||||
// NewHTTPMonitor returns a new initialized HTTPMonitor
|
||||
func NewHTTPMonitor(stats Scope, handler http.Handler) *HTTPMonitor {
|
||||
return &HTTPMonitor{
|
||||
stats: stats,
|
||||
handler: handler,
|
||||
connectionsInFlight: 0,
|
||||
}
|
||||
}
|
||||
|
||||
func (h *HTTPMonitor) ServeHTTP(w http.ResponseWriter, r *http.Request) {
|
||||
h.stats.Inc("HTTP.Rate", 1)
|
||||
inFlight := atomic.AddInt64(&h.connectionsInFlight, 1)
|
||||
h.stats.Gauge("HTTP.OpenConnections", inFlight)
|
||||
|
||||
h.handler.ServeHTTP(w, r)
|
||||
|
||||
inFlight = atomic.AddInt64(&h.connectionsInFlight, -1)
|
||||
h.stats.Gauge("HTTP.ConnectionsInFlight", inFlight)
|
||||
}
|
||||
|
||||
// FBAdapter provides a facebookgo/stats client interface that sends metrics via
|
||||
// a StatsD client
|
||||
type FBAdapter struct {
|
||||
stats Scope
|
||||
clk clock.Clock
|
||||
}
|
||||
|
||||
// NewFBAdapter returns a new adapter
|
||||
func NewFBAdapter(stats Scope, clock clock.Clock) FBAdapter {
|
||||
return FBAdapter{stats: stats, clk: clock}
|
||||
}
|
||||
|
||||
// BumpAvg is essentially statsd.Statter.Gauge
|
||||
func (fba FBAdapter) BumpAvg(key string, val float64) {
|
||||
fba.stats.Gauge(key, int64(val))
|
||||
}
|
||||
|
||||
// BumpSum is essentially statsd.Statter.Inc (httpdown only ever uses positive
|
||||
// deltas)
|
||||
func (fba FBAdapter) BumpSum(key string, val float64) {
|
||||
fba.stats.Inc(key, int64(val))
|
||||
}
|
||||
|
||||
type btHolder struct {
|
||||
key string
|
||||
stats Scope
|
||||
started time.Time
|
||||
}
|
||||
|
||||
func (bth btHolder) End() {
|
||||
bth.stats.TimingDuration(bth.key, time.Since(bth.started))
|
||||
}
|
||||
|
||||
// BumpTime is essentially a (much better) statsd.Statter.TimingDuration
|
||||
func (fba FBAdapter) BumpTime(key string) interface {
|
||||
End()
|
||||
} {
|
||||
return btHolder{
|
||||
key: key,
|
||||
started: fba.clk.Now(),
|
||||
stats: fba.stats,
|
||||
}
|
||||
}
|
||||
|
||||
// BumpHistogram isn't used by facebookgo/httpdown
|
||||
func (fba FBAdapter) BumpHistogram(_ string, _ float64) {
|
||||
return
|
||||
}
|
||||
|
||||
// Statter implements the statsd.Statter interface but
|
||||
// appends the name of the host the process is running on
|
||||
// to the end of every stat name
|
||||
|
|
18
va/va.go
18
va/va.go
|
@ -21,6 +21,7 @@ import (
|
|||
|
||||
"github.com/jmhodges/clock"
|
||||
"github.com/miekg/dns"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"golang.org/x/net/context"
|
||||
|
||||
"github.com/letsencrypt/boulder/bdns"
|
||||
|
@ -41,6 +42,19 @@ const (
|
|||
maxResponseSize = 128
|
||||
)
|
||||
|
||||
var (
|
||||
validationTime = prometheus.NewHistogramVec(
|
||||
prometheus.HistogramOpts{
|
||||
Name: "valiation_time",
|
||||
Help: "Time taken to validate a challenge",
|
||||
},
|
||||
[]string{"type", "result"})
|
||||
)
|
||||
|
||||
func init() {
|
||||
prometheus.MustRegister(validationTime)
|
||||
}
|
||||
|
||||
var validationTimeout = time.Second * 5
|
||||
|
||||
// ValidationAuthorityImpl represents a VA
|
||||
|
@ -677,6 +691,10 @@ func (va *ValidationAuthorityImpl) PerformValidation(ctx context.Context, domain
|
|||
|
||||
logEvent.Challenge = challenge
|
||||
|
||||
validationTime.With(prometheus.Labels{
|
||||
"type": string(challenge.Type),
|
||||
"result": string(challenge.Status),
|
||||
}).Observe(time.Since(vStart).Seconds())
|
||||
va.stats.TimingDuration(fmt.Sprintf("Validations.%s.%s", challenge.Type, challenge.Status), time.Since(vStart))
|
||||
|
||||
va.log.AuditObject("Validation result", logEvent)
|
||||
|
|
|
@ -28,6 +28,7 @@ type requestEvent struct {
|
|||
ResponseNonce string `json:",omitempty"`
|
||||
UserAgent string `json:",omitempty"`
|
||||
Extra map[string]interface{} `json:",omitempty"`
|
||||
Code int
|
||||
}
|
||||
|
||||
func (e *requestEvent) AddError(msg string, args ...interface{}) {
|
||||
|
|
|
@ -27,12 +27,16 @@ import (
|
|||
"github.com/letsencrypt/boulder/goodkey"
|
||||
blog "github.com/letsencrypt/boulder/log"
|
||||
"github.com/letsencrypt/boulder/metrics"
|
||||
"github.com/letsencrypt/boulder/metrics/measured_http"
|
||||
"github.com/letsencrypt/boulder/nonce"
|
||||
"github.com/letsencrypt/boulder/probs"
|
||||
"github.com/letsencrypt/boulder/revocation"
|
||||
)
|
||||
|
||||
// Paths are the ACME-spec identified URL path-segments for various methods
|
||||
// Paths are the ACME-spec identified URL path-segments for various methods.
|
||||
// NOTE: In metrics/measured_http we make the assumption that these are all
|
||||
// lowercase plus hyphens. If you violate that assumption you should update
|
||||
// measured_http.
|
||||
const (
|
||||
directoryPath = "/directory"
|
||||
newRegPath = "/acme/new-reg"
|
||||
|
@ -299,7 +303,7 @@ func (wfe *WebFrontEndImpl) Handler() http.Handler {
|
|||
clk: clock.Default(),
|
||||
wfe: wfeHandlerFunc(wfe.Index),
|
||||
})
|
||||
return m
|
||||
return measured_http.New(m, wfe.clk)
|
||||
}
|
||||
|
||||
// Method implementations
|
||||
|
@ -579,7 +583,6 @@ func (wfe *WebFrontEndImpl) sendError(response http.ResponseWriter, logEvent *re
|
|||
response.WriteHeader(code)
|
||||
response.Write(problemDoc)
|
||||
|
||||
wfe.stats.Inc(fmt.Sprintf("HTTP.ErrorCodes.%d", code), 1)
|
||||
problemSegments := strings.Split(string(prob.Type), ":")
|
||||
if len(problemSegments) > 0 {
|
||||
wfe.stats.Inc(fmt.Sprintf("HTTP.ProblemTypes.%s", problemSegments[len(problemSegments)-1]), 1)
|
||||
|
|
Loading…
Reference in New Issue