Replace autoprom stats with Prometheus style stats. (#2869)

This commit replaces the existing expiration-mailer autoprom stats with first-class Prometheus-style stats.
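
As a rough illustration of the pattern applied throughout the diff below: autoprom-style calls on a StatsD-like scope (stats.Inc, stats.TimingDuration) are replaced by first-class Prometheus collectors that are registered once and then incremented or observed at the call sites. The snippet is a minimal sketch, not code from this commit; the function name and the registry parameter are illustrative, while the metric names and options mirror the diff.

package main

import (
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

// recordSendResult sketches the new style: collectors are created and
// registered up front, then updated with labels / observed durations
// where the old stats.Inc / stats.TimingDuration calls used to be.
func recordSendResult(reg prometheus.Registerer, start, finish time.Time) {
	// Old style (autoprom / StatsD-like scope):
	//   stats.Inc("Errors.SendingNag.TemplateFailure", 1)
	//   stats.TimingDuration("SendLatency", elapsed)

	errorCount := prometheus.NewCounterVec(
		prometheus.CounterOpts{Name: "errors", Help: "Number of errors"},
		[]string{"type"})
	sendLatency := prometheus.NewHistogram(
		prometheus.HistogramOpts{Name: "sendLatency", Help: "Time the mailer takes sending messages"})
	reg.MustRegister(errorCount, sendLatency)

	// New style: label the error type instead of baking it into the
	// metric name, and record latency as a histogram value in seconds.
	errorCount.With(prometheus.Labels{"type": "TemplateFailure"}).Inc()
	sendLatency.Observe(finish.Sub(start).Seconds())
}

In the actual change, registration happens once in initStats and the collectors are carried on the mailer struct, as the diff shows.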
Daniel McCarney 2017-07-13 18:14:36 -04:00 committed by Jacob Hoffman-Andrews
parent 4fbcf86238
commit f70e262935
2 changed files with 88 additions and 37 deletions

First changed file (the expiration-mailer command):

@ -31,6 +31,7 @@ import (
"github.com/letsencrypt/boulder/metrics"
"github.com/letsencrypt/boulder/sa"
sapb "github.com/letsencrypt/boulder/sa/proto"
"github.com/prometheus/client_golang/prometheus"
)
const (
@ -43,7 +44,6 @@ type regStore interface {
}
type mailer struct {
stats metrics.Scope
log blog.Logger
dbMap *gorp.DbMap
rs regStore
@ -53,6 +53,15 @@ type mailer struct {
nagTimes []time.Duration
limit int
clk clock.Clock
stats mailerStats
}
type mailerStats struct {
nagsAtCapacity *prometheus.CounterVec
errorCount *prometheus.CounterVec
renewalCount *prometheus.CounterVec
sendLatency prometheus.Histogram
processingLatency prometheus.Histogram
}
func (m *mailer) sendNags(contacts []string, certs []*x509.Certificate) error {
@ -112,7 +121,7 @@ func (m *mailer) sendNags(contacts []string, certs []*x509.Certificate) error {
ExpirationSubject: expiringSubject,
})
if err != nil {
m.stats.Inc("Errors.SendingNag.SubjectTemplateFailure", 1)
m.stats.errorCount.With(prometheus.Labels{"type": "SubjectTemplateFailure"}).Inc()
return err
}
@ -128,7 +137,7 @@ func (m *mailer) sendNags(contacts []string, certs []*x509.Certificate) error {
msgBuf := new(bytes.Buffer)
err = m.emailTemplate.Execute(msgBuf, email)
if err != nil {
m.stats.Inc("Errors.SendingNag.TemplateFailure", 1)
m.stats.errorCount.With(prometheus.Labels{"type": "TemplateFailure"}).Inc()
return err
}
startSending := m.clk.Now()
@ -138,7 +147,7 @@ func (m *mailer) sendNags(contacts []string, certs []*x509.Certificate) error {
}
finishSending := m.clk.Now()
elapsed := finishSending.Sub(startSending)
m.stats.TimingDuration("SendLatency", elapsed)
m.stats.sendLatency.Observe(elapsed.Seconds())
return nil
}
@ -190,7 +199,7 @@ func (m *mailer) processCerts(allCerts []core.Certificate) {
reg, err := m.rs.GetRegistration(ctx, regID)
if err != nil {
m.log.AuditErr(fmt.Sprintf("Error fetching registration %d: %s", regID, err))
m.stats.Inc("Errors.GetRegistration", 1)
m.stats.errorCount.With(prometheus.Labels{"type": "GetRegistration"}).Inc()
continue
}
@ -200,7 +209,7 @@ func (m *mailer) processCerts(allCerts []core.Certificate) {
if err != nil {
// TODO(#1420): tell registration about this error
m.log.AuditErr(fmt.Sprintf("Error parsing certificate %s: %s", cert.Serial, err))
m.stats.Inc("Errors.ParseCertificate", 1)
m.stats.errorCount.With(prometheus.Labels{"type": "ParseCertificate"}).Inc()
continue
}
@ -209,10 +218,10 @@ func (m *mailer) processCerts(allCerts []core.Certificate) {
m.log.AuditErr(fmt.Sprintf("expiration-mailer: error fetching renewal state: %v", err))
// assume not renewed
} else if renewed {
m.stats.Inc("Renewed", 1)
m.stats.renewalCount.With(prometheus.Labels{}).Inc()
if err := m.updateCertStatus(cert.Serial); err != nil {
m.log.AuditErr(fmt.Sprintf("Error updating certificate status for %s: %s", cert.Serial, err))
m.stats.Inc("Errors.UpdateCertificateStatus", 1)
m.stats.errorCount.With(prometheus.Labels{"type": "UpdateCertificateStatus"}).Inc()
}
continue
}
@ -239,7 +248,7 @@ func (m *mailer) processCerts(allCerts []core.Certificate) {
err = m.updateCertStatus(serial)
if err != nil {
m.log.AuditErr(fmt.Sprintf("Error updating certificate status for %s: %s", serial, err))
m.stats.Inc("Errors.UpdateCertificateStatus", 1)
m.stats.errorCount.With(prometheus.Labels{"type": "UpdateCertificateStatus"}).Inc()
continue
}
}
@ -328,15 +337,14 @@ func (m *mailer) findExpiringCertificates() error {
"nag group %s expiring certificates at configured capacity (cert limit %d)\n",
expiresIn.String(),
m.limit))
statName := fmt.Sprintf("Errors.Nag-%s.AtCapacity", expiresIn.String())
m.stats.Inc(statName, 1)
m.stats.nagsAtCapacity.With(prometheus.Labels{"nagGroup": expiresIn.String()}).Inc()
}
processingStarted := m.clk.Now()
m.processCerts(certs)
processingEnded := m.clk.Now()
elapsed := processingEnded.Sub(processingStarted)
m.stats.TimingDuration("ProcessingCertificatesLatency", elapsed)
m.stats.processingLatency.Observe(elapsed.Seconds())
}
return nil
@ -385,6 +393,54 @@ type config struct {
Syslog cmd.SyslogConfig
}
func initStats(scope metrics.Scope) mailerStats {
nagsAtCapacity := prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "nagsAtCapacity",
Help: "Count of nag groups at capcacity",
},
[]string{"nagGroup"})
scope.MustRegister(nagsAtCapacity)
errorCount := prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "errors",
Help: "Number of errors",
},
[]string{"type"})
scope.MustRegister(errorCount)
renewalCount := prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "renewals",
Help: "Number of messages skipped for being renewals",
},
nil)
scope.MustRegister(renewalCount)
sendLatency := prometheus.NewHistogram(
prometheus.HistogramOpts{
Name: "sendLatency",
Help: "Time the mailer takes sending messages",
})
scope.MustRegister(sendLatency)
processingLatency := prometheus.NewHistogram(
prometheus.HistogramOpts{
Name: "processingLatency",
Help: "Time the mailer takes processing certificates",
})
scope.MustRegister(processingLatency)
return mailerStats{
nagsAtCapacity: nagsAtCapacity,
errorCount: errorCount,
renewalCount: renewalCount,
sendLatency: sendLatency,
processingLatency: processingLatency,
}
}
func main() {
configFile := flag.String("config", "", "File path to the configuration file for this service")
certLimit := flag.Int("cert_limit", 0, "Count of certificates to process per expiration period")
@ -487,7 +543,6 @@ func main() {
sort.Sort(nags)
m := mailer{
stats: scope,
log: logger,
dbMap: dbMap,
rs: sac,
@ -497,6 +552,7 @@ func main() {
nagTimes: nags,
limit: c.Mailer.CertLimit,
clk: cmd.Clock(),
stats: initStats(scope),
}
go cmd.DebugServer(c.Mailer.DebugAddr)

Second changed file (the expiration-mailer tests):

@ -17,8 +17,9 @@ import (
"golang.org/x/net/context"
"github.com/golang/mock/gomock"
"github.com/jmhodges/clock"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_model/go"
"gopkg.in/go-gorp/gorp.v2"
"gopkg.in/square/go-jose.v1"
@ -26,7 +27,6 @@ import (
berrors "github.com/letsencrypt/boulder/errors"
blog "github.com/letsencrypt/boulder/log"
"github.com/letsencrypt/boulder/metrics"
"github.com/letsencrypt/boulder/metrics/mock_metrics"
"github.com/letsencrypt/boulder/mocks"
"github.com/letsencrypt/boulder/sa"
"github.com/letsencrypt/boulder/sa/satest"
@ -102,7 +102,6 @@ var (
)
func TestSendNags(t *testing.T) {
stats := metrics.NewNoopScope()
mc := mocks.Mailer{}
rs := newFakeRegStore()
fc := newFakeClock(t)
@ -110,7 +109,6 @@ func TestSendNags(t *testing.T) {
staticTmpl := template.Must(template.New("expiry-email-subject-static").Parse(testEmailSubject))
m := mailer{
stats: stats,
log: log,
mailer: &mc,
emailTemplate: tmpl,
@ -118,6 +116,7 @@ func TestSendNags(t *testing.T) {
subjectTemplate: staticTmpl,
rs: rs,
clk: fc,
stats: initStats(metrics.NewNoopScope()),
}
cert := &x509.Certificate{
@ -423,6 +422,15 @@ func addExpiringCerts(t *testing.T, ctx *testCtx) []core.Certificate {
return []core.Certificate{*certA, *certB, *certC, *certD}
}
func countGroupsAtCapacity(group string, counter *prometheus.CounterVec) int {
ch := make(chan prometheus.Metric, 10)
counter.With(prometheus.Labels{"nagGroup": group}).Collect(ch)
m := <-ch
var iom io_prometheus_client.Metric
_ = m.Write(&iom)
return int(iom.Counter.GetValue())
}
func TestFindCertsAtCapacity(t *testing.T) {
testCtx := setup(t, []time.Duration{time.Hour * 24})
@ -430,31 +438,19 @@ func TestFindCertsAtCapacity(t *testing.T) {
log.Clear()
// Override the mailer `stats` with a mock
ctrl := gomock.NewController(t)
defer ctrl.Finish()
statter := mock_metrics.NewMockScope(ctrl)
testCtx.m.stats = statter
// Set the limit to 1 so we are "at capacity" with one result
testCtx.m.limit = 1
// The mock statter should have had the "48h0m0s" nag capacity stat incremented once.
// Note: this is not the 24h0m0s nag as you would expect sending time.Hour
// * 24 to setup() for the nag duration. This is because all of the nags are
// offset by defaultNagCheckInterval, which is 24hrs.
statter.EXPECT().Inc("Errors.Nag-48h0m0s.AtCapacity", int64(1))
// findExpiringCertificates() ends up invoking sendNags which calls
// TimingDuration so we need to EXPECT that with the mock
statter.EXPECT().TimingDuration("SendLatency", time.Duration(0))
// Similarly, findExpiringCertificates() sends its latency as well
statter.EXPECT().TimingDuration("ProcessingCertificatesLatency", time.Duration(0))
err := testCtx.m.findExpiringCertificates()
test.AssertNotError(t, err, "Failed to find expiring certs")
test.AssertEquals(t, len(testCtx.mc.Messages), 1)
// The "48h0m0s" nag group should have its prometheus stat incremented once.
// Note: this is not the 24h0m0s nag as you would expect sending time.Hour
// * 24 to setup() for the nag duration. This is because all of the nags are
// offset by defaultNagCheckInterval, which is 24hrs.
test.AssertEquals(t, countGroupsAtCapacity("48h0m0s", testCtx.m.stats.nagsAtCapacity), 1)
// A consecutive run shouldn't find anything - similarly we do not EXPECT()
// anything on statter to be called, and if it is then we have a test failure
testCtx.mc.Clear()
@ -871,7 +867,6 @@ func setup(t *testing.T, nagTimes []time.Duration) *testCtx {
}
cleanUp := test.ResetSATestDatabase(t)
stats := metrics.NewNoopScope()
mc := &mocks.Mailer{}
offsetNags := make([]time.Duration, len(nagTimes))
@ -881,7 +876,6 @@ func setup(t *testing.T, nagTimes []time.Duration) *testCtx {
m := &mailer{
log: log,
stats: stats,
mailer: mc,
emailTemplate: tmpl,
subjectTemplate: subjTmpl,
@ -890,6 +884,7 @@ func setup(t *testing.T, nagTimes []time.Duration) *testCtx {
nagTimes: offsetNags,
limit: 100,
clk: fc,
stats: initStats(metrics.NewNoopScope()),
}
return &testCtx{
dbMap: dbMap,