Replace autoprom stats with Prometheus style stats. (#2869)
This commit replaces the existing expiration-mailer autoprom stats with first-class Prometheus style stats.
This commit is contained in:
parent 4fbcf86238
commit f70e262935
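For readers unfamiliar with the pattern, the change boils down to constructing labelled Prometheus collectors once, registering them on the component's metrics.Scope, and incrementing them by label at each call site, instead of emitting string-named autoprom stats. Below is a minimal sketch of that pattern; the metric name, label, and helper function are illustrative only, not the exact ones added by this commit (those live in initStats in the diff).

package main

import (
	"github.com/letsencrypt/boulder/metrics"
	"github.com/prometheus/client_golang/prometheus"
)

// newErrorCounter sketches the counter pattern this commit applies throughout
// the expiration-mailer: build a labelled collector, register it once on the
// metrics.Scope, then increment it by label wherever an error occurs.
// "exampleErrors" and its "type" label are illustrative names only.
func newErrorCounter(scope metrics.Scope) *prometheus.CounterVec {
	errorCount := prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "exampleErrors",
			Help: "Number of errors, labelled by failure type",
		},
		[]string{"type"})
	scope.MustRegister(errorCount)
	return errorCount
}

func main() {
	errorCount := newErrorCounter(metrics.NewNoopScope())
	// Before: scope.Inc("Errors.SomeFailure", 1)
	// After:
	errorCount.With(prometheus.Labels{"type": "SomeFailure"}).Inc()
}

Histograms follow the same shape (prometheus.NewHistogram plus Observe), which is what the sendLatency and processingLatency metrics in the diff use.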
cmd/expiration-mailer/main.go
@@ -31,6 +31,7 @@ import (
 	"github.com/letsencrypt/boulder/metrics"
 	"github.com/letsencrypt/boulder/sa"
 	sapb "github.com/letsencrypt/boulder/sa/proto"
+	"github.com/prometheus/client_golang/prometheus"
 )

 const (
@@ -43,7 +44,6 @@ type regStore interface {
 }

 type mailer struct {
-	stats           metrics.Scope
 	log             blog.Logger
 	dbMap           *gorp.DbMap
 	rs              regStore
@@ -53,6 +53,15 @@ type mailer struct {
 	nagTimes        []time.Duration
 	limit           int
 	clk             clock.Clock
+	stats           mailerStats
 }
+
+type mailerStats struct {
+	nagsAtCapacity    *prometheus.CounterVec
+	errorCount        *prometheus.CounterVec
+	renewalCount      *prometheus.CounterVec
+	sendLatency       prometheus.Histogram
+	processingLatency prometheus.Histogram
+}

 func (m *mailer) sendNags(contacts []string, certs []*x509.Certificate) error {
@@ -112,7 +121,7 @@ func (m *mailer) sendNags(contacts []string, certs []*x509.Certificate) error {
 		ExpirationSubject: expiringSubject,
 	})
 	if err != nil {
-		m.stats.Inc("Errors.SendingNag.SubjectTemplateFailure", 1)
+		m.stats.errorCount.With(prometheus.Labels{"type": "SubjectTemplateFailure"}).Inc()
 		return err
 	}

@@ -128,7 +137,7 @@ func (m *mailer) sendNags(contacts []string, certs []*x509.Certificate) error {
 	msgBuf := new(bytes.Buffer)
 	err = m.emailTemplate.Execute(msgBuf, email)
 	if err != nil {
-		m.stats.Inc("Errors.SendingNag.TemplateFailure", 1)
+		m.stats.errorCount.With(prometheus.Labels{"type": "TemplateFailure"}).Inc()
 		return err
 	}
 	startSending := m.clk.Now()
@@ -138,7 +147,7 @@ func (m *mailer) sendNags(contacts []string, certs []*x509.Certificate) error {
 	}
 	finishSending := m.clk.Now()
 	elapsed := finishSending.Sub(startSending)
-	m.stats.TimingDuration("SendLatency", elapsed)
+	m.stats.sendLatency.Observe(elapsed.Seconds())
 	return nil
 }

@@ -190,7 +199,7 @@ func (m *mailer) processCerts(allCerts []core.Certificate) {
 		reg, err := m.rs.GetRegistration(ctx, regID)
 		if err != nil {
 			m.log.AuditErr(fmt.Sprintf("Error fetching registration %d: %s", regID, err))
-			m.stats.Inc("Errors.GetRegistration", 1)
+			m.stats.errorCount.With(prometheus.Labels{"type": "GetRegistration"}).Inc()
 			continue
 		}

@@ -200,7 +209,7 @@ func (m *mailer) processCerts(allCerts []core.Certificate) {
 			if err != nil {
 				// TODO(#1420): tell registration about this error
 				m.log.AuditErr(fmt.Sprintf("Error parsing certificate %s: %s", cert.Serial, err))
-				m.stats.Inc("Errors.ParseCertificate", 1)
+				m.stats.errorCount.With(prometheus.Labels{"type": "ParseCertificate"}).Inc()
 				continue
 			}

@@ -209,10 +218,10 @@ func (m *mailer) processCerts(allCerts []core.Certificate) {
 				m.log.AuditErr(fmt.Sprintf("expiration-mailer: error fetching renewal state: %v", err))
 				// assume not renewed
 			} else if renewed {
-				m.stats.Inc("Renewed", 1)
+				m.stats.renewalCount.With(prometheus.Labels{}).Inc()
 				if err := m.updateCertStatus(cert.Serial); err != nil {
 					m.log.AuditErr(fmt.Sprintf("Error updating certificate status for %s: %s", cert.Serial, err))
-					m.stats.Inc("Errors.UpdateCertificateStatus", 1)
+					m.stats.errorCount.With(prometheus.Labels{"type": "UpdateCertificateStatus"}).Inc()
 				}
 				continue
 			}
@@ -239,7 +248,7 @@ func (m *mailer) processCerts(allCerts []core.Certificate) {
 		err = m.updateCertStatus(serial)
 		if err != nil {
 			m.log.AuditErr(fmt.Sprintf("Error updating certificate status for %s: %s", serial, err))
-			m.stats.Inc("Errors.UpdateCertificateStatus", 1)
+			m.stats.errorCount.With(prometheus.Labels{"type": "UpdateCertificateStatus"}).Inc()
 			continue
 		}
 	}
@@ -328,15 +337,14 @@ func (m *mailer) findExpiringCertificates() error {
 				"nag group %s expiring certificates at configured capacity (cert limit %d)\n",
 				expiresIn.String(),
 				m.limit))
-			statName := fmt.Sprintf("Errors.Nag-%s.AtCapacity", expiresIn.String())
-			m.stats.Inc(statName, 1)
+			m.stats.nagsAtCapacity.With(prometheus.Labels{"nagGroup": expiresIn.String()}).Inc()
 		}

 		processingStarted := m.clk.Now()
 		m.processCerts(certs)
 		processingEnded := m.clk.Now()
 		elapsed := processingEnded.Sub(processingStarted)
-		m.stats.TimingDuration("ProcessingCertificatesLatency", elapsed)
+		m.stats.processingLatency.Observe(elapsed.Seconds())
 	}

 	return nil
@@ -385,6 +393,54 @@ type config struct {
 	Syslog cmd.SyslogConfig
 }

+func initStats(scope metrics.Scope) mailerStats {
+	nagsAtCapacity := prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Name: "nagsAtCapacity",
+			Help: "Count of nag groups at capacity",
+		},
+		[]string{"nagGroup"})
+	scope.MustRegister(nagsAtCapacity)
+
+	errorCount := prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Name: "errors",
+			Help: "Number of errors",
+		},
+		[]string{"type"})
+	scope.MustRegister(errorCount)
+
+	renewalCount := prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Name: "renewals",
+			Help: "Number of messages skipped for being renewals",
+		},
+		nil)
+	scope.MustRegister(renewalCount)
+
+	sendLatency := prometheus.NewHistogram(
+		prometheus.HistogramOpts{
+			Name: "sendLatency",
+			Help: "Time the mailer takes sending messages",
+		})
+	scope.MustRegister(sendLatency)
+
+	processingLatency := prometheus.NewHistogram(
+		prometheus.HistogramOpts{
+			Name: "processingLatency",
+			Help: "Time the mailer takes processing certificates",
+		})
+	scope.MustRegister(processingLatency)
+
+	return mailerStats{
+		nagsAtCapacity:    nagsAtCapacity,
+		errorCount:        errorCount,
+		renewalCount:      renewalCount,
+		sendLatency:       sendLatency,
+		processingLatency: processingLatency,
+	}
+}
+
 func main() {
 	configFile := flag.String("config", "", "File path to the configuration file for this service")
 	certLimit := flag.Int("cert_limit", 0, "Count of certificates to process per expiration period")
@@ -487,7 +543,6 @@ func main() {
 	sort.Sort(nags)

 	m := mailer{
-		stats: scope,
 		log:   logger,
 		dbMap: dbMap,
 		rs:    sac,
@@ -497,6 +552,7 @@ func main() {
 		nagTimes: nags,
 		limit:    c.Mailer.CertLimit,
 		clk:      cmd.Clock(),
+		stats:    initStats(scope),
 	}

 	go cmd.DebugServer(c.Mailer.DebugAddr)
cmd/expiration-mailer/main_test.go
@@ -17,8 +17,9 @@ import (

 	"golang.org/x/net/context"

-	"github.com/golang/mock/gomock"
 	"github.com/jmhodges/clock"
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/client_model/go"
 	"gopkg.in/go-gorp/gorp.v2"
 	"gopkg.in/square/go-jose.v1"

@@ -26,7 +27,6 @@ import (
 	berrors "github.com/letsencrypt/boulder/errors"
 	blog "github.com/letsencrypt/boulder/log"
 	"github.com/letsencrypt/boulder/metrics"
-	"github.com/letsencrypt/boulder/metrics/mock_metrics"
 	"github.com/letsencrypt/boulder/mocks"
 	"github.com/letsencrypt/boulder/sa"
 	"github.com/letsencrypt/boulder/sa/satest"
@@ -102,7 +102,6 @@ var (
 )

 func TestSendNags(t *testing.T) {
-	stats := metrics.NewNoopScope()
 	mc := mocks.Mailer{}
 	rs := newFakeRegStore()
 	fc := newFakeClock(t)
@@ -110,7 +109,6 @@ func TestSendNags(t *testing.T) {
 	staticTmpl := template.Must(template.New("expiry-email-subject-static").Parse(testEmailSubject))

 	m := mailer{
-		stats:           stats,
 		log:             log,
 		mailer:          &mc,
 		emailTemplate:   tmpl,
@@ -118,6 +116,7 @@ func TestSendNags(t *testing.T) {
 		subjectTemplate: staticTmpl,
 		rs:              rs,
 		clk:             fc,
+		stats:           initStats(metrics.NewNoopScope()),
 	}

 	cert := &x509.Certificate{
@@ -423,6 +422,15 @@ func addExpiringCerts(t *testing.T, ctx *testCtx) []core.Certificate {
 	return []core.Certificate{*certA, *certB, *certC, *certD}
 }

+func countGroupsAtCapacity(group string, counter *prometheus.CounterVec) int {
+	ch := make(chan prometheus.Metric, 10)
+	counter.With(prometheus.Labels{"nagGroup": group}).Collect(ch)
+	m := <-ch
+	var iom io_prometheus_client.Metric
+	_ = m.Write(&iom)
+	return int(iom.Counter.GetValue())
+}
+
 func TestFindCertsAtCapacity(t *testing.T) {
 	testCtx := setup(t, []time.Duration{time.Hour * 24})

@@ -430,31 +438,19 @@ func TestFindCertsAtCapacity(t *testing.T) {

 	log.Clear()

-	// Override the mailer `stats` with a mock
-	ctrl := gomock.NewController(t)
-	defer ctrl.Finish()
-	statter := mock_metrics.NewMockScope(ctrl)
-	testCtx.m.stats = statter
-
 	// Set the limit to 1 so we are "at capacity" with one result
 	testCtx.m.limit = 1

-	// The mock statter should have had the "48h0m0s" nag capacity stat incremented once.
-	// Note: this is not the 24h0m0s nag as you would expect sending time.Hour
-	// * 24 to setup() for the nag duration. This is because all of the nags are
-	// offset by defaultNagCheckInterval, which is 24hrs.
-	statter.EXPECT().Inc("Errors.Nag-48h0m0s.AtCapacity", int64(1))
-
-	// findExpiringCertificates() ends up invoking sendNags which calls
-	// TimingDuration so we need to EXPECT that with the mock
-	statter.EXPECT().TimingDuration("SendLatency", time.Duration(0))
-	// Similarly, findExpiringCertificates() sends its latency as well
-	statter.EXPECT().TimingDuration("ProcessingCertificatesLatency", time.Duration(0))
-
 	err := testCtx.m.findExpiringCertificates()
 	test.AssertNotError(t, err, "Failed to find expiring certs")
 	test.AssertEquals(t, len(testCtx.mc.Messages), 1)

+	// The "48h0m0s" nag group should have its prometheus stat incremented once.
+	// Note: this is not the 24h0m0s nag as you would expect sending time.Hour
+	// * 24 to setup() for the nag duration. This is because all of the nags are
+	// offset by defaultNagCheckInterval, which is 24hrs.
+	test.AssertEquals(t, countGroupsAtCapacity("48h0m0s", testCtx.m.stats.nagsAtCapacity), 1)
+
 	// A consecutive run shouldn't find anything - similarly we do not EXPECT()
 	// anything on statter to be called, and if it is then we have a test failure
 	testCtx.mc.Clear()
@@ -871,7 +867,6 @@ func setup(t *testing.T, nagTimes []time.Duration) *testCtx {
 	}
 	cleanUp := test.ResetSATestDatabase(t)

-	stats := metrics.NewNoopScope()
 	mc := &mocks.Mailer{}

 	offsetNags := make([]time.Duration, len(nagTimes))
@@ -881,7 +876,6 @@ func setup(t *testing.T, nagTimes []time.Duration) *testCtx {

 	m := &mailer{
 		log:             log,
-		stats:           stats,
 		mailer:          mc,
 		emailTemplate:   tmpl,
 		subjectTemplate: subjTmpl,
@@ -890,6 +884,7 @@ func setup(t *testing.T, nagTimes []time.Duration) *testCtx {
 		nagTimes: offsetNags,
 		limit:    100,
 		clk:      fc,
+		stats:    initStats(metrics.NewNoopScope()),
 	}
 	return &testCtx{
 		dbMap: dbMap,