Switch nag group to count serials, not certs (#5605)
If expiry mailer's SELECT statement is regularly returning exactly as many rows as it is configured to limit at, that means that we are consistently missing some certs that we should probably be sending emails about. Change the way this metric is computed to be based on the number of rows returned directly by the SELECT, rather than based on the filtered set of rows that ended up corresponding to final certificates. This bug was almost certainly causing us to not detect nag groups being at capacity, given that the expiry-mailer logs consistently say that nag groups have nearly-but-not-quite as many certs in them as the configured maximum. This bug was introduced in #4420, when expiration-mailer gained the ability to silently continue past certain classes of errors retrieving certificate details.
This commit is contained in:
parent
2fe12cdf20
commit
b3192bdead
|
|
@ -296,6 +296,21 @@ func (m *mailer) findExpiringCertificates() error {
|
|||
return err
|
||||
}
|
||||
|
||||
// If the number of rows was exactly `m.limit` rows we need to increment
|
||||
// a stat indicating that this nag group is at capacity based on the
|
||||
// configured cert limit. If this condition continually occurs across mailer
|
||||
// runs then we will not catch up, resulting in under-sending expiration
|
||||
// mails. The effects of this were initially described in issue #2002[0].
|
||||
//
|
||||
// 0: https://github.com/letsencrypt/boulder/issues/2002
|
||||
atCapacity := float64(0)
|
||||
if len(serials) == m.limit {
|
||||
m.log.Infof("nag group %s expiring certificates at configured capacity (select limit %d)",
|
||||
expiresIn.String(), m.limit)
|
||||
atCapacity = float64(1)
|
||||
}
|
||||
m.stats.nagsAtCapacity.With(prometheus.Labels{"nag_group": expiresIn.String()}).Set(atCapacity)
|
||||
|
||||
// Now we can sequentially retrieve the certificate details for each of the
|
||||
// certificate status rows
|
||||
var certs []core.Certificate
|
||||
|
|
@ -319,21 +334,6 @@ func (m *mailer) findExpiringCertificates() error {
|
|||
m.log.Infof("Found %d certificates expiring between %s and %s", len(certs),
|
||||
left.Format("2006-01-02 03:04"), right.Format("2006-01-02 03:04"))
|
||||
|
||||
// If the `certs` result was exactly `m.limit` rows we need to increment
|
||||
// a stat indicating that this nag group is at capacity based on the
|
||||
// configured cert limit. If this condition continually occurs across mailer
|
||||
// runs then we will not catch up, resulting in under-sending expiration
|
||||
// mails. The effects of this were initially described in issue #2002[0].
|
||||
//
|
||||
// 0: https://github.com/letsencrypt/boulder/issues/2002
|
||||
atCapacity := float64(0)
|
||||
if len(certs) == m.limit {
|
||||
m.log.Infof("nag group %s expiring certificates at configured capacity (cert limit %d)",
|
||||
expiresIn.String(), m.limit)
|
||||
atCapacity = float64(1)
|
||||
}
|
||||
m.stats.nagsAtCapacity.With(prometheus.Labels{"nag_group": expiresIn.String()}).Set(atCapacity)
|
||||
|
||||
if len(certs) == 0 {
|
||||
continue // nothing to do
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue