boulder-janitor: switch workbatch gauge to counter. (#4477)

A gauge wasn't the appropriate stat type choice for this usage.

Switching the stat to be a counter instead of a gauge means we can't
detect when the janitor has finished its work in the integration test by
watching for this stat to drop to zero for all the table labels we're
concerned with. Instead, the test is updated to watch for the counter
value to stabilize for a period longer than the workbatch sleep.
This commit is contained in:
Daniel McCarney 2019-10-11 17:40:59 -04:00 committed by Jacob Hoffman-Andrews
parent 83882abf46
commit d35c20db75
3 changed files with 25 additions and 29 deletions

View File

@ -31,11 +31,11 @@ var (
Help: "Number of deletions by table the boulder-janitor has performed.",
},
[]string{"table"})
// workStat is a prometheus gauge vector tracking the number of rows found
// workStat is a prometheus counter vector tracking the number of rows found
// during a batchedJob's getWork stage and queued into the work channel sliced
// by a table label.
workStat = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
workStat = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "janitor_workbatch",
Help: "Number of items of work by table the boulder-janitor queued for deletion.",
},
@ -113,7 +113,7 @@ func (j batchedDBJob) getWork(work chan<- int64, startID int64) (int64, error) {
rows++
lastID = v.ID
}
workStat.WithLabelValues(j.table).Set(float64(rows))
workStat.WithLabelValues(j.table).Add(float64(rows))
return lastID, nil
}

View File

@ -10,7 +10,6 @@ import (
"github.com/jmhodges/clock"
blog "github.com/letsencrypt/boulder/log"
"github.com/letsencrypt/boulder/test"
"github.com/prometheus/client_golang/prometheus"
)
func setup() (*blog.Mock, clock.FakeClock) {
@ -122,8 +121,7 @@ func TestGetWork(t *testing.T) {
}
// We expect the work counter for this table has been updated
workCount, err := test.GaugeValueWithLabels(workStat, prometheus.Labels{"table": table})
test.AssertNotError(t, err, "unexpected error from GaugeValueWithLabels")
workCount := test.CountCounterVec("table", table, workStat)
test.AssertEquals(t, workCount, len(mockIDs))
// Set the third item in mockIDs to have an expiry after the purge cutoff
@ -140,8 +138,7 @@ func TestGetWork(t *testing.T) {
got := <-workChan
test.AssertEquals(t, got, mockIDs[i].ID)
}
workCount, err = test.GaugeValueWithLabels(workStat, prometheus.Labels{"table": table})
test.AssertNotError(t, err, "unexpected error from GaugeValueWithLabels")
workCount = test.CountCounterVec("table", table, workStat)
test.AssertEquals(t, workCount, 2)
}

View File

@ -123,32 +123,31 @@ def run_janitor():
raise Exception("stat line {0} was missing required parts".format(line))
return parts[1]
# Wait for the janitor to report it isn't finding new work
print("waiting for boulder-janitor work to complete...\n")
workDone = False
for i in range(10):
certStatusWorkbatch = get_stat_line(8014, statline("workbatch", "certificateStatus"))
# Wait for the janitor to finish its work. The easiest way to tell this
# externally is to watch for the work batch counters to stabilize for
# a period longer than the configured workSleep.
attempts = 0
while True:
if attempts > 5:
raise Exception("timed out waiting for janitor workbatch counts to stabilize")
certStatusWorkBatch = get_stat_line(8014, statline("workbatch", "certificateStatus"))
certsWorkBatch = get_stat_line(8014, statline("workbatch", "certificates"))
certsPerNameWorkBatch = get_stat_line(8014, statline("workbatch", "certificatesPerName"))
if not certStatusWorkbatch or not certsWorkBatch or not certsPerNameWorkBatch:
print("not done after check {0}. Sleeping".format(i))
time.sleep(2)
continue
allReady = True
for line in [certStatusWorkbatch, certsWorkBatch, certsPerNameWorkBatch]:
if stat_value(line) != "0":
allReady = False
# sleep for double the configured workSleep for each job
time.sleep(1)
if allReady is False:
print("not done after check {0}. Sleeping".format(i))
time.sleep(2)
else:
workDone = True
newCertStatusWorkBatch = get_stat_line(8014, statline("workbatch", "certificateStatus"))
newCertsWorkBatch = get_stat_line(8014, statline("workbatch", "certificates"))
newCertsPerNameWorkBatch = get_stat_line(8014, statline("workbatch", "certificatesPerName"))
if (certStatusWorkBatch == newCertStatusWorkBatch
and certsWorkBatch == newCertsWorkBatch
and certsPerNameWorkBatch == newCertsPerNameWorkBatch):
break
if workDone is False:
raise Exception("Timed out waiting for janitor to report all work completed\n")
attempts = attempts + 1
# Check deletion stats are not empty/zero
for i in range(10):