diff --git a/cmd/boulder-janitor/job.go b/cmd/boulder-janitor/job.go index a20622c5b..6825f4659 100644 --- a/cmd/boulder-janitor/job.go +++ b/cmd/boulder-janitor/job.go @@ -31,11 +31,11 @@ var ( Help: "Number of deletions by table the boulder-janitor has performed.", }, []string{"table"}) - // workStat is a prometheus gauge vector tracking the number of rows found + // workStat is a prometheus counter vector tracking the number of rows found // during a batchedJob's getWork stage and queued into the work channel sliced // by a table label. - workStat = prometheus.NewGaugeVec( - prometheus.GaugeOpts{ + workStat = prometheus.NewCounterVec( + prometheus.CounterOpts{ Name: "janitor_workbatch", Help: "Number of items of work by table the boulder-janitor queued for deletion.", }, @@ -113,7 +113,7 @@ func (j batchedDBJob) getWork(work chan<- int64, startID int64) (int64, error) { rows++ lastID = v.ID } - workStat.WithLabelValues(j.table).Set(float64(rows)) + workStat.WithLabelValues(j.table).Add(float64(rows)) return lastID, nil } diff --git a/cmd/boulder-janitor/job_test.go b/cmd/boulder-janitor/job_test.go index b4834f341..6ba0bb955 100644 --- a/cmd/boulder-janitor/job_test.go +++ b/cmd/boulder-janitor/job_test.go @@ -10,7 +10,6 @@ import ( "github.com/jmhodges/clock" blog "github.com/letsencrypt/boulder/log" "github.com/letsencrypt/boulder/test" - "github.com/prometheus/client_golang/prometheus" ) func setup() (*blog.Mock, clock.FakeClock) { @@ -122,8 +121,7 @@ func TestGetWork(t *testing.T) { } // We expect the work gauge for this table has been updated - workCount, err := test.GaugeValueWithLabels(workStat, prometheus.Labels{"table": table}) - test.AssertNotError(t, err, "unexpected error from GaugeValueWithLabels") + workCount := test.CountCounterVec("table", table, workStat) test.AssertEquals(t, workCount, len(mockIDs)) // Set the third item in mockIDs to have an expiry after the purge cutoff @@ -140,8 +138,7 @@ func TestGetWork(t *testing.T) { got := <-workChan test.AssertEquals(t, got, mockIDs[i].ID) } - workCount, err = test.GaugeValueWithLabels(workStat, prometheus.Labels{"table": table}) - test.AssertNotError(t, err, "unexpected error from GaugeValueWithLabels") + workCount = test.CountCounterVec("table", table, workStat) test.AssertEquals(t, workCount, 2) } diff --git a/test/integration-test.py b/test/integration-test.py index 30b72fc45..47d2674a5 100644 --- a/test/integration-test.py +++ b/test/integration-test.py @@ -123,32 +123,31 @@ def run_janitor(): raise Exception("stat line {0} was missing required parts".format(line)) return parts[1] - # Wait for the janitor to report it isn't finding new work - print("waiting for boulder-janitor work to complete...\n") - workDone = False - for i in range(10): - certStatusWorkbatch = get_stat_line(8014, statline("workbatch", "certificateStatus")) + # Wait for the janitor to finish its work. The easiest way to tell this + # externally is to watch for the work batch counters to stabilize for + # a period longer than the configured workSleep. + attempts = 0 + while True: + if attempts > 5: + raise Exception("timed out waiting for janitor workbatch counts to stabilize") + + certStatusWorkBatch = get_stat_line(8014, statline("workbatch", "certificateStatus")) certsWorkBatch = get_stat_line(8014, statline("workbatch", "certificates")) certsPerNameWorkBatch = get_stat_line(8014, statline("workbatch", "certificatesPerName")) - if not certStatusWorkbatch or not certsWorkBatch or not certsPerNameWorkBatch: - print("not done after check {0}. Sleeping".format(i)) - time.sleep(2) - continue - allReady = True - for line in [certStatusWorkbatch, certsWorkBatch, certsPerNameWorkBatch]: - if stat_value(line) != "0": - allReady = False + # sleep for double the configured workSleep for each job + time.sleep(1) - if allReady is False: - print("not done after check {0}. Sleeping".format(i)) - time.sleep(2) - else: - workDone = True + newCertStatusWorkBatch = get_stat_line(8014, statline("workbatch", "certificateStatus")) + newCertsWorkBatch = get_stat_line(8014, statline("workbatch", "certificates")) + newCertsPerNameWorkBatch = get_stat_line(8014, statline("workbatch", "certificatesPerName")) + + if (certStatusWorkBatch == newCertStatusWorkBatch + and certsWorkBatch == newCertsWorkBatch + and certsPerNameWorkBatch == newCertsPerNameWorkBatch): break - if workDone is False: - raise Exception("Timed out waiting for janitor to report all work completed\n") + attempts = attempts + 1 # Check deletion stats are not empty/zero for i in range(10):