boulder-janitor: switch workbatch gauge to counter. (#4477)

A gauge wasn't the appropriate stat type for this usage. Switching the stat to a counter instead of a gauge means the integration test can no longer detect when the janitor has finished its work by watching for this stat to drop to zero for all the table labels we're concerned with. Instead, the test is updated to watch for the counter values to stabilize for a period longer than the workbatch sleep.
2019-10-11 17:40:59 -04:00 · 2019-10-11 17:40:59 -04:00 · d35c20db75
parent 83882abf46
commit d35c20db75
3 changed files with 25 additions and 29 deletions
--- a/cmd/boulder-janitor/job.go
+++ b/cmd/boulder-janitor/job.go
@ -31,11 +31,11 @@ var (
 			Help: "Number of deletions by table the boulder-janitor has performed.",
 		},
 		[]string{"table"})
-	// workStat is a prometheus gauge vector tracking the number of rows found
+	// workStat is a prometheus counter vector tracking the number of rows found
 	// during a batchedJob's getWork stage and queued into the work channel sliced
 	// by a table label.
-	workStat = prometheus.NewGaugeVec(
-		prometheus.GaugeOpts{
+	workStat = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
 			Name: "janitor_workbatch",
 			Help: "Number of items of work by table the boulder-janitor queued for deletion.",
 		},
@ -113,7 +113,7 @@ func (j batchedDBJob) getWork(work chan<- int64, startID int64) (int64, error) {
 		rows++
 		lastID = v.ID
 	}
-	workStat.WithLabelValues(j.table).Set(float64(rows))
+	workStat.WithLabelValues(j.table).Add(float64(rows))
 	return lastID, nil
 }

--- a/cmd/boulder-janitor/job_test.go
+++ b/cmd/boulder-janitor/job_test.go
@ -10,7 +10,6 @@ import (
 	"github.com/jmhodges/clock"
 	blog "github.com/letsencrypt/boulder/log"
 	"github.com/letsencrypt/boulder/test"
-	"github.com/prometheus/client_golang/prometheus"
 )

 func setup() (*blog.Mock, clock.FakeClock) {
@ -122,8 +121,7 @@ func TestGetWork(t *testing.T) {
 	}

 	// We expect the work gauge for this table has been updated
-	workCount, err := test.GaugeValueWithLabels(workStat, prometheus.Labels{"table": table})
-	test.AssertNotError(t, err, "unexpected error from GaugeValueWithLabels")
+	workCount := test.CountCounterVec("table", table, workStat)
 	test.AssertEquals(t, workCount, len(mockIDs))

 	// Set the third item in mockIDs to have an expiry after the purge cutoff
@ -140,8 +138,7 @@ func TestGetWork(t *testing.T) {
 		got := <-workChan
 		test.AssertEquals(t, got, mockIDs[i].ID)
 	}
-	workCount, err = test.GaugeValueWithLabels(workStat, prometheus.Labels{"table": table})
-	test.AssertNotError(t, err, "unexpected error from GaugeValueWithLabels")
+	workCount = test.CountCounterVec("table", table, workStat)
 	test.AssertEquals(t, workCount, 2)
 }

--- a/test/integration-test.py
+++ b/test/integration-test.py
@ -123,32 +123,31 @@ def run_janitor():
            raise Exception("stat line {0} was missing required parts".format(line))
        return parts[1]

-    # Wait for the janitor to report it isn't finding new work
-    print("waiting for boulder-janitor work to complete...\n")
-    workDone = False
-    for i in range(10):
-        certStatusWorkbatch = get_stat_line(8014, statline("workbatch", "certificateStatus"))
+    # Wait for the janitor to finish its work. The easiest way to tell this
+    # externally is to watch for the work batch counters to stabilize for
+    # a period longer than the configured workSleep.
+    attempts = 0
+    while True:
+        if attempts > 5:
+            raise Exception("timed out waiting for janitor workbatch counts to stabilize")
+
+        certStatusWorkBatch = get_stat_line(8014, statline("workbatch", "certificateStatus"))
        certsWorkBatch = get_stat_line(8014, statline("workbatch", "certificates"))
        certsPerNameWorkBatch = get_stat_line(8014, statline("workbatch", "certificatesPerName"))
-        if not certStatusWorkbatch or not certsWorkBatch or not certsPerNameWorkBatch:
-            print("not done after check {0}. Sleeping".format(i))
-            time.sleep(2)
-            continue

-        allReady = True
-        for line in [certStatusWorkbatch, certsWorkBatch, certsPerNameWorkBatch]:
-            if stat_value(line) != "0":
-                allReady = False
+        # sleep for double the configured workSleep for each job
+        time.sleep(1)

-        if allReady is False:
-            print("not done after check {0}. Sleeping".format(i))
-            time.sleep(2)
-        else:
-            workDone = True
+        newCertStatusWorkBatch = get_stat_line(8014, statline("workbatch", "certificateStatus"))
+        newCertsWorkBatch = get_stat_line(8014, statline("workbatch", "certificates"))
+        newCertsPerNameWorkBatch = get_stat_line(8014, statline("workbatch", "certificatesPerName"))
+
+        if (certStatusWorkBatch == newCertStatusWorkBatch 
+            and certsWorkBatch == newCertsWorkBatch 
+            and certsPerNameWorkBatch == newCertsPerNameWorkBatch):
            break

-    if workDone is False:
-        raise Exception("Timed out waiting for janitor to report all work completed\n")
+        attempts = attempts + 1

    # Check deletion stats are not empty/zero
    for i in range(10):