From d35c20db75a0842e8bd7eb0449d9e23d3d964d89 Mon Sep 17 00:00:00 2001
From: Daniel McCarney <cpu@letsencrypt.org>
Date: Fri, 11 Oct 2019 17:40:59 -0400
Subject: [PATCH] boulder-janitor: switch workbatch gauge to counter. (#4477)

A gauge wasn't the appropriate stat type choice for this usage.

Switching the stat to be a counter instead of a gauge means we can't
detect when the janitor is finished its work in the integration test by
watching for this stat to drop to zero for all the table labels we're
concerned with. Instead the test is updated to watch for the counter
value to stabilize for a period longer than the workbatch sleep.
---
 cmd/boulder-janitor/job.go      |  8 +++----
 cmd/boulder-janitor/job_test.go |  7 ++----
 test/integration-test.py        | 39 ++++++++++++++++-----------------
 3 files changed, 25 insertions(+), 29 deletions(-)

diff --git a/cmd/boulder-janitor/job.go b/cmd/boulder-janitor/job.go
index a20622c5b..6825f4659 100644
--- a/cmd/boulder-janitor/job.go
+++ b/cmd/boulder-janitor/job.go
@@ -31,11 +31,11 @@ var (
 			Help: "Number of deletions by table the boulder-janitor has performed.",
 		},
 		[]string{"table"})
-	// workStat is a prometheus gauge vector tracking the number of rows found
+	// workStat is a prometheus counter vector tracking the number of rows found
 	// during a batchedJob's getWork stage and queued into the work channel sliced
 	// by a table label.
-	workStat = prometheus.NewGaugeVec(
-		prometheus.GaugeOpts{
+	workStat = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
 			Name: "janitor_workbatch",
 			Help: "Number of items of work by table the boulder-janitor queued for deletion.",
 		},
@@ -113,7 +113,7 @@ func (j batchedDBJob) getWork(work chan<- int64, startID int64) (int64, error) {
 		rows++
 		lastID = v.ID
 	}
-	workStat.WithLabelValues(j.table).Set(float64(rows))
+	workStat.WithLabelValues(j.table).Add(float64(rows))
 	return lastID, nil
 }
 
diff --git a/cmd/boulder-janitor/job_test.go b/cmd/boulder-janitor/job_test.go
index b4834f341..6ba0bb955 100644
--- a/cmd/boulder-janitor/job_test.go
+++ b/cmd/boulder-janitor/job_test.go
@@ -10,7 +10,6 @@ import (
 	"github.com/jmhodges/clock"
 	blog "github.com/letsencrypt/boulder/log"
 	"github.com/letsencrypt/boulder/test"
-	"github.com/prometheus/client_golang/prometheus"
 )
 
 func setup() (*blog.Mock, clock.FakeClock) {
@@ -122,8 +121,7 @@ func TestGetWork(t *testing.T) {
 	}
 
 	// We expect the work gauge for this table has been updated
-	workCount, err := test.GaugeValueWithLabels(workStat, prometheus.Labels{"table": table})
-	test.AssertNotError(t, err, "unexpected error from GaugeValueWithLabels")
+	workCount := test.CountCounterVec("table", table, workStat)
 	test.AssertEquals(t, workCount, len(mockIDs))
 
 	// Set the third item in mockIDs to have an expiry after the purge cutoff
@@ -140,8 +138,7 @@ func TestGetWork(t *testing.T) {
 		got := <-workChan
 		test.AssertEquals(t, got, mockIDs[i].ID)
 	}
-	workCount, err = test.GaugeValueWithLabels(workStat, prometheus.Labels{"table": table})
-	test.AssertNotError(t, err, "unexpected error from GaugeValueWithLabels")
+	workCount = test.CountCounterVec("table", table, workStat)
 	test.AssertEquals(t, workCount, 2)
 }
 
diff --git a/test/integration-test.py b/test/integration-test.py
index 30b72fc45..47d2674a5 100644
--- a/test/integration-test.py
+++ b/test/integration-test.py
@@ -123,32 +123,31 @@ def run_janitor():
             raise Exception("stat line {0} was missing required parts".format(line))
         return parts[1]
 
-    # Wait for the janitor to report it isn't finding new work
-    print("waiting for boulder-janitor work to complete...\n")
-    workDone = False
-    for i in range(10):
-        certStatusWorkbatch = get_stat_line(8014, statline("workbatch", "certificateStatus"))
+    # Wait for the janitor to finish its work. The easiest way to tell this
+    # externally is to watch for the work batch counters to stabilize for
+    # a period longer than the configured workSleep.
+    attempts = 0
+    while True:
+        if attempts > 5:
+            raise Exception("timed out waiting for janitor workbatch counts to stabilize")
+
+        certStatusWorkBatch = get_stat_line(8014, statline("workbatch", "certificateStatus"))
         certsWorkBatch = get_stat_line(8014, statline("workbatch", "certificates"))
         certsPerNameWorkBatch = get_stat_line(8014, statline("workbatch", "certificatesPerName"))
-        if not certStatusWorkbatch or not certsWorkBatch or not certsPerNameWorkBatch:
-            print("not done after check {0}. Sleeping".format(i))
-            time.sleep(2)
-            continue
 
-        allReady = True
-        for line in [certStatusWorkbatch, certsWorkBatch, certsPerNameWorkBatch]:
-            if stat_value(line) != "0":
-                allReady = False
+        # sleep for double the configured workSleep for each job
+        time.sleep(1)
 
-        if allReady is False:
-            print("not done after check {0}. Sleeping".format(i))
-            time.sleep(2)
-        else:
-            workDone = True
+        newCertStatusWorkBatch = get_stat_line(8014, statline("workbatch", "certificateStatus"))
+        newCertsWorkBatch = get_stat_line(8014, statline("workbatch", "certificates"))
+        newCertsPerNameWorkBatch = get_stat_line(8014, statline("workbatch", "certificatesPerName"))
+
+        if (certStatusWorkBatch == newCertStatusWorkBatch 
+            and certsWorkBatch == newCertsWorkBatch 
+            and certsPerNameWorkBatch == newCertsPerNameWorkBatch):
             break
 
-    if workDone is False:
-        raise Exception("Timed out waiting for janitor to report all work completed\n")
+        attempts = attempts + 1
 
     # Check deletion stats are not empty/zero
     for i in range(10):