boulder-janitor: switch workbatch gauge to counter. (#4477)
A gauge wasn't the appropriate stat type for this usage. Because the stat is now a counter instead of a gauge, the integration test can no longer detect that the janitor has finished its work by watching for the stat to drop to zero for all the table labels we're concerned with. Instead, the test is updated to watch for the counter value to stabilize for a period longer than the workbatch sleep.
This commit is contained in:
parent
83882abf46
commit
d35c20db75
|
@ -31,11 +31,11 @@ var (
|
||||||
Help: "Number of deletions by table the boulder-janitor has performed.",
|
Help: "Number of deletions by table the boulder-janitor has performed.",
|
||||||
},
|
},
|
||||||
[]string{"table"})
|
[]string{"table"})
|
||||||
// workStat is a prometheus gauge vector tracking the number of rows found
|
// workStat is a prometheus counter vector tracking the number of rows found
|
||||||
// during a batchedJob's getWork stage and queued into the work channel sliced
|
// during a batchedJob's getWork stage and queued into the work channel sliced
|
||||||
// by a table label.
|
// by a table label.
|
||||||
workStat = prometheus.NewGaugeVec(
|
workStat = prometheus.NewCounterVec(
|
||||||
prometheus.GaugeOpts{
|
prometheus.CounterOpts{
|
||||||
Name: "janitor_workbatch",
|
Name: "janitor_workbatch",
|
||||||
Help: "Number of items of work by table the boulder-janitor queued for deletion.",
|
Help: "Number of items of work by table the boulder-janitor queued for deletion.",
|
||||||
},
|
},
|
||||||
|
@ -113,7 +113,7 @@ func (j batchedDBJob) getWork(work chan<- int64, startID int64) (int64, error) {
|
||||||
rows++
|
rows++
|
||||||
lastID = v.ID
|
lastID = v.ID
|
||||||
}
|
}
|
||||||
workStat.WithLabelValues(j.table).Set(float64(rows))
|
workStat.WithLabelValues(j.table).Add(float64(rows))
|
||||||
return lastID, nil
|
return lastID, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -10,7 +10,6 @@ import (
|
||||||
"github.com/jmhodges/clock"
|
"github.com/jmhodges/clock"
|
||||||
blog "github.com/letsencrypt/boulder/log"
|
blog "github.com/letsencrypt/boulder/log"
|
||||||
"github.com/letsencrypt/boulder/test"
|
"github.com/letsencrypt/boulder/test"
|
||||||
"github.com/prometheus/client_golang/prometheus"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
func setup() (*blog.Mock, clock.FakeClock) {
|
func setup() (*blog.Mock, clock.FakeClock) {
|
||||||
|
@ -122,8 +121,7 @@ func TestGetWork(t *testing.T) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// We expect the work gauge for this table has been updated
|
// We expect the work counter for this table to have been updated
|
||||||
workCount, err := test.GaugeValueWithLabels(workStat, prometheus.Labels{"table": table})
|
workCount := test.CountCounterVec("table", table, workStat)
|
||||||
test.AssertNotError(t, err, "unexpected error from GaugeValueWithLabels")
|
|
||||||
test.AssertEquals(t, workCount, len(mockIDs))
|
test.AssertEquals(t, workCount, len(mockIDs))
|
||||||
|
|
||||||
// Set the third item in mockIDs to have an expiry after the purge cutoff
|
// Set the third item in mockIDs to have an expiry after the purge cutoff
|
||||||
|
@ -140,8 +138,7 @@ func TestGetWork(t *testing.T) {
|
||||||
got := <-workChan
|
got := <-workChan
|
||||||
test.AssertEquals(t, got, mockIDs[i].ID)
|
test.AssertEquals(t, got, mockIDs[i].ID)
|
||||||
}
|
}
|
||||||
workCount, err = test.GaugeValueWithLabels(workStat, prometheus.Labels{"table": table})
|
workCount = test.CountCounterVec("table", table, workStat)
|
||||||
test.AssertNotError(t, err, "unexpected error from GaugeValueWithLabels")
|
|
||||||
test.AssertEquals(t, workCount, 2)
|
test.AssertEquals(t, workCount, 2)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -123,32 +123,31 @@ def run_janitor():
|
||||||
raise Exception("stat line {0} was missing required parts".format(line))
|
raise Exception("stat line {0} was missing required parts".format(line))
|
||||||
return parts[1]
|
return parts[1]
|
||||||
|
|
||||||
# Wait for the janitor to report it isn't finding new work
|
# Wait for the janitor to finish its work. The easiest way to tell this
|
||||||
print("waiting for boulder-janitor work to complete...\n")
|
# externally is to watch for the work batch counters to stabilize for
|
||||||
workDone = False
|
# a period longer than the configured workSleep.
|
||||||
for i in range(10):
|
attempts = 0
|
||||||
certStatusWorkbatch = get_stat_line(8014, statline("workbatch", "certificateStatus"))
|
while True:
|
||||||
|
if attempts > 5:
|
||||||
|
raise Exception("timed out waiting for janitor workbatch counts to stabilize")
|
||||||
|
|
||||||
|
certStatusWorkBatch = get_stat_line(8014, statline("workbatch", "certificateStatus"))
|
||||||
certsWorkBatch = get_stat_line(8014, statline("workbatch", "certificates"))
|
certsWorkBatch = get_stat_line(8014, statline("workbatch", "certificates"))
|
||||||
certsPerNameWorkBatch = get_stat_line(8014, statline("workbatch", "certificatesPerName"))
|
certsPerNameWorkBatch = get_stat_line(8014, statline("workbatch", "certificatesPerName"))
|
||||||
if not certStatusWorkbatch or not certsWorkBatch or not certsPerNameWorkBatch:
|
|
||||||
print("not done after check {0}. Sleeping".format(i))
|
|
||||||
time.sleep(2)
|
|
||||||
continue
|
|
||||||
|
|
||||||
allReady = True
|
# sleep for double the configured workSleep for each job
|
||||||
for line in [certStatusWorkbatch, certsWorkBatch, certsPerNameWorkBatch]:
|
time.sleep(1)
|
||||||
if stat_value(line) != "0":
|
|
||||||
allReady = False
|
|
||||||
|
|
||||||
if allReady is False:
|
newCertStatusWorkBatch = get_stat_line(8014, statline("workbatch", "certificateStatus"))
|
||||||
print("not done after check {0}. Sleeping".format(i))
|
newCertsWorkBatch = get_stat_line(8014, statline("workbatch", "certificates"))
|
||||||
time.sleep(2)
|
newCertsPerNameWorkBatch = get_stat_line(8014, statline("workbatch", "certificatesPerName"))
|
||||||
else:
|
|
||||||
workDone = True
|
if (certStatusWorkBatch == newCertStatusWorkBatch
|
||||||
|
and certsWorkBatch == newCertsWorkBatch
|
||||||
|
and certsPerNameWorkBatch == newCertsPerNameWorkBatch):
|
||||||
break
|
break
|
||||||
|
|
||||||
if workDone is False:
|
attempts = attempts + 1
|
||||||
raise Exception("Timed out waiting for janitor to report all work completed\n")
|
|
||||||
|
|
||||||
# Check deletion stats are not empty/zero
|
# Check deletion stats are not empty/zero
|
||||||
for i in range(10):
|
for i in range(10):
|
||||||
|
|
Loading…
Reference in New Issue