From bb005e1c79f3972ef8c7f24e3bd44779bbced9e5 Mon Sep 17 00:00:00 2001 From: Daniel McCarney Date: Mon, 29 Jul 2019 16:13:10 -0400 Subject: [PATCH] integration: add test for boulder-janitor. (#4364) --- cmd/boulder-janitor/certs.go | 1 + cmd/boulder-janitor/certsPerName.go | 1 + cmd/boulder-janitor/certstatus.go | 1 + cmd/boulder-janitor/config.go | 3 + cmd/boulder-janitor/janitor.go | 2 + cmd/boulder-janitor/job.go | 9 ++- test/config-next/janitor.json | 17 ++--- test/integration-test.py | 96 ++++++++++++++++++++++++++++- 8 files changed, 120 insertions(+), 10 deletions(-) diff --git a/cmd/boulder-janitor/certs.go b/cmd/boulder-janitor/certs.go index f3cac742e..7cf953861 100644 --- a/cmd/boulder-janitor/certs.go +++ b/cmd/boulder-janitor/certs.go @@ -24,6 +24,7 @@ func newCertificatesJob( db: db, log: log, purgeBefore: purgeBefore, + workSleep: config.Janitor.Certificates.WorkSleep.Duration, batchSize: config.Janitor.Certificates.BatchSize, maxDPS: config.Janitor.Certificates.MaxDPS, parallelism: config.Janitor.Certificates.Parallelism, diff --git a/cmd/boulder-janitor/certsPerName.go b/cmd/boulder-janitor/certsPerName.go index 347f915f8..29997cb80 100644 --- a/cmd/boulder-janitor/certsPerName.go +++ b/cmd/boulder-janitor/certsPerName.go @@ -24,6 +24,7 @@ func newCertificatesPerNameJob( db: db, log: log, purgeBefore: purgeBefore, + workSleep: config.Janitor.CertificatesPerName.WorkSleep.Duration, batchSize: config.Janitor.CertificatesPerName.BatchSize, maxDPS: config.Janitor.CertificatesPerName.MaxDPS, parallelism: config.Janitor.CertificatesPerName.Parallelism, diff --git a/cmd/boulder-janitor/certstatus.go b/cmd/boulder-janitor/certstatus.go index a97a8bf96..31c21188c 100644 --- a/cmd/boulder-janitor/certstatus.go +++ b/cmd/boulder-janitor/certstatus.go @@ -24,6 +24,7 @@ func newCertificateStatusJob( db: db, log: log, purgeBefore: purgeBefore, + workSleep: config.Janitor.CertificateStatus.WorkSleep.Duration, batchSize: config.Janitor.CertificateStatus.BatchSize, maxDPS: config.Janitor.CertificateStatus.MaxDPS, parallelism: config.Janitor.CertificateStatus.Parallelism, diff --git a/cmd/boulder-janitor/config.go b/cmd/boulder-janitor/config.go index 5d9af3dc8..ec00c91bf 100644 --- a/cmd/boulder-janitor/config.go +++ b/cmd/boulder-janitor/config.go @@ -13,6 +13,9 @@ type CleanupConfig struct { Enabled bool // GracePeriod controls when a resource is old enough to be cleaned up. GracePeriod cmd.ConfigDuration + // WorkSleep controls how long the janitor's work threads sleep between + // finding no work and trying again. Defaults to a minute if not provided. + WorkSleep cmd.ConfigDuration // BatchSize controls how many rows of the resource will be read from the DB // per-query. BatchSize int64 diff --git a/cmd/boulder-janitor/janitor.go b/cmd/boulder-janitor/janitor.go index c2dcefc91..ecccc5699 100644 --- a/cmd/boulder-janitor/janitor.go +++ b/cmd/boulder-janitor/janitor.go @@ -47,7 +47,9 @@ func New(clk clock.Clock, config Config) (*janitor, error) { // Setup logging and stats scope, logger := cmd.StatsAndLogging(config.Janitor.Syslog, config.Janitor.DebugAddr) + scope.MustRegister(errStat) scope.MustRegister(deletedStat) + scope.MustRegister(workStat) defer logger.AuditPanic() logger.Info(cmd.VersionString()) diff --git a/cmd/boulder-janitor/job.go b/cmd/boulder-janitor/job.go index ff78557b7..5ac703fcb 100644 --- a/cmd/boulder-janitor/job.go +++ b/cmd/boulder-janitor/job.go @@ -52,6 +52,9 @@ type batchedDBJob struct { // purgeBefore indicates the cut-off for the the resoruce being cleaned up by // the job. Rows that older than now - purgeBefore are deleted. purgeBefore time.Time + // workSleep is a duration that the job will sleep between getWork() calls + // when no new work is found. If not provided, defaults to a minute. + workSleep time.Duration // batchSize indicates how many database rows of work should be returned per query. batchSize int64 // maxDPS optionally indicates a maximum rate of deletes to run per second. @@ -171,7 +174,11 @@ func (j batchedDBJob) RunForever() { j.log.Debugf( "made no new progress on table %q. Sleeping for a minute", j.table) - time.Sleep(time.Minute) + if j.workSleep.Seconds() == 0 { + time.Sleep(time.Minute) + } else { + time.Sleep(j.workSleep) + } } id = lastID } diff --git a/test/config-next/janitor.json b/test/config-next/janitor.json index 698b85a6f..fcf4a21c0 100644 --- a/test/config-next/janitor.json +++ b/test/config-next/janitor.json @@ -1,7 +1,7 @@ { "janitor": { "syslog": { - "stdoutLevel": 7 + "stdoutLevel": 6 }, "dbConnectFile": "test/secrets/janitor_dburl", "maxDBConns": 10, @@ -9,23 +9,26 @@ "certificates": { "enabled": true, "gracePeriod": "1h", - "batchSize": 10, + "batchSize": 100, + "workSleep": "500ms", "parallelism": 2, - "maxDPS": 1 + "maxDPS": 50 }, "certificateStatus": { "enabled": true, "gracePeriod": "1h", - "batchSize": 10, + "batchSize": 100, + "workSleep": "500ms", "parallelism": 2, - "maxDPS": 1 + "maxDPS": 50 }, "certificatesPerName": { "enabled": true, "gracePeriod": "1h", - "batchSize": 10, + "batchSize": 100, + "workSleep": "500ms", "parallelism": 2, - "maxDPS": 1 + "maxDPS": 50 } } } diff --git a/test/integration-test.py b/test/integration-test.py index df0175890..f60e542bf 100644 --- a/test/integration-test.py +++ b/test/integration-test.py @@ -30,8 +30,6 @@ from helpers import * from acme import challenges -import requests - def run_client_tests(): root = os.environ.get("CERTBOT_PATH") assert root is not None, ( @@ -83,6 +81,94 @@ def run_expired_authz_purger(): expect(now, 0, "authz") expect(after_grace_period, 1, "authz") +def run_janitor(): + # Set the fake clock to a year in the future such that all of the database + # rows created during the integration tests are older than the grace period. + now = datetime.datetime.utcnow() + target_time = now+datetime.timedelta(days=+365) + + e = os.environ.copy() + e.setdefault("GORACE", "halt_on_error=1") + e.setdefault("FAKECLOCK", fakeclock(target_time)) + + # Note: Must use exec here so that killing this process kills the command. + cmdline = "exec ./bin/boulder-janitor --config test/config-next/janitor.json" + p = subprocess.Popen(cmdline, shell=True, env=e) + + # Wait for the janitor to come up + waitport(8014, "boulder-janitor", None) + + def statline(statname, table): + # NOTE: we omit the trailing "}}" to make this match general enough to + # permit new labels in the future. + return "janitor_{0}{{table=\"{1}\"".format(statname, table) + + def get_stat_line(port, stat): + url = "http://localhost:%d/metrics" % port + response = requests.get(url) + for l in response.content.split("\n"): + if l.strip().startswith(stat): + return l + return None + + def stat_value(line): + parts = line.split(" ") + if len(parts) != 2: + raise Exception("stat line {0} was missing required parts".format(line)) + return parts[1] + + # Wait for the janitor to report it isn't finding new work + print("waiting for boulder-janitor work to complete...\n") + workDone = False + for i in range(10): + certStatusWorkbatch = get_stat_line(8014, statline("workbatch", "certificateStatus")) + certsWorkBatch = get_stat_line(8014, statline("workbatch", "certificates")) + certsPerNameWorkBatch = get_stat_line(8014, statline("workbatch", "certificatesPerName")) + + allReady = True + for line in [certStatusWorkbatch, certsWorkBatch, certsPerNameWorkBatch]: + if stat_value(line) != "0": + allReady = False + + if allReady is False: + print("not done after check {0}. Sleeping".format(i)) + time.sleep(2) + else: + workDone = True + break + + if workDone is False: + raise Exception("Timed out waiting for janitor to report all work completed\n") + + # Check deletion stats are not empty/zero + for i in range(10): + certStatusDeletes = get_stat_line(8014, statline("deletions", "certificateStatus")) + certsDeletes = get_stat_line(8014, statline("deletions", "certificates")) + certsPerNameDeletes = get_stat_line(8014, statline("deletions", "certificatesPerName")) + + if certStatusDeletes is None or certsDeletes is None or certsPerNameDeletes is None: + print("delete stats not present after check {0}. Sleeping".format(i)) + time.sleep(2) + continue + + for l in [certStatusDeletes, certsDeletes, certsPerNameDeletes]: + if stat_value(l) == "0": + raise Exception("Expected a non-zero number of deletes to be performed. Found {0}".format(l)) + + # Check that all error stats are empty + errorStats = [ + statline("errors", "certificateStatus"), + statline("errors", "certificates"), + statline("errors", "certificatesPerName"), + ] + for eStat in errorStats: + actual = get_stat_line(8014, eStat) + if actual is not None: + raise Exception("Expected to find no error stat lines but found {0}\n".format(eStat)) + + # Terminate the janitor + p.terminate() + def test_single_ocsp(): """Run the single-ocsp command, which is used to generate OCSP responses for intermediate certificates on a manual basis. Then start up an @@ -187,6 +273,12 @@ def main(): check_balance() if not CONFIG_NEXT: run_expired_authz_purger() + + # Run the boulder-janitor. This should happen after all other tests because + # it runs with the fake clock set to the future and deletes rows that may + # otherwise be referenced by tests. + run_janitor() + # Run the load-generator last. run_loadtest will stop the # pebble-challtestsrv before running the load-generator and will not restart # it.