integration: add test for boulder-janitor. (#4364)

This commit is contained in:
Daniel McCarney 2019-07-29 16:13:10 -04:00 committed by GitHub
parent 98677b83d8
commit bb005e1c79
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 120 additions and 10 deletions

View File

@ -24,6 +24,7 @@ func newCertificatesJob(
db: db,
log: log,
purgeBefore: purgeBefore,
workSleep: config.Janitor.Certificates.WorkSleep.Duration,
batchSize: config.Janitor.Certificates.BatchSize,
maxDPS: config.Janitor.Certificates.MaxDPS,
parallelism: config.Janitor.Certificates.Parallelism,

View File

@ -24,6 +24,7 @@ func newCertificatesPerNameJob(
db: db,
log: log,
purgeBefore: purgeBefore,
workSleep: config.Janitor.CertificatesPerName.WorkSleep.Duration,
batchSize: config.Janitor.CertificatesPerName.BatchSize,
maxDPS: config.Janitor.CertificatesPerName.MaxDPS,
parallelism: config.Janitor.CertificatesPerName.Parallelism,

View File

@ -24,6 +24,7 @@ func newCertificateStatusJob(
db: db,
log: log,
purgeBefore: purgeBefore,
workSleep: config.Janitor.CertificateStatus.WorkSleep.Duration,
batchSize: config.Janitor.CertificateStatus.BatchSize,
maxDPS: config.Janitor.CertificateStatus.MaxDPS,
parallelism: config.Janitor.CertificateStatus.Parallelism,

View File

@ -13,6 +13,9 @@ type CleanupConfig struct {
Enabled bool
// GracePeriod controls when a resource is old enough to be cleaned up.
GracePeriod cmd.ConfigDuration
// WorkSleep controls how long the janitor's work threads sleep between
// finding no work and trying again. Defaults to a minute if not provided.
WorkSleep cmd.ConfigDuration
// BatchSize controls how many rows of the resource will be read from the DB
// per-query.
BatchSize int64

View File

@ -47,7 +47,9 @@ func New(clk clock.Clock, config Config) (*janitor, error) {
// Setup logging and stats
scope, logger := cmd.StatsAndLogging(config.Janitor.Syslog, config.Janitor.DebugAddr)
scope.MustRegister(errStat)
scope.MustRegister(deletedStat)
scope.MustRegister(workStat)
defer logger.AuditPanic()
logger.Info(cmd.VersionString())

View File

@ -52,6 +52,9 @@ type batchedDBJob struct {
// purgeBefore indicates the cut-off for the the resoruce being cleaned up by
// the job. Rows that older than now - purgeBefore are deleted.
purgeBefore time.Time
// workSleep is a duration that the job will sleep between getWork() calls
// when no new work is found. If not provided, defaults to a minute.
workSleep time.Duration
// batchSize indicates how many database rows of work should be returned per query.
batchSize int64
// maxDPS optionally indicates a maximum rate of deletes to run per second.
@ -171,7 +174,11 @@ func (j batchedDBJob) RunForever() {
j.log.Debugf(
"made no new progress on table %q. Sleeping for a minute",
j.table)
time.Sleep(time.Minute)
if j.workSleep.Seconds() == 0 {
time.Sleep(time.Minute)
} else {
time.Sleep(j.workSleep)
}
}
id = lastID
}

View File

@ -1,7 +1,7 @@
{
"janitor": {
"syslog": {
"stdoutLevel": 7
"stdoutLevel": 6
},
"dbConnectFile": "test/secrets/janitor_dburl",
"maxDBConns": 10,
@ -9,23 +9,26 @@
"certificates": {
"enabled": true,
"gracePeriod": "1h",
"batchSize": 10,
"batchSize": 100,
"workSleep": "500ms",
"parallelism": 2,
"maxDPS": 1
"maxDPS": 50
},
"certificateStatus": {
"enabled": true,
"gracePeriod": "1h",
"batchSize": 10,
"batchSize": 100,
"workSleep": "500ms",
"parallelism": 2,
"maxDPS": 1
"maxDPS": 50
},
"certificatesPerName": {
"enabled": true,
"gracePeriod": "1h",
"batchSize": 10,
"batchSize": 100,
"workSleep": "500ms",
"parallelism": 2,
"maxDPS": 1
"maxDPS": 50
}
}
}

View File

@ -30,8 +30,6 @@ from helpers import *
from acme import challenges
import requests
def run_client_tests():
root = os.environ.get("CERTBOT_PATH")
assert root is not None, (
@ -83,6 +81,94 @@ def run_expired_authz_purger():
expect(now, 0, "authz")
expect(after_grace_period, 1, "authz")
def run_janitor():
# Set the fake clock to a year in the future such that all of the database
# rows created during the integration tests are older than the grace period.
now = datetime.datetime.utcnow()
target_time = now+datetime.timedelta(days=+365)
e = os.environ.copy()
e.setdefault("GORACE", "halt_on_error=1")
e.setdefault("FAKECLOCK", fakeclock(target_time))
# Note: Must use exec here so that killing this process kills the command.
cmdline = "exec ./bin/boulder-janitor --config test/config-next/janitor.json"
p = subprocess.Popen(cmdline, shell=True, env=e)
# Wait for the janitor to come up
waitport(8014, "boulder-janitor", None)
def statline(statname, table):
# NOTE: we omit the trailing "}}" to make this match general enough to
# permit new labels in the future.
return "janitor_{0}{{table=\"{1}\"".format(statname, table)
def get_stat_line(port, stat):
url = "http://localhost:%d/metrics" % port
response = requests.get(url)
for l in response.content.split("\n"):
if l.strip().startswith(stat):
return l
return None
def stat_value(line):
parts = line.split(" ")
if len(parts) != 2:
raise Exception("stat line {0} was missing required parts".format(line))
return parts[1]
# Wait for the janitor to report it isn't finding new work
print("waiting for boulder-janitor work to complete...\n")
workDone = False
for i in range(10):
certStatusWorkbatch = get_stat_line(8014, statline("workbatch", "certificateStatus"))
certsWorkBatch = get_stat_line(8014, statline("workbatch", "certificates"))
certsPerNameWorkBatch = get_stat_line(8014, statline("workbatch", "certificatesPerName"))
allReady = True
for line in [certStatusWorkbatch, certsWorkBatch, certsPerNameWorkBatch]:
if stat_value(line) != "0":
allReady = False
if allReady is False:
print("not done after check {0}. Sleeping".format(i))
time.sleep(2)
else:
workDone = True
break
if workDone is False:
raise Exception("Timed out waiting for janitor to report all work completed\n")
# Check deletion stats are not empty/zero
for i in range(10):
certStatusDeletes = get_stat_line(8014, statline("deletions", "certificateStatus"))
certsDeletes = get_stat_line(8014, statline("deletions", "certificates"))
certsPerNameDeletes = get_stat_line(8014, statline("deletions", "certificatesPerName"))
if certStatusDeletes is None or certsDeletes is None or certsPerNameDeletes is None:
print("delete stats not present after check {0}. Sleeping".format(i))
time.sleep(2)
continue
for l in [certStatusDeletes, certsDeletes, certsPerNameDeletes]:
if stat_value(l) == "0":
raise Exception("Expected a non-zero number of deletes to be performed. Found {0}".format(l))
# Check that all error stats are empty
errorStats = [
statline("errors", "certificateStatus"),
statline("errors", "certificates"),
statline("errors", "certificatesPerName"),
]
for eStat in errorStats:
actual = get_stat_line(8014, eStat)
if actual is not None:
raise Exception("Expected to find no error stat lines but found {0}\n".format(eStat))
# Terminate the janitor
p.terminate()
def test_single_ocsp():
"""Run the single-ocsp command, which is used to generate OCSP responses for
intermediate certificates on a manual basis. Then start up an
@ -187,6 +273,12 @@ def main():
check_balance()
if not CONFIG_NEXT:
run_expired_authz_purger()
# Run the boulder-janitor. This should happen after all other tests because
# it runs with the fake clock set to the future and deletes rows that may
# otherwise be referenced by tests.
run_janitor()
# Run the load-generator last. run_loadtest will stop the
# pebble-challtestsrv before running the load-generator and will not restart
# it.