From d5d8b0dba8b06ae294e681481c8dfff8e0eba13e Mon Sep 17 00:00:00 2001 From: Jacob Hoffman-Andrews Date: Fri, 26 May 2017 07:53:43 -0700 Subject: [PATCH] Grafana dashboard update. (#2786) - Add OCSP graphs - Graph overall request rate - Separate out WFE vs OCSP graphs - Fix challenge graph (add a / to endpoint) - Some incidental changes to "step" - Add a lint script to check for common dashboard mistakes --- .travis.yml | 2 +- test.sh | 8 +- test/grafana/boulderdash.json | 346 +++++++++++++++++++++++++++++++--- test/grafana/lint.py | 27 +++ 4 files changed, 360 insertions(+), 23 deletions(-) create mode 100644 test/grafana/lint.py diff --git a/.travis.yml b/.travis.yml index a0d4139f5..d3d5e41fa 100644 --- a/.travis.yml +++ b/.travis.yml @@ -36,7 +36,7 @@ env: - PATH=$HOME/bin:$PATH # protoc gets installed here - GO15VENDOREXPERIMENT=1 matrix: - - RUN="vet fmt migrations integration godep-restore errcheck generate" + - RUN="vet fmt migrations integration godep-restore errcheck generate dashlint" # Config changes that have landed in master but not yet been applied to # production can be made in boulder-config-next.json. - RUN="integration" BOULDER_CONFIG_DIR="test/config-next" diff --git a/test.sh b/test.sh index 3726e48ae..58f7cfa0c 100755 --- a/test.sh +++ b/test.sh @@ -11,7 +11,7 @@ fi # defaults, because we don't want to run it locally (would be too disruptive to # GOPATH). We also omit coverage by default on local runs because it generates # artifacts on disk that aren't needed. -RUN=${RUN:-vet fmt migrations unit integration errcheck} +RUN=${RUN:-vet fmt migrations unit integration errcheck dashlint} # The list of segments to hard fail on, as opposed to continuing to the end of # the unit tests before failing. 
@@ -251,4 +251,10 @@ if [[ "$RUN" =~ "rpm" ]]; then end_context #"rpm" fi +if [[ "$RUN" =~ "dashlint" ]]; then + start_context "dashlint" + run python test/grafana/lint.py + end_context #"dashlint" +fi + exit ${FAILURE} diff --git a/test/grafana/boulderdash.json b/test/grafana/boulderdash.json index 38b9fce99..a9bf56851 100644 --- a/test/grafana/boulderdash.json +++ b/test/grafana/boulderdash.json @@ -19,7 +19,7 @@ }, "editable": true, "gnetId": null, - "graphTooltip": 0, + "graphTooltip": 2, "hideControls": false, "id": null, "links": [], @@ -27,7 +27,7 @@ "rows": [ { "collapse": false, - "height": "250px", + "height": 256, "panels": [ { "aliasColors": {}, @@ -53,13 +53,14 @@ "points": false, "renderer": "flot", "seriesOverrides": [], - "span": 12, + "span": 6, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum by (job) (rate(response_time_count{code!~\"50.\"}[$interval])) / sum by (job) (rate(response_time_count{}[$interval]))", + "expr": "sum by (instance) (rate(response_time_count{code!~\"50.\",instance=~\".*wfe.*\"}[$interval])) / sum by (instance) (rate(response_time_count{}[$interval]))", "intervalFactor": 2, + "legendFormat": "", "metric": "", "refId": "A", "step": 240 @@ -99,6 +100,78 @@ "show": true } ] + }, + { + "aliasColors": {}, + "bars": false, + "datasource": "$datasource", + "fill": 1, + "id": 19, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (instance) (rate(response_time_count[$interval]))", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + 
"title": "Request volume", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] } ], "repeat": null, @@ -148,7 +221,7 @@ "legendFormat": "{{result}}", "metric": "", "refId": "A", - "step": 600 + "step": 240 } ], "thresholds": [], @@ -222,7 +295,7 @@ "legendFormat": "{{result}}", "metric": "", "refId": "A", - "step": 600 + "step": 240 } ], "thresholds": [], @@ -296,7 +369,7 @@ "legendFormat": "{{result}}", "metric": "", "refId": "A", - "step": 600 + "step": 240 } ], "thresholds": [], @@ -375,7 +448,8 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (code) (rate(response_time_count{method=\"GET\"}[$interval]))", + "expr": "sum by (code) (rate(response_time_count{method=\"GET\",instance=~\".*wfe.*\"}[$interval]))", + "interval": "", "intervalFactor": 2, "legendFormat": "{{code}}", "metric": "response_", @@ -386,7 +460,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "GETs per second by response code", + "title": "GETs per second by response code (WFE)", "tooltip": { "shared": true, "sort": 0, @@ -447,7 +521,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (code) (rate(response_time_count{method=\"POST\"}[$interval]))", + "expr": "sum by (code) (rate(response_time_count{method=\"POST\",instance=~\".*wfe.*\"}[$interval]))", "intervalFactor": 2, "legendFormat": "{{code}}", "refId": "A", @@ -457,7 +531,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "POSTs per second by response code", + "title": "POSTs per second by response code (WFE)", "tooltip": { "shared": true, "sort": 0, @@ -535,7 +609,7 @@ "intervalFactor": 2, "legendFormat": "{{endpoint}}", 
"refId": "A", - "step": 600 + "step": 240 } ], "thresholds": [], @@ -606,8 +680,9 @@ "interval": "", "intervalFactor": 2, "legendFormat": "{{endpoint}}", + "metric": "response_", "refId": "A", - "step": 600 + "step": 240 } ], "thresholds": [], @@ -680,7 +755,7 @@ "legendFormat": "{{code}} {{endpoint}}", "metric": "", "refId": "A", - "step": 600 + "step": 240 } ], "thresholds": [], @@ -760,7 +835,9 @@ "targets": [ { "expr": "sum by (code) (rate(response_time_count{endpoint=\"/acme/new-reg\"}[$interval]))", + "interval": "", "intervalFactor": 2, + "legendFormat": "{{code}}", "refId": "A", "step": 600 } @@ -787,7 +864,7 @@ "label": null, "logBase": 1, "max": null, - "min": null, + "min": "0", "show": true }, { @@ -929,7 +1006,7 @@ "label": null, "logBase": 1, "max": null, - "min": null, + "min": "0", "show": true }, { @@ -971,10 +1048,11 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (code) (rate(response_time_count{method=\"POST\",endpoint=\"/acme/challenge\"}[$interval]))", + "expr": "sum by (code) (rate(response_time_count{method=\"POST\",endpoint=\"/acme/challenge/\"}[$interval]))", "interval": "", "intervalFactor": 2, "legendFormat": "{{code}}", + "metric": "", "refId": "A", "step": 600 } @@ -1001,7 +1079,7 @@ "label": null, "logBase": 1, "max": null, - "min": null, + "min": "0", "show": true }, { @@ -1061,7 +1139,7 @@ "legendFormat": "{{instance}}", "metric": "process_cpu_seconds_total", "refId": "A", - "step": 240 + "step": 120 } ], "thresholds": [], @@ -1110,7 +1188,233 @@ { "collapse": false, "height": 250, - "panels": [], + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": "$datasource", + "fill": 3, + "id": 16, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": 
[], + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (method, code) (rate(response_time_count{instance=~\".*ocsp.*\",code!=\"405\"}[$interval]))", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{method}}, {{code}}", + "metric": "", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "OCSP response volume", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "datasource": "$datasource", + "fill": 1, + "id": 17, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.5, sum by (le, endpoint) (rate(response_time_bucket{instance=~\".*ocsp.*\"}[$interval])))", + "interval": "", + "intervalFactor": 2, + "legendFormat": "median", + "refId": "A", + "step": 240 + }, + { + "expr": "histogram_quantile(0.99, sum by (le, endpoint) (rate(response_time_bucket{instance=~\".*ocsp.*\"}[$interval])))", + "hide": false, + "intervalFactor": 2, + "legendFormat": "99th percentile", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "OCSP latency", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", 
+ "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "datasource": "$datasource", + "fill": 1, + "id": 18, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job) (rate(response_time_count{code!~\"[45]0.\",instance=~\".*ocsp.*\"}[$interval])) / sum by (job) (rate(response_time_count{instance=~\".*ocsp.*\"}[$interval]))", + "interval": "", + "intervalFactor": 2, + "legendFormat": "success rate", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "OCSP success rate (excluding 400s)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], "repeat": null, "repeatIteration": null, "repeatRowId": null, @@ -1244,5 +1548,5 @@ }, "timezone": "utc", "title": "Boulderdash", - "version": 6 + "version": 25 } diff --git a/test/grafana/lint.py b/test/grafana/lint.py new file mode 100644 index 000000000..531fe4049 --- /dev/null +++ b/test/grafana/lint.py @@ -0,0 +1,27 @@ +#!/usr/bin/python +# 
Check dashboard JSON files for common errors, like forgetting to templatize a +# datasource. +import json +import os +with open(os.path.join(os.path.dirname(os.path.realpath(__file__)), + "boulderdash.json")) as f: + dashboard = json.load(f) + +# When exporting, the current value of templated variables is saved. We don't +# want to save a specific value for datasource, since that's +# deployment-specific, so we ensure that the dashboard was exported with the +# datasource template variable set to "default". +for li in dashboard["templating"]["list"]: + if li["type"] == "datasource": + assert(li["current"]["value"] == "default") + +# Additionally, ensure each panel's datasource is using the template variable +# rather than a hardcoded datasource. Grafana will choose a hardcoded +# datasource on new panels by default, so this is an easy mistake to make. +for ro in dashboard["rows"]: + for pa in ro["panels"]: + assert(pa["datasource"] == "$datasource") + +# It seems that __inputs is non-empty when template variables at the top of the +# dashboard have been modified from the defaults; check for that. +assert(len(dashboard["__inputs"]) == 0)