Grafana dashboard update. (#2786)

- Add OCSP graphs
- Graph overall request rate
- Separate out WFE vs OCSP graphs
- Fix challenge graph (add a / to endpoint)
- Some incidental changes to "step"
- Add a lint script to check for common dashboard mistakes
2017-05-26 07:53:43 -07:00 · 2017-05-26 07:53:43 -07:00 · d5d8b0dba8
parent 8fe0697b9a
commit d5d8b0dba8
4 changed files with 360 additions and 23 deletions
--- a/.travis.yml
+++ b/.travis.yml
@@ -36,7 +36,7 @@ env:
    - PATH=$HOME/bin:$PATH # protoc gets installed here
    - GO15VENDOREXPERIMENT=1
  matrix:
-    - RUN="vet fmt migrations integration godep-restore errcheck generate"
+    - RUN="vet fmt migrations integration godep-restore errcheck generate dashlint"
    # Config changes that have landed in master but not yet been applied to
    # production can be made in boulder-config-next.json.
    - RUN="integration" BOULDER_CONFIG_DIR="test/config-next"
--- a/test.sh
+++ b/test.sh
@@ -11,7 +11,7 @@ fi
 # defaults, because we don't want to run it locally (would be too disruptive to
 # GOPATH). We also omit coverage by default on local runs because it generates
 # artifacts on disk that aren't needed.
-RUN=${RUN:-vet fmt migrations unit integration errcheck}
+RUN=${RUN:-vet fmt migrations unit integration errcheck dashlint}

 # The list of segments to hard fail on, as opposed to continuing to the end of
 # the unit tests before failing.
@@ -251,4 +251,10 @@ if [[ "$RUN" =~ "rpm" ]]; then
  end_context #"rpm"
 fi

+if [[ "$RUN" =~ "dashlint" ]]; then
+  start_context "dashlint"
+  run python test/grafana/lint.py
+  end_context #"dashlint"
+fi
+
 exit ${FAILURE}
--- a/test/grafana/boulderdash.json
+++ b/test/grafana/boulderdash.json
@@ -19,7 +19,7 @@
  },
  "editable": true,
  "gnetId": null,
-  "graphTooltip": 0,
+  "graphTooltip": 2,
  "hideControls": false,
  "id": null,
  "links": [],
@@ -27,7 +27,7 @@
  "rows": [
    {
      "collapse": false,
-      "height": "250px",
+      "height": 256,
      "panels": [
        {
          "aliasColors": {},
@@ -53,13 +53,14 @@
          "points": false,
          "renderer": "flot",
          "seriesOverrides": [],
-          "span": 12,
+          "span": 6,
          "stack": false,
          "steppedLine": false,
          "targets": [
            {
-              "expr": "sum by (job) (rate(response_time_count{code!~\"50.\"}[$interval])) / sum by (job) (rate(response_time_count{}[$interval]))",
+              "expr": "sum by (instance) (rate(response_time_count{code!~\"50.\",instance=~\".*wfe.*\"}[$interval])) / sum by (instance) (rate(response_time_count{}[$interval]))",
              "intervalFactor": 2,
+              "legendFormat": "",
              "metric": "",
              "refId": "A",
              "step": 240
@@ -99,6 +100,78 @@
              "show": true
            }
          ]
+        },
+        {
+          "aliasColors": {},
+          "bars": false,
+          "datasource": "$datasource",
+          "fill": 1,
+          "id": 19,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "links": [],
+          "nullPointMode": "null",
+          "percentage": false,
+          "pointradius": 5,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "span": 6,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "sum by (instance) (rate(response_time_count[$interval]))",
+              "interval": "",
+              "intervalFactor": 2,
+              "legendFormat": "{{instance}}",
+              "refId": "A",
+              "step": 240
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeShift": null,
+          "title": "Request volume",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            },
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            }
+          ]
        }
      ],
      "repeat": null,
@@ -148,7 +221,7 @@
              "legendFormat": "{{result}}",
              "metric": "",
              "refId": "A",
-              "step": 600
+              "step": 240
            }
          ],
          "thresholds": [],
@@ -222,7 +295,7 @@
              "legendFormat": "{{result}}",
              "metric": "",
              "refId": "A",
-              "step": 600
+              "step": 240
            }
          ],
          "thresholds": [],
@@ -296,7 +369,7 @@
              "legendFormat": "{{result}}",
              "metric": "",
              "refId": "A",
-              "step": 600
+              "step": 240
            }
          ],
          "thresholds": [],
@@ -375,7 +448,8 @@
          "steppedLine": false,
          "targets": [
            {
-              "expr": "sum by (code) (rate(response_time_count{method=\"GET\"}[$interval]))",
+              "expr": "sum by (code) (rate(response_time_count{method=\"GET\",instance=~\".*wfe.*\"}[$interval]))",
+              "interval": "",
              "intervalFactor": 2,
              "legendFormat": "{{code}}",
              "metric": "response_",
@@ -386,7 +460,7 @@
          "thresholds": [],
          "timeFrom": null,
          "timeShift": null,
-          "title": "GETs per second by response code",
+          "title": "GETs per second by response code (WFE)",
          "tooltip": {
            "shared": true,
            "sort": 0,
@@ -447,7 +521,7 @@
          "steppedLine": false,
          "targets": [
            {
-              "expr": "sum by (code) (rate(response_time_count{method=\"POST\"}[$interval]))",
+              "expr": "sum by (code) (rate(response_time_count{method=\"POST\",instance=~\".*wfe.*\"}[$interval]))",
              "intervalFactor": 2,
              "legendFormat": "{{code}}",
              "refId": "A",
@@ -457,7 +531,7 @@
          "thresholds": [],
          "timeFrom": null,
          "timeShift": null,
-          "title": "POSTs per second by response code",
+          "title": "POSTs per second by response code (WFE)",
          "tooltip": {
            "shared": true,
            "sort": 0,
@@ -535,7 +609,7 @@
              "intervalFactor": 2,
              "legendFormat": "{{endpoint}}",
              "refId": "A",
-              "step": 600
+              "step": 240
            }
          ],
          "thresholds": [],
@@ -606,8 +680,9 @@
              "interval": "",
              "intervalFactor": 2,
              "legendFormat": "{{endpoint}}",
+              "metric": "response_",
              "refId": "A",
-              "step": 600
+              "step": 240
            }
          ],
          "thresholds": [],
@@ -680,7 +755,7 @@
              "legendFormat": "{{code}} {{endpoint}}",
              "metric": "",
              "refId": "A",
-              "step": 600
+              "step": 240
            }
          ],
          "thresholds": [],
@@ -760,7 +835,9 @@
          "targets": [
            {
              "expr": "sum by (code) (rate(response_time_count{endpoint=\"/acme/new-reg\"}[$interval]))",
+              "interval": "",
              "intervalFactor": 2,
+              "legendFormat": "{{code}}",
              "refId": "A",
              "step": 600
            }
@@ -787,7 +864,7 @@
              "label": null,
              "logBase": 1,
              "max": null,
-              "min": null,
+              "min": "0",
              "show": true
            },
            {
@@ -929,7 +1006,7 @@
              "label": null,
              "logBase": 1,
              "max": null,
-              "min": null,
+              "min": "0",
              "show": true
            },
            {
@@ -971,10 +1048,11 @@
          "steppedLine": false,
          "targets": [
            {
-              "expr": "sum by (code) (rate(response_time_count{method=\"POST\",endpoint=\"/acme/challenge\"}[$interval]))",
+              "expr": "sum by (code) (rate(response_time_count{method=\"POST\",endpoint=\"/acme/challenge/\"}[$interval]))",
              "interval": "",
              "intervalFactor": 2,
              "legendFormat": "{{code}}",
+              "metric": "",
              "refId": "A",
              "step": 600
            }
@@ -1001,7 +1079,7 @@
              "label": null,
              "logBase": 1,
              "max": null,
-              "min": null,
+              "min": "0",
              "show": true
            },
            {
@@ -1061,7 +1139,7 @@
              "legendFormat": "{{instance}}",
              "metric": "process_cpu_seconds_total",
              "refId": "A",
-              "step": 240
+              "step": 120
            }
          ],
          "thresholds": [],
@@ -1110,7 +1188,233 @@
    {
      "collapse": false,
      "height": 250,
-      "panels": [],
+      "panels": [
+        {
+          "aliasColors": {},
+          "bars": false,
+          "datasource": "$datasource",
+          "fill": 3,
+          "id": 16,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "links": [],
+          "nullPointMode": "null",
+          "percentage": false,
+          "pointradius": 5,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "span": 4,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "sum by (method, code) (rate(response_time_count{instance=~\".*ocsp.*\",code!=\"405\"}[$interval]))",
+              "interval": "",
+              "intervalFactor": 2,
+              "legendFormat": "{{method}}, {{code}}",
+              "metric": "",
+              "refId": "A",
+              "step": 240
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeShift": null,
+          "title": "OCSP response volume",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            },
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            }
+          ]
+        },
+        {
+          "aliasColors": {},
+          "bars": false,
+          "datasource": "$datasource",
+          "fill": 1,
+          "id": 17,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "links": [],
+          "nullPointMode": "null",
+          "percentage": false,
+          "pointradius": 5,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "span": 4,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "histogram_quantile(0.5, sum by (le, endpoint) (rate(response_time_bucket{instance=~\".*ocsp.*\"}[$interval])))",
+              "interval": "",
+              "intervalFactor": 2,
+              "legendFormat": "median",
+              "refId": "A",
+              "step": 240
+            },
+            {
+              "expr": "histogram_quantile(0.99, sum by (le, endpoint) (rate(response_time_bucket{instance=~\".*ocsp.*\"}[$interval])))",
+              "hide": false,
+              "intervalFactor": 2,
+              "legendFormat": "99th percentile",
+              "refId": "B",
+              "step": 240
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeShift": null,
+          "title": "OCSP latency",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            },
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            }
+          ]
+        },
+        {
+          "aliasColors": {},
+          "bars": false,
+          "datasource": "$datasource",
+          "fill": 1,
+          "id": 18,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "links": [],
+          "nullPointMode": "null",
+          "percentage": false,
+          "pointradius": 5,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "span": 4,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "sum by (job) (rate(response_time_count{code!~\"[45]0.\",instance=~\".*ocsp.*\"}[$interval])) / sum by (job) (rate(response_time_count{instance=~\".*ocsp.*\"}[$interval]))",
+              "interval": "",
+              "intervalFactor": 2,
+              "legendFormat": "success rate",
+              "refId": "A",
+              "step": 240
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeShift": null,
+          "title": "OCSP success rate (excluding 400s)",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            },
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            }
+          ]
+        }
+      ],
      "repeat": null,
      "repeatIteration": null,
      "repeatRowId": null,
@@ -1244,5 +1548,5 @@
  },
  "timezone": "utc",
  "title": "Boulderdash",
-  "version": 6
+  "version": 25
 }
--- a/test/grafana/lint.py
+++ b/test/grafana/lint.py
@@ -0,0 +1,27 @@
+#!/usr/bin/python
+# Check boulderdash.json (next to this script) for common export mistakes, like
+# forgetting to templatize a datasource. Any failed check raises AssertionError.
+import json
+import os
+with open(os.path.join(os.path.dirname(os.path.realpath(__file__)),
+    "boulderdash.json")) as f:
+    dashboard = json.load(f)
+
+# When exporting, the current value of templated variables is saved. We don't
+# want to save a specific value for datasource, since that's
+# deployment-specific, so we ensure that the dashboard was exported with the
+# datasource template variable set to "default".
+for li in dashboard["templating"]["list"]:
+    if li["type"] == "datasource":
+        assert(li["current"]["value"] == "default")
+
+# Additionally, ensure each panel's datasource is using the template variable
+# rather than a hardcoded datasource. Grafana will choose a hardcoded
+# datasource on new panels by default, so this is an easy mistake to make.
+for ro in dashboard["rows"]:
+    for pa in ro["panels"]:
+        assert(pa["datasource"] == "$datasource")
+
+# It seems that __inputs is non-empty when template variables at the top of the
+# dashboard have been modified from the defaults; check for that.
+assert(len(dashboard["__inputs"]) == 0)