From d5d8b0dba8b06ae294e681481c8dfff8e0eba13e Mon Sep 17 00:00:00 2001
From: Jacob Hoffman-Andrews <github@hoffman-andrews.com>
Date: Fri, 26 May 2017 07:53:43 -0700
Subject: [PATCH] Grafana dashboard update. (#2786)

- Add OCSP graphs
- Graph overall request rate
- Separate out WFE vs OCSP graphs
- Fix challenge graph (add a / to endpoint)
- Some incidental changes to "step"
- Add a lint script to check for common dashboard mistakes
---
 .travis.yml                   |   2 +-
 test.sh                       |   8 +-
 test/grafana/boulderdash.json | 346 +++++++++++++++++++++++++++++++---
 test/grafana/lint.py          |  27 +++
 4 files changed, 360 insertions(+), 23 deletions(-)
 create mode 100644 test/grafana/lint.py

diff --git a/.travis.yml b/.travis.yml
index a0d4139f5..d3d5e41fa 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -36,7 +36,7 @@ env:
     - PATH=$HOME/bin:$PATH # protoc gets installed here
     - GO15VENDOREXPERIMENT=1
   matrix:
-    - RUN="vet fmt migrations integration godep-restore errcheck generate"
+    - RUN="vet fmt migrations integration godep-restore errcheck generate dashlint"
     # Config changes that have landed in master but not yet been applied to
     # production can be made in boulder-config-next.json.
     - RUN="integration" BOULDER_CONFIG_DIR="test/config-next"
diff --git a/test.sh b/test.sh
index 3726e48ae..58f7cfa0c 100755
--- a/test.sh
+++ b/test.sh
@@ -11,7 +11,7 @@ fi
 # defaults, because we don't want to run it locally (would be too disruptive to
 # GOPATH). We also omit coverage by default on local runs because it generates
 # artifacts on disk that aren't needed.
-RUN=${RUN:-vet fmt migrations unit integration errcheck}
+RUN=${RUN:-vet fmt migrations unit integration errcheck dashlint}
 
 # The list of segments to hard fail on, as opposed to continuing to the end of
 # the unit tests before failing.
@@ -251,4 +251,10 @@ if [[ "$RUN" =~ "rpm" ]]; then
   end_context #"rpm"
 fi
 
+if [[ "$RUN" =~ "dashlint" ]]; then
+  start_context "dashlint"
+  run python test/grafana/lint.py
+  end_context #"dashlint"
+fi
+
 exit ${FAILURE}
diff --git a/test/grafana/boulderdash.json b/test/grafana/boulderdash.json
index 38b9fce99..a9bf56851 100644
--- a/test/grafana/boulderdash.json
+++ b/test/grafana/boulderdash.json
@@ -19,7 +19,7 @@
   },
   "editable": true,
   "gnetId": null,
-  "graphTooltip": 0,
+  "graphTooltip": 2,
   "hideControls": false,
   "id": null,
   "links": [],
@@ -27,7 +27,7 @@
   "rows": [
     {
       "collapse": false,
-      "height": "250px",
+      "height": 256,
       "panels": [
         {
           "aliasColors": {},
@@ -53,13 +53,14 @@
           "points": false,
           "renderer": "flot",
           "seriesOverrides": [],
-          "span": 12,
+          "span": 6,
           "stack": false,
           "steppedLine": false,
           "targets": [
             {
-              "expr": "sum by (job) (rate(response_time_count{code!~\"50.\"}[$interval])) / sum by (job) (rate(response_time_count{}[$interval]))",
+              "expr": "sum by (instance) (rate(response_time_count{code!~\"50.\",instance=~\".*wfe.*\"}[$interval])) / sum by (instance) (rate(response_time_count{}[$interval]))",
               "intervalFactor": 2,
+              "legendFormat": "",
               "metric": "",
               "refId": "A",
               "step": 240
@@ -99,6 +100,78 @@
               "show": true
             }
           ]
+        },
+        {
+          "aliasColors": {},
+          "bars": false,
+          "datasource": "$datasource",
+          "fill": 1,
+          "id": 19,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "links": [],
+          "nullPointMode": "null",
+          "percentage": false,
+          "pointradius": 5,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "span": 6,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "sum by (instance) (rate(response_time_count[$interval]))",
+              "interval": "",
+              "intervalFactor": 2,
+              "legendFormat": "{{instance}}",
+              "refId": "A",
+              "step": 240
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeShift": null,
+          "title": "Request volume",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            },
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            }
+          ]
         }
       ],
       "repeat": null,
@@ -148,7 +221,7 @@
               "legendFormat": "{{result}}",
               "metric": "",
               "refId": "A",
-              "step": 600
+              "step": 240
             }
           ],
           "thresholds": [],
@@ -222,7 +295,7 @@
               "legendFormat": "{{result}}",
               "metric": "",
               "refId": "A",
-              "step": 600
+              "step": 240
             }
           ],
           "thresholds": [],
@@ -296,7 +369,7 @@
               "legendFormat": "{{result}}",
               "metric": "",
               "refId": "A",
-              "step": 600
+              "step": 240
             }
           ],
           "thresholds": [],
@@ -375,7 +448,8 @@
           "steppedLine": false,
           "targets": [
             {
-              "expr": "sum by (code) (rate(response_time_count{method=\"GET\"}[$interval]))",
+              "expr": "sum by (code) (rate(response_time_count{method=\"GET\",instance=~\".*wfe.*\"}[$interval]))",
+              "interval": "",
               "intervalFactor": 2,
               "legendFormat": "{{code}}",
               "metric": "response_",
@@ -386,7 +460,7 @@
           "thresholds": [],
           "timeFrom": null,
           "timeShift": null,
-          "title": "GETs per second by response code",
+          "title": "GETs per second by response code (WFE)",
           "tooltip": {
             "shared": true,
             "sort": 0,
@@ -447,7 +521,7 @@
           "steppedLine": false,
           "targets": [
             {
-              "expr": "sum by (code) (rate(response_time_count{method=\"POST\"}[$interval]))",
+              "expr": "sum by (code) (rate(response_time_count{method=\"POST\",instance=~\".*wfe.*\"}[$interval]))",
               "intervalFactor": 2,
               "legendFormat": "{{code}}",
               "refId": "A",
@@ -457,7 +531,7 @@
           "thresholds": [],
           "timeFrom": null,
           "timeShift": null,
-          "title": "POSTs per second by response code",
+          "title": "POSTs per second by response code (WFE)",
           "tooltip": {
             "shared": true,
             "sort": 0,
@@ -535,7 +609,7 @@
               "intervalFactor": 2,
               "legendFormat": "{{endpoint}}",
               "refId": "A",
-              "step": 600
+              "step": 240
             }
           ],
           "thresholds": [],
@@ -606,8 +680,9 @@
               "interval": "",
               "intervalFactor": 2,
               "legendFormat": "{{endpoint}}",
+              "metric": "response_",
               "refId": "A",
-              "step": 600
+              "step": 240
             }
           ],
           "thresholds": [],
@@ -680,7 +755,7 @@
               "legendFormat": "{{code}} {{endpoint}}",
               "metric": "",
               "refId": "A",
-              "step": 600
+              "step": 240
             }
           ],
           "thresholds": [],
@@ -760,7 +835,9 @@
           "targets": [
             {
               "expr": "sum by (code) (rate(response_time_count{endpoint=\"/acme/new-reg\"}[$interval]))",
+              "interval": "",
               "intervalFactor": 2,
+              "legendFormat": "{{code}}",
               "refId": "A",
               "step": 600
             }
@@ -787,7 +864,7 @@
               "label": null,
               "logBase": 1,
               "max": null,
-              "min": null,
+              "min": "0",
               "show": true
             },
             {
@@ -929,7 +1006,7 @@
               "label": null,
               "logBase": 1,
               "max": null,
-              "min": null,
+              "min": "0",
               "show": true
             },
             {
@@ -971,10 +1048,11 @@
           "steppedLine": false,
           "targets": [
             {
-              "expr": "sum by (code) (rate(response_time_count{method=\"POST\",endpoint=\"/acme/challenge\"}[$interval]))",
+              "expr": "sum by (code) (rate(response_time_count{method=\"POST\",endpoint=\"/acme/challenge/\"}[$interval]))",
               "interval": "",
               "intervalFactor": 2,
               "legendFormat": "{{code}}",
+              "metric": "",
               "refId": "A",
               "step": 600
             }
@@ -1001,7 +1079,7 @@
               "label": null,
               "logBase": 1,
               "max": null,
-              "min": null,
+              "min": "0",
               "show": true
             },
             {
@@ -1061,7 +1139,7 @@
               "legendFormat": "{{instance}}",
               "metric": "process_cpu_seconds_total",
               "refId": "A",
-              "step": 240
+              "step": 120
             }
           ],
           "thresholds": [],
@@ -1110,7 +1188,233 @@
     {
       "collapse": false,
       "height": 250,
-      "panels": [],
+      "panels": [
+        {
+          "aliasColors": {},
+          "bars": false,
+          "datasource": "$datasource",
+          "fill": 3,
+          "id": 16,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "links": [],
+          "nullPointMode": "null",
+          "percentage": false,
+          "pointradius": 5,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "span": 4,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "sum by (method, code) (rate(response_time_count{instance=~\".*ocsp.*\",code!=\"405\"}[$interval]))",
+              "interval": "",
+              "intervalFactor": 2,
+              "legendFormat": "{{method}}, {{code}}",
+              "metric": "",
+              "refId": "A",
+              "step": 240
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeShift": null,
+          "title": "OCSP response volume",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            },
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            }
+          ]
+        },
+        {
+          "aliasColors": {},
+          "bars": false,
+          "datasource": "$datasource",
+          "fill": 1,
+          "id": 17,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "links": [],
+          "nullPointMode": "null",
+          "percentage": false,
+          "pointradius": 5,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "span": 4,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "histogram_quantile(0.5, sum by (le, endpoint) (rate(response_time_bucket{instance=~\".*ocsp.*\"}[$interval])))",
+              "interval": "",
+              "intervalFactor": 2,
+              "legendFormat": "median",
+              "refId": "A",
+              "step": 240
+            },
+            {
+              "expr": "histogram_quantile(0.99, sum by (le, endpoint) (rate(response_time_bucket{instance=~\".*ocsp.*\"}[$interval])))",
+              "hide": false,
+              "intervalFactor": 2,
+              "legendFormat": "99th percentile",
+              "refId": "B",
+              "step": 240
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeShift": null,
+          "title": "OCSP latency",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            },
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            }
+          ]
+        },
+        {
+          "aliasColors": {},
+          "bars": false,
+          "datasource": "$datasource",
+          "fill": 1,
+          "id": 18,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "links": [],
+          "nullPointMode": "null",
+          "percentage": false,
+          "pointradius": 5,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "span": 4,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "sum by (job) (rate(response_time_count{code!~\"[45]0.\",instance=~\".*ocsp.*\"}[$interval])) / sum by (job) (rate(response_time_count{instance=~\".*ocsp.*\"}[$interval]))",
+              "interval": "",
+              "intervalFactor": 2,
+              "legendFormat": "success rate",
+              "refId": "A",
+              "step": 240
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeShift": null,
+          "title": "OCSP success rate (excluding 400s)",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            },
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            }
+          ]
+        }
+      ],
       "repeat": null,
       "repeatIteration": null,
       "repeatRowId": null,
@@ -1244,5 +1548,5 @@
   },
   "timezone": "utc",
   "title": "Boulderdash",
-  "version": 6
+  "version": 25
 }
diff --git a/test/grafana/lint.py b/test/grafana/lint.py
new file mode 100644
index 000000000..531fe4049
--- /dev/null
+++ b/test/grafana/lint.py
@@ -0,0 +1,27 @@
+#!/usr/bin/python
+# Check the boulderdash.json dashboard for common export mistakes, like
+# forgetting to templatize a datasource. A failed check raises AssertionError.
+import json
+import os
+with open(os.path.join(os.path.dirname(os.path.realpath(__file__)),
+    "boulderdash.json")) as f:
+    dashboard = json.load(f)
+
+# When exporting, the current value of templated variables is saved. We don't
+# want to save a specific value for datasource, since that's
+# deployment-specific, so we ensure that the dashboard was exported with the
+# datasource template variable set to "default".
+for li in dashboard["templating"]["list"]:
+    if li["type"] == "datasource":
+        assert li["current"]["value"] == "default", 'datasource not "default"'
+
+# Additionally, ensure each panel's datasource is using the template variable
+# rather than a hardcoded datasource. Grafana will choose a hardcoded
+# datasource on new panels by default, so this is an easy mistake to make.
+for ro in dashboard["rows"]:
+    for pa in ro["panels"]:
+        assert pa["datasource"] == "$datasource", "panel %r" % pa.get("title")
+
+# It seems that __inputs is non-empty when template variables at the top of the
+# dashboard have been modified from the defaults; check for that.
+assert len(dashboard["__inputs"]) == 0, "__inputs must be empty"