Grafana dashboard update. (#2786)

- Add OCSP graphs
- Graph overall request rate
- Separate out WFE vs OCSP graphs
- Fix challenge graph (add a / to endpoint)
- Some incidental changes to "step"
- Add a lint script to check for common dashboard mistakes
This commit is contained in:
Jacob Hoffman-Andrews 2017-05-26 07:53:43 -07:00 committed by Daniel McCarney
parent 8fe0697b9a
commit d5d8b0dba8
4 changed files with 360 additions and 23 deletions

View File

@ -36,7 +36,7 @@ env:
- PATH=$HOME/bin:$PATH # protoc gets installed here
- GO15VENDOREXPERIMENT=1
matrix:
- RUN="vet fmt migrations integration godep-restore errcheck generate"
- RUN="vet fmt migrations integration godep-restore errcheck generate dashlint"
# Config changes that have landed in master but not yet been applied to
# production can be made in boulder-config-next.json.
- RUN="integration" BOULDER_CONFIG_DIR="test/config-next"

View File

@ -11,7 +11,7 @@ fi
# defaults, because we don't want to run it locally (would be too disruptive to
# GOPATH). We also omit coverage by default on local runs because it generates
# artifacts on disk that aren't needed.
RUN=${RUN:-vet fmt migrations unit integration errcheck}
RUN=${RUN:-vet fmt migrations unit integration errcheck dashlint}
# The list of segments to hard fail on, as opposed to continuing to the end of
# the unit tests before failing.
@ -251,4 +251,10 @@ if [[ "$RUN" =~ "rpm" ]]; then
end_context #"rpm"
fi
# Lint the Grafana dashboard JSON whenever $RUN contains the "dashlint"
# segment name anywhere in its value.
if [[ "$RUN" == *"dashlint"* ]]; then
  start_context "dashlint"
  run python test/grafana/lint.py
  end_context #"dashlint"
fi
exit ${FAILURE}

View File

@ -19,7 +19,7 @@
},
"editable": true,
"gnetId": null,
"graphTooltip": 0,
"graphTooltip": 2,
"hideControls": false,
"id": null,
"links": [],
@ -27,7 +27,7 @@
"rows": [
{
"collapse": false,
"height": "250px",
"height": 256,
"panels": [
{
"aliasColors": {},
@ -53,13 +53,14 @@
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 12,
"span": 6,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum by (job) (rate(response_time_count{code!~\"50.\"}[$interval])) / sum by (job) (rate(response_time_count{}[$interval]))",
"expr": "sum by (instance) (rate(response_time_count{code!~\"50.\",instance=~\".*wfe.*\"}[$interval])) / sum by (instance) (rate(response_time_count{}[$interval]))",
"intervalFactor": 2,
"legendFormat": "",
"metric": "",
"refId": "A",
"step": 240
@ -99,6 +100,78 @@
"show": true
}
]
},
{
"aliasColors": {},
"bars": false,
"datasource": "$datasource",
"fill": 1,
"id": 19,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 6,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum by (instance) (rate(response_time_count[$interval]))",
"interval": "",
"intervalFactor": 2,
"legendFormat": "{{instance}}",
"refId": "A",
"step": 240
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "Request volume",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
}
],
"repeat": null,
@ -148,7 +221,7 @@
"legendFormat": "{{result}}",
"metric": "",
"refId": "A",
"step": 600
"step": 240
}
],
"thresholds": [],
@ -222,7 +295,7 @@
"legendFormat": "{{result}}",
"metric": "",
"refId": "A",
"step": 600
"step": 240
}
],
"thresholds": [],
@ -296,7 +369,7 @@
"legendFormat": "{{result}}",
"metric": "",
"refId": "A",
"step": 600
"step": 240
}
],
"thresholds": [],
@ -375,7 +448,8 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by (code) (rate(response_time_count{method=\"GET\"}[$interval]))",
"expr": "sum by (code) (rate(response_time_count{method=\"GET\",instance=~\".*wfe.*\"}[$interval]))",
"interval": "",
"intervalFactor": 2,
"legendFormat": "{{code}}",
"metric": "response_",
@ -386,7 +460,7 @@
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "GETs per second by response code",
"title": "GETs per second by response code (WFE)",
"tooltip": {
"shared": true,
"sort": 0,
@ -447,7 +521,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by (code) (rate(response_time_count{method=\"POST\"}[$interval]))",
"expr": "sum by (code) (rate(response_time_count{method=\"POST\",instance=~\".*wfe.*\"}[$interval]))",
"intervalFactor": 2,
"legendFormat": "{{code}}",
"refId": "A",
@ -457,7 +531,7 @@
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "POSTs per second by response code",
"title": "POSTs per second by response code (WFE)",
"tooltip": {
"shared": true,
"sort": 0,
@ -535,7 +609,7 @@
"intervalFactor": 2,
"legendFormat": "{{endpoint}}",
"refId": "A",
"step": 600
"step": 240
}
],
"thresholds": [],
@ -606,8 +680,9 @@
"interval": "",
"intervalFactor": 2,
"legendFormat": "{{endpoint}}",
"metric": "response_",
"refId": "A",
"step": 600
"step": 240
}
],
"thresholds": [],
@ -680,7 +755,7 @@
"legendFormat": "{{code}} {{endpoint}}",
"metric": "",
"refId": "A",
"step": 600
"step": 240
}
],
"thresholds": [],
@ -760,7 +835,9 @@
"targets": [
{
"expr": "sum by (code) (rate(response_time_count{endpoint=\"/acme/new-reg\"}[$interval]))",
"interval": "",
"intervalFactor": 2,
"legendFormat": "{{code}}",
"refId": "A",
"step": 600
}
@ -787,7 +864,7 @@
"label": null,
"logBase": 1,
"max": null,
"min": null,
"min": "0",
"show": true
},
{
@ -929,7 +1006,7 @@
"label": null,
"logBase": 1,
"max": null,
"min": null,
"min": "0",
"show": true
},
{
@ -971,10 +1048,11 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by (code) (rate(response_time_count{method=\"POST\",endpoint=\"/acme/challenge\"}[$interval]))",
"expr": "sum by (code) (rate(response_time_count{method=\"POST\",endpoint=\"/acme/challenge/\"}[$interval]))",
"interval": "",
"intervalFactor": 2,
"legendFormat": "{{code}}",
"metric": "",
"refId": "A",
"step": 600
}
@ -1001,7 +1079,7 @@
"label": null,
"logBase": 1,
"max": null,
"min": null,
"min": "0",
"show": true
},
{
@ -1061,7 +1139,7 @@
"legendFormat": "{{instance}}",
"metric": "process_cpu_seconds_total",
"refId": "A",
"step": 240
"step": 120
}
],
"thresholds": [],
@ -1110,7 +1188,233 @@
{
"collapse": false,
"height": 250,
"panels": [],
"panels": [
{
"aliasColors": {},
"bars": false,
"datasource": "$datasource",
"fill": 3,
"id": 16,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 4,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum by (method, code) (rate(response_time_count{instance=~\".*ocsp.*\",code!=\"405\"}[$interval]))",
"interval": "",
"intervalFactor": 2,
"legendFormat": "{{method}}, {{code}}",
"metric": "",
"refId": "A",
"step": 240
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "OCSP response volume",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
},
{
"aliasColors": {},
"bars": false,
"datasource": "$datasource",
"fill": 1,
"id": 17,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 4,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "histogram_quantile(0.5, sum by (le, endpoint) (rate(response_time_bucket{instance=~\".*ocsp.*\"}[$interval])))",
"interval": "",
"intervalFactor": 2,
"legendFormat": "median",
"refId": "A",
"step": 240
},
{
"expr": "histogram_quantile(0.99, sum by (le, endpoint) (rate(response_time_bucket{instance=~\".*ocsp.*\"}[$interval])))",
"hide": false,
"intervalFactor": 2,
"legendFormat": "99th percentile",
"refId": "B",
"step": 240
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "OCSP latency",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
},
{
"aliasColors": {},
"bars": false,
"datasource": "$datasource",
"fill": 1,
"id": 18,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 4,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum by (job) (rate(response_time_count{code!~\"[45]0.\",instance=~\".*ocsp.*\"}[$interval])) / sum by (job) (rate(response_time_count{instance=~\".*ocsp.*\"}[$interval]))",
"interval": "",
"intervalFactor": 2,
"legendFormat": "success rate",
"refId": "A",
"step": 240
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "OCSP success rate (excluding 400s)",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
@ -1244,5 +1548,5 @@
},
"timezone": "utc",
"title": "Boulderdash",
"version": 6
"version": 25
}

27
test/grafana/lint.py Normal file
View File

@ -0,0 +1,27 @@
#!/usr/bin/python
"""Check dashboard JSON files for common errors, like forgetting to templatize a
datasource.

When run as a script, this loads boulderdash.json from the directory containing
this file, prints every problem found to stderr, and exits non-zero if there
was at least one.
"""
import json
import os
import sys


def lint_dashboard(dashboard):
    """Return a list of problem descriptions for a parsed dashboard export.

    `dashboard` is the dict produced by json.load() on an exported dashboard.
    An empty list means no problems were found. A KeyError is raised if the
    "templating", "rows" or "__inputs" sections are missing entirely.

    Problems are collected rather than asserted so that the check still works
    under `python -O` (which strips assert statements) and so that a single
    run reports every offending panel instead of stopping at the first one.
    """
    problems = []

    # When exporting, the current value of templated variables is saved. We
    # don't want to save a specific value for datasource, since that's
    # deployment-specific, so we ensure that the dashboard was exported with
    # the datasource template variable set to "default".
    for variable in dashboard["templating"]["list"]:
        if variable["type"] != "datasource":
            continue
        value = variable["current"]["value"]
        if value != "default":
            problems.append(
                "datasource template variable is set to %r; "
                "re-export with it set to \"default\"" % (value,))

    # Additionally, ensure each panel's datasource is using the template
    # variable rather than a hardcoded datasource. Grafana will choose a
    # hardcoded datasource on new panels by default, so this is an easy
    # mistake to make. A panel with no "datasource" key at all is reported
    # as a problem too.
    for row in dashboard["rows"]:
        for panel in row["panels"]:
            datasource = panel.get("datasource")
            if datasource != "$datasource":
                problems.append(
                    "panel %r (id %r) uses datasource %r instead of "
                    "\"$datasource\"" % (
                        panel.get("title"), panel.get("id"), datasource))

    # It seems that __inputs is non-empty when template variables at the top
    # of the dashboard have been modified from the defaults; check for that.
    if len(dashboard["__inputs"]) != 0:
        problems.append(
            "__inputs is non-empty; template variables were probably changed "
            "from their defaults before exporting")

    return problems


def main():
    """Lint boulderdash.json next to this script; return a process exit code."""
    path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                        "boulderdash.json")
    with open(path) as f:
        dashboard = json.load(f)

    problems = lint_dashboard(dashboard)
    for problem in problems:
        sys.stderr.write("%s: %s\n" % (path, problem))
    return 1 if problems else 0


if __name__ == "__main__":
    sys.exit(main())