Grafana dashboard update. (#2786)
- Add OCSP graphs - Graph overall request rate - Separate out WFE vs OCSP graphs - Fix challenge graph (add a / to endpoint) - Some incidental changes to "step" - Add a lint script to check for common dashboard mistakes
This commit is contained in:
parent
8fe0697b9a
commit
d5d8b0dba8
|
@ -36,7 +36,7 @@ env:
|
|||
- PATH=$HOME/bin:$PATH # protoc gets installed here
|
||||
- GO15VENDOREXPERIMENT=1
|
||||
matrix:
|
||||
- RUN="vet fmt migrations integration godep-restore errcheck generate"
|
||||
- RUN="vet fmt migrations integration godep-restore errcheck generate dashlint"
|
||||
# Config changes that have landed in master but not yet been applied to
|
||||
# production can be made in boulder-config-next.json.
|
||||
- RUN="integration" BOULDER_CONFIG_DIR="test/config-next"
|
||||
|
|
8
test.sh
8
test.sh
|
@ -11,7 +11,7 @@ fi
|
|||
# defaults, because we don't want to run it locally (would be too disruptive to
|
||||
# GOPATH). We also omit coverage by default on local runs because it generates
|
||||
# artifacts on disk that aren't needed.
|
||||
RUN=${RUN:-vet fmt migrations unit integration errcheck}
|
||||
RUN=${RUN:-vet fmt migrations unit integration errcheck dashlint}
|
||||
|
||||
# The list of segments to hard fail on, as opposed to continuing to the end of
|
||||
# the unit tests before failing.
|
||||
|
@ -251,4 +251,10 @@ if [[ "$RUN" =~ "rpm" ]]; then
|
|||
end_context #"rpm"
|
||||
fi
|
||||
|
||||
if [[ "$RUN" =~ "dashlint" ]]; then
|
||||
start_context "dashlint"
|
||||
run python test/grafana/lint.py
|
||||
end_context #"dashlint"
|
||||
fi
|
||||
|
||||
exit ${FAILURE}
|
||||
|
|
|
@ -19,7 +19,7 @@
|
|||
},
|
||||
"editable": true,
|
||||
"gnetId": null,
|
||||
"graphTooltip": 0,
|
||||
"graphTooltip": 2,
|
||||
"hideControls": false,
|
||||
"id": null,
|
||||
"links": [],
|
||||
|
@ -27,7 +27,7 @@
|
|||
"rows": [
|
||||
{
|
||||
"collapse": false,
|
||||
"height": "250px",
|
||||
"height": 256,
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": {},
|
||||
|
@ -53,13 +53,14 @@
|
|||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [],
|
||||
"span": 12,
|
||||
"span": 6,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by (job) (rate(response_time_count{code!~\"50.\"}[$interval])) / sum by (job) (rate(response_time_count{}[$interval]))",
|
||||
"expr": "sum by (instance) (rate(response_time_count{code!~\"50.\",instance=~\".*wfe.*\"}[$interval])) / sum by (instance) (rate(response_time_count{}[$interval]))",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "",
|
||||
"metric": "",
|
||||
"refId": "A",
|
||||
"step": 240
|
||||
|
@ -99,6 +100,78 @@
|
|||
"show": true
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"aliasColors": {},
|
||||
"bars": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"id": 19,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [],
|
||||
"nullPointMode": "null",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [],
|
||||
"span": 6,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by (instance) (rate(response_time_count[$interval]))",
|
||||
"interval": "",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{instance}}",
|
||||
"refId": "A",
|
||||
"step": 240
|
||||
}
|
||||
],
|
||||
"thresholds": [],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Request volume",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": []
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"repeat": null,
|
||||
|
@ -148,7 +221,7 @@
|
|||
"legendFormat": "{{result}}",
|
||||
"metric": "",
|
||||
"refId": "A",
|
||||
"step": 600
|
||||
"step": 240
|
||||
}
|
||||
],
|
||||
"thresholds": [],
|
||||
|
@ -222,7 +295,7 @@
|
|||
"legendFormat": "{{result}}",
|
||||
"metric": "",
|
||||
"refId": "A",
|
||||
"step": 600
|
||||
"step": 240
|
||||
}
|
||||
],
|
||||
"thresholds": [],
|
||||
|
@ -296,7 +369,7 @@
|
|||
"legendFormat": "{{result}}",
|
||||
"metric": "",
|
||||
"refId": "A",
|
||||
"step": 600
|
||||
"step": 240
|
||||
}
|
||||
],
|
||||
"thresholds": [],
|
||||
|
@ -375,7 +448,8 @@
|
|||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by (code) (rate(response_time_count{method=\"GET\"}[$interval]))",
|
||||
"expr": "sum by (code) (rate(response_time_count{method=\"GET\",instance=~\".*wfe.*\"}[$interval]))",
|
||||
"interval": "",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{code}}",
|
||||
"metric": "response_",
|
||||
|
@ -386,7 +460,7 @@
|
|||
"thresholds": [],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "GETs per second by response code",
|
||||
"title": "GETs per second by response code (WFE)",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
|
@ -447,7 +521,7 @@
|
|||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by (code) (rate(response_time_count{method=\"POST\"}[$interval]))",
|
||||
"expr": "sum by (code) (rate(response_time_count{method=\"POST\",instance=~\".*wfe.*\"}[$interval]))",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{code}}",
|
||||
"refId": "A",
|
||||
|
@ -457,7 +531,7 @@
|
|||
"thresholds": [],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "POSTs per second by response code",
|
||||
"title": "POSTs per second by response code (WFE)",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
|
@ -535,7 +609,7 @@
|
|||
"intervalFactor": 2,
|
||||
"legendFormat": "{{endpoint}}",
|
||||
"refId": "A",
|
||||
"step": 600
|
||||
"step": 240
|
||||
}
|
||||
],
|
||||
"thresholds": [],
|
||||
|
@ -606,8 +680,9 @@
|
|||
"interval": "",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{endpoint}}",
|
||||
"metric": "response_",
|
||||
"refId": "A",
|
||||
"step": 600
|
||||
"step": 240
|
||||
}
|
||||
],
|
||||
"thresholds": [],
|
||||
|
@ -680,7 +755,7 @@
|
|||
"legendFormat": "{{code}} {{endpoint}}",
|
||||
"metric": "",
|
||||
"refId": "A",
|
||||
"step": 600
|
||||
"step": 240
|
||||
}
|
||||
],
|
||||
"thresholds": [],
|
||||
|
@ -760,7 +835,9 @@
|
|||
"targets": [
|
||||
{
|
||||
"expr": "sum by (code) (rate(response_time_count{endpoint=\"/acme/new-reg\"}[$interval]))",
|
||||
"interval": "",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{code}}",
|
||||
"refId": "A",
|
||||
"step": 600
|
||||
}
|
||||
|
@ -787,7 +864,7 @@
|
|||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"min": "0",
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
|
@ -929,7 +1006,7 @@
|
|||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"min": "0",
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
|
@ -971,10 +1048,11 @@
|
|||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by (code) (rate(response_time_count{method=\"POST\",endpoint=\"/acme/challenge\"}[$interval]))",
|
||||
"expr": "sum by (code) (rate(response_time_count{method=\"POST\",endpoint=\"/acme/challenge/\"}[$interval]))",
|
||||
"interval": "",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{code}}",
|
||||
"metric": "",
|
||||
"refId": "A",
|
||||
"step": 600
|
||||
}
|
||||
|
@ -1001,7 +1079,7 @@
|
|||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"min": "0",
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
|
@ -1061,7 +1139,7 @@
|
|||
"legendFormat": "{{instance}}",
|
||||
"metric": "process_cpu_seconds_total",
|
||||
"refId": "A",
|
||||
"step": 240
|
||||
"step": 120
|
||||
}
|
||||
],
|
||||
"thresholds": [],
|
||||
|
@ -1110,7 +1188,233 @@
|
|||
{
|
||||
"collapse": false,
|
||||
"height": 250,
|
||||
"panels": [],
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": {},
|
||||
"bars": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 3,
|
||||
"id": 16,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [],
|
||||
"nullPointMode": "null",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [],
|
||||
"span": 4,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by (method, code) (rate(response_time_count{instance=~\".*ocsp.*\",code!=\"405\"}[$interval]))",
|
||||
"interval": "",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{method}}, {{code}}",
|
||||
"metric": "",
|
||||
"refId": "A",
|
||||
"step": 240
|
||||
}
|
||||
],
|
||||
"thresholds": [],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "OCSP response volume",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": []
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"aliasColors": {},
|
||||
"bars": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"id": 17,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [],
|
||||
"nullPointMode": "null",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [],
|
||||
"span": 4,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.5, sum by (le, endpoint) (rate(response_time_bucket{instance=~\".*ocsp.*\"}[$interval])))",
|
||||
"interval": "",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "median",
|
||||
"refId": "A",
|
||||
"step": 240
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.99, sum by (le, endpoint) (rate(response_time_bucket{instance=~\".*ocsp.*\"}[$interval])))",
|
||||
"hide": false,
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "99th percentile",
|
||||
"refId": "B",
|
||||
"step": 240
|
||||
}
|
||||
],
|
||||
"thresholds": [],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "OCSP latency",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": []
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"aliasColors": {},
|
||||
"bars": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"id": 18,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [],
|
||||
"nullPointMode": "null",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [],
|
||||
"span": 4,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by (job) (rate(response_time_count{code!~\"[45]0.\",instance=~\".*ocsp.*\"}[$interval])) / sum by (job) (rate(response_time_count{instance=~\".*ocsp.*\"}[$interval]))",
|
||||
"interval": "",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "success rate",
|
||||
"refId": "A",
|
||||
"step": 240
|
||||
}
|
||||
],
|
||||
"thresholds": [],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "OCSP success rate (excluding 400s)",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": []
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"repeat": null,
|
||||
"repeatIteration": null,
|
||||
"repeatRowId": null,
|
||||
|
@ -1244,5 +1548,5 @@
|
|||
},
|
||||
"timezone": "utc",
|
||||
"title": "Boulderdash",
|
||||
"version": 6
|
||||
"version": 25
|
||||
}
|
||||
|
|
|
@ -0,0 +1,27 @@
|
|||
#!/usr/bin/python
"""Check dashboard JSON files for common errors.

The typical mistake is forgetting to templatize a datasource. Every problem
found is reported on stderr and the script exits non-zero, so a CI run shows
the full list of things to fix instead of a bare assertion traceback.
"""
import json
import os
import sys


def lint_dashboard(dashboard):
    """Return a list of human-readable problems found in *dashboard*.

    *dashboard* is the parsed dashboard JSON (a dict). An empty list means
    every check passed. Structural keys ("templating", "rows", "panels",
    "__inputs") are still indexed directly, so a file that lacks them raises
    KeyError, as it did before.
    """
    problems = []

    # When exporting, the current value of templated variables is saved. We
    # don't want to save a specific value for datasource, since that's
    # deployment-specific, so we ensure that the dashboard was exported with
    # the datasource template variable set to "default".
    for variable in dashboard["templating"]["list"]:
        if variable["type"] != "datasource":
            continue
        current = variable["current"]["value"]
        if current != "default":
            problems.append(
                "datasource template variable {0!r} was exported with value "
                "{1!r}; expected 'default'".format(
                    variable.get("name"), current))

    # Additionally, ensure each panel's datasource is using the template
    # variable rather than a hardcoded datasource. Grafana will choose a
    # hardcoded datasource on new panels by default, so this is an easy
    # mistake to make. A panel with no "datasource" key at all is reported
    # the same way rather than crashing with KeyError.
    for row in dashboard["rows"]:
        for panel in row["panels"]:
            datasource = panel.get("datasource")
            if datasource != "$datasource":
                problems.append(
                    "panel {0!r} (id {1!r}) uses datasource {2!r}; expected "
                    "'$datasource'".format(
                        panel.get("title"), panel.get("id"), datasource))

    # It seems that __inputs is non-empty when template variables at the top
    # of the dashboard have been modified from the defaults; check for that.
    if len(dashboard["__inputs"]) != 0:
        problems.append(
            "__inputs is non-empty; template variables appear to have been "
            "modified from their defaults before export")

    return problems


def main():
    """Lint boulderdash.json next to this script; return the exit status."""
    path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                        "boulderdash.json")
    with open(path) as f:
        dashboard = json.load(f)

    problems = lint_dashboard(dashboard)
    for problem in problems:
        sys.stderr.write("{0}: {1}\n".format(path, problem))
    # Explicit exit status instead of `assert`, which `python -O` strips.
    return 1 if problems else 0


if __name__ == "__main__":
    sys.exit(main())
|
Loading…
Reference in New Issue