Multicluster dashboard for traffic metrics (#4178)

This change adds labels to endpoints that target remote services. It also adds a Grafana dashboard that can be used to monitor multicluster traffic.

Signed-off-by: Zahari Dichev <zaharidichev@gmail.com>
This commit is contained in:
Zahari Dichev 2020-05-14 17:48:27 +03:00 committed by GitHub
parent bfe02490ad
commit ef1a2c2b10
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 1002 additions and 1 deletions

View File

@ -19,6 +19,15 @@ import (
const (
	// kubeSystem is the literal name "kube-system".
	// NOTE(review): presumably the Kubernetes system namespace; its use is
	// outside this view — confirm against callers.
	kubeSystem = "kube-system"
	// podIPIndex is the literal key "ip".
	// NOTE(review): presumably an informer index name for pod IP lookups —
	// confirm against callers.
	podIPIndex = "ip"
)

// Keys of the metric label map produced by metricLabels. The first two are
// always set; the remote* keys are only set for Endpoints that mirror a
// service living in another cluster.
const (
	service                = "service"
	namespace              = "namespace"
	remoteGatewayNamespace = "remote_gateway_namespace"
	remoteGateway          = "remote_gateway"
	remoteCluster          = "remote_cluster"
	remoteService          = "remote_service"
	remoteServiceNamespace = "remote_service_namespace"
)
// TODO: prom metrics for all the queues/caches
@ -450,6 +459,30 @@ func (pp *portPublisher) updateEndpoints(endpoints *corev1.Endpoints) {
pp.metrics.setExists(true)
}
// metricLabels builds the label map for the given Endpoints object.
//
// The result always carries the Endpoints' own name and namespace under the
// service and namespace keys. When the object also has the remote gateway
// name, remote gateway namespace and remote cluster name labels, plus the
// remote service fully-qualified-name annotation, the gateway and cluster
// values are added too. The remote service name and namespace are added only
// if that annotation has at least two dot-separated segments; they are taken
// from the first and second segment respectively.
func metricLabels(endpoints *corev1.Endpoints) map[string]string {
	result := map[string]string{
		service:   endpoints.Name,
		namespace: endpoints.Namespace,
	}

	gwName, gwNameOK := endpoints.Labels[consts.RemoteGatewayNameLabel]
	gwNamespace, gwNamespaceOK := endpoints.Labels[consts.RemoteGatewayNsLabel]
	clusterName, clusterNameOK := endpoints.Labels[consts.RemoteClusterNameLabel]
	fqName, fqNameOK := endpoints.Annotations[consts.RemoteServiceFqName]

	// Anything missing one of the four markers is not an Endpoints object
	// created to mirror an out-of-cluster service; it gets the base labels only.
	if !(gwNameOK && gwNamespaceOK && clusterNameOK && fqNameOK) {
		return result
	}

	result[remoteGatewayNamespace] = gwNamespace
	result[remoteGateway] = gwName
	result[remoteCluster] = clusterName

	// The annotation has the form "<service>.<namespace>[.<rest>]"; a value
	// without a dot yields no remote service labels at all.
	if segments := strings.Split(fqName, "."); len(segments) >= 2 {
		result[remoteService] = segments[0]
		result[remoteServiceNamespace] = segments[1]
	}
	return result
}
func (pp *portPublisher) endpointsToAddresses(endpoints *corev1.Endpoints) AddressSet {
addresses := make(map[ID]Address)
for _, subset := range endpoints.Subsets {
@ -504,7 +537,7 @@ func (pp *portPublisher) endpointsToAddresses(endpoints *corev1.Endpoints) Addre
}
return AddressSet{
Addresses: addresses,
Labels: map[string]string{"service": endpoints.Name, "namespace": endpoints.Namespace},
Labels: metricLabels(endpoints),
}
}

View File

@ -0,0 +1,968 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": "-- Grafana --",
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"editable": true,
"gnetId": null,
"graphTooltip": 1,
"id": null,
"iteration": 1531434867463,
"links": [],
"panels": [
{
"content": "<div style=\"display: flex; align-items: center\">\n <img src=\"https://linkerd.io/images/identity/favicon/linkerd-favicon.png\" style=\"height:32px;\"/>&nbsp;\n <span style=\"font-size: 32px\">Cluster: $cluster, Gateway: $gateway</span>\n</div>",
"gridPos": {
"h": 2,
"w": 24,
"x": 0,
"y": 0
},
"id": 20,
"links": [],
"mode": "html",
"options": {},
"title": "",
"transparent": true,
"type": "text"
},
{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"#d44a3a",
"rgba(237, 129, 40, 0.89)",
"#299c46"
],
"datasource": "prometheus",
"decimals": null,
"format": "percentunit",
"gauge": {
"maxValue": 1,
"minValue": 0,
"show": true,
"thresholdLabels": false,
"thresholdMarkers": true
},
"gridPos": {
"h": 4,
"w": 8,
"x": 0,
"y": 2
},
"id": 5,
"interval": null,
"links": [],
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
},
{
"name": "range to text",
"value": 2
}
],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"options": {},
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
{
"from": "null",
"text": "N/A",
"to": "null"
}
],
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": true,
"lineColor": "rgb(31, 120, 193)",
"show": true
},
"tableColumn": "",
"targets": [
{
"expr": "sum(irate(response_total{classification=\"success\", dst_remote_gateway=\"$gateway\", dst_remote_gateway!=\"\", dst_remote_cluster=\"$cluster\", dst_remote_cluster!=\"\", direction=\"outbound\"}[30s])) / sum(irate(response_total{dst_remote_gateway=\"$gateway\", dst_remote_gateway!=\"\", dst_remote_cluster=\"$cluster\", dst_remote_cluster!=\"\", direction=\"outbound\"}[30s]))",
"format": "time_series",
"instant": false,
"intervalFactor": 1,
"legendFormat": "",
"refId": "A"
}
],
"thresholds": "0.9,.99",
"title": "SUCCESS RATE",
"transparent": true,
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
{
"op": "=",
"text": "N/A",
"value": "null"
}
],
"valueName": "current"
},
{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"#299c46",
"rgba(237, 129, 40, 0.89)",
"#d44a3a"
],
"datasource": "prometheus",
"decimals": null,
"format": "none",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": false,
"thresholdLabels": false,
"thresholdMarkers": true
},
"gridPos": {
"h": 4,
"w": 8,
"x": 8,
"y": 2
},
"id": 4,
"interval": null,
"links": [],
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
},
{
"name": "range to text",
"value": 2
}
],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"options": {},
"postfix": " RPS",
"postfixFontSize": "100%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
{
"from": "null",
"text": "N/A",
"to": "null"
}
],
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": true,
"lineColor": "rgb(31, 120, 193)",
"show": true
},
"tableColumn": "",
"targets": [
{
"expr": "sum(irate(request_total{dst_remote_gateway=\"$gateway\", dst_remote_gateway!=\"\", dst_remote_cluster=\"$cluster\", dst_remote_cluster!=\"\", direction=\"outbound\"}[30s]))",
"format": "time_series",
"instant": false,
"intervalFactor": 1,
"legendFormat": "",
"refId": "A"
}
],
"thresholds": "",
"title": "REQUEST RATE",
"transparent": true,
"type": "singlestat",
"valueFontSize": "100%",
"valueMaps": [
{
"op": "=",
"text": "N/A",
"value": "null"
}
],
"valueName": "current"
},
{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"#299c46",
"rgba(237, 129, 40, 0.89)",
"#d44a3a"
],
"datasource": "prometheus",
"decimals": null,
"format": "none",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": false,
"thresholdLabels": false,
"thresholdMarkers": true
},
"gridPos": {
"h": 4,
"w": 8,
"x": 16,
"y": 2
},
"id": 81,
"interval": null,
"links": [],
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
},
{
"name": "range to text",
"value": 2
}
],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"options": {},
"postfix": " ms",
"postfixFontSize": "100%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
{
"from": "null",
"text": "N/A",
"to": "null"
}
],
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": true,
"lineColor": "rgb(31, 120, 193)",
"show": true
},
"tableColumn": "",
"targets": [
{
"expr": "histogram_quantile(0.95, sum(irate(response_latency_ms_bucket{dst_remote_gateway=\"$gateway\", dst_remote_gateway!=\"\", dst_remote_cluster=\"$cluster\", dst_remote_cluster!=\"\", direction=\"outbound\"}[30s])) by (le))",
"format": "time_series",
"instant": false,
"intervalFactor": 1,
"legendFormat": "",
"refId": "A"
}
],
"thresholds": "",
"title": "P95 LATENCY",
"transparent": true,
"type": "singlestat",
"valueFontSize": "100%",
"valueMaps": [
{
"op": "=",
"text": "N/A",
"value": "null"
}
],
"valueName": "current"
},
{
"content": "<div class=\"text-center dashboard-header\">\n <span>TOP-LINE TRAFFIC</span>\n</div>",
"gridPos": {
"h": 2,
"w": 24,
"x": 0,
"y": 6
},
"id": 17,
"links": [],
"mode": "html",
"options": {},
"title": "",
"transparent": true,
"type": "text"
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "prometheus",
"fill": 1,
"gridPos": {
"h": 7,
"w": 8,
"x": 0,
"y": 8
},
"id": 67,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": false,
"total": false,
"values": false
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "null",
"options": {},
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(irate(response_total{classification=\"success\", dst_remote_gateway=\"$gateway\", dst_remote_gateway!=\"\", dst_remote_cluster=\"$cluster\", dst_remote_cluster!=\"\", direction=\"outbound\"}[30s])) by (dst_remote_gateway) / sum(irate(response_total{dst_remote_gateway=\"$gateway\", dst_remote_gateway!=\"\", dst_remote_cluster=\"$cluster\", dst_remote_cluster!=\"\", direction=\"outbound\"}[30s])) by (dst_remote_gateway)",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "gateway/{{dst_remote_gateway}}",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "SUCCESS RATE",
"tooltip": {
"shared": true,
"sort": 2,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"decimals": null,
"format": "percentunit",
"label": "",
"logBase": 1,
"max": "1",
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "prometheus",
"fill": 0,
"gridPos": {
"h": 7,
"w": 8,
"x": 8,
"y": 8
},
"id": 2,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": false,
"total": false,
"values": false
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "null",
"options": {},
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(irate(request_total{dst_remote_gateway=\"$gateway\", dst_remote_gateway!=\"\", dst_remote_cluster=\"$cluster\", dst_remote_cluster!=\"\", direction=\"outbound\", tls=\"true\"}[30s])) by (dst_remote_gateway)",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "🔒gateway/{{dst_remote_gateway}}",
"refId": "A"
},
{
"expr": "sum(irate(request_total{dst_remote_gateway=\"$gateway\", dst_remote_gateway!=\"\", dst_remote_cluster=\"$cluster\", dst_remote_cluster!=\"\", tls!=\"true\"}[30s])) by (dst_remote_gateway)",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "gateway/{{dst_remote_gateway}}",
"refId": "B"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "REQUEST RATE",
"tooltip": {
"shared": true,
"sort": 2,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"decimals": null,
"format": "rps",
"label": "",
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "prometheus",
"fill": 1,
"gridPos": {
"h": 7,
"w": 8,
"x": 16,
"y": 8
},
"id": 68,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": false,
"total": false,
"values": false
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "null",
"options": {},
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "histogram_quantile(0.5, sum(irate(response_latency_ms_bucket{dst_remote_gateway=\"$gateway\", dst_remote_gateway!=\"\", dst_remote_cluster=\"$cluster\", dst_remote_cluster!=\"\", direction=\"outbound\"}[30s])) by (le, dst_remote_gateway))",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "p50 gateway/{{dst_remote_gateway}}",
"refId": "A"
},
{
"expr": "histogram_quantile(0.95, sum(irate(response_latency_ms_bucket{dst_remote_gateway=\"$gateway\", dst_remote_gateway!=\"\", dst_remote_cluster=\"$cluster\", dst_remote_cluster!=\"\", direction=\"outbound\"}[30s])) by (le, dst_remote_gateway))",
"format": "time_series",
"hide": false,
"intervalFactor": 1,
"legendFormat": "p95 gateway/{{dst_remote_gateway}}",
"refId": "B"
},
{
"expr": "histogram_quantile(0.99, sum(irate(response_latency_ms_bucket{dst_remote_gateway=\"$gateway\", dst_remote_gateway!=\"\", dst_remote_cluster=\"$cluster\", dst_remote_cluster!=\"\", direction=\"outbound\"}[30s])) by (le, dst_remote_gateway))",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "p99 gateway/{{dst_remote_gateway}}",
"refId": "C"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "LATENCY",
"tooltip": {
"shared": true,
"sort": 2,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"decimals": null,
"format": "ms",
"label": "",
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"content": "<div class=\"text-center dashboard-header\">\n <span>TRAFFIC BY REMOTE SERVICE</span>\n</div>",
"gridPos": {
"h": 2,
"w": 24,
"x": 0,
"y": 15
},
"id": 32,
"links": [],
"mode": "html",
"options": {},
"title": "",
"transparent": true,
"type": "text"
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "prometheus",
"fill": 1,
"gridPos": {
"h": 7,
"w": 8,
"x": 0,
"y": 17
},
"id": 77,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": false,
"total": false,
"values": false
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "null",
"options": {},
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(irate(response_total{classification=\"success\", dst_remote_gateway=\"$gateway\", dst_remote_gateway!=\"\", dst_remote_cluster=\"$cluster\", dst_remote_cluster!=\"\", direction=\"outbound\"}[30s])) by (dst_remote_service) / sum(irate(response_total{dst_remote_gateway=\"$gateway\", dst_remote_gateway!=\"\", dst_remote_cluster=\"$cluster\", dst_remote_cluster!=\"\", direction=\"outbound\"}[30s])) by (dst_remote_service)",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "remote-svc/{{dst_remote_service}}",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "SUCCESS RATE",
"tooltip": {
"shared": true,
"sort": 2,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"decimals": null,
"format": "percentunit",
"label": "",
"logBase": 1,
"max": "1",
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "prometheus",
"fill": 0,
"gridPos": {
"h": 7,
"w": 8,
"x": 8,
"y": 17
},
"id": 78,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": false,
"total": false,
"values": false
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "null",
"options": {},
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(irate(request_total{dst_remote_gateway=\"$gateway\", dst_remote_gateway!=\"\", dst_remote_cluster=\"$cluster\", dst_remote_cluster!=\"\", tls=\"true\"}[30s])) by (dst_remote_service)",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "🔒remote-svc/{{dst_remote_service}}",
"refId": "A"
},
{
"expr": "sum(irate(request_total{dst_remote_gateway=\"$gateway\", dst_remote_gateway!=\"\", dst_remote_cluster=\"$cluster\", dst_remote_cluster!=\"\", tls!=\"true\"}[30s])) by (dst_remote_service)",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "remote-svc/{{dst_remote_service}}",
"refId": "B"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "REQUEST RATE",
"tooltip": {
"shared": true,
"sort": 2,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "rps",
"label": "",
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "prometheus",
"fill": 1,
"gridPos": {
"h": 7,
"w": 8,
"x": 16,
"y": 17
},
"id": 79,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": false,
"total": false,
"values": false
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "null",
"options": {},
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "histogram_quantile(0.95, sum(rate(response_latency_ms_bucket{dst_remote_gateway=\"$gateway\", dst_remote_gateway!=\"\", dst_remote_cluster=\"$cluster\", dst_remote_cluster!=\"\", direction=\"outbound\"}[30s])) by (le, dst_remote_service))",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "P95 remote-svc/{{dst_remote_service}}",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "P95 LATENCY",
"tooltip": {
"shared": true,
"sort": 2,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "ms",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
}
],
"refresh": "1m",
"schemaVersion": 18,
"style": "dark",
"tags": [
"linkerd"
],
"templating": {
"list": [
{
"allValue": ".*",
"current": {
"text": "All",
"value": "$__all"
},
"datasource": "prometheus",
"definition": "",
"hide": 0,
"includeAll": false,
"label": "Cluster",
"multi": false,
"name": "cluster",
"options": [],
"query": "label_values(request_total, dst_remote_cluster)",
"refresh": 2,
"regex": "",
"skipUrlSync": false,
"sort": 1,
"tagValuesQuery": "",
"tags": [],
"tagsQuery": "",
"type": "query",
"useTags": false
},
{
"allValue": ".*",
"current": {
"text": "All",
"value": "$__all"
},
"datasource": "prometheus",
"definition": "",
"hide": 0,
"includeAll": false,
"label": "Gateway",
"multi": false,
"name": "gateway",
"options": [],
"query": "label_values(request_total{dst_remote_cluster=\"$cluster\"}, dst_remote_gateway)",
"refresh": 2,
"regex": "",
"skipUrlSync": false,
"sort": 1,
"tagValuesQuery": "",
"tags": [],
"tagsQuery": "",
"type": "query",
"useTags": false
}
]
},
"time": {
"from": "now-5m",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
},
"timezone": "",
"title": "Linkerd Multicluster",
"uid": "linkerd-multicluster",
"version": 1
}