mirror of https://github.com/linkerd/linkerd2.git
Add peer label to TCP read and write stat queries (#5903)
Add peer label to TCP read and write stat queries Closes #5693 ### Tests --- After refactoring, `linkerd viz stat` behaves the same way (I haven't checked gateways or routes). ``` $ linkerd viz stat deploy/web -n emojivoto -o wide NAME MESHED SUCCESS RPS LATENCY_P50 LATENCY_P95 LATENCY_P99 TCP_CONN READ_BYTES/SEC WRITE_BYTES/SEC web 1/1 91.91% 2.3rps 2ms 4ms 5ms 3 185.3B/s 5180.0B/s # same value as before, latency seems to have dropped time="2021-03-22T18:19:44Z" level=debug msg="Query request:\n\tsum(increase(tcp_write_bytes_total{deployment=\"web\", direction=\"inbound\", namespace=\"emojivoto\", peer=\"src\"}[1m])) by (namespace, deployment)" time="2021-03-22T18:19:44Z" level=debug msg="Query request:\n\tsum(increase(tcp_read_bytes_total{deployment=\"web\", direction=\"inbound\", namespace=\"emojivoto\", peer=\"src\"}[1m])) by (namespace, deployment)" # queries show the peer label --- $ linkerd viz stat deploy/web -n emojivoto --from deploy/vote-bot -o wide NAME MESHED SUCCESS RPS LATENCY_P50 LATENCY_P95 LATENCY_P99 TCP_CONN READ_BYTES/SEC WRITE_BYTES/SEC web 1/1 93.16% 1.9rps 3ms 4ms 4ms 1 4503.4B/s 153.1B/s # stats same as before except for latency which seems to have dropped a bit time="2021-03-22T18:22:10Z" level=debug msg="Query request:\n\tsum(increase(tcp_write_bytes_total{deployment=\"vote-bot\", direction=\"outbound\", dst_deployment=\"web\", dst_namespace=\"emojivoto\", namespace=\"emojivoto\", peer=\"dst\"}[1m])) by (dst_namespace, dst_deployment)" time="2021-03-22T18:22:10Z" level=debug msg="Query request:\n\tsum(increase(tcp_read_bytes_total{deployment=\"vote-bot\", direction=\"outbound\", dst_deployment=\"web\", dst_namespace=\"emojivoto\", namespace=\"emojivoto\", peer=\"dst\"}[1m])) by (dst_namespace, dst_deployment)" # queries show the right label ``` Signed-off-by: mateiidavid <matei.david.35@gmail.com>
This commit is contained in:
parent
073212ac6d
commit
e798b33e2e
|
|
@ -117,10 +117,11 @@ func (s *grpcServer) getGatewaysMetrics(ctx context.Context, req *pb.GatewaysReq
|
||||||
labels, groupBy := buildGatewaysRequestLabels(req)
|
labels, groupBy := buildGatewaysRequestLabels(req)
|
||||||
|
|
||||||
promQueries := map[promType]string{
|
promQueries := map[promType]string{
|
||||||
promGatewayAlive: gatewayAliveQuery,
|
promGatewayAlive: fmt.Sprintf(gatewayAliveQuery, labels.String(), groupBy.String()),
|
||||||
}
|
}
|
||||||
|
|
||||||
metricsResp, err := s.getPrometheusMetrics(ctx, promQueries, gatewayLatencyQuantileQuery, labels.String(), timeWindow, groupBy.String())
|
quantileQueries := generateQuantileQueries(gatewayLatencyQuantileQuery, labels.String(), timeWindow, groupBy.String())
|
||||||
|
metricsResp, err := s.getPrometheusMetrics(ctx, promQueries, quantileQueries)
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
|
|
|
||||||
|
|
@ -25,7 +25,6 @@ type promResult struct {
|
||||||
|
|
||||||
const (
|
const (
|
||||||
promGatewayAlive = promType("QUERY_GATEWAY_ALIVE")
|
promGatewayAlive = promType("QUERY_GATEWAY_ALIVE")
|
||||||
promNumMirroredServices = promType("QUERY_NUM_MIRRORED_SERVICES")
|
|
||||||
promRequests = promType("QUERY_REQUESTS")
|
promRequests = promType("QUERY_REQUESTS")
|
||||||
promActualRequests = promType("QUERY_ACTUAL_REQUESTS")
|
promActualRequests = promType("QUERY_ACTUAL_REQUESTS")
|
||||||
promTCPConnections = promType("QUERY_TCP_CONNECTIONS")
|
promTCPConnections = promType("QUERY_TCP_CONNECTIONS")
|
||||||
|
|
@ -168,6 +167,16 @@ func generateLabelStringWithRegex(l model.LabelSet, labelName string, stringToMa
|
||||||
return fmt.Sprintf("{%s}", strings.Join(lstrs, ", "))
|
return fmt.Sprintf("{%s}", strings.Join(lstrs, ", "))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// generate Prometheus queries for latency quantiles, based on a quantile query
|
||||||
|
// template, query labels, a time window and grouping.
|
||||||
|
func generateQuantileQueries(quantileQuery, labels, timeWindow, groupBy string) map[promType]string {
|
||||||
|
return map[promType]string{
|
||||||
|
promLatencyP50: fmt.Sprintf(quantileQuery, promLatencyP50, labels, timeWindow, groupBy),
|
||||||
|
promLatencyP95: fmt.Sprintf(quantileQuery, promLatencyP95, labels, timeWindow, groupBy),
|
||||||
|
promLatencyP99: fmt.Sprintf(quantileQuery, promLatencyP99, labels, timeWindow, groupBy),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// determine if we should add "namespace=<namespace>" to a named query
|
// determine if we should add "namespace=<namespace>" to a named query
|
||||||
func shouldAddNamespaceLabel(resource *pb.Resource) bool {
|
func shouldAddNamespaceLabel(resource *pb.Resource) bool {
|
||||||
return resource.Type != k8s.Namespace && resource.Namespace != ""
|
return resource.Type != k8s.Namespace && resource.Namespace != ""
|
||||||
|
|
@ -180,23 +189,21 @@ func promDirectionLabels(direction string) model.LabelSet {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func promPeerLabel(peer string) model.LabelSet {
|
||||||
|
return model.LabelSet{
|
||||||
|
model.LabelName("peer"): model.LabelValue(peer),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func promResourceType(resource *pb.Resource) model.LabelName {
|
func promResourceType(resource *pb.Resource) model.LabelName {
|
||||||
l5dLabel := k8s.KindToL5DLabel(resource.Type)
|
l5dLabel := k8s.KindToL5DLabel(resource.Type)
|
||||||
return model.LabelName(l5dLabel)
|
return model.LabelName(l5dLabel)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *grpcServer) getPrometheusMetrics(ctx context.Context, requestQueryTemplates map[promType]string, latencyQueryTemplate, labels, timeWindow, groupBy string) ([]promResult, error) {
|
func (s *grpcServer) getPrometheusMetrics(ctx context.Context, requestQueries map[promType]string, latencyQueries map[promType]string) ([]promResult, error) {
|
||||||
resultChan := make(chan promResult)
|
resultChan := make(chan promResult)
|
||||||
|
|
||||||
// kick off asynchronous queries: request count queries + 3 latency queries
|
for pt, query := range requestQueries {
|
||||||
for pt, requestQueryTemplate := range requestQueryTemplates {
|
|
||||||
var query string
|
|
||||||
if pt == promTCPConnections || pt == promGatewayAlive || pt == promNumMirroredServices {
|
|
||||||
query = fmt.Sprintf(requestQueryTemplate, labels, groupBy)
|
|
||||||
} else {
|
|
||||||
query = fmt.Sprintf(requestQueryTemplate, labels, timeWindow, groupBy)
|
|
||||||
}
|
|
||||||
|
|
||||||
go func(typ promType, promQuery string) {
|
go func(typ promType, promQuery string) {
|
||||||
resultVector, err := s.queryProm(ctx, promQuery)
|
resultVector, err := s.queryProm(ctx, promQuery)
|
||||||
resultChan <- promResult{
|
resultChan <- promResult{
|
||||||
|
|
@ -207,25 +214,20 @@ func (s *grpcServer) getPrometheusMetrics(ctx context.Context, requestQueryTempl
|
||||||
}(pt, query)
|
}(pt, query)
|
||||||
}
|
}
|
||||||
|
|
||||||
quantiles := []promType{promLatencyP50, promLatencyP95, promLatencyP99}
|
for quantile, query := range latencyQueries {
|
||||||
|
go func(qt promType, promQuery string) {
|
||||||
for _, quantile := range quantiles {
|
resultVector, err := s.queryProm(ctx, promQuery)
|
||||||
go func(quantile promType) {
|
|
||||||
latencyQuery := fmt.Sprintf(latencyQueryTemplate, quantile, labels, timeWindow, groupBy)
|
|
||||||
latencyResult, err := s.queryProm(ctx, latencyQuery)
|
|
||||||
|
|
||||||
resultChan <- promResult{
|
resultChan <- promResult{
|
||||||
prom: quantile,
|
prom: qt,
|
||||||
vec: latencyResult,
|
vec: resultVector,
|
||||||
err: err,
|
err: err,
|
||||||
}
|
}
|
||||||
}(quantile)
|
}(quantile, query)
|
||||||
}
|
}
|
||||||
|
|
||||||
// process results, receive one message per prometheus query type
|
// process results, receive one message per prometheus query type
|
||||||
var err error
|
var err error
|
||||||
results := []promResult{}
|
results := []promResult{}
|
||||||
for i := 0; i < len(quantiles)+len(requestQueryTemplates); i++ {
|
for i := 0; i < len(latencyQueries)+len(requestQueries); i++ {
|
||||||
result := <-resultChan
|
result := <-resultChan
|
||||||
if result.err != nil {
|
if result.err != nil {
|
||||||
log.Errorf("queryProm failed with: %s", result.err)
|
log.Errorf("queryProm failed with: %s", result.err)
|
||||||
|
|
|
||||||
|
|
@ -490,18 +490,35 @@ func buildTrafficSplitRequestLabels(req *pb.StatSummaryRequest) (labels model.La
|
||||||
return labels, groupBy
|
return labels, groupBy
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func buildTCPStatsRequestLabels(req *pb.StatSummaryRequest, reqLabels model.LabelSet) string {
|
||||||
|
switch req.Outbound.(type) {
|
||||||
|
case *pb.StatSummaryRequest_ToResource, *pb.StatSummaryRequest_FromResource:
|
||||||
|
// If TCP stats are queried from a resource to another one (i.e outbound -- from/to), then append peer='dst'
|
||||||
|
reqLabels = reqLabels.Merge(promPeerLabel("dst"))
|
||||||
|
|
||||||
|
default:
|
||||||
|
// If TCP stats are not queried from a specific resource (i.e inbound -- no to/from), then append peer='src'
|
||||||
|
reqLabels = reqLabels.Merge(promPeerLabel("src"))
|
||||||
|
}
|
||||||
|
return reqLabels.String()
|
||||||
|
}
|
||||||
|
|
||||||
func (s *grpcServer) getStatMetrics(ctx context.Context, req *pb.StatSummaryRequest, timeWindow string) (map[rKey]*pb.BasicStats, map[rKey]*pb.TcpStats, error) {
|
func (s *grpcServer) getStatMetrics(ctx context.Context, req *pb.StatSummaryRequest, timeWindow string) (map[rKey]*pb.BasicStats, map[rKey]*pb.TcpStats, error) {
|
||||||
reqLabels, groupBy := buildRequestLabels(req)
|
reqLabels, groupBy := buildRequestLabels(req)
|
||||||
promQueries := map[promType]string{
|
promQueries := map[promType]string{
|
||||||
promRequests: reqQuery,
|
promRequests: fmt.Sprintf(reqQuery, reqLabels.String(), timeWindow, groupBy.String()),
|
||||||
}
|
}
|
||||||
|
|
||||||
if req.TcpStats {
|
if req.TcpStats {
|
||||||
promQueries[promTCPConnections] = tcpConnectionsQuery
|
promQueries[promTCPConnections] = fmt.Sprintf(tcpConnectionsQuery, reqLabels.String(), groupBy.String())
|
||||||
promQueries[promTCPReadBytes] = tcpReadBytesQuery
|
// For TCP read/write bytes total we add an additional 'peer' label with a value of either 'src' or 'dst'
|
||||||
promQueries[promTCPWriteBytes] = tcpWriteBytesQuery
|
tcpLabels := buildTCPStatsRequestLabels(req, reqLabels)
|
||||||
|
promQueries[promTCPReadBytes] = fmt.Sprintf(tcpReadBytesQuery, tcpLabels, timeWindow, groupBy.String())
|
||||||
|
promQueries[promTCPWriteBytes] = fmt.Sprintf(tcpWriteBytesQuery, tcpLabels, timeWindow, groupBy.String())
|
||||||
}
|
}
|
||||||
results, err := s.getPrometheusMetrics(ctx, promQueries, latencyQuantileQuery, reqLabels.String(), timeWindow, groupBy.String())
|
|
||||||
|
quantileQueries := generateQuantileQueries(latencyQuantileQuery, reqLabels.String(), timeWindow, groupBy.String())
|
||||||
|
results, err := s.getPrometheusMetrics(ctx, promQueries, quantileQueries)
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, nil, err
|
return nil, nil, err
|
||||||
|
|
@ -523,10 +540,11 @@ func (s *grpcServer) getTrafficSplitMetrics(ctx context.Context, req *pb.StatSum
|
||||||
reqLabels := generateLabelStringWithRegex(labels, "authority", stringToMatch)
|
reqLabels := generateLabelStringWithRegex(labels, "authority", stringToMatch)
|
||||||
|
|
||||||
promQueries := map[promType]string{
|
promQueries := map[promType]string{
|
||||||
promRequests: reqQuery,
|
promRequests: fmt.Sprintf(reqQuery, reqLabels, timeWindow, groupBy.String()),
|
||||||
}
|
}
|
||||||
|
|
||||||
results, err := s.getPrometheusMetrics(ctx, promQueries, latencyQuantileQuery, reqLabels, timeWindow, groupBy.String())
|
quantileQueries := generateQuantileQueries(latencyQuantileQuery, reqLabels, timeWindow, groupBy.String())
|
||||||
|
results, err := s.getPrometheusMetrics(ctx, promQueries, quantileQueries)
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
|
|
|
||||||
|
|
@ -742,8 +742,8 @@ status:
|
||||||
`histogram_quantile(0.99, sum(irate(response_latency_ms_bucket{direction="inbound", namespace="emojivoto", pod="emojivoto-1"}[1m])) by (le, namespace, pod))`,
|
`histogram_quantile(0.99, sum(irate(response_latency_ms_bucket{direction="inbound", namespace="emojivoto", pod="emojivoto-1"}[1m])) by (le, namespace, pod))`,
|
||||||
`sum(increase(response_total{direction="inbound", namespace="emojivoto", pod="emojivoto-1"}[1m])) by (namespace, pod, classification, tls)`,
|
`sum(increase(response_total{direction="inbound", namespace="emojivoto", pod="emojivoto-1"}[1m])) by (namespace, pod, classification, tls)`,
|
||||||
`sum(tcp_open_connections{direction="inbound", namespace="emojivoto", pod="emojivoto-1"}) by (namespace, pod)`,
|
`sum(tcp_open_connections{direction="inbound", namespace="emojivoto", pod="emojivoto-1"}) by (namespace, pod)`,
|
||||||
`sum(increase(tcp_read_bytes_total{direction="inbound", namespace="emojivoto", pod="emojivoto-1"}[1m])) by (namespace, pod)`,
|
`sum(increase(tcp_read_bytes_total{direction="inbound", namespace="emojivoto", peer="src", pod="emojivoto-1"}[1m])) by (namespace, pod)`,
|
||||||
`sum(increase(tcp_write_bytes_total{direction="inbound", namespace="emojivoto", pod="emojivoto-1"}[1m])) by (namespace, pod)`,
|
`sum(increase(tcp_write_bytes_total{direction="inbound", namespace="emojivoto", peer="src", pod="emojivoto-1"}[1m])) by (namespace, pod)`,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
req: &pb.StatSummaryRequest{
|
req: &pb.StatSummaryRequest{
|
||||||
|
|
@ -769,6 +769,67 @@ status:
|
||||||
testStatSummary(t, expectations)
|
testStatSummary(t, expectations)
|
||||||
})
|
})
|
||||||
|
|
||||||
|
t.Run("Queries prometheus for outbound TCP stats if --to resource is specified", func(t *testing.T) {
|
||||||
|
|
||||||
|
expectations := []statSumExpected{
|
||||||
|
{
|
||||||
|
|
||||||
|
expectedStatRPC: expectedStatRPC{
|
||||||
|
err: nil,
|
||||||
|
k8sConfigs: []string{`
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Pod
|
||||||
|
metadata:
|
||||||
|
name: emojivoto-1
|
||||||
|
namespace: emojivoto
|
||||||
|
labels:
|
||||||
|
app: emoji-svc
|
||||||
|
linkerd.io/control-plane-ns: linkerd
|
||||||
|
status:
|
||||||
|
phase: Running
|
||||||
|
`,
|
||||||
|
},
|
||||||
|
mockPromResponse: prometheusMetric("emojivoto-1", "pod"),
|
||||||
|
expectedPrometheusQueries: []string{
|
||||||
|
`histogram_quantile(0.5, sum(irate(response_latency_ms_bucket{direction="outbound", dst_namespace="emojivoto", dst_pod="emojivoto-2", namespace="emojivoto", pod="emojivoto-1"}[1m])) by (le, namespace, pod))`,
|
||||||
|
`histogram_quantile(0.95, sum(irate(response_latency_ms_bucket{direction="outbound", dst_namespace="emojivoto", dst_pod="emojivoto-2", namespace="emojivoto", pod="emojivoto-1"}[1m])) by (le, namespace, pod))`,
|
||||||
|
`histogram_quantile(0.99, sum(irate(response_latency_ms_bucket{direction="outbound", dst_namespace="emojivoto", dst_pod="emojivoto-2", namespace="emojivoto", pod="emojivoto-1"}[1m])) by (le, namespace, pod))`,
|
||||||
|
`sum(increase(response_total{direction="outbound", dst_namespace="emojivoto", dst_pod="emojivoto-2", namespace="emojivoto", pod="emojivoto-1"}[1m])) by (namespace, pod, classification, tls)`,
|
||||||
|
`sum(tcp_open_connections{direction="outbound", dst_namespace="emojivoto", dst_pod="emojivoto-2", namespace="emojivoto", pod="emojivoto-1"}) by (namespace, pod)`,
|
||||||
|
`sum(increase(tcp_read_bytes_total{direction="outbound", dst_namespace="emojivoto", dst_pod="emojivoto-2", namespace="emojivoto", peer="dst", pod="emojivoto-1"}[1m])) by (namespace, pod)`,
|
||||||
|
`sum(increase(tcp_write_bytes_total{direction="outbound", dst_namespace="emojivoto", dst_pod="emojivoto-2", namespace="emojivoto", peer="dst", pod="emojivoto-1"}[1m])) by (namespace, pod)`,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
req: &pb.StatSummaryRequest{
|
||||||
|
Selector: &pb.ResourceSelection{
|
||||||
|
Resource: &pb.Resource{
|
||||||
|
Name: "emojivoto-1",
|
||||||
|
Namespace: "emojivoto",
|
||||||
|
Type: pkgK8s.Pod,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
TimeWindow: "1m",
|
||||||
|
TcpStats: true,
|
||||||
|
Outbound: &pb.StatSummaryRequest_ToResource{
|
||||||
|
ToResource: &pb.Resource{
|
||||||
|
Name: "emojivoto-2",
|
||||||
|
Namespace: "emojivoto",
|
||||||
|
Type: pkgK8s.Pod,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
expectedResponse: GenStatSummaryResponse("emojivoto-1", pkgK8s.Pod, []string{"emojivoto"}, &PodCounts{
|
||||||
|
Status: "Running",
|
||||||
|
MeshedPods: 1,
|
||||||
|
RunningPods: 1,
|
||||||
|
FailedPods: 0,
|
||||||
|
}, true, true),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
testStatSummary(t, expectations)
|
||||||
|
})
|
||||||
|
|
||||||
t.Run("Queries prometheus for a specific resource if name is specified", func(t *testing.T) {
|
t.Run("Queries prometheus for a specific resource if name is specified", func(t *testing.T) {
|
||||||
expectations := []statSumExpected{
|
expectations := []statSumExpected{
|
||||||
{
|
{
|
||||||
|
|
|
||||||
|
|
@ -240,15 +240,16 @@ func (s *grpcServer) getRouteMetrics(ctx context.Context, req *pb.TopRoutesReque
|
||||||
groupBy := "rt_route"
|
groupBy := "rt_route"
|
||||||
|
|
||||||
queries := map[promType]string{
|
queries := map[promType]string{
|
||||||
promRequests: routeReqQuery,
|
promRequests: fmt.Sprintf(routeReqQuery, reqLabels, timeWindow, groupBy),
|
||||||
}
|
}
|
||||||
|
|
||||||
if req.GetOutbound() != nil && req.GetNone() == nil {
|
if req.GetOutbound() != nil && req.GetNone() == nil {
|
||||||
// If this req has an Outbound, then query the actual request counts as well.
|
// If this req has an Outbound, then query the actual request counts as well.
|
||||||
queries[promActualRequests] = actualRouteReqQuery
|
queries[promActualRequests] = fmt.Sprintf(actualRouteReqQuery, reqLabels, timeWindow, groupBy)
|
||||||
}
|
}
|
||||||
|
|
||||||
results, err := s.getPrometheusMetrics(ctx, queries, routeLatencyQuantileQuery, reqLabels, timeWindow, groupBy)
|
quantileQueries := generateQuantileQueries(routeLatencyQuantileQuery, reqLabels, timeWindow, groupBy)
|
||||||
|
results, err := s.getPrometheusMetrics(ctx, queries, quantileQueries)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue