dns: reduce cardinality of metrics (#6691)
Remove the port component of the resolver IP:port. Each of our unbounds serves on multiple ports, and we talk to all of those ports, to increase the entropy of our UDP query packets and reduce the chances of a spurious ID mismatch error. But since all of those ports are running on the same server, they are not worth distinguishing for metrics purposes. Remove the authenticated_data label from the query time histograms. We don't use this. Part of #6142
This commit is contained in:
parent
cdf1a6f9f9
commit
991995cb5c
54
bdns/dns.go
54
bdns/dns.go
|
|
@ -195,7 +195,7 @@ func New(
|
||||||
Help: "Time taken to perform a DNS query",
|
Help: "Time taken to perform a DNS query",
|
||||||
Buckets: metrics.InternetFacingBuckets,
|
Buckets: metrics.InternetFacingBuckets,
|
||||||
},
|
},
|
||||||
[]string{"qtype", "result", "authenticated_data", "resolver"},
|
[]string{"qtype", "result", "resolver"},
|
||||||
)
|
)
|
||||||
totalLookupTime := prometheus.NewHistogramVec(
|
totalLookupTime := prometheus.NewHistogramVec(
|
||||||
prometheus.HistogramOpts{
|
prometheus.HistogramOpts{
|
||||||
|
|
@ -203,7 +203,7 @@ func New(
|
||||||
Help: "Time taken to perform a DNS lookup, including all retried queries",
|
Help: "Time taken to perform a DNS lookup, including all retried queries",
|
||||||
Buckets: metrics.InternetFacingBuckets,
|
Buckets: metrics.InternetFacingBuckets,
|
||||||
},
|
},
|
||||||
[]string{"qtype", "result", "authenticated_data", "retries", "resolver"},
|
[]string{"qtype", "result", "retries", "resolver"},
|
||||||
)
|
)
|
||||||
timeoutCounter := prometheus.NewCounterVec(
|
timeoutCounter := prometheus.NewCounterVec(
|
||||||
prometheus.CounterOpts{
|
prometheus.CounterOpts{
|
||||||
|
|
@ -277,48 +277,62 @@ func (dnsClient *impl) exchangeOne(ctx context.Context, hostname string, qtype u
|
||||||
chosenServerIndex := 0
|
chosenServerIndex := 0
|
||||||
chosenServer := servers[chosenServerIndex]
|
chosenServer := servers[chosenServerIndex]
|
||||||
|
|
||||||
|
// Strip off the IP address part of the server address because
|
||||||
|
// we talk to the same server on multiple ports, and don't want
|
||||||
|
// to blow up the cardinality.
|
||||||
|
chosenServerIP, _, err := net.SplitHostPort(chosenServer)
|
||||||
|
if err != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
start := dnsClient.clk.Now()
|
start := dnsClient.clk.Now()
|
||||||
client := dnsClient.dnsClient
|
client := dnsClient.dnsClient
|
||||||
qtypeStr := dns.TypeToString[qtype]
|
qtypeStr := dns.TypeToString[qtype]
|
||||||
tries := 1
|
tries := 1
|
||||||
defer func() {
|
defer func() {
|
||||||
result, authenticated := "failed", ""
|
result := "failed"
|
||||||
if resp != nil {
|
if resp != nil {
|
||||||
result = dns.RcodeToString[resp.Rcode]
|
result = dns.RcodeToString[resp.Rcode]
|
||||||
authenticated = fmt.Sprintf("%t", resp.AuthenticatedData)
|
|
||||||
}
|
}
|
||||||
dnsClient.totalLookupTime.With(prometheus.Labels{
|
dnsClient.totalLookupTime.With(prometheus.Labels{
|
||||||
"qtype": qtypeStr,
|
"qtype": qtypeStr,
|
||||||
"result": result,
|
"result": result,
|
||||||
"authenticated_data": authenticated,
|
"retries": strconv.Itoa(tries),
|
||||||
"retries": strconv.Itoa(tries),
|
"resolver": chosenServerIP,
|
||||||
"resolver": chosenServer,
|
|
||||||
}).Observe(dnsClient.clk.Since(start).Seconds())
|
}).Observe(dnsClient.clk.Since(start).Seconds())
|
||||||
}()
|
}()
|
||||||
for {
|
for {
|
||||||
ch := make(chan dnsResp, 1)
|
ch := make(chan dnsResp, 1)
|
||||||
|
|
||||||
|
// Strip off the IP address part of the server address because
|
||||||
|
// we talk to the same server on multiple ports, and don't want
|
||||||
|
// to blow up the cardinality.
|
||||||
|
// Note: validateServerAddress() has already checked net.SplitHostPort()
|
||||||
|
// and ensures that chosenServer can't be a bare port, e.g. ":1337"
|
||||||
|
chosenServerIP, _, err = net.SplitHostPort(chosenServer)
|
||||||
|
if err != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
go func() {
|
go func() {
|
||||||
rsp, rtt, err := client.Exchange(m, chosenServer)
|
rsp, rtt, err := client.Exchange(m, chosenServer)
|
||||||
result, authenticated := "failed", ""
|
result := "failed"
|
||||||
if rsp != nil {
|
if rsp != nil {
|
||||||
result = dns.RcodeToString[rsp.Rcode]
|
result = dns.RcodeToString[rsp.Rcode]
|
||||||
authenticated = fmt.Sprintf("%t", rsp.AuthenticatedData)
|
|
||||||
}
|
}
|
||||||
if err != nil {
|
if err != nil {
|
||||||
logDNSError(dnsClient.log, chosenServer, hostname, m, rsp, err)
|
logDNSError(dnsClient.log, chosenServer, hostname, m, rsp, err)
|
||||||
if err == dns.ErrId {
|
if err == dns.ErrId {
|
||||||
dnsClient.idMismatchCounter.With(prometheus.Labels{
|
dnsClient.idMismatchCounter.With(prometheus.Labels{
|
||||||
"qtype": qtypeStr,
|
"qtype": qtypeStr,
|
||||||
"resolver": chosenServer,
|
"resolver": chosenServerIP,
|
||||||
}).Inc()
|
}).Inc()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
dnsClient.queryTime.With(prometheus.Labels{
|
dnsClient.queryTime.With(prometheus.Labels{
|
||||||
"qtype": qtypeStr,
|
"qtype": qtypeStr,
|
||||||
"result": result,
|
"result": result,
|
||||||
"authenticated_data": authenticated,
|
"resolver": chosenServerIP,
|
||||||
"resolver": chosenServer,
|
|
||||||
}).Observe(rtt.Seconds())
|
}).Observe(rtt.Seconds())
|
||||||
ch <- dnsResp{m: rsp, err: err}
|
ch <- dnsResp{m: rsp, err: err}
|
||||||
}()
|
}()
|
||||||
|
|
@ -328,21 +342,21 @@ func (dnsClient *impl) exchangeOne(ctx context.Context, hostname string, qtype u
|
||||||
dnsClient.timeoutCounter.With(prometheus.Labels{
|
dnsClient.timeoutCounter.With(prometheus.Labels{
|
||||||
"qtype": qtypeStr,
|
"qtype": qtypeStr,
|
||||||
"type": "deadline exceeded",
|
"type": "deadline exceeded",
|
||||||
"resolver": chosenServer,
|
"resolver": chosenServerIP,
|
||||||
"isTLD": isTLD(hostname),
|
"isTLD": isTLD(hostname),
|
||||||
}).Inc()
|
}).Inc()
|
||||||
} else if ctx.Err() == context.Canceled {
|
} else if ctx.Err() == context.Canceled {
|
||||||
dnsClient.timeoutCounter.With(prometheus.Labels{
|
dnsClient.timeoutCounter.With(prometheus.Labels{
|
||||||
"qtype": qtypeStr,
|
"qtype": qtypeStr,
|
||||||
"type": "canceled",
|
"type": "canceled",
|
||||||
"resolver": chosenServer,
|
"resolver": chosenServerIP,
|
||||||
"isTLD": isTLD(hostname),
|
"isTLD": isTLD(hostname),
|
||||||
}).Inc()
|
}).Inc()
|
||||||
} else {
|
} else {
|
||||||
dnsClient.timeoutCounter.With(prometheus.Labels{
|
dnsClient.timeoutCounter.With(prometheus.Labels{
|
||||||
"qtype": qtypeStr,
|
"qtype": qtypeStr,
|
||||||
"type": "unknown",
|
"type": "unknown",
|
||||||
"resolver": chosenServer,
|
"resolver": chosenServerIP,
|
||||||
}).Inc()
|
}).Inc()
|
||||||
}
|
}
|
||||||
err = ctx.Err()
|
err = ctx.Err()
|
||||||
|
|
@ -366,7 +380,7 @@ func (dnsClient *impl) exchangeOne(ctx context.Context, hostname string, qtype u
|
||||||
dnsClient.timeoutCounter.With(prometheus.Labels{
|
dnsClient.timeoutCounter.With(prometheus.Labels{
|
||||||
"qtype": qtypeStr,
|
"qtype": qtypeStr,
|
||||||
"type": "out of retries",
|
"type": "out of retries",
|
||||||
"resolver": chosenServer,
|
"resolver": chosenServerIP,
|
||||||
"isTLD": isTLD(hostname),
|
"isTLD": isTLD(hostname),
|
||||||
}).Inc()
|
}).Inc()
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -647,7 +647,7 @@ func TestRetry(t *testing.T) {
|
||||||
t, dr.timeoutCounter, prometheus.Labels{
|
t, dr.timeoutCounter, prometheus.Labels{
|
||||||
"qtype": "TXT",
|
"qtype": "TXT",
|
||||||
"type": "out of retries",
|
"type": "out of retries",
|
||||||
"resolver": dnsLoopbackAddr,
|
"resolver": "127.0.0.1",
|
||||||
"isTLD": "false",
|
"isTLD": "false",
|
||||||
}, tc.metricsAllRetries)
|
}, tc.metricsAllRetries)
|
||||||
}
|
}
|
||||||
|
|
@ -690,14 +690,14 @@ func TestRetry(t *testing.T) {
|
||||||
t, dr.timeoutCounter, prometheus.Labels{
|
t, dr.timeoutCounter, prometheus.Labels{
|
||||||
"qtype": "TXT",
|
"qtype": "TXT",
|
||||||
"type": "canceled",
|
"type": "canceled",
|
||||||
"resolver": dnsLoopbackAddr,
|
"resolver": "127.0.0.1",
|
||||||
}, 1)
|
}, 1)
|
||||||
|
|
||||||
test.AssertMetricWithLabelsEquals(
|
test.AssertMetricWithLabelsEquals(
|
||||||
t, dr.timeoutCounter, prometheus.Labels{
|
t, dr.timeoutCounter, prometheus.Labels{
|
||||||
"qtype": "TXT",
|
"qtype": "TXT",
|
||||||
"type": "deadline exceeded",
|
"type": "deadline exceeded",
|
||||||
"resolver": dnsLoopbackAddr,
|
"resolver": "127.0.0.1",
|
||||||
}, 2)
|
}, 2)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue