dns: reduce cardinality of metrics (#6691)

Remove the port component of the resolver IP:port. Each of our unbounds
serves on multiple ports, and we talk to all of those ports, to increase
the entropy of our UDP query packets and reduce the chances of a
spurious ID mismatch error. But since all of those ports are running on
the same server, they are not worth distinguishing for metrics purposes.

Remove the authenticated_data label from the query time histograms. We
don't use this.

Part of #6142
This commit is contained in:
Jacob Hoffman-Andrews 2023-02-24 10:42:40 -08:00 committed by GitHub
parent cdf1a6f9f9
commit 991995cb5c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 37 additions and 23 deletions

View File

@ -195,7 +195,7 @@ func New(
Help: "Time taken to perform a DNS query",
Buckets: metrics.InternetFacingBuckets,
},
[]string{"qtype", "result", "authenticated_data", "resolver"},
[]string{"qtype", "result", "resolver"},
)
totalLookupTime := prometheus.NewHistogramVec(
prometheus.HistogramOpts{
@ -203,7 +203,7 @@ func New(
Help: "Time taken to perform a DNS lookup, including all retried queries",
Buckets: metrics.InternetFacingBuckets,
},
[]string{"qtype", "result", "authenticated_data", "retries", "resolver"},
[]string{"qtype", "result", "retries", "resolver"},
)
timeoutCounter := prometheus.NewCounterVec(
prometheus.CounterOpts{
@ -277,48 +277,62 @@ func (dnsClient *impl) exchangeOne(ctx context.Context, hostname string, qtype u
chosenServerIndex := 0
chosenServer := servers[chosenServerIndex]
// Strip off the IP address part of the server address because
// we talk to the same server on multiple ports, and don't want
// to blow up the cardinality.
chosenServerIP, _, err := net.SplitHostPort(chosenServer)
if err != nil {
return
}
start := dnsClient.clk.Now()
client := dnsClient.dnsClient
qtypeStr := dns.TypeToString[qtype]
tries := 1
defer func() {
result, authenticated := "failed", ""
result := "failed"
if resp != nil {
result = dns.RcodeToString[resp.Rcode]
authenticated = fmt.Sprintf("%t", resp.AuthenticatedData)
}
dnsClient.totalLookupTime.With(prometheus.Labels{
"qtype": qtypeStr,
"result": result,
"authenticated_data": authenticated,
"retries": strconv.Itoa(tries),
"resolver": chosenServer,
"qtype": qtypeStr,
"result": result,
"retries": strconv.Itoa(tries),
"resolver": chosenServerIP,
}).Observe(dnsClient.clk.Since(start).Seconds())
}()
for {
ch := make(chan dnsResp, 1)
// Strip off the IP address part of the server address because
// we talk to the same server on multiple ports, and don't want
// to blow up the cardinality.
// Note: validateServerAddress() has already checked net.SplitHostPort()
// and ensures that chosenServer can't be a bare port, e.g. ":1337"
chosenServerIP, _, err = net.SplitHostPort(chosenServer)
if err != nil {
return
}
go func() {
rsp, rtt, err := client.Exchange(m, chosenServer)
result, authenticated := "failed", ""
result := "failed"
if rsp != nil {
result = dns.RcodeToString[rsp.Rcode]
authenticated = fmt.Sprintf("%t", rsp.AuthenticatedData)
}
if err != nil {
logDNSError(dnsClient.log, chosenServer, hostname, m, rsp, err)
if err == dns.ErrId {
dnsClient.idMismatchCounter.With(prometheus.Labels{
"qtype": qtypeStr,
"resolver": chosenServer,
"resolver": chosenServerIP,
}).Inc()
}
}
dnsClient.queryTime.With(prometheus.Labels{
"qtype": qtypeStr,
"result": result,
"authenticated_data": authenticated,
"resolver": chosenServer,
"qtype": qtypeStr,
"result": result,
"resolver": chosenServerIP,
}).Observe(rtt.Seconds())
ch <- dnsResp{m: rsp, err: err}
}()
@ -328,21 +342,21 @@ func (dnsClient *impl) exchangeOne(ctx context.Context, hostname string, qtype u
dnsClient.timeoutCounter.With(prometheus.Labels{
"qtype": qtypeStr,
"type": "deadline exceeded",
"resolver": chosenServer,
"resolver": chosenServerIP,
"isTLD": isTLD(hostname),
}).Inc()
} else if ctx.Err() == context.Canceled {
dnsClient.timeoutCounter.With(prometheus.Labels{
"qtype": qtypeStr,
"type": "canceled",
"resolver": chosenServer,
"resolver": chosenServerIP,
"isTLD": isTLD(hostname),
}).Inc()
} else {
dnsClient.timeoutCounter.With(prometheus.Labels{
"qtype": qtypeStr,
"type": "unknown",
"resolver": chosenServer,
"resolver": chosenServerIP,
}).Inc()
}
err = ctx.Err()
@ -366,7 +380,7 @@ func (dnsClient *impl) exchangeOne(ctx context.Context, hostname string, qtype u
dnsClient.timeoutCounter.With(prometheus.Labels{
"qtype": qtypeStr,
"type": "out of retries",
"resolver": chosenServer,
"resolver": chosenServerIP,
"isTLD": isTLD(hostname),
}).Inc()
}

View File

@ -647,7 +647,7 @@ func TestRetry(t *testing.T) {
t, dr.timeoutCounter, prometheus.Labels{
"qtype": "TXT",
"type": "out of retries",
"resolver": dnsLoopbackAddr,
"resolver": "127.0.0.1",
"isTLD": "false",
}, tc.metricsAllRetries)
}
@ -690,14 +690,14 @@ func TestRetry(t *testing.T) {
t, dr.timeoutCounter, prometheus.Labels{
"qtype": "TXT",
"type": "canceled",
"resolver": dnsLoopbackAddr,
"resolver": "127.0.0.1",
}, 1)
test.AssertMetricWithLabelsEquals(
t, dr.timeoutCounter, prometheus.Labels{
"qtype": "TXT",
"type": "deadline exceeded",
"resolver": dnsLoopbackAddr,
"resolver": "127.0.0.1",
}, 2)
}