dns: reduce cardinality of metrics (#6691)

Remove the port component of the resolver IP:port. Each of our unbounds
serves on multiple ports, and we talk to all of those ports, to increase
the entropy of our UDP query packets and reduce the chances of a
spurious ID mismatch error. But since all of those ports are running on
the same server, they are not worth distinguishing for metrics purposes.

Remove the authenticated_data label from the query time histograms. We
don't use this.

Part of #6142
This commit is contained in:
Jacob Hoffman-Andrews 2023-02-24 10:42:40 -08:00 committed by GitHub
parent cdf1a6f9f9
commit 991995cb5c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 37 additions and 23 deletions

View File

@ -195,7 +195,7 @@ func New(
Help: "Time taken to perform a DNS query", Help: "Time taken to perform a DNS query",
Buckets: metrics.InternetFacingBuckets, Buckets: metrics.InternetFacingBuckets,
}, },
[]string{"qtype", "result", "authenticated_data", "resolver"}, []string{"qtype", "result", "resolver"},
) )
totalLookupTime := prometheus.NewHistogramVec( totalLookupTime := prometheus.NewHistogramVec(
prometheus.HistogramOpts{ prometheus.HistogramOpts{
@ -203,7 +203,7 @@ func New(
Help: "Time taken to perform a DNS lookup, including all retried queries", Help: "Time taken to perform a DNS lookup, including all retried queries",
Buckets: metrics.InternetFacingBuckets, Buckets: metrics.InternetFacingBuckets,
}, },
[]string{"qtype", "result", "authenticated_data", "retries", "resolver"}, []string{"qtype", "result", "retries", "resolver"},
) )
timeoutCounter := prometheus.NewCounterVec( timeoutCounter := prometheus.NewCounterVec(
prometheus.CounterOpts{ prometheus.CounterOpts{
@ -277,48 +277,62 @@ func (dnsClient *impl) exchangeOne(ctx context.Context, hostname string, qtype u
chosenServerIndex := 0 chosenServerIndex := 0
chosenServer := servers[chosenServerIndex] chosenServer := servers[chosenServerIndex]
// Strip off the IP address part of the server address because
// we talk to the same server on multiple ports, and don't want
// to blow up the cardinality.
chosenServerIP, _, err := net.SplitHostPort(chosenServer)
if err != nil {
return
}
start := dnsClient.clk.Now() start := dnsClient.clk.Now()
client := dnsClient.dnsClient client := dnsClient.dnsClient
qtypeStr := dns.TypeToString[qtype] qtypeStr := dns.TypeToString[qtype]
tries := 1 tries := 1
defer func() { defer func() {
result, authenticated := "failed", "" result := "failed"
if resp != nil { if resp != nil {
result = dns.RcodeToString[resp.Rcode] result = dns.RcodeToString[resp.Rcode]
authenticated = fmt.Sprintf("%t", resp.AuthenticatedData)
} }
dnsClient.totalLookupTime.With(prometheus.Labels{ dnsClient.totalLookupTime.With(prometheus.Labels{
"qtype": qtypeStr, "qtype": qtypeStr,
"result": result, "result": result,
"authenticated_data": authenticated, "retries": strconv.Itoa(tries),
"retries": strconv.Itoa(tries), "resolver": chosenServerIP,
"resolver": chosenServer,
}).Observe(dnsClient.clk.Since(start).Seconds()) }).Observe(dnsClient.clk.Since(start).Seconds())
}() }()
for { for {
ch := make(chan dnsResp, 1) ch := make(chan dnsResp, 1)
// Strip off the IP address part of the server address because
// we talk to the same server on multiple ports, and don't want
// to blow up the cardinality.
// Note: validateServerAddress() has already checked net.SplitHostPort()
// and ensures that chosenServer can't be a bare port, e.g. ":1337"
chosenServerIP, _, err = net.SplitHostPort(chosenServer)
if err != nil {
return
}
go func() { go func() {
rsp, rtt, err := client.Exchange(m, chosenServer) rsp, rtt, err := client.Exchange(m, chosenServer)
result, authenticated := "failed", "" result := "failed"
if rsp != nil { if rsp != nil {
result = dns.RcodeToString[rsp.Rcode] result = dns.RcodeToString[rsp.Rcode]
authenticated = fmt.Sprintf("%t", rsp.AuthenticatedData)
} }
if err != nil { if err != nil {
logDNSError(dnsClient.log, chosenServer, hostname, m, rsp, err) logDNSError(dnsClient.log, chosenServer, hostname, m, rsp, err)
if err == dns.ErrId { if err == dns.ErrId {
dnsClient.idMismatchCounter.With(prometheus.Labels{ dnsClient.idMismatchCounter.With(prometheus.Labels{
"qtype": qtypeStr, "qtype": qtypeStr,
"resolver": chosenServer, "resolver": chosenServerIP,
}).Inc() }).Inc()
} }
} }
dnsClient.queryTime.With(prometheus.Labels{ dnsClient.queryTime.With(prometheus.Labels{
"qtype": qtypeStr, "qtype": qtypeStr,
"result": result, "result": result,
"authenticated_data": authenticated, "resolver": chosenServerIP,
"resolver": chosenServer,
}).Observe(rtt.Seconds()) }).Observe(rtt.Seconds())
ch <- dnsResp{m: rsp, err: err} ch <- dnsResp{m: rsp, err: err}
}() }()
@ -328,21 +342,21 @@ func (dnsClient *impl) exchangeOne(ctx context.Context, hostname string, qtype u
dnsClient.timeoutCounter.With(prometheus.Labels{ dnsClient.timeoutCounter.With(prometheus.Labels{
"qtype": qtypeStr, "qtype": qtypeStr,
"type": "deadline exceeded", "type": "deadline exceeded",
"resolver": chosenServer, "resolver": chosenServerIP,
"isTLD": isTLD(hostname), "isTLD": isTLD(hostname),
}).Inc() }).Inc()
} else if ctx.Err() == context.Canceled { } else if ctx.Err() == context.Canceled {
dnsClient.timeoutCounter.With(prometheus.Labels{ dnsClient.timeoutCounter.With(prometheus.Labels{
"qtype": qtypeStr, "qtype": qtypeStr,
"type": "canceled", "type": "canceled",
"resolver": chosenServer, "resolver": chosenServerIP,
"isTLD": isTLD(hostname), "isTLD": isTLD(hostname),
}).Inc() }).Inc()
} else { } else {
dnsClient.timeoutCounter.With(prometheus.Labels{ dnsClient.timeoutCounter.With(prometheus.Labels{
"qtype": qtypeStr, "qtype": qtypeStr,
"type": "unknown", "type": "unknown",
"resolver": chosenServer, "resolver": chosenServerIP,
}).Inc() }).Inc()
} }
err = ctx.Err() err = ctx.Err()
@ -366,7 +380,7 @@ func (dnsClient *impl) exchangeOne(ctx context.Context, hostname string, qtype u
dnsClient.timeoutCounter.With(prometheus.Labels{ dnsClient.timeoutCounter.With(prometheus.Labels{
"qtype": qtypeStr, "qtype": qtypeStr,
"type": "out of retries", "type": "out of retries",
"resolver": chosenServer, "resolver": chosenServerIP,
"isTLD": isTLD(hostname), "isTLD": isTLD(hostname),
}).Inc() }).Inc()
} }

View File

@ -647,7 +647,7 @@ func TestRetry(t *testing.T) {
t, dr.timeoutCounter, prometheus.Labels{ t, dr.timeoutCounter, prometheus.Labels{
"qtype": "TXT", "qtype": "TXT",
"type": "out of retries", "type": "out of retries",
"resolver": dnsLoopbackAddr, "resolver": "127.0.0.1",
"isTLD": "false", "isTLD": "false",
}, tc.metricsAllRetries) }, tc.metricsAllRetries)
} }
@ -690,14 +690,14 @@ func TestRetry(t *testing.T) {
t, dr.timeoutCounter, prometheus.Labels{ t, dr.timeoutCounter, prometheus.Labels{
"qtype": "TXT", "qtype": "TXT",
"type": "canceled", "type": "canceled",
"resolver": dnsLoopbackAddr, "resolver": "127.0.0.1",
}, 1) }, 1)
test.AssertMetricWithLabelsEquals( test.AssertMetricWithLabelsEquals(
t, dr.timeoutCounter, prometheus.Labels{ t, dr.timeoutCounter, prometheus.Labels{
"qtype": "TXT", "qtype": "TXT",
"type": "deadline exceeded", "type": "deadline exceeded",
"resolver": dnsLoopbackAddr, "resolver": "127.0.0.1",
}, 2) }, 2)
} }