grpc: Allow for some SRV resolution failures (#7014)

Allow the gRPC SRV resolver to succeed even when some names are not resolved
successfully. Cross-DC services (e.g. nonce) will fail to resolve when
the link between DCs is severed or one DC is taken offline; this should
not result in hard gRPC service failures.

Fixes #6974
This commit is contained in:
Samantha 2023-08-01 12:55:05 -04:00 committed by GitHub
parent 0a3ce04d38
commit e7cb74b5f8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 289 additions and 6 deletions

View File

@ -24,6 +24,7 @@ package dns
import (
"context"
"errors"
"fmt"
"net"
"strconv"
@ -228,33 +229,40 @@ func (d *dnsResolver) watcher() {
func (d *dnsResolver) lookupSRV() ([]resolver.Address, error) {
var newAddrs []resolver.Address
var errs []error
for _, n := range d.names {
_, srvs, err := d.resolver.LookupSRV(d.ctx, n.service, "tcp", n.domain)
if err != nil {
err = handleDNSError(err, "SRV") // may become nil
return nil, err
if err != nil {
errs = append(errs, err)
continue
}
}
for _, s := range srvs {
backendAddrs, err := d.resolver.LookupHost(d.ctx, s.Target)
if err != nil {
err = handleDNSError(err, "A") // may become nil
if err == nil {
// If there are other SRV records, look them up and ignore this
// one that does not exist.
if err != nil {
errs = append(errs, err)
continue
}
return nil, err
}
for _, a := range backendAddrs {
ip, ok := formatIP(a)
if !ok {
return nil, fmt.Errorf("srv: error parsing A record IP address %v", a)
errs = append(errs, fmt.Errorf("srv: error parsing A record IP address %v", a))
continue
}
addr := ip + ":" + strconv.Itoa(int(s.Port))
newAddrs = append(newAddrs, resolver.Address{Addr: addr, ServerName: s.Target})
}
}
}
// Only return an error if all lookups failed.
if len(errs) > 0 && len(newAddrs) == 0 {
return nil, errors.Join(errs...)
}
return newAddrs, nil
}

View File

@ -348,3 +348,84 @@ services {
port = 9092
tags = ["tcp"] // Required for SRV RR support in gRPC DNS resolution.
}
//
// The following services are used for testing the gRPC DNS resolver.
//
// CaseOne config will have 2 SRV records. The first will have 0 backends, the
// second will have 1.
// case1a is the first of the two services queried by the CaseOne SRV resolver
// test. Its only health check is named "case1a-failing"; per the CaseOne
// description it is meant to leave this service with zero backends.
// NOTE(review): presumably Consul omits instances with failing checks from
// its DNS answers — confirm.
services {
id = "case1a"
name = "case1a"
address = "10.77.77.77"
port = 9101
tags = ["tcp"] // Required for SRV RR support in gRPC DNS resolution.
checks = [
{
id = "case1a-failing"
name = "case1a-failing"
http = "http://localhost:12345" // presumably nothing listens here, so the GET fails — TODO confirm
method = "GET"
interval = "2s"
}
]
}
// case1b is the second service queried by the CaseOne SRV resolver test. It
// declares no health checks and is the one intended to supply the single
// backend for that case.
services {
id = "case1b"
name = "case1b"
address = "10.88.88.88"
port = 9101
tags = ["tcp"] // Required for SRV RR support in gRPC DNS resolution.
}
// CaseTwo config will have 2 SRV records. The first will not be configured in
// Consul, the second will have 1 backend.
// case2b is the only CaseTwo service registered here; its counterpart
// (case2a in the test's SRV lookups) is deliberately left undefined. It
// declares no health checks and is intended to supply the single backend.
services {
id = "case2b"
name = "case2b"
address = "10.88.88.88"
port = 9101
tags = ["tcp"] // Required for SRV RR support in gRPC DNS resolution.
}
// CaseThree config will have 2 SRV records. Neither will be configured in
// Consul.
// CaseFour config will have 2 SRV records. Neither will have backends.
// case4a is the first of the two services queried by the CaseFour SRV
// resolver test. Its only health check is named "case4a-failing"; per the
// CaseFour description it is meant to leave this service with zero backends.
// NOTE(review): presumably Consul omits instances with failing checks from
// its DNS answers — confirm.
services {
  id      = "case4a"
  name    = "case4a"
  address = "10.77.77.77"
  port    = 9101
  tags    = ["tcp"] // Required for SRV RR support in gRPC DNS resolution.
  checks = [
    {
      id       = "case4a-failing"
      name     = "case4a-failing"
      http     = "http://localhost:12345" // presumably nothing listens here, so the GET fails — TODO confirm
      method   = "GET"
      interval = "2s"
    }
  ]
}
// case4b is the second service queried by the CaseFour SRV resolver test. Like
// case4a, its only health check is named as failing ("case4b-failing"), so
// that neither CaseFour service is meant to end up with a backend.
// NOTE(review): presumably Consul omits instances with failing checks from
// its DNS answers — confirm.
services {
id = "case4b"
name = "case4b"
address = "10.88.88.88"
port = 9101
tags = ["tcp"] // Required for SRV RR support in gRPC DNS resolution.
checks = [
{
id = "case4b-failing"
name = "case4b-failing"
http = "http://localhost:12345" // presumably nothing listens here, so the GET fails — TODO confirm
method = "GET"
interval = "2s"
}
]
}

View File

@ -0,0 +1,121 @@
//go:build integration
package integration
import (
"context"
"testing"
"github.com/jmhodges/clock"
"google.golang.org/protobuf/types/known/emptypb"
"github.com/letsencrypt/boulder/cmd"
bgrpc "github.com/letsencrypt/boulder/grpc"
"github.com/letsencrypt/boulder/metrics"
"github.com/letsencrypt/boulder/nonce"
"github.com/letsencrypt/boulder/test"
)
// conf is the shape the SRV resolver tests decode srv-resolver-config.json
// into. Everything lives under a single WebFooEnd section: one TLS config used
// by every test plus one gRPC client config per scenario.
type conf struct {
	WebFooEnd struct {
		// TLS is loaded by each test to build the client TLS configuration.
		TLS cmd.TLSConfig

		// CaseOne: two SRV records; the first yields no backends, the second
		// yields exactly one.
		CaseOne *cmd.GRPCClientConfig

		// CaseTwo: two SRV records; the first is absent from Consul, the
		// second yields exactly one backend.
		CaseTwo *cmd.GRPCClientConfig

		// CaseThree: two SRV records, both absent from Consul.
		CaseThree *cmd.GRPCClientConfig

		// CaseFour: two SRV records, both present but without any backends.
		CaseFour *cmd.GRPCClientConfig
	}
}
// TestSRVResolver_CaseOne builds a gRPC client from the CaseOne config, where
// the first SRV record has no backends and the second has one, and checks
// that a nonce request through it succeeds.
func TestSRVResolver_CaseOne(t *testing.T) {
	t.Parallel()

	cfg := conf{}
	err := cmd.ReadConfigFile("test/integration/testdata/srv-resolver-config.json", &cfg)
	test.AssertNotError(t, err, "Could not read config file")

	tlsConf, err := cfg.WebFooEnd.TLS.Load(metrics.NoopRegisterer)
	test.AssertNotError(t, err, "Could not load TLS config")

	conn, err := bgrpc.ClientSetup(cfg.WebFooEnd.CaseOne, tlsConf, metrics.NoopRegisterer, clock.New())
	test.AssertNotError(t, err, "Could not set up gRPC client")

	// A single SRV record with a backend is enough; the empty first record
	// must not make the call fail.
	getter := nonce.NewGetter(conn)
	_, err = getter.Nonce(context.Background(), &emptypb.Empty{})
	test.AssertNotError(t, err, "Unexpected error getting nonce")
}
// TestSRVResolver_CaseTwo builds a gRPC client from the CaseTwo config, where
// the first SRV record is not configured in Consul and the second has one
// backend, and checks that a nonce request through it succeeds.
func TestSRVResolver_CaseTwo(t *testing.T) {
	t.Parallel()

	cfg := conf{}
	err := cmd.ReadConfigFile("test/integration/testdata/srv-resolver-config.json", &cfg)
	test.AssertNotError(t, err, "Could not read config file")

	tlsConf, err := cfg.WebFooEnd.TLS.Load(metrics.NoopRegisterer)
	test.AssertNotError(t, err, "Could not load TLS config")

	conn, err := bgrpc.ClientSetup(cfg.WebFooEnd.CaseTwo, tlsConf, metrics.NoopRegisterer, clock.New())
	test.AssertNotError(t, err, "Could not set up gRPC client")

	// The unconfigured first record must be skipped rather than treated as a
	// fatal resolution failure.
	getter := nonce.NewGetter(conn)
	_, err = getter.Nonce(context.Background(), &emptypb.Empty{})
	test.AssertNotError(t, err, "Unexpected error getting nonce")
}
// TestSRVResolver_CaseThree builds a gRPC client from the CaseThree config,
// where neither SRV record is configured in Consul, and checks that a nonce
// request fails with the resolver's "produced zero addresses" error.
func TestSRVResolver_CaseThree(t *testing.T) {
	t.Parallel()

	cfg := conf{}
	err := cmd.ReadConfigFile("test/integration/testdata/srv-resolver-config.json", &cfg)
	test.AssertNotError(t, err, "Could not read config file")

	tlsConf, err := cfg.WebFooEnd.TLS.Load(metrics.NoopRegisterer)
	test.AssertNotError(t, err, "Could not load TLS config")

	conn, err := bgrpc.ClientSetup(cfg.WebFooEnd.CaseThree, tlsConf, metrics.NoopRegisterer, clock.New())
	test.AssertNotError(t, err, "Could not set up gRPC client")

	// With no SRV record known to Consul the resolver has nothing to hand
	// back, so the RPC is expected to error out.
	getter := nonce.NewGetter(conn)
	_, err = getter.Nonce(context.Background(), &emptypb.Empty{})
	test.AssertError(t, err, "Expected error getting nonce")
	test.AssertContains(t, err.Error(), "last resolver error: produced zero addresses")
}
// TestSRVResolver_CaseFour builds a gRPC client from the CaseFour config,
// where both SRV records exist but neither has backends, and checks that a
// nonce request fails with the resolver's "produced zero addresses" error.
func TestSRVResolver_CaseFour(t *testing.T) {
	t.Parallel()

	cfg := conf{}
	err := cmd.ReadConfigFile("test/integration/testdata/srv-resolver-config.json", &cfg)
	test.AssertNotError(t, err, "Could not read config file")

	tlsConf, err := cfg.WebFooEnd.TLS.Load(metrics.NoopRegisterer)
	test.AssertNotError(t, err, "Could not load TLS config")

	conn, err := bgrpc.ClientSetup(cfg.WebFooEnd.CaseFour, tlsConf, metrics.NoopRegisterer, clock.New())
	test.AssertNotError(t, err, "Could not set up gRPC client")

	// Neither record resolves to a backend, so the RPC is expected to error
	// out.
	getter := nonce.NewGetter(conn)
	_, err = getter.Nonce(context.Background(), &emptypb.Empty{})
	test.AssertError(t, err, "Expected error getting nonce")
	test.AssertContains(t, err.Error(), "last resolver error: produced zero addresses")
}

View File

@ -0,0 +1,73 @@
{
"webFooEnd": {
"tls": {
"caCertFile": "test/grpc-creds/minica.pem",
"certFile": "test/grpc-creds/wfe.boulder/cert.pem",
"keyFile": "test/grpc-creds/wfe.boulder/key.pem"
},
"caseOne": {
"dnsAuthority": "consul.service.consul",
"srvLookups": [
{
"service": "case1a",
"domain": "service.consul"
},
{
"service": "case1b",
"domain": "service.consul"
}
],
"timeout": "15s",
"noWaitForReady": true,
"hostOverride": "nonce.boulder"
},
"caseTwo": {
"dnsAuthority": "consul.service.consul",
"srvLookups": [
{
"service": "case2a",
"domain": "service.consul"
},
{
"service": "case2b",
"domain": "service.consul"
}
],
"timeout": "15s",
"noWaitForReady": true,
"hostOverride": "nonce.boulder"
},
"caseThree": {
"dnsAuthority": "consul.service.consul",
"srvLookups": [
{
"service": "case3a",
"domain": "service.consul"
},
{
"service": "case3b",
"domain": "service.consul"
}
],
"timeout": "15s",
"noWaitForReady": true,
"hostOverride": "nonce.boulder"
},
"caseFour": {
"dnsAuthority": "consul.service.consul",
"srvLookups": [
{
"service": "case4a",
"domain": "service.consul"
},
{
"service": "case4b",
"domain": "service.consul"
}
],
"timeout": "15s",
"noWaitForReady": true,
"hostOverride": "nonce.boulder"
}
}
}