grpc: Allow for some SRV resolution failures (#7014)
Allow the gRPC SRV resolver to succeed even when some names are not resolved successfully. Cross-DC services (e.g. nonce) will fail to resolve when the link between DCs is severed or one DC is taken offline; this should not result in hard gRPC service failures. Fixes #6974
This commit is contained in:
parent
0a3ce04d38
commit
e7cb74b5f8
|
|
@ -24,6 +24,7 @@ package dns
|
|||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"net"
|
||||
"strconv"
|
||||
|
|
@ -228,33 +229,40 @@ func (d *dnsResolver) watcher() {
|
|||
|
||||
func (d *dnsResolver) lookupSRV() ([]resolver.Address, error) {
|
||||
var newAddrs []resolver.Address
|
||||
var errs []error
|
||||
for _, n := range d.names {
|
||||
_, srvs, err := d.resolver.LookupSRV(d.ctx, n.service, "tcp", n.domain)
|
||||
if err != nil {
|
||||
err = handleDNSError(err, "SRV") // may become nil
|
||||
return nil, err
|
||||
if err != nil {
|
||||
errs = append(errs, err)
|
||||
continue
|
||||
}
|
||||
}
|
||||
for _, s := range srvs {
|
||||
backendAddrs, err := d.resolver.LookupHost(d.ctx, s.Target)
|
||||
if err != nil {
|
||||
err = handleDNSError(err, "A") // may become nil
|
||||
if err == nil {
|
||||
// If there are other SRV records, look them up and ignore this
|
||||
// one that does not exist.
|
||||
if err != nil {
|
||||
errs = append(errs, err)
|
||||
continue
|
||||
}
|
||||
return nil, err
|
||||
}
|
||||
for _, a := range backendAddrs {
|
||||
ip, ok := formatIP(a)
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("srv: error parsing A record IP address %v", a)
|
||||
errs = append(errs, fmt.Errorf("srv: error parsing A record IP address %v", a))
|
||||
continue
|
||||
}
|
||||
addr := ip + ":" + strconv.Itoa(int(s.Port))
|
||||
newAddrs = append(newAddrs, resolver.Address{Addr: addr, ServerName: s.Target})
|
||||
}
|
||||
}
|
||||
}
|
||||
// Only return an error if all lookups failed.
|
||||
if len(errs) > 0 && len(newAddrs) == 0 {
|
||||
return nil, errors.Join(errs...)
|
||||
}
|
||||
return newAddrs, nil
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -348,3 +348,84 @@ services {
|
|||
port = 9092
|
||||
tags = ["tcp"] // Required for SRV RR support in gRPC DNS resolution.
|
||||
}
|
||||
|
||||
//
|
||||
// The following services are used for testing the gRPC DNS resolver.
|
||||
//
|
||||
|
||||
// CaseOne config will have 2 SRV records. The first will have 0 backends, the
|
||||
// second will have 1.
|
||||
services {
|
||||
id = "case1a"
|
||||
name = "case1a"
|
||||
address = "10.77.77.77"
|
||||
port = 9101
|
||||
tags = ["tcp"] // Required for SRV RR support in gRPC DNS resolution.
|
||||
checks = [
|
||||
{
|
||||
id = "case1a-failing"
|
||||
name = "case1a-failing"
|
||||
http = "http://localhost:12345" // invalid url
|
||||
method = "GET"
|
||||
interval = "2s"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
services {
|
||||
id = "case1b"
|
||||
name = "case1b"
|
||||
address = "10.88.88.88"
|
||||
port = 9101
|
||||
tags = ["tcp"] // Required for SRV RR support in gRPC DNS resolution.
|
||||
}
|
||||
|
||||
// CaseTwo config will have 2 SRV records. The first will not be configured in
|
||||
// Consul, the second will have 1 backend.
|
||||
services {
|
||||
id = "case2b"
|
||||
name = "case2b"
|
||||
address = "10.88.88.88"
|
||||
port = 9101
|
||||
tags = ["tcp"] // Required for SRV RR support in gRPC DNS resolution.
|
||||
}
|
||||
|
||||
// CaseThree config will have 2 SRV records. Neither will be configured in
|
||||
// Consul.
|
||||
|
||||
|
||||
// CaseFour config will have 2 SRV records. Neither will have backends.
|
||||
services {
|
||||
id = "case4a"
|
||||
name = "case4a"
|
||||
address = "10.77.77.77"
|
||||
port = 9101
|
||||
tags = ["tcp"] // Required for SRV RR support in gRPC DNS resolution.
|
||||
// NOTE(review): this check polls a URL marked invalid below; presumably it
// keeps the service failing so that it contributes no backends -- confirm.
checks = [
|
||||
{
|
||||
id = "case4a-failing"
|
||||
name = "case4a-failing"
|
||||
http = "http://localhost:12345" // invalid url
|
||||
method = "GET"
|
||||
interval = "2s"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
services {
|
||||
id = "case4b"
|
||||
name = "case4b"
|
||||
address = "10.88.88.88"
|
||||
port = 9101
|
||||
tags = ["tcp"] // Required for SRV RR support in gRPC DNS resolution.
|
||||
checks = [
|
||||
{
|
||||
id = "case4b-failing"
|
||||
name = "case4b-failing"
|
||||
http = "http://localhost:12345" // invalid url
|
||||
method = "GET"
|
||||
interval = "2s"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,121 @@
|
|||
//go:build integration
|
||||
|
||||
package integration
|
||||
|
||||
import (
|
||||
"context"
|
||||
"testing"
|
||||
|
||||
"github.com/jmhodges/clock"
|
||||
"google.golang.org/protobuf/types/known/emptypb"
|
||||
|
||||
"github.com/letsencrypt/boulder/cmd"
|
||||
bgrpc "github.com/letsencrypt/boulder/grpc"
|
||||
"github.com/letsencrypt/boulder/metrics"
|
||||
"github.com/letsencrypt/boulder/nonce"
|
||||
"github.com/letsencrypt/boulder/test"
|
||||
)
|
||||
|
||||
// conf is the structure the SRV resolver tests decode
// test/integration/testdata/srv-resolver-config.json into: one TLS config
// plus one gRPC client config per test case.
type conf struct {
|
||||
WebFooEnd struct {
|
||||
TLS cmd.TLSConfig
|
||||
// CaseOne config will have 2 SRV records. The first will have 0
|
||||
// backends, the second will have 1.
|
||||
CaseOne *cmd.GRPCClientConfig
|
||||
|
||||
// CaseTwo config will have 2 SRV records. The first will not be
|
||||
// configured in Consul, the second will have 1 backend.
|
||||
CaseTwo *cmd.GRPCClientConfig
|
||||
|
||||
// CaseThree config will have 2 SRV records. Neither will be configured
|
||||
// in Consul.
|
||||
CaseThree *cmd.GRPCClientConfig
|
||||
|
||||
// CaseFour config will have 2 SRV records. Neither will have backends.
|
||||
CaseFour *cmd.GRPCClientConfig
|
||||
}
|
||||
}
|
||||
|
||||
// TestSRVResolver_CaseOne builds a gRPC client from the CaseOne config and
// checks that a nonce request succeeds when only the second of the two SRV
// names has a backend.
func TestSRVResolver_CaseOne(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
var c conf
|
||||
err := cmd.ReadConfigFile("test/integration/testdata/srv-resolver-config.json", &c)
|
||||
test.AssertNotError(t, err, "Could not read config file")
|
||||
|
||||
tlsConfig, err := c.WebFooEnd.TLS.Load(metrics.NoopRegisterer)
|
||||
test.AssertNotError(t, err, "Could not load TLS config")
|
||||
clk := clock.New()
|
||||
|
||||
getNonceConn, err := bgrpc.ClientSetup(c.WebFooEnd.CaseOne, tlsConfig, metrics.NoopRegisterer, clk)
|
||||
test.AssertNotError(t, err, "Could not set up gRPC client")
|
||||
|
||||
// This should succeed, even though the first SRV record has no backends.
|
||||
gnc := nonce.NewGetter(getNonceConn)
|
||||
_, err = gnc.Nonce(context.Background(), &emptypb.Empty{})
|
||||
test.AssertNotError(t, err, "Unexpected error getting nonce")
|
||||
}
|
||||
|
||||
// TestSRVResolver_CaseTwo builds a gRPC client from the CaseTwo config and
// checks that a nonce request succeeds when the first of the two SRV names
// is not configured in Consul at all.
func TestSRVResolver_CaseTwo(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
var c conf
|
||||
err := cmd.ReadConfigFile("test/integration/testdata/srv-resolver-config.json", &c)
|
||||
test.AssertNotError(t, err, "Could not read config file")
|
||||
|
||||
tlsConfig, err := c.WebFooEnd.TLS.Load(metrics.NoopRegisterer)
|
||||
test.AssertNotError(t, err, "Could not load TLS config")
|
||||
clk := clock.New()
|
||||
|
||||
getNonceConn, err := bgrpc.ClientSetup(c.WebFooEnd.CaseTwo, tlsConfig, metrics.NoopRegisterer, clk)
|
||||
test.AssertNotError(t, err, "Could not set up gRPC client")
|
||||
|
||||
// This should succeed, even though the first SRV record is not configured
|
||||
// in Consul.
|
||||
gnc := nonce.NewGetter(getNonceConn)
|
||||
_, err = gnc.Nonce(context.Background(), &emptypb.Empty{})
|
||||
test.AssertNotError(t, err, "Unexpected error getting nonce")
|
||||
}
|
||||
|
||||
// TestSRVResolver_CaseThree builds a gRPC client from the CaseThree config,
// where neither SRV name is configured in Consul. Client setup is expected
// to succeed, but the nonce request must fail with a "produced zero
// addresses" resolver error.
func TestSRVResolver_CaseThree(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
var c conf
|
||||
err := cmd.ReadConfigFile("test/integration/testdata/srv-resolver-config.json", &c)
|
||||
test.AssertNotError(t, err, "Could not read config file")
|
||||
|
||||
tlsConfig, err := c.WebFooEnd.TLS.Load(metrics.NoopRegisterer)
|
||||
test.AssertNotError(t, err, "Could not load TLS config")
|
||||
clk := clock.New()
|
||||
|
||||
getNonceConn, err := bgrpc.ClientSetup(c.WebFooEnd.CaseThree, tlsConfig, metrics.NoopRegisterer, clk)
|
||||
test.AssertNotError(t, err, "Could not set up gRPC client")
|
||||
|
||||
// This should fail, neither SRV record is configured in Consul and the
|
||||
// resolver will not return any backends.
|
||||
gnc := nonce.NewGetter(getNonceConn)
|
||||
_, err = gnc.Nonce(context.Background(), &emptypb.Empty{})
|
||||
test.AssertError(t, err, "Expected error getting nonce")
|
||||
test.AssertContains(t, err.Error(), "last resolver error: produced zero addresses")
|
||||
}
|
||||
|
||||
// TestSRVResolver_CaseFour builds a gRPC client from the CaseFour config,
// where neither SRV name resolves to any backend. Client setup is expected
// to succeed, but the nonce request must fail with a "produced zero
// addresses" resolver error.
func TestSRVResolver_CaseFour(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
var c conf
|
||||
err := cmd.ReadConfigFile("test/integration/testdata/srv-resolver-config.json", &c)
|
||||
test.AssertNotError(t, err, "Could not read config file")
|
||||
|
||||
tlsConfig, err := c.WebFooEnd.TLS.Load(metrics.NoopRegisterer)
|
||||
test.AssertNotError(t, err, "Could not load TLS config")
|
||||
clk := clock.New()
|
||||
|
||||
getNonceConn4, err := bgrpc.ClientSetup(c.WebFooEnd.CaseFour, tlsConfig, metrics.NoopRegisterer, clk)
|
||||
test.AssertNotError(t, err, "Could not set up gRPC client")
|
||||
|
||||
// This should fail, neither SRV record resolves to backends.
|
||||
gnc := nonce.NewGetter(getNonceConn4)
|
||||
_, err = gnc.Nonce(context.Background(), &emptypb.Empty{})
|
||||
test.AssertError(t, err, "Expected error getting nonce")
|
||||
test.AssertContains(t, err.Error(), "last resolver error: produced zero addresses")
|
||||
}
|
||||
|
|
@ -0,0 +1,73 @@
|
|||
{
|
||||
"webFooEnd": {
|
||||
"tls": {
|
||||
"caCertFile": "test/grpc-creds/minica.pem",
|
||||
"certFile": "test/grpc-creds/wfe.boulder/cert.pem",
|
||||
"keyFile": "test/grpc-creds/wfe.boulder/key.pem"
|
||||
},
|
||||
"caseOne": {
|
||||
"dnsAuthority": "consul.service.consul",
|
||||
"srvLookups": [
|
||||
{
|
||||
"service": "case1a",
|
||||
"domain": "service.consul"
|
||||
},
|
||||
{
|
||||
"service": "case1b",
|
||||
"domain": "service.consul"
|
||||
}
|
||||
],
|
||||
"timeout": "15s",
|
||||
"noWaitForReady": true,
|
||||
"hostOverride": "nonce.boulder"
|
||||
},
|
||||
"caseTwo": {
|
||||
"dnsAuthority": "consul.service.consul",
|
||||
"srvLookups": [
|
||||
{
|
||||
"service": "case2a",
|
||||
"domain": "service.consul"
|
||||
},
|
||||
{
|
||||
"service": "case2b",
|
||||
"domain": "service.consul"
|
||||
}
|
||||
],
|
||||
"timeout": "15s",
|
||||
"noWaitForReady": true,
|
||||
"hostOverride": "nonce.boulder"
|
||||
},
|
||||
"caseThree": {
|
||||
"dnsAuthority": "consul.service.consul",
|
||||
"srvLookups": [
|
||||
{
|
||||
"service": "case3a",
|
||||
"domain": "service.consul"
|
||||
},
|
||||
{
|
||||
"service": "case3b",
|
||||
"domain": "service.consul"
|
||||
}
|
||||
],
|
||||
"timeout": "15s",
|
||||
"noWaitForReady": true,
|
||||
"hostOverride": "nonce.boulder"
|
||||
},
|
||||
"caseFour": {
|
||||
"dnsAuthority": "consul.service.consul",
|
||||
"srvLookups": [
|
||||
{
|
||||
"service": "case4a",
|
||||
"domain": "service.consul"
|
||||
},
|
||||
{
|
||||
"service": "case4b",
|
||||
"domain": "service.consul"
|
||||
}
|
||||
],
|
||||
"timeout": "15s",
|
||||
"noWaitForReady": true,
|
||||
"hostOverride": "nonce.boulder"
|
||||
}
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue