Start using multiple-IP hostnames for load balancing (#3687)

We'd like to start using the DNS load balancer in the latest version of gRPC. That means putting all IPs for a service under a single hostname (or using an SRV record, but we're not taking that path). This change adds sd-test-srv to act as our service discovery DNS service. It returns both Boulder IP addresses for any A lookup on a name ending in ".boulder". This change also configures the Docker DNS for our boulder container to forward queries it can't answer to sd-test-srv.
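
As a concrete illustration (not part of this change), here is a minimal Go sketch of the lookup behavior sd-test-srv provides, assuming it is reachable at 10.77.77.77:53 as configured in docker-compose.yml, and using `sa.boulder` as an example name:

```go
package main

import (
	"context"
	"fmt"
	"log"
	"net"
	"time"
)

func main() {
	// Send every query from this resolver to sd-test-srv instead of the
	// system resolver.
	r := &net.Resolver{
		PreferGo: true,
		Dial: func(ctx context.Context, network, address string) (net.Conn, error) {
			d := net.Dialer{Timeout: time.Second}
			return d.DialContext(ctx, network, "10.77.77.77:53")
		},
	}
	addrs, err := r.LookupHost(context.Background(), "sa.boulder")
	if err != nil {
		log.Fatal(err)
	}
	// Any A lookup for a ".boulder" name returns both Boulder IPs.
	fmt.Println(addrs) // e.g. [10.77.77.77 10.88.88.88] (order may vary)
}
```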

sd-test-srv doesn't know how to resolve public Internet names like `github.com`. Resolving public names is required for the `godep-restore` test phase, so this change breaks out a separate copy of the boulder container (the `netaccess` service in docker-compose.yml) that is used only for `godep-restore`.

This change implements a DNS resolver shim for gRPC so that we can switch to DNS-based load balancing with the currently vendored gRPC; then, when we upgrade to the latest gRPC, we won't need a simultaneous config update.

Also, this change introduces a check at the end of the integration test that each backend received at least one RPC, ensuring that we are not sending all load to a single backend.
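
The check itself lives in the integration test's `check_balance` function (Python, shown in the diff below). As a rough sketch of the same idea in Go, using a hypothetical shortened backend list:

```go
package main

import (
	"fmt"
	"io/ioutil"
	"log"
	"net/http"
	"strings"
)

func main() {
	// Hypothetical subset of backends; the real list enumerates every
	// sa/ra/ca/va/publisher debug address.
	backends := []string{"sa1.boulder:8003", "sa2.boulder:8103"}
	for _, addr := range backends {
		resp, err := http.Get(fmt.Sprintf("http://%s/metrics", addr))
		if err != nil {
			log.Fatal(err)
		}
		body, err := ioutil.ReadAll(resp.Body)
		resp.Body.Close()
		if err != nil {
			log.Fatal(err)
		}
		// grpc_server_handled_total only appears once a backend has
		// handled at least one RPC.
		if !strings.Contains(string(body), "grpc_server_handled_total") {
			log.Fatalf("no gRPC traffic processed by %s; load balancing problem?", addr)
		}
	}
}
```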
Jacob Hoffman-Andrews 2018-05-23 06:47:14 -07:00 committed by Daniel McCarney
parent ef0324727d
commit dbcb16543e
21 changed files with 272 additions and 54 deletions


@@ -34,18 +34,22 @@ env:
#
# Current Go version build tasks:
#
- RUN="vet fmt migrations integration godep-restore errcheck generate dashlint rpm"
- RUN="vet fmt migrations integration errcheck generate dashlint rpm"
# Config changes that have landed in master but not yet been applied to
# production can be made in boulder-config-next.json.
- RUN="integration" BOULDER_CONFIG_DIR="test/config-next"
- RUN="unit"
- RUN="unit-next" BOULDER_CONFIG_DIR="test/config-next"
- RUN="coverage"
# godep-restore runs with a separate container because it needs to fetch
# packages from GitHub et al., which is incompatible with the DNS server
# override in the boulder container (used for service discovery).
- RUN="godep-restore" CONTAINER="netaccess"
- RUN="coverage" CONTAINER="netaccess"
matrix:
fast_finish: true
allow_failures:
- env: RUN="coverage"
- env: RUN="coverage" CONTAINER="netaccess"
# We require a newer version of docker-compose than is installed by way of the
# "services: docker" directive. Per the travis docs[0] this is best remedied
@@ -63,4 +67,4 @@ install:
- $HOME/bin/docker-compose pull
script:
- $HOME/bin/docker-compose run --use-aliases -e BOULDER_CONFIG_DIR="${BOULDER_CONFIG_DIR}" -e RUN="${RUN}" -e TRAVIS="${TRAVIS}" -e TRAVIS_COMMIT="${TRAVIS_COMMIT}" -e TRAVIS_PULL_REQUEST="${TRAVIS_PULL_REQUEST}" -e TRAVIS_PULL_REQUEST="${TRAVIS_PULL_REQUEST}" -e TRAVIS_JOB_ID="${TRAVIS_JOB_ID}" -e COVERALLS_TOKEN="${COVERALLS_TOKEN}" boulder ./test.sh
- $HOME/bin/docker-compose run --use-aliases -e BOULDER_CONFIG_DIR="${BOULDER_CONFIG_DIR}" -e RUN="${RUN}" -e TRAVIS="${TRAVIS}" -e TRAVIS_COMMIT="${TRAVIS_COMMIT}" -e TRAVIS_PULL_REQUEST="${TRAVIS_PULL_REQUEST}" -e TRAVIS_PULL_REQUEST="${TRAVIS_PULL_REQUEST}" -e TRAVIS_JOB_ID="${TRAVIS_JOB_ID}" -e COVERALLS_TOKEN="${COVERALLS_TOKEN}" ${CONTAINER:-boulder} ./test.sh


@@ -29,6 +29,13 @@ services:
- ra2.boulder
- va2.boulder
- publisher2.boulder
# Use sd-test-srv as a backup to Docker's embedded DNS server
# (https://docs.docker.com/config/containers/container-networking/#dns-services).
# If there's a name Docker's DNS server doesn't know about, it will
# forward the query to this IP (running sd-test-srv). We have
# special logic there that will return multiple IP addresses for
# service names.
dns: 10.77.77.77
ports:
- 4000:4000 # ACME
- 4001:4001 # ACMEv2
@@ -65,6 +72,16 @@ services:
command: mysqld --bind-address=0.0.0.0
logging:
driver: none
netaccess:
image: letsencrypt/boulder-tools-go${TRAVIS_GO_VERSION:-1.10.2}:2018-05-04
networks:
- bluenet
volumes:
- .:/go/src/github.com/letsencrypt/boulder
working_dir: /go/src/github.com/letsencrypt/boulder
entrypoint: test/entrypoint.sh
depends_on:
- bmysql
networks:
bluenet:


@@ -3,6 +3,7 @@ package grpc
import (
"crypto/tls"
"fmt"
"net"
"github.com/grpc-ecosystem/go-grpc-prometheus"
"github.com/jmhodges/clock"
@@ -26,13 +27,31 @@ func ClientSetup(c *cmd.GRPCClientConfig, tls *tls.Config, metrics clientMetrics
}
ci := clientInterceptor{c.Timeout.Duration, metrics, clk}
creds := bcreds.NewClientCredentials(tls.RootCAs, tls.Certificates)
return grpc.Dial(
"", // Since our staticResolver provides addresses we don't need to pass an address here
grpc.WithTransportCredentials(creds),
grpc.WithBalancer(grpc.RoundRobin(newStaticResolver(c.ServerAddresses))),
grpc.WithUnaryInterceptor(ci.intercept),
)
// When there's only one server address, we use our custom newDNSResolver,
// intended as a temporary shim until we upgrade to a version of gRPC that has
// its own built-in DNS resolver. This works equally well when there's only
// one IP for a hostname or when there are multiple IPs for the hostname.
if len(c.ServerAddresses) == 1 {
host, port, err := net.SplitHostPort(c.ServerAddresses[0])
if err != nil {
return nil, err
}
creds := bcreds.NewClientCredentials(tls.RootCAs, tls.Certificates, host)
return grpc.Dial(
c.ServerAddresses[0],
grpc.WithTransportCredentials(creds),
grpc.WithBalancer(grpc.RoundRobin(newDNSResolver(host, port))),
grpc.WithUnaryInterceptor(ci.intercept),
)
} else {
creds := bcreds.NewClientCredentials(tls.RootCAs, tls.Certificates, "")
return grpc.Dial(
"", // Since our staticResolver provides addresses we don't need to pass an address here
grpc.WithTransportCredentials(creds),
grpc.WithBalancer(grpc.RoundRobin(newStaticResolver(c.ServerAddresses))),
grpc.WithUnaryInterceptor(ci.intercept),
)
}
}
type registry interface {


@@ -40,11 +40,14 @@ func (e SANNotAcceptedErr) Error() string {
type clientTransportCredentials struct {
roots *x509.CertPool
clients []tls.Certificate
// If set, this is used as the hostname to validate on certificates, instead
// of the value passed to ClientHandshake by grpc.
hostOverride string
}
// NewClientCredentials returns a new initialized grpc/credentials.TransportCredentials for client usage
func NewClientCredentials(rootCAs *x509.CertPool, clientCerts []tls.Certificate) credentials.TransportCredentials {
return &clientTransportCredentials{rootCAs, clientCerts}
func NewClientCredentials(rootCAs *x509.CertPool, clientCerts []tls.Certificate, hostOverride string) credentials.TransportCredentials {
return &clientTransportCredentials{rootCAs, clientCerts, hostOverride}
}
// ClientHandshake does the authentication handshake specified by the corresponding
@@ -52,11 +55,15 @@ func NewClientCredentials(rootCAs *x509.CertPool, clientCerts []tls.Certificate)
// connection and the corresponding auth information about the connection.
// Implementations must use the provided context to implement timely cancellation.
func (tc *clientTransportCredentials) ClientHandshake(ctx context.Context, addr string, rawConn net.Conn) (net.Conn, credentials.AuthInfo, error) {
// IMPORTANT: Don't wrap the errors returned from this method. gRPC expects to be
// able to check err.Temporary to spot temporary errors and reconnect when they happen.
host, _, err := net.SplitHostPort(addr)
if err != nil {
return nil, nil, err
var err error
host := tc.hostOverride
if host == "" {
// IMPORTANT: Don't wrap the errors returned from this method. gRPC expects to be
// able to check err.Temporary to spot temporary errors and reconnect when they happen.
host, _, err = net.SplitHostPort(addr)
if err != nil {
return nil, nil, err
}
}
conn := tls.Client(rawConn, &tls.Config{
ServerName: host,
@@ -107,7 +114,7 @@ func (tc *clientTransportCredentials) RequireTransportSecurity() bool {
// Clone returns a copy of the clientTransportCredentials
func (tc *clientTransportCredentials) Clone() credentials.TransportCredentials {
return NewClientCredentials(tc.roots, tc.clients)
return NewClientCredentials(tc.roots, tc.clients, tc.hostOverride)
}
// OverrideServerName is not implemented and here only to satisfy the interface


@@ -113,7 +113,7 @@ func TestClientTransportCredentials(t *testing.T) {
serverB := httptest.NewUnstartedServer(nil)
serverB.TLS = &tls.Config{Certificates: []tls.Certificate{{Certificate: [][]byte{derB}, PrivateKey: priv}}}
tc := NewClientCredentials(roots, []tls.Certificate{})
tc := NewClientCredentials(roots, []tls.Certificate{}, "")
serverA.StartTLS()
defer serverA.Close()
@@ -195,7 +195,7 @@ func (bc *brokenConn) SetReadDeadline(time.Time) error { return nil }
func (bc *brokenConn) SetWriteDeadline(time.Time) error { return nil }
func TestClientReset(t *testing.T) {
tc := NewClientCredentials(nil, []tls.Certificate{})
tc := NewClientCredentials(nil, []tls.Certificate{}, "")
_, _, err := tc.ClientHandshake(context.Background(), "T:1010", &brokenConn{})
test.AssertError(t, err, "ClientHandshake succeeded with brokenConn")
_, ok := err.(interface {

grpc/dns_resolver.go (new file, 53 lines)

@@ -0,0 +1,53 @@
package grpc

import (
	"context"
	"net"

	"google.golang.org/grpc/naming"
)

// dnsResolver implements both the naming.Resolver and naming.Watcher
// interfaces. It's a temporary shim until we upgrade to the latest gRPC, which
// has a built-in DNS resolver. It looks up the hostname only once; it doesn't
// monitor for changes.
type dnsResolver struct {
	host, port string
	// ch is used to enforce the "lookup only once" behavior.
	ch chan bool
}

func newDNSResolver(host, port string) *dnsResolver {
	return &dnsResolver{
		host: host,
		port: port,
		ch:   make(chan bool, 1),
	}
}

func (dr *dnsResolver) Resolve(target string) (naming.Watcher, error) {
	return dr, nil
}

// Next is called in a loop by grpc.RoundRobin expecting updates. We provide a
// single update then block forever.
func (dr *dnsResolver) Next() ([]*naming.Update, error) {
	// Stick a value on the channel, which has capacity 1. This will succeed once,
	// then each subsequent call will block forever.
	dr.ch <- true
	addrs, err := net.DefaultResolver.LookupHost(context.Background(), dr.host)
	if err != nil {
		return nil, err
	}
	var updates []*naming.Update
	for _, ip := range addrs {
		updates = append(updates, &naming.Update{
			Op:   naming.Add,
			Addr: net.JoinHostPort(ip, dr.port),
		})
	}
	return updates, nil
}

// Close does nothing
func (dr *dnsResolver) Close() {}


@@ -8,11 +8,11 @@
"keyFile": "test/grpc-creds/admin-revoker.boulder/key.pem"
},
"raService": {
"serverAddresses": ["ra1.boulder:9094", "ra2.boulder:9094"],
"serverAddresses": ["ra.boulder:9094"],
"timeout": "15s"
},
"saService": {
"serverAddresses": ["sa1.boulder:9095", "sa2.boulder:9095"],
"serverAddresses": ["sa.boulder:9095"],
"timeout": "15s"
}
},


@@ -11,7 +11,7 @@
"keyFile": "test/grpc-creds/ca.boulder/key.pem"
},
"saService": {
"serverAddresses": ["sa1.boulder:9095", "sa2.boulder:9095"],
"serverAddresses": ["sa.boulder:9095"],
"timeout": "15s"
},
"grpcCA": {


@@ -18,7 +18,7 @@
"keyFile": "test/grpc-creds/expiration-mailer.boulder/key.pem"
},
"saService": {
"serverAddresses": ["sa1.boulder:9095", "sa2.boulder:9095"],
"serverAddresses": ["sa.boulder:9095"],
"timeout": "15s"
},
"SMTPTrustedRootFile": "test/mail-test-srv/minica.pem",


@@ -23,15 +23,15 @@
"keyFile": "test/grpc-creds/ocsp-updater.boulder/key.pem"
},
"publisher": {
"serverAddresses": ["publisher1.boulder:9091", "publisher2.boulder:9091"],
"serverAddresses": ["publisher.boulder:9091"],
"timeout": "10s"
},
"saService": {
"serverAddresses": ["sa1.boulder:9095", "sa2.boulder:9095"],
"serverAddresses": ["sa.boulder:9095"],
"timeout": "15s"
},
"ocspGeneratorService": {
"serverAddresses": ["ca1.boulder:9096", "ca2.boulder:9096"],
"serverAddresses": ["ca.boulder:9096"],
"timeout": "15s"
},
"features": {


@@ -12,7 +12,7 @@
},
"saService": {
"serverAddresses": ["sa1.boulder:9095", "sa2.boulder:9095"],
"serverAddresses": ["sa.boulder:9095"],
"timeout": "15s"
}
}


@@ -17,7 +17,7 @@
"keyFile": "test/grpc-creds/publisher.boulder/key.pem"
},
"saService": {
"serverAddresses": ["sa1.boulder:9095", "sa2.boulder:9095"],
"serverAddresses": ["sa.boulder:9095"],
"timeout": "15s"
},
"features": {


@@ -22,19 +22,19 @@
"keyFile": "test/grpc-creds/ra.boulder/key.pem"
},
"vaService": {
"serverAddresses": ["va1.boulder:9092", "va1.boulder:9092"],
"serverAddresses": ["va.boulder:9092"],
"timeout": "20s"
},
"caService": {
"serverAddresses": ["ca1.boulder:9093", "ca1.boulder:9093"],
"serverAddresses": ["ca.boulder:9093"],
"timeout": "15s"
},
"publisherService": {
"serverAddresses": ["publisher1.boulder:9091", "publisher1.boulder:9091"],
"serverAddresses": ["publisher.boulder:9091"],
"timeout": "300s"
},
"saService": {
"serverAddresses": ["sa1.boulder:9095", "sa1.boulder:9095"],
"serverAddresses": ["sa.boulder:9095"],
"timeout": "15s"
},
"grpc": {


@@ -9,6 +9,10 @@
"tlsPort": 5001
},
"dnsTries": 3,
"dnsResolvers": [
"127.0.0.1:8053",
"127.0.0.1:8054"
],
"issuerDomain": "happy-hacker-ca.invalid",
"tls": {
"caCertfile": "test/grpc-creds/minica.pem",
@@ -34,7 +38,6 @@
},
"common": {
"dnsResolver": "127.0.0.1:8053",
"dnsTimeout": "1s",
"dnsAllowLoopbackAddresses": true
}


@@ -9,6 +9,10 @@
"tlsPort": 5001
},
"dnsTries": 3,
"dnsResolvers": [
"127.0.0.1:8053",
"127.0.0.1:8054"
],
"issuerDomain": "happy-hacker-ca.invalid",
"tls": {
"caCertfile": "test/grpc-creds/minica.pem",
@@ -34,7 +38,6 @@
},
"common": {
"dnsResolver": "127.0.0.1:8053",
"dnsTimeout": "1s",
"dnsAllowLoopbackAddresses": true
}


@@ -23,11 +23,11 @@
"keyFile": "test/grpc-creds/wfe.boulder/key.pem"
},
"raService": {
"serverAddresses": ["ra1.boulder:9094", "ra2.boulder:9094"],
"serverAddresses": ["ra.boulder:9094"],
"timeout": "20s"
},
"saService": {
"serverAddresses": ["sa1.boulder:9095", "sa2.boulder:9095"],
"serverAddresses": ["sa.boulder:9095"],
"timeout": "15s"
},
"features": {


@@ -24,11 +24,11 @@
"keyFile": "test/grpc-creds/wfe.boulder/key.pem"
},
"raService": {
"serverAddresses": ["ra1.boulder:9094", "ra2.boulder:9094"],
"serverAddresses": ["ra.boulder:9094"],
"timeout": "15s"
},
"saService": {
"serverAddresses": ["sa1.boulder:9095", "sa2.boulder:9095"],
"serverAddresses": ["sa.boulder:9095"],
"timeout": "15s"
},
"certificateChains": {


@@ -31,17 +31,19 @@ wait_tcp_port boulder-mysql 3306
# create the database
MYSQL_CONTAINER=1 $DIR/create_db.sh
# Delaying loading private key into SoftHSM container until now so that switching
# out the signing key doesn't require rebuilding the boulder-tools image. Only
# convert key to DER once per container.
wait_tcp_port boulder-hsm 5657
if [ -n "${PKCS11_PROXY_SOCKET:-}" ]; then
# Delaying loading private key into SoftHSM container until now so that switching
# out the signing key doesn't require rebuilding the boulder-tools image. Only
# convert key to DER once per container.
wait_tcp_port boulder-hsm 5657
addkey() {
pkcs11-tool --module=/usr/local/lib/libpkcs11-proxy.so \
--type privkey --pin 5678 --login --so-pin 1234 "$@";
}
addkey --token-label intermediate --write-object test/test-ca.key.der --label intermediate_key
addkey --token-label root --write-object test/test-root.key.der --label root_key
addkey() {
pkcs11-tool --module=/usr/local/lib/libpkcs11-proxy.so \
--type privkey --pin 5678 --login --so-pin 1234 "$@";
}
addkey --token-label intermediate --write-object test/test-ca.key.der --label intermediate_key
addkey --token-label root --write-object test/test-root.key.der --label root_key
fi
if [[ $# -eq 0 ]]; then
exec ./start.py


@@ -270,7 +270,7 @@ def random_domain():
return "rand.%x.xyz" % random.randrange(2**32)
def test_expiration_mailer():
email_addr = "integration.%x@boulder.local" % random.randrange(2**16)
email_addr = "integration.%x@boulder" % random.randrange(2**16)
cert, _ = auth_and_issue([random_domain()], email=email_addr)
# Check that the expiration mailer sends a reminder
expiry = datetime.datetime.strptime(cert.body.get_notAfter(), '%Y%m%d%H%M%SZ')
@@ -616,6 +616,7 @@ def main():
run(args.custom)
run_cert_checker()
check_balance()
run_expired_authz_purger()
if not startservers.check():
@@ -641,6 +642,30 @@ def run_loadtest():
-config test/load-generator/config/v2-integration-test-config.json\
-results %s" % latency_data_file)
def check_balance():
"""Verify that gRPC load balancing across backends is working correctly.
Fetch metrics from each backend and ensure the grpc_server_handled_total
metric is present, which means that backend handled at least one request.
"""
addresses = [
"sa1.boulder:8003",
"sa2.boulder:8103",
"publisher1.boulder:8009",
"publisher2.boulder:8109",
"va1.boulder:8004",
"va2.boulder:8104",
"ca1.boulder:8001",
"ca2.boulder:8104",
"ra1.boulder:8002",
"ra2.boulder:8102",
]
for address in addresses:
metrics = requests.get("http://%s/metrics" % address)
if not "grpc_server_handled_total" in metrics.text:
raise Exception("no gRPC traffic processed by %s; load balancing problem?"
% address)
def run_cert_checker():
run("./bin/cert-checker -config %s/cert-checker.json" % default_config_dir)

test/sd-test-srv/main.go (new file, 84 lines)

@@ -0,0 +1,84 @@
// sd-test-srv runs a simple service discovery system; it returns two hardcoded
// IP addresses for every A query.
package main

import (
	"flag"
	"log"
	"net"
	"strings"
	"time"

	"github.com/miekg/dns"
)

func dnsHandler(w dns.ResponseWriter, r *dns.Msg) {
	m := new(dns.Msg)
	m.SetReply(r)
	m.Compress = false

	if len(r.Question) != 1 {
		m.Rcode = dns.RcodeServerFailure
		w.WriteMsg(m)
		return
	}
	if !strings.HasSuffix(r.Question[0].Name, ".boulder.") {
		m.Rcode = dns.RcodeServerFailure
		w.WriteMsg(m)
		return
	}

	hdr := dns.RR_Header{
		Name:   r.Question[0].Name,
		Rrtype: dns.TypeA,
		Class:  dns.ClassINET,
		Ttl:    0,
	}
	// These two hardcoded IPs correspond to the configured addresses for boulder
	// in docker-compose.yml. In our Docker setup, boulder is present on two
	// networks, rednet and bluenet, with a different IP address on each. This
	// allows us to test load balancing across gRPC backends.
	m.Answer = append(m.Answer, &dns.A{
		A:   net.ParseIP("10.77.77.77"),
		Hdr: hdr,
	}, &dns.A{
		A:   net.ParseIP("10.88.88.88"),
		Hdr: hdr,
	})

	w.WriteMsg(m)
	return
}

func main() {
	listen := flag.String("listen", ":53", "Address and port to listen on.")
	flag.Parse()
	if *listen == "" {
		flag.Usage()
		return
	}
	dns.HandleFunc(".", dnsHandler)
	go func() {
		srv := dns.Server{
			Addr:         *listen,
			Net:          "tcp",
			ReadTimeout:  time.Second,
			WriteTimeout: time.Second,
		}
		err := srv.ListenAndServe()
		if err != nil {
			log.Fatal(err)
		}
	}()
	srv := dns.Server{
		Addr:         *listen,
		Net:          "udp",
		ReadTimeout:  time.Second,
		WriteTimeout: time.Second,
	}
	err := srv.ListenAndServe()
	if err != nil {
		log.Fatal(err)
	}
}


@@ -40,7 +40,7 @@ def run(cmd, race_detection, fakeclock):
def waitport(port, prog):
"""Wait until a port on localhost is open."""
while True:
for _ in range(1000):
try:
time.sleep(0.1)
# If one of the servers has died, quit immediately.
@@ -49,13 +49,13 @@ def waitport(port, prog):
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.connect(('localhost', port))
s.close()
break
return True
except socket.error as e:
if e.errno == errno.ECONNREFUSED:
print "Waiting for debug port %d (%s)" % (port, prog)
else:
raise
return True
raise Exception("timed out waiting for debug port %d (%s)" % (port, prog))
def start(race_detection, fakeclock=None):
"""Return True if everything builds and starts.
@@ -79,6 +79,7 @@ def start(race_detection, fakeclock=None):
[8012, 'boulder-va --config %s' % os.path.join(default_config_dir, "va-remote-b.json")],
])
progs.extend([
[53, 'sd-test-srv --listen :53'], # Service discovery DNS server
[8003, 'boulder-sa --config %s --addr sa1.boulder:9095 --debug-addr :8003' % os.path.join(default_config_dir, "sa.json")],
[8103, 'boulder-sa --config %s --addr sa2.boulder:9095 --debug-addr :8103' % os.path.join(default_config_dir, "sa.json")],
[4500, 'ct-test-srv --config test/ct-test-srv/ct-test-srv.json'],