Give more detailed validation errors in VA (#3629)

In particular, differentiate timeouts during connect (which are usually a firewall problem) from timeouts after connect (which are usually a software problem). In the process, refactor the tests and add testing for specific problem detail messages.

This also switches over the HTTP challenge's dialer to use DialContext, and to shave a little bit of headroom off of the context deadline, so that the dial can report its timeout before the overall context expires, which would lead to an overly generic "deadline exceeded" error, which would then get translated (incorrectly) into a "timeout after connect."

There is an additional error case, Timeout during %s (your server may be slow or overloaded), (where %s can be read or write) which doesn't have any unittests. I believe it may not be possible to trigger this, since read and write timeouts get subsumed by the HTTP or TLS library, but it's worth having as a fallback case. We'll see if it shows up in the logs.

Among the test refactorings, I shortened the timeout on the TLS timeout test to 50ms. Previously this was the long pole making the whole test take 10s. Now it takes ~500 ms overall.

I recommend starting review at https://github.com/letsencrypt/boulder/compare/detailed-va-errors?expand=1#diff-4c51d1d7ca3ec3022d14b42809af0d7eR671 (the changes to detailedError), then reviewing the Dial -> DialContext changes, then the tests.
This commit is contained in:
Jacob Hoffman-Andrews 2018-04-16 12:01:08 -07:00 committed by Roland Bracewell Shoemaker
parent 1271a15be7
commit 339ea954bd
2 changed files with 278 additions and 62 deletions

View File

@ -43,9 +43,9 @@ const (
maxResponseSize = 128
)
// singleDialTimeout specifies how long an individual `Dial` operation may take
// singleDialTimeout specifies how long an individual `DialContext` operation may take
// before timing out. This timeout ignores the base RPC timeout and is strictly
// used for the Dial operations that take place during an
// used for the DialContext operations that take place during an
// HTTP-01/TLS-SNI-[01|02] challenge validation.
var singleDialTimeout = time.Second * 10
@ -125,6 +125,15 @@ func NewValidationAuthorityImpl(
clk clock.Clock,
logger blog.Logger,
) *ValidationAuthorityImpl {
if pc.HTTPPort == 0 {
pc.HTTPPort = 80
}
if pc.HTTPSPort == 0 {
pc.HTTPSPort = 443
}
if pc.TLSPort == 0 {
pc.TLSPort = 443
}
return &ValidationAuthorityImpl{
log: logger,
@ -178,8 +187,8 @@ func (va ValidationAuthorityImpl) getAddr(ctx context.Context, hostname string)
}
// http01Dialer is a struct that exists to provide a dialer like object with
// a `Dial` method that can be given to an http.Transport for HTTP-01
// validation. The primary purpose of the http01Dialer's Dial method is to
// a `DialContext` method that can be given to an http.Transport for HTTP-01
// validation. The primary purpose of the http01Dialer's DialContext method is to
// circumvent traditional DNS lookup and to use the IP addresses provided in the
// inner `record` member populated by the `resolveAndConstructDialer` function.
type http01Dialer struct {
@ -190,19 +199,32 @@ type http01Dialer struct {
// realDialer is used to create a true `net.Dialer` that can be used once an IP
// address to connect to is determined. It increments the `dialerCount` integer
// to track how many "fresh" dialer instances have been created during a `Dial`
// for testing purposes.
// to track how many "fresh" dialer instances have been created during a
// `DialContext` for testing purposes.
func (d *http01Dialer) realDialer() *net.Dialer {
// Record that we created a new instance of a real net.Dialer
d.dialerCount++
return &net.Dialer{Timeout: singleDialTimeout}
}
// Dial processes the IP addresses from the inner validation record, using
// DialContext processes the IP addresses from the inner validation record, using
// `realDialer` to make connections as required. If `features.IPv6First` is
// enabled then for dual-homed hosts an initial IPv6 connection will be made
// followed by a IPv4 connection if there is a failure with the IPv6 connection.
func (d *http01Dialer) Dial(_, _ string) (net.Conn, error) {
func (d *http01Dialer) DialContext(ctx context.Context, _, _ string) (net.Conn, error) {
deadline, ok := ctx.Deadline()
if !ok {
// Shouldn't happen: All requests should have a deadline by this point.
deadline = time.Now().Add(100 * time.Second)
} else {
// Set the context deadline slightly shorter than the HTTP deadline, so we
// get the dial error rather than a generic "deadline exceeded" error. This
// lets us give a more specific error to the subscriber.
deadline = deadline.Add(-10 * time.Millisecond)
}
ctx, cancel := context.WithDeadline(ctx, deadline)
defer cancel()
var realDialer *net.Dialer
// Split the available addresses into v4 and v6 addresses
@ -219,7 +241,7 @@ func (d *http01Dialer) Dial(_, _ string) (net.Conn, error) {
address := net.JoinHostPort(addresses[0].String(), d.record.Port)
d.record.AddressUsed = addresses[0]
realDialer = d.realDialer()
return realDialer.Dial("tcp", address)
return realDialer.DialContext(ctx, "tcp", address)
}
// If the IPv6 first feature is enabled and there is at least one IPv6 address
@ -228,7 +250,7 @@ func (d *http01Dialer) Dial(_, _ string) (net.Conn, error) {
address := net.JoinHostPort(v6[0].String(), d.record.Port)
d.record.AddressUsed = v6[0]
realDialer = d.realDialer()
conn, err := realDialer.Dial("tcp", address)
conn, err := realDialer.DialContext(ctx, "tcp", address)
// If there is no error, return immediately
if err == nil {
@ -257,7 +279,7 @@ func (d *http01Dialer) Dial(_, _ string) (net.Conn, error) {
address := net.JoinHostPort(v4[0].String(), d.record.Port)
d.record.AddressUsed = v4[0]
realDialer = d.realDialer()
return realDialer.Dial("tcp", address)
return realDialer.DialContext(ctx, "tcp", address)
}
// availableAddresses takes a ValidationRecord and splits the AddressesResolved
@ -325,6 +347,7 @@ func (va *ValidationAuthorityImpl) fetchHTTP(ctx context.Context, identifier cor
return nil, nil, probs.Malformed("URL provided for HTTP was invalid")
}
httpRequest = httpRequest.WithContext(ctx)
if va.userAgent != "" {
httpRequest.Header["User-Agent"] = []string{va.userAgent}
}
@ -332,7 +355,7 @@ func (va *ValidationAuthorityImpl) fetchHTTP(ctx context.Context, identifier cor
dialer, prob := va.resolveAndConstructDialer(ctx, host, port)
dialer.record.URL = url.String()
// Start with an empty validation record list - we will add a record after
// each dialer.Dial()
// each dialer.DialContext()
var validationRecords []core.ValidationRecord
if prob != nil {
return nil, []core.ValidationRecord{dialer.record}, prob
@ -345,9 +368,9 @@ func (va *ValidationAuthorityImpl) fetchHTTP(ctx context.Context, identifier cor
// We don't expect to make multiple requests to a client, so close
// connection immediately.
DisableKeepAlives: true,
// Intercept Dial in order to connect to the IP address we
// Intercept DialContext in order to connect to the IP address we
// select.
Dial: dialer.Dial,
DialContext: dialer.DialContext,
}
// Some of our users use mod_security. Mod_security sees a lack of Accept
@ -401,7 +424,7 @@ func (va *ValidationAuthorityImpl) fetchHTTP(ctx context.Context, identifier cor
if err != nil {
return err
}
tr.Dial = dialer.Dial
tr.DialContext = dialer.DialContext
va.log.Debug(fmt.Sprintf("%s [%s] redirect from %q to %q [%s]", challenge.Type, identifier, via[len(via)-1].URL.String(), req.URL.String(), dialer.record.AddressUsed))
return nil
}
@ -649,13 +672,20 @@ func detailedError(err error) *probs.ProblemDetails {
} else if syscallErr, ok := netErr.Err.(*os.SyscallError); ok &&
syscallErr.Err == syscall.ECONNREFUSED {
return probs.ConnectionFailure("Connection refused")
} else if syscallErr, ok := netErr.Err.(*os.SyscallError); ok &&
syscallErr.Err == syscall.ENETUNREACH {
return probs.ConnectionFailure("Network unreachable")
} else if syscallErr, ok := netErr.Err.(*os.SyscallError); ok &&
syscallErr.Err == syscall.ECONNRESET {
return probs.ConnectionFailure("Connection reset by peer")
} else if netErr.Timeout() && netErr.Op == "dial" {
return probs.ConnectionFailure("Timeout during connect (likely firewall problem)")
} else if netErr.Timeout() {
return probs.ConnectionFailure(fmt.Sprintf("Timeout during %s (your server may be slow or overloaded)", netErr.Op))
}
}
if err, ok := err.(net.Error); ok && err.Timeout() {
return probs.ConnectionFailure("Timeout")
return probs.ConnectionFailure("Timeout after connect (your server may be slow or overloaded)")
}
if berrors.Is(err, berrors.ConnectionFailure) {
return probs.ConnectionFailure(err.Error())

View File

@ -186,7 +186,7 @@ func tlssni02Srv(t *testing.T, chall core.Challenge) *httptest.Server {
return tlssniSrvWithNames(t, chall, sanAName, sanBName)
}
func tlssniSrvWithNames(t *testing.T, chall core.Challenge, names ...string) *httptest.Server {
func makeACert(names []string) *tls.Certificate {
template := &x509.Certificate{
SerialNumber: big.NewInt(1337),
Subject: pkix.Name{
@ -203,19 +203,18 @@ func tlssniSrvWithNames(t *testing.T, chall core.Challenge, names ...string) *ht
}
certBytes, _ := x509.CreateCertificate(rand.Reader, template, template, &TheKey.PublicKey, &TheKey)
cert := &tls.Certificate{
return &tls.Certificate{
Certificate: [][]byte{certBytes},
PrivateKey: &TheKey,
}
}
func tlssniSrvWithNames(t *testing.T, chall core.Challenge, names ...string) *httptest.Server {
cert := makeACert(names)
tlsConfig := &tls.Config{
Certificates: []tls.Certificate{*cert},
ClientAuth: tls.NoClientCert,
GetCertificate: func(clientHello *tls.ClientHelloInfo) (*tls.Certificate, error) {
if clientHello.ServerName != names[0] {
time.Sleep(time.Second * 10)
return nil, nil
}
return cert, nil
},
NextProtos: []string{"http/1.1"},
@ -338,23 +337,84 @@ func TestHTTPTimeout(t *testing.T) {
setChallengeToken(&chall, pathWaitLong)
started := time.Now()
timeout := 50 * time.Millisecond
ctx, cancel := context.WithTimeout(context.Background(), timeout)
defer cancel()
_, prob := va.validateHTTP01(ctx, dnsi("localhost"), chall)
took := time.Since(started)
// Check that the HTTP connection does't return before a timeout, and times
// out after the expected time
test.Assert(t,
(took > (time.Second * singleDialTimeout)),
fmt.Sprintf("HTTP timed out before %d seconds", singleDialTimeout))
test.Assert(t,
(took < (time.Second * (singleDialTimeout * 2))),
fmt.Sprintf("HTTP connection didn't timeout after %d seconds",
singleDialTimeout))
if prob == nil {
t.Fatalf("Connection should've timed out")
}
took := time.Since(started)
// Check that the HTTP connection doesn't return before a timeout, and times
// out after the expected time
if took < timeout {
t.Fatalf("HTTP timed out before %s: %s with %s", timeout, took, prob)
}
if took > 2*timeout {
t.Fatalf("HTTP connection didn't timeout after %s", timeout)
}
test.AssertEquals(t, prob.Type, probs.ConnectionProblem)
expectMatch := regexp.MustCompile(
"Fetching http://localhost:\\d+/.well-known/acme-challenge/wait-long: Timeout")
"Fetching http://localhost:\\d+/.well-known/acme-challenge/wait-long: Timeout after connect")
if !expectMatch.MatchString(prob.Detail) {
t.Errorf("Problem details incorrect. Got %q, expected to match %q",
prob.Detail, expectMatch)
}
}
// dnsMockReturnsUnroutable is a DNSClient mock that always returns an
// unroutable address for LookupHost. This is useful in testing connect
// timeouts.
type dnsMockReturnsUnroutable struct {
*bdns.MockDNSClient
}
func (mock dnsMockReturnsUnroutable) LookupHost(_ context.Context, hostname string) ([]net.IP, error) {
return []net.IP{net.ParseIP("198.51.100.1")}, nil
}
// TestHTTPDialTimeout tests that we give the proper "Timeout during connect"
// error when dial fails. We do this by using a mock DNS client that resolves
// everything to an unroutable IP address.
func TestHTTPDialTimeout(t *testing.T) {
va, _ := setup(nil, 0)
started := time.Now()
timeout := 50 * time.Millisecond
ctx, cancel := context.WithTimeout(context.Background(), timeout)
defer cancel()
va.dnsClient = dnsMockReturnsUnroutable{&bdns.MockDNSClient{}}
// The only method I've found so far to trigger a connect timeout is to
// connect to an unrouteable IP address. This usuall generates a connection
// timeout, but will rarely return "Network unreachable" instead. If we get
// that, just retry until we get something other than "Network unreachable".
var prob *probs.ProblemDetails
for i := 0; i < 20; i++ {
_, prob = va.validateHTTP01(ctx, dnsi("unroutable.invalid"), core.HTTPChallenge01())
if prob != nil && strings.Contains(prob.Detail, "Network unreachable") {
continue
} else {
break
}
}
if prob == nil {
t.Fatalf("Connection should've timed out")
}
took := time.Since(started)
// Check that the HTTP connection doesn't return too fast, and times
// out after the expected time
if took < timeout/2 {
t.Fatalf("HTTP returned before %s (%s) with %#v", timeout, took, prob)
}
if took > 2*timeout {
t.Fatalf("HTTP connection didn't timeout after %s seconds", timeout)
}
test.AssertEquals(t, prob.Type, probs.ConnectionProblem)
expectMatch := regexp.MustCompile(
"Fetching http://unroutable.invalid/.well-known/acme-challenge/.*: Timeout during connect")
if !expectMatch.MatchString(prob.Detail) {
t.Errorf("Problem details incorrect. Got %q, expected to match %q",
prob.Detail, expectMatch)
@ -476,11 +536,9 @@ func getPort(hs *httptest.Server) int {
return int(port)
}
func TestTLSSNI01(t *testing.T) {
func TestTLSSNI01Success(t *testing.T) {
chall := createChallenge(core.ChallengeTypeTLSSNI01)
hs := tlssni01Srv(t, chall)
va, log := setup(hs, 0)
_, prob := va.validateTLSSNI01(ctx, dnsi("localhost"), chall)
@ -492,10 +550,15 @@ func TestTLSSNI01(t *testing.T) {
t.Errorf("Didn't get log message with validated certificate. Instead got:\n%s",
strings.Join(log.GetAllMatching(".*"), "\n"))
}
}
func TestTLSSNI01FailIP(t *testing.T) {
chall := createChallenge(core.ChallengeTypeTLSSNI01)
hs := tlssni01Srv(t, chall)
va, _ := setup(hs, 0)
log.Clear()
port := getPort(hs)
_, prob = va.validateTLSSNI01(ctx, core.AcmeIdentifier{
_, prob := va.validateTLSSNI01(ctx, core.AcmeIdentifier{
Type: core.IdentifierType("ip"),
Value: net.JoinHostPort("127.0.0.1", fmt.Sprintf("%d", port)),
}, chall)
@ -503,53 +566,176 @@ func TestTLSSNI01(t *testing.T) {
t.Fatalf("IdentifierType IP shouldn't have worked.")
}
test.AssertEquals(t, prob.Type, probs.MalformedProblem)
}
log.Clear()
_, prob = va.validateTLSSNI01(ctx, core.AcmeIdentifier{Type: core.IdentifierDNS, Value: "always.invalid"}, chall)
func TestTLSSNI01Invalid(t *testing.T) {
chall := createChallenge(core.ChallengeTypeTLSSNI01)
hs := tlssni01Srv(t, chall)
va, _ := setup(hs, 0)
_, prob := va.validateTLSSNI01(ctx, core.AcmeIdentifier{Type: core.IdentifierDNS, Value: "always.invalid"}, chall)
if prob == nil {
t.Fatalf("Domain name was supposed to be invalid.")
}
test.AssertEquals(t, prob.Type, probs.UnknownHostProblem)
expected := "No valid IP addresses found for always.invalid"
if prob.Detail != expected {
t.Errorf("Got wrong error detail. Expected %q, got %q",
expected, prob.Detail)
}
}
// Need to create a new authorized keys object to get an unknown SNI (from the signature value)
chall.Token = core.NewToken()
chall.ProvidedKeyAuthorization = "invalid"
func slowTLSSrv() *httptest.Server {
server := httptest.NewUnstartedServer(http.DefaultServeMux)
server.TLS = &tls.Config{
GetCertificate: func(*tls.ClientHelloInfo) (*tls.Certificate, error) {
time.Sleep(100 * time.Millisecond)
return makeACert([]string{"nomatter"}), nil
},
}
server.StartTLS()
return server
}
func TestTLSSNI01TimeoutAfterConnect(t *testing.T) {
// Set a short dial timeout so this test can happen quickly. Note: It would be
// better to override this with a context, but that doesn't work right now:
// https://github.com/letsencrypt/boulder/issues/3628
oldSingleDialTimeout := singleDialTimeout
singleDialTimeout = 50 * time.Millisecond
defer func() {
singleDialTimeout = oldSingleDialTimeout
}()
chall := createChallenge(core.ChallengeTypeTLSSNI01)
hs := slowTLSSrv()
va, _ := setup(hs, 0)
timeout := 50 * time.Millisecond
ctx, cancel := context.WithTimeout(context.Background(), timeout)
defer cancel()
log.Clear()
started := time.Now()
_, prob = va.validateTLSSNI01(ctx, dnsi("localhost"), chall)
took := time.Since(started)
_, prob := va.validateTLSSNI01(ctx, dnsi("slow.server"), chall)
if prob == nil {
t.Fatalf("Validation should've failed")
}
test.AssertEquals(t, prob.Type, probs.ConnectionProblem)
// Check that the TLS connection doesn't return before a timeout, and times
// out after the expected time
test.Assert(t,
(took > (time.Second * singleDialTimeout)),
fmt.Sprintf("TLS connection returned before %d seconds", singleDialTimeout))
test.Assert(t,
(took < (time.Second * (2 * singleDialTimeout))),
fmt.Sprintf("TLS connection didn't timeout after %d seconds",
singleDialTimeout))
test.AssertEquals(t, len(log.GetAllMatching(`Resolved addresses for localhost \[using 127.0.0.1\]: \[127.0.0.1\]`)), 1)
took := time.Since(started)
// Check that the HTTP connection doesn't return too fast, and times
// out after the expected time
if took < timeout/2 {
t.Fatalf("TLSSNI returned before %s (%s) with %#v", timeout, took, prob)
}
if took > 2*timeout {
t.Fatalf("TLSSNI didn't timeout after %s", timeout)
}
if prob == nil {
t.Fatalf("Connection should've timed out")
}
test.AssertEquals(t, prob.Type, probs.ConnectionProblem)
expected := "Timeout after connect (your server may be slow or overloaded)"
if prob.Detail != expected {
t.Errorf("Wrong error detail. Expected %q, got %q", expected, prob.Detail)
}
}
func TestTLSSNI01DialTimeout(t *testing.T) {
// Set a short dial timeout so this test can happen quickly. Note: It would be
// better to override this with a context, but that doesn't work right now:
// https://github.com/letsencrypt/boulder/issues/3628
old := singleDialTimeout
singleDialTimeout = 50 * time.Millisecond
defer func() {
singleDialTimeout = old
}()
timeout := singleDialTimeout
chall := createChallenge(core.ChallengeTypeTLSSNI01)
hs := slowTLSSrv()
va, _ := setup(hs, 0)
va.dnsClient = dnsMockReturnsUnroutable{&bdns.MockDNSClient{}}
started := time.Now()
// The only method I've found so far to trigger a connect timeout is to
// connect to an unrouteable IP address. This usuall generates a connection
// timeout, but will rarely return "Network unreachable" instead. If we get
// that, just retry until we get something other than "Network unreachable".
var prob *probs.ProblemDetails
for i := 0; i < 20; i++ {
_, prob = va.validateTLSSNI01(ctx, dnsi("unroutable.invalid"), chall)
if prob != nil && strings.Contains(prob.Detail, "Network unreachable") {
continue
} else {
break
}
}
if prob == nil {
t.Fatalf("Validation should've failed")
}
// Check that the TLS connection doesn't return before a timeout, and times
// out after the expected time
took := time.Since(started)
// Check that the HTTP connection doesn't return too fast, and times
// out after the expected time
if took < timeout/2 {
t.Fatalf("TLSSNI returned before %s (%s) with %#v", timeout, took, prob)
}
if took > 2*timeout {
t.Fatalf("TLSSNI didn't timeout after %s", timeout)
}
if prob == nil {
t.Fatalf("Connection should've timed out")
}
test.AssertEquals(t, prob.Type, probs.ConnectionProblem)
expected := "Timeout during connect (likely firewall problem)"
if prob.Detail != expected {
t.Errorf("Wrong error detail. Expected %q, got %q", expected, prob.Detail)
}
}
func TestTLSSNI01InvalidResponse(t *testing.T) {
chall := createChallenge(core.ChallengeTypeTLSSNI01)
hs := tlssni01Srv(t, chall)
va, _ := setup(hs, 0)
differentChall := createChallenge(core.ChallengeTypeTLSSNI01)
differentChall.ProvidedKeyAuthorization = "invalid.keyAuthorization"
_, prob := va.validateTLSSNI01(ctx, dnsi("localhost"), differentChall)
if prob == nil {
t.Fatalf("Validation should've failed")
}
expected := "Incorrect validation certificate for tls-sni-01 challenge."
if !strings.HasPrefix(prob.Detail, expected) {
t.Errorf("Wrong error detail. Expected %q, got %q", expected, prob.Detail)
}
}
func TestTLSSNI01Refused(t *testing.T) {
chall := createChallenge(core.ChallengeTypeTLSSNI01)
hs := tlssni01Srv(t, chall)
va, _ := setup(hs, 0)
// Take down validation server and check that validation fails.
hs.Close()
_, err := va.validateTLSSNI01(ctx, dnsi("localhost"), chall)
if err == nil {
_, prob := va.validateTLSSNI01(ctx, dnsi("localhost"), chall)
if prob == nil {
t.Fatalf("Server's down; expected refusal. Where did we connect?")
}
test.AssertEquals(t, prob.Type, probs.ConnectionProblem)
}
func TestTLSSNI01TalkingToHTTP(t *testing.T) {
chall := createChallenge(core.ChallengeTypeTLSSNI01)
hs := tlssni01Srv(t, chall)
va, _ := setup(hs, 0)
httpOnly := httpSrv(t, "")
va.tlsPort = getPort(httpOnly)
log.Clear()
_, err = va.validateTLSSNI01(ctx, dnsi("localhost"), chall)
test.AssertError(t, err, "TLS-SNI-01 validation passed when talking to a HTTP-only server")
_, prob := va.validateTLSSNI01(ctx, dnsi("localhost"), chall)
test.AssertError(t, prob, "TLS-SNI-01 validation passed when talking to a HTTP-only server")
test.Assert(t, strings.HasSuffix(
err.Error(),
prob.Detail,
"Server only speaks HTTP, not TLS",
), "validate TLS-SNI-01 didn't return useful error")
}
@ -1171,7 +1357,7 @@ func TestHTTP01DialerFallback(t *testing.T) {
d, _ := va.resolveAndConstructDialer(context.Background(), "ipv4.and.ipv6.localhost", va.httpPort)
// Try to dial the dialer
_, dialProb := d.Dial("", "ipv4.and.ipv6.localhost")
_, dialProb := d.DialContext(context.Background(), "", "ipv4.and.ipv6.localhost")
// There shouldn't be a problem from this dial
test.AssertEquals(t, dialProb, nil)