VA: Log exceptional non-problem remote VA errors. (#3760)

Previously, if a remote VA returned an error that is not a ProblemDetail, the primary VA would log a ServerInternalProblem but not the underlying error. This commit updates performRemoteValidation to always return the full error it receives from a remote VA. This commit also adds a unittest that checks that the VA still returns a ServerInternalProblem to the RA, and that the VA audit logs the underlying error. Resolves https://github.com/letsencrypt/boulder/issues/3753
2018-06-15 13:53:16 -04:00 · 2018-06-15 13:53:16 -04:00 · 2dadd5e09a
parent 49a1959e21
commit 2dadd5e09a
2 changed files with 106 additions and 1 deletions
--- a/va/va.go
+++ b/va/va.go
@ -948,10 +948,21 @@ func (va *ValidationAuthorityImpl) performRemoteValidation(ctx context.Context,
 				// returned error can be a nil *probs.ProblemDetails which breaks the
 				// err != nil check so do a slightly more complicated unwrap check to
 				// make sure we don't choke on that.
-				if p, ok := err.(*probs.ProblemDetails); !ok || p != nil {
+				if p, ok := err.(*probs.ProblemDetails); ok || p != nil {
+					// If the non-nil err was a non-nil *probs.ProblemDetails then we can
+					// log it at an info level. It's a normal non-success validation
+					// result and the remote VA will have logged more detail.
 					va.log.Infof("Remote VA %q.PerformValidation failed: %s", rva.Addresses, err)
 				} else if ok && p == nil {
+					// If the non-nil err was a nil *probs.ProblemDetails then we don't need to do
+					// anything. There isn't really an error here.
 					err = nil
+				} else if !ok {
+					// Otherwise, the non-nil err was *not* a *probs.ProblemDetails and
+					// represents something that will later be returned as a server internal error
+					// without detail if the number of errors is >= va.maxRemoteFailures.
+					// Log it at the error level so we can debug from logs.
+					va.log.Errf("Remote VA %q.PerformValidation failed: %s", rva.Addresses, err)
 				}
 			}
 			errors <- err
@ -984,6 +995,9 @@ func (va *ValidationAuthorityImpl) performRemoteValidation(ctx context.Context,
 				// instances.
 				result <- prob
 			} else {
+				// Otherwise the error was not an expected non-sucess problem result and
+				// represents an internal error. The real error has already been logged
+				// so return a server internal problem result without detail.
 				result <- probs.ServerInternal("Remote PerformValidation RPCs failed")
 			}
 			break
--- a/va/va_test.go
+++ b/va/va_test.go
@ -10,6 +10,7 @@ import (
 	"encoding/asn1"
 	"encoding/base64"
 	"encoding/hex"
+	"errors"
 	"fmt"
 	"math/big"
 	mrand "math/rand"
@ -41,6 +42,7 @@ import (
 	"github.com/letsencrypt/boulder/metrics/mock_metrics"
 	"github.com/letsencrypt/boulder/probs"
 	"github.com/letsencrypt/boulder/test"
+	vaPB "github.com/letsencrypt/boulder/va/proto"
 )

 func bigIntFromB64(b64 string) *big.Int {
@ -1717,6 +1719,95 @@ func TestPerformRemoteValidation(t *testing.T) {
 	}
 }

+// brokenRemoteVA is a mock for the core.ValidationAuthority interface mocked to
+// always return errors.
+type brokenRemoteVA struct{}
+
+// brokenRemoteVAError is the error returned by a brokenRemoteVA's
+// PerformValidation and IsSafeDomain functions.
+var brokenRemoteVAError = errors.New("brokenRemoteVA is broken")
+
+// PerformValidation returns brokenRemoteVAError unconditionally
+func (b *brokenRemoteVA) PerformValidation(
+	_ context.Context,
+	_ string,
+	_ core.Challenge,
+	_ core.Authorization) ([]core.ValidationRecord, error) {
+	return nil, brokenRemoteVAError
+}
+
+// IsSafeDomain returns brokenRemoteVAError unconditionally
+func (b *brokenRemoteVA) IsSafeDomain(
+	_ context.Context,
+	_ *vaPB.IsSafeDomainRequest) (*vaPB.IsDomainSafe, error) {
+	return nil, brokenRemoteVAError
+}
+
+func TestPerformRemoteValidationFailure(t *testing.T) {
+	// Create a new challenge to use for the httpSrv
+	chall := core.HTTPChallenge01()
+	setChallengeToken(&chall, core.NewToken())
+
+	// Create an IPv4 test server
+	ms := httpMultiSrv(t, chall.Token, map[string]struct{}{"remote 1": {}, "remote 2": {}})
+	defer ms.Close()
+
+	// Create a local test VA configured with a mock logger
+	mockLog := blog.NewMock()
+	localVA, _ := setup(ms.Server, 0)
+	localVA.userAgent = "local"
+	localVA.log = mockLog
+
+	// Create a remote test VA
+	remoteVA, _ := setup(ms.Server, 0)
+
+	// Create a broken remote test VA
+	brokenVA := &brokenRemoteVA{}
+	brokenVAAddr := "broken"
+
+	// Set the local VA to use the two remotes
+	localVA.remoteVAs = []RemoteVA{
+		{remoteVA, "good"},
+		{brokenVA, brokenVAAddr},
+	}
+
+	// Performing a validation should return a problem on the channel because of
+	// the broken remote VA.
+	probCh := make(chan *probs.ProblemDetails, 1)
+	localVA.performRemoteValidation(
+		context.Background(),
+		"localhost",
+		chall,
+		core.Authorization{},
+		probCh)
+	prob := <-probCh
+	if prob == nil {
+		t.Fatalf("performRemoteValidation with a broken remote VA did not " +
+			"return a problem")
+	}
+
+	// The problem returned should be a server internal problem with the correct
+	// user facing detail message.
+	if prob.Type != "serverInternal" {
+		t.Errorf("performRemoteValidation with a broken remote VA did not " +
+			"return a serverInternal problem")
+	}
+	if prob.Detail != "Remote PerformValidation RPCs failed" {
+		t.Errorf("performRemoteValidation with a broken remote VA did not " +
+			"return a serverInternal problem with the correct detail")
+	}
+
+	// The mock logger should have an audit err log line that includes the true
+	// underlying error that caused the server internal problem.
+	expectedLine := fmt.Sprintf(
+		`ERR: \[AUDIT\] Remote VA %q.PerformValidation failed: %s`,
+		brokenVAAddr, brokenRemoteVAError.Error())
+	failLogs := mockLog.GetAllMatching(expectedLine)
+	if len(failLogs) == 0 {
+		t.Error("Expected log line with broken remote VA error message. Found none")
+	}
+}
+
 func TestDetailedError(t *testing.T) {
 	cases := []struct {
 		err      error