Remote VA logging and stats (#3063)

Add a logging statement that fires when a remote VA fail causes overall failure. Also change remoteValidationFailures into a counter that counts the same thing, instead of a histogram. Since the histogram had the default bucket sizes, it failed to collect what we needed, and produced more metrics than necessary.
2017-09-11 12:50:50 -07:00 · 2017-09-11 12:50:50 -07:00 · 568407e5b8
parent 3b6a5ff63c
commit 568407e5b8
8 changed files with 57 additions and 32 deletions
--- a/bdns/dns_test.go
+++ b/bdns/dns_test.go
@ -601,7 +601,7 @@ func TestRetry(t *testing.T) {
 			t.Errorf("#%d, error, expectedCount %v, got %v", i, tc.expectedCount, tc.te.count)
 		}
 		if tc.metricsAllRetries > 0 {
-			test.AssertEquals(t, test.CountCounter(
+			test.AssertEquals(t, test.CountCounterVec(
 				"qtype",
 				"TXT",
 				dr.usedAllRetriesCounter), tc.metricsAllRetries)
@ -635,7 +635,7 @@ func TestRetry(t *testing.T) {
 		t.Errorf("expected %s, got %s", context.DeadlineExceeded, err)
 	}

-	test.AssertEquals(t, test.CountCounter(
+	test.AssertEquals(t, test.CountCounterVec(
 		"qtype",
 		"TXT",
 		dr.cancelCounter), 3)
--- a/ca/ca_test.go
+++ b/ca/ca_test.go
@ -766,8 +766,8 @@ func countMustStaple(t *testing.T, cert *x509.Certificate) (count int) {
 func issueCertificateSubTestMustStapleWhenDisabled(t *testing.T, i *TestCertificateIssuance) {
 	// With ca.enableMustStaple = false, should issue successfully and not add
 	// Must Staple.
-	test.AssertEquals(t, test.CountCounter(csrExtensionCategory, csrExtensionTLSFeature, i.ca.csrExtensionCount), 1)
-	test.AssertEquals(t, test.CountCounter(csrExtensionCategory, csrExtensionTLSFeatureInvalid, i.ca.csrExtensionCount), 0)
+	test.AssertEquals(t, test.CountCounterVec(csrExtensionCategory, csrExtensionTLSFeature, i.ca.csrExtensionCount), 1)
+	test.AssertEquals(t, test.CountCounterVec(csrExtensionCategory, csrExtensionTLSFeatureInvalid, i.ca.csrExtensionCount), 0)
 	test.AssertEquals(t, signatureCountByPurpose("cert", i.ca.signatureCount), 1)
 	test.AssertEquals(t, countMustStaple(t, i.cert), 0)
 }
@ -776,20 +776,20 @@ func issueCertificateSubTestMustStapleWhenEnabled(t *testing.T, i *TestCertifica
 	// With ca.enableMustStaple = true, a TLS feature extension should put a must-staple
 	// extension into the cert. Even if there are multiple TLS Feature extensions, only
 	// one extension should be included.
-	test.AssertEquals(t, test.CountCounter(csrExtensionCategory, csrExtensionTLSFeature, i.ca.csrExtensionCount), 1)
-	test.AssertEquals(t, test.CountCounter(csrExtensionCategory, csrExtensionTLSFeatureInvalid, i.ca.csrExtensionCount), 0)
+	test.AssertEquals(t, test.CountCounterVec(csrExtensionCategory, csrExtensionTLSFeature, i.ca.csrExtensionCount), 1)
+	test.AssertEquals(t, test.CountCounterVec(csrExtensionCategory, csrExtensionTLSFeatureInvalid, i.ca.csrExtensionCount), 0)
 	test.AssertEquals(t, signatureCountByPurpose("cert", i.ca.signatureCount), 1)
 	test.AssertEquals(t, countMustStaple(t, i.cert), 1)
 }

 func issueCertificateSubTestTLSFeatureUnknown(t *testing.T, ca *CertificateAuthorityImpl, _ *mockSA) {
-	test.AssertEquals(t, test.CountCounter(csrExtensionCategory, csrExtensionTLSFeature, ca.csrExtensionCount), 1)
-	test.AssertEquals(t, test.CountCounter(csrExtensionCategory, csrExtensionTLSFeatureInvalid, ca.csrExtensionCount), 1)
+	test.AssertEquals(t, test.CountCounterVec(csrExtensionCategory, csrExtensionTLSFeature, ca.csrExtensionCount), 1)
+	test.AssertEquals(t, test.CountCounterVec(csrExtensionCategory, csrExtensionTLSFeatureInvalid, ca.csrExtensionCount), 1)
 }

 func issueCertificateSubTestUnknownExtension(t *testing.T, i *TestCertificateIssuance) {
 	// Unsupported extensions in the CSR should be silently ignored.
-	test.AssertEquals(t, test.CountCounter(csrExtensionCategory, csrExtensionOther, i.ca.csrExtensionCount), 1)
+	test.AssertEquals(t, test.CountCounterVec(csrExtensionCategory, csrExtensionOther, i.ca.csrExtensionCount), 1)
 	test.AssertEquals(t, signatureCountByPurpose("cert", i.ca.signatureCount), 1)

 	// NOTE: The hard-coded value here will have to change over time as Boulder
@ -806,7 +806,7 @@ func issueCertificateSubTestCTPoisonExtension(t *testing.T, i *TestCertificateIs
 	// unknown extension, whether it has a valid or invalid value. The check
 	// for whether or not the poison extension is present in the issued
 	// certificate/precertificate is done in the caller.
-	test.AssertEquals(t, test.CountCounter(csrExtensionCategory, csrExtensionOther, i.ca.csrExtensionCount), 1)
+	test.AssertEquals(t, test.CountCounterVec(csrExtensionCategory, csrExtensionOther, i.ca.csrExtensionCount), 1)
 	test.AssertEquals(t, signatureCountByPurpose("cert", i.ca.signatureCount), 1)
 }

@ -820,5 +820,5 @@ func findExtension(extensions []pkix.Extension, id asn1.ObjectIdentifier) *pkix.
 }

 func signatureCountByPurpose(signatureType string, signatureCount *prometheus.CounterVec) int {
-	return test.CountCounter("purpose", signatureType, signatureCount)
+	return test.CountCounterVec("purpose", signatureType, signatureCount)
 }
--- a/test/test-tools.go
+++ b/test/test-tools.go
@ -10,6 +10,7 @@ import (
 	"runtime"
 	"strings"
 	"testing"
+	"time"

 	"github.com/prometheus/client_golang/prometheus"
 	"github.com/prometheus/client_model/go"
@ -156,11 +157,22 @@ func AssertBetween(t *testing.T, a, b, c int64) {
 	}
 }

+// CountCounterVecVec returns the count by label and value of a prometheus metric
+func CountCounterVec(labelName string, value string, counterVec *prometheus.CounterVec) int {
+	return CountCounter(counterVec.With(prometheus.Labels{labelName: value}))
+}
+
 // CountCounter returns the count by label and value of a prometheus metric
-func CountCounter(labelName string, value string, counter *prometheus.CounterVec) int {
+func CountCounter(counter prometheus.Counter) int {
 	ch := make(chan prometheus.Metric, 10)
-	counter.With(prometheus.Labels{labelName: value}).Collect(ch)
-	m := <-ch
+	counter.Collect(ch)
+	var m prometheus.Metric
+	select {
+	case <-time.After(time.Second):
+		fmt.Println("timed out collecting metrics")
+		return 0
+	case m = <-ch:
+	}
 	var iom io_prometheus_client.Metric
 	_ = m.Write(&iom)
 	return int(iom.Counter.GetValue())
--- a/va/va.go
+++ b/va/va.go
@ -59,7 +59,7 @@ type RemoteVA struct {
 type vaMetrics struct {
 	validationTime           *prometheus.HistogramVec
 	remoteValidationTime     *prometheus.HistogramVec
-	remoteValidationFailures *prometheus.HistogramVec
+	remoteValidationFailures prometheus.Counter
 }

 func initMetrics(stats metrics.Scope) *vaMetrics {
@ -77,11 +77,11 @@ func initMetrics(stats metrics.Scope) *vaMetrics {
 		},
 		[]string{"type", "result"})
 	stats.MustRegister(remoteValidationTime)
-	remoteValidationFailures := prometheus.NewHistogramVec(
-		prometheus.HistogramOpts{
+	remoteValidationFailures := prometheus.NewCounter(
+		prometheus.CounterOpts{
 			Name: "remote_validation_failures",
-			Help: "Number of remote VAs that failed during challenge validation",
-		}, nil)
+			Help: "Number of validations failed due to remote VAs returning failure",
+		})
 	stats.MustRegister(remoteValidationFailures)

 	return &vaMetrics{
@ -851,7 +851,6 @@ func (va *ValidationAuthorityImpl) performRemoteValidation(ctx context.Context,
 		"type":   string(challenge.Type),
 		"result": state,
 	}).Observe(va.clk.Since(s).Seconds())
-	va.metrics.remoteValidationFailures.With(prometheus.Labels{}).Observe(float64(bad))
 }

 // PerformValidation validates the given challenge. It always returns a list of
@ -893,6 +892,10 @@ func (va *ValidationAuthorityImpl) PerformValidation(ctx context.Context, domain
 			challenge.Status = core.StatusInvalid
 			challenge.Error = prob
 			logEvent.Error = prob.Error()
+			va.log.Info(fmt.Sprintf(
+				"Validation failed due to remote failures: identifier=%v err=%s",
+				authz.Identifier, prob))
+			va.metrics.remoteValidationFailures.Inc()
 		} else {
 			challenge.Status = core.StatusValid
 		}
--- a/va/va_test.go
+++ b/va/va_test.go
@ -1305,6 +1305,7 @@ func TestPerformRemoteValidation(t *testing.T) {
 	ms.mu.Lock()
 	delete(ms.allowedUAs, "remote 1")
 	ms.mu.Unlock()
+	mockLog := blog.NewMock()
 	localVA.performRemoteValidation(context.Background(), "localhost", chall, core.Authorization{}, probCh)
 	prob = <-probCh
 	if prob == nil {
@ -1333,6 +1334,7 @@ func TestPerformRemoteValidation(t *testing.T) {
 	}

 	// Local and remote 2 working, should fail
+	localVA.log = mockLog
 	ms.mu.Lock()
 	ms.allowedUAs["local"] = struct{}{}
 	delete(ms.allowedUAs, "remote 1")
@ -1341,6 +1343,14 @@ func TestPerformRemoteValidation(t *testing.T) {
 	if err == nil {
 		t.Error("PerformValidation didn't fail when one 'remote' validation failed")
 	}
+	failLogs := mockLog.GetAllMatching(`Validation failed due to remote failures`)
+	if len(failLogs) == 0 {
+		t.Error("Expected log line about failure due to remote failures, didn't get it")
+	}
+	remoteFailMetric := test.CountCounter(localVA.metrics.remoteValidationFailures)
+	if remoteFailMetric != 1 {
+		t.Errorf("Expected remote_validation_failures to be incremented, but it wasn't")
+	}

 	// Local and remote 2 working with maxRemoteFailures == 1, should succeed
 	localVA, _ = setup(ms.Server, 1)
--- a/wfe/wfe_test.go
+++ b/wfe/wfe_test.go
@ -1021,7 +1021,7 @@ func TestIssueCertificate(t *testing.T) {
 		`{"type":"`+probs.V1ErrorNS+`malformed","detail":"CSR generated using a pre-1.0.2 OpenSSL with a client that doesn't properly specify the CSR version. See https://community.letsencrypt.org/t/openssl-bug-information/19591","status":400}`)

 	// Test the CSR signature type counter works
-	test.AssertEquals(t, test.CountCounter("type", "SHA256-RSA", wfe.csrSignatureAlgs), 4)
+	test.AssertEquals(t, test.CountCounterVec("type", "SHA256-RSA", wfe.csrSignatureAlgs), 4)
 }

 func TestGetChallenge(t *testing.T) {
--- a/wfe2/verify_test.go
+++ b/wfe2/verify_test.go
@ -414,7 +414,7 @@ func TestValidPOSTRequest(t *testing.T) {
 			test.AssertEquals(t, prob.Type, probs.MalformedProblem)
 			test.AssertEquals(t, prob.HTTPStatus, tc.HTTPStatus)
 			test.AssertEquals(t, prob.Detail, tc.ProblemDetail)
-			test.AssertEquals(t, test.CountCounter(
+			test.AssertEquals(t, test.CountCounterVec(
 				"type", tc.ErrorStatType, wfe.stats.httpErrorCount), 1)
 		})
 	}
@ -512,7 +512,7 @@ func TestEnforceJWSAuthType(t *testing.T) {
 				test.AssertMarshaledEquals(t, prob, tc.ExpectedResult)
 			}
 			if tc.ErrorStatType != "" {
-				test.AssertEquals(t, test.CountCounter(
+				test.AssertEquals(t, test.CountCounterVec(
 					"type", tc.ErrorStatType, wfe.stats.joseErrorCount), 1)
 			}
 		})
@ -582,7 +582,7 @@ func TestValidNonce(t *testing.T) {
 				test.AssertMarshaledEquals(t, prob, tc.ExpectedResult)
 			}
 			if tc.ErrorStatType != "" {
-				test.AssertEquals(t, test.CountCounter(
+				test.AssertEquals(t, test.CountCounterVec(
 					"type", tc.ErrorStatType, wfe.stats.joseErrorCount), 1)
 			}
 		})
@ -701,7 +701,7 @@ func TestValidPOSTURL(t *testing.T) {
 				test.AssertMarshaledEquals(t, prob, tc.ExpectedResult)
 			}
 			if tc.ErrorStatType != "" {
-				test.AssertEquals(t, test.CountCounter(
+				test.AssertEquals(t, test.CountCounterVec(
 					"type", tc.ErrorStatType, wfe.stats.joseErrorCount), 1)
 			}
 		})
@ -857,7 +857,7 @@ func TestParseJWSRequest(t *testing.T) {
 				test.AssertMarshaledEquals(t, prob, tc.ExpectedProblem)
 			}
 			if tc.ErrorStatType != "" {
-				test.AssertEquals(t, test.CountCounter(
+				test.AssertEquals(t, test.CountCounterVec(
 					"type", tc.ErrorStatType, wfe.stats.joseErrorCount), 1)
 			}
 		})
@ -1047,7 +1047,7 @@ func TestLookupJWK(t *testing.T) {
 				test.AssertMarshaledEquals(t, prob, tc.ExpectedProblem)
 			}
 			if tc.ErrorStatType != "" {
-				test.AssertEquals(t, test.CountCounter(
+				test.AssertEquals(t, test.CountCounterVec(
 					"type", tc.ErrorStatType, wfe.stats.joseErrorCount), 1)
 			}
 		})
@ -1178,7 +1178,7 @@ func TestValidJWSForKey(t *testing.T) {
 				test.AssertMarshaledEquals(t, prob, tc.ExpectedProblem)
 			}
 			if tc.ErrorStatType != "" {
-				test.AssertEquals(t, test.CountCounter(
+				test.AssertEquals(t, test.CountCounterVec(
 					"type", tc.ErrorStatType, wfe.stats.joseErrorCount), 1)
 			}
 		})
@ -1274,7 +1274,7 @@ func TestValidPOSTForAccount(t *testing.T) {
 				test.AssertMarshaledEquals(t, prob, tc.ExpectedProblem)
 			}
 			if tc.ErrorStatType != "" {
-				test.AssertEquals(t, test.CountCounter(
+				test.AssertEquals(t, test.CountCounterVec(
 					"type", tc.ErrorStatType, wfe.stats.joseErrorCount), 1)
 			}
 		})
@ -1379,7 +1379,7 @@ func TestValidSelfAuthenticatedPOST(t *testing.T) {
 				test.AssertMarshaledEquals(t, prob, tc.ExpectedProblem)
 			}
 			if tc.ErrorStatType != "" {
-				test.AssertEquals(t, test.CountCounter(
+				test.AssertEquals(t, test.CountCounterVec(
 					"type", tc.ErrorStatType, wfe.stats.joseErrorCount), 1)
 			}
 		})
@ -1462,7 +1462,7 @@ func TestMatchJWSURLs(t *testing.T) {
 				test.AssertMarshaledEquals(t, prob, tc.ExpectedProblem)
 			}
 			if tc.ErrorStatType != "" {
-				test.AssertEquals(t, test.CountCounter(
+				test.AssertEquals(t, test.CountCounterVec(
 					"type", tc.ErrorStatType, wfe.stats.joseErrorCount), 1)
 			}
 		})
--- a/wfe2/wfe_test.go
+++ b/wfe2/wfe_test.go
@ -1911,7 +1911,7 @@ func TestKeyRollover(t *testing.T) {
 			wfe.KeyRollover(ctx, newRequestEvent(), responseWriter, makePostRequestWithPath("key-change", outer))
 			test.AssertUnmarshaledEquals(t, responseWriter.Body.String(), tc.ExpectedResponse)
 			if tc.ErrorStatType != "" {
-				test.AssertEquals(t, test.CountCounter(
+				test.AssertEquals(t, test.CountCounterVec(
 					"type", tc.ErrorStatType, wfe.stats.joseErrorCount), 1)
 			}
 		})