Add stagger to CT log submissions. (#3794)

This gives each log a chance to respond before we move on to the next,
spreading our load more evenly across the logs in a log group.
Jacob Hoffman-Andrews, 2018-07-06 13:25:51 -07:00 (committed by Daniel McCarney)
parent 92d273408f
commit 36a83150ad
4 changed files with 51 additions and 3 deletions
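
The staggering pattern itself is small: launch one goroutine per log, have goroutine i sleep i * stagger before submitting, and skip the submission entirely if the shared context was already canceled by an earlier success. Below is a minimal, standalone sketch of that pattern; it is not part of this commit, and the log URIs, the submit function, and its 200ms latency are made up for illustration.

package main

import (
	"context"
	"fmt"
	"time"
)

// submit stands in for a single CT log submission; the real code calls the
// publisher service instead.
func submit(ctx context.Context, logURI string) (string, error) {
	select {
	case <-time.After(200 * time.Millisecond): // pretend the log takes 200ms to answer
		return "sct-from-" + logURI, nil
	case <-ctx.Done():
		return "", ctx.Err()
	}
}

func main() {
	stagger := 500 * time.Millisecond
	logs := []string{"https://log-a.example", "https://log-b.example", "https://log-c.example"}

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	results := make(chan string, len(logs))
	for i, uri := range logs {
		go func(i int, uri string) {
			// Each submission waits i * stagger, giving earlier logs a head
			// start; if a winner already canceled the context, don't submit.
			time.Sleep(time.Duration(i) * stagger)
			if ctx.Err() != nil {
				return
			}
			if sct, err := submit(ctx, uri); err == nil {
				results <- sct
			}
		}(i, uri)
	}

	// Take the first SCT and cancel the remaining submissions.
	fmt.Println("winner:", <-results)
	cancel()
}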


@@ -309,4 +309,7 @@ type CAADistributedResolverConfig struct {
 type CTGroup struct {
 	Name string
 	Logs []LogDescription
+	// How long to wait for one log to accept a certificate before moving on to
+	// the next.
+	Stagger ConfigDuration
 }
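
For context, the "500ms" strings in the config change at the bottom of this commit are parsed into a time.Duration. The sketch below shows the general pattern such a wrapper is assumed to follow; the configDuration type here is a stand-in for illustration, not Boulder's actual cmd.ConfigDuration implementation.

package main

import (
	"encoding/json"
	"fmt"
	"time"
)

// configDuration wraps a time.Duration so it can unmarshal from JSON strings
// like "500ms".
type configDuration struct {
	time.Duration
}

func (d *configDuration) UnmarshalJSON(b []byte) error {
	var s string
	if err := json.Unmarshal(b, &s); err != nil {
		return err
	}
	dur, err := time.ParseDuration(s)
	if err != nil {
		return err
	}
	d.Duration = dur
	return nil
}

func main() {
	var group struct {
		Name    string         `json:"name"`
		Stagger configDuration `json:"stagger"`
	}
	if err := json.Unmarshal([]byte(`{"name": "a", "stagger": "500ms"}`), &group); err != nil {
		panic(err)
	}
	fmt.Println(group.Name, group.Stagger.Duration) // prints: a 500ms
}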


@@ -4,6 +4,7 @@ import (
 	"context"
 	"errors"
 	"math/rand"
+	"time"
 
 	"github.com/letsencrypt/boulder/canceled"
 	"github.com/letsencrypt/boulder/cmd"
@@ -28,7 +29,12 @@ type CTPolicy struct {
 }
 
 // New creates a new CTPolicy struct
-func New(pub core.Publisher, groups []cmd.CTGroup, informational []cmd.LogDescription, log blog.Logger, stats metrics.Scope) *CTPolicy {
+func New(pub core.Publisher,
+	groups []cmd.CTGroup,
+	informational []cmd.LogDescription,
+	log blog.Logger,
+	stats metrics.Scope,
+) *CTPolicy {
 	var finalLogs []cmd.LogDescription
 	for _, group := range groups {
 		for _, log := range group.Logs {
@@ -79,7 +85,15 @@ func (ctp *CTPolicy) race(ctx context.Context, cert core.CertDER, group cmd.CTGr
 	// so we maximize the distribution of logs we get SCTs from.
 	for _, i := range rand.Perm(len(group.Logs)) {
 		l := group.Logs[i]
-		go func(l cmd.LogDescription) {
+		go func(i int, l cmd.LogDescription) {
+			// Each submission waits a bit longer than the previous one, to give the
+			// previous log a chance to reply. If the context is already done by the
+			// time we get here, don't bother submitting. That generally means the
+			// context was canceled because another log returned a success already.
+			time.Sleep(time.Duration(i) * group.Stagger.Duration)
+			if ctx.Err() != nil {
+				return
+			}
 			sct, err := ctp.pub.SubmitToSingleCTWithResult(ctx, &pubpb.Request{
 				LogURL:       &l.URI,
 				LogPublicKey: &l.Key,
@@ -95,7 +109,7 @@ func (ctp *CTPolicy) race(ctx context.Context, cert core.CertDER, group cmd.CTGr
 				return
 			}
 			results <- result{sct: sct.Sct, log: l.URI}
-		}(l)
+		}(i, l)
 	}
 
 	for i := 0; i < len(group.Logs); i++ {


@@ -167,3 +167,32 @@ func TestGetSCTsMetrics(t *testing.T) {
 	test.AssertEquals(t, test.CountCounter(ctp.winnerCounter.With(prometheus.Labels{"log": "ghi", "group": "a"})), 1)
 	test.AssertEquals(t, test.CountCounter(ctp.winnerCounter.With(prometheus.Labels{"log": "ghi", "group": "b"})), 1)
 }
+
+// A mock publisher that counts submissions
+type countEm struct {
+	count int
+}
+
+func (ce *countEm) SubmitToSingleCTWithResult(_ context.Context, _ *pubpb.Request) (*pubpb.Result, error) {
+	ce.count++
+	return &pubpb.Result{Sct: []byte{0}}, nil
+}
+
+func TestStagger(t *testing.T) {
+	countingPub := &countEm{}
+	ctp := New(countingPub, []cmd.CTGroup{
+		{
+			Name:    "a",
+			Stagger: cmd.ConfigDuration{Duration: 500 * time.Millisecond},
+			Logs: []cmd.LogDescription{
+				{URI: "abc", Key: "def"},
+				{URI: "ghi", Key: "jkl"},
+			},
+		},
+	}, nil, blog.NewMock(), metrics.NewNoopScope())
+	_, err := ctp.GetSCTs(context.Background(), []byte{0})
+	test.AssertNotError(t, err, "GetSCTs failed")
+	if countingPub.count != 1 {
+		t.Errorf("wrong number of requests to publisher. got %d, expected 1", countingPub.count)
+	}
+}


@@ -56,6 +56,7 @@
     "CTLogGroups2": [
       {
         "name": "a",
+        "stagger": "500ms",
        "logs": [
          {
            "uri": "http://boulder:4500",
@@ -71,6 +72,7 @@
       },
       {
         "name": "b",
+        "stagger": "500ms",
        "logs": [
          {
            "uri": "http://boulder:4510",