Add stagger to CT log submissions. (#3794)

This gives each log a chance to respond before we move on to the next,
spreading our load more evenly across the logs in a log group.
Jacob Hoffman-Andrews, 2018-07-06 13:25:51 -07:00 (committed by Daniel McCarney)
parent 92d273408f
commit 36a83150ad
4 changed files with 51 additions and 3 deletions
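
The staggering pattern itself is small: launch one goroutine per log, have goroutine i sleep i * stagger before submitting, and skip the submission entirely if the shared context was already canceled by an earlier success. Below is a minimal, standalone sketch of that pattern; it is not part of this commit, and the log URIs, the submit function, and its 200ms latency are made up for illustration.

package main

import (
	"context"
	"fmt"
	"time"
)

// submit stands in for a single CT log submission; the real code calls the
// publisher service instead.
func submit(ctx context.Context, logURI string) (string, error) {
	select {
	case <-time.After(200 * time.Millisecond): // pretend the log takes 200ms to answer
		return "sct-from-" + logURI, nil
	case <-ctx.Done():
		return "", ctx.Err()
	}
}

func main() {
	stagger := 500 * time.Millisecond
	logs := []string{"https://log-a.example", "https://log-b.example", "https://log-c.example"}

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	results := make(chan string, len(logs))
	for i, uri := range logs {
		go func(i int, uri string) {
			// Each submission waits i * stagger, giving earlier logs a head
			// start; if a winner already canceled the context, don't submit.
			time.Sleep(time.Duration(i) * stagger)
			if ctx.Err() != nil {
				return
			}
			if sct, err := submit(ctx, uri); err == nil {
				results <- sct
			}
		}(i, uri)
	}

	// Take the first SCT and cancel the remaining submissions.
	fmt.Println("winner:", <-results)
	cancel()
}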


@@ -309,4 +309,7 @@ type CAADistributedResolverConfig struct {
 type CTGroup struct {
 	Name string
 	Logs []LogDescription
+	// How long to wait for one log to accept a certificate before moving on to
+	// the next.
+	Stagger ConfigDuration
 }
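
For context, the "500ms" strings in the config change at the bottom of this commit are parsed into a time.Duration. The sketch below shows the general pattern such a wrapper is assumed to follow; the configDuration type here is a stand-in for illustration, not Boulder's actual cmd.ConfigDuration implementation.

package main

import (
	"encoding/json"
	"fmt"
	"time"
)

// configDuration wraps a time.Duration so it can unmarshal from JSON strings
// like "500ms".
type configDuration struct {
	time.Duration
}

func (d *configDuration) UnmarshalJSON(b []byte) error {
	var s string
	if err := json.Unmarshal(b, &s); err != nil {
		return err
	}
	dur, err := time.ParseDuration(s)
	if err != nil {
		return err
	}
	d.Duration = dur
	return nil
}

func main() {
	var group struct {
		Name    string         `json:"name"`
		Stagger configDuration `json:"stagger"`
	}
	if err := json.Unmarshal([]byte(`{"name": "a", "stagger": "500ms"}`), &group); err != nil {
		panic(err)
	}
	fmt.Println(group.Name, group.Stagger.Duration) // prints: a 500ms
}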


@@ -4,6 +4,7 @@ import (
 	"context"
 	"errors"
 	"math/rand"
+	"time"
 
 	"github.com/letsencrypt/boulder/canceled"
 	"github.com/letsencrypt/boulder/cmd"
@@ -28,7 +29,12 @@ type CTPolicy struct {
 }
 
 // New creates a new CTPolicy struct
-func New(pub core.Publisher, groups []cmd.CTGroup, informational []cmd.LogDescription, log blog.Logger, stats metrics.Scope) *CTPolicy {
+func New(pub core.Publisher,
+	groups []cmd.CTGroup,
+	informational []cmd.LogDescription,
+	log blog.Logger,
+	stats metrics.Scope,
+) *CTPolicy {
 	var finalLogs []cmd.LogDescription
 	for _, group := range groups {
 		for _, log := range group.Logs {
@@ -79,7 +85,15 @@ func (ctp *CTPolicy) race(ctx context.Context, cert core.CertDER, group cmd.CTGr
 	// so we maximize the distribution of logs we get SCTs from.
 	for _, i := range rand.Perm(len(group.Logs)) {
 		l := group.Logs[i]
-		go func(l cmd.LogDescription) {
+		go func(i int, l cmd.LogDescription) {
+			// Each submission waits a bit longer than the previous one, to give the
+			// previous log a chance to reply. If the context is already done by the
+			// time we get here, don't bother submitting. That generally means the
+			// context was canceled because another log returned a success already.
+			time.Sleep(time.Duration(i) * group.Stagger.Duration)
+			if ctx.Err() != nil {
+				return
+			}
 			sct, err := ctp.pub.SubmitToSingleCTWithResult(ctx, &pubpb.Request{
 				LogURL:       &l.URI,
 				LogPublicKey: &l.Key,
@@ -95,7 +109,7 @@ func (ctp *CTPolicy) race(ctx context.Context, cert core.CertDER, group cmd.CTGr
 				return
 			}
 			results <- result{sct: sct.Sct, log: l.URI}
-		}(l)
+		}(i, l)
 	}
 
 	for i := 0; i < len(group.Logs); i++ {


@@ -167,3 +167,32 @@ func TestGetSCTsMetrics(t *testing.T) {
 	test.AssertEquals(t, test.CountCounter(ctp.winnerCounter.With(prometheus.Labels{"log": "ghi", "group": "a"})), 1)
 	test.AssertEquals(t, test.CountCounter(ctp.winnerCounter.With(prometheus.Labels{"log": "ghi", "group": "b"})), 1)
 }
+
+// A mock publisher that counts submissions
+type countEm struct {
+	count int
+}
+
+func (ce *countEm) SubmitToSingleCTWithResult(_ context.Context, _ *pubpb.Request) (*pubpb.Result, error) {
+	ce.count++
+	return &pubpb.Result{Sct: []byte{0}}, nil
+}
+
+func TestStagger(t *testing.T) {
+	countingPub := &countEm{}
+	ctp := New(countingPub, []cmd.CTGroup{
+		{
+			Name:    "a",
+			Stagger: cmd.ConfigDuration{Duration: 500 * time.Millisecond},
+			Logs: []cmd.LogDescription{
+				{URI: "abc", Key: "def"},
+				{URI: "ghi", Key: "jkl"},
+			},
+		},
+	}, nil, blog.NewMock(), metrics.NewNoopScope())
+	_, err := ctp.GetSCTs(context.Background(), []byte{0})
+	test.AssertNotError(t, err, "GetSCTs failed")
+	if countingPub.count != 1 {
+		t.Errorf("wrong number of requests to publisher. got %d, expected 1", countingPub.count)
+	}
+}


@@ -56,6 +56,7 @@
     "CTLogGroups2": [
       {
         "name": "a",
+        "stagger": "500ms",
        "logs": [
          {
            "uri": "http://boulder:4500",
@@ -71,6 +72,7 @@
       },
       {
         "name": "b",
+        "stagger": "500ms",
        "logs": [
          {
            "uri": "http://boulder:4510",