Streamline and test crl-updater errors and audit logs (#6382)

Modify the way errors are handled in crl-updater:
- Rather than having each method in the tick, tickIssuer, tickShard
  chain concatenate all errors from its children, simply have them
  summarize the errors. This results in much shorter error messages.
- Rather than having each method log its own errors as it returns, have
  each caller responsible for logging the errors it receives from its
  children.

In addition, add tests for tick, tickIssuer, and tickShard which cover
their simple error paths, where one of the gRPC requests to the SA, CA,
or CRLStorer encounters an error. These tests let us ensure that errors
are being properly propagated upwards through the layers of indirection
and goroutines in the three methods under test, and that the appropriate
metrics are being incremented and log messages are being printed.

Fixes #6375
This commit is contained in:
Aaron Gable 2022-09-14 16:41:28 -07:00 committed by GitHub
parent db044a8822
commit d53c90a3bc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 300 additions and 39 deletions

View File

@ -2,7 +2,6 @@ package crl
import (
"encoding/json"
"fmt"
"math/big"
"time"
@ -31,7 +30,7 @@ func Number(thisUpdate time.Time) number {
type id string
// Id is a utility function which constructs a new `id`.
func Id(issuerID issuance.IssuerNameID, crlNumber number, shardIdx int) (id, error) {
func Id(issuerID issuance.IssuerNameID, crlNumber number, shardIdx int) id {
type info struct {
IssuerID issuance.IssuerNameID `json:"issuerID"`
CRLNumber number `json:"crlNumber"`
@ -39,7 +38,7 @@ func Id(issuerID issuance.IssuerNameID, crlNumber number, shardIdx int) (id, err
}
jsonBytes, err := json.Marshal(info{issuerID, crlNumber, shardIdx})
if err != nil {
return "", fmt.Errorf("computing CRL Id: %w", err)
panic(err)
}
return id(jsonBytes), nil
return id(jsonBytes)
}

View File

@ -11,8 +11,7 @@ import (
func TestId(t *testing.T) {
thisUpdate := time.Now()
out, err := Id(1337, Number(thisUpdate), 1)
test.AssertNotError(t, err, "Failed to create CRLId")
out := Id(1337, Number(thisUpdate), 1)
expectCRLId := fmt.Sprintf("{\"issuerID\":1337,\"crlNumber\":%d,\"shardIdx\":1}", big.NewInt(thisUpdate.UnixNano()))
test.AssertEquals(t, string(out), expectCRLId)
}

View File

@ -133,10 +133,7 @@ func (cs *crlStorer) UploadCRL(stream cspb.CRLStorer_UploadCRLServer) error {
return errors.New("got no metadata message")
}
crlId, err := crl.Id(issuer.NameID(), crlNumber, int(shardIdx))
if err != nil {
return err
}
crlId := crl.Id(issuer.NameID(), crlNumber, int(shardIdx))
cs.sizeHistogram.WithLabelValues(issuer.Subject.CommonName).Observe(float64(len(crlBytes)))

View File

@ -3,9 +3,9 @@ package updater
import (
"context"
"crypto/sha256"
"errors"
"fmt"
"io"
"sort"
"strings"
"time"
@ -166,7 +166,9 @@ func (cu *crlUpdater) Run(ctx context.Context) error {
if err != nil {
// We only log, rather than return, so that the long-lived process can
// continue and try again at the next tick.
cu.log.AuditErrf("tick at time %s failed: %s", atTime, err)
cu.log.AuditErrf(
"Generating CRLs failed: number=[%d] err=[%s]",
crl.Number(atTime), err)
}
case <-ctx.Done():
ticker.Stop()
@ -181,31 +183,31 @@ func (cu *crlUpdater) Run(ctx context.Context) error {
// error at the end.
func (cu *crlUpdater) Tick(ctx context.Context, atTime time.Time) (err error) {
defer func() {
// Each return statement in this function assigns its returned value to the
// named return variable `err`. This deferred function closes over that
// variable, and so can reference and modify it.
// This func closes over the named return value `err`, so can reference it.
result := "success"
if err != nil {
result = "failed"
err = fmt.Errorf("tick %s: %w", atTime.Format(time.RFC3339Nano), err)
}
cu.tickHistogram.WithLabelValues("all", result).Observe(cu.clk.Since(atTime).Seconds())
}()
cu.log.Debugf("Ticking at time %s", atTime)
var errStrs []string
var errIssuers []string
for id := range cu.issuers {
// For now, process each issuer serially. This keeps the worker pool system
// simple, and processing all of the issuers in parallel likely wouldn't
// meaningfully speed up the overall process.
err := cu.tickIssuer(ctx, atTime, id)
if err != nil {
errStrs = append(errStrs, err.Error())
cu.log.AuditErrf(
"Generating CRLs for issuer failed: number=[%d] issuer=[%s] err=[%s]",
crl.Number(atTime), cu.issuers[id].Subject.CommonName, err)
errIssuers = append(errIssuers, cu.issuers[id].Subject.CommonName)
}
}
if len(errStrs) != 0 {
return errors.New(strings.Join(errStrs, "; "))
if len(errIssuers) != 0 {
return fmt.Errorf("%d issuers failed: %v", len(errIssuers), strings.Join(errIssuers, ", "))
}
return nil
}
@ -217,13 +219,10 @@ func (cu *crlUpdater) Tick(ctx context.Context, atTime time.Time) (err error) {
func (cu *crlUpdater) tickIssuer(ctx context.Context, atTime time.Time, issuerNameID issuance.IssuerNameID) (err error) {
start := cu.clk.Now()
defer func() {
// Each return statement in this function assigns its returned value to the
// named return variable `err`. This deferred function closes over that
// variable, and so can reference and modify it.
// This func closes over the named return value `err`, so can reference it.
result := "success"
if err != nil {
result = "failed"
err = fmt.Errorf("%s: %w", cu.issuers[issuerNameID].Subject.CommonName, err)
}
cu.tickHistogram.WithLabelValues(cu.issuers[issuerNameID].Subject.CommonName+" (Overall)", result).Observe(cu.clk.Since(start).Seconds())
}()
@ -259,16 +258,20 @@ func (cu *crlUpdater) tickIssuer(ctx context.Context, atTime time.Time, issuerNa
}
close(shardIdxs)
var errStrs []string
var errShards []int
for i := 0; i < cu.numShards; i++ {
res := <-shardResults
if res.err != nil {
errStrs = append(errStrs, res.err.Error())
cu.log.AuditErrf(
"Generating CRL failed: id=[%s] err=[%s]",
crl.Id(issuerNameID, crl.Number(atTime), res.shardIdx), err)
errShards = append(errShards, res.shardIdx)
}
}
if len(errStrs) != 0 {
return errors.New(strings.Join(errStrs, "; "))
if len(errShards) != 0 {
sort.Ints(errShards)
return fmt.Errorf("%d shards failed: %v", len(errShards), errShards)
}
return nil
}
@ -281,21 +284,12 @@ func (cu *crlUpdater) tickShard(ctx context.Context, atTime time.Time, issuerNam
ctx, cancel := context.WithCancel(ctx)
defer cancel()
crlId, err := crl.Id(issuerNameID, crl.Number(atTime), shardIdx)
if err != nil {
return err
}
start := cu.clk.Now()
defer func() {
// Each return statement in this function assigns its returned value to the
// named return variable `err`. This deferred function closes over that
// variable, and so can reference and modify it.
// This func closes over the named return value `err`, so can reference it.
result := "success"
if err != nil {
cu.log.AuditErrf("Generating CRL failed: id=[%s] err=[%s]", crlId, err)
result = "failed"
err = fmt.Errorf("%d: %w", shardIdx, err)
}
cu.tickHistogram.WithLabelValues(cu.issuers[issuerNameID].Subject.CommonName, result).Observe(cu.clk.Since(start).Seconds())
cu.updatedCounter.WithLabelValues(cu.issuers[issuerNameID].Subject.CommonName, result).Inc()
@ -406,7 +400,9 @@ func (cu *crlUpdater) tickShard(ctx context.Context, atTime time.Time, issuerNam
return fmt.Errorf("closing CRLStorer upload stream: %w", err)
}
cu.log.Infof("Generated CRL: id=[%s] size=[%d] hash=[%x]", crlId, crlLen, crlHash.Sum(nil))
cu.log.Infof(
"Generated CRL shard: id=[%s] size=[%d] hash=[%x]",
crl.Id(issuerNameID, crl.Number(atTime), shardIdx), crlLen, crlHash.Sum(nil))
return nil
}

View File

@ -1,12 +1,282 @@
package updater
import (
"context"
"errors"
"io"
"testing"
"time"
"google.golang.org/grpc"
"google.golang.org/protobuf/types/known/emptypb"
"github.com/jmhodges/clock"
capb "github.com/letsencrypt/boulder/ca/proto"
corepb "github.com/letsencrypt/boulder/core/proto"
cspb "github.com/letsencrypt/boulder/crl/storer/proto"
"github.com/letsencrypt/boulder/issuance"
blog "github.com/letsencrypt/boulder/log"
"github.com/letsencrypt/boulder/metrics"
"github.com/letsencrypt/boulder/mocks"
sapb "github.com/letsencrypt/boulder/sa/proto"
"github.com/letsencrypt/boulder/test"
"github.com/prometheus/client_golang/prometheus"
)
// fakeGRCC is a fake sapb.StorageAuthority_GetRevokedCertsClient which can be
// populated with some CRL entries or an error for use as the return value of
// a faked GetRevokedCerts call.
type fakeGRCC struct {
	grpc.ClientStream
	entries []*corepb.CRLEntry
	nextIdx int
	err     error
}

// Recv returns the injected error if one was configured; otherwise it hands
// out the pre-loaded entries one at a time, and io.EOF once exhausted.
func (f *fakeGRCC) Recv() (*corepb.CRLEntry, error) {
	switch {
	case f.err != nil:
		return nil, f.err
	case f.nextIdx >= len(f.entries):
		return nil, io.EOF
	}
	entry := f.entries[f.nextIdx]
	f.nextIdx++
	return entry, nil
}
// fakeSAC is a fake sapb.StorageAuthorityClient which can be populated with a
// fakeGRCC to be used as the return value for calls to GetRevokedCerts.
type fakeSAC struct {
	mocks.StorageAuthority
	// grcc is the stream handed back by GetRevokedCerts; inject failures by
	// setting its err field.
	grcc fakeGRCC
}

// GetRevokedCerts returns the pre-configured fake stream and never errors
// itself; any injected failure surfaces later via the stream's Recv.
func (f *fakeSAC) GetRevokedCerts(ctx context.Context, _ *sapb.GetRevokedCertsRequest, _ ...grpc.CallOption) (sapb.StorageAuthority_GetRevokedCertsClient, error) {
	return &f.grcc, nil
}
// fakeGCC is a fake capb.CRLGenerator_GenerateCRLClient which can be
// populated with some CRL entries or an error for use as the return value of
// a faked GenerateCRL call.
type fakeGCC struct {
	grpc.ClientStream
	chunks  [][]byte
	nextIdx int
	sendErr error
	recvErr error
}

// Send reports the configured send-side error (nil by default).
func (f *fakeGCC) Send(*capb.GenerateCRLRequest) error {
	return f.sendErr
}

// CloseSend always succeeds.
func (f *fakeGCC) CloseSend() error {
	return nil
}

// Recv returns the injected receive-side error if one was configured;
// otherwise it streams the pre-loaded chunks one response at a time,
// finishing with io.EOF.
func (f *fakeGCC) Recv() (*capb.GenerateCRLResponse, error) {
	if f.recvErr != nil {
		return nil, f.recvErr
	}
	if f.nextIdx >= len(f.chunks) {
		return nil, io.EOF
	}
	chunk := f.chunks[f.nextIdx]
	f.nextIdx++
	return &capb.GenerateCRLResponse{Chunk: chunk}, nil
}
// fakeCGC is a fake capb.CRLGeneratorClient which can be populated with a
// fakeGCC to be used as the return value for calls to GenerateCRL.
type fakeCGC struct {
	// gcc is the stream handed back by GenerateCRL; inject failures by
	// setting its sendErr or recvErr fields.
	gcc fakeGCC
}

// GenerateCRL returns the pre-configured fake stream and never errors itself.
func (f *fakeCGC) GenerateCRL(ctx context.Context, opts ...grpc.CallOption) (capb.CRLGenerator_GenerateCRLClient, error) {
	return &f.gcc, nil
}
// fakeUCC is a fake cspb.CRLStorer_UploadCRLClient which can be populated with
// an error for use as the return value of a faked UploadCRL call.
type fakeUCC struct {
	grpc.ClientStream
	sendErr error
	recvErr error
}

// Send reports the configured send-side error (nil by default).
func (f *fakeUCC) Send(*cspb.UploadCRLRequest) error {
	return f.sendErr
}

// CloseAndRecv reports the configured close/receive error, or success.
func (f *fakeUCC) CloseAndRecv() (*emptypb.Empty, error) {
	if err := f.recvErr; err != nil {
		return nil, err
	}
	return &emptypb.Empty{}, nil
}
// fakeCSC is a fake cspb.CRLStorerClient which can be populated with a
// fakeUCC for use as the return value for calls to UploadCRL.
type fakeCSC struct {
	// ucc is the stream handed back by UploadCRL; inject failures by setting
	// its sendErr or recvErr fields.
	ucc fakeUCC
}

// UploadCRL returns the pre-configured fake stream and never errors itself.
func (f *fakeCSC) UploadCRL(ctx context.Context, opts ...grpc.CallOption) (cspb.CRLStorer_UploadCRLClient, error) {
	return &f.ucc, nil
}
// TestTickShard exercises tickShard's error paths by injecting a sentinel
// error into each of its gRPC dependencies in turn, and asserting that the
// error is wrapped with the expected context, wraps the sentinel, and causes
// the "failed" result metric to be incremented.
//
// NOTE(review): the fault injections are cumulative — each stage replaces one
// client on cu while the clients broken in earlier stages stay broken. The
// test ordering therefore relies on each newly injected fault occurring
// *earlier* in tickShard's control flow than the previously injected ones,
// so the newest fault is the one that surfaces ("bubbles up sooner").
func TestTickShard(t *testing.T) {
	e1, err := issuance.LoadCertificate("../../test/hierarchy/int-e1.cert.pem")
	test.AssertNotError(t, err, "loading test issuer")
	r3, err := issuance.LoadCertificate("../../test/hierarchy/int-r3.cert.pem")
	test.AssertNotError(t, err, "loading test issuer")
	// sentinelErr lets each stage assert, via errors.Is, that the surfaced
	// error is the one injected at that stage.
	sentinelErr := errors.New("oops")
	cu, err := NewUpdater(
		[]*issuance.Certificate{e1, r3}, 2, 10*24*time.Hour, 24*time.Hour, 0, 1,
		&fakeSAC{grcc: fakeGRCC{}},
		&fakeCGC{gcc: fakeGCC{}},
		&fakeCSC{ucc: fakeUCC{}},
		metrics.NoopRegisterer, blog.NewMock(), clock.NewFake(),
	)
	test.AssertNotError(t, err, "building test crlUpdater")
	// Ensure that getting no results from the SA still works.
	err = cu.tickShard(context.Background(), cu.clk.Now(), e1.NameID(), 0)
	test.AssertNotError(t, err, "empty CRL")
	test.AssertMetricWithLabelsEquals(t, cu.updatedCounter, prometheus.Labels{
		"issuer": "(TEST) Elegant Elephant E1", "result": "success",
	}, 1)
	// Reset the counter between stages so each assertion checks exactly one
	// tickShard call.
	cu.updatedCounter.Reset()
	// Errors closing the Storer upload stream should bubble up.
	cu.cs = &fakeCSC{ucc: fakeUCC{recvErr: sentinelErr}}
	err = cu.tickShard(context.Background(), cu.clk.Now(), e1.NameID(), 0)
	test.AssertError(t, err, "storer error")
	test.AssertContains(t, err.Error(), "closing CRLStorer upload stream")
	test.AssertErrorIs(t, err, sentinelErr)
	test.AssertMetricWithLabelsEquals(t, cu.updatedCounter, prometheus.Labels{
		"issuer": "(TEST) Elegant Elephant E1", "result": "failed",
	}, 1)
	cu.updatedCounter.Reset()
	// Errors reading from the CA should bubble up sooner.
	cu.ca = &fakeCGC{gcc: fakeGCC{recvErr: sentinelErr}}
	err = cu.tickShard(context.Background(), cu.clk.Now(), e1.NameID(), 0)
	test.AssertError(t, err, "CA error")
	test.AssertContains(t, err.Error(), "receiving CRL bytes")
	test.AssertErrorIs(t, err, sentinelErr)
	test.AssertMetricWithLabelsEquals(t, cu.updatedCounter, prometheus.Labels{
		"issuer": "(TEST) Elegant Elephant E1", "result": "failed",
	}, 1)
	cu.updatedCounter.Reset()
	// Errors sending to the Storer should bubble up sooner.
	cu.cs = &fakeCSC{ucc: fakeUCC{sendErr: sentinelErr}}
	err = cu.tickShard(context.Background(), cu.clk.Now(), e1.NameID(), 0)
	test.AssertError(t, err, "storer error")
	test.AssertContains(t, err.Error(), "sending CRLStorer metadata")
	test.AssertErrorIs(t, err, sentinelErr)
	test.AssertMetricWithLabelsEquals(t, cu.updatedCounter, prometheus.Labels{
		"issuer": "(TEST) Elegant Elephant E1", "result": "failed",
	}, 1)
	cu.updatedCounter.Reset()
	// Errors reading from the SA should bubble up sooner.
	cu.sa = &fakeSAC{grcc: fakeGRCC{err: sentinelErr}}
	err = cu.tickShard(context.Background(), cu.clk.Now(), e1.NameID(), 0)
	test.AssertError(t, err, "database error")
	test.AssertContains(t, err.Error(), "retrieving entry from SA")
	test.AssertErrorIs(t, err, sentinelErr)
	test.AssertMetricWithLabelsEquals(t, cu.updatedCounter, prometheus.Labels{
		"issuer": "(TEST) Elegant Elephant E1", "result": "failed",
	}, 1)
	cu.updatedCounter.Reset()
	// Errors sending to the CA should bubble up soonest.
	cu.ca = &fakeCGC{gcc: fakeGCC{sendErr: sentinelErr}}
	err = cu.tickShard(context.Background(), cu.clk.Now(), e1.NameID(), 0)
	test.AssertError(t, err, "CA error")
	test.AssertContains(t, err.Error(), "sending CA metadata")
	test.AssertErrorIs(t, err, sentinelErr)
	test.AssertMetricWithLabelsEquals(t, cu.updatedCounter, prometheus.Labels{
		"issuer": "(TEST) Elegant Elephant E1", "result": "failed",
	}, 1)
	cu.updatedCounter.Reset()
}
// TestTickIssuer injects an SA error that affects every shard of one issuer,
// and asserts that tickIssuer summarizes the failing shard indices in its
// returned error, audit-logs each shard failure individually, and increments
// both the per-shard and overall "failed" tick metrics.
func TestTickIssuer(t *testing.T) {
	e1, err := issuance.LoadCertificate("../../test/hierarchy/int-e1.cert.pem")
	test.AssertNotError(t, err, "loading test issuer")
	r3, err := issuance.LoadCertificate("../../test/hierarchy/int-r3.cert.pem")
	test.AssertNotError(t, err, "loading test issuer")
	// mockLog captures audit-log output so the per-shard log lines can be
	// counted below.
	mockLog := blog.NewMock()
	// The updater is configured with 2 shards, and the fakeSAC fails every
	// GetRevokedCerts stream, so both shards of this issuer will fail.
	cu, err := NewUpdater(
		[]*issuance.Certificate{e1, r3}, 2, 10*24*time.Hour, 24*time.Hour, 0, 1,
		&fakeSAC{grcc: fakeGRCC{err: errors.New("db no worky")}},
		&fakeCGC{gcc: fakeGCC{}},
		&fakeCSC{ucc: fakeUCC{}},
		metrics.NoopRegisterer, mockLog, clock.NewFake(),
	)
	test.AssertNotError(t, err, "building test crlUpdater")
	// An error that affects all shards should have every shard reflected in the
	// combined error message.
	err = cu.tickIssuer(context.Background(), cu.clk.Now(), e1.NameID())
	test.AssertError(t, err, "database error")
	test.AssertContains(t, err.Error(), "2 shards failed")
	// Both shard indices (sorted) appear in the summarized error.
	test.AssertContains(t, err.Error(), "[0 1]")
	// One audit-log line per failed shard.
	test.AssertEquals(t, len(mockLog.GetAllMatching("Generating CRL failed:")), 2)
	test.AssertMetricWithLabelsEquals(t, cu.tickHistogram, prometheus.Labels{
		"issuer": "(TEST) Elegant Elephant E1", "result": "failed",
	}, 2)
	test.AssertMetricWithLabelsEquals(t, cu.tickHistogram, prometheus.Labels{
		"issuer": "(TEST) Elegant Elephant E1 (Overall)", "result": "failed",
	}, 1)
	cu.tickHistogram.Reset()
}
// TestTick injects an SA error that affects both issuers, and asserts that
// Tick summarizes the failing issuers in its returned error, that the
// per-shard (2 issuers x 2 shards = 4) and per-issuer (2) failures are each
// audit-logged, and that the per-issuer and overall "failed" tick metrics
// are incremented.
func TestTick(t *testing.T) {
	e1, err := issuance.LoadCertificate("../../test/hierarchy/int-e1.cert.pem")
	test.AssertNotError(t, err, "loading test issuer")
	r3, err := issuance.LoadCertificate("../../test/hierarchy/int-r3.cert.pem")
	test.AssertNotError(t, err, "loading test issuer")
	// mockLog captures audit-log output so the per-shard and per-issuer log
	// lines can be counted below.
	mockLog := blog.NewMock()
	// The fakeSAC fails every GetRevokedCerts stream, so every shard of both
	// issuers will fail.
	cu, err := NewUpdater(
		[]*issuance.Certificate{e1, r3}, 2, 10*24*time.Hour, 24*time.Hour, 0, 1,
		&fakeSAC{grcc: fakeGRCC{err: errors.New("db no worky")}},
		&fakeCGC{gcc: fakeGCC{}},
		&fakeCSC{ucc: fakeUCC{}},
		metrics.NoopRegisterer, mockLog, clock.NewFake(),
	)
	test.AssertNotError(t, err, "building test crlUpdater")
	// An error that affects all issuers should have every issuer reflected in the
	// combined error message.
	now := cu.clk.Now()
	err = cu.Tick(context.Background(), now)
	test.AssertError(t, err, "database error")
	test.AssertContains(t, err.Error(), "2 issuers failed")
	test.AssertContains(t, err.Error(), "(TEST) Elegant Elephant E1")
	test.AssertContains(t, err.Error(), "(TEST) Radical Rhino R3")
	// 2 issuers x 2 shards = 4 shard-level failures, plus 2 issuer-level ones.
	test.AssertEquals(t, len(mockLog.GetAllMatching("Generating CRL failed:")), 4)
	test.AssertEquals(t, len(mockLog.GetAllMatching("Generating CRLs for issuer failed:")), 2)
	test.AssertMetricWithLabelsEquals(t, cu.tickHistogram, prometheus.Labels{
		"issuer": "(TEST) Elegant Elephant E1 (Overall)", "result": "failed",
	}, 1)
	test.AssertMetricWithLabelsEquals(t, cu.tickHistogram, prometheus.Labels{
		"issuer": "(TEST) Radical Rhino R3 (Overall)", "result": "failed",
	}, 1)
	test.AssertMetricWithLabelsEquals(t, cu.tickHistogram, prometheus.Labels{
		"issuer": "all", "result": "failed",
	}, 1)
	cu.tickHistogram.Reset()
}
func TestGetWindowForShard(t *testing.T) {
// Our test updater divides a 107-day window into 107 shards, resulting in a
// shard width of 24 hours.