SARO: Add sa_lag_factor metric to assess usage of the lagFactor codepath (#6774)

Add `sa_lag_retry` prometheus countervec metric with pass/fail
dimensions for `GetOrder`, `GetAuthorization2`, and `GetRegistration`
methods.

The new metrics will appear as follows:
```
sa_lag_retry{method="GetOrder",result="found"} 0
sa_lag_retry{method="GetOrder",result="notfound"} 0
sa_lag_retry{method="GetOrder",result="other"} 0
sa_lag_retry{method="GetAuthorization2",result="found"} 0
sa_lag_retry{method="GetAuthorization2",result="notfound"} 0
sa_lag_retry{method="GetAuthorization2",result="other"} 0
sa_lag_retry{method="GetRegistration",result="found"} 0
sa_lag_retry{method="GetRegistration",result="notfound"} 0
sa_lag_retry{method="GetRegistration",result="other"} 0
```

Fixes https://github.com/letsencrypt/boulder/issues/6773

---------

Co-authored-by: Samantha <hello@entropy.cat>
This commit is contained in:
Phil Porada 2023-03-30 13:48:16 -04:00 committed by GitHub
parent 3850ab3bd8
commit ce2ee69c5f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 59 additions and 5 deletions

View File

@ -87,7 +87,7 @@ func main() {
cmd.FailOnError(err, "TLS config")
saroi, err := sa.NewSQLStorageAuthorityRO(
dbReadOnlyMap, dbIncidentsMap, parallel, c.SA.LagFactor.Duration, clk, logger)
dbReadOnlyMap, dbIncidentsMap, scope, parallel, c.SA.LagFactor.Duration, clk, logger)
cmd.FailOnError(err, "Failed to create read-only SA impl")
sai, err := sa.NewSQLStorageAuthorityWrapping(saroi, dbMap, scope)

View File

@ -87,7 +87,7 @@ func NewSQLStorageAuthority(
stats prometheus.Registerer,
) (*SQLStorageAuthority, error) {
ssaro, err := NewSQLStorageAuthorityRO(
dbReadOnlyMap, dbIncidentsMap, parallelismPerRPC, lagFactor, clk, logger)
dbReadOnlyMap, dbIncidentsMap, stats, parallelismPerRPC, lagFactor, clk, logger)
if err != nil {
return nil, err
}

View File

@ -36,6 +36,7 @@ import (
sapb "github.com/letsencrypt/boulder/sa/proto"
"github.com/letsencrypt/boulder/test"
"github.com/letsencrypt/boulder/test/vars"
"github.com/prometheus/client_golang/prometheus"
"golang.org/x/crypto/ocsp"
"google.golang.org/grpc"
"google.golang.org/protobuf/types/known/emptypb"
@ -53,8 +54,8 @@ var (
}`
)
// initSA constructs a SQLStorageAuthority and a clean up function
// that should be defer'ed to the end of the test.
// initSA constructs a SQLStorageAuthority and a clean up function that should
// be defer'ed to the end of the test.
func initSA(t *testing.T) (*SQLStorageAuthority, clock.FakeClock, func()) {
t.Helper()
features.Reset()
@ -72,7 +73,7 @@ func initSA(t *testing.T) (*SQLStorageAuthority, clock.FakeClock, func()) {
fc := clock.NewFake()
fc.Set(time.Date(2015, 3, 4, 5, 0, 0, 0, time.UTC))
saro, err := NewSQLStorageAuthorityRO(dbMap, dbIncidentsMap, 1, 0, fc, log)
saro, err := NewSQLStorageAuthorityRO(dbMap, dbIncidentsMap, metrics.NoopRegisterer, 1, 0, fc, log)
if err != nil {
t.Fatalf("Failed to create SA: %s", err)
}
@ -240,10 +241,14 @@ func TestReplicationLagRetries(t *testing.T) {
_, err := sa.GetRegistration(ctx, &sapb.RegistrationID{Id: reg.Id})
test.AssertNotError(t, err, "selecting extant registration")
test.AssertEquals(t, clk.Now(), start)
test.AssertMetricWithLabelsEquals(t, sa.lagFactorCounter, prometheus.Labels{"method": "GetRegistration", "result": "notfound"}, 0)
_, err = sa.GetRegistration(ctx, &sapb.RegistrationID{Id: reg.Id + 1})
test.AssertError(t, err, "selecting nonexistent registration")
test.AssertEquals(t, clk.Now(), start)
// With lagFactor disabled, we should never enter the retry codepath, as a
// result the metric should not increment.
test.AssertMetricWithLabelsEquals(t, sa.lagFactorCounter, prometheus.Labels{"method": "GetRegistration", "result": "notfound"}, 0)
// Now, set the lagFactor to 1. Trying to select a nonexistent registration
// should cause the clock to advance when GetRegistration sleeps and retries.
@ -253,10 +258,15 @@ func TestReplicationLagRetries(t *testing.T) {
_, err = sa.GetRegistration(ctx, &sapb.RegistrationID{Id: reg.Id})
test.AssertNotError(t, err, "selecting extant registration")
test.AssertEquals(t, clk.Now(), start)
// lagFactor is enabled, but the registration exists.
test.AssertMetricWithLabelsEquals(t, sa.lagFactorCounter, prometheus.Labels{"method": "GetRegistration", "result": "notfound"}, 0)
_, err = sa.GetRegistration(ctx, &sapb.RegistrationID{Id: reg.Id + 1})
test.AssertError(t, err, "selecting nonexistent registration")
test.AssertEquals(t, clk.Now(), start.Add(1))
// With lagFactor enabled, we should enter the retry codepath and as a result
// the metric should increment.
test.AssertMetricWithLabelsEquals(t, sa.lagFactorCounter, prometheus.Labels{"method": "GetRegistration", "result": "notfound"}, 1)
}
func TestAddCertificate(t *testing.T) {

View File

@ -12,6 +12,7 @@ import (
"time"
"github.com/jmhodges/clock"
"github.com/prometheus/client_golang/prometheus"
"google.golang.org/protobuf/types/known/emptypb"
"google.golang.org/protobuf/types/known/timestamppb"
jose "gopkg.in/go-jose/go-jose.v2"
@ -59,6 +60,13 @@ type SQLStorageAuthorityRO struct {
clk clock.Clock
log blog.Logger
// lagFactorCounter is a Prometheus counter that tracks the number of times
// we've retried a query inside of GetRegistration, GetOrder, and
// GetAuthorization2 due to replication lag. It is labeled by method name
// and whether data from the retry attempt was found, notfound, or some
// other error was encountered.
lagFactorCounter *prometheus.CounterVec
}
// NewSQLStorageAuthorityRO provides persistence using a SQL backend for
@ -66,11 +74,18 @@ type SQLStorageAuthorityRO struct {
func NewSQLStorageAuthorityRO(
dbReadOnlyMap *db.WrappedMap,
dbIncidentsMap *db.WrappedMap,
stats prometheus.Registerer,
parallelismPerRPC int,
lagFactor time.Duration,
clk clock.Clock,
logger blog.Logger,
) (*SQLStorageAuthorityRO, error) {
lagFactorCounter := prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "sa_lag_factor",
Help: "A counter of SA lagFactor checks labelled by method and pass/fail",
}, []string{"method", "result"})
stats.MustRegister(lagFactorCounter)
ssaro := &SQLStorageAuthorityRO{
dbReadOnlyMap: dbReadOnlyMap,
dbIncidentsMap: dbIncidentsMap,
@ -78,6 +93,7 @@ func NewSQLStorageAuthorityRO(
lagFactor: lagFactor,
clk: clk,
log: logger,
lagFactorCounter: lagFactorCounter,
}
ssaro.countCertificatesByName = ssaro.countCertificates
@ -99,6 +115,15 @@ func (ssa *SQLStorageAuthorityRO) GetRegistration(ctx context.Context, req *sapb
// replica yet. If we get a NoRows, wait a little bit and retry, once.
ssa.clk.Sleep(ssa.lagFactor)
model, err = selectRegistration(ssa.dbReadOnlyMap.WithContext(ctx), query, req.Id)
if err != nil {
if db.IsNoRows(err) {
ssa.lagFactorCounter.WithLabelValues("GetRegistration", "notfound").Inc()
} else {
ssa.lagFactorCounter.WithLabelValues("GetRegistration", "other").Inc()
}
} else {
ssa.lagFactorCounter.WithLabelValues("GetRegistration", "found").Inc()
}
}
if err != nil {
if db.IsNoRows(err) {
@ -648,6 +673,7 @@ func (ssa *SQLStorageAuthorityRO) GetOrder(ctx context.Context, req *sapb.OrderR
return order, nil
}
output, err := db.WithTransaction(ctx, ssa.dbReadOnlyMap, txn)
if (db.IsNoRows(err) || errors.Is(err, berrors.NotFound)) && ssa.lagFactor != 0 {
// GetOrder is often called shortly after a new order is created, sometimes
@ -655,6 +681,15 @@ func (ssa *SQLStorageAuthorityRO) GetOrder(ctx context.Context, req *sapb.OrderR
// replica yet. If we get a NoRows, wait a little bit and retry, once.
ssa.clk.Sleep(ssa.lagFactor)
output, err = db.WithTransaction(ctx, ssa.dbReadOnlyMap, txn)
if err != nil {
if db.IsNoRows(err) || errors.Is(err, berrors.NotFound) {
ssa.lagFactorCounter.WithLabelValues("GetOrder", "notfound").Inc()
} else {
ssa.lagFactorCounter.WithLabelValues("GetOrder", "other").Inc()
}
} else {
ssa.lagFactorCounter.WithLabelValues("GetOrder", "found").Inc()
}
}
if err != nil {
return nil, err
@ -751,6 +786,15 @@ func (ssa *SQLStorageAuthorityRO) GetAuthorization2(ctx context.Context, req *sa
// read replica yet. If we get a NoRows, wait a little bit and retry, once.
ssa.clk.Sleep(ssa.lagFactor)
obj, err = ssa.dbReadOnlyMap.Get(authzModel{}, req.Id)
if err != nil {
if db.IsNoRows(err) {
ssa.lagFactorCounter.WithLabelValues("GetAuthorization2", "notfound").Inc()
} else {
ssa.lagFactorCounter.WithLabelValues("GetAuthorization2", "other").Inc()
}
} else {
ssa.lagFactorCounter.WithLabelValues("GetAuthorization2", "found").Inc()
}
}
if err != nil {
return nil, err