SARO: Add sa_lag_factor metric to assess usage of the lagFactor codepath (#6774)

Add `sa_lag_retry` prometheus countervec metric with pass/fail
dimensions for `GetOrder`, `GetAuthorization2`, and `GetRegistration`
methods.

The new metrics will appear as follows:
```
sa_lag_retry{method="GetOrder",result="found"} 0
sa_lag_retry{method="GetOrder",result="notfound"} 0
sa_lag_retry{method="GetOrder",result="other"} 0
sa_lag_retry{method="GetAuthorization2",result="found"} 0
sa_lag_retry{method="GetAuthorization2",result="notfound"} 0
sa_lag_retry{method="GetAuthorization2",result="other"} 0
sa_lag_retry{method="GetRegistration",result="found"} 0
sa_lag_retry{method="GetRegistration",result="notfound"} 0
sa_lag_retry{method="GetRegistration",result="other"} 0
```

Fixes https://github.com/letsencrypt/boulder/issues/6773

---------

Co-authored-by: Samantha <hello@entropy.cat>
This commit is contained in:
Phil Porada 2023-03-30 13:48:16 -04:00 committed by GitHub
parent 3850ab3bd8
commit ce2ee69c5f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 59 additions and 5 deletions

View File

@ -87,7 +87,7 @@ func main() {
cmd.FailOnError(err, "TLS config") cmd.FailOnError(err, "TLS config")
saroi, err := sa.NewSQLStorageAuthorityRO( saroi, err := sa.NewSQLStorageAuthorityRO(
dbReadOnlyMap, dbIncidentsMap, parallel, c.SA.LagFactor.Duration, clk, logger) dbReadOnlyMap, dbIncidentsMap, scope, parallel, c.SA.LagFactor.Duration, clk, logger)
cmd.FailOnError(err, "Failed to create read-only SA impl") cmd.FailOnError(err, "Failed to create read-only SA impl")
sai, err := sa.NewSQLStorageAuthorityWrapping(saroi, dbMap, scope) sai, err := sa.NewSQLStorageAuthorityWrapping(saroi, dbMap, scope)

View File

@ -87,7 +87,7 @@ func NewSQLStorageAuthority(
stats prometheus.Registerer, stats prometheus.Registerer,
) (*SQLStorageAuthority, error) { ) (*SQLStorageAuthority, error) {
ssaro, err := NewSQLStorageAuthorityRO( ssaro, err := NewSQLStorageAuthorityRO(
dbReadOnlyMap, dbIncidentsMap, parallelismPerRPC, lagFactor, clk, logger) dbReadOnlyMap, dbIncidentsMap, stats, parallelismPerRPC, lagFactor, clk, logger)
if err != nil { if err != nil {
return nil, err return nil, err
} }

View File

@ -36,6 +36,7 @@ import (
sapb "github.com/letsencrypt/boulder/sa/proto" sapb "github.com/letsencrypt/boulder/sa/proto"
"github.com/letsencrypt/boulder/test" "github.com/letsencrypt/boulder/test"
"github.com/letsencrypt/boulder/test/vars" "github.com/letsencrypt/boulder/test/vars"
"github.com/prometheus/client_golang/prometheus"
"golang.org/x/crypto/ocsp" "golang.org/x/crypto/ocsp"
"google.golang.org/grpc" "google.golang.org/grpc"
"google.golang.org/protobuf/types/known/emptypb" "google.golang.org/protobuf/types/known/emptypb"
@ -53,8 +54,8 @@ var (
}` }`
) )
// initSA constructs a SQLStorageAuthority and a clean up function // initSA constructs a SQLStorageAuthority and a clean up function that should
// that should be defer'ed to the end of the test. // be defer'ed to the end of the test.
func initSA(t *testing.T) (*SQLStorageAuthority, clock.FakeClock, func()) { func initSA(t *testing.T) (*SQLStorageAuthority, clock.FakeClock, func()) {
t.Helper() t.Helper()
features.Reset() features.Reset()
@ -72,7 +73,7 @@ func initSA(t *testing.T) (*SQLStorageAuthority, clock.FakeClock, func()) {
fc := clock.NewFake() fc := clock.NewFake()
fc.Set(time.Date(2015, 3, 4, 5, 0, 0, 0, time.UTC)) fc.Set(time.Date(2015, 3, 4, 5, 0, 0, 0, time.UTC))
saro, err := NewSQLStorageAuthorityRO(dbMap, dbIncidentsMap, 1, 0, fc, log) saro, err := NewSQLStorageAuthorityRO(dbMap, dbIncidentsMap, metrics.NoopRegisterer, 1, 0, fc, log)
if err != nil { if err != nil {
t.Fatalf("Failed to create SA: %s", err) t.Fatalf("Failed to create SA: %s", err)
} }
@ -240,10 +241,14 @@ func TestReplicationLagRetries(t *testing.T) {
_, err := sa.GetRegistration(ctx, &sapb.RegistrationID{Id: reg.Id}) _, err := sa.GetRegistration(ctx, &sapb.RegistrationID{Id: reg.Id})
test.AssertNotError(t, err, "selecting extant registration") test.AssertNotError(t, err, "selecting extant registration")
test.AssertEquals(t, clk.Now(), start) test.AssertEquals(t, clk.Now(), start)
test.AssertMetricWithLabelsEquals(t, sa.lagFactorCounter, prometheus.Labels{"method": "GetRegistration", "result": "notfound"}, 0)
_, err = sa.GetRegistration(ctx, &sapb.RegistrationID{Id: reg.Id + 1}) _, err = sa.GetRegistration(ctx, &sapb.RegistrationID{Id: reg.Id + 1})
test.AssertError(t, err, "selecting nonexistent registration") test.AssertError(t, err, "selecting nonexistent registration")
test.AssertEquals(t, clk.Now(), start) test.AssertEquals(t, clk.Now(), start)
// With lagFactor disabled, we should never enter the retry codepath, as a
// result the metric should not increment.
test.AssertMetricWithLabelsEquals(t, sa.lagFactorCounter, prometheus.Labels{"method": "GetRegistration", "result": "notfound"}, 0)
// Now, set the lagFactor to 1. Trying to select a nonexistent registration // Now, set the lagFactor to 1. Trying to select a nonexistent registration
// should cause the clock to advance when GetRegistration sleeps and retries. // should cause the clock to advance when GetRegistration sleeps and retries.
@ -253,10 +258,15 @@ func TestReplicationLagRetries(t *testing.T) {
_, err = sa.GetRegistration(ctx, &sapb.RegistrationID{Id: reg.Id}) _, err = sa.GetRegistration(ctx, &sapb.RegistrationID{Id: reg.Id})
test.AssertNotError(t, err, "selecting extant registration") test.AssertNotError(t, err, "selecting extant registration")
test.AssertEquals(t, clk.Now(), start) test.AssertEquals(t, clk.Now(), start)
// lagFactor is enabled, but the registration exists.
test.AssertMetricWithLabelsEquals(t, sa.lagFactorCounter, prometheus.Labels{"method": "GetRegistration", "result": "notfound"}, 0)
_, err = sa.GetRegistration(ctx, &sapb.RegistrationID{Id: reg.Id + 1}) _, err = sa.GetRegistration(ctx, &sapb.RegistrationID{Id: reg.Id + 1})
test.AssertError(t, err, "selecting nonexistent registration") test.AssertError(t, err, "selecting nonexistent registration")
test.AssertEquals(t, clk.Now(), start.Add(1)) test.AssertEquals(t, clk.Now(), start.Add(1))
// With lagFactor enabled, we should enter the retry codepath and as a result
// the metric should increment.
test.AssertMetricWithLabelsEquals(t, sa.lagFactorCounter, prometheus.Labels{"method": "GetRegistration", "result": "notfound"}, 1)
} }
func TestAddCertificate(t *testing.T) { func TestAddCertificate(t *testing.T) {

View File

@ -12,6 +12,7 @@ import (
"time" "time"
"github.com/jmhodges/clock" "github.com/jmhodges/clock"
"github.com/prometheus/client_golang/prometheus"
"google.golang.org/protobuf/types/known/emptypb" "google.golang.org/protobuf/types/known/emptypb"
"google.golang.org/protobuf/types/known/timestamppb" "google.golang.org/protobuf/types/known/timestamppb"
jose "gopkg.in/go-jose/go-jose.v2" jose "gopkg.in/go-jose/go-jose.v2"
@ -59,6 +60,13 @@ type SQLStorageAuthorityRO struct {
clk clock.Clock clk clock.Clock
log blog.Logger log blog.Logger
// lagFactorCounter is a Prometheus counter that tracks the number of times
// we've retried a query inside of GetRegistration, GetOrder, and
// GetAuthorization2 due to replication lag. It is labeled by method name
// and whether data from the retry attempt was found, notfound, or some
// other error was encountered.
lagFactorCounter *prometheus.CounterVec
} }
// NewSQLStorageAuthorityRO provides persistence using a SQL backend for // NewSQLStorageAuthorityRO provides persistence using a SQL backend for
@ -66,11 +74,18 @@ type SQLStorageAuthorityRO struct {
func NewSQLStorageAuthorityRO( func NewSQLStorageAuthorityRO(
dbReadOnlyMap *db.WrappedMap, dbReadOnlyMap *db.WrappedMap,
dbIncidentsMap *db.WrappedMap, dbIncidentsMap *db.WrappedMap,
stats prometheus.Registerer,
parallelismPerRPC int, parallelismPerRPC int,
lagFactor time.Duration, lagFactor time.Duration,
clk clock.Clock, clk clock.Clock,
logger blog.Logger, logger blog.Logger,
) (*SQLStorageAuthorityRO, error) { ) (*SQLStorageAuthorityRO, error) {
lagFactorCounter := prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "sa_lag_factor",
Help: "A counter of SA lagFactor checks labelled by method and pass/fail",
}, []string{"method", "result"})
stats.MustRegister(lagFactorCounter)
ssaro := &SQLStorageAuthorityRO{ ssaro := &SQLStorageAuthorityRO{
dbReadOnlyMap: dbReadOnlyMap, dbReadOnlyMap: dbReadOnlyMap,
dbIncidentsMap: dbIncidentsMap, dbIncidentsMap: dbIncidentsMap,
@ -78,6 +93,7 @@ func NewSQLStorageAuthorityRO(
lagFactor: lagFactor, lagFactor: lagFactor,
clk: clk, clk: clk,
log: logger, log: logger,
lagFactorCounter: lagFactorCounter,
} }
ssaro.countCertificatesByName = ssaro.countCertificates ssaro.countCertificatesByName = ssaro.countCertificates
@ -99,6 +115,15 @@ func (ssa *SQLStorageAuthorityRO) GetRegistration(ctx context.Context, req *sapb
// replica yet. If we get a NoRows, wait a little bit and retry, once. // replica yet. If we get a NoRows, wait a little bit and retry, once.
ssa.clk.Sleep(ssa.lagFactor) ssa.clk.Sleep(ssa.lagFactor)
model, err = selectRegistration(ssa.dbReadOnlyMap.WithContext(ctx), query, req.Id) model, err = selectRegistration(ssa.dbReadOnlyMap.WithContext(ctx), query, req.Id)
if err != nil {
if db.IsNoRows(err) {
ssa.lagFactorCounter.WithLabelValues("GetRegistration", "notfound").Inc()
} else {
ssa.lagFactorCounter.WithLabelValues("GetRegistration", "other").Inc()
}
} else {
ssa.lagFactorCounter.WithLabelValues("GetRegistration", "found").Inc()
}
} }
if err != nil { if err != nil {
if db.IsNoRows(err) { if db.IsNoRows(err) {
@ -648,6 +673,7 @@ func (ssa *SQLStorageAuthorityRO) GetOrder(ctx context.Context, req *sapb.OrderR
return order, nil return order, nil
} }
output, err := db.WithTransaction(ctx, ssa.dbReadOnlyMap, txn) output, err := db.WithTransaction(ctx, ssa.dbReadOnlyMap, txn)
if (db.IsNoRows(err) || errors.Is(err, berrors.NotFound)) && ssa.lagFactor != 0 { if (db.IsNoRows(err) || errors.Is(err, berrors.NotFound)) && ssa.lagFactor != 0 {
// GetOrder is often called shortly after a new order is created, sometimes // GetOrder is often called shortly after a new order is created, sometimes
@ -655,6 +681,15 @@ func (ssa *SQLStorageAuthorityRO) GetOrder(ctx context.Context, req *sapb.OrderR
// replica yet. If we get a NoRows, wait a little bit and retry, once. // replica yet. If we get a NoRows, wait a little bit and retry, once.
ssa.clk.Sleep(ssa.lagFactor) ssa.clk.Sleep(ssa.lagFactor)
output, err = db.WithTransaction(ctx, ssa.dbReadOnlyMap, txn) output, err = db.WithTransaction(ctx, ssa.dbReadOnlyMap, txn)
if err != nil {
if db.IsNoRows(err) || errors.Is(err, berrors.NotFound) {
ssa.lagFactorCounter.WithLabelValues("GetOrder", "notfound").Inc()
} else {
ssa.lagFactorCounter.WithLabelValues("GetOrder", "other").Inc()
}
} else {
ssa.lagFactorCounter.WithLabelValues("GetOrder", "found").Inc()
}
} }
if err != nil { if err != nil {
return nil, err return nil, err
@ -751,6 +786,15 @@ func (ssa *SQLStorageAuthorityRO) GetAuthorization2(ctx context.Context, req *sa
// read replica yet. If we get a NoRows, wait a little bit and retry, once. // read replica yet. If we get a NoRows, wait a little bit and retry, once.
ssa.clk.Sleep(ssa.lagFactor) ssa.clk.Sleep(ssa.lagFactor)
obj, err = ssa.dbReadOnlyMap.Get(authzModel{}, req.Id) obj, err = ssa.dbReadOnlyMap.Get(authzModel{}, req.Id)
if err != nil {
if db.IsNoRows(err) {
ssa.lagFactorCounter.WithLabelValues("GetAuthorization2", "notfound").Inc()
} else {
ssa.lagFactorCounter.WithLabelValues("GetAuthorization2", "other").Inc()
}
} else {
ssa.lagFactorCounter.WithLabelValues("GetAuthorization2", "found").Inc()
}
} }
if err != nil { if err != nil {
return nil, err return nil, err