From ce2ee69c5f5f5bf96004174b658e18ef41d7ecd7 Mon Sep 17 00:00:00 2001 From: Phil Porada Date: Thu, 30 Mar 2023 13:48:16 -0400 Subject: [PATCH] SARO: Add sa_lag_factor metric to assess usage of the lagFactor codepath (#6774) Add `sa_lag_retry` prometheus countervec metric with pass/fail dimensions for `GetOrder`, `GetAuthorization2`, and `GetRegistration` methods. The new metrics will appear as follows: ``` sa_lag_retry{method="GetOrder",result="found"} 0 sa_lag_retry{method="GetOrder",result="notfound"} 0 sa_lag_retry{method="GetOrder",result="other"} 0 sa_lag_retry{method="GetAuthorization2",result="found"} 0 sa_lag_retry{method="GetAuthorization2",result="notfound"} 0 sa_lag_retry{method="GetAuthorization2",result="other"} 0 sa_lag_retry{method="GetRegistration",result="found"} 0 sa_lag_retry{method="GetRegistration",result="notfound"} 0 sa_lag_retry{method="GetRegistration",result="other"} 0 ``` Fixes https://github.com/letsencrypt/boulder/issues/6773 --------- Co-authored-by: Samantha --- cmd/boulder-sa/main.go | 2 +- sa/sa.go | 2 +- sa/sa_test.go | 16 ++++++++++++--- sa/saro.go | 44 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 59 insertions(+), 5 deletions(-) diff --git a/cmd/boulder-sa/main.go b/cmd/boulder-sa/main.go index b70afa2ec..27dcbeaa8 100644 --- a/cmd/boulder-sa/main.go +++ b/cmd/boulder-sa/main.go @@ -87,7 +87,7 @@ func main() { cmd.FailOnError(err, "TLS config") saroi, err := sa.NewSQLStorageAuthorityRO( - dbReadOnlyMap, dbIncidentsMap, parallel, c.SA.LagFactor.Duration, clk, logger) + dbReadOnlyMap, dbIncidentsMap, scope, parallel, c.SA.LagFactor.Duration, clk, logger) cmd.FailOnError(err, "Failed to create read-only SA impl") sai, err := sa.NewSQLStorageAuthorityWrapping(saroi, dbMap, scope) diff --git a/sa/sa.go b/sa/sa.go index 85482ebdf..46fadec84 100644 --- a/sa/sa.go +++ b/sa/sa.go @@ -87,7 +87,7 @@ func NewSQLStorageAuthority( stats prometheus.Registerer, ) (*SQLStorageAuthority, error) { ssaro, err := NewSQLStorageAuthorityRO( - dbReadOnlyMap, dbIncidentsMap, parallelismPerRPC, lagFactor, clk, logger) + dbReadOnlyMap, dbIncidentsMap, stats, parallelismPerRPC, lagFactor, clk, logger) if err != nil { return nil, err } diff --git a/sa/sa_test.go b/sa/sa_test.go index a19ecfabd..8f10ef4e6 100644 --- a/sa/sa_test.go +++ b/sa/sa_test.go @@ -36,6 +36,7 @@ import ( sapb "github.com/letsencrypt/boulder/sa/proto" "github.com/letsencrypt/boulder/test" "github.com/letsencrypt/boulder/test/vars" + "github.com/prometheus/client_golang/prometheus" "golang.org/x/crypto/ocsp" "google.golang.org/grpc" "google.golang.org/protobuf/types/known/emptypb" @@ -53,8 +54,8 @@ var ( }` ) -// initSA constructs a SQLStorageAuthority and a clean up function -// that should be defer'ed to the end of the test. +// initSA constructs a SQLStorageAuthority and a clean up function that should +// be defer'ed to the end of the test. func initSA(t *testing.T) (*SQLStorageAuthority, clock.FakeClock, func()) { t.Helper() features.Reset() @@ -72,7 +73,7 @@ func initSA(t *testing.T) (*SQLStorageAuthority, clock.FakeClock, func()) { fc := clock.NewFake() fc.Set(time.Date(2015, 3, 4, 5, 0, 0, 0, time.UTC)) - saro, err := NewSQLStorageAuthorityRO(dbMap, dbIncidentsMap, 1, 0, fc, log) + saro, err := NewSQLStorageAuthorityRO(dbMap, dbIncidentsMap, metrics.NoopRegisterer, 1, 0, fc, log) if err != nil { t.Fatalf("Failed to create SA: %s", err) } @@ -240,10 +241,14 @@ func TestReplicationLagRetries(t *testing.T) { _, err := sa.GetRegistration(ctx, &sapb.RegistrationID{Id: reg.Id}) test.AssertNotError(t, err, "selecting extant registration") test.AssertEquals(t, clk.Now(), start) + test.AssertMetricWithLabelsEquals(t, sa.lagFactorCounter, prometheus.Labels{"method": "GetRegistration", "result": "notfound"}, 0) _, err = sa.GetRegistration(ctx, &sapb.RegistrationID{Id: reg.Id + 1}) test.AssertError(t, err, "selecting nonexistent registration") test.AssertEquals(t, clk.Now(), start) + // With lagFactor disabled, we should never enter the retry codepath, as a + // result the metric should not increment. + test.AssertMetricWithLabelsEquals(t, sa.lagFactorCounter, prometheus.Labels{"method": "GetRegistration", "result": "notfound"}, 0) // Now, set the lagFactor to 1. Trying to select a nonexistent registration // should cause the clock to advance when GetRegistration sleeps and retries. @@ -253,10 +258,15 @@ func TestReplicationLagRetries(t *testing.T) { _, err = sa.GetRegistration(ctx, &sapb.RegistrationID{Id: reg.Id}) test.AssertNotError(t, err, "selecting extant registration") test.AssertEquals(t, clk.Now(), start) + // lagFactor is enabled, but the registration exists. + test.AssertMetricWithLabelsEquals(t, sa.lagFactorCounter, prometheus.Labels{"method": "GetRegistration", "result": "notfound"}, 0) _, err = sa.GetRegistration(ctx, &sapb.RegistrationID{Id: reg.Id + 1}) test.AssertError(t, err, "selecting nonexistent registration") test.AssertEquals(t, clk.Now(), start.Add(1)) + // With lagFactor enabled, we should enter the retry codepath and as a result + // the metric should increment. + test.AssertMetricWithLabelsEquals(t, sa.lagFactorCounter, prometheus.Labels{"method": "GetRegistration", "result": "notfound"}, 1) } func TestAddCertificate(t *testing.T) { diff --git a/sa/saro.go b/sa/saro.go index 34f70b821..6783f68cc 100644 --- a/sa/saro.go +++ b/sa/saro.go @@ -12,6 +12,7 @@ import ( "time" "github.com/jmhodges/clock" + "github.com/prometheus/client_golang/prometheus" "google.golang.org/protobuf/types/known/emptypb" "google.golang.org/protobuf/types/known/timestamppb" jose "gopkg.in/go-jose/go-jose.v2" @@ -59,6 +60,13 @@ type SQLStorageAuthorityRO struct { clk clock.Clock log blog.Logger + + // lagFactorCounter is a Prometheus counter that tracks the number of times + // we've retried a query inside of GetRegistration, GetOrder, and + // GetAuthorization2 due to replication lag. It is labeled by method name + // and whether data from the retry attempt was found, notfound, or some + // other error was encountered. + lagFactorCounter *prometheus.CounterVec } // NewSQLStorageAuthorityRO provides persistence using a SQL backend for @@ -66,11 +74,18 @@ type SQLStorageAuthorityRO struct { func NewSQLStorageAuthorityRO( dbReadOnlyMap *db.WrappedMap, dbIncidentsMap *db.WrappedMap, + stats prometheus.Registerer, parallelismPerRPC int, lagFactor time.Duration, clk clock.Clock, logger blog.Logger, ) (*SQLStorageAuthorityRO, error) { + lagFactorCounter := prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "sa_lag_factor", + Help: "A counter of SA lagFactor checks labelled by method and pass/fail", + }, []string{"method", "result"}) + stats.MustRegister(lagFactorCounter) + ssaro := &SQLStorageAuthorityRO{ dbReadOnlyMap: dbReadOnlyMap, dbIncidentsMap: dbIncidentsMap, @@ -78,6 +93,7 @@ func NewSQLStorageAuthorityRO( lagFactor: lagFactor, clk: clk, log: logger, + lagFactorCounter: lagFactorCounter, } ssaro.countCertificatesByName = ssaro.countCertificates @@ -99,6 +115,15 @@ func (ssa *SQLStorageAuthorityRO) GetRegistration(ctx context.Context, req *sapb // replica yet. If we get a NoRows, wait a little bit and retry, once. ssa.clk.Sleep(ssa.lagFactor) model, err = selectRegistration(ssa.dbReadOnlyMap.WithContext(ctx), query, req.Id) + if err != nil { + if db.IsNoRows(err) { + ssa.lagFactorCounter.WithLabelValues("GetRegistration", "notfound").Inc() + } else { + ssa.lagFactorCounter.WithLabelValues("GetRegistration", "other").Inc() + } + } else { + ssa.lagFactorCounter.WithLabelValues("GetRegistration", "found").Inc() + } } if err != nil { if db.IsNoRows(err) { @@ -648,6 +673,7 @@ func (ssa *SQLStorageAuthorityRO) GetOrder(ctx context.Context, req *sapb.OrderR return order, nil } + output, err := db.WithTransaction(ctx, ssa.dbReadOnlyMap, txn) if (db.IsNoRows(err) || errors.Is(err, berrors.NotFound)) && ssa.lagFactor != 0 { // GetOrder is often called shortly after a new order is created, sometimes @@ -655,6 +681,15 @@ func (ssa *SQLStorageAuthorityRO) GetOrder(ctx context.Context, req *sapb.OrderR // replica yet. If we get a NoRows, wait a little bit and retry, once. ssa.clk.Sleep(ssa.lagFactor) output, err = db.WithTransaction(ctx, ssa.dbReadOnlyMap, txn) + if err != nil { + if db.IsNoRows(err) || errors.Is(err, berrors.NotFound) { + ssa.lagFactorCounter.WithLabelValues("GetOrder", "notfound").Inc() + } else { + ssa.lagFactorCounter.WithLabelValues("GetOrder", "other").Inc() + } + } else { + ssa.lagFactorCounter.WithLabelValues("GetOrder", "found").Inc() + } } if err != nil { return nil, err @@ -751,6 +786,15 @@ func (ssa *SQLStorageAuthorityRO) GetAuthorization2(ctx context.Context, req *sa // read replica yet. If we get a NoRows, wait a little bit and retry, once. ssa.clk.Sleep(ssa.lagFactor) obj, err = ssa.dbReadOnlyMap.Get(authzModel{}, req.Id) + if err != nil { + if db.IsNoRows(err) { + ssa.lagFactorCounter.WithLabelValues("GetAuthorization2", "notfound").Inc() + } else { + ssa.lagFactorCounter.WithLabelValues("GetAuthorization2", "other").Inc() + } + } else { + ssa.lagFactorCounter.WithLabelValues("GetAuthorization2", "found").Inc() + } } if err != nil { return nil, err