SARO: Add sa_lag_factor metric to assess usage of the lagFactor codepath (#6774)
Add `sa_lag_retry` prometheus countervec metric with pass/fail
dimensions for `GetOrder`, `GetAuthorization2`, and `GetRegistration`
methods.
The new metrics will appear as follows:
```
sa_lag_retry{method="GetOrder",result="found"} 0
sa_lag_retry{method="GetOrder",result="notfound"} 0
sa_lag_retry{method="GetOrder",result="other"} 0
sa_lag_retry{method="GetAuthorization2",result="found"} 0
sa_lag_retry{method="GetAuthorization2",result="notfound"} 0
sa_lag_retry{method="GetAuthorization2",result="other"} 0
sa_lag_retry{method="GetRegistration",result="found"} 0
sa_lag_retry{method="GetRegistration",result="notfound"} 0
sa_lag_retry{method="GetRegistration",result="other"} 0
```
Fixes https://github.com/letsencrypt/boulder/issues/6773
---------
Co-authored-by: Samantha <hello@entropy.cat>
This commit is contained in:
parent
3850ab3bd8
commit
ce2ee69c5f
|
|
@ -87,7 +87,7 @@ func main() {
|
|||
cmd.FailOnError(err, "TLS config")
|
||||
|
||||
saroi, err := sa.NewSQLStorageAuthorityRO(
|
||||
dbReadOnlyMap, dbIncidentsMap, parallel, c.SA.LagFactor.Duration, clk, logger)
|
||||
dbReadOnlyMap, dbIncidentsMap, scope, parallel, c.SA.LagFactor.Duration, clk, logger)
|
||||
cmd.FailOnError(err, "Failed to create read-only SA impl")
|
||||
|
||||
sai, err := sa.NewSQLStorageAuthorityWrapping(saroi, dbMap, scope)
|
||||
|
|
|
|||
2
sa/sa.go
2
sa/sa.go
|
|
@ -87,7 +87,7 @@ func NewSQLStorageAuthority(
|
|||
stats prometheus.Registerer,
|
||||
) (*SQLStorageAuthority, error) {
|
||||
ssaro, err := NewSQLStorageAuthorityRO(
|
||||
dbReadOnlyMap, dbIncidentsMap, parallelismPerRPC, lagFactor, clk, logger)
|
||||
dbReadOnlyMap, dbIncidentsMap, stats, parallelismPerRPC, lagFactor, clk, logger)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
|
|
|||
|
|
@ -36,6 +36,7 @@ import (
|
|||
sapb "github.com/letsencrypt/boulder/sa/proto"
|
||||
"github.com/letsencrypt/boulder/test"
|
||||
"github.com/letsencrypt/boulder/test/vars"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"golang.org/x/crypto/ocsp"
|
||||
"google.golang.org/grpc"
|
||||
"google.golang.org/protobuf/types/known/emptypb"
|
||||
|
|
@ -53,8 +54,8 @@ var (
|
|||
}`
|
||||
)
|
||||
|
||||
// initSA constructs a SQLStorageAuthority and a clean up function
|
||||
// that should be defer'ed to the end of the test.
|
||||
// initSA constructs a SQLStorageAuthority and a clean up function that should
|
||||
// be defer'ed to the end of the test.
|
||||
func initSA(t *testing.T) (*SQLStorageAuthority, clock.FakeClock, func()) {
|
||||
t.Helper()
|
||||
features.Reset()
|
||||
|
|
@ -72,7 +73,7 @@ func initSA(t *testing.T) (*SQLStorageAuthority, clock.FakeClock, func()) {
|
|||
fc := clock.NewFake()
|
||||
fc.Set(time.Date(2015, 3, 4, 5, 0, 0, 0, time.UTC))
|
||||
|
||||
saro, err := NewSQLStorageAuthorityRO(dbMap, dbIncidentsMap, 1, 0, fc, log)
|
||||
saro, err := NewSQLStorageAuthorityRO(dbMap, dbIncidentsMap, metrics.NoopRegisterer, 1, 0, fc, log)
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to create SA: %s", err)
|
||||
}
|
||||
|
|
@ -240,10 +241,14 @@ func TestReplicationLagRetries(t *testing.T) {
|
|||
_, err := sa.GetRegistration(ctx, &sapb.RegistrationID{Id: reg.Id})
|
||||
test.AssertNotError(t, err, "selecting extant registration")
|
||||
test.AssertEquals(t, clk.Now(), start)
|
||||
test.AssertMetricWithLabelsEquals(t, sa.lagFactorCounter, prometheus.Labels{"method": "GetRegistration", "result": "notfound"}, 0)
|
||||
|
||||
_, err = sa.GetRegistration(ctx, &sapb.RegistrationID{Id: reg.Id + 1})
|
||||
test.AssertError(t, err, "selecting nonexistent registration")
|
||||
test.AssertEquals(t, clk.Now(), start)
|
||||
// With lagFactor disabled, we should never enter the retry codepath, as a
|
||||
// result the metric should not increment.
|
||||
test.AssertMetricWithLabelsEquals(t, sa.lagFactorCounter, prometheus.Labels{"method": "GetRegistration", "result": "notfound"}, 0)
|
||||
|
||||
// Now, set the lagFactor to 1. Trying to select a nonexistent registration
|
||||
// should cause the clock to advance when GetRegistration sleeps and retries.
|
||||
|
|
@ -253,10 +258,15 @@ func TestReplicationLagRetries(t *testing.T) {
|
|||
_, err = sa.GetRegistration(ctx, &sapb.RegistrationID{Id: reg.Id})
|
||||
test.AssertNotError(t, err, "selecting extant registration")
|
||||
test.AssertEquals(t, clk.Now(), start)
|
||||
// lagFactor is enabled, but the registration exists.
|
||||
test.AssertMetricWithLabelsEquals(t, sa.lagFactorCounter, prometheus.Labels{"method": "GetRegistration", "result": "notfound"}, 0)
|
||||
|
||||
_, err = sa.GetRegistration(ctx, &sapb.RegistrationID{Id: reg.Id + 1})
|
||||
test.AssertError(t, err, "selecting nonexistent registration")
|
||||
test.AssertEquals(t, clk.Now(), start.Add(1))
|
||||
// With lagFactor enabled, we should enter the retry codepath and as a result
|
||||
// the metric should increment.
|
||||
test.AssertMetricWithLabelsEquals(t, sa.lagFactorCounter, prometheus.Labels{"method": "GetRegistration", "result": "notfound"}, 1)
|
||||
}
|
||||
|
||||
func TestAddCertificate(t *testing.T) {
|
||||
|
|
|
|||
44
sa/saro.go
44
sa/saro.go
|
|
@ -12,6 +12,7 @@ import (
|
|||
"time"
|
||||
|
||||
"github.com/jmhodges/clock"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"google.golang.org/protobuf/types/known/emptypb"
|
||||
"google.golang.org/protobuf/types/known/timestamppb"
|
||||
jose "gopkg.in/go-jose/go-jose.v2"
|
||||
|
|
@ -59,6 +60,13 @@ type SQLStorageAuthorityRO struct {
|
|||
|
||||
clk clock.Clock
|
||||
log blog.Logger
|
||||
|
||||
// lagFactorCounter is a Prometheus counter that tracks the number of times
|
||||
// we've retried a query inside of GetRegistration, GetOrder, and
|
||||
// GetAuthorization2 due to replication lag. It is labeled by method name
|
||||
// and whether data from the retry attempt was found, notfound, or some
|
||||
// other error was encountered.
|
||||
lagFactorCounter *prometheus.CounterVec
|
||||
}
|
||||
|
||||
// NewSQLStorageAuthorityRO provides persistence using a SQL backend for
|
||||
|
|
@ -66,11 +74,18 @@ type SQLStorageAuthorityRO struct {
|
|||
func NewSQLStorageAuthorityRO(
|
||||
dbReadOnlyMap *db.WrappedMap,
|
||||
dbIncidentsMap *db.WrappedMap,
|
||||
stats prometheus.Registerer,
|
||||
parallelismPerRPC int,
|
||||
lagFactor time.Duration,
|
||||
clk clock.Clock,
|
||||
logger blog.Logger,
|
||||
) (*SQLStorageAuthorityRO, error) {
|
||||
lagFactorCounter := prometheus.NewCounterVec(prometheus.CounterOpts{
|
||||
Name: "sa_lag_factor",
|
||||
Help: "A counter of SA lagFactor checks labelled by method and pass/fail",
|
||||
}, []string{"method", "result"})
|
||||
stats.MustRegister(lagFactorCounter)
|
||||
|
||||
ssaro := &SQLStorageAuthorityRO{
|
||||
dbReadOnlyMap: dbReadOnlyMap,
|
||||
dbIncidentsMap: dbIncidentsMap,
|
||||
|
|
@ -78,6 +93,7 @@ func NewSQLStorageAuthorityRO(
|
|||
lagFactor: lagFactor,
|
||||
clk: clk,
|
||||
log: logger,
|
||||
lagFactorCounter: lagFactorCounter,
|
||||
}
|
||||
|
||||
ssaro.countCertificatesByName = ssaro.countCertificates
|
||||
|
|
@ -99,6 +115,15 @@ func (ssa *SQLStorageAuthorityRO) GetRegistration(ctx context.Context, req *sapb
|
|||
// replica yet. If we get a NoRows, wait a little bit and retry, once.
|
||||
ssa.clk.Sleep(ssa.lagFactor)
|
||||
model, err = selectRegistration(ssa.dbReadOnlyMap.WithContext(ctx), query, req.Id)
|
||||
if err != nil {
|
||||
if db.IsNoRows(err) {
|
||||
ssa.lagFactorCounter.WithLabelValues("GetRegistration", "notfound").Inc()
|
||||
} else {
|
||||
ssa.lagFactorCounter.WithLabelValues("GetRegistration", "other").Inc()
|
||||
}
|
||||
} else {
|
||||
ssa.lagFactorCounter.WithLabelValues("GetRegistration", "found").Inc()
|
||||
}
|
||||
}
|
||||
if err != nil {
|
||||
if db.IsNoRows(err) {
|
||||
|
|
@ -648,6 +673,7 @@ func (ssa *SQLStorageAuthorityRO) GetOrder(ctx context.Context, req *sapb.OrderR
|
|||
|
||||
return order, nil
|
||||
}
|
||||
|
||||
output, err := db.WithTransaction(ctx, ssa.dbReadOnlyMap, txn)
|
||||
if (db.IsNoRows(err) || errors.Is(err, berrors.NotFound)) && ssa.lagFactor != 0 {
|
||||
// GetOrder is often called shortly after a new order is created, sometimes
|
||||
|
|
@ -655,6 +681,15 @@ func (ssa *SQLStorageAuthorityRO) GetOrder(ctx context.Context, req *sapb.OrderR
|
|||
// replica yet. If we get a NoRows, wait a little bit and retry, once.
|
||||
ssa.clk.Sleep(ssa.lagFactor)
|
||||
output, err = db.WithTransaction(ctx, ssa.dbReadOnlyMap, txn)
|
||||
if err != nil {
|
||||
if db.IsNoRows(err) || errors.Is(err, berrors.NotFound) {
|
||||
ssa.lagFactorCounter.WithLabelValues("GetOrder", "notfound").Inc()
|
||||
} else {
|
||||
ssa.lagFactorCounter.WithLabelValues("GetOrder", "other").Inc()
|
||||
}
|
||||
} else {
|
||||
ssa.lagFactorCounter.WithLabelValues("GetOrder", "found").Inc()
|
||||
}
|
||||
}
|
||||
if err != nil {
|
||||
return nil, err
|
||||
|
|
@ -751,6 +786,15 @@ func (ssa *SQLStorageAuthorityRO) GetAuthorization2(ctx context.Context, req *sa
|
|||
// read replica yet. If we get a NoRows, wait a little bit and retry, once.
|
||||
ssa.clk.Sleep(ssa.lagFactor)
|
||||
obj, err = ssa.dbReadOnlyMap.Get(authzModel{}, req.Id)
|
||||
if err != nil {
|
||||
if db.IsNoRows(err) {
|
||||
ssa.lagFactorCounter.WithLabelValues("GetAuthorization2", "notfound").Inc()
|
||||
} else {
|
||||
ssa.lagFactorCounter.WithLabelValues("GetAuthorization2", "other").Inc()
|
||||
}
|
||||
} else {
|
||||
ssa.lagFactorCounter.WithLabelValues("GetAuthorization2", "found").Inc()
|
||||
}
|
||||
}
|
||||
if err != nil {
|
||||
return nil, err
|
||||
|
|
|
|||
Loading…
Reference in New Issue