SARO: Add sa_lag_factor metric to assess usage of the lagFactor codepath (#6774)
Add `sa_lag_retry` prometheus countervec metric with pass/fail
dimensions for `GetOrder`, `GetAuthorization2`, and `GetRegistration`
methods.
The new metrics will appear as follows:
```
sa_lag_retry{method="GetOrder",result="found"} 0
sa_lag_retry{method="GetOrder",result="notfound"} 0
sa_lag_retry{method="GetOrder",result="other"} 0
sa_lag_retry{method="GetAuthorization2",result="found"} 0
sa_lag_retry{method="GetAuthorization2",result="notfound"} 0
sa_lag_retry{method="GetAuthorization2",result="other"} 0
sa_lag_retry{method="GetRegistration",result="found"} 0
sa_lag_retry{method="GetRegistration",result="notfound"} 0
sa_lag_retry{method="GetRegistration",result="other"} 0
```
Fixes https://github.com/letsencrypt/boulder/issues/6773
---------
Co-authored-by: Samantha <hello@entropy.cat>
This commit is contained in:
parent
3850ab3bd8
commit
ce2ee69c5f
|
|
@ -87,7 +87,7 @@ func main() {
|
||||||
cmd.FailOnError(err, "TLS config")
|
cmd.FailOnError(err, "TLS config")
|
||||||
|
|
||||||
saroi, err := sa.NewSQLStorageAuthorityRO(
|
saroi, err := sa.NewSQLStorageAuthorityRO(
|
||||||
dbReadOnlyMap, dbIncidentsMap, parallel, c.SA.LagFactor.Duration, clk, logger)
|
dbReadOnlyMap, dbIncidentsMap, scope, parallel, c.SA.LagFactor.Duration, clk, logger)
|
||||||
cmd.FailOnError(err, "Failed to create read-only SA impl")
|
cmd.FailOnError(err, "Failed to create read-only SA impl")
|
||||||
|
|
||||||
sai, err := sa.NewSQLStorageAuthorityWrapping(saroi, dbMap, scope)
|
sai, err := sa.NewSQLStorageAuthorityWrapping(saroi, dbMap, scope)
|
||||||
|
|
|
||||||
2
sa/sa.go
2
sa/sa.go
|
|
@ -87,7 +87,7 @@ func NewSQLStorageAuthority(
|
||||||
stats prometheus.Registerer,
|
stats prometheus.Registerer,
|
||||||
) (*SQLStorageAuthority, error) {
|
) (*SQLStorageAuthority, error) {
|
||||||
ssaro, err := NewSQLStorageAuthorityRO(
|
ssaro, err := NewSQLStorageAuthorityRO(
|
||||||
dbReadOnlyMap, dbIncidentsMap, parallelismPerRPC, lagFactor, clk, logger)
|
dbReadOnlyMap, dbIncidentsMap, stats, parallelismPerRPC, lagFactor, clk, logger)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -36,6 +36,7 @@ import (
|
||||||
sapb "github.com/letsencrypt/boulder/sa/proto"
|
sapb "github.com/letsencrypt/boulder/sa/proto"
|
||||||
"github.com/letsencrypt/boulder/test"
|
"github.com/letsencrypt/boulder/test"
|
||||||
"github.com/letsencrypt/boulder/test/vars"
|
"github.com/letsencrypt/boulder/test/vars"
|
||||||
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
"golang.org/x/crypto/ocsp"
|
"golang.org/x/crypto/ocsp"
|
||||||
"google.golang.org/grpc"
|
"google.golang.org/grpc"
|
||||||
"google.golang.org/protobuf/types/known/emptypb"
|
"google.golang.org/protobuf/types/known/emptypb"
|
||||||
|
|
@ -53,8 +54,8 @@ var (
|
||||||
}`
|
}`
|
||||||
)
|
)
|
||||||
|
|
||||||
// initSA constructs a SQLStorageAuthority and a clean up function
|
// initSA constructs a SQLStorageAuthority and a clean up function that should
|
||||||
// that should be defer'ed to the end of the test.
|
// be defer'ed to the end of the test.
|
||||||
func initSA(t *testing.T) (*SQLStorageAuthority, clock.FakeClock, func()) {
|
func initSA(t *testing.T) (*SQLStorageAuthority, clock.FakeClock, func()) {
|
||||||
t.Helper()
|
t.Helper()
|
||||||
features.Reset()
|
features.Reset()
|
||||||
|
|
@ -72,7 +73,7 @@ func initSA(t *testing.T) (*SQLStorageAuthority, clock.FakeClock, func()) {
|
||||||
fc := clock.NewFake()
|
fc := clock.NewFake()
|
||||||
fc.Set(time.Date(2015, 3, 4, 5, 0, 0, 0, time.UTC))
|
fc.Set(time.Date(2015, 3, 4, 5, 0, 0, 0, time.UTC))
|
||||||
|
|
||||||
saro, err := NewSQLStorageAuthorityRO(dbMap, dbIncidentsMap, 1, 0, fc, log)
|
saro, err := NewSQLStorageAuthorityRO(dbMap, dbIncidentsMap, metrics.NoopRegisterer, 1, 0, fc, log)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatalf("Failed to create SA: %s", err)
|
t.Fatalf("Failed to create SA: %s", err)
|
||||||
}
|
}
|
||||||
|
|
@ -240,10 +241,14 @@ func TestReplicationLagRetries(t *testing.T) {
|
||||||
_, err := sa.GetRegistration(ctx, &sapb.RegistrationID{Id: reg.Id})
|
_, err := sa.GetRegistration(ctx, &sapb.RegistrationID{Id: reg.Id})
|
||||||
test.AssertNotError(t, err, "selecting extant registration")
|
test.AssertNotError(t, err, "selecting extant registration")
|
||||||
test.AssertEquals(t, clk.Now(), start)
|
test.AssertEquals(t, clk.Now(), start)
|
||||||
|
test.AssertMetricWithLabelsEquals(t, sa.lagFactorCounter, prometheus.Labels{"method": "GetRegistration", "result": "notfound"}, 0)
|
||||||
|
|
||||||
_, err = sa.GetRegistration(ctx, &sapb.RegistrationID{Id: reg.Id + 1})
|
_, err = sa.GetRegistration(ctx, &sapb.RegistrationID{Id: reg.Id + 1})
|
||||||
test.AssertError(t, err, "selecting nonexistent registration")
|
test.AssertError(t, err, "selecting nonexistent registration")
|
||||||
test.AssertEquals(t, clk.Now(), start)
|
test.AssertEquals(t, clk.Now(), start)
|
||||||
|
// With lagFactor disabled, we should never enter the retry codepath, as a
|
||||||
|
// result the metric should not increment.
|
||||||
|
test.AssertMetricWithLabelsEquals(t, sa.lagFactorCounter, prometheus.Labels{"method": "GetRegistration", "result": "notfound"}, 0)
|
||||||
|
|
||||||
// Now, set the lagFactor to 1. Trying to select a nonexistent registration
|
// Now, set the lagFactor to 1. Trying to select a nonexistent registration
|
||||||
// should cause the clock to advance when GetRegistration sleeps and retries.
|
// should cause the clock to advance when GetRegistration sleeps and retries.
|
||||||
|
|
@ -253,10 +258,15 @@ func TestReplicationLagRetries(t *testing.T) {
|
||||||
_, err = sa.GetRegistration(ctx, &sapb.RegistrationID{Id: reg.Id})
|
_, err = sa.GetRegistration(ctx, &sapb.RegistrationID{Id: reg.Id})
|
||||||
test.AssertNotError(t, err, "selecting extant registration")
|
test.AssertNotError(t, err, "selecting extant registration")
|
||||||
test.AssertEquals(t, clk.Now(), start)
|
test.AssertEquals(t, clk.Now(), start)
|
||||||
|
// lagFactor is enabled, but the registration exists.
|
||||||
|
test.AssertMetricWithLabelsEquals(t, sa.lagFactorCounter, prometheus.Labels{"method": "GetRegistration", "result": "notfound"}, 0)
|
||||||
|
|
||||||
_, err = sa.GetRegistration(ctx, &sapb.RegistrationID{Id: reg.Id + 1})
|
_, err = sa.GetRegistration(ctx, &sapb.RegistrationID{Id: reg.Id + 1})
|
||||||
test.AssertError(t, err, "selecting nonexistent registration")
|
test.AssertError(t, err, "selecting nonexistent registration")
|
||||||
test.AssertEquals(t, clk.Now(), start.Add(1))
|
test.AssertEquals(t, clk.Now(), start.Add(1))
|
||||||
|
// With lagFactor enabled, we should enter the retry codepath and as a result
|
||||||
|
// the metric should increment.
|
||||||
|
test.AssertMetricWithLabelsEquals(t, sa.lagFactorCounter, prometheus.Labels{"method": "GetRegistration", "result": "notfound"}, 1)
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestAddCertificate(t *testing.T) {
|
func TestAddCertificate(t *testing.T) {
|
||||||
|
|
|
||||||
44
sa/saro.go
44
sa/saro.go
|
|
@ -12,6 +12,7 @@ import (
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/jmhodges/clock"
|
"github.com/jmhodges/clock"
|
||||||
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
"google.golang.org/protobuf/types/known/emptypb"
|
"google.golang.org/protobuf/types/known/emptypb"
|
||||||
"google.golang.org/protobuf/types/known/timestamppb"
|
"google.golang.org/protobuf/types/known/timestamppb"
|
||||||
jose "gopkg.in/go-jose/go-jose.v2"
|
jose "gopkg.in/go-jose/go-jose.v2"
|
||||||
|
|
@ -59,6 +60,13 @@ type SQLStorageAuthorityRO struct {
|
||||||
|
|
||||||
clk clock.Clock
|
clk clock.Clock
|
||||||
log blog.Logger
|
log blog.Logger
|
||||||
|
|
||||||
|
// lagFactorCounter is a Prometheus counter that tracks the number of times
|
||||||
|
// we've retried a query inside of GetRegistration, GetOrder, and
|
||||||
|
// GetAuthorization2 due to replication lag. It is labeled by method name
|
||||||
|
// and whether data from the retry attempt was found, notfound, or some
|
||||||
|
// other error was encountered.
|
||||||
|
lagFactorCounter *prometheus.CounterVec
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewSQLStorageAuthorityRO provides persistence using a SQL backend for
|
// NewSQLStorageAuthorityRO provides persistence using a SQL backend for
|
||||||
|
|
@ -66,11 +74,18 @@ type SQLStorageAuthorityRO struct {
|
||||||
func NewSQLStorageAuthorityRO(
|
func NewSQLStorageAuthorityRO(
|
||||||
dbReadOnlyMap *db.WrappedMap,
|
dbReadOnlyMap *db.WrappedMap,
|
||||||
dbIncidentsMap *db.WrappedMap,
|
dbIncidentsMap *db.WrappedMap,
|
||||||
|
stats prometheus.Registerer,
|
||||||
parallelismPerRPC int,
|
parallelismPerRPC int,
|
||||||
lagFactor time.Duration,
|
lagFactor time.Duration,
|
||||||
clk clock.Clock,
|
clk clock.Clock,
|
||||||
logger blog.Logger,
|
logger blog.Logger,
|
||||||
) (*SQLStorageAuthorityRO, error) {
|
) (*SQLStorageAuthorityRO, error) {
|
||||||
|
lagFactorCounter := prometheus.NewCounterVec(prometheus.CounterOpts{
|
||||||
|
Name: "sa_lag_factor",
|
||||||
|
Help: "A counter of SA lagFactor checks labelled by method and pass/fail",
|
||||||
|
}, []string{"method", "result"})
|
||||||
|
stats.MustRegister(lagFactorCounter)
|
||||||
|
|
||||||
ssaro := &SQLStorageAuthorityRO{
|
ssaro := &SQLStorageAuthorityRO{
|
||||||
dbReadOnlyMap: dbReadOnlyMap,
|
dbReadOnlyMap: dbReadOnlyMap,
|
||||||
dbIncidentsMap: dbIncidentsMap,
|
dbIncidentsMap: dbIncidentsMap,
|
||||||
|
|
@ -78,6 +93,7 @@ func NewSQLStorageAuthorityRO(
|
||||||
lagFactor: lagFactor,
|
lagFactor: lagFactor,
|
||||||
clk: clk,
|
clk: clk,
|
||||||
log: logger,
|
log: logger,
|
||||||
|
lagFactorCounter: lagFactorCounter,
|
||||||
}
|
}
|
||||||
|
|
||||||
ssaro.countCertificatesByName = ssaro.countCertificates
|
ssaro.countCertificatesByName = ssaro.countCertificates
|
||||||
|
|
@ -99,6 +115,15 @@ func (ssa *SQLStorageAuthorityRO) GetRegistration(ctx context.Context, req *sapb
|
||||||
// replica yet. If we get a NoRows, wait a little bit and retry, once.
|
// replica yet. If we get a NoRows, wait a little bit and retry, once.
|
||||||
ssa.clk.Sleep(ssa.lagFactor)
|
ssa.clk.Sleep(ssa.lagFactor)
|
||||||
model, err = selectRegistration(ssa.dbReadOnlyMap.WithContext(ctx), query, req.Id)
|
model, err = selectRegistration(ssa.dbReadOnlyMap.WithContext(ctx), query, req.Id)
|
||||||
|
if err != nil {
|
||||||
|
if db.IsNoRows(err) {
|
||||||
|
ssa.lagFactorCounter.WithLabelValues("GetRegistration", "notfound").Inc()
|
||||||
|
} else {
|
||||||
|
ssa.lagFactorCounter.WithLabelValues("GetRegistration", "other").Inc()
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
ssa.lagFactorCounter.WithLabelValues("GetRegistration", "found").Inc()
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if err != nil {
|
if err != nil {
|
||||||
if db.IsNoRows(err) {
|
if db.IsNoRows(err) {
|
||||||
|
|
@ -648,6 +673,7 @@ func (ssa *SQLStorageAuthorityRO) GetOrder(ctx context.Context, req *sapb.OrderR
|
||||||
|
|
||||||
return order, nil
|
return order, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
output, err := db.WithTransaction(ctx, ssa.dbReadOnlyMap, txn)
|
output, err := db.WithTransaction(ctx, ssa.dbReadOnlyMap, txn)
|
||||||
if (db.IsNoRows(err) || errors.Is(err, berrors.NotFound)) && ssa.lagFactor != 0 {
|
if (db.IsNoRows(err) || errors.Is(err, berrors.NotFound)) && ssa.lagFactor != 0 {
|
||||||
// GetOrder is often called shortly after a new order is created, sometimes
|
// GetOrder is often called shortly after a new order is created, sometimes
|
||||||
|
|
@ -655,6 +681,15 @@ func (ssa *SQLStorageAuthorityRO) GetOrder(ctx context.Context, req *sapb.OrderR
|
||||||
// replica yet. If we get a NoRows, wait a little bit and retry, once.
|
// replica yet. If we get a NoRows, wait a little bit and retry, once.
|
||||||
ssa.clk.Sleep(ssa.lagFactor)
|
ssa.clk.Sleep(ssa.lagFactor)
|
||||||
output, err = db.WithTransaction(ctx, ssa.dbReadOnlyMap, txn)
|
output, err = db.WithTransaction(ctx, ssa.dbReadOnlyMap, txn)
|
||||||
|
if err != nil {
|
||||||
|
if db.IsNoRows(err) || errors.Is(err, berrors.NotFound) {
|
||||||
|
ssa.lagFactorCounter.WithLabelValues("GetOrder", "notfound").Inc()
|
||||||
|
} else {
|
||||||
|
ssa.lagFactorCounter.WithLabelValues("GetOrder", "other").Inc()
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
ssa.lagFactorCounter.WithLabelValues("GetOrder", "found").Inc()
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
|
|
@ -751,6 +786,15 @@ func (ssa *SQLStorageAuthorityRO) GetAuthorization2(ctx context.Context, req *sa
|
||||||
// read replica yet. If we get a NoRows, wait a little bit and retry, once.
|
// read replica yet. If we get a NoRows, wait a little bit and retry, once.
|
||||||
ssa.clk.Sleep(ssa.lagFactor)
|
ssa.clk.Sleep(ssa.lagFactor)
|
||||||
obj, err = ssa.dbReadOnlyMap.Get(authzModel{}, req.Id)
|
obj, err = ssa.dbReadOnlyMap.Get(authzModel{}, req.Id)
|
||||||
|
if err != nil {
|
||||||
|
if db.IsNoRows(err) {
|
||||||
|
ssa.lagFactorCounter.WithLabelValues("GetAuthorization2", "notfound").Inc()
|
||||||
|
} else {
|
||||||
|
ssa.lagFactorCounter.WithLabelValues("GetAuthorization2", "other").Inc()
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
ssa.lagFactorCounter.WithLabelValues("GetAuthorization2", "found").Inc()
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue