From a38ac96984f715fd06661f12a26e3574f5693a0d Mon Sep 17 00:00:00 2001 From: Connor Date: Mon, 9 May 2022 11:50:19 +0800 Subject: [PATCH] Add backoff for error recovery in progress (#485) * add backoff for error recovery in progress Signed-off-by: Connor1996 --- error/error.go | 2 ++ internal/locate/region_request.go | 12 ++++++++++++ internal/retry/backoff_test.go | 24 +++++++++++++++--------- internal/retry/config.go | 21 +++++++++++---------- metrics/shortcuts.go | 24 +++++++++++++----------- 5 files changed, 53 insertions(+), 30 deletions(-) diff --git a/error/error.go b/error/error.go index 96946700..eabe129f 100644 --- a/error/error.go +++ b/error/error.go @@ -86,6 +86,8 @@ var ( ErrRegionNotInitialized = errors.New("region not Initialized") // ErrTiKVDiskFull is the error when tikv server disk usage is full. ErrTiKVDiskFull = errors.New("tikv disk full") + // ErrRegionRecoveryInProgress is the error when region is recovering. + ErrRegionRecoveryInProgress = errors.New("region is being online unsafe recovered") // ErrUnknown is the unknow error. ErrUnknown = errors.New("unknow") // ErrResultUndetermined is the error when execution result is unknown. 
diff --git a/internal/locate/region_request.go b/internal/locate/region_request.go index 20ca2e60..1ea60df4 100644 --- a/internal/locate/region_request.go +++ b/internal/locate/region_request.go @@ -1348,6 +1348,8 @@ func regionErrorToLabel(e *errorpb.Error) string { return "region_not_initialized" } else if e.GetDiskFull() != nil { return "disk_full" + } else if e.GetRecoveryInProgress() != nil { + return "recovery_in_progress" } return "unknown" } @@ -1395,6 +1397,16 @@ func (s *RegionRequestSender) onRegionError(bo *retry.Backoffer, ctx *RPCContext return true, nil } + if regionErr.GetRecoveryInProgress() != nil { + s.regionCache.InvalidateCachedRegion(ctx.Region) + logutil.BgLogger().Debug("tikv reports `RecoveryInProgress`", zap.Stringer("ctx", ctx)) + err = bo.Backoff(retry.BoRegionRecoveryInProgress, errors.Errorf("region recovery in progress, ctx: %v", ctx)) + if err != nil { + return false, err + } + return false, nil + } + // This peer is removed from the region. Invalidate the region since it's too stale. 
if regionErr.GetRegionNotFound() != nil { s.regionCache.InvalidateCachedRegion(ctx.Region) diff --git a/internal/retry/backoff_test.go b/internal/retry/backoff_test.go index 1d96ee16..6a1eba6f 100644 --- a/internal/retry/backoff_test.go +++ b/internal/retry/backoff_test.go @@ -51,26 +51,32 @@ func TestBackoffWithMax(t *testing.T) { } func TestBackoffErrorType(t *testing.T) { - // the actual maxSleep is multiplied by weight, which is 400ms - b := NewBackofferWithVars(context.TODO(), 200, nil) + // the actual maxSleep is multiplied by weight, which is 500ms + b := NewBackofferWithVars(context.TODO(), 250, nil) err := b.Backoff(BoRegionMiss, errors.New("region miss")) // 2ms sleep assert.Nil(t, err) - // 300 ms sleep in total + // 300ms sleep at most in total for i := 0; i < 2; i++ { err = b.Backoff(BoMaxDataNotReady, errors.New("data not ready")) assert.Nil(t, err) } + // 100ms sleep at most in total + err = b.Backoff(BoRegionRecoveryInProgress, errors.New("recovery in progress")) + assert.Nil(t, err) + // sleep from ServerIsBusy is not counted err = b.Backoff(BoTiKVServerBusy, errors.New("server is busy")) assert.Nil(t, err) - // 126ms sleep in total - for i := 0; i < 6; i++ { + // wait until it exceeds max sleep + for i := 0; i < 10; i++ { err = b.Backoff(BoTxnNotFound, errors.New("txn not found")) - assert.Nil(t, err) + if err != nil { + // Next backoff should return error of backoff that sleeps for longest time. 
- err = b.Backoff(BoTxnNotFound, errors.New("tikv rpc")) - assert.ErrorIs(t, err, BoMaxDataNotReady.err) + assert.Fail(t, "should not be here") } func TestBackoffDeepCopy(t *testing.T) { diff --git a/internal/retry/config.go b/internal/retry/config.go index 69e5cc14..53e07b0c 100644 --- a/internal/retry/config.go +++ b/internal/retry/config.go @@ -114,16 +114,17 @@ var ( BoTxnLock = NewConfig("txnLock", &metrics.BackoffHistogramLock, NewBackoffFnCfg(100, 3000, EqualJitter), tikverr.ErrResolveLockTimeout) BoPDRPC = NewConfig("pdRPC", &metrics.BackoffHistogramPD, NewBackoffFnCfg(500, 3000, EqualJitter), tikverr.NewErrPDServerTimeout("")) // change base time to 2ms, because it may recover soon. - BoRegionMiss = NewConfig("regionMiss", &metrics.BackoffHistogramRegionMiss, NewBackoffFnCfg(2, 500, NoJitter), tikverr.ErrRegionUnavailable) - BoRegionScheduling = NewConfig("regionScheduling", &metrics.BackoffHistogramRegionScheduling, NewBackoffFnCfg(2, 500, NoJitter), tikverr.ErrRegionUnavailable) - BoTiKVServerBusy = NewConfig("tikvServerBusy", &metrics.BackoffHistogramServerBusy, NewBackoffFnCfg(2000, 10000, EqualJitter), tikverr.ErrTiKVServerBusy) - BoTiKVDiskFull = NewConfig("tikvDiskFull", &metrics.BackoffHistogramTiKVDiskFull, NewBackoffFnCfg(500, 5000, NoJitter), tikverr.ErrTiKVDiskFull) - BoTiFlashServerBusy = NewConfig("tiflashServerBusy", &metrics.BackoffHistogramServerBusy, NewBackoffFnCfg(2000, 10000, EqualJitter), tikverr.ErrTiFlashServerBusy) - BoTxnNotFound = NewConfig("txnNotFound", &metrics.BackoffHistogramEmpty, NewBackoffFnCfg(2, 500, NoJitter), tikverr.ErrResolveLockTimeout) - BoStaleCmd = NewConfig("staleCommand", &metrics.BackoffHistogramStaleCmd, NewBackoffFnCfg(2, 1000, NoJitter), tikverr.ErrTiKVStaleCommand) - BoMaxTsNotSynced = NewConfig("maxTsNotSynced", &metrics.BackoffHistogramEmpty, NewBackoffFnCfg(2, 500, NoJitter), tikverr.ErrTiKVMaxTimestampNotSynced) - BoMaxDataNotReady = NewConfig("dataNotReady", &metrics.BackoffHistogramDataNotReady, 
NewBackoffFnCfg(100, 2000, NoJitter), tikverr.ErrRegionDataNotReady) - BoMaxRegionNotInitialized = NewConfig("regionNotInitialized", &metrics.BackoffHistogramEmpty, NewBackoffFnCfg(2, 1000, NoJitter), tikverr.ErrRegionNotInitialized) + BoRegionMiss = NewConfig("regionMiss", &metrics.BackoffHistogramRegionMiss, NewBackoffFnCfg(2, 500, NoJitter), tikverr.ErrRegionUnavailable) + BoRegionScheduling = NewConfig("regionScheduling", &metrics.BackoffHistogramRegionScheduling, NewBackoffFnCfg(2, 500, NoJitter), tikverr.ErrRegionUnavailable) + BoTiKVServerBusy = NewConfig("tikvServerBusy", &metrics.BackoffHistogramServerBusy, NewBackoffFnCfg(2000, 10000, EqualJitter), tikverr.ErrTiKVServerBusy) + BoTiKVDiskFull = NewConfig("tikvDiskFull", &metrics.BackoffHistogramTiKVDiskFull, NewBackoffFnCfg(500, 5000, NoJitter), tikverr.ErrTiKVDiskFull) + BoRegionRecoveryInProgress = NewConfig("regionRecoveryInProgress", &metrics.BackoffHistogramRegionRecoveryInProgress, NewBackoffFnCfg(100, 10000, EqualJitter), tikverr.ErrRegionRecoveryInProgress) + BoTiFlashServerBusy = NewConfig("tiflashServerBusy", &metrics.BackoffHistogramServerBusy, NewBackoffFnCfg(2000, 10000, EqualJitter), tikverr.ErrTiFlashServerBusy) + BoTxnNotFound = NewConfig("txnNotFound", &metrics.BackoffHistogramEmpty, NewBackoffFnCfg(2, 500, NoJitter), tikverr.ErrResolveLockTimeout) + BoStaleCmd = NewConfig("staleCommand", &metrics.BackoffHistogramStaleCmd, NewBackoffFnCfg(2, 1000, NoJitter), tikverr.ErrTiKVStaleCommand) + BoMaxTsNotSynced = NewConfig("maxTsNotSynced", &metrics.BackoffHistogramEmpty, NewBackoffFnCfg(2, 500, NoJitter), tikverr.ErrTiKVMaxTimestampNotSynced) + BoMaxDataNotReady = NewConfig("dataNotReady", &metrics.BackoffHistogramDataNotReady, NewBackoffFnCfg(100, 2000, NoJitter), tikverr.ErrRegionDataNotReady) + BoMaxRegionNotInitialized = NewConfig("regionNotInitialized", &metrics.BackoffHistogramEmpty, NewBackoffFnCfg(2, 1000, NoJitter), tikverr.ErrRegionNotInitialized) // TxnLockFast's `base` load from 
vars.BackoffLockFast when create BackoffFn. BoTxnLockFast = NewConfig(txnLockFastName, &metrics.BackoffHistogramLockFast, NewBackoffFnCfg(2, 3000, EqualJitter), tikverr.ErrResolveLockTimeout) ) diff --git a/metrics/shortcuts.go b/metrics/shortcuts.go index ebdf7403..e54d1d52 100644 --- a/metrics/shortcuts.go +++ b/metrics/shortcuts.go @@ -54,17 +54,18 @@ var ( RawkvSizeHistogramWithKey prometheus.Observer RawkvSizeHistogramWithValue prometheus.Observer - BackoffHistogramRPC prometheus.Observer - BackoffHistogramLock prometheus.Observer - BackoffHistogramLockFast prometheus.Observer - BackoffHistogramPD prometheus.Observer - BackoffHistogramRegionMiss prometheus.Observer - BackoffHistogramRegionScheduling prometheus.Observer - BackoffHistogramServerBusy prometheus.Observer - BackoffHistogramTiKVDiskFull prometheus.Observer - BackoffHistogramStaleCmd prometheus.Observer - BackoffHistogramDataNotReady prometheus.Observer - BackoffHistogramEmpty prometheus.Observer + BackoffHistogramRPC prometheus.Observer + BackoffHistogramLock prometheus.Observer + BackoffHistogramLockFast prometheus.Observer + BackoffHistogramPD prometheus.Observer + BackoffHistogramRegionMiss prometheus.Observer + BackoffHistogramRegionScheduling prometheus.Observer + BackoffHistogramServerBusy prometheus.Observer + BackoffHistogramTiKVDiskFull prometheus.Observer + BackoffHistogramRegionRecoveryInProgress prometheus.Observer + BackoffHistogramStaleCmd prometheus.Observer + BackoffHistogramDataNotReady prometheus.Observer + BackoffHistogramEmpty prometheus.Observer TxnRegionsNumHistogramWithSnapshot prometheus.Observer TxnRegionsNumHistogramPrewrite prometheus.Observer @@ -155,6 +156,7 @@ func initShortcuts() { BackoffHistogramRegionScheduling = TiKVBackoffHistogram.WithLabelValues("regionScheduling") BackoffHistogramServerBusy = TiKVBackoffHistogram.WithLabelValues("serverBusy") BackoffHistogramTiKVDiskFull = TiKVBackoffHistogram.WithLabelValues("tikvDiskFull") + 
BackoffHistogramRegionRecoveryInProgress = TiKVBackoffHistogram.WithLabelValues("regionRecoveryInProgress") BackoffHistogramStaleCmd = TiKVBackoffHistogram.WithLabelValues("staleCommand") BackoffHistogramDataNotReady = TiKVBackoffHistogram.WithLabelValues("dataNotReady") BackoffHistogramEmpty = TiKVBackoffHistogram.WithLabelValues("")