Add backoff for error recovery in progress (#485)

* add backoff for error recovery in progress

Signed-off-by: Connor1996 <zbk602423539@gmail.com>
This commit is contained in:
Connor 2022-05-09 11:50:19 +08:00 committed by GitHub
parent 3705989fa1
commit a38ac96984
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 53 additions and 30 deletions

View File

@ -86,6 +86,8 @@ var (
ErrRegionNotInitialized = errors.New("region not Initialized")
// ErrTiKVDiskFull is the error when tikv server disk usage is full.
ErrTiKVDiskFull = errors.New("tikv disk full")
// ErrRegionRecoveryInProgress is the error when region is recovering.
ErrRegionRecoveryInProgress = errors.New("region is being online unsafe recovered")
// ErrUnknown is the unknown error.
ErrUnknown = errors.New("unknow")
// ErrResultUndetermined is the error when execution result is unknown.

View File

@ -1348,6 +1348,8 @@ func regionErrorToLabel(e *errorpb.Error) string {
return "region_not_initialized"
} else if e.GetDiskFull() != nil {
return "disk_full"
} else if e.GetRecoveryInProgress() != nil {
return "recovery_in_progress"
}
return "unknown"
}
@ -1395,6 +1397,16 @@ func (s *RegionRequestSender) onRegionError(bo *retry.Backoffer, ctx *RPCContext
return true, nil
}
if regionErr.GetRecoveryInProgress() != nil {
s.regionCache.InvalidateCachedRegion(ctx.Region)
logutil.BgLogger().Debug("tikv reports `RecoveryInProgress`", zap.Stringer("ctx", ctx))
err = bo.Backoff(retry.BoRegionRecoveryInProgress, errors.Errorf("region recovery in progress, ctx: %v", ctx))
if err != nil {
return false, err
}
return false, nil
}
// This peer is removed from the region. Invalidate the region since it's too stale.
if regionErr.GetRegionNotFound() != nil {
s.regionCache.InvalidateCachedRegion(ctx.Region)

View File

@ -51,26 +51,32 @@ func TestBackoffWithMax(t *testing.T) {
}
func TestBackoffErrorType(t *testing.T) {
// the actual maxSleep is multiplied by weight, which is 400ms
b := NewBackofferWithVars(context.TODO(), 200, nil)
// the actual maxSleep is multiplied by weight, which is 480ms
b := NewBackofferWithVars(context.TODO(), 250, nil)
err := b.Backoff(BoRegionMiss, errors.New("region miss")) // 2ms sleep
assert.Nil(t, err)
// 300 ms sleep in total
// 300ms sleep at most in total
for i := 0; i < 2; i++ {
err = b.Backoff(BoMaxDataNotReady, errors.New("data not ready"))
assert.Nil(t, err)
}
// 100ms sleep at most in total
err = b.Backoff(BoRegionRecoveryInProgress, errors.New("recovery in progress"))
assert.Nil(t, err)
// sleep from ServerIsBusy is not counted
err = b.Backoff(BoTiKVServerBusy, errors.New("server is busy"))
assert.Nil(t, err)
// 126ms sleep in total
for i := 0; i < 6; i++ {
// wait until it exceeds max sleep
for i := 0; i < 10; i++ {
err = b.Backoff(BoTxnNotFound, errors.New("txn not found"))
assert.Nil(t, err)
if err != nil {
// Next backoff should return error of backoff that sleeps for longest time.
assert.ErrorIs(t, err, BoMaxDataNotReady.err)
return
}
}
// Next backoff should return error of backoff that sleeps for longest time.
err = b.Backoff(BoTxnNotFound, errors.New("tikv rpc"))
assert.ErrorIs(t, err, BoMaxDataNotReady.err)
assert.Fail(t, "should not be here")
}
func TestBackoffDeepCopy(t *testing.T) {

View File

@ -114,16 +114,17 @@ var (
BoTxnLock = NewConfig("txnLock", &metrics.BackoffHistogramLock, NewBackoffFnCfg(100, 3000, EqualJitter), tikverr.ErrResolveLockTimeout)
BoPDRPC = NewConfig("pdRPC", &metrics.BackoffHistogramPD, NewBackoffFnCfg(500, 3000, EqualJitter), tikverr.NewErrPDServerTimeout(""))
// change base time to 2ms, because it may recover soon.
BoRegionMiss = NewConfig("regionMiss", &metrics.BackoffHistogramRegionMiss, NewBackoffFnCfg(2, 500, NoJitter), tikverr.ErrRegionUnavailable)
BoRegionScheduling = NewConfig("regionScheduling", &metrics.BackoffHistogramRegionScheduling, NewBackoffFnCfg(2, 500, NoJitter), tikverr.ErrRegionUnavailable)
BoTiKVServerBusy = NewConfig("tikvServerBusy", &metrics.BackoffHistogramServerBusy, NewBackoffFnCfg(2000, 10000, EqualJitter), tikverr.ErrTiKVServerBusy)
BoTiKVDiskFull = NewConfig("tikvDiskFull", &metrics.BackoffHistogramTiKVDiskFull, NewBackoffFnCfg(500, 5000, NoJitter), tikverr.ErrTiKVDiskFull)
BoTiFlashServerBusy = NewConfig("tiflashServerBusy", &metrics.BackoffHistogramServerBusy, NewBackoffFnCfg(2000, 10000, EqualJitter), tikverr.ErrTiFlashServerBusy)
BoTxnNotFound = NewConfig("txnNotFound", &metrics.BackoffHistogramEmpty, NewBackoffFnCfg(2, 500, NoJitter), tikverr.ErrResolveLockTimeout)
BoStaleCmd = NewConfig("staleCommand", &metrics.BackoffHistogramStaleCmd, NewBackoffFnCfg(2, 1000, NoJitter), tikverr.ErrTiKVStaleCommand)
BoMaxTsNotSynced = NewConfig("maxTsNotSynced", &metrics.BackoffHistogramEmpty, NewBackoffFnCfg(2, 500, NoJitter), tikverr.ErrTiKVMaxTimestampNotSynced)
BoMaxDataNotReady = NewConfig("dataNotReady", &metrics.BackoffHistogramDataNotReady, NewBackoffFnCfg(100, 2000, NoJitter), tikverr.ErrRegionDataNotReady)
BoMaxRegionNotInitialized = NewConfig("regionNotInitialized", &metrics.BackoffHistogramEmpty, NewBackoffFnCfg(2, 1000, NoJitter), tikverr.ErrRegionNotInitialized)
BoRegionMiss = NewConfig("regionMiss", &metrics.BackoffHistogramRegionMiss, NewBackoffFnCfg(2, 500, NoJitter), tikverr.ErrRegionUnavailable)
BoRegionScheduling = NewConfig("regionScheduling", &metrics.BackoffHistogramRegionScheduling, NewBackoffFnCfg(2, 500, NoJitter), tikverr.ErrRegionUnavailable)
BoTiKVServerBusy = NewConfig("tikvServerBusy", &metrics.BackoffHistogramServerBusy, NewBackoffFnCfg(2000, 10000, EqualJitter), tikverr.ErrTiKVServerBusy)
BoTiKVDiskFull = NewConfig("tikvDiskFull", &metrics.BackoffHistogramTiKVDiskFull, NewBackoffFnCfg(500, 5000, NoJitter), tikverr.ErrTiKVDiskFull)
BoRegionRecoveryInProgress = NewConfig("regionRecoveryInProgress", &metrics.BackoffHistogramRegionRecoveryInProgress, NewBackoffFnCfg(100, 10000, EqualJitter), tikverr.ErrRegionRecoveryInProgress)
BoTiFlashServerBusy = NewConfig("tiflashServerBusy", &metrics.BackoffHistogramServerBusy, NewBackoffFnCfg(2000, 10000, EqualJitter), tikverr.ErrTiFlashServerBusy)
BoTxnNotFound = NewConfig("txnNotFound", &metrics.BackoffHistogramEmpty, NewBackoffFnCfg(2, 500, NoJitter), tikverr.ErrResolveLockTimeout)
BoStaleCmd = NewConfig("staleCommand", &metrics.BackoffHistogramStaleCmd, NewBackoffFnCfg(2, 1000, NoJitter), tikverr.ErrTiKVStaleCommand)
BoMaxTsNotSynced = NewConfig("maxTsNotSynced", &metrics.BackoffHistogramEmpty, NewBackoffFnCfg(2, 500, NoJitter), tikverr.ErrTiKVMaxTimestampNotSynced)
BoMaxDataNotReady = NewConfig("dataNotReady", &metrics.BackoffHistogramDataNotReady, NewBackoffFnCfg(100, 2000, NoJitter), tikverr.ErrRegionDataNotReady)
BoMaxRegionNotInitialized = NewConfig("regionNotInitialized", &metrics.BackoffHistogramEmpty, NewBackoffFnCfg(2, 1000, NoJitter), tikverr.ErrRegionNotInitialized)
// TxnLockFast's `base` is loaded from vars.BackoffLockFast when creating the BackoffFn.
BoTxnLockFast = NewConfig(txnLockFastName, &metrics.BackoffHistogramLockFast, NewBackoffFnCfg(2, 3000, EqualJitter), tikverr.ErrResolveLockTimeout)
)

View File

@ -54,17 +54,18 @@ var (
RawkvSizeHistogramWithKey prometheus.Observer
RawkvSizeHistogramWithValue prometheus.Observer
BackoffHistogramRPC prometheus.Observer
BackoffHistogramLock prometheus.Observer
BackoffHistogramLockFast prometheus.Observer
BackoffHistogramPD prometheus.Observer
BackoffHistogramRegionMiss prometheus.Observer
BackoffHistogramRegionScheduling prometheus.Observer
BackoffHistogramServerBusy prometheus.Observer
BackoffHistogramTiKVDiskFull prometheus.Observer
BackoffHistogramStaleCmd prometheus.Observer
BackoffHistogramDataNotReady prometheus.Observer
BackoffHistogramEmpty prometheus.Observer
BackoffHistogramRPC prometheus.Observer
BackoffHistogramLock prometheus.Observer
BackoffHistogramLockFast prometheus.Observer
BackoffHistogramPD prometheus.Observer
BackoffHistogramRegionMiss prometheus.Observer
BackoffHistogramRegionScheduling prometheus.Observer
BackoffHistogramServerBusy prometheus.Observer
BackoffHistogramTiKVDiskFull prometheus.Observer
BackoffHistogramRegionRecoveryInProgress prometheus.Observer
BackoffHistogramStaleCmd prometheus.Observer
BackoffHistogramDataNotReady prometheus.Observer
BackoffHistogramEmpty prometheus.Observer
TxnRegionsNumHistogramWithSnapshot prometheus.Observer
TxnRegionsNumHistogramPrewrite prometheus.Observer
@ -155,6 +156,7 @@ func initShortcuts() {
BackoffHistogramRegionScheduling = TiKVBackoffHistogram.WithLabelValues("regionScheduling")
BackoffHistogramServerBusy = TiKVBackoffHistogram.WithLabelValues("serverBusy")
BackoffHistogramTiKVDiskFull = TiKVBackoffHistogram.WithLabelValues("tikvDiskFull")
BackoffHistogramRegionRecoveryInProgress = TiKVBackoffHistogram.WithLabelValues("regionRecoveryInProgress")
BackoffHistogramStaleCmd = TiKVBackoffHistogram.WithLabelValues("staleCommand")
BackoffHistogramDataNotReady = TiKVBackoffHistogram.WithLabelValues("dataNotReady")
BackoffHistogramEmpty = TiKVBackoffHistogram.WithLabelValues("")