Add backoff for error recovery in progress (#485)

* add backoff for error recovery in progress

Signed-off-by: Connor1996 <zbk602423539@gmail.com>
This commit is contained in:
Connor 2022-05-09 11:50:19 +08:00 committed by GitHub
parent 3705989fa1
commit a38ac96984
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 53 additions and 30 deletions

View File

@ -86,6 +86,8 @@ var (
ErrRegionNotInitialized = errors.New("region not Initialized")
// ErrTiKVDiskFull is the error when tikv server disk usage is full.
ErrTiKVDiskFull = errors.New("tikv disk full")
// ErrRegionRecoveryInProgress is the error when region is recovering.
ErrRegionRecoveryInProgress = errors.New("region is being online unsafe recovered")
// ErrUnknown is the unknown error.
ErrUnknown = errors.New("unknow")
// ErrResultUndetermined is the error when execution result is unknown.

View File

@ -1348,6 +1348,8 @@ func regionErrorToLabel(e *errorpb.Error) string {
return "region_not_initialized"
} else if e.GetDiskFull() != nil {
return "disk_full"
} else if e.GetRecoveryInProgress() != nil {
return "recovery_in_progress"
}
return "unknown"
}
@ -1395,6 +1397,16 @@ func (s *RegionRequestSender) onRegionError(bo *retry.Backoffer, ctx *RPCContext
return true, nil
}
if regionErr.GetRecoveryInProgress() != nil {
s.regionCache.InvalidateCachedRegion(ctx.Region)
logutil.BgLogger().Debug("tikv reports `RecoveryInProgress`", zap.Stringer("ctx", ctx))
err = bo.Backoff(retry.BoRegionRecoveryInProgress, errors.Errorf("region recovery in progress, ctx: %v", ctx))
if err != nil {
return false, err
}
return false, nil
}
// This peer is removed from the region. Invalidate the region since it's too stale.
if regionErr.GetRegionNotFound() != nil {
s.regionCache.InvalidateCachedRegion(ctx.Region)

View File

@ -51,26 +51,32 @@ func TestBackoffWithMax(t *testing.T) {
}
func TestBackoffErrorType(t *testing.T) {
// the actual maxSleep is multiplied by weight, which is 400ms
b := NewBackofferWithVars(context.TODO(), 200, nil)
// the actual maxSleep is multiplied by weight, which is 500ms
b := NewBackofferWithVars(context.TODO(), 250, nil)
err := b.Backoff(BoRegionMiss, errors.New("region miss")) // 2ms sleep
assert.Nil(t, err)
// 300 ms sleep in total
// 300ms sleep at most in total
for i := 0; i < 2; i++ {
err = b.Backoff(BoMaxDataNotReady, errors.New("data not ready"))
assert.Nil(t, err)
}
// 100ms sleep at most in total
err = b.Backoff(BoRegionRecoveryInProgress, errors.New("recovery in progress"))
assert.Nil(t, err)
// sleep from ServerIsBusy is not counted
err = b.Backoff(BoTiKVServerBusy, errors.New("server is busy"))
assert.Nil(t, err)
// 126ms sleep in total
for i := 0; i < 6; i++ {
// wait until it exceeds max sleep
for i := 0; i < 10; i++ {
err = b.Backoff(BoTxnNotFound, errors.New("txn not found"))
assert.Nil(t, err)
}
if err != nil {
// Next backoff should return error of backoff that sleeps for longest time.
err = b.Backoff(BoTxnNotFound, errors.New("tikv rpc"))
assert.ErrorIs(t, err, BoMaxDataNotReady.err)
return
}
}
assert.Fail(t, "should not be here")
}
func TestBackoffDeepCopy(t *testing.T) {

View File

@ -118,6 +118,7 @@ var (
BoRegionScheduling = NewConfig("regionScheduling", &metrics.BackoffHistogramRegionScheduling, NewBackoffFnCfg(2, 500, NoJitter), tikverr.ErrRegionUnavailable)
BoTiKVServerBusy = NewConfig("tikvServerBusy", &metrics.BackoffHistogramServerBusy, NewBackoffFnCfg(2000, 10000, EqualJitter), tikverr.ErrTiKVServerBusy)
BoTiKVDiskFull = NewConfig("tikvDiskFull", &metrics.BackoffHistogramTiKVDiskFull, NewBackoffFnCfg(500, 5000, NoJitter), tikverr.ErrTiKVDiskFull)
BoRegionRecoveryInProgress = NewConfig("regionRecoveryInProgress", &metrics.BackoffHistogramRegionRecoveryInProgress, NewBackoffFnCfg(100, 10000, EqualJitter), tikverr.ErrRegionRecoveryInProgress)
BoTiFlashServerBusy = NewConfig("tiflashServerBusy", &metrics.BackoffHistogramServerBusy, NewBackoffFnCfg(2000, 10000, EqualJitter), tikverr.ErrTiFlashServerBusy)
BoTxnNotFound = NewConfig("txnNotFound", &metrics.BackoffHistogramEmpty, NewBackoffFnCfg(2, 500, NoJitter), tikverr.ErrResolveLockTimeout)
BoStaleCmd = NewConfig("staleCommand", &metrics.BackoffHistogramStaleCmd, NewBackoffFnCfg(2, 1000, NoJitter), tikverr.ErrTiKVStaleCommand)

View File

@ -62,6 +62,7 @@ var (
BackoffHistogramRegionScheduling prometheus.Observer
BackoffHistogramServerBusy prometheus.Observer
BackoffHistogramTiKVDiskFull prometheus.Observer
BackoffHistogramRegionRecoveryInProgress prometheus.Observer
BackoffHistogramStaleCmd prometheus.Observer
BackoffHistogramDataNotReady prometheus.Observer
BackoffHistogramEmpty prometheus.Observer
@ -155,6 +156,7 @@ func initShortcuts() {
BackoffHistogramRegionScheduling = TiKVBackoffHistogram.WithLabelValues("regionScheduling")
BackoffHistogramServerBusy = TiKVBackoffHistogram.WithLabelValues("serverBusy")
BackoffHistogramTiKVDiskFull = TiKVBackoffHistogram.WithLabelValues("tikvDiskFull")
BackoffHistogramRegionRecoveryInProgress = TiKVBackoffHistogram.WithLabelValues("regionRecoveryInProgress")
BackoffHistogramStaleCmd = TiKVBackoffHistogram.WithLabelValues("staleCommand")
BackoffHistogramDataNotReady = TiKVBackoffHistogram.WithLabelValues("dataNotReady")
BackoffHistogramEmpty = TiKVBackoffHistogram.WithLabelValues("")