diff --git a/error/error.go b/error/error.go index 82a8c3d8..165130bd 100644 --- a/error/error.go +++ b/error/error.go @@ -93,6 +93,8 @@ var ( ErrRegionFlashbackInProgress = errors.New("region is in the flashback progress") // ErrRegionFlashbackNotPrepared is the error when a region is not prepared for the flashback first. ErrRegionFlashbackNotPrepared = errors.New("region is not prepared for the flashback") + // ErrIsWitness is the error when a request is send to a witness. + ErrIsWitness = errors.New("peer is witness") // ErrUnknown is the unknow error. ErrUnknown = errors.New("unknow") // ErrResultUndetermined is the error when execution result is unknown. diff --git a/go.mod b/go.mod index 4f1dc28d..1a92f094 100644 --- a/go.mod +++ b/go.mod @@ -13,7 +13,7 @@ require ( github.com/opentracing/opentracing-go v1.2.0 github.com/pingcap/failpoint v0.0.0-20210918120811-547c13e3eb00 github.com/pingcap/goleveldb v0.0.0-20191226122134-f82aafb29989 - github.com/pingcap/kvproto v0.0.0-20221227030452-22819f5b377a + github.com/pingcap/kvproto v0.0.0-20230104090009-7c5d757b6e12 github.com/pingcap/log v1.1.1-0.20221015072633-39906604fb81 github.com/pkg/errors v0.9.1 github.com/prometheus/client_golang v1.11.0 diff --git a/go.sum b/go.sum index def7dc23..eccd0409 100644 --- a/go.sum +++ b/go.sum @@ -155,8 +155,8 @@ github.com/pingcap/failpoint v0.0.0-20210918120811-547c13e3eb00/go.mod h1:4qGtCB github.com/pingcap/goleveldb v0.0.0-20191226122134-f82aafb29989 h1:surzm05a8C9dN8dIUmo4Be2+pMRb6f55i+UIYrluu2E= github.com/pingcap/goleveldb v0.0.0-20191226122134-f82aafb29989/go.mod h1:O17XtbryoCJhkKGbT62+L2OlrniwqiGLSqrmdHCMzZw= github.com/pingcap/kvproto v0.0.0-20221026112947-f8d61344b172/go.mod h1:OYtxs0786qojVTmkVeufx93xe+jUgm56GUYRIKnmaGI= -github.com/pingcap/kvproto v0.0.0-20221227030452-22819f5b377a h1:4TiR0nGKmLmu2kTC22jOhUIplmv4GGUrUD9D2DTMms0= -github.com/pingcap/kvproto v0.0.0-20221227030452-22819f5b377a/go.mod h1:OYtxs0786qojVTmkVeufx93xe+jUgm56GUYRIKnmaGI= +github.com/pingcap/kvproto v0.0.0-20230104090009-7c5d757b6e12 h1:ZgPcRwsrzcuYcgrsGB+2I9w0n6JIT+GEptLUZ1kWKZA= +github.com/pingcap/kvproto v0.0.0-20230104090009-7c5d757b6e12/go.mod h1:OYtxs0786qojVTmkVeufx93xe+jUgm56GUYRIKnmaGI= github.com/pingcap/log v1.1.1-0.20221015072633-39906604fb81 h1:URLoJ61DmmY++Sa/yyPEQHG2s/ZBeV1FbIswHEMrdoY= github.com/pingcap/log v1.1.1-0.20221015072633-39906604fb81/go.mod h1:DWQW5jICDR7UJh4HtxXSM20Churx4CQL0fwL/SoOSA4= github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= diff --git a/integration_tests/go.sum b/integration_tests/go.sum index b2f4737e..238470ba 100644 --- a/integration_tests/go.sum +++ b/integration_tests/go.sum @@ -344,13 +344,19 @@ github.com/pingcap/fn v0.0.0-20200306044125-d5540d389059 h1:Pe2LbxRmbTfAoKJ65bZL github.com/pingcap/goleveldb v0.0.0-20191226122134-f82aafb29989 h1:surzm05a8C9dN8dIUmo4Be2+pMRb6f55i+UIYrluu2E= github.com/pingcap/goleveldb v0.0.0-20191226122134-f82aafb29989/go.mod h1:O17XtbryoCJhkKGbT62+L2OlrniwqiGLSqrmdHCMzZw= github.com/pingcap/kvproto v0.0.0-20221026112947-f8d61344b172/go.mod h1:OYtxs0786qojVTmkVeufx93xe+jUgm56GUYRIKnmaGI= +github.com/pingcap/kvproto v0.0.0-20230104090009-7c5d757b6e12 h1:ZgPcRwsrzcuYcgrsGB+2I9w0n6JIT+GEptLUZ1kWKZA= +github.com/pingcap/kvproto v0.0.0-20230104090009-7c5d757b6e12/go.mod h1:OYtxs0786qojVTmkVeufx93xe+jUgm56GUYRIKnmaGI= github.com/pingcap/kvproto v0.0.0-20230105060948-64890fa4f6c1 h1:jw4NjEiCleRJPPpHM7K6l8OKzOjnZAj62eKteCAY6ro= github.com/pingcap/kvproto v0.0.0-20230105060948-64890fa4f6c1/go.mod h1:OYtxs0786qojVTmkVeufx93xe+jUgm56GUYRIKnmaGI= +github.com/pingcap/log v0.0.0-20200511115504-543df19646ad/go.mod h1:4rbK1p9ILyIfb6hU7OG2CiWSqMXnp3JMbiaVJ6mvoY8= github.com/pingcap/log v1.1.0/go.mod h1:DWQW5jICDR7UJh4HtxXSM20Churx4CQL0fwL/SoOSA4= +github.com/pingcap/log v1.1.1-0.20221015072633-39906604fb81 h1:URLoJ61DmmY++Sa/yyPEQHG2s/ZBeV1FbIswHEMrdoY= github.com/pingcap/log v1.1.1-0.20221015072633-39906604fb81/go.mod h1:DWQW5jICDR7UJh4HtxXSM20Churx4CQL0fwL/SoOSA4= github.com/pingcap/log v1.1.1-0.20221116035753-734d527bc87c h1:crhkw6DD+07Bg1wYhW5Piw+kYNKZqFQqfC2puUf6gMI= github.com/pingcap/log v1.1.1-0.20221116035753-734d527bc87c/go.mod h1:DWQW5jICDR7UJh4HtxXSM20Churx4CQL0fwL/SoOSA4= github.com/pingcap/sysutil v0.0.0-20220114020952-ea68d2dbf5b4 h1:HYbcxtnkN3s5tqrZ/z3eJS4j3Db8wMphEm1q10lY/TM= +github.com/pingcap/tidb v1.1.0-beta.0.20221101102559-97add26c8f84 h1:qAougMzGGYDQ00UfIVs+M0LBaQQrL8yXdmLQEqqR8HY= +github.com/pingcap/tidb v1.1.0-beta.0.20221101102559-97add26c8f84/go.mod h1:lhtdZBXqJGbk2/Fr8JnT9KhXxEKy8h0vrDwdna/ou3g= github.com/pingcap/tidb v1.1.0-beta.0.20230109054422-b477b1c94620 h1:153psGlYCgPaDYc3AtlDG5ePNPVE4T1c8iJGHFZInZw= github.com/pingcap/tidb v1.1.0-beta.0.20230109054422-b477b1c94620/go.mod h1:Rounl3rQhnhOBh8e7pxAr6BhnNJh0nGvomRCRE6W98U= github.com/pingcap/tidb/parser v0.0.0-20230109054422-b477b1c94620 h1:cEP+A7ylw8sGGOCHe8F8gto+vlkGQaYt09DHgHd2xmg= diff --git a/internal/locate/region_cache.go b/internal/locate/region_cache.go index a71d2748..0dfc686f 100644 --- a/internal/locate/region_cache.go +++ b/internal/locate/region_cache.go @@ -267,6 +267,14 @@ func newRegion(bo *retry.Backoffer, c *RegionCache, pdRegion *pd.Region) (*Regio if addr == "" { continue } + + // Since the witness is read-write prohibited, it does not make sense to send requests to it + // unless it is the leader. When it is the leader and transfer leader encounters a problem, + // the backoff timeout will be triggered, and the client can give a more accurate error message. + if p.IsWitness && !isSamePeer(p, leader) { + continue + } + if isSamePeer(p, leader) { leaderAccessIdx = AccessIndex(len(rs.accessIndex[tiKVOnly])) } diff --git a/internal/locate/region_cache_test.go b/internal/locate/region_cache_test.go index 3e92ac94..50cb6b66 100644 --- a/internal/locate/region_cache_test.go +++ b/internal/locate/region_cache_test.go @@ -1260,6 +1260,42 @@ func (s *testRegionCacheSuite) TestPeersLenChange() { s.cache.OnSendFail(retry.NewNoopBackoff(context.Background()), ctx, false, errors.New("send fail")) } +func (s *testRegionCacheSuite) TestPeersLenChangedByWitness() { + // 2 peers [peer1, peer2] and let peer2 become leader + loc, err := s.cache.LocateKey(s.bo, []byte("a")) + s.Nil(err) + s.cache.UpdateLeader(loc.Region, &metapb.Peer{Id: s.peer2, StoreId: s.store2}, 0) + + // current leader is peer2 in [peer1, peer2] + loc, err = s.cache.LocateKey(s.bo, []byte("a")) + s.Nil(err) + ctx, err := s.cache.GetTiKVRPCContext(s.bo, loc.Region, kv.ReplicaReadLeader, 0) + s.Nil(err) + s.Equal(ctx.Peer.StoreId, s.store2) + + // simulate peer1 become witness in kv heartbeat and loaded before response back. + cpMeta := &metapb.Region{ + Id: ctx.Meta.Id, + StartKey: ctx.Meta.StartKey, + EndKey: ctx.Meta.EndKey, + RegionEpoch: ctx.Meta.RegionEpoch, + Peers: make([]*metapb.Peer, len(ctx.Meta.Peers)), + } + copy(cpMeta.Peers, ctx.Meta.Peers) + for _, peer := range cpMeta.Peers { + if peer.Id == s.peer1 { + peer.IsWitness = true + } + } + cpRegion := &pd.Region{Meta: cpMeta} + region, err := newRegion(s.bo, s.cache, cpRegion) + s.Nil(err) + s.cache.insertRegionToCache(region) + + // OnSendFail should not panic + s.cache.OnSendFail(retry.NewNoopBackoff(context.Background()), ctx, false, errors.New("send fail")) +} + func createSampleRegion(startKey, endKey []byte) *Region { return &Region{ meta: &metapb.Region{ diff --git a/internal/locate/region_request.go b/internal/locate/region_request.go index 6b6f9b8f..58fe1389 100644 --- a/internal/locate/region_request.go +++ b/internal/locate/region_request.go @@ -1403,6 +1403,8 @@ func regionErrorToLabel(e *errorpb.Error) string { return "flashback_in_progress" } else if e.GetFlashbackNotPrepared() != nil { return "flashback_not_prepared" + } else if e.GetIsWitness() != nil { + return "peer_is_witness" } return "unknown" } @@ -1460,6 +1462,16 @@ func (s *RegionRequestSender) onRegionError(bo *retry.Backoffer, ctx *RPCContext return false, nil } + if regionErr.GetIsWitness() != nil { + s.regionCache.InvalidateCachedRegion(ctx.Region) + logutil.BgLogger().Debug("tikv reports `IsWitness`", zap.Stringer("ctx", ctx)) + err = bo.Backoff(retry.BoIsWitness, errors.Errorf("is witness, ctx: %v", ctx)) + if err != nil { + return false, err + } + return false, nil + } + // Since we expect that the workload should be stopped during the flashback progress, // if a request meets the FlashbackInProgress error, it should stop retrying immediately // to avoid unnecessary backoff and potential unexpected data status to the user. diff --git a/internal/retry/backoff_test.go b/internal/retry/backoff_test.go index a4c8dfe4..468306fb 100644 --- a/internal/retry/backoff_test.go +++ b/internal/retry/backoff_test.go @@ -51,8 +51,8 @@ func TestBackoffWithMax(t *testing.T) { } func TestBackoffErrorType(t *testing.T) { - // the actual maxSleep is multiplied by weight, which is 480ms - b := NewBackofferWithVars(context.TODO(), 210, nil) + // the actual maxSleep is multiplied by weight, which is 1600ms + b := NewBackofferWithVars(context.TODO(), 800, nil) err := b.Backoff(BoRegionMiss, errors.New("region miss")) // 2ms sleep assert.Nil(t, err) // 6ms sleep at most in total @@ -67,6 +67,9 @@ func TestBackoffErrorType(t *testing.T) { // sleep from ServerIsBusy is not counted err = b.Backoff(BoTiKVServerBusy, errors.New("server is busy")) assert.Nil(t, err) + // 1000ms sleep at most in total + err = b.Backoff(BoIsWitness, errors.New("peer is witness")) + assert.Nil(t, err) // wait it exceed max sleep for i := 0; i < 10; i++ { err = b.Backoff(BoTxnNotFound, errors.New("txn not found")) diff --git a/internal/retry/config.go b/internal/retry/config.go index 19632d9f..9c062cc7 100644 --- a/internal/retry/config.go +++ b/internal/retry/config.go @@ -125,6 +125,7 @@ var ( BoMaxTsNotSynced = NewConfig("maxTsNotSynced", &metrics.BackoffHistogramEmpty, NewBackoffFnCfg(2, 500, NoJitter), tikverr.ErrTiKVMaxTimestampNotSynced) BoMaxDataNotReady = NewConfig("dataNotReady", &metrics.BackoffHistogramDataNotReady, NewBackoffFnCfg(2, 2000, NoJitter), tikverr.ErrRegionDataNotReady) BoMaxRegionNotInitialized = NewConfig("regionNotInitialized", &metrics.BackoffHistogramEmpty, NewBackoffFnCfg(2, 1000, NoJitter), tikverr.ErrRegionNotInitialized) + BoIsWitness = NewConfig("isWitness", &metrics.BackoffHistogramIsWitness, NewBackoffFnCfg(1000, 10000, EqualJitter), tikverr.ErrIsWitness) // TxnLockFast's `base` load from vars.BackoffLockFast when create BackoffFn. BoTxnLockFast = NewConfig(txnLockFastName, &metrics.BackoffHistogramLockFast, NewBackoffFnCfg(2, 3000, EqualJitter), tikverr.ErrResolveLockTimeout) ) diff --git a/metrics/shortcuts.go b/metrics/shortcuts.go index 8ff83856..dacbd971 100644 --- a/metrics/shortcuts.go +++ b/metrics/shortcuts.go @@ -66,6 +66,7 @@ var ( BackoffHistogramRegionRecoveryInProgress prometheus.Observer BackoffHistogramStaleCmd prometheus.Observer BackoffHistogramDataNotReady prometheus.Observer + BackoffHistogramIsWitness prometheus.Observer BackoffHistogramEmpty prometheus.Observer TxnRegionsNumHistogramWithSnapshot prometheus.Observer @@ -166,6 +167,7 @@ func initShortcuts() { BackoffHistogramRegionRecoveryInProgress = TiKVBackoffHistogram.WithLabelValues("regionRecoveryInProgress") BackoffHistogramStaleCmd = TiKVBackoffHistogram.WithLabelValues("staleCommand") BackoffHistogramDataNotReady = TiKVBackoffHistogram.WithLabelValues("dataNotReady") + BackoffHistogramIsWitness = TiKVBackoffHistogram.WithLabelValues("isWitness") BackoffHistogramEmpty = TiKVBackoffHistogram.WithLabelValues("") TxnRegionsNumHistogramWithSnapshot = TiKVTxnRegionsNumHistogram.WithLabelValues("snapshot")