From 8d6a95f73d93c5b621f803a169f1313f5f2eef59 Mon Sep 17 00:00:00 2001 From: crazycs Date: Mon, 11 Mar 2024 15:40:44 +0800 Subject: [PATCH] replica selector refactor (#1142) * init Signed-off-by: crazycs520 * add ReplicaSelector interface Signed-off-by: crazycs520 * add replica_selector_v2, todo: fix test Signed-off-by: crazycs520 * fix all test Signed-off-by: crazycs520 * fix test in another way to compatible with old version about stale-read request Signed-off-by: crazycs520 * tiny refactor Signed-off-by: crazycs520 * refactor to remove duplicate code Signed-off-by: crazycs520 * add more test Signed-off-by: crazycs520 * refine onServerIsBusy region error handing logic Signed-off-by: crazycs520 * support forwarding by proxy Signed-off-by: crazycs520 * support busyThreshold and tiny reractor code Signed-off-by: crazycs520 * add config Signed-off-by: crazycs520 * fix test Signed-off-by: crazycs520 * fix proxy bug Signed-off-by: crazycs520 * fix test and tiny refactor Signed-off-by: crazycs520 * add some test Signed-off-by: crazycs520 * add more test Signed-off-by: crazycs520 * tidy refine Signed-off-by: crazycs520 * address comment Signed-off-by: crazycs520 * fix test in v1 Signed-off-by: crazycs520 * add backoff-cnt check and timeout test Signed-off-by: crazycs520 * refine test Signed-off-by: crazycs520 * refine code Signed-off-by: crazycs520 * refine code Signed-off-by: crazycs520 * add benchmark test Signed-off-by: crazycs520 * make test stale Signed-off-by: crazycs520 * fix golangci Signed-off-by: crazycs520 * add more test and refine code Signed-off-by: crazycs520 * fix race test Signed-off-by: crazycs520 * add more comprehensive enumeration testing Signed-off-by: crazycs520 * refine test Signed-off-by: crazycs520 * fix test Signed-off-by: crazycs520 * fix accessFollower with label retry bug Signed-off-by: crazycs520 * refine test and fix some bug Signed-off-by: crazycs520 * fix test and add more test Signed-off-by: crazycs520 * add more test Signed-off-by: crazycs520 * refine test Signed-off-by: crazycs520 * refine test Signed-off-by: crazycs520 * refine onNotLeader logic when new leader is not available Signed-off-by: crazycs520 * refine calculateScore logic, if the replica already tried, decrease the score Signed-off-by: crazycs520 * refine logic: replica-read request with mixed strategy and with label, should be able to retry all remain replicas Signed-off-by: crazycs520 * prefer-leader for mixed read with label for non stale-read req Signed-off-by: crazycs520 * add more test case Signed-off-by: crazycs520 * refine code Signed-off-by: crazycs520 * resolve conflict Signed-off-by: crazycs520 * remove some duplicate test Signed-off-by: crazycs520 * refactor test Signed-off-by: crazycs520 * refine code Signed-off-by: crazycs520 * refine test Signed-off-by: crazycs520 * add busy_threshold test and fix bug Signed-off-by: crazycs520 * fix proxy bug and add more test Signed-off-by: crazycs520 * refine code and add test Signed-off-by: crazycs520 * add learner test Signed-off-by: crazycs520 * add more test Signed-off-by: crazycs520 * remove old test and refine test Signed-off-by: crazycs520 * move test Signed-off-by: crazycs520 * address comment Signed-off-by: crazycs520 * use new score calculation by flag bit Signed-off-by: crazycs520 * make test stable Signed-off-by: crazycs520 * refine comment,license and fix lint Signed-off-by: crazycs520 * refine comment Signed-off-by: crazycs520 * refine comment Signed-off-by: crazycs520 * fix race test timeout Signed-off-by: crazycs520 * add 
flagNotAttemp in score Signed-off-by: crazycs520 * address comment Signed-off-by: crazycs520 --------- Signed-off-by: crazycs520 --- config/config.go | 27 +- internal/locate/region_request.go | 263 ++- internal/locate/region_request3_test.go | 196 +- internal/locate/replica_selector.go | 536 +++++ internal/locate/replica_selector_test.go | 2403 ++++++++++++++++++++-- internal/mockstore/mocktikv/cluster.go | 16 +- util/israce/israce.go | 20 + util/israce/norace.go | 20 + 8 files changed, 3165 insertions(+), 316 deletions(-) create mode 100644 internal/locate/replica_selector.go create mode 100644 util/israce/israce.go create mode 100644 util/israce/norace.go diff --git a/config/config.go b/config/config.go index 8b062645..aded539e 100644 --- a/config/config.go +++ b/config/config.go @@ -82,23 +82,26 @@ type Config struct { RegionsRefreshInterval uint64 // EnablePreload indicates whether to preload region info when initializing the client. EnablePreload bool + // EnableReplicaSelectorV2 indicates whether to enable ReplicaSelectorV2. + EnableReplicaSelectorV2 bool } // DefaultConfig returns the default configuration. func DefaultConfig() Config { return Config{ - CommitterConcurrency: 128, - MaxTxnTTL: 60 * 60 * 1000, // 1hour - TiKVClient: DefaultTiKVClient(), - PDClient: DefaultPDClient(), - TxnLocalLatches: DefaultTxnLocalLatches(), - StoresRefreshInterval: DefStoresRefreshInterval, - OpenTracingEnable: false, - Path: "", - EnableForwarding: false, - TxnScope: "", - EnableAsyncCommit: false, - Enable1PC: false, + CommitterConcurrency: 128, + MaxTxnTTL: 60 * 60 * 1000, // 1hour + TiKVClient: DefaultTiKVClient(), + PDClient: DefaultPDClient(), + TxnLocalLatches: DefaultTxnLocalLatches(), + StoresRefreshInterval: DefStoresRefreshInterval, + OpenTracingEnable: false, + Path: "", + EnableForwarding: false, + TxnScope: "", + EnableAsyncCommit: false, + Enable1PC: false, + EnableReplicaSelectorV2: true, } } diff --git a/internal/locate/region_request.go b/internal/locate/region_request.go index 3ba55b2f..09503fcb 100644 --- a/internal/locate/region_request.go +++ b/internal/locate/region_request.go @@ -110,7 +110,7 @@ type RegionRequestSender struct { client client.Client storeAddr string rpcError error - replicaSelector *replicaSelector + replicaSelector ReplicaSelector failStoreIDs map[uint64]struct{} failProxyStoreIDs map[uint64]struct{} RegionRequestRuntimeStats @@ -258,6 +258,7 @@ type replica struct { deadlineErrUsingConfTimeout bool dataIsNotReady bool notLeader bool + serverIsBusy bool } func (r *replica) getEpoch() uint32 { @@ -282,17 +283,10 @@ func (r *replica) onUpdateLeader() { r.notLeader = false } -type replicaSelector struct { +type baseReplicaSelector struct { regionCache *RegionCache region *Region - regionStore *regionStore replicas []*replica - labels []*metapb.StoreLabel - state selectorState - // replicas[targetIdx] is the replica handling the request this time - targetIdx AccessIndex - // replicas[proxyIdx] is the store used to redirect requests this time - proxyIdx AccessIndex // TiKV can reject the request when its estimated wait duration exceeds busyThreshold. // Then, the client will receive a ServerIsBusy error and choose another replica to retry. busyThreshold time.Duration @@ -315,6 +309,18 @@ type replicaSelector struct { pendingBackoffs map[uint64]*backoffArgs } +// TODO(crazycs520): remove this after replicaSelectorV2 stable. 
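The new EnableReplicaSelectorV2 option defaults to true in DefaultConfig(), so existing callers get the V2 selector automatically. Below is a minimal sketch of opting back into the legacy selector at startup, assuming the existing config.UpdateGlobal and config.GetGlobalConfig helpers from tikv/client-go/v2/config (they are not part of this patch):

package main

import (
	"fmt"

	"github.com/tikv/client-go/v2/config"
)

func main() {
	// Assumed helper: config.UpdateGlobal mutates the global config that
	// config.GetGlobalConfig() returns, which NewReplicaSelector consults.
	config.UpdateGlobal(func(conf *config.Config) {
		// Fall back to the legacy replicaSelector (V1) if needed.
		conf.EnableReplicaSelectorV2 = false
	})
	fmt.Println("replica selector v2 enabled:", config.GetGlobalConfig().EnableReplicaSelectorV2)
}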
+type replicaSelector struct { + baseReplicaSelector + regionStore *regionStore + labels []*metapb.StoreLabel + state selectorState + // replicas[targetIdx] is the replica handling the request this time + targetIdx AccessIndex + // replicas[proxyIdx] is the store used to redirect requests this time + proxyIdx AccessIndex +} + func selectorStateToString(state selectorState) string { replicaSelectorState := "nil" if state != nil { @@ -345,11 +351,18 @@ func selectorStateToString(state selectorState) string { } func (s *replicaSelector) String() string { - var replicaStatus []string - cacheRegionIsValid := "unknown" selectorStateStr := "nil" if s != nil { selectorStateStr = selectorStateToString(s.state) + } + + return fmt.Sprintf("replicaSelector{state: %v, %v}", selectorStateStr, s.baseReplicaSelector.String()) +} + +func (s *baseReplicaSelector) String() string { + var replicaStatus []string + cacheRegionIsValid := "unknown" + if s != nil { if s.region != nil { if s.region.isValid() { cacheRegionIsValid = "true" @@ -371,8 +384,7 @@ func (s *replicaSelector) String() string { )) } } - - return fmt.Sprintf("replicaSelector{selectorStateStr: %v, cacheRegionIsValid: %v, replicaStatus: %v}", selectorStateStr, cacheRegionIsValid, replicaStatus) + return fmt.Sprintf("cacheRegionIsValid: %v, replicaStatus: %v", cacheRegionIsValid, replicaStatus) } // selectorState is the interface of states of the replicaSelector. @@ -463,17 +475,17 @@ func (state *accessKnownLeader) next(bo *retry.Backoffer, selector *replicaSelec // check leader is candidate or not. func isLeaderCandidate(leader *replica) bool { - liveness := leader.store.getLivenessState() // If hibernate region is enabled and the leader is not reachable, the raft group // will not be wakened up and re-elect the leader until the follower receives // a request. So, before the new leader is elected, we should not send requests // to the unreachable old leader to avoid unnecessary timeout. // If leader.deadlineErrUsingConfTimeout is true, it means the leader is already tried and received deadline exceeded error, then don't retry it. // If leader.notLeader is true, it means the leader is already tried and received not leader error, then don't retry it. - if liveness != reachable || + if leader.store.getLivenessState() != reachable || leader.isExhausted(maxReplicaAttempt, maxReplicaAttemptTime) || leader.deadlineErrUsingConfTimeout || - leader.notLeader { + leader.notLeader || + leader.isEpochStale() { // check leader epoch here, if leader.epoch staled, we can try other replicas. instead of buildRPCContext failed and invalidate region then retry. return false } return true @@ -557,9 +569,9 @@ func (state *tryFollower) next(bo *retry.Backoffer, selector *replicaSelector) ( if selector.targetIdx < 0 { // when meet deadline exceeded error, do fast retry without invalidate region cache. 
if !hasDeadlineExceededError(selector.replicas) { - metrics.TiKVReplicaSelectorFailureCounter.WithLabelValues("exhausted").Inc() selector.invalidateRegion() } + metrics.TiKVReplicaSelectorFailureCounter.WithLabelValues("exhausted").Inc() return nil, nil } rpcCtx, err := selector.buildRPCContext(bo) @@ -633,7 +645,7 @@ func (state *accessByKnownProxy) onSendFailure(bo *retry.Backoffer, selector *re } func (state *accessByKnownProxy) onNoLeader(selector *replicaSelector) { - selector.state = &invalidLeader{} + selector.state = &tryFollower{leaderIdx: state.leaderIdx, lastIdx: state.leaderIdx, fromAccessKnownLeader: true} } // tryNewProxy is the state where we try to find a node from followers as proxy. @@ -696,7 +708,7 @@ func (state *tryNewProxy) onSendFailure(bo *retry.Backoffer, selector *replicaSe } func (state *tryNewProxy) onNoLeader(selector *replicaSelector) { - selector.state = &invalidLeader{} + selector.state = &tryFollower{leaderIdx: state.leaderIdx, lastIdx: state.leaderIdx, fromAccessKnownLeader: true} } // accessFollower is the state where we are sending requests to TiKV followers. @@ -759,13 +771,14 @@ func (state *accessFollower) next(bo *retry.Backoffer, selector *replicaSelector idx = state.lastIdx } else { // randomly select next replica, but skip state.lastIdx - if (i+offset)%replicaSize == int(state.leaderIdx) { + // since i must be greater than or equal to 1, so use i-1 to try from the first replica to make test stable. + if (i-1+offset)%replicaSize == int(state.leaderIdx) { offset++ } - idx = AccessIndex((i + offset) % replicaSize) + idx = AccessIndex((i - 1 + offset) % replicaSize) } } else { - idx = AccessIndex((int(state.lastIdx) + i) % replicaSize) + idx = AccessIndex((offset + i) % replicaSize) } selectReplica := selector.replicas[idx] if state.isCandidate(idx, selectReplica) { @@ -820,9 +833,9 @@ func (state *accessFollower) next(bo *retry.Backoffer, selector *replicaSelector } // when meet deadline exceeded error, do fast retry without invalidate region cache. if !hasDeadlineExceededError(selector.replicas) { - metrics.TiKVReplicaSelectorFailureCounter.WithLabelValues("exhausted").Inc() selector.invalidateRegion() } + metrics.TiKVReplicaSelectorFailureCounter.WithLabelValues("exhausted").Inc() return nil, nil } state.lastIdx = state.leaderIdx @@ -929,9 +942,9 @@ func (state *tryIdleReplica) next(bo *retry.Backoffer, selector *replicaSelector if targetIdx == state.leaderIdx && !isLeaderCandidate(selector.replicas[targetIdx]) { // when meet deadline exceeded error, do fast retry without invalidate region cache. 
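The accessFollower hunk above swaps (i+offset) for (i-1+offset) so that, with i starting at 1, the scan begins at the randomly chosen offset itself, which keeps the retry order stable for tests. A self-contained sketch of just that index arithmetic, with followerOrder as a hypothetical helper name (not the real selector), showing that every non-leader replica is still visited exactly once:

package main

import "fmt"

// followerOrder mirrors the index rotation used by accessFollower.next:
// i counts attempts starting from 1, offset is a random start point, and
// offset is bumped once when the computed slot collides with the leader.
func followerOrder(replicaSize, leaderIdx, offset int) []int {
	order := make([]int, 0, replicaSize-1)
	for i := 1; i < replicaSize; i++ {
		if (i-1+offset)%replicaSize == leaderIdx {
			offset++
		}
		order = append(order, (i-1+offset)%replicaSize)
	}
	return order
}

func main() {
	// With 3 replicas and the leader at index 0, any starting offset yields a
	// permutation of the two followers {1, 2} and never selects the leader.
	for offset := 0; offset < 3; offset++ {
		fmt.Println("offset", offset, "->", followerOrder(3, 0, offset))
	}
}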
if !hasDeadlineExceededError(selector.replicas) { - metrics.TiKVReplicaSelectorFailureCounter.WithLabelValues("exhausted").Inc() selector.invalidateRegion() } + metrics.TiKVReplicaSelectorFailureCounter.WithLabelValues("exhausted").Inc() return nil, nil } selector.targetIdx = targetIdx @@ -999,19 +1012,8 @@ func newReplicaSelector( return nil, errors.New("cached region ttl expired") } + replicas := buildTiKVReplicas(cachedRegion) regionStore := cachedRegion.getStore() - replicas := make([]*replica, 0, regionStore.accessStoreNum(tiKVOnly)) - for _, storeIdx := range regionStore.accessIndex[tiKVOnly] { - replicas = append( - replicas, &replica{ - store: regionStore.stores[storeIdx], - peer: cachedRegion.meta.Peers[storeIdx], - epoch: regionStore.storeEpochs[storeIdx], - attempts: 0, - }, - ) - } - option := storeSelectorOp{} for _, op := range opts { op(&option) @@ -1038,6 +1040,35 @@ func newReplicaSelector( } } + return &replicaSelector{ + baseReplicaSelector: baseReplicaSelector{ + regionCache: regionCache, + region: cachedRegion, + replicas: replicas, + busyThreshold: time.Duration(req.BusyThresholdMs) * time.Millisecond, + }, + regionStore: regionStore, + labels: option.labels, + state: state, + targetIdx: -1, + proxyIdx: -1, + }, nil +} + +func buildTiKVReplicas(region *Region) []*replica { + regionStore := region.getStore() + replicas := make([]*replica, 0, regionStore.accessStoreNum(tiKVOnly)) + for _, storeIdx := range regionStore.accessIndex[tiKVOnly] { + replicas = append( + replicas, &replica{ + store: regionStore.stores[storeIdx], + peer: region.meta.Peers[storeIdx], + epoch: regionStore.storeEpochs[storeIdx], + attempts: 0, + }, + ) + } + if val, err := util.EvalFailpoint("newReplicaSelectorInitialAttemptedTime"); err == nil { attemptedTime, err := time.ParseDuration(val.(string)) if err != nil { @@ -1047,19 +1078,7 @@ func newReplicaSelector( r.attemptedTime = attemptedTime } } - - return &replicaSelector{ - regionCache, - cachedRegion, - regionStore, - replicas, - option.labels, - state, - -1, - -1, - time.Duration(req.BusyThresholdMs) * time.Millisecond, - nil, - }, nil + return replicas } const ( @@ -1072,7 +1091,7 @@ const ( // next creates the RPCContext of the current candidate replica. // It returns a SendError if runs out of all replicas or the cached region is invalidated. -func (s *replicaSelector) next(bo *retry.Backoffer) (rpcCtx *RPCContext, err error) { +func (s *replicaSelector) next(bo *retry.Backoffer, _ *tikvrpc.Request) (rpcCtx *RPCContext, err error) { if !s.region.isValid() { metrics.TiKVReplicaSelectorFailureCounter.WithLabelValues("invalid").Inc() return nil, nil @@ -1103,6 +1122,10 @@ func (s *replicaSelector) proxyReplica() *replica { return nil } +func (s *replicaSelector) getLabels() []*metapb.StoreLabel { + return s.labels +} + // sliceIdentical checks whether two slices are referencing the same block of memory. Two `nil`s are also considered // the same. 
func sliceIdentical[T any](a, b []T) bool { @@ -1146,8 +1169,10 @@ func (s *replicaSelector) refreshRegionStore() { } func (s *replicaSelector) buildRPCContext(bo *retry.Backoffer) (*RPCContext, error) { - targetReplica, proxyReplica := s.targetReplica(), s.proxyReplica() + return s.baseReplicaSelector.buildRPCContext(bo, s.targetReplica(), s.proxyReplica()) +} +func (s *baseReplicaSelector) buildRPCContext(bo *retry.Backoffer, targetReplica, proxyReplica *replica) (*RPCContext, error) { // Backoff and retry if no replica is selected or the selected replica is stale if targetReplica == nil || targetReplica.isEpochStale() || (proxyReplica != nil && proxyReplica.isEpochStale()) { @@ -1201,11 +1226,7 @@ func (s *replicaSelector) onSendFailure(bo *retry.Backoffer, err error) { } func (s *replicaSelector) onReadReqConfigurableTimeout(req *tikvrpc.Request) bool { - if req.MaxExecutionDurationMs >= uint64(client.ReadTimeoutShort.Milliseconds()) { - // Configurable timeout should less than `ReadTimeoutShort`. - return false - } - if isReadReq(req.Type) { + if isReadReqConfigurableTimeout(req) { if target := s.targetReplica(); target != nil { target.deadlineErrUsingConfTimeout = true } @@ -1215,10 +1236,18 @@ func (s *replicaSelector) onReadReqConfigurableTimeout(req *tikvrpc.Request) boo } return true } - // Only work for read requests, return false for non-read requests. return false } +func isReadReqConfigurableTimeout(req *tikvrpc.Request) bool { + if req.MaxExecutionDurationMs >= uint64(client.ReadTimeoutShort.Milliseconds()) { + // Configurable timeout should less than `ReadTimeoutShort`. + return false + } + // Only work for read requests, return false for non-read requests. + return isReadReq(req.Type) +} + func isReadReq(tp tikvrpc.CmdType) bool { switch tp { case tikvrpc.CmdGet, tikvrpc.CmdBatchGet, tikvrpc.CmdScan, @@ -1229,11 +1258,15 @@ func isReadReq(tp tikvrpc.CmdType) bool { } } -func (s *replicaSelector) checkLiveness(bo *retry.Backoffer, accessReplica *replica) livenessState { +func (s *baseReplicaSelector) getBaseReplicaSelector() *baseReplicaSelector { + return s +} + +func (s *baseReplicaSelector) checkLiveness(bo *retry.Backoffer, accessReplica *replica) livenessState { return accessReplica.store.requestLivenessAndStartHealthCheckLoopIfNeeded(bo, s.regionCache) } -func (s *replicaSelector) invalidateReplicaStore(replica *replica, cause error) { +func (s *baseReplicaSelector) invalidateReplicaStore(replica *replica, cause error) { store := replica.store if atomic.CompareAndSwapUint32(&store.epoch, replica.epoch, replica.epoch+1) { logutil.BgLogger().Info( @@ -1249,7 +1282,7 @@ func (s *replicaSelector) invalidateReplicaStore(replica *replica, cause error) } } -func (s *replicaSelector) onSendSuccess() { +func (s *replicaSelector) onSendSuccess(_ *tikvrpc.Request) { s.state.onSendSuccess(s) } @@ -1259,24 +1292,38 @@ func (s *replicaSelector) onNotLeader( if target := s.targetReplica(); target != nil { target.notLeader = true } - leader := notLeader.GetLeader() - if leader == nil { - // The region may be during transferring leader. 
- s.state.onNoLeader(s) - if err = bo.Backoff(retry.BoRegionScheduling, errors.Errorf("no leader, ctx: %v", ctx)); err != nil { - return false, err + leaderIdx, err := s.baseReplicaSelector.onNotLeader(bo, ctx, notLeader) + if err != nil { + return false, err + } + if leaderIdx >= 0 { + if isLeaderCandidate(s.replicas[leaderIdx]) { + s.state = &accessKnownLeader{leaderIdx: AccessIndex(leaderIdx)} } } else { - s.updateLeader(notLeader.GetLeader()) + s.state.onNoLeader(s) } return true, nil } +func (s *baseReplicaSelector) onNotLeader( + bo *retry.Backoffer, ctx *RPCContext, notLeader *errorpb.NotLeader, +) (leaderIdx int, err error) { + leader := notLeader.GetLeader() + if leader == nil { + // The region may be during transferring leader. + err = bo.Backoff(retry.BoRegionScheduling, errors.Errorf("no leader, ctx: %v", ctx)) + return -1, err + } + return s.updateLeader(leader), nil +} + // updateLeader updates the leader of the cached region. // If the leader peer isn't found in the region, the region will be invalidated. -func (s *replicaSelector) updateLeader(leader *metapb.Peer) { +// If switch to new leader successfully, returns the AccessIndex of the new leader in the replicas. +func (s *baseReplicaSelector) updateLeader(leader *metapb.Peer) int { if leader == nil { - return + return -1 } for i, replica := range s.replicas { if isSamePeer(replica.peer, leader) { @@ -1285,14 +1332,9 @@ func (s *replicaSelector) updateLeader(leader *metapb.Peer) { // a request. So, before the new leader is elected, we should not send requests // to the unreachable old leader to avoid unnecessary timeout. if replica.store.getLivenessState() != reachable { - return + return -1 } replica.onUpdateLeader() - if isLeaderCandidate(s.replicas[i]) { - // If the new leader is candidate, switch to the new leader. - // the leader may have deadlineErrUsingConfTimeout and isn't candidate, if so, keep the state unchanged and retry the request. - s.state = &accessKnownLeader{leaderIdx: AccessIndex(i)} - } // Update the workTiKVIdx so that following requests can be sent to the leader immediately. if !s.region.switchWorkLeaderToPeer(leader) { panic("the store must exist") @@ -1302,11 +1344,12 @@ func (s *replicaSelector) updateLeader(leader *metapb.Peer) { zap.Uint64("regionID", s.region.GetID()), zap.Uint64("leaderStoreID", leader.GetStoreId()), ) - return + return i } } // Invalidate the region since the new leader is not in the cached version. s.region.invalidate(StoreNotFound) + return -1 } func (s *replicaSelector) onServerIsBusy( @@ -1331,7 +1374,7 @@ func (s *replicaSelector) onServerIsBusy( } } } else { - // Mark the server is busy (the next incoming READs could be redirect to expected followers.) + // Mark the server is busy (the next incoming READs could be redirected to expected followers.) ctx.Store.healthStatus.markAlreadySlow() } } @@ -1353,7 +1396,7 @@ func (s *replicaSelector) onDataIsNotReady() { } } -func (s *replicaSelector) invalidateRegion() { +func (s *baseReplicaSelector) invalidateRegion() { if s.region != nil { s.region.invalidate(Other) } @@ -1369,14 +1412,14 @@ func (s *RegionRequestSender) getRPCContext( switch et { case tikvrpc.TiKV: if s.replicaSelector == nil { - selector, err := newReplicaSelector(s.regionCache, regionID, req, opts...) - if selector == nil || err != nil { + selector, err := NewReplicaSelector(s.regionCache, regionID, req, opts...) 
+ if err != nil { s.rpcError = err return nil, nil } s.replicaSelector = selector } - return s.replicaSelector.next(bo) + return s.replicaSelector.next(bo, req) case tikvrpc.TiFlash: // Should ignore WN, because in disaggregated tiflash mode, TiDB will build rpcCtx itself. return s.regionCache.GetTiFlashRPCContext(bo, regionID, true, LabelFilterNoTiFlashWriteNode) @@ -1520,7 +1563,7 @@ func (s *RegionRequestSender) SendReqCtx( // and handle this error like EpochNotMatch, which means to re-split the request and retry. s.logSendReqError(bo, "throwing pseudo region error due to no replica available", regionID, retryTimes, req, totalErrors) if s.replicaSelector != nil { - if err := s.replicaSelector.backoffOnNoCandidate(bo); err != nil { + if err := s.replicaSelector.getBaseReplicaSelector().backoffOnNoCandidate(bo); err != nil { return nil, nil, retryTimes, err } } @@ -1531,7 +1574,7 @@ func (s *RegionRequestSender) SendReqCtx( var isLocalTraffic bool if staleReadCollector != nil && s.replicaSelector != nil { if target := s.replicaSelector.targetReplica(); target != nil { - isLocalTraffic = target.store.IsLabelsMatch(s.replicaSelector.labels) + isLocalTraffic = target.store.IsLabelsMatch(s.replicaSelector.getLabels()) staleReadCollector.onReq(req, isLocalTraffic) } } @@ -1549,13 +1592,13 @@ func (s *RegionRequestSender) SendReqCtx( req.Context.ClusterId = rpcCtx.ClusterID rpcCtx.contextPatcher.applyTo(&req.Context) if req.InputRequestSource != "" && s.replicaSelector != nil { - s.replicaSelector.patchRequestSource(req, rpcCtx) + patchRequestSource(req, s.replicaSelector.replicaType(rpcCtx)) } if e := tikvrpc.SetContext(req, rpcCtx.Meta, rpcCtx.Peer); e != nil { return nil, nil, retryTimes, err } if s.replicaSelector != nil { - if err := s.replicaSelector.backoffOnRetry(rpcCtx.Store, bo); err != nil { + if err := s.replicaSelector.getBaseReplicaSelector().backoffOnRetry(rpcCtx.Store, bo); err != nil { return nil, nil, retryTimes, err } } @@ -1611,7 +1654,7 @@ func (s *RegionRequestSender) SendReqCtx( s.logSendReqError(bo, "send request meet region error without retry", regionID, retryTimes, req, totalErrors) } else { if s.replicaSelector != nil { - s.replicaSelector.onSendSuccess() + s.replicaSelector.onSendSuccess(req) } } if staleReadCollector != nil { @@ -1783,7 +1826,7 @@ func (s *RegionRequestSender) sendReqToRegion( resp, err = s.client.SendRequest(ctx, sendToAddr, req, timeout) rpcDuration := time.Since(start) if s.replicaSelector != nil { - s.replicaSelector.recordAttemptedTime(rpcDuration) + recordAttemptedTime(s.replicaSelector, rpcDuration) } // Record timecost of external requests on related Store when `ReplicaReadMode == "PreferLeader"`. if rpcCtx.Store != nil && req.ReplicaReadType == kv.ReplicaReadPreferLeader && !util.IsInternalRequest(req.RequestSource) { @@ -2150,17 +2193,7 @@ func (s *RegionRequestSender) onRegionError( zap.Stringer("ctx", ctx), ) if req != nil { - // if the failure is caused by replica read, we can retry it with leader safely. 
- if ctx.contextPatcher.replicaRead != nil && *ctx.contextPatcher.replicaRead { - req.BusyThresholdMs = 0 - s.replicaSelector.busyThreshold = 0 - ctx.contextPatcher.replicaRead = nil - ctx.contextPatcher.busyThreshold = nil - return true, nil - } - if req.ReplicaReadType.IsFollowerRead() { - s.replicaSelector = nil - req.ReplicaReadType = kv.ReplicaReadLeader + if s.onFlashbackInProgressRegionError(ctx, req) { return true, nil } } @@ -2396,6 +2429,28 @@ func (s *RegionRequestSender) onRegionError( return false, nil } +func (s *RegionRequestSender) onFlashbackInProgressRegionError(ctx *RPCContext, req *tikvrpc.Request) bool { + switch selector := s.replicaSelector.(type) { + case *replicaSelector: + // if the failure is caused by replica read, we can retry it with leader safely. + if ctx.contextPatcher.replicaRead != nil && *ctx.contextPatcher.replicaRead { + req.BusyThresholdMs = 0 + selector.busyThreshold = 0 + ctx.contextPatcher.replicaRead = nil + ctx.contextPatcher.busyThreshold = nil + return true + } + if req.ReplicaReadType.IsFollowerRead() { + s.replicaSelector = nil + req.ReplicaReadType = kv.ReplicaReadLeader + return true + } + case *replicaSelectorV2: + return selector.onFlashbackInProgress(ctx, req) + } + return false +} + type staleReadMetricsCollector struct { } @@ -2471,7 +2526,7 @@ func (s *replicaSelector) replicaType(rpcCtx *RPCContext) string { return "unknown" } -func (s *replicaSelector) patchRequestSource(req *tikvrpc.Request, rpcCtx *RPCContext) { +func patchRequestSource(req *tikvrpc.Request, replicaType string) { var sb strings.Builder defer func() { // TiKV does the limit control by the last part of the request source. @@ -2480,8 +2535,6 @@ func (s *replicaSelector) patchRequestSource(req *tikvrpc.Request, rpcCtx *RPCCo req.RequestSource = sb.String() }() - replicaType := s.replicaType(rpcCtx) - if req.IsRetryRequest { sb.WriteString("retry_") sb.WriteString(req.ReadType) @@ -2497,7 +2550,7 @@ func (s *replicaSelector) patchRequestSource(req *tikvrpc.Request, rpcCtx *RPCCo sb.WriteString(req.ReadType) } -func (s *replicaSelector) recordAttemptedTime(duration time.Duration) { +func recordAttemptedTime(s ReplicaSelector, duration time.Duration) { if targetReplica := s.targetReplica(); targetReplica != nil { targetReplica.attemptedTime += duration } @@ -2523,7 +2576,7 @@ type backoffArgs struct { } // addPendingBackoff adds pending backoff for the store. -func (s *replicaSelector) addPendingBackoff(store *Store, cfg *retry.Config, err error) { +func (s *baseReplicaSelector) addPendingBackoff(store *Store, cfg *retry.Config, err error) { storeId := uint64(0) if store != nil { storeId = store.storeID @@ -2535,7 +2588,7 @@ func (s *replicaSelector) addPendingBackoff(store *Store, cfg *retry.Config, err } // backoffOnRetry apply pending backoff on the store when retry in this store. -func (s *replicaSelector) backoffOnRetry(store *Store, bo *retry.Backoffer) error { +func (s *baseReplicaSelector) backoffOnRetry(store *Store, bo *retry.Backoffer) error { storeId := uint64(0) if store != nil { storeId = store.storeID @@ -2549,7 +2602,7 @@ func (s *replicaSelector) backoffOnRetry(store *Store, bo *retry.Backoffer) erro } // backoffOnNoCandidate apply the largest base pending backoff when no candidate. 
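patchRequestSource is now a plain function keyed on a replica-type string, so both selector versions can share it. The sketch below is only an illustrative mirror of the resulting RequestSource format, matching the assertions in TestRetryRequestSource later in this patch; buildRequestSource is a hypothetical name for illustration, not the library API:

package main

import (
	"fmt"
	"strings"
)

// buildRequestSource mirrors the format produced by patchRequestSource: TiKV
// rate-limits by the last component, so the input source always goes at the end.
func buildRequestSource(isRetry bool, firstReadType, replicaType, inputSource string) string {
	var sb strings.Builder
	if isRetry {
		sb.WriteString("retry_")
		sb.WriteString(firstReadType)
		sb.WriteString("_")
	}
	sb.WriteString(replicaType)
	sb.WriteString("_")
	sb.WriteString(inputSource)
	return sb.String()
}

func main() {
	fmt.Println(buildRequestSource(false, "", "leader", "test"))        // leader_test
	fmt.Println(buildRequestSource(true, "leader", "follower", "test")) // retry_leader_follower_test
}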
-func (s *replicaSelector) backoffOnNoCandidate(bo *retry.Backoffer) error { +func (s *baseReplicaSelector) backoffOnNoCandidate(bo *retry.Backoffer) error { var args *backoffArgs for _, pbo := range s.pendingBackoffs { if args == nil || args.cfg.Base() < pbo.cfg.Base() { diff --git a/internal/locate/region_request3_test.go b/internal/locate/region_request3_test.go index 979ef9ba..b28fc34d 100644 --- a/internal/locate/region_request3_test.go +++ b/internal/locate/region_request3_test.go @@ -235,7 +235,8 @@ func (s *testRegionRequestToThreeStoresSuite) loadAndGetLeaderStore() (*Store, s } func (s *testRegionRequestToThreeStoresSuite) TestForwarding() { - s.regionRequestSender.regionCache.enableForwarding = true + sender := NewRegionRequestSender(s.cache, s.regionRequestSender.client) + sender.regionCache.enableForwarding = true // First get the leader's addr from region cache leaderStore, leaderAddr := s.loadAndGetLeaderStore() @@ -243,8 +244,8 @@ func (s *testRegionRequestToThreeStoresSuite) TestForwarding() { bo := retry.NewBackoffer(context.Background(), 10000) // Simulate that the leader is network-partitioned but can be accessed by forwarding via a follower - innerClient := s.regionRequestSender.client - s.regionRequestSender.client = &fnClient{fn: func(ctx context.Context, addr string, req *tikvrpc.Request, timeout time.Duration) (*tikvrpc.Response, error) { + innerClient := sender.client + sender.client = &fnClient{fn: func(ctx context.Context, addr string, req *tikvrpc.Request, timeout time.Duration) (*tikvrpc.Response, error) { if addr == leaderAddr { return nil, errors.New("simulated rpc error") } @@ -255,21 +256,21 @@ func (s *testRegionRequestToThreeStoresSuite) TestForwarding() { return innerClient.SendRequest(ctx, addr, req, timeout) }} var storeState = uint32(unreachable) - s.regionRequestSender.regionCache.setMockRequestLiveness(func(ctx context.Context, s *Store) livenessState { + sender.regionCache.setMockRequestLiveness(func(ctx context.Context, s *Store) livenessState { if s.addr == leaderAddr { return livenessState(atomic.LoadUint32(&storeState)) } return reachable }) - loc, err := s.regionRequestSender.regionCache.LocateKey(bo, []byte("k")) + loc, err := sender.regionCache.LocateKey(bo, []byte("k")) s.Nil(err) s.Equal(loc.Region.GetID(), s.regionID) req := tikvrpc.NewRequest(tikvrpc.CmdRawPut, &kvrpcpb.RawPutRequest{ Key: []byte("k"), Value: []byte("v1"), }) - resp, ctx, _, err := s.regionRequestSender.SendReqCtx(bo, req, loc.Region, time.Second, tikvrpc.TiKV) + resp, ctx, _, err := sender.SendReqCtx(bo, req, loc.Region, time.Second, tikvrpc.TiKV) s.Nil(err) regionErr, err := resp.GetRegionError() s.Nil(err) @@ -281,7 +282,7 @@ func (s *testRegionRequestToThreeStoresSuite) TestForwarding() { s.Nil(err) // Simulate recovering to normal - s.regionRequestSender.client = innerClient + sender.client = innerClient atomic.StoreUint32(&storeState, uint32(reachable)) start := time.Now() for { @@ -296,7 +297,7 @@ func (s *testRegionRequestToThreeStoresSuite) TestForwarding() { atomic.StoreUint32(&storeState, uint32(unreachable)) req = tikvrpc.NewRequest(tikvrpc.CmdRawGet, &kvrpcpb.RawGetRequest{Key: []byte("k")}) - resp, ctx, _, err = s.regionRequestSender.SendReqCtx(bo, req, loc.Region, time.Second, tikvrpc.TiKV) + resp, ctx, _, err = sender.SendReqCtx(bo, req, loc.Region, time.Second, tikvrpc.TiKV) s.Nil(err) regionErr, err = resp.GetRegionError() s.Nil(err) @@ -305,7 +306,7 @@ func (s *testRegionRequestToThreeStoresSuite) TestForwarding() { s.Nil(ctx.ProxyStore) // Simulate 
server down - s.regionRequestSender.client = &fnClient{fn: func(ctx context.Context, addr string, req *tikvrpc.Request, timeout time.Duration) (*tikvrpc.Response, error) { + sender.client = &fnClient{fn: func(ctx context.Context, addr string, req *tikvrpc.Request, timeout time.Duration) (*tikvrpc.Response, error) { if addr == leaderAddr || req.ForwardedHost == leaderAddr { return nil, errors.New("simulated rpc error") } @@ -329,7 +330,7 @@ func (s *testRegionRequestToThreeStoresSuite) TestForwarding() { Key: []byte("k"), Value: []byte("v2"), }) - resp, ctx, _, err = s.regionRequestSender.SendReqCtx(bo, req, loc.Region, time.Second, tikvrpc.TiKV) + resp, ctx, _, err = sender.SendReqCtx(bo, req, loc.Region, time.Second, tikvrpc.TiKV) s.Nil(err) regionErr, err = resp.GetRegionError() s.Nil(err) @@ -337,19 +338,19 @@ func (s *testRegionRequestToThreeStoresSuite) TestForwarding() { // Then SendReqCtx will throw a pseudo EpochNotMatch to tell the caller to reload the region. s.NotNil(regionErr.EpochNotMatch) s.Nil(ctx) - s.Equal(len(s.regionRequestSender.failStoreIDs), 0) - s.Equal(len(s.regionRequestSender.failProxyStoreIDs), 0) - region := s.regionRequestSender.regionCache.GetCachedRegionWithRLock(loc.Region) + s.Equal(len(sender.failStoreIDs), 0) + s.Equal(len(sender.failProxyStoreIDs), 0) + region := sender.regionCache.GetCachedRegionWithRLock(loc.Region) s.NotNil(region) s.False(region.isValid()) - loc, err = s.regionRequestSender.regionCache.LocateKey(bo, []byte("k")) + loc, err = sender.regionCache.LocateKey(bo, []byte("k")) s.Nil(err) req = tikvrpc.NewRequest(tikvrpc.CmdRawPut, &kvrpcpb.RawPutRequest{ Key: []byte("k"), Value: []byte("v2"), }) - resp, ctx, _, err = s.regionRequestSender.SendReqCtx(bo, req, loc.Region, time.Second, tikvrpc.TiKV) + resp, ctx, _, err = sender.SendReqCtx(bo, req, loc.Region, time.Second, tikvrpc.TiKV) s.Nil(err) regionErr, err = resp.GetRegionError() s.Nil(err) @@ -428,18 +429,18 @@ func (s *testRegionRequestToThreeStoresSuite) TestLearnerReplicaSelector() { refreshRegionTTL(region) refreshEpochs(regionStore) req.ReplicaReadType = kv.ReplicaReadLearner - replicaSelector, err := newReplicaSelector(cache, regionLoc.Region, req) + replicaSelector, err := NewReplicaSelector(cache, regionLoc.Region, req) s.NotNil(replicaSelector) s.Nil(err) - accessLearner, _ := replicaSelector.state.(*accessFollower) // Invalidate the region if the leader is not in the region. refreshRegionTTL(region) - rpcCtx, err := replicaSelector.next(s.bo) + rpcCtx, err := replicaSelector.next(s.bo, req) s.Nil(err) - // Should switch to the next follower. 
- s.Equal(AccessIndex(tikvLearnerAccessIdx), accessLearner.lastIdx) - AssertRPCCtxEqual(s, rpcCtx, replicaSelector.replicas[replicaSelector.targetIdx], nil) + target := replicaSelector.targetReplica() + AssertRPCCtxEqual(s, rpcCtx, target, nil) + s.Equal(target.peer.Role, metapb.PeerRole_Learner) + s.Equal(target.peer.Id, tikvLearner.Id) } func (s *testRegionRequestToThreeStoresSuite) TestReplicaSelector() { @@ -504,7 +505,7 @@ func (s *testRegionRequestToThreeStoresSuite) TestReplicaSelector() { s.IsType(&accessKnownLeader{}, replicaSelector.state) // Try the leader for maxReplicaAttempt times for i := 1; i <= maxReplicaAttempt; i++ { - rpcCtx, err := replicaSelector.next(s.bo) + rpcCtx, err := replicaSelector.next(s.bo, req) s.Nil(err) AssertRPCCtxEqual(s, rpcCtx, replicaSelector.replicas[regionStore.workTiKVIdx], nil) s.IsType(&accessKnownLeader{}, replicaSelector.state) @@ -513,7 +514,7 @@ func (s *testRegionRequestToThreeStoresSuite) TestReplicaSelector() { // After that it should switch to tryFollower for i := 0; i < len(replicaSelector.replicas)-1; i++ { - rpcCtx, err := replicaSelector.next(s.bo) + rpcCtx, err := replicaSelector.next(s.bo, req) s.Nil(err) state, ok := replicaSelector.state.(*tryFollower) s.True(ok) @@ -524,7 +525,7 @@ func (s *testRegionRequestToThreeStoresSuite) TestReplicaSelector() { s.Equal(replicaSelector.targetReplica().attempts, 1) } // In tryFollower state, if all replicas are tried, nil RPCContext should be returned - rpcCtx, err := replicaSelector.next(s.bo) + rpcCtx, err := replicaSelector.next(s.bo, req) s.Nil(err) s.Nil(rpcCtx) // The region should be invalidated @@ -537,10 +538,10 @@ func (s *testRegionRequestToThreeStoresSuite) TestReplicaSelector() { s.NotNil(replicaSelector) unreachable.injectConstantLiveness(cache) s.IsType(&accessKnownLeader{}, replicaSelector.state) - _, err = replicaSelector.next(s.bo) + _, err = replicaSelector.next(s.bo, req) s.Nil(err) replicaSelector.onSendFailure(s.bo, nil) - rpcCtx, err = replicaSelector.next(s.bo) + rpcCtx, err = replicaSelector.next(s.bo, req) s.NotNil(rpcCtx) s.Nil(err) s.IsType(&tryFollower{}, replicaSelector.state) @@ -561,7 +562,7 @@ func (s *testRegionRequestToThreeStoresSuite) TestReplicaSelector() { s.NotNil(replicaSelector) s.IsType(&accessKnownLeader{}, replicaSelector.state) // Now, livenessState is unreachable, so it will try a reachable follower instead of the unreachable leader. - rpcCtx, err = replicaSelector.next(s.bo) + rpcCtx, err = replicaSelector.next(s.bo, req) s.Nil(err) s.NotNil(rpcCtx) _, ok := replicaSelector.state.(*tryFollower) @@ -581,7 +582,7 @@ func (s *testRegionRequestToThreeStoresSuite) TestReplicaSelector() { s.IsType(&accessKnownLeader{}, replicaSelector.state) // Now, livenessState is unknown. Even if forwarding is enabled, it should try followers // instead of using the proxy. - rpcCtx, err = replicaSelector.next(s.bo) + rpcCtx, err = replicaSelector.next(s.bo, req) s.Nil(err) s.NotNil(rpcCtx) _, ok = replicaSelector.state.(*tryFollower) @@ -599,7 +600,7 @@ func (s *testRegionRequestToThreeStoresSuite) TestReplicaSelector() { }, 3*time.Second, 200*time.Millisecond) s.IsType(&accessKnownLeader{}, replicaSelector.state) // Now, livenessState is unreachable, so it will try a new proxy instead of the leader. 
- rpcCtx, err = replicaSelector.next(s.bo) + rpcCtx, err = replicaSelector.next(s.bo, req) s.Nil(err) s.NotNil(rpcCtx) state, ok := replicaSelector.state.(*tryNewProxy) @@ -614,7 +615,7 @@ func (s *testRegionRequestToThreeStoresSuite) TestReplicaSelector() { // When the current proxy node fails, it should try another one. lastProxy := replicaSelector.proxyIdx replicaSelector.onSendFailure(s.bo, nil) - rpcCtx, err = replicaSelector.next(s.bo) + rpcCtx, err = replicaSelector.next(s.bo, req) s.NotNil(rpcCtx) s.Nil(err) state, ok = replicaSelector.state.(*tryNewProxy) @@ -626,7 +627,7 @@ func (s *testRegionRequestToThreeStoresSuite) TestReplicaSelector() { s.Equal(replicaSelector.proxyReplica().attempts, 1) // Test proxy store is saves when proxy is enabled - replicaSelector.onSendSuccess() + replicaSelector.onSendSuccess(req) regionStore = region.getStore() s.Equal(replicaSelector.proxyIdx, regionStore.proxyTiKVIdx) @@ -639,14 +640,14 @@ func (s *testRegionRequestToThreeStoresSuite) TestReplicaSelector() { state2, ok := replicaSelector.state.(*accessByKnownProxy) s.True(ok) s.Equal(regionStore.workTiKVIdx, state2.leaderIdx) - _, err = replicaSelector.next(s.bo) + _, err = replicaSelector.next(s.bo, req) s.Nil(err) AssertRPCCtxEqual(s, rpcCtx, replicaSelector.targetReplica(), replicaSelector.proxyReplica()) // Switch to tryNewProxy if the current proxy is not available replicaSelector.onSendFailure(s.bo, nil) s.IsType(&tryNewProxy{}, replicaSelector.state) - rpcCtx, err = replicaSelector.next(s.bo) + rpcCtx, err = replicaSelector.next(s.bo, req) s.Nil(err) AssertRPCCtxEqual(s, rpcCtx, replicaSelector.targetReplica(), replicaSelector.proxyReplica()) s.Equal(regionStore.workTiKVIdx, state2.leaderIdx) @@ -654,6 +655,8 @@ func (s *testRegionRequestToThreeStoresSuite) TestReplicaSelector() { s.NotEqual(regionStore.proxyTiKVIdx, replicaSelector.proxyIdx) s.Equal(replicaSelector.targetReplica().attempts, 2) s.Equal(replicaSelector.proxyReplica().attempts, 1) + // FIXME: the chosen proxy-replica's store should be reachable. + //s.Equal(replicaSelector.proxyReplica().store.getLivenessState(), reachable) // Test accessFollower state with kv.ReplicaReadFollower request type. req = tikvrpc.NewReplicaReadRequest(tikvrpc.CmdGet, &kvrpcpb.GetRequest{}, kv.ReplicaReadFollower, nil) @@ -669,7 +672,7 @@ func (s *testRegionRequestToThreeStoresSuite) TestReplicaSelector() { lastIdx := AccessIndex(-1) for i := 0; i < regionStore.accessStoreNum(tiKVOnly)-1; i++ { - rpcCtx, err := replicaSelector.next(s.bo) + rpcCtx, err := replicaSelector.next(s.bo, req) s.Nil(err) // Should switch to the next follower. s.NotEqual(lastIdx, state3.lastIdx) @@ -680,13 +683,13 @@ func (s *testRegionRequestToThreeStoresSuite) TestReplicaSelector() { lastIdx = state3.lastIdx } // Fallback to the leader for 1 time - rpcCtx, err = replicaSelector.next(s.bo) + rpcCtx, err = replicaSelector.next(s.bo, req) s.Nil(err) s.Equal(regionStore.workTiKVIdx, state3.lastIdx) s.Equal(replicaSelector.targetIdx, state3.lastIdx) AssertRPCCtxEqual(s, rpcCtx, replicaSelector.replicas[regionStore.workTiKVIdx], nil) // All replicas are exhausted. - rpcCtx, err = replicaSelector.next(s.bo) + rpcCtx, err = replicaSelector.next(s.bo, req) s.Nil(rpcCtx) s.Nil(err) @@ -703,7 +706,7 @@ func (s *testRegionRequestToThreeStoresSuite) TestReplicaSelector() { s.Nil(err) state3 = replicaSelector.state.(*accessFollower) // Should fallback to the leader immediately. 
- rpcCtx, err = replicaSelector.next(s.bo) + rpcCtx, err = replicaSelector.next(s.bo, req) s.Nil(err) s.Equal(regionStore.workTiKVIdx, state3.lastIdx) s.Equal(replicaSelector.targetIdx, state3.lastIdx) @@ -726,7 +729,7 @@ func (s *testRegionRequestToThreeStoresSuite) TestReplicaSelector() { replicaSelector, err = newReplicaSelector(cache, regionLoc.Region, req, WithMatchLabels(labels)) s.NotNil(replicaSelector) s.Nil(err) - rpcCtx, err = replicaSelector.next(s.bo) + rpcCtx, err = replicaSelector.next(s.bo, req) s.Nil(err) AssertRPCCtxEqual(s, rpcCtx, replicaSelector.replicas[accessIdx], nil) } @@ -738,7 +741,7 @@ func (s *testRegionRequestToThreeStoresSuite) TestReplicaSelector() { replicaSelector, err = newReplicaSelector(cache, regionLoc.Region, req, WithLeaderOnly()) s.NotNil(replicaSelector) s.Nil(err) - rpcCtx, err = replicaSelector.next(s.bo) + rpcCtx, err = replicaSelector.next(s.bo, req) s.Nil(err) // Should always access the leader. AssertRPCCtxEqual(s, rpcCtx, replicaSelector.replicas[regionStore.workTiKVIdx], nil) @@ -757,7 +760,7 @@ func (s *testRegionRequestToThreeStoresSuite) TestReplicaSelector() { replicaSelector.updateLeader(&metapb.Peer{Id: s.cluster.AllocID(), StoreId: s.cluster.AllocID()}) s.False(region.isValid()) // Don't try next replica if the region is invalidated. - rpcCtx, err = replicaSelector.next(s.bo) + rpcCtx, err = replicaSelector.next(s.bo, req) s.Nil(rpcCtx) s.Nil(err) } @@ -780,7 +783,7 @@ func (s *testRegionRequestToThreeStoresSuite) TestSendReqWithReplicaSelector() { s.NotNil(regionStore) reloadRegion := func() { - s.regionRequestSender.replicaSelector.region.invalidate(Other) + s.regionRequestSender.replicaSelector.invalidateRegion() region, _ = s.cache.LocateRegionByID(s.bo, s.regionID) regionStore = s.cache.GetCachedRegionWithRLock(region.Region).getStore() } @@ -811,7 +814,7 @@ func (s *testRegionRequestToThreeStoresSuite) TestSendReqWithReplicaSelector() { resp, _, err = sender.SendReq(bo, req, region.Region, client.ReadTimeoutShort) s.Nil(err) s.NotNil(resp) - s.Equal(sender.replicaSelector.targetIdx, AccessIndex(1)) + s.Equal(sender.replicaSelector.targetReplica().peer.Id, s.peerIDs[1]) s.True(bo.GetTotalBackoffTimes() == 1) s.cluster.StartStore(s.storeIDs[0]) atomic.StoreUint32(®ionStore.stores[0].livenessState, uint32(reachable)) @@ -822,7 +825,7 @@ func (s *testRegionRequestToThreeStoresSuite) TestSendReqWithReplicaSelector() { resp, _, err = sender.SendReq(bo, req, region.Region, client.ReadTimeoutShort) s.Nil(err) s.NotNil(resp) - s.Equal(sender.replicaSelector.targetIdx, AccessIndex(1)) + s.Equal(sender.replicaSelector.targetReplica().peer.Id, s.peerIDs[1]) s.True(bo.GetTotalBackoffTimes() == 0) // Switch to the next peer due to leader failure but the new leader is not elected. @@ -853,13 +856,31 @@ func (s *testRegionRequestToThreeStoresSuite) TestSendReqWithReplicaSelector() { s.Nil(err) s.True(hasFakeRegionError(resp)) s.Equal(bo.GetTotalBackoffTimes(), 3) - s.False(sender.replicaSelector.region.isValid()) + getReplicaSelectorRegion := func() *Region { + if selector, ok := sender.replicaSelector.(*replicaSelector); ok { + return selector.region + } + if selector, ok := sender.replicaSelector.(*replicaSelectorV2); ok { + return selector.region + } + return nil + } + s.False(getReplicaSelectorRegion().isValid()) s.cluster.ChangeLeader(s.regionID, s.peerIDs[0]) // The leader store is alive but can't provide service. 
+ getReplicaSelectorRegionStores := func() []*Store { + if selector, ok := sender.replicaSelector.(*replicaSelector); ok { + return selector.regionStore.stores + } + if selector, ok := sender.replicaSelector.(*replicaSelectorV2); ok { + return selector.region.getStore().stores + } + return nil + } reachable.injectConstantLiveness(s.cache) s.Eventually(func() bool { - stores := s.regionRequestSender.replicaSelector.regionStore.stores + stores := getReplicaSelectorRegionStores() return stores[0].getLivenessState() == reachable && stores[1].getLivenessState() == reachable && stores[2].getLivenessState() == reachable @@ -871,7 +892,7 @@ func (s *testRegionRequestToThreeStoresSuite) TestSendReqWithReplicaSelector() { resp, _, err = sender.SendReq(bo, req, region.Region, time.Second) s.Nil(err) s.True(hasFakeRegionError(resp)) - s.False(sender.replicaSelector.region.isValid()) + s.False(getReplicaSelectorRegion().isValid()) s.Equal(bo.GetTotalBackoffTimes(), maxReplicaAttempt+2) s.cluster.StartStore(s.storeIDs[0]) @@ -905,7 +926,7 @@ func (s *testRegionRequestToThreeStoresSuite) TestSendReqWithReplicaSelector() { resp, _, err := sender.SendReq(bo, req, region.Region, time.Second) s.Nil(err) s.True(hasFakeRegionError(resp)) - s.False(sender.replicaSelector.region.isValid()) + s.False(getReplicaSelectorRegion().isValid()) s.Equal(bo.GetTotalBackoffTimes(), maxReplicaAttempt+2) }() } @@ -925,7 +946,7 @@ func (s *testRegionRequestToThreeStoresSuite) TestSendReqWithReplicaSelector() { resp, _, err := sender.SendReq(bo, req, region.Region, time.Second) s.Nil(err) s.True(hasFakeRegionError(resp)) - s.False(sender.replicaSelector.region.isValid()) + s.False(getReplicaSelectorRegion().isValid()) s.Equal(bo.GetTotalBackoffTimes(), 0) }() @@ -944,7 +965,7 @@ func (s *testRegionRequestToThreeStoresSuite) TestSendReqWithReplicaSelector() { resp, _, err := sender.SendReq(bo, req, region.Region, time.Second) s.Nil(err) s.True(hasFakeRegionError(resp)) - s.False(sender.replicaSelector.region.isValid()) + s.False(getReplicaSelectorRegion().isValid()) s.Equal(bo.GetTotalBackoffTimes(), 0) }() @@ -977,7 +998,7 @@ func (s *testRegionRequestToThreeStoresSuite) TestSendReqWithReplicaSelector() { regionErr, _ := resp.GetRegionError() s.NotNil(regionErr) } - s.False(sender.replicaSelector.region.isValid()) + s.False(getReplicaSelectorRegion().isValid()) s.Equal(bo.GetTotalBackoffTimes(), 0) }() } @@ -993,7 +1014,7 @@ func (s *testRegionRequestToThreeStoresSuite) TestSendReqWithReplicaSelector() { s.Nil(err) s.True(hasFakeRegionError(resp)) s.True(bo.GetTotalBackoffTimes() == 3) - s.False(sender.replicaSelector.region.isValid()) + s.False(getReplicaSelectorRegion().isValid()) for _, store := range s.storeIDs { s.cluster.StartStore(store) } @@ -1008,42 +1029,45 @@ func (s *testRegionRequestToThreeStoresSuite) TestLoadBasedReplicaRead() { BusyThresholdMs: 50, }) - replicaSelector, err := newReplicaSelector(s.cache, regionLoc.Region, req) + replicaSelector, err := NewReplicaSelector(s.cache, regionLoc.Region, req) s.NotNil(replicaSelector) s.Nil(err) - s.Equal(replicaSelector.region, region) - s.IsType(&accessKnownLeader{}, replicaSelector.state) + s.Equal(replicaSelector.getBaseReplicaSelector().region, region) // The busyThreshold in replicaSelector should be initialized with the request context. 
- s.Equal(replicaSelector.busyThreshold, 50*time.Millisecond) + s.Equal(replicaSelector.getBaseReplicaSelector().busyThreshold, 50*time.Millisecond) bo := retry.NewBackoffer(context.Background(), -1) - rpcCtx, err := replicaSelector.next(bo) + rpcCtx, err := replicaSelector.next(bo, req) s.Nil(err) s.Equal(rpcCtx.Peer.Id, s.leaderPeer) + s.False(req.ReplicaRead) + s.Equal(req.BusyThresholdMs, uint32(50)) // Receive a ServerIsBusy error replicaSelector.onServerIsBusy(bo, rpcCtx, req, &errorpb.ServerIsBusy{ EstimatedWaitMs: 500, }) - rpcCtx, err = replicaSelector.next(bo) + rpcCtx, err = replicaSelector.next(bo, req) s.Nil(err) s.NotEqual(rpcCtx.Peer.Id, s.leaderPeer) - s.IsType(&tryIdleReplica{}, replicaSelector.state) - s.True(*rpcCtx.contextPatcher.replicaRead) + rpcCtx.contextPatcher.applyTo(&req.Context) + s.True(req.ReplicaRead) + s.Equal(req.BusyThresholdMs, uint32(50)) lastPeerID := rpcCtx.Peer.Id replicaSelector.onServerIsBusy(bo, rpcCtx, req, &errorpb.ServerIsBusy{ EstimatedWaitMs: 800, }) - rpcCtx, err = replicaSelector.next(bo) + rpcCtx, err = replicaSelector.next(bo, req) s.Nil(err) // Should choose a peer different from before s.NotEqual(rpcCtx.Peer.Id, s.leaderPeer) s.NotEqual(rpcCtx.Peer.Id, lastPeerID) - s.IsType(&tryIdleReplica{}, replicaSelector.state) - s.True(*rpcCtx.contextPatcher.replicaRead) + rpcCtx.contextPatcher.applyTo(&req.Context) + s.True(req.ReplicaRead) + s.Equal(req.BusyThresholdMs, uint32(50)) // All peers are too busy replicaSelector.onServerIsBusy(bo, rpcCtx, req, &errorpb.ServerIsBusy{ @@ -1052,24 +1076,26 @@ func (s *testRegionRequestToThreeStoresSuite) TestLoadBasedReplicaRead() { lessBusyPeer := rpcCtx.Peer.Id // Then, send to the leader again with no threshold. - rpcCtx, err = replicaSelector.next(bo) + rpcCtx, err = replicaSelector.next(bo, req) s.Nil(err) s.Equal(rpcCtx.Peer.Id, s.leaderPeer) - s.IsType(&tryIdleReplica{}, replicaSelector.state) - s.False(*rpcCtx.contextPatcher.replicaRead) - s.Equal(*rpcCtx.contextPatcher.busyThreshold, time.Duration(0)) + rpcCtx.contextPatcher.applyTo(&req.Context) + s.False(req.ReplicaRead) + s.Equal(req.BusyThresholdMs, uint32(0)) + s.True(replicaSelector.getBaseReplicaSelector().region.isValid()) // don't invalidate region when can't find an idle replica. 
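In this test the busy threshold rides on the request context. A minimal sketch of building such a load-based read request, assuming tikvrpc.NewReplicaReadRequest accepts an optional kvrpcpb.Context as the test above appears to do; the key and threshold values are illustrative only:

package main

import (
	"fmt"

	"github.com/pingcap/kvproto/pkg/kvrpcpb"
	"github.com/tikv/client-go/v2/kv"
	"github.com/tikv/client-go/v2/tikvrpc"
)

func main() {
	// A leader read that lets TiKV reject the leader when its estimated wait
	// exceeds 50ms; the selector then falls back to an idle replica with
	// ReplicaRead set, as TestLoadBasedReplicaRead verifies.
	req := tikvrpc.NewReplicaReadRequest(
		tikvrpc.CmdGet,
		&kvrpcpb.GetRequest{Key: []byte("k")},
		kv.ReplicaReadLeader,
		nil, // replica read seed
		kvrpcpb.Context{BusyThresholdMs: 50},
	)
	fmt.Println(req.BusyThresholdMs, req.ReplicaReadType)
}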
time.Sleep(120 * time.Millisecond) // When there comes a new request, it should skip busy leader and choose a less busy store - replicaSelector, err = newReplicaSelector(s.cache, regionLoc.Region, req) + req.BusyThresholdMs = 50 + replicaSelector, err = NewReplicaSelector(s.cache, regionLoc.Region, req) s.NotNil(replicaSelector) s.Nil(err) - rpcCtx, err = replicaSelector.next(bo) + rpcCtx, err = replicaSelector.next(bo, req) s.Nil(err) s.Equal(rpcCtx.Peer.Id, lessBusyPeer) - s.IsType(&tryIdleReplica{}, replicaSelector.state) - s.True(*rpcCtx.contextPatcher.replicaRead) + rpcCtx.contextPatcher.applyTo(&req.Context) + s.True(req.ReplicaRead) } func (s *testRegionRequestToThreeStoresSuite) TestReplicaReadWithFlashbackInProgress() { @@ -1381,7 +1407,7 @@ func (s *testRegionRequestToThreeStoresSuite) TestRetryRequestSource() { } } - setTargetReplica := func(selector *replicaSelector, readType string) { + setTargetReplica := func(selector ReplicaSelector, readType string) { var leader bool switch readType { case "leader", "stale_leader": @@ -1391,13 +1417,23 @@ func (s *testRegionRequestToThreeStoresSuite) TestRetryRequestSource() { default: panic("unreachable") } - for idx, replica := range selector.replicas { + for idx, replica := range selector.getBaseReplicaSelector().replicas { if replica.store.storeID == leaderStore.storeID && leader { - selector.targetIdx = AccessIndex(idx) + if v1, ok := selector.(*replicaSelector); ok { + v1.targetIdx = AccessIndex(idx) + } + if v2, ok := selector.(*replicaSelectorV2); ok { + v2.target = replica + } return } if replica.store.storeID != leaderStore.storeID && !leader { - selector.targetIdx = AccessIndex(idx) + if v1, ok := selector.(*replicaSelector); ok { + v1.targetIdx = AccessIndex(idx) + } + if v2, ok := selector.(*replicaSelectorV2); ok { + v2.target = replica + } return } } @@ -1411,23 +1447,23 @@ func (s *testRegionRequestToThreeStoresSuite) TestRetryRequestSource() { bo := retry.NewBackoffer(context.Background(), -1) req.IsRetryRequest = false setReadType(req, firstReplica) - replicaSelector, err := newReplicaSelector(s.cache, regionLoc.Region, req) + replicaSelector, err := NewReplicaSelector(s.cache, regionLoc.Region, req) s.Nil(err) setTargetReplica(replicaSelector, firstReplica) - rpcCtx, err := replicaSelector.buildRPCContext(bo) + rpcCtx, err := replicaSelector.getBaseReplicaSelector().buildRPCContext(bo, replicaSelector.targetReplica(), replicaSelector.proxyReplica()) s.Nil(err) - replicaSelector.patchRequestSource(req, rpcCtx) + patchRequestSource(req, replicaSelector.replicaType(rpcCtx)) s.Equal(firstReplica+"_test", req.RequestSource) // retry setReadType(req, retryReplica) - replicaSelector, err = newReplicaSelector(s.cache, regionLoc.Region, req) + replicaSelector, err = NewReplicaSelector(s.cache, regionLoc.Region, req) s.Nil(err) setTargetReplica(replicaSelector, retryReplica) - rpcCtx, err = replicaSelector.buildRPCContext(bo) + rpcCtx, err = replicaSelector.getBaseReplicaSelector().buildRPCContext(bo, replicaSelector.targetReplica(), replicaSelector.proxyReplica()) s.Nil(err) req.IsRetryRequest = true - replicaSelector.patchRequestSource(req, rpcCtx) + patchRequestSource(req, replicaSelector.replicaType(rpcCtx)) s.Equal("retry_"+firstReplica+"_"+retryReplica+"_test", req.RequestSource) } } diff --git a/internal/locate/replica_selector.go b/internal/locate/replica_selector.go new file mode 100644 index 00000000..ed40b119 --- /dev/null +++ b/internal/locate/replica_selector.go @@ -0,0 +1,536 @@ +// Copyright 2024 TiKV Authors +// 
+// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package locate + +import ( + "fmt" + "time" + + "github.com/pingcap/kvproto/pkg/errorpb" + "github.com/pingcap/kvproto/pkg/metapb" + "github.com/pkg/errors" + "github.com/tikv/client-go/v2/config" + "github.com/tikv/client-go/v2/config/retry" + "github.com/tikv/client-go/v2/kv" + "github.com/tikv/client-go/v2/metrics" + "github.com/tikv/client-go/v2/tikvrpc" +) + +type ReplicaSelector interface { + next(bo *retry.Backoffer, req *tikvrpc.Request) (*RPCContext, error) + targetReplica() *replica + proxyReplica() *replica + replicaType(rpcCtx *RPCContext) string + String() string + getBaseReplicaSelector() *baseReplicaSelector + getLabels() []*metapb.StoreLabel + onSendSuccess(req *tikvrpc.Request) + onSendFailure(bo *retry.Backoffer, err error) + invalidateRegion() + // Following methods are used to handle region errors. + onNotLeader(bo *retry.Backoffer, ctx *RPCContext, notLeader *errorpb.NotLeader) (shouldRetry bool, err error) + onDataIsNotReady() + onServerIsBusy(bo *retry.Backoffer, ctx *RPCContext, req *tikvrpc.Request, serverIsBusy *errorpb.ServerIsBusy) (shouldRetry bool, err error) + onReadReqConfigurableTimeout(req *tikvrpc.Request) bool +} + +// NewReplicaSelector returns a new ReplicaSelector. +func NewReplicaSelector( + regionCache *RegionCache, regionID RegionVerID, req *tikvrpc.Request, opts ...StoreSelectorOption, +) (ReplicaSelector, error) { + if config.GetGlobalConfig().EnableReplicaSelectorV2 { + return newReplicaSelectorV2(regionCache, regionID, req, opts...) + } + return newReplicaSelector(regionCache, regionID, req, opts...) 
+} + +type replicaSelectorV2 struct { + baseReplicaSelector + replicaReadType kv.ReplicaReadType + isStaleRead bool + isReadOnlyReq bool + option storeSelectorOp + target *replica + proxy *replica + attempts int +} + +func newReplicaSelectorV2( + regionCache *RegionCache, regionID RegionVerID, req *tikvrpc.Request, opts ...StoreSelectorOption, +) (*replicaSelectorV2, error) { + cachedRegion := regionCache.GetCachedRegionWithRLock(regionID) + if cachedRegion == nil || !cachedRegion.isValid() { + return nil, errors.New("cached region invalid") + } + replicas := buildTiKVReplicas(cachedRegion) + option := storeSelectorOp{} + for _, op := range opts { + op(&option) + } + return &replicaSelectorV2{ + baseReplicaSelector: baseReplicaSelector{ + regionCache: regionCache, + region: cachedRegion, + replicas: replicas, + busyThreshold: time.Duration(req.BusyThresholdMs) * time.Millisecond, + }, + replicaReadType: req.ReplicaReadType, + isStaleRead: req.StaleRead, + isReadOnlyReq: isReadReq(req.Type), + option: option, + target: nil, + attempts: 0, + }, nil +} + +func (s *replicaSelectorV2) next(bo *retry.Backoffer, req *tikvrpc.Request) (rpcCtx *RPCContext, err error) { + if !s.region.isValid() { + metrics.TiKVReplicaSelectorFailureCounter.WithLabelValues("invalid").Inc() + return nil, nil + } + + s.attempts++ + s.target = nil + s.proxy = nil + switch s.replicaReadType { + case kv.ReplicaReadLeader: + s.nextForReplicaReadLeader(req) + default: + s.nextForReplicaReadMixed(req) + } + if s.target == nil { + return nil, nil + } + return s.buildRPCContext(bo, s.target, s.proxy) +} + +func (s *replicaSelectorV2) nextForReplicaReadLeader(req *tikvrpc.Request) { + if s.regionCache.enableForwarding { + strategy := ReplicaSelectLeaderWithProxyStrategy{} + s.target, s.proxy = strategy.next(s.replicas, s.region) + if s.target != nil && s.proxy != nil { + return + } + } + leaderIdx := s.region.getStore().workTiKVIdx + strategy := ReplicaSelectLeaderStrategy{leaderIdx: leaderIdx} + s.target = strategy.next(s.replicas) + if s.target != nil && s.busyThreshold > 0 && s.isReadOnlyReq && (s.target.store.EstimatedWaitTime() > s.busyThreshold || s.target.serverIsBusy) { + // If the leader is busy in our estimation, try other idle replicas. + // If other replicas are all busy, tryIdleReplica will try the leader again without busy threshold. + mixedStrategy := ReplicaSelectMixedStrategy{leaderIdx: leaderIdx, busyThreshold: s.busyThreshold} + idleTarget := mixedStrategy.next(s, s.region) + if idleTarget != nil { + s.target = idleTarget + req.ReplicaRead = true + } else { + // No threshold if all peers are too busy, remove busy threshold and still use leader. + s.busyThreshold = 0 + req.BusyThresholdMs = 0 + req.ReplicaRead = false + } + } + if s.target != nil { + return + } + mixedStrategy := ReplicaSelectMixedStrategy{leaderIdx: leaderIdx, leaderOnly: s.option.leaderOnly} + s.target = mixedStrategy.next(s, s.region) + if s.target != nil && s.isReadOnlyReq && s.replicas[leaderIdx].deadlineErrUsingConfTimeout { + req.ReplicaRead = true + req.StaleRead = false + } +} + +func (s *replicaSelectorV2) nextForReplicaReadMixed(req *tikvrpc.Request) { + leaderIdx := s.region.getStore().workTiKVIdx + if s.isStaleRead && s.attempts == 2 { + // For stale read second retry, try leader by leader read. 
+ strategy := ReplicaSelectLeaderStrategy{leaderIdx: leaderIdx} + s.target = strategy.next(s.replicas) + if s.target != nil && !s.target.isExhausted(1, 0) { + // For stale read, don't retry leader again if it is accessed at the first attempt. + req.StaleRead = false + req.ReplicaRead = false + return + } + } + preferLeader := req.ReplicaReadType == kv.ReplicaReadPreferLeader + if s.attempts > 1 { + if req.ReplicaReadType == kv.ReplicaReadMixed { + // For mixed read retry, prefer retry leader first. + preferLeader = true + } + } + strategy := ReplicaSelectMixedStrategy{ + leaderIdx: leaderIdx, + tryLeader: req.ReplicaReadType == kv.ReplicaReadMixed || req.ReplicaReadType == kv.ReplicaReadPreferLeader, + preferLeader: preferLeader, + leaderOnly: s.option.leaderOnly, + learnerOnly: req.ReplicaReadType == kv.ReplicaReadLearner, + labels: s.option.labels, + stores: s.option.stores, + } + s.target = strategy.next(s, s.region) + if s.target != nil { + if s.isStaleRead && s.attempts == 1 { + // stale-read request first access. + if !s.target.store.IsLabelsMatch(s.option.labels) && s.target.peer.Id != s.region.GetLeaderPeerID() { + // If the target replica's labels is not match and not leader, use replica read. + // This is for compatible with old version. + req.StaleRead = false + req.ReplicaRead = true + } else { + // use stale read. + req.StaleRead = true + req.ReplicaRead = false + } + } else { + // always use replica. + req.StaleRead = false + req.ReplicaRead = s.isReadOnlyReq + } + } +} + +type ReplicaSelectLeaderStrategy struct { + leaderIdx AccessIndex +} + +func (s ReplicaSelectLeaderStrategy) next(replicas []*replica) *replica { + leader := replicas[s.leaderIdx] + if isLeaderCandidate(leader) { + return leader + } + return nil +} + +// ReplicaSelectMixedStrategy is used to select a replica by calculating a score for each replica, and then choose the one with the highest score. +// Attention, if you want the leader replica must be chosen in some case, you should use ReplicaSelectLeaderStrategy, instead of use ReplicaSelectMixedStrategy with preferLeader flag. +type ReplicaSelectMixedStrategy struct { + leaderIdx AccessIndex + tryLeader bool + preferLeader bool + leaderOnly bool + learnerOnly bool + labels []*metapb.StoreLabel + stores []uint64 + busyThreshold time.Duration +} + +func (s *ReplicaSelectMixedStrategy) next(selector *replicaSelectorV2, region *Region) *replica { + replicas := selector.replicas + maxScoreIdxes := make([]int, 0, len(replicas)) + maxScore := -1 + reloadRegion := false + for i, r := range replicas { + epochStale := r.isEpochStale() + liveness := r.store.getLivenessState() + isLeader := AccessIndex(i) == s.leaderIdx + if epochStale && ((liveness == reachable && r.store.getResolveState() == resolved) || isLeader) { + reloadRegion = true + } + if !s.isCandidate(r, isLeader, epochStale, liveness) { + continue + } + score := s.calculateScore(r, isLeader) + if score > maxScore { + maxScore = score + maxScoreIdxes = append(maxScoreIdxes[:0], i) + } else if score == maxScore && score > -1 { + maxScoreIdxes = append(maxScoreIdxes, i) + } + } + if reloadRegion { + selector.region.setSyncFlags(needDelayedReloadPending) + } + if len(maxScoreIdxes) == 1 { + idx := maxScoreIdxes[0] + return replicas[idx] + } else if len(maxScoreIdxes) > 1 { + // if there are more than one replica with the same max score, we will randomly select one + // todo: consider use store statistics information to select a faster one. 
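// Candidate ranking is a plain bit-flag score: higher bits encode the stronger
// preferences (matching labels, then leader preference, then peer kind, then
// "not slow", then "not yet attempted"), so a single integer comparison orders
// the replicas, and ties are broken at random to spread load. A self-contained
// worked example using the same flag values as the constants defined below
// (redeclared locally so the snippet compiles on its own; replica states are hypothetical):

package main

import (
	"fmt"
	"math/rand"
)

const (
	labelMatches = 1 << 4
	preferLeader = 1 << 3
	normalPeer   = 1 << 2
	notSlow      = 1 << 1
	notAttempt   = 1
)

func main() {
	// Three hypothetical replicas: a slow label-matching follower, a healthy
	// label-matching follower, and a healthy follower without matching labels.
	scores := []int{
		labelMatches | normalPeer | notAttempt,           // slow, so it loses the notSlow bit
		labelMatches | normalPeer | notSlow | notAttempt, // best candidate
		normalPeer | notSlow | notAttempt,                // label mismatch, lower score
	}
	best, maxScore := []int{}, -1
	for i, s := range scores {
		if s > maxScore {
			maxScore, best = s, []int{i}
		} else if s == maxScore {
			best = append(best, i)
		}
	}
	// With a single best candidate this always picks it; with ties, the random
	// index spreads requests across the equally good replicas.
	fmt.Println("chosen replica:", best[rand.Intn(len(best))], "score:", maxScore)
}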
+		idx := maxScoreIdxes[randIntn(len(maxScoreIdxes))]
+		return replicas[idx]
+	}
+	if s.busyThreshold > 0 {
+		// When no idle replica can be found, there is no need to invalidate the region.
+		return nil
+	}
+	// When a deadline-exceeded error was met, do a fast retry without invalidating the region cache.
+	if !hasDeadlineExceededError(selector.replicas) {
+		selector.invalidateRegion()
+	}
+	metrics.TiKVReplicaSelectorFailureCounter.WithLabelValues("exhausted").Inc()
+	return nil
+}
+
+func (s *ReplicaSelectMixedStrategy) isCandidate(r *replica, isLeader bool, epochStale bool, liveness livenessState) bool {
+	if epochStale || liveness == unreachable {
+		// The replica is not available, skip it.
+		return false
+	}
+	maxAttempt := 1
+	if r.dataIsNotReady && !isLeader {
+		// If the replica failed with DataIsNotReady on a stale read, we can retry it with replica-read.
+		// After https://github.com/tikv/tikv/pull/15726, the leader no longer returns the DataIsNotReady error,
+		// so there is no need to retry the leader again; retrying it may return a NotLeader error instead.
+		maxAttempt = 2
+	}
+	if r.isExhausted(maxAttempt, 0) {
+		// The attempts are exhausted, skip it.
+		return false
+	}
+	if s.leaderOnly && !isLeader {
+		return false
+	}
+	if s.busyThreshold > 0 && (r.store.EstimatedWaitTime() > s.busyThreshold || r.serverIsBusy || isLeader) {
+		return false
+	}
+	return true
+}
+
+const (
+	// The definition of the score is:
+	// MSB                                                                                              LSB
+	// [unused bits][1 bit: LabelMatches][1 bit: PreferLeader][1 bit: NormalPeer][1 bit: NotSlow][1 bit: NotAttempt]
+	flagLabelMatches = 1 << 4
+	flagPreferLeader = 1 << 3
+	flagNormalPeer   = 1 << 2
+	flagNotSlow      = 1 << 1
+	flagNotAttempt   = 1
+)
+
+// calculateScore calculates the score of the replica.
+func (s *ReplicaSelectMixedStrategy) calculateScore(r *replica, isLeader bool) int {
+	score := 0
+	if r.store.IsStoreMatch(s.stores) && r.store.IsLabelsMatch(s.labels) {
+		score |= flagLabelMatches
+	}
+	if isLeader {
+		if s.preferLeader {
+			score |= flagPreferLeader
+		} else if s.tryLeader {
+			if len(s.labels) > 0 {
+				// When the leader has matching labels, prefer the leader over other mismatching peers.
+				score |= flagPreferLeader
+			} else {
+				score |= flagNormalPeer
+			}
+		}
+	} else {
+		if s.learnerOnly {
+			if r.peer.Role == metapb.PeerRole_Learner {
+				score |= flagNormalPeer
+			}
+		} else {
+			score |= flagNormalPeer
+		}
+	}
+	if !r.store.healthStatus.IsSlow() {
+		score |= flagNotSlow
+	}
+	if r.attempts == 0 {
+		score |= flagNotAttempt
+	}
+	return score
+}
+
+type ReplicaSelectLeaderWithProxyStrategy struct{}
+
+func (s ReplicaSelectLeaderWithProxyStrategy) next(replicas []*replica, region *Region) (leader *replica, proxy *replica) {
+	rs := region.getStore()
+	leaderIdx := rs.workTiKVIdx
+	leader = replicas[leaderIdx]
+	if leader.store.getLivenessState() == reachable || leader.notLeader {
+		// If the leader's store is reachable, there is no need to use a proxy.
+		rs.unsetProxyStoreIfNeeded(region)
+		return nil, nil
+	}
+	proxyIdx := rs.proxyTiKVIdx
+	if proxyIdx >= 0 && int(proxyIdx) < len(replicas) && s.isCandidate(replicas[proxyIdx], proxyIdx == leaderIdx) {
+		return leader, replicas[proxyIdx]
+	}
+
+	for i, r := range replicas {
+		if s.isCandidate(r, AccessIndex(i) == leaderIdx) {
+			return leader, r
+		}
+	}
+	return nil, nil
+}
+
+func (s ReplicaSelectLeaderWithProxyStrategy) isCandidate(r *replica, isLeader bool) bool {
+	if isLeader ||
+		r.isExhausted(1, 0) ||
+		r.store.getLivenessState() != reachable ||
+		r.isEpochStale() {
+		// check epoch here, if epoch staled, we can try other replicas.
instead of buildRPCContext failed and invalidate region then retry. + return false + } + return true +} + +func (s *replicaSelectorV2) onNotLeader( + bo *retry.Backoffer, ctx *RPCContext, notLeader *errorpb.NotLeader, +) (shouldRetry bool, err error) { + if s.target != nil { + s.target.notLeader = true + } + leaderIdx, err := s.baseReplicaSelector.onNotLeader(bo, ctx, notLeader) + if err != nil { + return false, err + } + if leaderIdx >= 0 { + if isLeaderCandidate(s.replicas[leaderIdx]) { + s.replicaReadType = kv.ReplicaReadLeader + } + } + return true, nil +} + +func (s *replicaSelectorV2) onFlashbackInProgress(ctx *RPCContext, req *tikvrpc.Request) bool { + // if the failure is caused by replica read, we can retry it with leader safely. + if req.ReplicaRead && s.target != nil && s.target.peer.Id != s.region.GetLeaderPeerID() { + req.BusyThresholdMs = 0 + s.busyThreshold = 0 + s.replicaReadType = kv.ReplicaReadLeader + req.ReplicaReadType = kv.ReplicaReadLeader + return true + } + return false +} + +func (s *replicaSelectorV2) onDataIsNotReady() { + if s.target != nil { + s.target.dataIsNotReady = true + } +} + +func (s *replicaSelectorV2) onServerIsBusy( + bo *retry.Backoffer, ctx *RPCContext, req *tikvrpc.Request, serverIsBusy *errorpb.ServerIsBusy, +) (shouldRetry bool, err error) { + var store *Store + if ctx != nil && ctx.Store != nil { + store = ctx.Store + if serverIsBusy.EstimatedWaitMs != 0 { + ctx.Store.updateServerLoadStats(serverIsBusy.EstimatedWaitMs) + if s.busyThreshold != 0 && isReadReq(req.Type) { + // do not retry with batched coprocessor requests. + // it'll be region misses if we send the tasks to replica. + if req.Type == tikvrpc.CmdCop && len(req.Cop().Tasks) > 0 { + return false, nil + } + if s.target != nil { + s.target.serverIsBusy = true + } + } + } else { + // Mark the server is busy (the next incoming READs could be redirected to expected followers.) + ctx.Store.healthStatus.markAlreadySlow() + } + } + backoffErr := errors.Errorf("server is busy, ctx: %v", ctx) + if s.canFastRetry() { + s.addPendingBackoff(store, retry.BoTiKVServerBusy, backoffErr) + return true, nil + } + err = bo.Backoff(retry.BoTiKVServerBusy, backoffErr) + if err != nil { + return false, err + } + return true, nil +} + +func (s *replicaSelectorV2) canFastRetry() bool { + if s.replicaReadType == kv.ReplicaReadLeader { + leaderIdx := s.region.getStore().workTiKVIdx + leader := s.replicas[leaderIdx] + if isLeaderCandidate(leader) && !leader.serverIsBusy { + return false + } + } + return true +} + +func (s *replicaSelectorV2) onReadReqConfigurableTimeout(req *tikvrpc.Request) bool { + if isReadReqConfigurableTimeout(req) { + if s.target != nil { + s.target.deadlineErrUsingConfTimeout = true + } + return true + } + return false +} + +func (s *replicaSelectorV2) onSendFailure(bo *retry.Backoffer, err error) { + metrics.RegionCacheCounterWithSendFail.Inc() + // todo: mark store need check and return to fast retry. + target := s.target + if s.proxy != nil { + target = s.proxy + } + liveness := s.checkLiveness(bo, target) + if s.replicaReadType == kv.ReplicaReadLeader && s.proxy == nil && s.target != nil && s.target.peer.Id == s.region.GetLeaderPeerID() && + liveness == unreachable && len(s.replicas) > 1 && s.regionCache.enableForwarding { + // just return to use proxy. 
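// When forwarding is enabled and the leader's store looks unreachable, the
// selector leaves the region state as-is so the next attempt can route the
// request through a healthy peer acting as a proxy (see
// ReplicaSelectLeaderWithProxyStrategy above). A standalone sketch of that
// proxy-candidate scan (hypothetical types; the real check also considers
// epoch staleness and attempt counts):

package main

import "fmt"

type peer struct {
	addr      string
	isLeader  bool
	reachable bool
}

// pickProxy returns the first healthy non-leader peer, or "" if none exists,
// in which case the caller falls back to invalidating the region and retrying.
func pickProxy(peers []peer) string {
	for _, p := range peers {
		if p.isLeader || !p.reachable {
			continue
		}
		return p.addr
	}
	return ""
}

func main() {
	peers := []peer{
		{addr: "store1", isLeader: true, reachable: false}, // unreachable leader
		{addr: "store2", reachable: false},
		{addr: "store3", reachable: true},
	}
	fmt.Println("forward via:", pickProxy(peers)) // forward via: store3
}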
+ return + } + if liveness != reachable { + s.invalidateReplicaStore(target, err) + } +} + +func (s *replicaSelectorV2) onSendSuccess(req *tikvrpc.Request) { + if s.proxy != nil && s.target != nil { + for idx, r := range s.replicas { + if r.peer.Id == s.proxy.peer.Id { + s.region.getStore().setProxyStoreIdx(s.region, AccessIndex(idx)) + break + } + } + } + if s.target != nil && s.target.peer.Id != s.region.GetLeaderPeerID() && req != nil && !req.StaleRead && !req.ReplicaRead { + s.region.switchWorkLeaderToPeer(s.target.peer) + } +} + +func (s *replicaSelectorV2) targetReplica() *replica { + return s.target +} + +func (s *replicaSelectorV2) proxyReplica() *replica { + return s.proxy +} + +func (s *replicaSelectorV2) getLabels() []*metapb.StoreLabel { + return s.option.labels +} + +func (s *replicaSelectorV2) replicaType(_ *RPCContext) string { + if s.target != nil { + if s.target.peer.Id == s.region.GetLeaderPeerID() { + return "leader" + } + return "follower" + } + return "unknown" +} + +func (s *replicaSelectorV2) String() string { + if s == nil { + return "" + } + return fmt.Sprintf("replicaSelectorV2{replicaReadType: %v, attempts: %v, %v}", s.replicaReadType.String(), s.attempts, s.baseReplicaSelector.String()) +} diff --git a/internal/locate/replica_selector_test.go b/internal/locate/replica_selector_test.go index f667bbd9..0359460f 100644 --- a/internal/locate/replica_selector_test.go +++ b/internal/locate/replica_selector_test.go @@ -4,6 +4,8 @@ import ( "context" "fmt" "math/rand" + "os" + "runtime/pprof" "sort" "strconv" "strings" @@ -17,13 +19,17 @@ import ( "github.com/pingcap/kvproto/pkg/metapb" "github.com/pkg/errors" "github.com/stretchr/testify/suite" + "github.com/tikv/client-go/v2/config" "github.com/tikv/client-go/v2/config/retry" "github.com/tikv/client-go/v2/internal/apicodec" "github.com/tikv/client-go/v2/internal/client" + "github.com/tikv/client-go/v2/internal/logutil" "github.com/tikv/client-go/v2/internal/mockstore/mocktikv" "github.com/tikv/client-go/v2/kv" "github.com/tikv/client-go/v2/oracle" "github.com/tikv/client-go/v2/tikvrpc" + "github.com/tikv/client-go/v2/util/israce" + "go.uber.org/zap" ) type testReplicaSelectorSuite struct { @@ -83,8 +89,10 @@ type replicaSelectorAccessPathCase struct { label *metapb.StoreLabel accessErr []RegionErrorType accessErrInValid bool - accessPathResult // use to record the execution result. - expect *accessPathResult // + expect *accessPathResult + result accessPathResult + beforeRun func() // beforeRun will be called before the test case execute, if it is nil, resetStoreState will be called. + afterRun func() // afterRun will be called after the test case execute, if it is nil, invalidateRegion will be called. 
} type accessPathResult struct { @@ -96,14 +104,221 @@ type accessPathResult struct { regionIsValid bool } +func TestReplicaSelectorBasic(t *testing.T) { + s := new(testReplicaSelectorSuite) + s.SetupTest(t) + defer s.TearDownTest() + + req := tikvrpc.NewReplicaReadRequest(tikvrpc.CmdGet, &kvrpcpb.GetRequest{Key: []byte("a")}, kv.ReplicaReadMixed, nil, kvrpcpb.Context{}) + req.EnableStaleWithMixedReplicaRead() + rc := s.getRegion() + s.NotNil(rc) + rc.invalidate(Other) + selector, err := newReplicaSelectorV2(s.cache, rc.VerID(), req) + s.NotNil(err) + s.Equal("cached region invalid", err.Error()) + s.Nil(selector) + s.Equal("", selector.String()) + + rc = s.getRegion() + selector, err = newReplicaSelectorV2(s.cache, rc.VerID(), req) + s.Nil(err) + s.NotNil(selector) + for _, reqSource := range []string{"leader", "follower", "follower", "unknown"} { + ctx, err := selector.next(s.bo, req) + s.Nil(err) + s.Equal(reqSource, selector.replicaType(ctx)) + } + + rc = s.getRegion() + selector, err = newReplicaSelectorV2(s.cache, rc.VerID(), req) + s.Nil(err) + s.NotNil(selector) + ctx, err := selector.next(s.bo, req) + s.Nil(err) + s.NotNil(ctx) + rc.invalidate(Other) + ctx, err = selector.next(s.bo, req) + s.Nil(err) + s.Nil(ctx) +} + +func TestReplicaSelectorCalculateScore(t *testing.T) { + s := new(testReplicaSelectorSuite) + s.SetupTest(t) + defer s.TearDownTest() + + req := tikvrpc.NewReplicaReadRequest(tikvrpc.CmdGet, &kvrpcpb.GetRequest{Key: []byte("a")}, kv.ReplicaReadMixed, nil, kvrpcpb.Context{}) + region, err := s.cache.LocateKey(s.bo, []byte("a")) + s.Nil(err) + s.NotNil(region) + selector, err := NewReplicaSelector(s.cache, region.Region, req) + s.Nil(err) + for i, r := range selector.getBaseReplicaSelector().replicas { + rc := s.cache.GetCachedRegionWithRLock(region.Region) + s.NotNil(rc) + isLeader := r.peer.Id == rc.GetLeaderPeerID() + s.Equal(isLeader, AccessIndex(i) == rc.getStore().workTiKVIdx) + strategy := ReplicaSelectMixedStrategy{leaderIdx: rc.getStore().workTiKVIdx} + score := strategy.calculateScore(r, isLeader) + s.Equal(r.store.healthStatus.IsSlow(), false) + if isLeader { + s.Equal(score, flagLabelMatches+flagNotSlow+flagNotAttempt) + } else { + s.Equal(score, flagLabelMatches+flagNormalPeer+flagNotSlow+flagNotAttempt) + } + r.store.healthStatus.markAlreadySlow() + s.Equal(r.store.healthStatus.IsSlow(), true) + score = strategy.calculateScore(r, isLeader) + if isLeader { + s.Equal(score, flagLabelMatches+flagNotAttempt) + } else { + s.Equal(score, flagLabelMatches+flagNormalPeer+flagNotAttempt) + } + strategy.tryLeader = true + score = strategy.calculateScore(r, isLeader) + s.Equal(score, flagLabelMatches+flagNormalPeer+flagNotAttempt) + strategy.preferLeader = true + score = strategy.calculateScore(r, isLeader) + if isLeader { + s.Equal(score, flagLabelMatches+flagPreferLeader+flagNotAttempt) + } else { + s.Equal(score, flagLabelMatches+flagNormalPeer+flagNotAttempt) + } + strategy.learnerOnly = true + strategy.tryLeader = false + strategy.preferLeader = false + score = strategy.calculateScore(r, isLeader) + s.Equal(score, flagLabelMatches+flagNotAttempt) + labels := []*metapb.StoreLabel{ + { + Key: "zone", + Value: "us-west-1", + }, + } + strategy.labels = labels + score = strategy.calculateScore(r, isLeader) + s.Equal(score, flagNotAttempt) + + strategy = ReplicaSelectMixedStrategy{ + leaderIdx: rc.getStore().workTiKVIdx, + tryLeader: true, + labels: labels, + } + score = strategy.calculateScore(r, isLeader) + if isLeader { + s.Equal(score, 
flagPreferLeader+flagNotAttempt) + } else { + s.Equal(score, flagNormalPeer+flagNotAttempt) + } + + strategy = ReplicaSelectMixedStrategy{ + leaderIdx: rc.getStore().workTiKVIdx, + preferLeader: true, + labels: labels, + } + score = strategy.calculateScore(r, isLeader) + if isLeader { + s.Equal(score, flagPreferLeader+flagNotAttempt) + } else { + s.Equal(score, flagNormalPeer+flagNotAttempt) + } + r.store.labels = labels + score = strategy.calculateScore(r, isLeader) + if isLeader { + s.Equal(score, flagLabelMatches+flagPreferLeader+flagNotAttempt) + } else { + s.Equal(score, flagLabelMatches+flagNormalPeer+flagNotAttempt) + } + r.store.labels = nil + } +} + +func TestCanFastRetry(t *testing.T) { + s := new(testReplicaSelectorSuite) + s.SetupTest(t) + defer s.TearDownTest() + + // Test for non-leader read. + loc, err := s.cache.LocateKey(s.bo, []byte("key")) + s.Nil(err) + req := tikvrpc.NewRequest(tikvrpc.CmdGet, &kvrpcpb.GetRequest{Key: []byte("key")}) + req.EnableStaleWithMixedReplicaRead() + selector, err := newReplicaSelector(s.cache, loc.Region, req) + s.Nil(err) + for i := 0; i < 3; i++ { + _, err = selector.next(s.bo, req) + s.Nil(err) + selector.canFastRetry() + s.True(selector.canFastRetry()) + } + + // Test for leader read. + req = tikvrpc.NewRequest(tikvrpc.CmdGet, &kvrpcpb.GetRequest{Key: []byte("key")}) + req.ReplicaReadType = kv.ReplicaReadLeader + selector, err = newReplicaSelector(s.cache, loc.Region, req) + s.Nil(err) + for i := 0; i < 12; i++ { + _, err = selector.next(s.bo, req) + s.Nil(err) + ok := selector.canFastRetry() + if i <= 8 { + s.False(ok) // can't skip since leader is available. + } else { + s.True(ok) + } + } +} + +func TestPendingBackoff(t *testing.T) { + s := new(testReplicaSelectorSuite) + s.SetupTest(t) + defer s.TearDownTest() + + loc, err := s.cache.LocateKey(s.bo, []byte("key")) + s.Nil(err) + req := tikvrpc.NewRequest(tikvrpc.CmdGet, &kvrpcpb.GetRequest{Key: []byte("key")}) + req.EnableStaleWithMixedReplicaRead() + selector, err := newReplicaSelector(s.cache, loc.Region, req) + s.Nil(err) + bo := retry.NewNoopBackoff(context.Background()) + err = selector.backoffOnRetry(nil, bo) + s.Nil(err) + err = selector.backoffOnRetry(&Store{storeID: 1}, bo) + s.Nil(err) + err = selector.backoffOnNoCandidate(bo) + s.Nil(err) + selector.addPendingBackoff(nil, retry.BoRegionScheduling, errors.New("err-0")) + s.Equal(1, len(selector.pendingBackoffs)) + selector.addPendingBackoff(&Store{storeID: 1}, retry.BoTiKVRPC, errors.New("err-1")) + s.Equal(2, len(selector.pendingBackoffs)) + selector.addPendingBackoff(&Store{storeID: 2}, retry.BoTiKVDiskFull, errors.New("err-2")) + s.Equal(3, len(selector.pendingBackoffs)) + selector.addPendingBackoff(&Store{storeID: 1}, retry.BoTiKVServerBusy, errors.New("err-3")) + s.Equal(3, len(selector.pendingBackoffs)) + _, ok := selector.pendingBackoffs[0] + s.True(ok) + err = selector.backoffOnRetry(nil, bo) + s.NotNil(err) + s.Equal("err-0", err.Error()) + _, ok = selector.pendingBackoffs[0] + s.False(ok) + s.Equal(2, len(selector.pendingBackoffs)) + err = selector.backoffOnRetry(&Store{storeID: 10}, bo) + s.Nil(err) + s.Equal(2, len(selector.pendingBackoffs)) + err = selector.backoffOnNoCandidate(bo) + s.NotNil(err) + s.Equal("err-3", err.Error()) +} + func TestReplicaReadAccessPathByCase(t *testing.T) { s := new(testReplicaSelectorSuite) s.SetupTest(t) defer s.TearDownTest() fakeEpochNotMatch := &errorpb.Error{EpochNotMatch: &errorpb.EpochNotMatch{}} // fake region error, cause by no replica is available. 
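// Each case below scripts a sequence of region errors and asserts the exact
// access path, the backoffs taken, and the final outcome; when the selector
// runs out of candidates, the sender gives up with the fake EpochNotMatch
// declared above. A minimal standalone sketch of that replay loop
// (hypothetical types, far simpler than runCaseAndCompare):

package main

import "fmt"

type fakeCase struct {
	replicas []string
	errs     []string // errs[i] is the injected error for the i-th attempt, "" means success
}

// run replays the scripted errors and records which replica served each attempt.
func run(c fakeCase) (accessPath []string, finalErr string) {
	for attempt := 0; ; attempt++ {
		if attempt >= len(c.replicas) {
			return accessPath, "EpochNotMatch" // all candidates exhausted: fake region error
		}
		accessPath = append(accessPath, c.replicas[attempt])
		if attempt >= len(c.errs) || c.errs[attempt] == "" {
			return accessPath, "" // request succeeded
		}
	}
}

func main() {
	path, err := run(fakeCase{
		replicas: []string{"store1", "store2", "store3"},
		errs:     []string{"ServerIsBusy", "ServerIsBusy", "ServerIsBusy"},
	})
	fmt.Println(path, err) // [store1 store2 store3] EpochNotMatch
}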
- var ca replicaSelectorAccessPathCase - ca = replicaSelectorAccessPathCase{ + ca := replicaSelectorAccessPathCase{ reqType: tikvrpc.CmdGet, readType: kv.ReplicaReadMixed, staleRead: true, @@ -459,82 +674,1338 @@ func TestReplicaReadAccessPathByCase(t *testing.T) { s.True(s.runCaseAndCompare(ca)) } -func TestCanFastRetry(t *testing.T) { +func TestReplicaReadAccessPathByCase2(t *testing.T) { s := new(testReplicaSelectorSuite) s.SetupTest(t) defer s.TearDownTest() - // Test for non-leader read. - loc, err := s.cache.LocateKey(s.bo, []byte("key")) - s.Nil(err) - req := tikvrpc.NewRequest(tikvrpc.CmdGet, &kvrpcpb.GetRequest{Key: []byte("key")}) - req.EnableStaleWithMixedReplicaRead() - selector, err := newReplicaSelector(s.cache, loc.Region, req) - s.Nil(err) - for i := 0; i < 3; i++ { - _, err = selector.next(s.bo) - s.Nil(err) - selector.canFastRetry() - s.True(selector.canFastRetry()) + fakeEpochNotMatch := &errorpb.Error{EpochNotMatch: &errorpb.EpochNotMatch{}} + // Following cases are found by other test, careful. + ca := replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadMixed, + staleRead: false, + timeout: time.Second, + label: &metapb.StoreLabel{Key: "id", Value: "2"}, + accessErr: []RegionErrorType{DeadLineExceededErr, DiskFullErr, FlashbackNotPreparedErr}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store2, replica-read: true, stale-read: false}", + "{addr: store1, replica-read: true, stale-read: false}", + "{addr: store3, replica-read: true, stale-read: false}"}, + respErr: "region 0 is not prepared for the flashback", + respRegionError: nil, + backoffCnt: 1, + backoffDetail: []string{"tikvDiskFull+1"}, + regionIsValid: true, + }, } + s.True(s.runCaseAndCompare(ca)) - // Test for leader read. - req = tikvrpc.NewRequest(tikvrpc.CmdGet, &kvrpcpb.GetRequest{Key: []byte("key")}) - req.ReplicaReadType = kv.ReplicaReadLeader - selector, err = newReplicaSelector(s.cache, loc.Region, req) - s.Nil(err) - for i := 0; i < 12; i++ { - _, err = selector.next(s.bo) - s.Nil(err) - ok := selector.canFastRetry() - if i <= 8 { - s.False(ok) // can't skip since leader is available. - } else { - s.True(ok) + // Don't invalid region in tryFollowers, since leader meets deadlineExceededErr. + ca = replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadLeader, + staleRead: false, + timeout: time.Second, + label: nil, + accessErr: []RegionErrorType{NotLeaderErr, NotLeaderWithNewLeader3Err, DeadLineExceededErr}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store1, replica-read: false, stale-read: false}", + "{addr: store2, replica-read: false, stale-read: false}", + "{addr: store3, replica-read: false, stale-read: false}"}, + respErr: "", + respRegionError: fakeEpochNotMatch, + backoffCnt: 1, + backoffDetail: []string{"regionScheduling+1"}, + regionIsValid: true, + }, + } + s.True(s.runCaseAndCompare(ca)) + + // Don't invalid region in accessFollower, since leader meets deadlineExceededErr. 
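// The cases in this block rely on one rule: a replica that failed only because
// the configurable per-request timeout expired is remembered as such, and if any
// replica failed that way the region cache is NOT invalidated, so the retry can
// go straight to another replica. A standalone sketch of that guard (mirroring
// the idea behind hasDeadlineExceededError; hypothetical types):

package main

import "fmt"

type replicaState struct {
	addr               string
	deadlineErrTimeout bool // failed only due to the configurable read timeout
}

// shouldInvalidateRegion reports whether the cached region needs to be dropped
// after all candidates failed: a pure timeout failure keeps the cache intact.
func shouldInvalidateRegion(replicas []replicaState) bool {
	for _, r := range replicas {
		if r.deadlineErrTimeout {
			return false
		}
	}
	return true
}

func main() {
	fmt.Println(shouldInvalidateRegion([]replicaState{
		{addr: "store1", deadlineErrTimeout: true},
		{addr: "store2"},
		{addr: "store3"},
	})) // false: keep the cached region and retry other replicas quickly
}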
+ ca = replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadMixed, + staleRead: false, + timeout: time.Second, + label: nil, + accessErr: []RegionErrorType{ServerIsBusyErr, ServerIsBusyErr, DeadLineExceededErr}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store1, replica-read: true, stale-read: false}", + "{addr: store2, replica-read: true, stale-read: false}", + "{addr: store3, replica-read: true, stale-read: false}"}, + respErr: "", + respRegionError: fakeEpochNotMatch, + backoffCnt: 1, + backoffDetail: []string{"tikvServerBusy+1"}, + regionIsValid: true, + }, + } + s.True(s.runCaseAndCompare(ca)) + + ca = replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadMixed, + staleRead: true, + timeout: 0, + label: &metapb.StoreLabel{Key: "id", Value: "3"}, + accessErr: []RegionErrorType{DataIsNotReadyErr, NotLeaderErr}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store3, replica-read: false, stale-read: true}", + "{addr: store1, replica-read: false, stale-read: false}", + "{addr: store3, replica-read: true, stale-read: false}"}, + respErr: "", + respRegionError: nil, + backoffCnt: 1, + backoffDetail: []string{"regionScheduling+1"}, + regionIsValid: true, + }, + } + s.True(s.runCaseAndCompare(ca)) + + ca = replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadMixed, + staleRead: false, + timeout: time.Second, + label: &metapb.StoreLabel{Key: "id", Value: "2"}, + accessErr: []RegionErrorType{FlashbackInProgressErr}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store2, replica-read: true, stale-read: false}", + "{addr: store1, replica-read: true, stale-read: false}"}, + respErr: "", + respRegionError: nil, + backoffCnt: 0, + backoffDetail: []string{}, + regionIsValid: true, + }, + } + s.True(s.runCaseAndCompare(ca)) + + ca = replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadMixed, + staleRead: true, + timeout: time.Microsecond * 100, + label: &metapb.StoreLabel{Key: "id", Value: "1"}, + accessErr: []RegionErrorType{DataIsNotReadyErr, ServerIsBusyErr}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store1, replica-read: false, stale-read: true}", + // don't retry leader(store1), since leader won't return DataIsNotReadyErr, so retry it with leader-read may got NotLeaderErr. 
+ "{addr: store2, replica-read: true, stale-read: false}", + "{addr: store3, replica-read: true, stale-read: false}"}, + respErr: "", + respRegionError: nil, + backoffCnt: 0, + backoffDetail: []string{}, + regionIsValid: true, + }, + } + s.True(s.runCaseAndCompare(ca)) + + ca = replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdPrewrite, + readType: kv.ReplicaReadLeader, + staleRead: false, + timeout: 0, + label: nil, + accessErr: []RegionErrorType{DeadLineExceededErr, NotLeaderErr, NotLeaderWithNewLeader2Err, ServerIsBusyErr, DeadLineExceededErr}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store1, replica-read: false, stale-read: false}", + "{addr: store2, replica-read: false, stale-read: false}", + "{addr: store3, replica-read: false, stale-read: false}", + "{addr: store2, replica-read: false, stale-read: false}", + "{addr: store2, replica-read: false, stale-read: false}"}, + respErr: "", + respRegionError: fakeEpochNotMatch, + backoffCnt: 4, + backoffDetail: []string{"regionScheduling+1", "tikvRPC+2", "tikvServerBusy+1"}, + regionIsValid: false, + }, + } + s.True(s.runCaseAndCompare(ca)) + + ca = replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadMixed, + staleRead: true, + timeout: 0, + label: nil, + accessErr: []RegionErrorType{DeadLineExceededErr, RegionNotInitializedErr, DeadLineExceededErr}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store1, replica-read: false, stale-read: true}", + "{addr: store2, replica-read: true, stale-read: false}", + "{addr: store3, replica-read: true, stale-read: false}"}, + respErr: "", + respRegionError: fakeEpochNotMatch, + backoffCnt: 3, + backoffDetail: []string{"regionNotInitialized+1", "tikvRPC+2"}, + regionIsValid: false, + }, + } + s.True(s.runCaseAndCompare(ca)) + + ca = replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadMixed, + staleRead: true, + timeout: time.Second, + label: &metapb.StoreLabel{Key: "id", Value: "2"}, + accessErr: []RegionErrorType{DeadLineExceededErr, NotLeaderWithNewLeader2Err}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store2, replica-read: false, stale-read: true}", + "{addr: store1, replica-read: false, stale-read: false}", + "{addr: store3, replica-read: true, stale-read: false}"}, + respErr: "", + respRegionError: nil, + backoffCnt: 0, + backoffDetail: []string{}, + regionIsValid: true, + }, + } + s.True(s.runCaseAndCompare(ca)) + + ca = replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadMixed, + staleRead: false, + timeout: time.Second, + label: &metapb.StoreLabel{Key: "id", Value: "2"}, + accessErr: []RegionErrorType{DeadLineExceededErr, ServerIsBusyErr}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store2, replica-read: true, stale-read: false}", + "{addr: store1, replica-read: true, stale-read: false}", + "{addr: store3, replica-read: true, stale-read: false}"}, + respErr: "", + respRegionError: nil, + backoffCnt: 0, + backoffDetail: []string{}, + regionIsValid: true, + }, + } + s.True(s.runCaseAndCompare(ca)) + + s.changeRegionLeader(2) + ca = replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadMixed, + staleRead: false, + timeout: time.Second, + label: &metapb.StoreLabel{Key: "id", Value: "3"}, + accessErr: []RegionErrorType{DeadLineExceededErr, ServerIsBusyErr}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store3, replica-read: true, stale-read: false}", + "{addr: store2, replica-read: 
true, stale-read: false}", + "{addr: store1, replica-read: true, stale-read: false}"}, + respErr: "", + respRegionError: nil, + backoffCnt: 0, + backoffDetail: []string{}, + regionIsValid: true, + }, + } + s.True(s.runCaseAndCompare(ca)) + s.changeRegionLeader(1) + + ca = replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadLeader, + staleRead: false, + timeout: time.Second, + accessErr: []RegionErrorType{NotLeaderErr, DeadLineExceededErr, NotLeaderWithNewLeader2Err}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store1, replica-read: false, stale-read: false}", + "{addr: store2, replica-read: false, stale-read: false}", + "{addr: store3, replica-read: false, stale-read: false}"}, + respErr: "", + respRegionError: fakeEpochNotMatch, + backoffCnt: 1, + backoffDetail: []string{"regionScheduling+1"}, + regionIsValid: true, + }, + } + s.True(s.runCaseAndCompare(ca)) +} + +func TestReplicaReadAccessPathByBasicCase(t *testing.T) { + s := new(testReplicaSelectorSuite) + s.SetupTest(t) + defer s.TearDownTest() + + retryableErrors := []RegionErrorType{ServerIsBusyErr, ServerIsBusyWithEstimatedWaitMsErr, StaleCommandErr, MaxTimestampNotSyncedErr, ProposalInMergingModeErr, ReadIndexNotReadyErr, RegionNotInitializedErr, DiskFullErr} + noRetryErrors := []RegionErrorType{RegionNotFoundErr, KeyNotInRegionErr, EpochNotMatchErr, StoreNotMatchErr, RaftEntryTooLargeErr, RecoveryInProgressErr, FlashbackNotPreparedErr, IsWitnessErr, MismatchPeerIdErr, BucketVersionNotMatchErr} + for _, reqType := range []tikvrpc.CmdType{tikvrpc.CmdGet, tikvrpc.CmdPrewrite} { + for _, readType := range []kv.ReplicaReadType{kv.ReplicaReadLeader, kv.ReplicaReadFollower, kv.ReplicaReadMixed, kv.ReplicaReadPreferLeader} { + if reqType == tikvrpc.CmdPrewrite && readType != kv.ReplicaReadLeader { + // write req only support leader read. + continue + } + for _, staleRead := range []bool{false, true} { + if staleRead && readType != kv.ReplicaReadMixed { + // stale read only support mixed read. 
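// The expectations driven by the switch below boil down to a small mapping from
// retryable region error to the backoff it should record (follower and mixed
// reads skip the ServerIsBusy backoff because they can fail over to another
// replica immediately). A compact sketch of that mapping as data (the string
// keys are illustrative and match the backoff names asserted below):

package main

import "fmt"

func main() {
	expectedBackoff := map[string][]string{
		"ServerIsBusyErr (leader read)": {"tikvServerBusy+1"},
		"ServerIsBusyErr (other reads)": {}, // fast failover, no backoff
		"DiskFullErr":                   {"tikvDiskFull+1"},
		"RegionNotInitializedErr":       {"regionNotInitialized+1"},
		"ReadIndexNotReadyErr":          {"regionScheduling+1"},
		"ProposalInMergingModeErr":      {"regionScheduling+1"},
		"MaxTimestampNotSyncedErr":      {"maxTsNotSynced+1"},
		"StaleCommandErr":               {}, // retried on another replica without backoff
	}
	for errType, backoff := range expectedBackoff {
		fmt.Println(errType, "->", backoff)
	}
}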
+ continue + } + for _, tp := range retryableErrors { + backoff := []string{} + switch tp { + case ServerIsBusyErr, ServerIsBusyWithEstimatedWaitMsErr: + if readType == kv.ReplicaReadLeader { + backoff = []string{"tikvServerBusy+1"} + } + case DiskFullErr: + backoff = []string{"tikvDiskFull+1"} + case RegionNotInitializedErr: + backoff = []string{"regionNotInitialized+1"} + case ReadIndexNotReadyErr, ProposalInMergingModeErr: + backoff = []string{"regionScheduling+1"} + case MaxTimestampNotSyncedErr: + backoff = []string{"maxTsNotSynced+1"} + } + accessPath := []string{ + "{addr: store1, replica-read: true, stale-read: false}", + "{addr: store2, replica-read: true, stale-read: false}", + } + switch readType { + case kv.ReplicaReadLeader: + accessPath = []string{ + "{addr: store1, replica-read: false, stale-read: false}", + "{addr: store1, replica-read: false, stale-read: false}", + } + case kv.ReplicaReadFollower: + if tp == ServerIsBusyErr { + backoff = []string{} + } + accessPath = []string{ + "{addr: store2, replica-read: true, stale-read: false}", + "{addr: store3, replica-read: true, stale-read: false}", + } + case kv.ReplicaReadMixed: + if tp == ServerIsBusyErr { + backoff = []string{} + } + if staleRead { + accessPath = []string{ + "{addr: store1, replica-read: false, stale-read: true}", + "{addr: store2, replica-read: true, stale-read: false}", + } + } + default: + if tp == ServerIsBusyErr { + backoff = []string{} + } + } + ca := replicaSelectorAccessPathCase{ + reqType: reqType, + readType: readType, + staleRead: staleRead, + accessErr: []RegionErrorType{tp}, + expect: &accessPathResult{ + accessPath: accessPath, + respErr: "", + respRegionError: nil, + backoffCnt: len(backoff), + backoffDetail: backoff, + regionIsValid: true, + }, + } + s.True(s.runCaseAndCompare(ca)) + } + + for _, tp := range noRetryErrors { + backoff := []string{} + regionIsValid := false + respErr := "" + respRegionError := tp.GenRegionError() + accessPath := []string{"{addr: store1, replica-read: true, stale-read: false}"} + switch tp { + case RecoveryInProgressErr: + backoff = []string{"regionRecoveryInProgress+1"} + case IsWitnessErr: + backoff = []string{"isWitness+1"} + case BucketVersionNotMatchErr: + regionIsValid = true + case RaftEntryTooLargeErr: + respErr = RaftEntryTooLargeErr.GenRegionError().String() + respRegionError = nil + regionIsValid = true + case FlashbackNotPreparedErr: + respErr = "region 0 is not prepared for the flashback" + respRegionError = nil + regionIsValid = true + } + switch readType { + case kv.ReplicaReadLeader: + accessPath = []string{"{addr: store1, replica-read: false, stale-read: false}"} + case kv.ReplicaReadFollower: + accessPath = []string{"{addr: store2, replica-read: true, stale-read: false}"} + case kv.ReplicaReadMixed: + if staleRead { + accessPath = []string{"{addr: store1, replica-read: false, stale-read: true}"} + } + } + ca := replicaSelectorAccessPathCase{ + reqType: reqType, + readType: readType, + staleRead: staleRead, + accessErr: []RegionErrorType{tp}, + expect: &accessPathResult{ + accessPath: accessPath, + respErr: respErr, + respRegionError: respRegionError, + backoffCnt: len(backoff), + backoffDetail: backoff, + regionIsValid: regionIsValid, + }, + } + s.True(s.runCaseAndCompare(ca)) + } + } } } } -func TestPendingBackoff(t *testing.T) { +func TestReplicaReadAccessPathByLeaderCase(t *testing.T) { s := new(testReplicaSelectorSuite) s.SetupTest(t) defer s.TearDownTest() - loc, err := s.cache.LocateKey(s.bo, []byte("key")) - s.Nil(err) - req := 
tikvrpc.NewRequest(tikvrpc.CmdGet, &kvrpcpb.GetRequest{Key: []byte("key")}) - req.EnableStaleWithMixedReplicaRead() - selector, err := newReplicaSelector(s.cache, loc.Region, req) - s.Nil(err) - bo := retry.NewNoopBackoff(context.Background()) - err = selector.backoffOnRetry(nil, bo) - s.Nil(err) - err = selector.backoffOnRetry(&Store{storeID: 1}, bo) - s.Nil(err) - err = selector.backoffOnNoCandidate(bo) - s.Nil(err) - selector.addPendingBackoff(nil, retry.BoRegionScheduling, errors.New("err-0")) - s.Equal(1, len(selector.pendingBackoffs)) - selector.addPendingBackoff(&Store{storeID: 1}, retry.BoTiKVRPC, errors.New("err-1")) - s.Equal(2, len(selector.pendingBackoffs)) - selector.addPendingBackoff(&Store{storeID: 2}, retry.BoTiKVDiskFull, errors.New("err-2")) - s.Equal(3, len(selector.pendingBackoffs)) - selector.addPendingBackoff(&Store{storeID: 1}, retry.BoTiKVServerBusy, errors.New("err-3")) - s.Equal(3, len(selector.pendingBackoffs)) - _, ok := selector.pendingBackoffs[0] - s.True(ok) - err = selector.backoffOnRetry(nil, bo) - s.NotNil(err) - s.Equal("err-0", err.Error()) - _, ok = selector.pendingBackoffs[0] - s.False(ok) - s.Equal(2, len(selector.pendingBackoffs)) - err = selector.backoffOnRetry(&Store{storeID: 10}, bo) - s.Nil(err) - s.Equal(2, len(selector.pendingBackoffs)) - err = selector.backoffOnNoCandidate(bo) - s.NotNil(err) - s.Equal("err-3", err.Error()) + fakeEpochNotMatch := &errorpb.Error{EpochNotMatch: &errorpb.EpochNotMatch{}} // fake region error, cause by no replica is available. + ca := replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadLeader, + accessErr: nil, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store1, replica-read: false, stale-read: false}", + }, + respErr: "", + respRegionError: nil, + backoffCnt: 0, + backoffDetail: []string{}, + regionIsValid: true, + }, + } + s.True(s.runCaseAndCompare(ca)) + + ca = replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadLeader, + accessErr: []RegionErrorType{ServerIsBusyWithEstimatedWaitMsErr, ServerIsBusyErr}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store1, replica-read: false, stale-read: false}", + "{addr: store1, replica-read: false, stale-read: false}", + "{addr: store1, replica-read: false, stale-read: false}", + }, + respErr: "", + respRegionError: nil, + backoffCnt: 2, + backoffDetail: []string{"tikvServerBusy+2"}, + regionIsValid: true, + }, + } + s.True(s.runCaseAndCompare(ca)) + + ca = replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadLeader, + accessErr: []RegionErrorType{ServerIsBusyErr, ServerIsBusyErr, ServerIsBusyErr, ServerIsBusyErr, ServerIsBusyErr, + ServerIsBusyErr, ServerIsBusyErr, ServerIsBusyErr, ServerIsBusyErr, ServerIsBusyErr, ServerIsBusyErr}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store1, replica-read: false, stale-read: false}", + "{addr: store1, replica-read: false, stale-read: false}", + "{addr: store1, replica-read: false, stale-read: false}", + "{addr: store1, replica-read: false, stale-read: false}", + "{addr: store1, replica-read: false, stale-read: false}", + "{addr: store1, replica-read: false, stale-read: false}", + "{addr: store1, replica-read: false, stale-read: false}", + "{addr: store1, replica-read: false, stale-read: false}", + "{addr: store1, replica-read: false, stale-read: false}", + "{addr: store1, replica-read: false, stale-read: false}", + "{addr: store2, replica-read: false, stale-read: false}", + "{addr: 
store3, replica-read: false, stale-read: false}", + }, + respErr: "", + respRegionError: nil, + backoffCnt: 9, + backoffDetail: []string{"tikvServerBusy+9"}, + regionIsValid: true, + }, + } + s.True(s.runCaseAndCompare(ca)) + + ca = replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadLeader, + accessErr: []RegionErrorType{NotLeaderErr}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store1, replica-read: false, stale-read: false}", + "{addr: store2, replica-read: false, stale-read: false}", + }, + respErr: "", + respRegionError: nil, + backoffCnt: 1, + backoffDetail: []string{"regionScheduling+1"}, + regionIsValid: true, + }, + } + s.True(s.runCaseAndCompare(ca)) + + ca = replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadLeader, + accessErr: []RegionErrorType{NotLeaderErr, NotLeaderErr}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store1, replica-read: false, stale-read: false}", + "{addr: store2, replica-read: false, stale-read: false}", + "{addr: store3, replica-read: false, stale-read: false}", + }, + respErr: "", + respRegionError: nil, + backoffCnt: 2, + backoffDetail: []string{"regionScheduling+2"}, + regionIsValid: true, + }, + } + s.True(s.runCaseAndCompare(ca)) + + ca = replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadLeader, + accessErr: []RegionErrorType{NotLeaderErr, NotLeaderErr, NotLeaderErr}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store1, replica-read: false, stale-read: false}", + "{addr: store2, replica-read: false, stale-read: false}", + "{addr: store3, replica-read: false, stale-read: false}", + }, + respErr: "", + respRegionError: fakeEpochNotMatch, + backoffCnt: 3, + backoffDetail: []string{"regionScheduling+3"}, + regionIsValid: false, + }, + } + s.True(s.runCaseAndCompare(ca)) + + ca = replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadLeader, + accessErr: []RegionErrorType{NotLeaderWithNewLeader3Err}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store1, replica-read: false, stale-read: false}", + "{addr: store3, replica-read: false, stale-read: false}", + }, + respErr: "", + respRegionError: nil, + backoffCnt: 0, // no backoff, fast retry. 
+ backoffDetail: []string{}, + regionIsValid: true, + }, + } + s.True(s.runCaseAndCompare(ca)) + + ca = replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadLeader, + accessErr: []RegionErrorType{NotLeaderWithNewLeader3Err, ServerIsBusyErr}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store1, replica-read: false, stale-read: false}", + "{addr: store3, replica-read: false, stale-read: false}", + "{addr: store3, replica-read: false, stale-read: false}", + }, + respErr: "", + respRegionError: nil, + backoffCnt: 1, + backoffDetail: []string{"tikvServerBusy+1"}, + regionIsValid: true, + }, + } + s.True(s.runCaseAndCompare(ca)) + + ca = replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadLeader, + accessErr: []RegionErrorType{DeadLineExceededErr}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store1, replica-read: false, stale-read: false}", + "{addr: store2, replica-read: false, stale-read: false}", + }, + respErr: "", + respRegionError: nil, + backoffCnt: 1, + backoffDetail: []string{"tikvRPC+1"}, + regionIsValid: true, + }, + } + s.True(s.runCaseAndCompare(ca)) + + ca = replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadLeader, + accessErr: []RegionErrorType{DeadLineExceededErr, NotLeaderWithNewLeader1Err, NotLeaderWithNewLeader1Err}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store1, replica-read: false, stale-read: false}", // Suppose there is network partition between TiDB and store1 + "{addr: store2, replica-read: false, stale-read: false}", // TODO(crazycs520): is this expected, should retry with replica-read? + "{addr: store3, replica-read: false, stale-read: false}", // ditto. + }, + respErr: "", + respRegionError: fakeEpochNotMatch, + backoffCnt: 1, + backoffDetail: []string{"tikvRPC+1"}, + regionIsValid: false, + }, + } + s.True(s.runCaseAndCompare(ca)) + + ca = replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadLeader, + accessErr: []RegionErrorType{DeadLineExceededErr, ServerIsBusyErr, ServerIsBusyErr}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store1, replica-read: false, stale-read: false}", + "{addr: store2, replica-read: false, stale-read: false}", + "{addr: store3, replica-read: false, stale-read: false}"}, + respErr: "", + respRegionError: fakeEpochNotMatch, + backoffCnt: 2, + backoffDetail: []string{"tikvRPC+1", "tikvServerBusy+1"}, + regionIsValid: false, + }, + } + s.True(s.runCaseAndCompare(ca)) + + ca = replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadLeader, + busyThresholdMs: 10, + accessErr: []RegionErrorType{NotLeaderErr, NotLeaderWithNewLeader3Err, ServerIsBusyWithEstimatedWaitMsErr, NotLeaderErr}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store1, replica-read: false, stale-read: false}", + "{addr: store2, replica-read: false, stale-read: false}", + "{addr: store3, replica-read: false, stale-read: false}", + "{addr: store3, replica-read: false, stale-read: false}"}, + respErr: "", + respRegionError: fakeEpochNotMatch, + backoffCnt: 3, + backoffDetail: []string{"regionScheduling+2", "tikvServerBusy+1"}, + regionIsValid: false, + }, + } + s.True(s.runCaseAndCompare(ca)) + + ca = replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadLeader, + busyThresholdMs: 10, + accessErr: []RegionErrorType{NotLeaderErr, NotLeaderWithNewLeader3Err, ServerIsBusyWithEstimatedWaitMsErr, 
ServerIsBusyErr}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store1, replica-read: false, stale-read: false}", + "{addr: store2, replica-read: false, stale-read: false}", + "{addr: store3, replica-read: false, stale-read: false}", + "{addr: store3, replica-read: false, stale-read: false}", + "{addr: store3, replica-read: false, stale-read: false}"}, + respErr: "", + respRegionError: nil, + backoffCnt: 3, + backoffDetail: []string{"regionScheduling+1", "tikvServerBusy+2"}, + regionIsValid: true, + }, + } + s.True(s.runCaseAndCompare(ca)) + + // Test for switch leader. + cas := []replicaSelectorAccessPathCase{ + { + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadLeader, + accessErr: []RegionErrorType{NotLeaderErr}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store1, replica-read: false, stale-read: false}", + "{addr: store2, replica-read: false, stale-read: false}", + }, + respErr: "", + respRegionError: nil, + backoffCnt: 1, + backoffDetail: []string{"regionScheduling+1"}, + regionIsValid: true, + }, + afterRun: func() { /* don't invalid region */ }, + }, + { + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadLeader, + accessErr: []RegionErrorType{NotLeaderWithNewLeader3Err}, + beforeRun: func() { /* don't resetStoreState */ }, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store2, replica-read: false, stale-read: false}", // try new leader directly. + "{addr: store3, replica-read: false, stale-read: false}", + }, + respErr: "", + respRegionError: nil, + backoffCnt: 0, + backoffDetail: []string{}, + regionIsValid: true, + }, + afterRun: func() { /* don't invalid region */ }, + }, + { + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadLeader, + accessErr: []RegionErrorType{ServerIsBusyErr}, + beforeRun: func() { /* don't resetStoreState */ }, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store3, replica-read: false, stale-read: false}", // try new leader directly. 
+ "{addr: store3, replica-read: false, stale-read: false}", + }, + respErr: "", + respRegionError: nil, + backoffCnt: 1, + backoffDetail: []string{"tikvServerBusy+1"}, + regionIsValid: true, + }, + }, + } + s.True(s.runMultiCaseAndCompare(cas)) +} + +func TestReplicaReadAccessPathByFollowerCase(t *testing.T) { + s := new(testReplicaSelectorSuite) + s.SetupTest(t) + defer s.TearDownTest() + + fakeEpochNotMatch := &errorpb.Error{EpochNotMatch: &errorpb.EpochNotMatch{}} + ca := replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadFollower, + accessErr: nil, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store2, replica-read: true, stale-read: false}", + }, + respErr: "", + respRegionError: nil, + backoffCnt: 0, + backoffDetail: []string{}, + regionIsValid: true, + }, + } + s.True(s.runCaseAndCompare(ca)) + + ca = replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadFollower, + accessErr: []RegionErrorType{ServerIsBusyWithEstimatedWaitMsErr, ServerIsBusyErr}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store2, replica-read: true, stale-read: false}", + "{addr: store3, replica-read: true, stale-read: false}", + "{addr: store1, replica-read: true, stale-read: false}", + }, + respErr: "", + respRegionError: nil, + backoffCnt: 0, + backoffDetail: []string{}, + regionIsValid: true, + }, + } + s.True(s.runCaseAndCompare(ca)) + + ca = replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadFollower, + accessErr: []RegionErrorType{ServerIsBusyErr, ServerIsBusyWithEstimatedWaitMsErr, ServerIsBusyErr}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store2, replica-read: true, stale-read: false}", + "{addr: store3, replica-read: true, stale-read: false}", + "{addr: store1, replica-read: true, stale-read: false}", + }, + respErr: "", + respRegionError: fakeEpochNotMatch, + backoffCnt: 1, + backoffDetail: []string{"tikvServerBusy+1"}, + regionIsValid: false, + }, + } + s.True(s.runCaseAndCompare(ca)) + + ca = replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadFollower, + accessErr: []RegionErrorType{DeadLineExceededErr, ServerIsBusyErr}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store2, replica-read: true, stale-read: false}", + "{addr: store3, replica-read: true, stale-read: false}", + "{addr: store1, replica-read: true, stale-read: false}", + }, + respErr: "", + respRegionError: nil, + backoffCnt: 1, + backoffDetail: []string{"tikvRPC+1"}, + regionIsValid: true, + }, + } + s.True(s.runCaseAndCompare(ca)) + + ca = replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadFollower, + timeout: time.Second, + accessErr: []RegionErrorType{DeadLineExceededErr, ServerIsBusyErr}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store2, replica-read: true, stale-read: false}", + "{addr: store3, replica-read: true, stale-read: false}", + "{addr: store1, replica-read: true, stale-read: false}", + }, + respErr: "", + respRegionError: nil, + backoffCnt: 0, + backoffDetail: []string{}, + regionIsValid: true, + }, + } + s.True(s.runCaseAndCompare(ca)) +} + +func TestReplicaReadAccessPathByMixedAndPreferLeaderCase(t *testing.T) { + s := new(testReplicaSelectorSuite) + s.SetupTest(t) + defer s.TearDownTest() + + fakeEpochNotMatch := &errorpb.Error{EpochNotMatch: &errorpb.EpochNotMatch{}} + var ca replicaSelectorAccessPathCase + // since leader in store1, so ReplicaReadMixed and 
ReplicaReadPreferLeader will have the same access path. + for _, readType := range []kv.ReplicaReadType{kv.ReplicaReadMixed, kv.ReplicaReadPreferLeader} { + ca = replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdGet, + readType: readType, + accessErr: nil, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store1, replica-read: true, stale-read: false}", + }, + respErr: "", + respRegionError: nil, + backoffCnt: 0, + backoffDetail: []string{}, + regionIsValid: true, + }, + } + s.True(s.runCaseAndCompare(ca)) + + ca = replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdGet, + readType: readType, + accessErr: []RegionErrorType{ServerIsBusyWithEstimatedWaitMsErr, StaleCommandErr, ServerIsBusyErr}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store1, replica-read: true, stale-read: false}", + "{addr: store2, replica-read: true, stale-read: false}", + "{addr: store3, replica-read: true, stale-read: false}", + }, + respErr: "", + respRegionError: fakeEpochNotMatch, + backoffCnt: 1, + backoffDetail: []string{"tikvServerBusy+1"}, + regionIsValid: false, + }, + } + s.True(s.runCaseAndCompare(ca)) + + ca = replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdGet, + readType: readType, + accessErr: []RegionErrorType{ServerIsBusyErr, RegionNotFoundErr}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store1, replica-read: true, stale-read: false}", + "{addr: store2, replica-read: true, stale-read: false}", + }, + respErr: "", + respRegionError: RegionNotFoundErr.GenRegionError(), + backoffCnt: 0, + backoffDetail: []string{}, + regionIsValid: false, + }, + } + s.True(s.runCaseAndCompare(ca)) + + ca = replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdGet, + readType: readType, + accessErr: []RegionErrorType{DeadLineExceededErr, ServerIsBusyErr}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store1, replica-read: true, stale-read: false}", + "{addr: store2, replica-read: true, stale-read: false}", + "{addr: store3, replica-read: true, stale-read: false}", + }, + respErr: "", + respRegionError: nil, + backoffCnt: 1, + backoffDetail: []string{"tikvRPC+1"}, + regionIsValid: true, + }, + } + s.True(s.runCaseAndCompare(ca)) + + ca = replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdGet, + readType: readType, + timeout: time.Second, + accessErr: []RegionErrorType{DeadLineExceededErr, ServerIsBusyErr}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store1, replica-read: true, stale-read: false}", + "{addr: store2, replica-read: true, stale-read: false}", + "{addr: store3, replica-read: true, stale-read: false}", + }, + respErr: "", + respRegionError: nil, + backoffCnt: 0, + backoffDetail: []string{}, + regionIsValid: true, + }, + } + s.True(s.runCaseAndCompare(ca)) + } + + ca = replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadMixed, + label: &metapb.StoreLabel{Key: "id", Value: "2"}, + accessErr: []RegionErrorType{DeadLineExceededErr, ServerIsBusyErr}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store2, replica-read: true, stale-read: false}", // try match label first. 
+ "{addr: store1, replica-read: true, stale-read: false}", + "{addr: store3, replica-read: true, stale-read: false}", + }, + respErr: "", + respRegionError: nil, + backoffCnt: 1, + backoffDetail: []string{"tikvRPC+1"}, + regionIsValid: true, + }, + } + s.True(s.runCaseAndCompare(ca)) + + s.changeRegionLeader(3) + ca = replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadPreferLeader, + accessErr: []RegionErrorType{DeadLineExceededErr, ServerIsBusyErr}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store3, replica-read: true, stale-read: false}", // try leader first. + "{addr: store1, replica-read: true, stale-read: false}", + "{addr: store2, replica-read: true, stale-read: false}", + }, + respErr: "", + respRegionError: nil, + backoffCnt: 1, + backoffDetail: []string{"tikvRPC+1"}, + regionIsValid: true, + }, + } + s.True(s.runCaseAndCompare(ca)) + + ca = replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadPreferLeader, + label: &metapb.StoreLabel{Key: "id", Value: "2"}, + accessErr: []RegionErrorType{DeadLineExceededErr, ServerIsBusyErr}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store2, replica-read: true, stale-read: false}", // try match label first, since match label has higher priority. + "{addr: store3, replica-read: true, stale-read: false}", // try leader. + "{addr: store1, replica-read: true, stale-read: false}", + }, + respErr: "", + respRegionError: nil, + backoffCnt: 1, + backoffDetail: []string{"tikvRPC+1"}, + regionIsValid: true, + }, + } + s.True(s.runCaseAndCompare(ca)) + s.changeRegionLeader(1) +} + +func TestReplicaReadAccessPathByStaleReadCase(t *testing.T) { + s := new(testReplicaSelectorSuite) + s.SetupTest(t) + defer s.TearDownTest() + + fakeEpochNotMatch := &errorpb.Error{EpochNotMatch: &errorpb.EpochNotMatch{}} + ca := replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadMixed, + staleRead: true, + accessErr: nil, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store1, replica-read: false, stale-read: true}", + }, + respErr: "", + respRegionError: nil, + backoffCnt: 0, + backoffDetail: []string{}, + regionIsValid: true, + }, + } + s.True(s.runCaseAndCompare(ca)) + + ca = replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadMixed, + staleRead: true, + accessErr: []RegionErrorType{DataIsNotReadyErr}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store1, replica-read: false, stale-read: true}", + "{addr: store2, replica-read: true, stale-read: false}", + }, + respErr: "", + respRegionError: nil, + backoffCnt: 0, + backoffDetail: []string{}, + regionIsValid: true, + }, + } + s.True(s.runCaseAndCompare(ca)) + + // test stale read with label. + ca = replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadMixed, + staleRead: true, + label: &metapb.StoreLabel{Key: "id", Value: "2"}, + accessErr: []RegionErrorType{DataIsNotReadyErr}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store2, replica-read: false, stale-read: true}", + "{addr: store1, replica-read: false, stale-read: false}", // try leader with leader read. 
+ }, + respErr: "", + respRegionError: nil, + backoffCnt: 0, + backoffDetail: []string{}, + regionIsValid: true, + }, + } + s.True(s.runCaseAndCompare(ca)) + + ca = replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadMixed, + staleRead: true, + accessErr: []RegionErrorType{DataIsNotReadyErr, ServerIsBusyErr, ServerIsBusyErr}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store1, replica-read: false, stale-read: true}", + "{addr: store2, replica-read: true, stale-read: false}", + "{addr: store3, replica-read: true, stale-read: false}", + }, + respErr: "", + respRegionError: fakeEpochNotMatch, + backoffCnt: 1, + backoffDetail: []string{"tikvServerBusy+1"}, + regionIsValid: false, + }, + } + s.True(s.runCaseAndCompare(ca)) + + ca = replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadMixed, + staleRead: true, + label: &metapb.StoreLabel{Key: "id", Value: "2"}, + accessErr: []RegionErrorType{DataIsNotReadyErr, NotLeaderWithNewLeader3Err}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store2, replica-read: false, stale-read: true}", + "{addr: store1, replica-read: false, stale-read: false}", // try leader with leader read. + "{addr: store3, replica-read: false, stale-read: false}", // try new leader. + }, + respErr: "", + respRegionError: nil, + backoffCnt: 0, + backoffDetail: []string{}, + regionIsValid: true, + }, + } + s.True(s.runCaseAndCompare(ca)) + + ca = replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadMixed, + staleRead: true, + label: &metapb.StoreLabel{Key: "id", Value: "2"}, + accessErr: []RegionErrorType{DataIsNotReadyErr, ServerIsBusyErr, ServerIsBusyErr, ServerIsBusyErr}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store2, replica-read: false, stale-read: true}", + "{addr: store1, replica-read: false, stale-read: false}", // try leader with leader read. + "{addr: store2, replica-read: true, stale-read: false}", + "{addr: store3, replica-read: true, stale-read: false}", + }, + respErr: "", + respRegionError: fakeEpochNotMatch, + backoffCnt: 1, + backoffDetail: []string{"tikvServerBusy+1"}, + regionIsValid: false, + }, + } + s.True(s.runCaseAndCompare(ca)) + + s.changeRegionLeader(2) + ca = replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadMixed, + staleRead: true, + accessErr: []RegionErrorType{DataIsNotReadyErr, ServerIsBusyErr, ServerIsBusyErr, ServerIsBusyErr}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store1, replica-read: false, stale-read: true}", + "{addr: store2, replica-read: false, stale-read: false}", // try leader with leader read. + "{addr: store3, replica-read: true, stale-read: false}", + "{addr: store1, replica-read: true, stale-read: false}", + }, + respErr: "", + respRegionError: fakeEpochNotMatch, + backoffCnt: 1, + backoffDetail: []string{"tikvServerBusy+1"}, + regionIsValid: false, + }, + } + s.True(s.runCaseAndCompare(ca)) + s.changeRegionLeader(1) + + ca = replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadMixed, + staleRead: true, + label: &metapb.StoreLabel{Key: "id", Value: "2"}, + accessErr: []RegionErrorType{DataIsNotReadyErr, NotLeaderErr, ServerIsBusyErr, ServerIsBusyErr}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store2, replica-read: false, stale-read: true}", + "{addr: store1, replica-read: false, stale-read: false}", // try leader with leader read. 
+ "{addr: store2, replica-read: true, stale-read: false}", // retry store2 with replica-read. + "{addr: store3, replica-read: true, stale-read: false}", + }, + respErr: "", + respRegionError: fakeEpochNotMatch, + backoffCnt: 2, + backoffDetail: []string{"regionScheduling+1", "tikvServerBusy+1"}, + regionIsValid: false, + }, + } + s.True(s.runCaseAndCompare(ca)) + + ca = replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadMixed, + staleRead: true, + label: &metapb.StoreLabel{Key: "id", Value: "2"}, + accessErr: []RegionErrorType{DataIsNotReadyErr, NotLeaderWithNewLeader3Err, ServerIsBusyErr, NotLeaderWithNewLeader2Err, ServerIsBusyErr}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store2, replica-read: false, stale-read: true}", + "{addr: store1, replica-read: false, stale-read: false}", + "{addr: store3, replica-read: false, stale-read: false}", + "{addr: store3, replica-read: false, stale-read: false}", + "{addr: store2, replica-read: false, stale-read: false}", + "{addr: store2, replica-read: false, stale-read: false}", + }, + respErr: "", + respRegionError: nil, + backoffCnt: 2, + backoffDetail: []string{"tikvServerBusy+2"}, + regionIsValid: true, + }, + } + s.True(s.runCaseAndCompare(ca)) + + ca = replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadMixed, + staleRead: true, + label: &metapb.StoreLabel{Key: "id", Value: "2"}, + accessErr: []RegionErrorType{DataIsNotReadyErr, NotLeaderErr, ServerIsBusyErr, ServerIsBusyErr}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store2, replica-read: false, stale-read: true}", + "{addr: store1, replica-read: false, stale-read: false}", + "{addr: store2, replica-read: true, stale-read: false}", + "{addr: store3, replica-read: true, stale-read: false}", + }, + respErr: "", + respRegionError: fakeEpochNotMatch, + backoffCnt: 2, + backoffDetail: []string{"regionScheduling+1", "tikvServerBusy+1"}, + regionIsValid: false, + }, + } + s.True(s.runCaseAndCompare(ca)) + + ca = replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadMixed, + staleRead: true, + label: &metapb.StoreLabel{Key: "id", Value: "2"}, + accessErr: []RegionErrorType{DataIsNotReadyErr, RegionNotFoundErr}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store2, replica-read: false, stale-read: true}", + "{addr: store1, replica-read: false, stale-read: false}", + }, + respErr: "", + respRegionError: RegionNotFoundErr.GenRegionError(), + backoffCnt: 0, + backoffDetail: []string{}, + regionIsValid: false, + }, + } + s.True(s.runCaseAndCompare(ca)) + + ca = replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadMixed, + staleRead: true, + timeout: time.Second, + accessErr: []RegionErrorType{DeadLineExceededErr, ServerIsBusyErr}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store1, replica-read: false, stale-read: true}", + "{addr: store2, replica-read: true, stale-read: false}", + "{addr: store3, replica-read: true, stale-read: false}", + }, + respErr: "", + respRegionError: nil, + backoffCnt: 0, + backoffDetail: []string{}, + regionIsValid: true, + }, + } + s.True(s.runCaseAndCompare(ca)) + + ca = replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadMixed, + staleRead: true, + timeout: time.Second, + label: &metapb.StoreLabel{Key: "id", Value: "2"}, + accessErr: []RegionErrorType{DeadLineExceededErr, ServerIsBusyErr}, + expect: &accessPathResult{ + accessPath: []string{ + 
"{addr: store2, replica-read: false, stale-read: true}", + "{addr: store1, replica-read: false, stale-read: false}", + "{addr: store3, replica-read: true, stale-read: false}", + }, + respErr: "", + respRegionError: nil, + backoffCnt: 0, + backoffDetail: []string{}, + regionIsValid: true, + }, + } + s.True(s.runCaseAndCompare(ca)) + + ca = replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadMixed, + staleRead: true, + timeout: time.Second, + label: &metapb.StoreLabel{Key: "id", Value: "2"}, + accessErr: []RegionErrorType{DeadLineExceededErr, DeadLineExceededErr}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store2, replica-read: false, stale-read: true}", + "{addr: store1, replica-read: false, stale-read: false}", + "{addr: store3, replica-read: true, stale-read: false}", + }, + respErr: "", + respRegionError: nil, + backoffCnt: 0, + backoffDetail: []string{}, + regionIsValid: true, + }, + } + s.True(s.runCaseAndCompare(ca)) + + cas := []replicaSelectorAccessPathCase{ + { + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadMixed, + staleRead: true, + timeout: 0, + label: &metapb.StoreLabel{Key: "id", Value: "2"}, + accessErr: []RegionErrorType{DeadLineExceededErr, DeadLineExceededErr}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store2, replica-read: false, stale-read: true}", + "{addr: store1, replica-read: false, stale-read: false}", + "{addr: store3, replica-read: true, stale-read: false}", + }, + respErr: "", + respRegionError: nil, + backoffCnt: 2, + backoffDetail: []string{"tikvRPC+2"}, + regionIsValid: true, + }, + afterRun: func() { /* don't invalid region */ }, + }, + { + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadMixed, + staleRead: true, + label: &metapb.StoreLabel{Key: "id", Value: "2"}, + accessErr: []RegionErrorType{ServerIsBusyErr}, + beforeRun: func() { /* don't resetStoreState */ }, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store3, replica-read: true, stale-read: false}", + }, + respErr: "", + respRegionError: fakeEpochNotMatch, + backoffCnt: 1, + backoffDetail: []string{"tikvServerBusy+1"}, + regionIsValid: false, + }, + afterRun: func() { /* don't invalid region */ }, + }, + } + s.True(s.runMultiCaseAndCompare(cas)) } func TestReplicaReadAccessPathByTryIdleReplicaCase(t *testing.T) { @@ -549,7 +2020,7 @@ func TestReplicaReadAccessPathByTryIdleReplicaCase(t *testing.T) { reqType: tikvrpc.CmdPrewrite, readType: kv.ReplicaReadLeader, busyThresholdMs: 10, - accessErr: []RegionErrorType{ServerIsBusyWithEstimatedWaitMsErr, ServerIsBusyWithEstimatedWaitMsErr}, + accessErr: []RegionErrorType{ServerIsBusyErr, ServerIsBusyErr}, expect: &accessPathResult{ accessPath: []string{ "{addr: store1, replica-read: false, stale-read: false}", @@ -565,6 +2036,26 @@ func TestReplicaReadAccessPathByTryIdleReplicaCase(t *testing.T) { } s.True(s.runCaseAndCompare(ca)) + ca = replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadLeader, + busyThresholdMs: 10, + accessErr: []RegionErrorType{ServerIsBusyWithEstimatedWaitMsErr, ServerIsBusyWithEstimatedWaitMsErr}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store1, replica-read: false, stale-read: false}", + "{addr: store2, replica-read: true, stale-read: false}", + "{addr: store3, replica-read: true, stale-read: false}", + }, + respErr: "", + respRegionError: nil, + backoffCnt: 0, + backoffDetail: []string{}, + regionIsValid: true, + }, + } + s.True(s.runCaseAndCompare(ca)) + ca = 
replicaSelectorAccessPathCase{ reqType: tikvrpc.CmdGet, readType: kv.ReplicaReadLeader, @@ -628,6 +2119,400 @@ func TestReplicaReadAccessPathByTryIdleReplicaCase(t *testing.T) { s.True(s.runCaseAndCompare(ca)) } +func TestReplicaReadAccessPathByFlashbackInProgressCase(t *testing.T) { + s := new(testReplicaSelectorSuite) + s.SetupTest(t) + defer s.TearDownTest() + + var ca replicaSelectorAccessPathCase + fakeEpochNotMatch := &errorpb.Error{EpochNotMatch: &errorpb.EpochNotMatch{}} + ca = replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadLeader, + staleRead: false, + timeout: time.Second, + label: nil, + accessErr: []RegionErrorType{FlashbackInProgressErr}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store1, replica-read: false, stale-read: false}", + }, + respErr: "region 0 is in flashback progress, FlashbackStartTS is 0", + respRegionError: nil, + backoffCnt: 0, + backoffDetail: []string{}, + regionIsValid: true, + }, + } + s.True(s.runCaseAndCompare(ca)) + + ca = replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadLeader, + staleRead: false, + timeout: time.Second, + label: nil, + accessErr: []RegionErrorType{DeadLineExceededErr, FlashbackInProgressErr, FlashbackInProgressErr}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store1, replica-read: false, stale-read: false}", + "{addr: store2, replica-read: true, stale-read: false}", + "{addr: store3, replica-read: true, stale-read: false}", + }, + respErr: "", + respRegionError: fakeEpochNotMatch, + backoffCnt: 0, + backoffDetail: []string{}, + regionIsValid: true, + }, + } + s.True(s.runCaseAndCompare(ca)) + + // not compatible case. + ca = replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadMixed, + staleRead: false, + timeout: 0, + label: nil, + accessErr: []RegionErrorType{FlashbackInProgressErr}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store1, replica-read: true, stale-read: false}", + "{addr: store1, replica-read: true, stale-read: false}", + }, + respErr: "", + respRegionError: nil, + backoffCnt: 0, + backoffDetail: []string{}, + regionIsValid: true, + }, + } + s.True(s.runCase(ca, false)) + ca.expect = &accessPathResult{ + accessPath: []string{ + "{addr: store1, replica-read: true, stale-read: false}", + }, + respErr: "region 0 is in flashback progress, FlashbackStartTS is 0", + respRegionError: nil, + backoffCnt: 0, + backoffDetail: []string{}, + regionIsValid: true, + } + s.True(s.runCase(ca, true)) + + ca = replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadMixed, + staleRead: false, + timeout: time.Second, + label: nil, + accessErr: []RegionErrorType{DeadLineExceededErr, FlashbackInProgressErr, FlashbackInProgressErr}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store1, replica-read: true, stale-read: false}", + "{addr: store2, replica-read: true, stale-read: false}", + "{addr: store1, replica-read: true, stale-read: false}", + }, + respErr: "region 0 is in flashback progress, FlashbackStartTS is 0", + respRegionError: nil, + backoffCnt: 0, + backoffDetail: []string{}, + regionIsValid: true, + }, + } + s.True(s.runCase(ca, false)) + ca.expect = &accessPathResult{ + accessPath: []string{ + "{addr: store1, replica-read: true, stale-read: false}", + "{addr: store2, replica-read: true, stale-read: false}", + "{addr: store3, replica-read: true, stale-read: false}", + }, + respErr: "", + respRegionError: fakeEpochNotMatch, + 
backoffCnt: 0, + backoffDetail: []string{}, + regionIsValid: true, + } + s.True(s.runCase(ca, true)) +} + +func TestReplicaReadAccessPathByProxyCase(t *testing.T) { + s := new(testReplicaSelectorSuite) + s.SetupTest(t) + defer s.TearDownTest() + + // Enable forwarding. + s.cache.enableForwarding = true + fakeEpochNotMatch := &errorpb.Error{EpochNotMatch: &errorpb.EpochNotMatch{}} + var ca replicaSelectorAccessPathCase + cas := []replicaSelectorAccessPathCase{ + { + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadLeader, + accessErr: []RegionErrorType{DeadLineExceededErr}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store1, replica-read: false, stale-read: false}", + "{addr: store2, replica-read: false, stale-read: false, forward_addr: store1}", + }, + respErr: "", + respRegionError: nil, + backoffCnt: 1, + backoffDetail: []string{"tikvRPC+1"}, + regionIsValid: true, + }, + afterRun: func() { /* don't invalid region */ }, + }, + { + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadLeader, + accessErr: []RegionErrorType{}, + beforeRun: func() { /* don't resetStoreState */ }, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store2, replica-read: false, stale-read: false, forward_addr: store1}", // access to known proxy direct. + }, + respErr: "", + respRegionError: nil, + backoffCnt: 0, + backoffDetail: []string{}, + regionIsValid: true, + }, + }, + } + s.True(s.runMultiCaseAndCompare(cas)) + + ca = replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadLeader, + accessErr: []RegionErrorType{DeadLineExceededErr, DeadLineExceededErr, DeadLineExceededErr}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store1, replica-read: false, stale-read: false}", + "{addr: store2, replica-read: false, stale-read: false, forward_addr: store1}", + "{addr: store3, replica-read: false, stale-read: false, forward_addr: store1}", + }, + respErr: "", + respRegionError: fakeEpochNotMatch, + backoffCnt: 3, + backoffDetail: []string{"tikvRPC+3"}, + regionIsValid: false, + }, + } + s.True(s.runCaseAndCompare(ca)) + + ca = replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadLeader, + accessErr: []RegionErrorType{DeadLineExceededErr, NotLeaderWithNewLeader2Err, DeadLineExceededErr, ServerIsBusyErr}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store1, replica-read: false, stale-read: false}", + "{addr: store2, replica-read: false, stale-read: false, forward_addr: store1}", + "{addr: store2, replica-read: false, stale-read: false}", + "{addr: store3, replica-read: false, stale-read: false, forward_addr: store2}", + }, + respErr: "", + respRegionError: fakeEpochNotMatch, + backoffCnt: 3, + backoffDetail: []string{"tikvRPC+2", "tikvServerBusy+1"}, + regionIsValid: false, + }, + } + s.True(s.runCaseAndCompare(ca)) + + ca = replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadLeader, + accessErr: []RegionErrorType{DeadLineExceededErr, ServerIsBusyWithEstimatedWaitMsErr, ServerIsBusyWithEstimatedWaitMsErr}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store1, replica-read: false, stale-read: false}", + "{addr: store2, replica-read: false, stale-read: false, forward_addr: store1}", + "{addr: store3, replica-read: false, stale-read: false, forward_addr: store1}", + }, + respErr: "", + respRegionError: fakeEpochNotMatch, + backoffCnt: 3, + backoffDetail: []string{"tikvRPC+1", "tikvServerBusy+2"}, + regionIsValid: false, + }, + } + 
s.True(s.runCaseAndCompare(ca)) + + ca = replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdPrewrite, + readType: kv.ReplicaReadLeader, + accessErr: []RegionErrorType{NotLeaderErr}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store1, replica-read: false, stale-read: false}", + "{addr: store2, replica-read: false, stale-read: false}", + }, + respErr: "", + respRegionError: nil, + backoffCnt: 1, + backoffDetail: []string{"regionScheduling+1"}, + regionIsValid: true, + }, + } + s.True(s.runCaseAndCompare(ca)) + + // TODO: maybe we can optimize the proxy strategy in future. + ca = replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdPrewrite, + readType: kv.ReplicaReadLeader, + accessErr: []RegionErrorType{DeadLineExceededErr, NotLeaderErr}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store1, replica-read: false, stale-read: false}", + "{addr: store2, replica-read: false, stale-read: false, forward_addr: store1}", + "{addr: store3, replica-read: false, stale-read: false}", + }, + respErr: "", + respRegionError: nil, + backoffCnt: 2, + backoffDetail: []string{"regionScheduling+1", "tikvRPC+1"}, + regionIsValid: true, + }, + } + s.True(s.runCaseAndCompare(ca)) + + ca = replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdPrewrite, + readType: kv.ReplicaReadLeader, + accessErr: []RegionErrorType{NotLeaderWithNewLeader2Err, DeadLineExceededErr, NotLeaderErr}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store1, replica-read: false, stale-read: false}", + "{addr: store2, replica-read: false, stale-read: false}", + "{addr: store3, replica-read: false, stale-read: false, forward_addr: store2}"}, + respErr: "", + respRegionError: fakeEpochNotMatch, + backoffCnt: 2, + backoffDetail: []string{"regionScheduling+1", "tikvRPC+1"}, + regionIsValid: false, + }, + } + s.True(s.runCaseAndCompare(ca)) +} + +func TestReplicaReadAccessPathByLearnerCase(t *testing.T) { + s := new(testReplicaSelectorSuite) + s.SetupTest(t) + defer s.TearDownTest() + + // Add a TiKV learner peer to the region. + rc := s.getRegion() + storeID := uint64(4) + s.cluster.AddStore(storeID, fmt.Sprintf("store%d", storeID)) + s.cluster.AddLearner(rc.meta.Id, storeID, s.cluster.AllocID()) + rc.invalidate(Other) // invalid region cache to reload region. + + ca := replicaSelectorAccessPathCase{ + reqType: tikvrpc.CmdGet, + readType: kv.ReplicaReadLearner, + accessErr: []RegionErrorType{ServerIsBusyErr}, + expect: &accessPathResult{ + accessPath: []string{ + "{addr: store4, replica-read: true, stale-read: false}", + "{addr: store1, replica-read: true, stale-read: false}", + }, + respErr: "", + respRegionError: nil, + backoffCnt: 0, + backoffDetail: []string{}, + regionIsValid: true, + }, + } + s.True(s.runCaseAndCompare(ca)) +} + +func TestReplicaReadAccessPathByGenError(t *testing.T) { + s := new(testReplicaSelectorSuite) + s.SetupTest(t) + defer s.TearDownTest() + + maxAccessErrCnt := 6 + if israce.RaceEnabled { + // When run this test with race, it will take a long time, so we reduce the maxAccessErrCnt to 3 to speed up test to avoid timeout. 
+ maxAccessErrCnt = 3 + } + totalValidCaseCount := 0 + totalCaseCount := 0 + lastLogCnt := 0 + testCase := func(req tikvrpc.CmdType, readType kv.ReplicaReadType, staleRead bool, timeout time.Duration, busyThresholdMs uint32, label *metapb.StoreLabel) { + isRead := isReadReq(req) + accessErrGen := newAccessErrGenerator(isRead, staleRead, maxAccessErrCnt) + for { + accessErr, done := accessErrGen.genAccessErr(staleRead) + if done { + break + } + ca := replicaSelectorAccessPathCase{ + reqType: req, + readType: readType, + staleRead: staleRead, + timeout: timeout, + busyThresholdMs: busyThresholdMs, + label: label, + accessErr: accessErr, + } + valid := s.runCaseAndCompare(ca) + if valid { + totalValidCaseCount++ + } + totalCaseCount++ + if totalCaseCount-lastLogCnt > 100000 { + lastLogCnt = totalCaseCount + logutil.BgLogger().Info("TestReplicaReadAccessPathByGenError is running", + zap.Int("total-case", totalCaseCount), + zap.Int("valid-case", totalValidCaseCount), + zap.Int("invalid-case", totalCaseCount-totalValidCaseCount), + zap.String("req", req.String()), + zap.String("read-type", readType.String()), + zap.Bool("stale-read", staleRead), + zap.Duration("timeout", timeout), + zap.Any("label", label), + ) + } + } + } + + testCase(tikvrpc.CmdPrewrite, kv.ReplicaReadLeader, false, 0, 0, nil) + testCase(tikvrpc.CmdPrewrite, kv.ReplicaReadLeader, false, 0, 10, nil) + testCase(tikvrpc.CmdGet, kv.ReplicaReadLeader, false, 0, 0, nil) + testCase(tikvrpc.CmdGet, kv.ReplicaReadLeader, false, 0, 10, nil) + testCase(tikvrpc.CmdGet, kv.ReplicaReadFollower, false, 0, 0, nil) + testCase(tikvrpc.CmdGet, kv.ReplicaReadPreferLeader, false, 0, 0, nil) + testCase(tikvrpc.CmdGet, kv.ReplicaReadMixed, false, 0, 0, nil) + testCase(tikvrpc.CmdGet, kv.ReplicaReadLeader, false, time.Second, 0, nil) + testCase(tikvrpc.CmdGet, kv.ReplicaReadMixed, false, time.Second, 0, nil) + testCase(tikvrpc.CmdGet, kv.ReplicaReadMixed, false, time.Second, 0, &metapb.StoreLabel{Key: "id", Value: "1"}) + testCase(tikvrpc.CmdGet, kv.ReplicaReadMixed, false, time.Second, 0, &metapb.StoreLabel{Key: "id", Value: "2"}) + testCase(tikvrpc.CmdGet, kv.ReplicaReadMixed, false, time.Second, 0, &metapb.StoreLabel{Key: "id", Value: "3"}) + testCase(tikvrpc.CmdGet, kv.ReplicaReadMixed, true, 0, 0, nil) + testCase(tikvrpc.CmdGet, kv.ReplicaReadMixed, true, 0, 0, &metapb.StoreLabel{Key: "id", Value: "1"}) + testCase(tikvrpc.CmdGet, kv.ReplicaReadMixed, true, 0, 0, &metapb.StoreLabel{Key: "id", Value: "2"}) + testCase(tikvrpc.CmdGet, kv.ReplicaReadMixed, true, 0, 0, &metapb.StoreLabel{Key: "id", Value: "3"}) + testCase(tikvrpc.CmdGet, kv.ReplicaReadMixed, true, time.Second, 0, &metapb.StoreLabel{Key: "id", Value: "1"}) + testCase(tikvrpc.CmdGet, kv.ReplicaReadMixed, true, time.Second, 0, &metapb.StoreLabel{Key: "id", Value: "2"}) + testCase(tikvrpc.CmdGet, kv.ReplicaReadMixed, true, time.Second, 0, &metapb.StoreLabel{Key: "id", Value: "3"}) + + // Test for forwarding proxy. 
+ s.cache.enableForwarding = true + testCase(tikvrpc.CmdPrewrite, kv.ReplicaReadLeader, false, 0, 0, nil) + testCase(tikvrpc.CmdPrewrite, kv.ReplicaReadLeader, false, 0, 10, nil) + testCase(tikvrpc.CmdGet, kv.ReplicaReadLeader, false, 0, 0, nil) + testCase(tikvrpc.CmdGet, kv.ReplicaReadLeader, false, 0, 10, nil) + + logutil.BgLogger().Info("TestReplicaReadAccessPathByGenError Finished", + zap.Int("total-case", totalCaseCount), + zap.Int("valid-case", totalValidCaseCount), + zap.Int("invalid-case", totalCaseCount-totalValidCaseCount)) +} + func (s *testReplicaSelectorSuite) changeRegionLeader(storeId uint64) { loc, err := s.cache.LocateKey(s.bo, []byte("key")) s.Nil(err) @@ -641,41 +2526,98 @@ func (s *testReplicaSelectorSuite) changeRegionLeader(storeId uint64) { s.cache.InvalidateCachedRegion(loc.Region) } -func (s *testReplicaSelectorSuite) runCaseAndCompare(ca2 replicaSelectorAccessPathCase) bool { - ca2.run(s) - if ca2.accessErrInValid { - // the case has been marked as invalid, just ignore it. - return false +func (s *testReplicaSelectorSuite) runCaseAndCompare(ca1 replicaSelectorAccessPathCase) bool { + ca2 := ca1 + config.UpdateGlobal(func(conf *config.Config) { + conf.EnableReplicaSelectorV2 = false + }) + sender := ca1.run(s) + ca1.checkResult(s, "v1", sender) + + config.UpdateGlobal(func(conf *config.Config) { + conf.EnableReplicaSelectorV2 = true + }) + sender = ca2.run(s) + if ca2.expect == nil { + // compare with ca1 result. + ca2.expect = &ca1.result } - if ca2.expect != nil { - msg := fmt.Sprintf("%v\n\n", ca2.Format()) - expect := ca2.expect - result := ca2.accessPathResult - s.Equal(expect.accessPath, result.accessPath, msg) - s.Equal(expect.respErr, result.respErr, msg) - s.Equal(expect.respRegionError, result.respRegionError, msg) - s.Equal(expect.regionIsValid, result.regionIsValid, msg) - s.Equal(expect.backoffCnt, result.backoffCnt, msg) - s.Equal(expect.backoffDetail, result.backoffDetail, msg) - } - return true + ca2.checkResult(s, "v2", sender) + return !ca1.accessErrInValid } -func (ca *replicaSelectorAccessPathCase) run(s *testReplicaSelectorSuite) { - reachable.injectConstantLiveness(s.cache) // inject reachable liveness. - msg := ca.Format() +func (s *testReplicaSelectorSuite) runCase(ca replicaSelectorAccessPathCase, v2 bool) bool { + config.UpdateGlobal(func(conf *config.Config) { + conf.EnableReplicaSelectorV2 = v2 + }) + sender := ca.run(s) + version := "v1" + if v2 { + version = "v2" + } + ca.checkResult(s, version, sender) + return !ca.accessErrInValid +} + +func (s *testReplicaSelectorSuite) runMultiCaseAndCompare(cas []replicaSelectorAccessPathCase) bool { + expects := make([]accessPathResult, 0, len(cas)) + valid := true + config.UpdateGlobal(func(conf *config.Config) { + conf.EnableReplicaSelectorV2 = false + }) + for _, ca1 := range cas { + sender := ca1.run(s) + ca1.checkResult(s, "v1", sender) + expects = append(expects, ca1.result) + valid = valid && !ca1.accessErrInValid + } + + config.UpdateGlobal(func(conf *config.Config) { + conf.EnableReplicaSelectorV2 = true + }) + for i, ca2 := range cas { + sender := ca2.run(s) + if ca2.expect == nil { + // compare with ca1 result. 
+ ca2.expect = &expects[i] + } + ca2.checkResult(s, "v2", sender) + valid = valid && !ca2.accessErrInValid + } + return valid +} + +func (ca *replicaSelectorAccessPathCase) checkResult(s *testReplicaSelectorSuite, version string, sender *RegionRequestSender) { + if ca.expect == nil { + return + } + msg := fmt.Sprintf("enable_forwarding: %v\nversion: %v\n%v\nsender: %v\n", s.cache.enableForwarding, version, ca.Format(), sender.String()) + expect := ca.expect + result := ca.result + s.Equal(expect.accessPath, result.accessPath, msg) + s.Equal(expect.respErr, result.respErr, msg) + s.Equal(expect.respRegionError, result.respRegionError, msg) + s.Equal(expect.regionIsValid, result.regionIsValid, msg) + s.Equal(expect.backoffCnt, result.backoffCnt, msg) + s.Equal(expect.backoffDetail, result.backoffDetail, msg) +} + +func (ca *replicaSelectorAccessPathCase) run(s *testReplicaSelectorSuite) *RegionRequestSender { access := []string{} fnClient := &fnClient{fn: func(ctx context.Context, addr string, req *tikvrpc.Request, timeout time.Duration) (response *tikvrpc.Response, err error) { idx := len(access) - access = append(access, fmt.Sprintf("{addr: %v, replica-read: %v, stale-read: %v}", addr, req.ReplicaRead, req.StaleRead)) + if req.ForwardedHost == "" { + access = append(access, fmt.Sprintf("{addr: %v, replica-read: %v, stale-read: %v}", addr, req.ReplicaRead, req.StaleRead)) + } else { + access = append(access, fmt.Sprintf("{addr: %v, replica-read: %v, stale-read: %v, forward_addr: %v}", addr, req.ReplicaRead, req.StaleRead, req.ForwardedHost)) + addr = req.ForwardedHost + } if idx < len(ca.accessErr) { if !ca.accessErr[idx].Valid(addr, req) { // mark this case is invalid. just ignore this case. ca.accessErrInValid = true } else { - loc, err := s.cache.LocateKey(s.bo, []byte("key")) - s.Nil(err) - rc := s.cache.GetCachedRegionWithRLock(loc.Region) + rc := s.getRegion() s.NotNil(rc) regionErr, err := ca.genAccessErr(s.cache, rc, ca.accessErr[idx]) if regionErr != nil { @@ -693,6 +2635,34 @@ func (ca *replicaSelectorAccessPathCase) run(s *testReplicaSelectorSuite) { }}, nil }} sender := NewRegionRequestSender(s.cache, fnClient) + req, opts, timeout := ca.buildRequest(s) + beforeRun(s, ca) + rc := s.getRegion() + s.NotNil(rc) + bo := retry.NewBackofferWithVars(context.Background(), 40000, nil) + resp, _, _, err := sender.SendReqCtx(bo, req, rc.VerID(), timeout, tikvrpc.TiKV, opts...) + ca.recordResult(s, bo, sender.replicaSelector.getBaseReplicaSelector().region, access, resp, err) + afterRun(ca, sender) + return sender +} + +func beforeRun(s *testReplicaSelectorSuite, ca *replicaSelectorAccessPathCase) { + if ca.beforeRun != nil { + ca.beforeRun() + } else { + s.resetStoreState() + } +} + +func afterRun(ca *replicaSelectorAccessPathCase, sender *RegionRequestSender) { + if ca.afterRun != nil { + ca.afterRun() + } else { + sender.replicaSelector.invalidateRegion() // invalidate region to reload for next test case. + } +} + +func (ca *replicaSelectorAccessPathCase) buildRequest(s *testReplicaSelectorSuite) (*tikvrpc.Request, []StoreSelectorOption, time.Duration) { var req *tikvrpc.Request switch ca.reqType { case tikvrpc.CmdGet: @@ -719,42 +2689,32 @@ func (ca *replicaSelectorAccessPathCase) run(s *testReplicaSelectorSuite) { if ca.label != nil { opts = append(opts, WithMatchLabels([]*metapb.StoreLabel{ca.label})) } - // reset slow score, since serverIsBusyErr will mark the store is slow, and affect remaining test cases. 
- loc, err := s.cache.LocateKey(s.bo, []byte("key")) - s.Nil(err) - rc := s.cache.GetCachedRegionWithRLock(loc.Region) - s.NotNil(rc) - for _, store := range rc.getStore().stores { - store.loadStats.Store(nil) - store.healthStatus.clientSideSlowScore.resetSlowScore() - atomic.StoreUint32(&store.livenessState, uint32(reachable)) - store.setResolveState(resolved) - } - - bo := retry.NewBackofferWithVars(context.Background(), 40000, nil) timeout := ca.timeout if timeout == 0 { timeout = client.ReadTimeoutShort } - resp, _, _, err := sender.SendReqCtx(bo, req, loc.Region, timeout, tikvrpc.TiKV, opts...) + return req, opts, timeout +} + +func (ca *replicaSelectorAccessPathCase) recordResult(s *testReplicaSelectorSuite, bo *retry.Backoffer, region *Region, access []string, resp *tikvrpc.Response, err error) { + ca.result.accessPath = access + ca.result.regionIsValid = region.isValid() + msg := ca.Format() if err == nil { s.NotNil(resp, msg) regionErr, err := resp.GetRegionError() s.Nil(err, msg) - ca.respRegionError = regionErr + ca.result.respRegionError = regionErr } else { - ca.respErr = err.Error() + ca.result.respErr = err.Error() } - ca.accessPath = access - ca.backoffCnt = bo.GetTotalBackoffTimes() + ca.result.backoffCnt = bo.GetTotalBackoffTimes() detail := make([]string, 0, len(bo.GetBackoffTimes())) for tp, cnt := range bo.GetBackoffTimes() { detail = append(detail, fmt.Sprintf("%v+%v", tp, cnt)) } sort.Strings(detail) - ca.backoffDetail = detail - ca.regionIsValid = sender.replicaSelector.region.isValid() - sender.replicaSelector.invalidateRegion() // invalidate region to reload for next test case. + ca.result.backoffDetail = detail } func (ca *replicaSelectorAccessPathCase) genAccessErr(regionCache *RegionCache, r *Region, accessErr RegionErrorType) (regionErr *errorpb.Error, err error) { @@ -796,8 +2756,8 @@ func (c *replicaSelectorAccessPathCase) Format() string { label = fmt.Sprintf("%v->%v", c.label.Key, c.label.Value) } respRegionError := "" - if c.respRegionError != nil { - respRegionError = c.respRegionError.String() + if c.result.respRegionError != nil { + respRegionError = c.result.respRegionError.String() } accessErr := make([]string, len(c.accessErr)) for i := range c.accessErr { @@ -817,8 +2777,32 @@ func (c *replicaSelectorAccessPathCase) Format() string { "\tbackoff_cnt: %v\n"+ "\tbackoff_detail: %v\n"+ "\tregion_is_valid: %v\n}", - c.reqType, c.readType, c.staleRead, c.timeout, c.busyThresholdMs, label, strings.Join(accessErr, ", "), strings.Join(c.accessPath, ", "), - c.respErr, respRegionError, c.backoffCnt, strings.Join(c.backoffDetail, ", "), c.regionIsValid) + c.reqType, c.readType, c.staleRead, c.timeout, c.busyThresholdMs, label, strings.Join(accessErr, ", "), strings.Join(c.result.accessPath, ", "), + c.result.respErr, respRegionError, c.result.backoffCnt, strings.Join(c.result.backoffDetail, ", "), c.result.regionIsValid) +} + +func (s *testReplicaSelectorSuite) resetStoreState() { + // reset slow score, since serverIsBusyErr will mark the store is slow, and affect remaining test cases. + reachable.injectConstantLiveness(s.cache) // inject reachable liveness. 
+ rc := s.getRegion() + s.NotNil(rc) + for _, store := range rc.getStore().stores { + store.loadStats.Store(nil) + store.healthStatus.clientSideSlowScore.resetSlowScore() + store.healthStatus.updateTiKVServerSideSlowScore(0, time.Now()) + store.healthStatus.updateSlowFlag() + atomic.StoreUint32(&store.livenessState, uint32(reachable)) + store.setResolveState(resolved) + } + regionStore := rc.getStore() + for _, storeIdx := range regionStore.accessIndex[tiKVOnly] { + epoch := regionStore.storeEpochs[storeIdx] + storeEpoch := regionStore.stores[storeIdx].epoch + if epoch != storeEpoch { + rc.invalidate(EpochNotMatch) + break + } + } } type RegionErrorType int @@ -1015,3 +2999,190 @@ func (tp RegionErrorType) String() string { return "unknown_" + strconv.Itoa(int(tp)) } } + +type accessErrGenerator struct { + maxAccessErrCnt int + mode int + idx int + baseIdx int + allErrs []RegionErrorType + retryErrs []RegionErrorType +} + +func newAccessErrGenerator(isRead, staleRead bool, maxAccessErrCnt int) *accessErrGenerator { + filter := func(tp RegionErrorType) bool { + // read request won't meet RaftEntryTooLargeErr. + if isRead && tp == RaftEntryTooLargeErr { + return false + } + if staleRead == false && tp == DataIsNotReadyErr { + return false + } + // TODO: since v2 has come compatibility issue with v1, so skip FlashbackInProgressErr. + if tp == FlashbackInProgressErr { + return false + } + return true + } + allErrs := getAllRegionErrors(filter) + retryableErr := getAllRegionErrors(func(tp RegionErrorType) bool { + return filter(tp) && isRegionErrorRetryable(tp) + }) + return &accessErrGenerator{ + maxAccessErrCnt: maxAccessErrCnt, + mode: 0, + idx: 0, + allErrs: allErrs, + retryErrs: retryableErr, + } +} + +func getAllRegionErrors(filter func(errorType RegionErrorType) bool) []RegionErrorType { + errs := make([]RegionErrorType, 0, int(RegionErrorTypeMax)) + for tp := NotLeaderErr; tp < RegionErrorTypeMax; tp++ { + if filter != nil && filter(tp) == false { + continue + } + errs = append(errs, tp) + } + return errs +} + +func isRegionErrorRetryable(tp RegionErrorType) bool { + switch tp { + case NotLeaderErr, NotLeaderWithNewLeader1Err, NotLeaderWithNewLeader2Err, NotLeaderWithNewLeader3Err, ServerIsBusyErr, ServerIsBusyWithEstimatedWaitMsErr, + StaleCommandErr, MaxTimestampNotSyncedErr, ReadIndexNotReadyErr, ProposalInMergingModeErr, DataIsNotReadyErr, + RegionNotInitializedErr, FlashbackInProgressErr, DiskFullErr, DeadLineExceededErr: + return true + } + return false +} + +func (a *accessErrGenerator) genAccessErr(staleRead bool) ([]RegionErrorType, bool) { + if a.mode == 0 { + a.idx = 0 + a.mode = 1 + return nil, false + } + if a.mode == 1 { + idx := a.idx + a.idx++ + if a.idx >= len(a.allErrs) { + a.idx = 0 + a.baseIdx = 0 + a.mode = 2 + } + return []RegionErrorType{a.allErrs[idx]}, false + } + for a.mode <= a.maxAccessErrCnt { + errs := a.genAccessErrs(a.allErrs, a.retryErrs) + if len(errs) > 0 { + return errs, false + } + a.baseIdx = 0 + a.idx = 0 + a.mode++ + // if mode >= 4 , reduce the error type to avoid generating too many combinations. 
+ if a.mode > 4 { + if a.mode > 8 { + a.allErrs = []RegionErrorType{ServerIsBusyErr, ServerIsBusyWithEstimatedWaitMsErr} + } else if a.mode > 7 { + a.allErrs = []RegionErrorType{ServerIsBusyWithEstimatedWaitMsErr, DeadLineExceededErr} + } else if a.mode > 6 { + a.allErrs = []RegionErrorType{NotLeaderWithNewLeader2Err, ServerIsBusyWithEstimatedWaitMsErr, DeadLineExceededErr} + } else if a.mode > 5 { + a.allErrs = []RegionErrorType{NotLeaderErr, NotLeaderWithNewLeader2Err, ServerIsBusyWithEstimatedWaitMsErr, DeadLineExceededErr} + } else { + a.allErrs = []RegionErrorType{NotLeaderErr, NotLeaderWithNewLeader1Err, NotLeaderWithNewLeader2Err, NotLeaderWithNewLeader3Err, ServerIsBusyWithEstimatedWaitMsErr, RegionNotInitializedErr, DeadLineExceededErr} + } + if staleRead { + a.allErrs = append(a.allErrs, DataIsNotReadyErr) + } + a.retryErrs = a.allErrs + } + } + return nil, true +} + +func (a *accessErrGenerator) genAccessErrs(allErrs, retryErrs []RegionErrorType) []RegionErrorType { + defer func() { + a.baseIdx++ + if a.baseIdx >= len(allErrs) { + a.baseIdx = 0 + a.idx++ + } + }() + mode := a.mode + errs := make([]RegionErrorType, mode) + errs[mode-1] = allErrs[a.baseIdx%len(allErrs)] + value := a.idx + for i := mode - 2; i >= 0; i-- { + if i == 0 && value > len(retryErrs) { + return nil + } + errs[i] = retryErrs[value%len(retryErrs)] + value = value / len(retryErrs) + } + return errs +} + +func (s *testReplicaSelectorSuite) getRegion() *Region { + for i := 0; i < 100; i++ { + loc, err := s.cache.LocateKey(s.bo, []byte("key")) + s.Nil(err) + rc := s.cache.GetCachedRegionWithRLock(loc.Region) + if rc == nil { + time.Sleep(time.Millisecond * 10) + continue + } + return rc + } + return nil +} + +func BenchmarkReplicaSelector(b *testing.B) { + mvccStore := mocktikv.MustNewMVCCStore() + cluster := mocktikv.NewCluster(mvccStore) + mocktikv.BootstrapWithMultiStores(cluster, 3) + pdCli := &CodecPDClient{mocktikv.NewPDClient(cluster), apicodec.NewCodecV1(apicodec.ModeTxn)} + cache := NewRegionCache(pdCli) + defer func() { + cache.Close() + mvccStore.Close() + }() + + config.UpdateGlobal(func(conf *config.Config) { + conf.EnableReplicaSelectorV2 = true + }) + cnt := 0 + allErrs := getAllRegionErrors(nil) + fnClient := &fnClient{fn: func(ctx context.Context, addr string, req *tikvrpc.Request, timeout time.Duration) (response *tikvrpc.Response, err error) { + pberr, err := allErrs[cnt%len(allErrs)].GenError() + cnt++ + return &tikvrpc.Response{Resp: &kvrpcpb.GetResponse{ + RegionError: pberr, + Value: []byte("value"), + }}, err + }} + f, _ := os.Create("cpu.profile") + pprof.StartCPUProfile(f) + defer func() { + pprof.StopCPUProfile() + f.Close() + }() + + b.ResetTimer() + for i := 0; i < b.N; i++ { + bo := retry.NewBackofferWithVars(context.Background(), 40000, nil) + req := tikvrpc.NewRequest(tikvrpc.CmdGet, &kvrpcpb.GetRequest{ + Key: []byte("key"), + }) + req.ReplicaReadType = kv.ReplicaReadMixed + loc, err := cache.LocateKey(bo, []byte("key")) + if err != nil { + b.Fail() + } + sender := NewRegionRequestSender(cache, fnClient) + sender.SendReqCtx(bo, req, loc.Region, client.ReadTimeoutShort, tikvrpc.TiKV) + } +} diff --git a/internal/mockstore/mocktikv/cluster.go b/internal/mockstore/mocktikv/cluster.go index 78b59fe6..f5c546a5 100644 --- a/internal/mockstore/mocktikv/cluster.go +++ b/internal/mockstore/mocktikv/cluster.go @@ -427,7 +427,15 @@ func (c *Cluster) AddPeer(regionID, storeID, peerID uint64) { c.Lock() defer c.Unlock() - c.regions[regionID].addPeer(peerID, storeID) + 
c.regions[regionID].addPeer(peerID, storeID, metapb.PeerRole_Voter) +} + +// AddLearner adds a new learner for the Region on the Store. +func (c *Cluster) AddLearner(regionID, storeID, peerID uint64) { + c.Lock() + defer c.Unlock() + + c.regions[regionID].addPeer(peerID, storeID, metapb.PeerRole_Learner) } // RemovePeer removes the Peer from the Region. Note that if the Peer is leader, @@ -666,8 +674,10 @@ func newRegion(regionID uint64, storeIDs, peerIDs []uint64, leaderPeerID uint64, } } -func (r *Region) addPeer(peerID, storeID uint64) { - r.Meta.Peers = append(r.Meta.Peers, newPeerMeta(peerID, storeID)) +func (r *Region) addPeer(peerID, storeID uint64, role metapb.PeerRole) { + peer := newPeerMeta(peerID, storeID) + peer.Role = role + r.Meta.Peers = append(r.Meta.Peers, peer) r.incConfVer() } diff --git a/util/israce/israce.go b/util/israce/israce.go new file mode 100644 index 00000000..a779e783 --- /dev/null +++ b/util/israce/israce.go @@ -0,0 +1,20 @@ +// Copyright 2024 TiKV Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build race + +package israce + +// RaceEnabled checks if race is enabled. +const RaceEnabled = true diff --git a/util/israce/norace.go b/util/israce/norace.go new file mode 100644 index 00000000..ef056b43 --- /dev/null +++ b/util/israce/norace.go @@ -0,0 +1,20 @@ +// Copyright 2024 TiKV Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build !race + +package israce + +// RaceEnabled checks if race is enabled. +const RaceEnabled = false
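The new util/israce package exposes RaceEnabled as a build-tag constant so expensive enumeration tests can shrink their workload under -race, which is what TestReplicaReadAccessPathByGenError does with maxAccessErrCnt. A minimal usage sketch, not part of the diff; the test name and iteration counts are illustrative, and the import path assumes the module root github.com/tikv/client-go/v2:

package example_test

import (
	"testing"

	"github.com/tikv/client-go/v2/util/israce" // assumed import path
)

func TestExhaustiveEnumeration(t *testing.T) { // hypothetical test
	iterations := 100000
	if israce.RaceEnabled {
		// The race detector slows every iteration down considerably, so reduce
		// the workload to stay under the CI timeout, as the selector tests do.
		iterations = 1000
	}
	for i := 0; i < iterations; i++ {
		_ = i // exhaustive case generation would run here
	}
}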
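For other tests inside this module, the new Cluster.AddLearner pairs with AddStore the same way TestReplicaReadAccessPathByLearnerCase uses them. A rough sketch under the assumption that the caller already knows the region ID; the helper name is hypothetical, and mocktikv is an internal package, so this only compiles inside the repo:

package locate

import (
	"fmt"

	"github.com/tikv/client-go/v2/internal/mockstore/mocktikv" // internal package, assumed path
)

// addLearnerStore brings up one more store and attaches a learner peer for
// regionID to it, mirroring the setup in TestReplicaReadAccessPathByLearnerCase.
func addLearnerStore(cluster *mocktikv.Cluster, regionID, storeID uint64) {
	cluster.AddStore(storeID, fmt.Sprintf("store%d", storeID))
	cluster.AddLearner(regionID, storeID, cluster.AllocID())
}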
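Finally, a minimal sketch of flipping the new selector switch at runtime, for anyone comparing the two selectors against the same workload. Illustrative only, not part of the diff; it assumes the public github.com/tikv/client-go/v2/config package path and reuses the config.UpdateGlobal pattern seen in the tests above:

package main

import (
	"github.com/tikv/client-go/v2/config" // assumed import path
)

func main() {
	// Route requests through the refactored selector (the default after this patch).
	config.UpdateGlobal(func(conf *config.Config) {
		conf.EnableReplicaSelectorV2 = true
	})

	// Switch back to the legacy selector, e.g. to compare access paths the way
	// runCaseAndCompare does for every case in this test file.
	config.UpdateGlobal(func(conf *config.Config) {
		conf.EnableReplicaSelectorV2 = false
	})
}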