client-go/internal/locate/replica_selector.go

// Copyright 2024 TiKV Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package locate

import (
"fmt"
"time"
"github.com/pingcap/kvproto/pkg/errorpb"
"github.com/pingcap/kvproto/pkg/metapb"
"github.com/pkg/errors"
"github.com/tikv/client-go/v2/config"
"github.com/tikv/client-go/v2/config/retry"
"github.com/tikv/client-go/v2/kv"
"github.com/tikv/client-go/v2/metrics"
"github.com/tikv/client-go/v2/tikvrpc"
)
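
// ReplicaSelector selects the replica (and, when forwarding is enabled, the
// proxy replica) that a request should be sent to, and records feedback from
// failed attempts and region errors so that later retries pick a better candidate.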
type ReplicaSelector interface {
next(bo *retry.Backoffer, req *tikvrpc.Request) (*RPCContext, error)
targetReplica() *replica
proxyReplica() *replica
replicaType(rpcCtx *RPCContext) string
String() string
getBaseReplicaSelector() *baseReplicaSelector
getLabels() []*metapb.StoreLabel
onSendSuccess(req *tikvrpc.Request)
onSendFailure(bo *retry.Backoffer, err error)
invalidateRegion()
// Following methods are used to handle region errors.
onNotLeader(bo *retry.Backoffer, ctx *RPCContext, notLeader *errorpb.NotLeader) (shouldRetry bool, err error)
onDataIsNotReady()
onServerIsBusy(bo *retry.Backoffer, ctx *RPCContext, req *tikvrpc.Request, serverIsBusy *errorpb.ServerIsBusy) (shouldRetry bool, err error)
onReadReqConfigurableTimeout(req *tikvrpc.Request) bool
}
// NewReplicaSelector returns a new ReplicaSelector.
func NewReplicaSelector(
regionCache *RegionCache, regionID RegionVerID, req *tikvrpc.Request, opts ...StoreSelectorOption,
) (ReplicaSelector, error) {
if config.GetGlobalConfig().EnableReplicaSelectorV2 {
return newReplicaSelectorV2(regionCache, regionID, req, opts...)
}
return newReplicaSelector(regionCache, regionID, req, opts...)
}
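
// A minimal usage sketch (illustrative only; the real caller is the region
// request sender in this package, whose retry loop also handles region errors
// and backoff):
//
//	selector, err := NewReplicaSelector(regionCache, regionID, req)
//	if err != nil {
//		return err
//	}
//	for {
//		rpcCtx, err := selector.next(bo, req)
//		if err != nil || rpcCtx == nil {
//			break // error, invalid region, or no candidate replica left
//		}
//		// Send req using rpcCtx. On a network error, call
//		// selector.onSendFailure(bo, sendErr) and loop to pick another replica;
//		// on success, call selector.onSendSuccess(req) and stop.
//	}

// replicaSelectorV2 is the ReplicaSelector implementation used when
// EnableReplicaSelectorV2 is set. It delegates the choice of the target (and
// proxy) replica to the ReplicaSelect*Strategy types below, based on the
// request's replica-read type.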
type replicaSelectorV2 struct {
baseReplicaSelector
replicaReadType kv.ReplicaReadType
isStaleRead bool
isReadOnlyReq bool
option storeSelectorOp
target *replica
proxy *replica
attempts int
}
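
// newReplicaSelectorV2 builds a selector from the cached region's TiKV
// replicas; it fails if the cached region is missing or no longer valid.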
func newReplicaSelectorV2(
regionCache *RegionCache, regionID RegionVerID, req *tikvrpc.Request, opts ...StoreSelectorOption,
) (*replicaSelectorV2, error) {
cachedRegion := regionCache.GetCachedRegionWithRLock(regionID)
if cachedRegion == nil || !cachedRegion.isValid() {
return nil, errors.New("cached region invalid")
}
replicas := buildTiKVReplicas(cachedRegion)
option := storeSelectorOp{}
for _, op := range opts {
op(&option)
}
return &replicaSelectorV2{
baseReplicaSelector: baseReplicaSelector{
regionCache: regionCache,
region: cachedRegion,
replicas: replicas,
busyThreshold: time.Duration(req.BusyThresholdMs) * time.Millisecond,
},
replicaReadType: req.ReplicaReadType,
isStaleRead: req.StaleRead,
isReadOnlyReq: isReadReq(req.Type),
option: option,
target: nil,
attempts: 0,
}, nil
}
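
// next picks the target (and possibly proxy) replica for the current attempt
// and builds the RPCContext for it. It returns (nil, nil) when the region is
// no longer valid or no candidate replica is left.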
func (s *replicaSelectorV2) next(bo *retry.Backoffer, req *tikvrpc.Request) (rpcCtx *RPCContext, err error) {
if !s.region.isValid() {
metrics.TiKVReplicaSelectorFailureCounter.WithLabelValues("invalid").Inc()
return nil, nil
}
s.attempts++
s.target = nil
s.proxy = nil
switch s.replicaReadType {
case kv.ReplicaReadLeader:
s.nextForReplicaReadLeader(req)
default:
s.nextForReplicaReadMixed(req)
}
if s.target == nil {
return nil, nil
}
return s.buildRPCContext(bo, s.target, s.proxy)
}
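
// nextForReplicaReadLeader selects the target for leader reads: it prefers the
// leader (through a proxy when forwarding is enabled and the leader's store is
// unreachable), redirects read-only requests to an idle replica when the leader
// looks too busy, and falls back to the mixed strategy when the leader is not a
// usable candidate.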
func (s *replicaSelectorV2) nextForReplicaReadLeader(req *tikvrpc.Request) {
if s.regionCache.enableForwarding {
strategy := ReplicaSelectLeaderWithProxyStrategy{}
s.target, s.proxy = strategy.next(s.replicas, s.region)
if s.target != nil && s.proxy != nil {
return
}
}
leaderIdx := s.region.getStore().workTiKVIdx
strategy := ReplicaSelectLeaderStrategy{leaderIdx: leaderIdx}
s.target = strategy.next(s.replicas)
if s.target != nil && s.busyThreshold > 0 && s.isReadOnlyReq && (s.target.store.EstimatedWaitTime() > s.busyThreshold || s.target.serverIsBusy) {
// If the leader looks busy by our estimation, try other idle replicas first.
// If all other replicas are also busy, fall back below to the leader without the busy threshold.
mixedStrategy := ReplicaSelectMixedStrategy{leaderIdx: leaderIdx, busyThreshold: s.busyThreshold}
idleTarget := mixedStrategy.next(s, s.region)
if idleTarget != nil {
s.target = idleTarget
req.ReplicaRead = true
} else {
// All peers are too busy: drop the busy threshold and keep using the leader.
s.busyThreshold = 0
req.BusyThresholdMs = 0
req.ReplicaRead = false
}
}
if s.target != nil {
return
}
mixedStrategy := ReplicaSelectMixedStrategy{leaderIdx: leaderIdx, leaderOnly: s.option.leaderOnly}
s.target = mixedStrategy.next(s, s.region)
if s.target != nil && s.isReadOnlyReq && s.replicas[leaderIdx].deadlineErrUsingConfTimeout {
req.ReplicaRead = true
req.StaleRead = false
}
}
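
// nextForReplicaReadMixed selects the target for follower, mixed, prefer-leader,
// learner, and stale reads, and adjusts req.StaleRead/req.ReplicaRead according
// to which replica was picked and whether this is a retry.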
func (s *replicaSelectorV2) nextForReplicaReadMixed(req *tikvrpc.Request) {
leaderIdx := s.region.getStore().workTiKVIdx
if s.isStaleRead && s.attempts == 2 {
// On the second attempt of a stale read, try the leader with a leader read.
strategy := ReplicaSelectLeaderStrategy{leaderIdx: leaderIdx}
s.target = strategy.next(s.replicas)
if s.target != nil && !s.target.isExhausted(1, 0) {
// For stale read, don't retry the leader if it was already accessed on the first attempt.
req.StaleRead = false
req.ReplicaRead = false
return
}
}
preferLeader := req.ReplicaReadType == kv.ReplicaReadPreferLeader
if s.attempts > 1 {
if req.ReplicaReadType == kv.ReplicaReadMixed {
// For mixed read retries, prefer to retry the leader first.
preferLeader = true
}
}
strategy := ReplicaSelectMixedStrategy{
leaderIdx: leaderIdx,
tryLeader: req.ReplicaReadType == kv.ReplicaReadMixed || req.ReplicaReadType == kv.ReplicaReadPreferLeader,
preferLeader: preferLeader,
leaderOnly: s.option.leaderOnly,
learnerOnly: req.ReplicaReadType == kv.ReplicaReadLearner,
labels: s.option.labels,
stores: s.option.stores,
}
s.target = strategy.next(s, s.region)
if s.target != nil {
if s.isStaleRead && s.attempts == 1 {
// stale-read request first access.
if !s.target.store.IsLabelsMatch(s.option.labels) && s.target.peer.Id != s.region.GetLeaderPeerID() {
// If the target replica's labels don't match and it is not the leader, use replica read.
// This is for compatibility with the old version's behavior.
req.StaleRead = false
req.ReplicaRead = true
} else {
// use stale read.
req.StaleRead = true
req.ReplicaRead = false
}
} else {
// Otherwise use replica read (only for read-only requests).
req.StaleRead = false
req.ReplicaRead = s.isReadOnlyReq
}
}
}
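
// ReplicaSelectLeaderStrategy selects the leader replica, as long as it is
// still a valid leader candidate.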
type ReplicaSelectLeaderStrategy struct {
leaderIdx AccessIndex
}
func (s ReplicaSelectLeaderStrategy) next(replicas []*replica) *replica {
leader := replicas[s.leaderIdx]
if isLeaderCandidate(leader) {
return leader
}
return nil
}
// ReplicaSelectMixedStrategy selects a replica by calculating a score for each replica and then choosing the one with the highest score.
// Attention: if the leader replica must be chosen in some cases, use ReplicaSelectLeaderStrategy instead of ReplicaSelectMixedStrategy with the preferLeader flag.
type ReplicaSelectMixedStrategy struct {
leaderIdx AccessIndex
tryLeader bool
preferLeader bool
leaderOnly bool
learnerOnly bool
labels []*metapb.StoreLabel
stores []uint64
busyThreshold time.Duration
}
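
// next scores every replica, skips those that are not candidates, and returns
// one of the highest-scoring replicas, picking randomly among ties. It returns
// nil when no candidate is left; in that case the region is invalidated unless
// a busy threshold is in effect or a deadline-exceeded error was seen.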
func (s *ReplicaSelectMixedStrategy) next(selector *replicaSelectorV2, region *Region) *replica {
replicas := selector.replicas
maxScoreIdxes := make([]int, 0, len(replicas))
maxScore := -1
reloadRegion := false
for i, r := range replicas {
epochStale := r.isEpochStale()
liveness := r.store.getLivenessState()
isLeader := AccessIndex(i) == s.leaderIdx
if epochStale && ((liveness == reachable && r.store.getResolveState() == resolved) || isLeader) {
reloadRegion = true
}
if !s.isCandidate(r, isLeader, epochStale, liveness) {
continue
}
score := s.calculateScore(r, isLeader)
if score > maxScore {
maxScore = score
maxScoreIdxes = append(maxScoreIdxes[:0], i)
} else if score == maxScore && score > -1 {
maxScoreIdxes = append(maxScoreIdxes, i)
}
}
if reloadRegion {
selector.region.setSyncFlags(needDelayedReloadPending)
}
if len(maxScoreIdxes) == 1 {
idx := maxScoreIdxes[0]
return replicas[idx]
} else if len(maxScoreIdxes) > 1 {
// If more than one replica shares the max score, randomly select one of them.
// todo: consider using store statistics to select a faster one.
idx := maxScoreIdxes[randIntn(len(maxScoreIdxes))]
return replicas[idx]
}
if s.busyThreshold > 0 {
// When no idle replica can be found, there is no need to invalidate the region.
return nil
}
// When a deadline-exceeded error has been met, do a fast retry without invalidating the region cache.
if !hasDeadlineExceededError(selector.replicas) {
selector.invalidateRegion()
}
metrics.TiKVReplicaSelectorFailureCounter.WithLabelValues("exhausted").Inc()
return nil
}
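
// isCandidate reports whether a replica may serve this request: it must have a
// fresh epoch and a reachable store, must not have exhausted its attempts, and
// must satisfy the leader-only and busy-threshold constraints.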
func (s *ReplicaSelectMixedStrategy) isCandidate(r *replica, isLeader bool, epochStale bool, liveness livenessState) bool {
if epochStale || liveness == unreachable {
// the replica is not available, skip it.
return false
}
maxAttempt := 1
if r.dataIsNotReady && !isLeader {
// If the replica failed with a data-is-not-ready error on a stale read, we can retry it with replica-read.
// After https://github.com/tikv/tikv/pull/15726, the leader no longer returns the DataIsNotReady error,
// so there is no need to retry the leader again; retrying it may yield a NotLeader error.
maxAttempt = 2
}
if r.isExhausted(maxAttempt, 0) {
// Attempts are exhausted, skip it.
return false
}
if s.leaderOnly && !isLeader {
return false
}
if s.busyThreshold > 0 && (r.store.EstimatedWaitTime() > s.busyThreshold || r.serverIsBusy || isLeader) {
return false
}
return true
}
const (
// The definition of the score is:
// MSB                                                                                                      LSB
// [unused bits][1 bit: LabelMatches][1 bit: PreferLeader][1 bit: NormalPeer][1 bit: NotSlow][1 bit: NotAttempt]
flagLabelMatches = 1 << 4
flagPreferLeader = 1 << 3
flagNormalPeer = 1 << 2
flagNotSlow = 1 << 1
flagNotAttempt = 1
)
// calculateScore calculates the score of the replica.
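// For example, with labels configured, a label-matched, not-slow, never-attempted
// follower scores flagLabelMatches|flagNormalPeer|flagNotSlow|flagNotAttempt = 23,
// while a not-slow, never-attempted leader with mismatched labels scores
// flagPreferLeader|flagNotSlow|flagNotAttempt = 11, so the matched follower is preferred.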
func (s *ReplicaSelectMixedStrategy) calculateScore(r *replica, isLeader bool) int {
score := 0
if r.store.IsStoreMatch(s.stores) && r.store.IsLabelsMatch(s.labels) {
score |= flagLabelMatches
}
if isLeader {
if s.preferLeader {
score |= flagPreferLeader
} else if s.tryLeader {
if len(s.labels) > 0 {
// When labels are specified, prefer the leader over peers whose labels don't match.
score |= flagPreferLeader
} else {
score |= flagNormalPeer
}
}
} else {
if s.learnerOnly {
if r.peer.Role == metapb.PeerRole_Learner {
score |= flagNormalPeer
}
} else {
score |= flagNormalPeer
}
}
if !r.store.healthStatus.IsSlow() {
score |= flagNotSlow
}
if r.attempts == 0 {
score |= flagNotAttempt
}
return score
}
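
// ReplicaSelectLeaderWithProxyStrategy selects the leader as the target and a
// reachable, not-yet-attempted follower as the proxy. It is used for forwarding
// when the leader's store is unreachable; it returns nothing when the leader is
// reachable or has reported NotLeader.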
type ReplicaSelectLeaderWithProxyStrategy struct{}
func (s ReplicaSelectLeaderWithProxyStrategy) next(replicas []*replica, region *Region) (leader *replica, proxy *replica) {
rs := region.getStore()
leaderIdx := rs.workTiKVIdx
leader = replicas[leaderIdx]
if leader.store.getLivenessState() == reachable || leader.notLeader {
// If the leader's store is reachable, there is no need to use a proxy.
rs.unsetProxyStoreIfNeeded(region)
return nil, nil
}
proxyIdx := rs.proxyTiKVIdx
if proxyIdx >= 0 && int(proxyIdx) < len(replicas) && s.isCandidate(replicas[proxyIdx], proxyIdx == leaderIdx) {
return leader, replicas[proxyIdx]
}
for i, r := range replicas {
if s.isCandidate(r, AccessIndex(i) == leaderIdx) {
return leader, r
}
}
return nil, nil
}
func (s ReplicaSelectLeaderWithProxyStrategy) isCandidate(r *replica, isLeader bool) bool {
if isLeader ||
r.isExhausted(1, 0) ||
r.store.getLivenessState() != reachable ||
r.isEpochStale() {
// Check the epoch here: if the epoch is stale, we can try other replicas instead of letting buildRPCContext fail, invalidating the region, and then retrying.
return false
}
return true
}
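
// onNotLeader handles the NotLeader region error: it marks the current target
// as not-leader and, if the error carries a usable new leader, switches
// subsequent attempts to leader reads.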
func (s *replicaSelectorV2) onNotLeader(
bo *retry.Backoffer, ctx *RPCContext, notLeader *errorpb.NotLeader,
) (shouldRetry bool, err error) {
if s.target != nil {
s.target.notLeader = true
}
leaderIdx, err := s.baseReplicaSelector.onNotLeader(bo, ctx, notLeader)
if err != nil {
return false, err
}
if leaderIdx >= 0 {
if isLeaderCandidate(s.replicas[leaderIdx]) {
s.replicaReadType = kv.ReplicaReadLeader
}
}
return true, nil
}
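
// onFlashbackInProgress handles the FlashbackInProgress region error: it
// returns true (retry with the leader) when the failed attempt was a replica
// read on a non-leader peer, and false otherwise.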
func (s *replicaSelectorV2) onFlashbackInProgress(ctx *RPCContext, req *tikvrpc.Request) bool {
// If the failure came from a replica read, it is safe to retry with the leader.
if req.ReplicaRead && s.target != nil && s.target.peer.Id != s.region.GetLeaderPeerID() {
req.BusyThresholdMs = 0
s.busyThreshold = 0
s.replicaReadType = kv.ReplicaReadLeader
req.ReplicaReadType = kv.ReplicaReadLeader
return true
}
return false
}
func (s *replicaSelectorV2) onDataIsNotReady() {
if s.target != nil {
s.target.dataIsNotReady = true
}
}
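
// onServerIsBusy handles the ServerIsBusy region error: it records the store's
// estimated wait time (or marks the store as slow when no estimate is given),
// marks the target as busy for read requests carrying a busy threshold, and
// then either fast-retries or backs off before retrying. Batched coprocessor
// requests are not retried.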
func (s *replicaSelectorV2) onServerIsBusy(
bo *retry.Backoffer, ctx *RPCContext, req *tikvrpc.Request, serverIsBusy *errorpb.ServerIsBusy,
) (shouldRetry bool, err error) {
var store *Store
if ctx != nil && ctx.Store != nil {
store = ctx.Store
if serverIsBusy.EstimatedWaitMs != 0 {
ctx.Store.updateServerLoadStats(serverIsBusy.EstimatedWaitMs)
if s.busyThreshold != 0 && isReadReq(req.Type) {
// Do not retry batched coprocessor requests:
// sending the tasks to a replica would cause region misses.
if req.Type == tikvrpc.CmdCop && len(req.Cop().Tasks) > 0 {
return false, nil
}
if s.target != nil {
s.target.serverIsBusy = true
}
}
} else {
// Mark the server as busy (the next incoming reads could be redirected to suitable followers).
ctx.Store.healthStatus.markAlreadySlow()
}
}
backoffErr := errors.Errorf("server is busy, ctx: %v", ctx)
if s.canFastRetry() {
s.addPendingBackoff(store, retry.BoTiKVServerBusy, backoffErr)
return true, nil
}
err = bo.Backoff(retry.BoTiKVServerBusy, backoffErr)
if err != nil {
return false, err
}
return true, nil
}
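
// canFastRetry reports whether a server-is-busy error can be retried without
// backing off first; for leader reads this is only allowed once the leader is
// itself busy or no longer a valid leader candidate.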
func (s *replicaSelectorV2) canFastRetry() bool {
if s.replicaReadType == kv.ReplicaReadLeader {
leaderIdx := s.region.getStore().workTiKVIdx
leader := s.replicas[leaderIdx]
if isLeaderCandidate(leader) && !leader.serverIsBusy {
return false
}
}
return true
}
func (s *replicaSelectorV2) onReadReqConfigurableTimeout(req *tikvrpc.Request) bool {
if isReadReqConfigurableTimeout(req) {
if s.target != nil {
s.target.deadlineErrUsingConfTimeout = true
}
return true
}
return false
}
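
// onSendFailure checks the liveness of the store that failed (the proxy store
// if one was used) and invalidates it when it is not reachable; for leader
// reads with forwarding enabled it keeps the unreachable leader so the next
// attempt can go through a proxy.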
func (s *replicaSelectorV2) onSendFailure(bo *retry.Backoffer, err error) {
metrics.RegionCacheCounterWithSendFail.Inc()
// todo: mark the store as needing a check and return early for a fast retry.
target := s.target
if s.proxy != nil {
target = s.proxy
}
liveness := s.checkLiveness(bo, target)
if s.replicaReadType == kv.ReplicaReadLeader && s.proxy == nil && s.target != nil && s.target.peer.Id == s.region.GetLeaderPeerID() &&
liveness == unreachable && len(s.replicas) > 1 && s.regionCache.enableForwarding {
// Just return so the next attempt can go through a proxy.
return
}
if liveness != reachable {
s.invalidateReplicaStore(target, err)
}
}
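
// onSendSuccess records a working proxy (if one was used) and, for requests
// that were neither stale reads nor replica reads but were served by a
// non-leader peer, switches the cached leader to that peer.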
func (s *replicaSelectorV2) onSendSuccess(req *tikvrpc.Request) {
if s.proxy != nil && s.target != nil {
for idx, r := range s.replicas {
if r.peer.Id == s.proxy.peer.Id {
s.region.getStore().setProxyStoreIdx(s.region, AccessIndex(idx))
break
}
}
}
if s.target != nil && s.target.peer.Id != s.region.GetLeaderPeerID() && req != nil && !req.StaleRead && !req.ReplicaRead {
s.region.switchWorkLeaderToPeer(s.target.peer)
}
}
func (s *replicaSelectorV2) targetReplica() *replica {
return s.target
}
func (s *replicaSelectorV2) proxyReplica() *replica {
return s.proxy
}
func (s *replicaSelectorV2) getLabels() []*metapb.StoreLabel {
return s.option.labels
}
func (s *replicaSelectorV2) replicaType(_ *RPCContext) string {
if s.target != nil {
if s.target.peer.Id == s.region.GetLeaderPeerID() {
return "leader"
}
return "follower"
}
return "unknown"
}
func (s *replicaSelectorV2) String() string {
if s == nil {
return ""
}
return fmt.Sprintf("replicaSelectorV2{replicaReadType: %v, attempts: %v, %v}", s.replicaReadType.String(), s.attempts, s.baseReplicaSelector.String())
}