From d632a4c0b2a05532253941dfb3bcc7d581b75142 Mon Sep 17 00:00:00 2001
From: mittalrishabh
Date: Tue, 21 Oct 2025 19:12:46 -0700
Subject: [PATCH] retry on leader for region not available (#1769)

Signed-off-by: rishabh_mittal
Co-authored-by: rishabh_mittal
---
 internal/locate/region_cache.go              |  8 ++++++++
 internal/locate/region_request.go            |  8 ++++++--
 internal/locate/region_request_state_test.go |  8 ++++----
 internal/locate/replica_selector.go          | 17 +++++++++++++++++
 internal/locate/replica_selector_test.go     | 14 +++++++++++++-
 5 files changed, 48 insertions(+), 7 deletions(-)

diff --git a/internal/locate/region_cache.go b/internal/locate/region_cache.go
index 9cab1b70..0ac2bf5d 100644
--- a/internal/locate/region_cache.go
+++ b/internal/locate/region_cache.go
@@ -1851,6 +1851,14 @@ func (c *RegionCache) BatchLoadRegionsFromKey(bo *retry.Backoffer, startKey []by
 	return regions[len(regions)-1].EndKey(), nil
 }
 
+func (c *RegionCache) AsyncInvalidateCachedRegion(id RegionVerID) {
+	cachedRegion := c.GetCachedRegionWithRLock(id)
+	if cachedRegion == nil {
+		return
+	}
+	cachedRegion.setSyncFlags(needDelayedReloadPending)
+}
+
 // InvalidateCachedRegion removes a cached Region.
 func (c *RegionCache) InvalidateCachedRegion(id RegionVerID) {
 	c.InvalidateCachedRegionWithReason(id, Other)
diff --git a/internal/locate/region_request.go b/internal/locate/region_request.go
index 548832e1..b84789da 100644
--- a/internal/locate/region_request.go
+++ b/internal/locate/region_request.go
@@ -1956,8 +1956,12 @@ func (s *RegionRequestSender) onRegionError(
 	// This peer is removed from the region. Invalidate the region since it's too stale.
 	// if the region error is from follower, can we mark the peer unavailable and reload region asynchronously?
 	if regionErr.GetRegionNotFound() != nil {
-		s.regionCache.InvalidateCachedRegion(ctx.Region)
-		return false, nil
+		if s.replicaSelector != nil {
+			return s.replicaSelector.onRegionNotFound(bo, ctx, req)
+		} else {
+			s.regionCache.InvalidateCachedRegion(ctx.Region)
+			return false, nil
+		}
 	}
 
 	if regionErr.GetKeyNotInRegion() != nil {
diff --git a/internal/locate/region_request_state_test.go b/internal/locate/region_request_state_test.go
index cd43d86a..cf320a24 100644
--- a/internal/locate/region_request_state_test.go
+++ b/internal/locate/region_request_state_test.go
@@ -303,11 +303,11 @@ func testRegionCacheStaleRead(t *testing.T) {
 			leaderAsyncReload:       util.Some(false),
 			leaderSuccessReplica:    []string{"z1"},
 			leaderSuccessReadType:   SuccessStaleRead,
-			followerRegionValid:     false,
-			followerAsyncReload:     util.Some(false),
+			followerRegionValid:     true,
+			followerAsyncReload:     util.Some(true),
 			// may async reload region and access it from leader.
-			followerSuccessReplica:  []string{},
-			followerSuccessReadType: ReadFail,
+			followerSuccessReplica:  []string{"z1"},
+			followerSuccessReadType: SuccessStaleRead,
 		},
 		{
 			do: evictLeader,
diff --git a/internal/locate/replica_selector.go b/internal/locate/replica_selector.go
index a29f924d..0c053903 100644
--- a/internal/locate/replica_selector.go
+++ b/internal/locate/replica_selector.go
@@ -517,6 +517,23 @@ func (s *replicaSelector) onDataIsNotReady() {
 	}
 }
 
+func (s *replicaSelector) onRegionNotFound(
+	bo *retry.Backoffer, ctx *RPCContext, req *tikvrpc.Request,
+) (shouldRetry bool, err error) {
+	leaderIdx := s.region.getStore().workTiKVIdx
+	leader := s.replicas[leaderIdx]
+	if !leader.isExhausted(1, 0) {
+		// If the request was not sent to the leader, we can retry it on the leader and invalidate the region cache asynchronously. This helps when
+		// the region was split on the leader but the new region has not yet been created on the replica because the replica was down.
+		req.SetReplicaReadType(kv.ReplicaReadLeader)
+		s.replicaReadType = kv.ReplicaReadLeader
+		s.regionCache.AsyncInvalidateCachedRegion(ctx.Region)
+		return true, nil
+	}
+	s.regionCache.InvalidateCachedRegion(ctx.Region)
+	return false, nil
+}
+
 func (s *replicaSelector) onServerIsBusy(
 	bo *retry.Backoffer, ctx *RPCContext, req *tikvrpc.Request, serverIsBusy *errorpb.ServerIsBusy,
 ) (shouldRetry bool, err error) {
diff --git a/internal/locate/replica_selector_test.go b/internal/locate/replica_selector_test.go
index 87d8acdc..50617f83 100644
--- a/internal/locate/replica_selector_test.go
+++ b/internal/locate/replica_selector_test.go
@@ -1149,12 +1149,24 @@ func testReplicaReadAccessPathByBasicCase(s *testReplicaSelectorSuite) {
 				respErr = "region 0 is not prepared for the flashback"
 				respRegionError = nil
 				regionIsValid = true
+			case RegionNotFoundErr:
+				regionIsValid = false
 			}
 			switch readType {
 			case kv.ReplicaReadLeader:
 				accessPath = []string{"{addr: store1, replica-read: false, stale-read: false}"}
 			case kv.ReplicaReadFollower:
-				accessPath = []string{"{addr: store2, replica-read: true, stale-read: false}"}
+				// For RegionNotFoundErr from a follower, the request is retried on the leader.
+				if tp == RegionNotFoundErr {
+					accessPath = []string{
+						"{addr: store2, replica-read: true, stale-read: false}",
+						"{addr: store1, replica-read: false, stale-read: false}",
+					}
+					respRegionError = nil
+					regionIsValid = true
+				} else {
+					accessPath = []string{"{addr: store2, replica-read: true, stale-read: false}"}
+				}
 			case kv.ReplicaReadMixed:
 				if staleRead {
 					accessPath = []string{"{addr: store1, replica-read: false, stale-read: true}"}
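
The decision the new onRegionNotFound hook makes can be summarized in a minimal, standalone Go sketch. The onRegionNotFound function and its boolean flags below are simplified stand-ins invented for illustration, not the actual replicaSelector API: while the leader replica still has an attempt left, the retry is redirected to the leader and the cached region is only flagged for an asynchronous reload; once the leader has also failed with RegionNotFound, the cached region is invalidated immediately and the request stops retrying.

package main

import "fmt"

// onRegionNotFound mirrors the selector behavior introduced by this patch:
// as long as the leader replica has not been exhausted, redirect the retry to
// the leader and only schedule an asynchronous reload of the cached region;
// once the leader has also failed, invalidate the cached region and give up.
func onRegionNotFound(leaderExhausted bool) (shouldRetry, retryOnLeader, invalidateNow bool) {
	if !leaderExhausted {
		// A follower may simply not know the region yet (for example after a
		// split it has not applied because it was down), so the leader is
		// still worth a try; the cache is refreshed in the background.
		return true, true, false
	}
	// The leader itself reported RegionNotFound: the cached region is stale.
	return false, false, true
}

func main() {
	retry, onLeader, invalidate := onRegionNotFound(false)
	fmt.Printf("follower failed: retry=%v onLeader=%v invalidateNow=%v\n", retry, onLeader, invalidate)
	retry, _, invalidate = onRegionNotFound(true)
	fmt.Printf("leader failed: retry=%v invalidateNow=%v\n", retry, invalidate)
}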