diff --git a/internal/locate/pd_codec.go b/internal/locate/pd_codec.go
index 7e3f26ac..07b20177 100644
--- a/internal/locate/pd_codec.go
+++ b/internal/locate/pd_codec.go
@@ -111,18 +111,35 @@ func processRegionResult(region *pd.Region, err error) (*pd.Region, error) {
 	return region, nil
 }
 
+// decodeError happens if the region range key is not well-formed.
+// It indicates TiKV has bugs and the client can't handle such a case,
+// so it should report the error to users soon.
+type decodeError struct {
+	error
+}
+
+// isDecodeError reports whether the root cause of err is a decodeError,
+// accepting both the pointer form and the value form of the type.
+func isDecodeError(err error) bool {
+	_, ok := errors.Cause(err).(*decodeError)
+	if !ok {
+		_, ok = errors.Cause(err).(decodeError)
+	}
+	return ok
+}
+
 func decodeRegionMetaKeyInPlace(r *metapb.Region) error {
 	if len(r.StartKey) != 0 {
 		_, decoded, err := codec.DecodeBytes(r.StartKey, nil)
 		if err != nil {
-			return errors.Trace(err)
+			return &decodeError{err}
 		}
 		r.StartKey = decoded
 	}
 	if len(r.EndKey) != 0 {
 		_, decoded, err := codec.DecodeBytes(r.EndKey, nil)
 		if err != nil {
-			return errors.Trace(err)
+			return &decodeError{err}
 		}
 		r.EndKey = decoded
 	}
diff --git a/internal/locate/region_cache.go b/internal/locate/region_cache.go
index a8057252..92c6a38d 100644
--- a/internal/locate/region_cache.go
+++ b/internal/locate/region_cache.go
@@ -1284,6 +1284,9 @@ func (c *RegionCache) loadRegion(bo *retry.Backoffer, key []byte, isEndKey bool)
 			metrics.RegionCacheCounterWithGetRegionOK.Inc()
 		}
 		if err != nil {
+			if isDecodeError(err) {
+				return nil, errors.Errorf("failed to decode region range key, key: %q, err: %v", key, err)
+			}
 			backoffErr = errors.Errorf("loadRegion from PD failed, key: %q, err: %v", key, err)
 			continue
 		}
@@ -1334,6 +1337,9 @@ func (c *RegionCache) loadRegionByID(bo *retry.Backoffer, regionID uint64) (*Reg
 			metrics.RegionCacheCounterWithGetRegionByIDOK.Inc()
 		}
 		if err != nil {
+			if isDecodeError(err) {
+				return nil, errors.Errorf("failed to decode region range key, regionID: %d, err: %v", regionID, err)
+			}
 			backoffErr = errors.Errorf("loadRegion from PD failed, regionID: %v, err: %v", regionID, err)
 			continue
 		}
@@ -1379,6 +1385,9 @@ func (c *RegionCache) scanRegions(bo *retry.Backoffer, startKey, endKey []byte,
 		}
 		regionsInfo, err := c.pdClient.ScanRegions(ctx, startKey, endKey, limit)
 		if err != nil {
+			if isDecodeError(err) {
+				return nil, errors.Errorf("failed to decode region range key, startKey: %q, limit: %d, err: %v", startKey, limit, err)
+			}
 			metrics.RegionCacheCounterWithScanRegionsError.Inc()
 			backoffErr = errors.Errorf(
 				"scanRegion from PD failed, startKey: %q, limit: %q, err: %v",
diff --git a/internal/locate/region_cache_test.go b/internal/locate/region_cache_test.go
index cd1cbb10..907b9241 100644
--- a/internal/locate/region_cache_test.go
+++ b/internal/locate/region_cache_test.go
@@ -1528,3 +1528,20 @@ func BenchmarkOnRequestFail(b *testing.B) {
 		b.Fatal(len(cache.mu.regions))
 	}
 }
+
+func (s *testRegionCacheSuite) TestNoBackoffWhenFailToDecodeRegion() {
+	region2 := s.cluster.AllocID()
+	newPeers := s.cluster.AllocIDs(2)
+	k := []byte("k")
+	// Use SplitRaw to split a region with non-memcomparable range keys.
+	s.cluster.SplitRaw(s.region1, region2, k, newPeers, newPeers[0])
+	_, err := s.cache.LocateKey(s.bo, k)
+	s.NotNil(err)
+	s.Equal(0, s.bo.GetTotalBackoffTimes())
+	_, err = s.cache.LocateRegionByID(s.bo, region2)
+	s.NotNil(err)
+	s.Equal(0, s.bo.GetTotalBackoffTimes())
+	_, err = s.cache.scanRegions(s.bo, []byte{}, []byte{}, 10)
+	s.NotNil(err)
+	s.Equal(0, s.bo.GetTotalBackoffTimes())
+}