tikv · disksing · Jul 14, 2023 · Jun 15, 2023 · Jun 19, 2023 · Jun 19, 2023
diff --git a/error/error.go b/error/error.go
@@ -246,7 +246,7 @@ type ErrAssertionFailed struct {
 	*kvrpcpb.AssertionFailed
 }
 
-// ErrLockOnlyIfExistsNoReturnValue is used when the flag `LockOnlyIfExists` of `LockCtx` is set, but `ReturnValues“ is not.
+// ErrLockOnlyIfExistsNoReturnValue is used when the flag `LockOnlyIfExists` of `LockCtx` is set, but `ReturnValues` is not.
 type ErrLockOnlyIfExistsNoReturnValue struct {
 	StartTS     uint64
 	ForUpdateTs uint64

diff --git a/internal/locate/region_cache.go b/internal/locate/region_cache.go
@@ -153,6 +153,7 @@ type Region struct {
 	syncFlag      int32          // region need be sync in next turn
 	lastAccess    int64          // last region access time, see checkRegionCacheTTL
 	invalidReason InvalidReason  // the reason why the region is invalidated
+	asyncReload   atomic.Bool    // the region need to be reloaded in async mode
 }
 
 // AccessIndex represent the index for accessIndex array
@@ -1226,6 +1227,36 @@ func (c *RegionCache) LocateRegionByID(bo *retry.Backoffer, regionID uint64) (*K
 	}, nil
 }
 
+func (c *RegionCache) asyncReloadRegion(region *Region) {
+	if region == nil || !region.asyncReload.CompareAndSwap(false, true) {
+		// async reload triggered by other thread.
+		return
+	}
+	go func() {
+		// wait a while for two reasons:
+		// 1. there may an unavailable duration while recreating the connection.
+		// 2. the store may just be started, and wait safe ts synced to avoid the
+		// possible dataIsNotReady error.
+		time.Sleep(10 * time.Second)
+		regionID := region.GetID()
+		if regionID == 0 {
+			return
+		}
+		bo := retry.NewNoopBackoff(context.Background())
+		lr, err := c.loadRegionByID(bo, regionID)
+		if err != nil {
+			// ignore error and use old region info.
+			logutil.Logger(bo.GetCtx()).Error("load region failure",
+				zap.Uint64("regionID", regionID), zap.Error(err))
+			region.asyncReload.Store(false)
+			return
+		}
+		c.mu.Lock()
+		c.insertRegionToCache(lr)
+		c.mu.Unlock()
+	}()
+}
+
 // GroupKeysByRegion separates keys into groups by their belonging Regions.
 // Specially it also returns the first key's region which may be used as the
 // 'PrimaryLockKey' and should be committed ahead of others.
@@ -1399,8 +1430,11 @@ func (mu *regionIndexMu) insertRegionToCache(cachedRegion *Region) {
 		if InvalidReason(atomic.LoadInt32((*int32)(&oldRegion.invalidReason))) == NoLeader {
 			store.workTiKVIdx = (oldRegionStore.workTiKVIdx + 1) % AccessIndex(store.accessStoreNum(tiKVOnly))
 		}
-		// Invalidate the old region in case it's not invalidated and some requests try with the stale region information.
-		oldRegion.invalidate(Other)
+		// If the region info is async reloaded, the old region is still valid.
+		if !oldRegion.asyncReload.Load() {
+			// Invalidate the old region in case it's not invalidated and some requests try with the stale region information.
+			oldRegion.invalidate(Other)
+		}
 		// Don't refresh TiFlash work idx for region. Otherwise, it will always goto a invalid store which
 		// is under transferring regions.
 		store.workTiFlashIdx.Store(oldRegionStore.workTiFlashIdx.Load())
@@ -2507,8 +2541,8 @@ func (s *Store) reResolve(c *RegionCache) (bool, error) {
 }
 
 func (s *Store) getResolveState() resolveState {
-	var state resolveState
 	if s == nil {
+		var state resolveState
 		return state
 	}
 	return resolveState(atomic.LoadUint64(&s.state))

diff --git a/internal/locate/region_request.go b/internal/locate/region_request.go
@@ -573,18 +573,22 @@ func (state *accessFollower) next(bo *retry.Backoffer, selector *replicaSelector
 	if state.option.preferLeader {
 		state.lastIdx = state.leaderIdx
 	}
+	offset := rand.Intn(replicaSize)
 	for i := 0; i < replicaSize && !state.option.leaderOnly; i++ {
-		idx := AccessIndex((int(state.lastIdx) + i) % replicaSize)
 		// If the given store is abnormal to be accessed under `ReplicaReadMixed` mode, we should choose other followers or leader
 		// as candidates to serve the Read request. Meanwhile, we should make the choice of next() meet Uniform Distribution.
-		for cnt := 0; cnt < replicaSize && !state.isCandidate(idx, selector.replicas[idx]); cnt++ {
-			idx = AccessIndex((int(idx) + rand.Intn(replicaSize)) % replicaSize)
-		}
-		if state.isCandidate(idx, selector.replicas[idx]) {
+		idx := AccessIndex((int(state.lastIdx) + i + offset) % replicaSize)
+		selectReplica := selector.replicas[idx]
+		if state.isCandidate(idx, selectReplica) {
 			state.lastIdx = idx
 			selector.targetIdx = idx
 			break
 		}
+		if selectReplica.isEpochStale() &&
+			selectReplica.store.getResolveState() == resolved &&
+			selectReplica.store.getLivenessState() == reachable {
+			selector.regionCache.asyncReloadRegion(selector.region)
+		}
 	}
 	// If there is no candidate, fallback to the leader.
 	if selector.targetIdx < 0 {