
schedule: support patrol region concurrency #8094

Merged — 48 commits merged into master on Oct 30, 2024.
The changes shown below are from 8 of the 48 commits.

Commits
d1f4b8a
checker: add patrol region concurrency
lhy1024 Apr 22, 2024
6d0fbd4
speedup drain
lhy1024 Apr 22, 2024
9b43d61
fix config
lhy1024 Apr 22, 2024
b2b4f39
make config
lhy1024 Apr 22, 2024
cb285f6
update
lhy1024 May 16, 2024
2e14405
Merge branch 'master' of github.com:tikv/pd into patrol-concurrency-2
lhy1024 May 16, 2024
351ef5c
fix race
lhy1024 May 23, 2024
c198b08
Merge branch 'master' of github.com:tikv/pd into patrol-concurrency-2
lhy1024 May 23, 2024
a0ec33d
fix test
lhy1024 May 24, 2024
ab9ef1e
remove batch limit config
lhy1024 May 24, 2024
97a40a9
address comments
lhy1024 May 27, 2024
438efce
address comments
lhy1024 May 27, 2024
c59b47c
address comments
lhy1024 May 28, 2024
9f57397
Merge branch 'master' into patrol-concurrency
lhy1024 Jun 3, 2024
bbc1362
refactor and add patrol region context
lhy1024 Jun 3, 2024
b0eab80
address comments
lhy1024 Jun 3, 2024
5c442a3
add config test
lhy1024 Jun 4, 2024
0d02d8b
add more tests
lhy1024 Jun 4, 2024
9896228
Merge branch 'master' of github.com:tikv/pd into patrol-concurrency
lhy1024 Jun 12, 2024
a638cce
address comments
lhy1024 Jun 13, 2024
6147373
add some test to cover branches
lhy1024 Jun 18, 2024
2a86197
Merge branch 'master' into patrol-concurrency
lhy1024 Jun 25, 2024
82785c2
address comments
lhy1024 Jul 1, 2024
a21ef83
address comments
lhy1024 Jul 1, 2024
cd1cd8b
address comments
lhy1024 Jul 4, 2024
5668d98
address comments
lhy1024 Jul 4, 2024
78e3ba5
Merge branch 'master' into patrol-concurrency
lhy1024 Jul 4, 2024
cf01076
refactor
lhy1024 Jul 4, 2024
cc51a2e
fix test and add metrics
lhy1024 Jul 4, 2024
9f7406a
fix failpoint
lhy1024 Jul 4, 2024
bd4ca79
Merge branch 'master' of github.com:tikv/pd into patrol-concurrency-4
lhy1024 Jul 16, 2024
ae0778f
fix conflict
lhy1024 Jul 16, 2024
ecb8d8b
Merge branch 'master' of github.com:tikv/pd into patrol-concurrency
lhy1024 Jul 22, 2024
8259ad0
Merge branch 'master' of github.com:tikv/pd into patrol-concurrency
lhy1024 Aug 1, 2024
18db300
fix
lhy1024 Aug 1, 2024
5ead31e
Merge branch 'master' of github.com:tikv/pd into patrol-concurrency
lhy1024 Aug 8, 2024
6bdb436
fix lint
lhy1024 Aug 8, 2024
40a2e02
Merge branch 'master' into patrol-concurrency
lhy1024 Aug 14, 2024
8cd8825
address comments
lhy1024 Aug 29, 2024
bcd5018
address comments
lhy1024 Sep 24, 2024
acb4244
Merge branch 'master' of github.com:tikv/pd into patrol-concurrency
lhy1024 Sep 24, 2024
457da3d
avoid potential data race
lhy1024 Sep 24, 2024
64abc3c
address comments: remove sleep in failpoint
lhy1024 Sep 25, 2024
e570e3c
fix lint
lhy1024 Sep 29, 2024
74d4fdc
Merge branch 'master' into patrol-concurrency
ti-chi-bot[bot] Oct 29, 2024
aeabc52
Merge branch 'master' into patrol-concurrency
ti-chi-bot[bot] Oct 29, 2024
7e5813d
fix lint and make test stable
lhy1024 Oct 30, 2024
240f902
Merge branch 'master' into patrol-concurrency
ti-chi-bot[bot] Oct 30, 2024
10 changes: 5 additions & 5 deletions pkg/cache/cache_test.go
@@ -371,23 +371,23 @@ func TestPriorityQueue(t *testing.T) {
pq.Remove(uint64(1))
re.Nil(pq.Get(1))
re.Equal(2, pq.Len())
entry := pq.Peek()
entry := pq.peek()
re.Equal(2, entry.Priority)
re.Equal(testData[2], entry.Value)

// case3 update 3's priority to highest
pq.Put(-1, testData[3])
entry = pq.Peek()
entry = pq.peek()
re.Equal(-1, entry.Priority)
re.Equal(testData[3], entry.Value)
pq.Remove(entry.Value.ID())
re.Equal(testData[2], pq.Peek().Value)
re.Equal(testData[2], pq.peek().Value)
re.Equal(1, pq.Len())

// case4 remove all element
pq.Remove(uint64(2))
re.Equal(0, pq.Len())
re.Empty(pq.items)
re.Nil(pq.Peek())
re.Nil(pq.Tail())
re.Nil(pq.peek())
re.Nil(pq.tail())
}
21 changes: 17 additions & 4 deletions pkg/cache/priority_queue.go
@@ -16,6 +16,7 @@ package cache

import (
"github.com/tikv/pd/pkg/btree"
"github.com/tikv/pd/pkg/utils/syncutil"
)

// defaultDegree default btree degree, the depth is h<log(degree)(capacity+1)/2
@@ -26,6 +27,7 @@ type PriorityQueue struct {
items map[uint64]*Entry
btree *btree.BTreeG[*Entry]
capacity int
mutex syncutil.RWMutex
}

// NewPriorityQueue construct of priority queue
@@ -44,6 +46,8 @@ type PriorityQueueItem interface {

// Put put value with priority into queue
func (pq *PriorityQueue) Put(priority int, value PriorityQueueItem) bool {
pq.mutex.Lock()
defer pq.mutex.Unlock()
id := value.ID()
entry, ok := pq.items[id]
if !ok {
@@ -54,7 +58,9 @@ func (pq *PriorityQueue) Put(priority int, value PriorityQueueItem) bool {
if !found || !min.Less(entry) {
return false
}
pq.mutex.Unlock()
pq.Remove(min.Value.ID())
pq.mutex.Lock()
}
} else if entry.Priority != priority { // delete before update
pq.btree.Delete(entry)
@@ -68,19 +74,22 @@ func (pq *PriorityQueue) Put(priority int, value PriorityQueueItem) bool {

// Get find entry by id from queue
func (pq *PriorityQueue) Get(id uint64) *Entry {
pq.mutex.RLock()
defer pq.mutex.RUnlock()
return pq.items[id]
}

// Peek return the highest priority entry
func (pq *PriorityQueue) Peek() *Entry {
// peek return the highest priority entry
// It is used for tests only
func (pq *PriorityQueue) peek() *Entry {
if max, ok := pq.btree.Max(); ok {
return max
}
return nil
}

// Tail return the lowest priority entry
func (pq *PriorityQueue) Tail() *Entry {
// tail return the lowest priority entry
func (pq *PriorityQueue) tail() *Entry {
if min, ok := pq.btree.Min(); ok {
return min
}
@@ -89,6 +98,8 @@ func (pq *PriorityQueue) Tail() *Entry {

// Elems return all elements in queue
func (pq *PriorityQueue) Elems() []*Entry {
pq.mutex.RLock()
defer pq.mutex.RUnlock()
rs := make([]*Entry, pq.Len())
count := 0
pq.btree.Descend(func(i *Entry) bool {
@@ -101,6 +112,8 @@ func (pq *PriorityQueue) Elems() []*Entry {

// Remove remove value from queue
func (pq *PriorityQueue) Remove(id uint64) {
pq.mutex.Lock()
defer pq.mutex.Unlock()
if v, ok := pq.items[id]; ok {
pq.btree.Delete(v)
delete(pq.items, id)
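The new mutex makes the queue safe to call from multiple goroutines, which the concurrent patrol workers rely on. Below is a minimal usage sketch, not part of this PR: it assumes NewPriorityQueue takes a capacity argument and that PriorityQueueItem only requires ID() uint64, as the surrounding diff suggests; regionItem is a hypothetical item type used only for illustration.

```go
package main

import (
	"fmt"
	"sync"

	"github.com/tikv/pd/pkg/cache"
)

// regionItem is a hypothetical PriorityQueueItem used only for this sketch.
type regionItem struct{ id uint64 }

func (r regionItem) ID() uint64 { return r.id }

func main() {
	// Capacity of 1024 is an arbitrary choice for the sketch.
	pq := cache.NewPriorityQueue(1024)

	var wg sync.WaitGroup
	for w := 0; w < 4; w++ {
		wg.Add(1)
		go func(base uint64) {
			defer wg.Done()
			// Put now takes the internal mutex, so concurrent calls from
			// several goroutines no longer race on the map and btree.
			for i := uint64(0); i < 100; i++ {
				pq.Put(int(base+i), regionItem{id: base + i})
			}
		}(uint64(w) * 100)
	}
	wg.Wait()
	fmt.Println("entries queued:", pq.Len())
}
```

Before this change, the same access pattern would race on the shared items map and btree, since none of the methods took a lock.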
10 changes: 10 additions & 0 deletions pkg/mcs/scheduling/server/config/config.go
@@ -396,6 +396,16 @@ func (o *PersistConfig) GetHotRegionCacheHitsThreshold() int {
return int(o.GetScheduleConfig().HotRegionCacheHitsThreshold)
}

// GetPatrolRegionConcurrency returns the worker count of the patrol.
func (o *PersistConfig) GetPatrolRegionConcurrency() int {
return int(o.GetScheduleConfig().PatrolRegionConcurrency)
}

// GetPatrolRegionBatchLimit returns the number of regions to patrol in one batch.
func (o *PersistConfig) GetPatrolRegionBatchLimit() int {
return int(o.GetScheduleConfig().PatrolRegionBatchLimit)
}

// GetMaxMovableHotPeerSize returns the max movable hot peer size.
func (o *PersistConfig) GetMaxMovableHotPeerSize() int64 {
return o.GetScheduleConfig().MaxMovableHotPeerSize
15 changes: 15 additions & 0 deletions pkg/schedule/config/config.go
@@ -63,6 +63,8 @@ const (
defaultRegionScoreFormulaVersion = "v2"
defaultLeaderSchedulePolicy = "count"
defaultStoreLimitVersion = "v1"
defaultPatrolRegionConcurrency = 1
defaultPatrolRegionBatchLimit = 128
lhy1024 (Contributor, Author): Maybe we can use max(128, region_count/1024).

(A sketch of this idea appears after this file's diff.)

// DefaultSplitMergeInterval is the default value of config split merge interval.
DefaultSplitMergeInterval = time.Hour
defaultSwitchWitnessInterval = time.Hour
@@ -305,6 +307,12 @@ type ScheduleConfig struct {
// HaltScheduling is the option to halt the scheduling. Once it's on, PD will halt the scheduling,
// and any other scheduling configs will be ignored.
HaltScheduling bool `toml:"halt-scheduling" json:"halt-scheduling,string,omitempty"`

// PatrolRegionConcurrency is the number of workers to patrol region.
PatrolRegionConcurrency uint64 `toml:"patrol-worker-count" json:"patrol-worker-count"`

// PatrolRegionBatchLimit is the number of regions to patrol in one batch.
PatrolRegionBatchLimit uint64 `toml:"patrol-region-batch-limit" json:"patrol-region-batch-limit"`
}

// Clone returns a cloned scheduling configuration.
@@ -374,6 +382,13 @@ func (c *ScheduleConfig) Adjust(meta *configutil.ConfigMetaData, reloading bool)
if !meta.IsDefined("store-limit-version") {
configutil.AdjustString(&c.StoreLimitVersion, defaultStoreLimitVersion)
}
if !meta.IsDefined("patrol-worker-count") {
configutil.AdjustUint64(&c.PatrolRegionConcurrency, defaultPatrolRegionConcurrency)
}

if !meta.IsDefined("patrol-region-batch-limit") {
configutil.AdjustUint64(&c.PatrolRegionBatchLimit, defaultPatrolRegionBatchLimit)
}

if !meta.IsDefined("enable-joint-consensus") {
c.EnableJointConsensus = defaultEnableJointConsensus
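The author's comment on defaultPatrolRegionBatchLimit above floats scaling the batch with the cluster's region count. The following is a hedged sketch of that max(128, region_count/1024) idea; it is hypothetical and not what this PR implements — the PR keeps a fixed default of 128.

```go
package config

// adaptivePatrolBatchLimit is a hypothetical helper sketching the reviewer's
// max(128, region_count/1024) suggestion: keep the current default of 128 as
// a floor and grow the batch limit with the total number of regions.
func adaptivePatrolBatchLimit(regionCount uint64) uint64 {
	const defaultPatrolRegionBatchLimit = 128
	if limit := regionCount / 1024; limit > defaultPatrolRegionBatchLimit {
		return limit
	}
	return defaultPatrolRegionBatchLimit
}
```

With 1,000,000 regions this would yield a batch of about 976 instead of 128.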
4 changes: 4 additions & 0 deletions pkg/schedule/config/config_provider.go
@@ -62,6 +62,8 @@ type SchedulerConfigProvider interface {
GetHotRegionCacheHitsThreshold() int
GetMaxMovableHotPeerSize() int64
IsTraceRegionFlow() bool
GetPatrolRegionConcurrency() int
GetPatrolRegionBatchLimit() int

GetTolerantSizeRatio() float64
GetLeaderSchedulePolicy() constant.SchedulePolicy
@@ -117,6 +119,8 @@ type SharedConfigProvider interface {
IsPlacementRulesCacheEnabled() bool
SetHaltScheduling(bool, string)
GetHotRegionCacheHitsThreshold() int
GetPatrolRegionConcurrency() int
GetPatrolRegionBatchLimit() int

// for test purpose
SetPlacementRuleEnabled(bool)
130 changes: 98 additions & 32 deletions pkg/schedule/coordinator.go
@@ -51,9 +51,8 @@ const (
maxLoadConfigRetries = 10
// pushOperatorTickInterval is the interval try to push the operator.
pushOperatorTickInterval = 500 * time.Millisecond
patrolRegionChanLen = 1024

// It takes about 1.3 minutes(1000000/128*10/60/1000) to iterate 1 million regions(with DefaultPatrolRegionInterval=10ms).
patrolScanRegionLimit = 128
// PluginLoad means action for load plugin
PluginLoad = "PluginLoad"
// PluginUnload means action for unload plugin
@@ -157,11 +156,16 @@ func (c *Coordinator) IsPendingRegion(region uint64) bool {
// The function is exposed for test purpose.
func (c *Coordinator) PatrolRegions() {
defer logutil.LogPanic()

defer c.wg.Done()
ticker := time.NewTicker(c.cluster.GetCheckerConfig().GetPatrolRegionInterval())
defer ticker.Stop()

workersCount := c.cluster.GetCheckerConfig().GetPatrolRegionConcurrency()
regionChan := make(chan *core.RegionInfo, patrolRegionChanLen)
quit := make(chan bool)
var wg sync.WaitGroup
c.startPatrolRegionWorkers(workersCount, regionChan, quit, &wg)

log.Info("coordinator starts patrol regions")
start := time.Now()
var (
@@ -173,42 +177,104 @@
case <-ticker.C:
// Note: we reset the ticker here to support updating configuration dynamically.
ticker.Reset(c.cluster.GetCheckerConfig().GetPatrolRegionInterval())
newWorkersCount := c.cluster.GetCheckerConfig().GetPatrolRegionConcurrency()
if newWorkersCount != workersCount {
log.Info("coordinator starts patrol regions with new workers count",
zap.Int("old-workers-count", workersCount),
zap.Int("new-workers-count", newWorkersCount))
workersCount = newWorkersCount
close(quit)
wg.Wait()
quit = make(chan bool)
c.startPatrolRegionWorkers(workersCount, regionChan, quit, &wg)
}
if c.cluster.IsSchedulingHalted() {
rleungx (Member), Jun 13, 2024: Shall we check it first?

lhy1024 (Contributor, Author): No, we need to update the config. If we check IsSchedulingHalted first, it will skip updating the config.

rleungx (Member), Jun 18, 2024: If IsSchedulingHalted is true, do we need to update the config immediately?

lhy1024 (Contributor, Author): If IsSchedulingHalted is true while checking first, we won't be able to update both configurations, because L300 will skip the later code. However, if IsSchedulingHalted is true, it seems we don't need to update the configurations either?

for len(regionChan) > 0 {
<-regionChan
}
continue
}

// Check priority regions first.
c.waitDrainRegionChan(regionChan)
c.checkPriorityRegions()
// Check suspect regions first.
c.waitDrainRegionChan(regionChan)
c.checkSuspectRegions(regionChan)
// Check regions in the waiting list
c.waitDrainRegionChan(regionChan)
c.checkWaitingRegions(regionChan)

c.waitDrainRegionChan(regionChan)
key, regions = c.checkRegions(key, c.cluster.GetCheckerConfig().GetPatrolRegionBatchLimit(), regionChan)
if len(regions) == 0 {
continue
}
// Updates the label level isolation statistics.
c.cluster.UpdateRegionsLabelLevelStats(regions)
if len(key) == 0 {
dur := time.Since(start)
patrolCheckRegionsGauge.Set(dur.Seconds())
Member: Does it still mean a round for all regions?

lhy1024 (Contributor, Author): Yes.

c.setPatrolRegionsDuration(dur)
start = time.Now()
}
failpoint.Inject("break-patrol", func() {
failpoint.Break()
})
case <-c.ctx.Done():
patrolCheckRegionsGauge.Set(0)
c.setPatrolRegionsDuration(0)
log.Info("patrol regions has been stopped")
close(regionChan)
close(quit)
wg.Wait()
return
}
if c.cluster.IsSchedulingHalted() {
continue
}
}
}

// Check priority regions first.
c.checkPriorityRegions()
// Check suspect regions first.
c.checkSuspectRegions()
// Check regions in the waiting list
c.checkWaitingRegions()
func (c *Coordinator) startPatrolRegionWorkers(workers int, regionChan <-chan *core.RegionInfo, quit <-chan bool, wg *sync.WaitGroup) {
for i := 0; i < workers; i++ {
wg.Add(1)
go func() {
defer logutil.LogPanic()
defer wg.Done()
for {
patrolCheckRegionsChanLenGauge.Set(float64(len(regionChan)))
select {
case region, ok := <-regionChan:
if ok {
c.tryAddOperators(region)
}
case <-quit:
return
}
}
}()
}
}

key, regions = c.checkRegions(key)
if len(regions) == 0 {
continue
}
// Updates the label level isolation statistics.
c.cluster.UpdateRegionsLabelLevelStats(regions)
if len(key) == 0 {
dur := time.Since(start)
patrolCheckRegionsGauge.Set(dur.Seconds())
c.setPatrolRegionsDuration(dur)
start = time.Now()
// waitDrainRegionChan is used to drain the regionChan.
// It is used to avoid duplicated regions in the regionChan from different sources.
func (c *Coordinator) waitDrainRegionChan(regionChan chan *core.RegionInfo) {
if len(regionChan) == 0 {
return
}
ticker := time.NewTicker(c.cluster.GetCheckerConfig().GetPatrolRegionInterval())
defer ticker.Stop()
for {
select {
case <-c.ctx.Done():
return
case <-ticker.C:
if len(regionChan) == 0 {
return
}
}
failpoint.Inject("break-patrol", func() {
failpoint.Break()
})
}
}

func (c *Coordinator) checkRegions(startKey []byte) (key []byte, regions []*core.RegionInfo) {
func (c *Coordinator) checkRegions(startKey []byte, patrolScanRegionLimit int, regionChan chan *core.RegionInfo) (key []byte, regions []*core.RegionInfo) {
regions = c.cluster.ScanRegions(startKey, nil, patrolScanRegionLimit)
if len(regions) == 0 {
// Resets the scan key.
@@ -217,25 +283,25 @@ func (c *Coordinator) checkRegions(startKey []byte) (key []byte, regions []*core
}

for _, region := range regions {
c.tryAddOperators(region)
regionChan <- region
Member: What if the channel is full?

lhy1024 (Contributor, Author): In the current code, it will block here until the patrol region workers receive regions.

key = region.GetEndKey()
}
return
}

func (c *Coordinator) checkSuspectRegions() {
func (c *Coordinator) checkSuspectRegions(regionChan chan *core.RegionInfo) {
for _, id := range c.checkers.GetSuspectRegions() {
region := c.cluster.GetRegion(id)
c.tryAddOperators(region)
regionChan <- region
}
}

func (c *Coordinator) checkWaitingRegions() {
func (c *Coordinator) checkWaitingRegions(regionChan chan *core.RegionInfo) {
items := c.checkers.GetWaitingRegions()
waitingListGauge.Set(float64(len(items)))
for _, item := range items {
region := c.cluster.GetRegion(item.Key)
c.tryAddOperators(region)
regionChan <- region
}
}

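To make the coordinator changes easier to follow, here is a minimal, self-contained sketch of the worker-pool pattern that startPatrolRegionWorkers introduces, under simplifying assumptions: Region stands in for *core.RegionInfo, checkRegion stands in for c.tryAddOperators, and the quit channel is a chan struct{} rather than the PR's chan bool.

```go
package main

import (
	"fmt"
	"sync"
)

// Region stands in for *core.RegionInfo in this sketch.
type Region struct{ ID uint64 }

// checkRegion stands in for c.tryAddOperators.
func checkRegion(r Region) { fmt.Println("checked region", r.ID) }

// startWorkers mirrors startPatrolRegionWorkers: each worker drains the
// shared channel until the quit channel is closed.
func startWorkers(workers int, regionCh <-chan Region, quit <-chan struct{}, wg *sync.WaitGroup) {
	for i := 0; i < workers; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for {
				select {
				case r, ok := <-regionCh:
					if ok {
						checkRegion(r)
					}
				case <-quit:
					return
				}
			}
		}()
	}
}

func main() {
	regionCh := make(chan Region, 1024) // mirrors patrolRegionChanLen
	quit := make(chan struct{})
	var wg sync.WaitGroup

	startWorkers(4, regionCh, quit, &wg)
	for id := uint64(1); id <= 8; id++ {
		// Sends block when the channel is full, which is the behavior
		// discussed in the review thread above.
		regionCh <- Region{ID: id}
	}
	close(quit) // like the reconfiguration path, stop the current workers
	wg.Wait()   // regions still buffered in regionCh may remain unprocessed here
}
```

In the PR itself, when patrol-worker-count changes, PatrolRegions closes quit, waits on the WaitGroup, recreates quit, and starts a new set of workers on the same regionChan, so regions still buffered in the channel are handled by the new workers.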
9 changes: 9 additions & 0 deletions pkg/schedule/metrics.go
@@ -40,10 +40,19 @@ var (
Name: "patrol_regions_time",
Help: "Time spent of patrol checks region.",
})

patrolCheckRegionsChanLenGauge = prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: "pd",
Subsystem: "checker",
Name: "patrol_regions_chan_len",
Help: "Time channel length of patrol checks region.",
})
)

func init() {
prometheus.MustRegister(hotSpotStatusGauge)
prometheus.MustRegister(regionListGauge)
prometheus.MustRegister(patrolCheckRegionsGauge)
prometheus.MustRegister(patrolCheckRegionsChanLenGauge)
}
2 changes: 1 addition & 1 deletion pkg/schedule/operator/operator_controller.go
@@ -461,7 +461,7 @@ func (oc *Controller) checkAddOperator(isPromoting bool, ops ...*Operator) (bool
return false, NotInCreateStatus
}
if !isPromoting && oc.wopStatus.getCount(op.Desc()) >= oc.config.GetSchedulerMaxWaitingOperator() {
log.Debug("exceed max return false", zap.Uint64("waiting", oc.wopStatus.ops[op.Desc()]), zap.String("desc", op.Desc()), zap.Uint64("max", oc.config.GetSchedulerMaxWaitingOperator()))
log.Debug("exceed max return false", zap.Uint64("waiting", oc.wopStatus.getCount(op.Desc())), zap.String("desc", op.Desc()), zap.Uint64("max", oc.config.GetSchedulerMaxWaitingOperator()))
operatorCounter.WithLabelValues(op.Desc(), "exceed-max-waiting").Inc()
return false, ExceedWaitLimit
}