Skip to content

Commit

Permalink
[YUNIKORN-2908] metrics not removed when a queue is removed
Browse files Browse the repository at this point in the history
  • Loading branch information
Hengzhe committed Oct 24, 2024
1 parent 079a02d commit e175210
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 0 deletions.
9 changes: 9 additions & 0 deletions pkg/metrics/init.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,15 @@ func GetQueueMetrics(name string) *QueueMetrics {
return queueMetrics
}

func RemoveQueueMetrics(name string) {
m.lock.Lock()
defer m.lock.Unlock()
if metrics, ok := m.queues[name]; ok {
metrics.UnregisterMetrics()
delete(m.queues, name)
}
}

func GetEventMetrics() *EventMetrics {
return m.event
}
Expand Down
15 changes: 15 additions & 0 deletions pkg/metrics/queue.go
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,21 @@ func InitQueueMetrics(name string) *QueueMetrics {
return q
}

func (m *QueueMetrics) UnregisterMetrics() {
var queueMetricsList = []prometheus.Collector{
m.appMetricsLabel,
m.appMetricsSubsystem,
m.containerMetrics,
m.resourceMetricsLabel,
m.resourceMetricsSubsystem,
}

// Unregister the metrics
for _, metric := range queueMetricsList {
prometheus.Unregister(metric)
}
}

func (m *QueueMetrics) incQueueApplications(state string) {
m.appMetricsLabel.WithLabelValues(state).Inc()
m.appMetricsSubsystem.WithLabelValues(state).Inc()
Expand Down
17 changes: 17 additions & 0 deletions pkg/metrics/queue_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,23 @@ func TestQueuePreemptingResourceMetrics(t *testing.T) {
verifyResourceMetrics(t, "preempting", "cpu")
}

func TestRemoveQueueMetrics(t *testing.T) {
testQueueName := "root.test"
qm = GetQueueMetrics(testQueueName)
defer unregisterQueueMetrics()

qm.SetQueueAllocatedResourceMetrics("cpu", 1)
assert.Assert(t, m.queues[testQueueName] != nil)
qm.SetQueueAllocatedResourceMetrics("cpu", 1)
RemoveQueueMetrics(testQueueName)
assert.Assert(t, m.queues[testQueueName] == nil)
metrics, _ := prometheus.DefaultGatherer.Gather()

Check failure on line 259 in pkg/metrics/queue_test.go

View workflow job for this annotation

GitHub Actions / build

Error return value of `prometheus.DefaultGatherer.Gather` is not checked (errcheck)
for _, metric := range metrics {
assert.Assert(t, metric.GetName() != "yunikorn_queue_resource",
"Metric is not cleaned up")
}
}

func getQueueMetrics() *QueueMetrics {
return InitQueueMetrics("root.test")
}
Expand Down
5 changes: 5 additions & 0 deletions pkg/scheduler/objects/queue.go
Original file line number Diff line number Diff line change
Expand Up @@ -986,6 +986,7 @@ func (sq *Queue) RemoveQueue() bool {
return false
}
log.Log(log.SchedQueue).Info("removing queue", zap.String("queue", sq.QueuePath))
sq.removeMetrics()
// root is always managed and is the only queue with a nil parent: no need to guard
sq.parent.removeChildQueue(sq.Name)
sq.queueEvents.SendRemoveQueueEvent(sq.QueuePath, sq.isManaged)
Expand Down Expand Up @@ -1696,6 +1697,10 @@ func (sq *Queue) updatePreemptingResourceMetrics() {
}
}

func (sq *Queue) removeMetrics() {
metrics.RemoveQueueMetrics(sq.QueuePath)
}

func (sq *Queue) String() string {
sq.RLock()
defer sq.RUnlock()
Expand Down

0 comments on commit e175210

Please sign in to comment.