Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add gauges for all monitors in content node #3770

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -61,13 +61,6 @@ const healthCheckController = async (req) => {
randomBytesToSign
)

const prometheusRegistry = req.app.get('serviceRegistry').prometheusRegistry
const storagePathSizeMetric = prometheusRegistry.getMetric(
prometheusRegistry.metricNames.STORAGE_PATH_SIZE_BYTES
)
storagePathSizeMetric.set({ type: 'total' }, response.storagePathSize)
storagePathSizeMetric.set({ type: 'used' }, response.storagePathUsed)

if (enforceStateMachineQueueHealth) {
const { stateMachineJobs } = response
const {
Expand Down
51 changes: 38 additions & 13 deletions creator-node/src/monitors/MonitoringQueue.js
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
const Bull = require('bull')
const redis = require('../redis')
const config = require('../config')
const { MONITORS, getMonitorRedisKey } = require('./monitors')
const {
MONITORS,
PROMETHEUS_MONITORS,
getMonitorRedisKey
} = require('./monitors')
const { logger } = require('../logging')

const QUEUE_INTERVAL_MS = 60 * 1000
Expand All @@ -22,7 +26,7 @@ const MONITORING_QUEUE_HISTORY = 500
* 2. Refreshes the value and stores the update in redis
*/
class MonitoringQueue {
constructor() {
constructor(prometheusRegistry) {
this.queue = new Bull('monitoring-queue', {
redis: {
port: config.get('redisPort'),
Expand All @@ -34,6 +38,8 @@ class MonitoringQueue {
}
})

this.prometheusRegistry = prometheusRegistry

// Clean up anything that might be still stuck in the queue on restart
this.queue.empty()

Expand All @@ -48,11 +54,11 @@ class MonitoringQueue {

// Iterate over each monitor and set a new value if the cached
// value is not fresh.
Object.values(MONITORS).forEach(async (monitor) => {
Object.keys(MONITORS).forEach(async (monitorKey) => {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

need the key to look up the metric form METRIC_NAMES

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Object.entries(MONITORS).forEach(async (monitorKey, monitorProps) => {} would be v clean here 🙂

try {
await this.refresh(monitor)
await this.refresh(MONITORS[monitorKey], monitorKey)
} catch (e) {
this.logStatus(`Error on ${monitor.name} ${e}`)
this.logStatus(`Error on ${monitorKey} ${e}`)
}
})

Expand All @@ -71,29 +77,48 @@ class MonitoringQueue {
* them on init
*/
async seedInitialValues() {
await this.refresh(MONITORS.STORAGE_PATH_SIZE)
await this.refresh(MONITORS.STORAGE_PATH_USED)
await this.refresh(MONITORS.STORAGE_PATH_SIZE, 'STORAGE_PATH_SIZE')
await this.refresh(MONITORS.STORAGE_PATH_USED, 'STORAGE_PATH_USED')
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

not ideal but w/e

}

async refresh(monitor) {
const key = getMonitorRedisKey(monitor)
/**
* Refresh monitor in redis and prometheus (if integer)
* @param {Object} monitorVal Object containing the monitor props like { func, ttl, type, name }
* @param {*} monitorKey name of the monitor eg `THIRTY_DAY_ROLLING_SYNC_SUCCESS_COUNT`
*/
async refresh(monitorVal, monitorKey) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we do refresh(monitorKey, monitorPropsObj) or smth for clarity this is v confusing since its ordered backward and also "monitorVal" is actually not the value

Copy link
Contributor Author

@dmanjunath dmanjunath Sep 28, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ass far as the order everything else in this file is monitor first so i was trying to preserve the consistency. but i do agree the monitorVal name is confusing

const key = getMonitorRedisKey(monitorVal)
const ttlKey = `${key}:ttl`

// If the value is fresh, exit early
const isFresh = await redis.get(ttlKey)
if (isFresh) return

const value = await monitor.func()
this.logStatus(`Computed value for ${monitor.name} ${value}`)
const value = await monitorVal.func()
this.logStatus(`Computed value for ${monitorVal.name} ${value}`)

// store integer monitors in prometheus
try {
if (PROMETHEUS_MONITORS.hasOwnProperty(monitorKey)) {
const metric = this.prometheusRegistry.getMetric(
this.prometheusRegistry.metricNames[`MONITOR_${monitorKey}`]
)
metric.set({}, value)
}
} catch (e) {
logger.debug(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if anything above log 98 should be a debug and this one should be info/warn haha

`Couldn't store value: ${value} in prometheus for metric: ${monitorKey}`
)
}

// Set the value
redis.set(key, value)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

btw the bug i mentioned on linear is that this call is not awaited (although that might have been intentional)


if (monitor.ttl) {
if (monitorVal.ttl) {
// Set a TTL (in seconds) key to track when this value needs refreshing.
// We store a separate TTL key rather than expiring the value itself
// so that in the case of an error, the current value can still be read
redis.set(ttlKey, 1, 'EX', monitor.ttl)
redis.set(ttlKey, 1, 'EX', monitorVal.ttl)
}
}

Expand Down
7 changes: 7 additions & 0 deletions creator-node/src/monitors/monitors.js
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,12 @@ const MONITORS = {
LATEST_FIND_REPLICA_SET_UPDATES_JOB_SUCCESS
}

const PROMETHEUS_MONITORS = {}
for (const monitor in MONITORS) {
if (MONITORS[monitor]?.type === 'int')
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

whats your thinking for the non-int monitors? we gonna keep writing them all to redis for now unchanged?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yep exactly, keep as is

PROMETHEUS_MONITORS[monitor] = MONITORS[monitor]
}

const getMonitorRedisKey = (monitor) =>
`${MONITORING_REDIS_PREFIX}:${monitor.name}`

Expand Down Expand Up @@ -346,6 +352,7 @@ const getMonitors = async (monitors) => {

module.exports = {
MONITORS,
PROMETHEUS_MONITORS,
getMonitorRedisKey,
getMonitors
}
2 changes: 1 addition & 1 deletion creator-node/src/serviceRegistry.js
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ class ServiceRegistry {
this.trustedNotifierManager = null // Service that blacklists content on behalf of Content Nodes

// Queues
this.monitoringQueue = new MonitoringQueue() // Recurring job to monitor node state & performance metrics
this.monitoringQueue = new MonitoringQueue(this.prometheusRegistry) // Recurring job to monitor node state & performance metrics
this.sessionExpirationQueue = new SessionExpirationQueue() // Recurring job to clear expired session tokens from Redis and DB
this.imageProcessingQueue = new ImageProcessingQueue() // Resizes all images on Audius
this.transcodingQueue = TranscodingQueue // Transcodes and segments all tracks
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import {
// eslint-disable-next-line import/no-unresolved
} from '../stateMachineManager/stateMachineConstants'
import * as config from '../../config'
import { PROMETHEUS_MONITORS } from '../../monitors/monitors'

/**
* For explanation of METRICS, and instructions on how to add a new metric, please see `prometheusMonitoring/README.md`
Expand Down Expand Up @@ -58,8 +59,7 @@ const metricNames: Record<string, string> = {
JOBS_ATTEMPTS_HISTOGRAM: 'jobs_attempts',
RECOVER_ORPHANED_DATA_WALLET_COUNTS_GAUGE:
'recover_orphaned_data_wallet_counts',
RECOVER_ORPHANED_DATA_SYNC_COUNTS_GAUGE: 'recover_orphaned_data_sync_counts',
STORAGE_PATH_SIZE_BYTES: 'storage_path_size_bytes'
RECOVER_ORPHANED_DATA_SYNC_COUNTS_GAUGE: 'recover_orphaned_data_sync_counts'
}
// Add a histogram for each job in the state machine queues.
// Some have custom labels below, and all of them use the label: uncaughtError=true/false
Expand All @@ -70,6 +70,11 @@ for (const jobName of Object.values(
`STATE_MACHINE_${jobName}_JOB_DURATION_SECONDS_HISTOGRAM`
] = `state_machine_${snakeCase(jobName)}_job_duration_seconds`
}
// Add gauge for each monitor
for (const monitor of Object.keys(PROMETHEUS_MONITORS)) {
metricNames[`MONITOR_${monitor}`] = `monitor_${snakeCase(monitor)}`
}

export const METRIC_NAMES = Object.freeze(
mapValues(metricNames, (metricName) => `${NAMESPACE_PREFIX}_${metricName}`)
)
Expand Down Expand Up @@ -348,6 +353,22 @@ export const METRICS: Record<string, Metric> = Object.freeze({
}
])
),
// Add gauge for each monitor
...Object.fromEntries(
Object.keys(PROMETHEUS_MONITORS).map((monitor) => {
return [
METRIC_NAMES[`MONITOR_${monitor}`],
{
metricType: METRIC_TYPES.GAUGE,
metricConfig: {
name: METRIC_NAMES[`MONITOR_${monitor}`],
help: `Record monitor: ${monitor}`,
labelNames: []
}
}
]
})
),
[METRIC_NAMES.FIND_SYNC_REQUEST_COUNTS_GAUGE]: {
metricType: METRIC_TYPES.GAUGE,
metricConfig: {
Expand All @@ -374,14 +395,6 @@ export const METRICS: Record<string, Metric> = Object.freeze({
5
)
}
},
[METRIC_NAMES.STORAGE_PATH_SIZE_BYTES]: {
metricType: METRIC_TYPES.GAUGE,
metricConfig: {
name: METRIC_NAMES.STORAGE_PATH_SIZE_BYTES,
help: 'Disk storage size',
labelNames: ['type']
}
}
})

Expand Down
7 changes: 4 additions & 3 deletions creator-node/test/lib/app.js
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ async function getApp(
libs: libsClient,
blacklistManager,
redis: redisClient,
monitoringQueue: new MonitoringQueueMock(),
monitoringQueue: new MonitoringQueueMock(prometheusRegistry),
asyncProcessingQueue: apq,
imageProcessingQueue: new ImageProcessingQueue(),
nodeConfig,
Expand All @@ -55,15 +55,16 @@ async function getApp(
}

function getServiceRegistryMock(libsClient, blacklistManager) {
const prometheusRegistry = new PrometheusRegistry()
return {
libs: libsClient,
blacklistManager: blacklistManager,
redis: redisClient,
monitoringQueue: new MonitoringQueueMock(),
monitoringQueue: new MonitoringQueueMock(prometheusRegistry),
syncQueue: new SyncQueue(nodeConfig, redisClient),
nodeConfig,
initLibs: async function () {},
prometheusRegistry: new PrometheusRegistry()
prometheusRegistry
}
}

Expand Down
3 changes: 2 additions & 1 deletion creator-node/test/lib/monitoringQueueMock.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@ const redisClient = require('../../src/redis')

// Mock monitoring queue that sets monitor values on construction
class MonitoringQueueMock {
constructor () {
constructor (prometheusRegistry) {
this.prometheusRegistry = prometheusRegistry
redisClient.set(
getMonitorRedisKey(MONITORS.DATABASE_LIVENESS),
'true'
Expand Down