From 500a3430e489bfbd88d93f8e18601507292b243c Mon Sep 17 00:00:00 2001 From: Theo Ilie Date: Fri, 24 Jun 2022 08:08:51 +0000 Subject: [PATCH 1/5] Allow state machine jobs to record metrics --- creator-node/src/serviceRegistry.js | 23 ++- .../prometheus.constants.js | 69 ++++++- .../src/services/stateMachineManager/index.js | 16 +- .../makeCompletedJobEnqueueOtherJobs.js | 121 ------------ .../makeOnCompleteCallback.js | 186 ++++++++++++++++++ .../stateMachineManager/stateMachineUtils.js | 56 +++++- .../issueSyncRequest.jobProcessor.js | 39 +++- .../issueSyncRequest.jobProcessor.test.js | 109 ++++++---- 8 files changed, 431 insertions(+), 188 deletions(-) delete mode 100644 creator-node/src/services/stateMachineManager/makeCompletedJobEnqueueOtherJobs.js create mode 100644 creator-node/src/services/stateMachineManager/makeOnCompleteCallback.js diff --git a/creator-node/src/serviceRegistry.js b/creator-node/src/serviceRegistry.js index 2a354ff1904..3058561561b 100644 --- a/creator-node/src/serviceRegistry.js +++ b/creator-node/src/serviceRegistry.js @@ -181,7 +181,7 @@ class ServiceRegistry { /** * Truncates large JSON data in Bull Board after any of the following is exceeded: - * - 5 levels of nesting + * - 7 levels of nesting * - 10,000 characters (strings only) * - 100 elements (arrays) or 100 keys (objects) * @@ -197,12 +197,21 @@ class ServiceRegistry { truncateBull(dataToTruncate, curDepth = 0) { if ( typeof dataToTruncate === 'object' && - !Array.isArray(dataToTruncate) && dataToTruncate !== null && dataToTruncate !== undefined ) { - if (curDepth < 5) { + if (curDepth < 7) { const newDepth = curDepth + 1 + if (Array.isArray(dataToTruncate)) { + if (dataToTruncate.length > 100) { + return `[Truncated array with ${dataToTruncate.length} elements]` + } + const truncatedArr = [] + dataToTruncate.forEach((element) => { + truncatedArr.push(this.truncateBull(element, newDepth)) + }) + return truncatedArr + } const json = Object.assign({}, dataToTruncate) Object.entries(dataToTruncate).forEach(([key, value]) => { switch (typeof value) { @@ -233,9 +242,9 @@ class ServiceRegistry { }) return json } - return `[Truncated object with ${ - Object.keys(dataToTruncate).length - } keys]` + return Array.isArray(dataToTruncate) + ? 
`[Truncated array with ${dataToTruncate.length} elements]` + : `[Truncated object with ${Object.keys(dataToTruncate).length} keys]` } return dataToTruncate } @@ -262,7 +271,7 @@ class ServiceRegistry { await this._initSnapbackSM() this.stateMachineManager = new StateMachineManager() const { stateMonitoringQueue, stateReconciliationQueue } = - await this.stateMachineManager.init(this.libs) + await this.stateMachineManager.init(this.libs, this.prometheusRegistry) // SyncQueue construction (requires L1 identity) // Note - passes in reference to instance of self (serviceRegistry), a very sub-optimal workaround diff --git a/creator-node/src/services/prometheusMonitoring/prometheus.constants.js b/creator-node/src/services/prometheusMonitoring/prometheus.constants.js index e78d43df003..dfb5f404274 100644 --- a/creator-node/src/services/prometheusMonitoring/prometheus.constants.js +++ b/creator-node/src/services/prometheusMonitoring/prometheus.constants.js @@ -1,5 +1,6 @@ const promClient = require('prom-client') const _ = require('lodash') +const config = require('../../config') /** * For explanation of Metrics, and instructions on how to add a new metric, please see `prometheusMonitoring/README.md` @@ -8,6 +9,25 @@ const _ = require('lodash') // We add a namespace prefix to differentiate internal metrics from those exported by different exporters from the same host const NamespacePrefix = 'audius_cn_' +const MetricLabels = Object.freeze({ + ISSUE_SYNC_SECONDS_WAITING_FOR_COMPLETION_HISTOGRAM: { + // The reason another sync is needed + reason_for_additional_sync: [ + 'secondary_progressed_too_slow', // The secondary sync went through, but its clock value didn't increase enough + 'secondary_failed_to_progress', // The secondary's clock value did not increase at all + 'none' // No additional sync is required -- the first sync was successful + ] + } +}) +const MetricLabelNames = Object.freeze( + Object.fromEntries( + Object.entries(MetricLabels).map(([metric, metricLabels]) => [ + metric, + Object.keys(metricLabels) + ]) + ) +) + /** * @notice Counter and Summary metric types are currently disabled, see README for details. */ @@ -21,12 +41,43 @@ const MetricTypes = Object.freeze({ let MetricNames = { SYNC_QUEUE_JOBS_TOTAL_GAUGE: 'sync_queue_jobs_total', ROUTE_POST_TRACKS_DURATION_SECONDS_HISTOGRAM: - 'route_post_tracks_duration_seconds' + 'route_post_tracks_duration_seconds', + ISSUE_SYNC_SECONDS_WAITING_FOR_COMPLETION_HISTOGRAM: + 'issue_sync_seconds_waiting_for_completion_histogram' } MetricNames = Object.freeze( _.mapValues(MetricNames, (metricName) => NamespacePrefix + metricName) ) +// Uses an implementation similar to the Golang client's ExponentialBucketsRange function: +// https://github.com/prometheus/client_golang/blob/v1.12.2/prometheus/histogram.go#L125 +// ExponentialBucketsRange creates 'count' buckets, where the lowest bucket is +// 'min' and the highest bucket is 'max'. The final +Inf bucket is not counted +// and not included in the returned array. The returned array is meant to be +// used for the buckets field of metricConfig. +// +// The function throws if 'count' is 0 or negative, if 'min' is 0 or negative. +const exponentialBucketsRange = (min, max, count) => { + if (count < 1) { + throw new Error('exponentialBucketsRange count needs a positive count') + } + if (min <= 0) { + throw new Error('ExponentialBucketsRange min needs to be greater than 0') + } + + // Formula for exponential buckets: max = min*growthFactor^(bucketCount-1) + + // We know max/min and highest bucket. 
Solve for growthFactor + const growthFactor = (max / min) ** (1 / (count - 1)) + + // Now that we know growthFactor, solve for each bucket + const buckets = [] + for (let i = 1; i <= count; i++) { + buckets.push(min * growthFactor ** (i - 1)) + } + return buckets +} + const Metrics = Object.freeze({ [MetricNames.SYNC_QUEUE_JOBS_TOTAL_GAUGE]: { metricType: MetricTypes.GAUGE, @@ -45,10 +96,26 @@ const Metrics = Object.freeze({ labelNames: ['code'], buckets: [0.1, 0.3, 0.5, 1, 3, 5, 10] // 0.1 to 10 seconds } + }, + [MetricNames.ISSUE_SYNC_SECONDS_WAITING_FOR_COMPLETION_HISTOGRAM]: { + metricType: MetricTypes.HISTOGRAM, + metricConfig: { + name: MetricNames.ISSUE_SYNC_SECONDS_WAITING_FOR_COMPLETION_HISTOGRAM, + help: 'Seconds spent monitoring an outgoing sync request issued by this node to be completed (successfully or not)', + labelNames: + MetricLabelNames.ISSUE_SYNC_SECONDS_WAITING_FOR_COMPLETION_HISTOGRAM, + // 5 buckets in the range of 1 second to max seconds before timing out a sync request + buckets: exponentialBucketsRange( + 1, + config.get('maxSyncMonitoringDurationInMs') / 1000, + 5 + ) + } } }) module.exports.NamespacePrefix = NamespacePrefix module.exports.MetricTypes = MetricTypes module.exports.MetricNames = MetricNames +module.exports.MetricLabels = MetricLabels module.exports.Metrics = Metrics diff --git a/creator-node/src/services/stateMachineManager/index.js b/creator-node/src/services/stateMachineManager/index.js index 0f58e69dc02..ed56e90c8a5 100644 --- a/creator-node/src/services/stateMachineManager/index.js +++ b/creator-node/src/services/stateMachineManager/index.js @@ -7,7 +7,7 @@ const StateReconciliationManager = require('./stateReconciliation') const NodeToSpIdManager = require('./CNodeToSpIdMapManager') const { RECONFIG_MODES } = require('./stateMachineConstants') const QueueInterfacer = require('./QueueInterfacer') -const makeCompletedJobEnqueueOtherJobs = require('./makeCompletedJobEnqueueOtherJobs') +const makeOnCompleteCallback = require('./makeOnCompleteCallback') /** * Manages the queue for monitoring the state of Content Nodes and @@ -15,7 +15,7 @@ const makeCompletedJobEnqueueOtherJobs = require('./makeCompletedJobEnqueueOther * Use QueueInterfacer for interfacing with the queues. 
*/ class StateMachineManager { - async init(audiusLibs) { + async init(audiusLibs, prometheusRegistry) { this.updateEnabledReconfigModesSet() // TODO: Decide on interval to run this on @@ -40,19 +40,21 @@ class StateMachineManager { ) const stateReconciliationQueue = await stateReconciliationManager.init() - // Make jobs enqueue other jobs as necessary upon completion + // Upon completion, make jobs record metrics and enqueue other jobs as necessary stateMonitoringQueue.on( 'global:completed', - makeCompletedJobEnqueueOtherJobs( + makeOnCompleteCallback( stateMonitoringQueue, - stateReconciliationQueue + stateReconciliationQueue, + prometheusRegistry ).bind(this) ) stateReconciliationQueue.on( 'global:completed', - makeCompletedJobEnqueueOtherJobs( + makeOnCompleteCallback( stateMonitoringQueue, - stateReconciliationQueue + stateReconciliationQueue, + prometheusRegistry ).bind(this) ) diff --git a/creator-node/src/services/stateMachineManager/makeCompletedJobEnqueueOtherJobs.js b/creator-node/src/services/stateMachineManager/makeCompletedJobEnqueueOtherJobs.js deleted file mode 100644 index e5348c0c01d..00000000000 --- a/creator-node/src/services/stateMachineManager/makeCompletedJobEnqueueOtherJobs.js +++ /dev/null @@ -1,121 +0,0 @@ -const { logger: baseLogger, createChildLogger } = require('../../logging') -const { QUEUE_NAMES, JOB_NAMES } = require('./stateMachineConstants') - -/** - * Higher order function that creates a function that can be used as a Bull Queue onComplete callback to take - * a job that successfully completed and read its output for new jobs that will be enqueued in bulk. - * The expected syntax that a job result must output in order to for this function to enqueue more - * jobs upon completion is: - * { - * jobsToEnqueue: { - * [QUEUE_NAMES.STATE_MONITORING]: [ - * { - * jobName: , - * jobData: - * }, - * ... - * ], - * [QUEUE_NAMES.STATE_RECONCILIATION]: [ - * { - * jobName: , - * jobData: - * }, - * ... - * ] - * } - * } - * @dev MUST be bound to a class containing an `enabledReconfigModes` property. - * See usage in index.js (in same directory) for example of how it's bound to StateMachineManager. - * - * @param {BullQueue} monitoringQueue the queue that handles state monitoring jobs - * @param {BullQueue} reconciliationQueue the queue that handles state reconciliation jobs - * @returns a function that: - * - takes a jobId (string) and result (string) of a job that successfully completed - * - parses the result (string) into JSON - * - bulk-enqueues all jobs under result[QUEUE_NAMES.STATE_MONITORING] into the state monitoring queue - * - bulk-enqueues all jobs under result[QUEUE_NAMES.STATE_RECONCILIATION] into the state reconciliation queue - */ -module.exports = function makeCompletedJobEnqueueOtherJobs( - monitoringQueue, - reconciliationQueue -) { - return async function (jobId, resultString) { - // Create a logger so that we can filter logs by the tag `jobId` = - const logger = createChildLogger(baseLogger, { jobId }) - - // Bull serializes the job result into redis, so we have to deserialize it into JSON - let jobsToEnqueue = {} - try { - jobsToEnqueue = JSON.parse(resultString)?.jobsToEnqueue - if (!jobsToEnqueue) { - logger.info( - `No jobs to enqueue after successful completion. 
Result: ${resultString}` - ) - return - } - } catch (e) { - logger.warn(`Failed to parse job result string: ${resultString}`) - return - } - - // Find all monitoring and reconciliation jobs - const monitoringJobs = jobsToEnqueue[QUEUE_NAMES.STATE_MONITORING] || [] - const reconciliationJobs = - jobsToEnqueue[QUEUE_NAMES.STATE_RECONCILIATION] || [] - logger.info( - `Attempting to enqueue ${monitoringJobs?.length} monitoring jobs and ${reconciliationJobs.length} reconciliation jobs in bulk` - ) - - // Enqueue all monitoring jobs in bulk after renaming properties to what Bull expects (name and data) - try { - const monitoringBulkAddResult = await monitoringQueue.addBulk( - monitoringJobs.map((job) => { - if (!job?.jobName || !job?.jobData) { - logger.error(`Job ${JSON.stringify(job)} is missing name or data!`) - } - return { name: job.jobName, data: job.jobData } - }) - ) - logger.info( - `Enqueued ${monitoringBulkAddResult.length} monitoring jobs in bulk after successful completion` - ) - } catch (e) { - logger.error( - `Failed to bulk-enqueue monitoring jobs after successful completion: ${e}` - ) - } - - // Enqueue all reconciliation jobs in bulk after renaming properties to what Bull expects (name and data) - try { - const reconciliationBulkAddResult = await reconciliationQueue.addBulk( - reconciliationJobs.map((job) => { - if (!job?.jobName || !job?.jobData) { - logger.error(`Job ${JSON.stringify(job)} is missing name or data!`) - } - // Inject enabledReconfigModesSet into update-replica-set jobs as an array. - // It gets `this` from being bound to ./index.js - if (job.jobName === JOB_NAMES.UPDATE_REPLICA_SET) { - if (this?.hasOwnProperty('enabledReconfigModesSet')) { - job.jobData = { - ...job.jobData, - enabledReconfigModes: Array.from(this.enabledReconfigModesSet) - } - } else { - logger.error( - 'Function was supposed to be bound to StateMachineManager to access enabledReconfigModesSet! Update replica set jobs will not be able to process!' - ) - } - } - return { name: job.jobName, data: job.jobData } - }) - ) - logger.info( - `Enqueued ${reconciliationBulkAddResult.length} reconciliation jobs in bulk after successful completion` - ) - } catch (e) { - logger.error( - `Failed to bulk-enqueue reconciliation jobs after successful completion: ${e}` - ) - } - } -} diff --git a/creator-node/src/services/stateMachineManager/makeOnCompleteCallback.js b/creator-node/src/services/stateMachineManager/makeOnCompleteCallback.js new file mode 100644 index 00000000000..c9c7cffad2b --- /dev/null +++ b/creator-node/src/services/stateMachineManager/makeOnCompleteCallback.js @@ -0,0 +1,186 @@ +const _ = require('lodash') + +const { logger: baseLogger, createChildLogger } = require('../../logging') +const { QUEUE_NAMES, JOB_NAMES } = require('./stateMachineConstants') + +/** + * Higher order function that creates a function that's used as a Bull Queue onComplete callback to take + * a job that successfully completed and read its output for new jobs that will be enqueued in bulk as well + * as metrics that will be recorded. + * The expected syntax that a job result must output in order to for this function to enqueue more + * jobs upon completion is: + * { + * jobsToEnqueue: { + * [QUEUE_NAMES.STATE_MONITORING]: [ + * { + * jobName: , + * jobData: + * }, + * ... + * ], + * [QUEUE_NAMES.STATE_RECONCILIATION]: [ + * { + * jobName: , + * jobData: + * }, + * ... + * ] + * } + * } + * @dev MUST be bound to a class containing an `enabledReconfigModes` property. 
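+ * A job result may additionally include a `metricsToRecord` array. As a sketch of the
+ * shape this callback expects (matching the fields it destructures and the output of
+ * makeHistogramToRecord in stateMachineUtils.js), each entry looks like:
+ *   { metricName: <string>, metricType: 'HISTOGRAM' | 'GAUGE', metricValue: <number>, metricLabel: <object mapping label name to label value> }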
+ * See usage in index.js (in same directory) for example of how it's bound to StateMachineManager. + * + * @param {BullQueue} monitoringQueue the queue that handles state monitoring jobs + * @param {BullQueue} reconciliationQueue the queue that handles state reconciliation jobs + * @param {Object} prometheusRegistry the registry of prometheus metrics + * @returns a function that: + * - takes a jobId (string) and result (string) of a job that successfully completed + * - parses the result (string) into JSON + * - bulk-enqueues all jobs under result[QUEUE_NAMES.STATE_MONITORING] into the state monitoring queue + * - bulk-enqueues all jobs under result[QUEUE_NAMES.STATE_RECONCILIATION] into the state reconciliation queue + */ +module.exports = function ( + monitoringQueue, + reconciliationQueue, + prometheusRegistry +) { + return async function (jobId, resultString) { + // Create a logger so that we can filter logs by the tag `jobId` = + const logger = createChildLogger(baseLogger, { jobId }) + + // update-replica-set jobs need enabledReconfigModes as an array. + // `this` comes from the function being bound via .bind() to ./index.js + if (!this?.hasOwnProperty('enabledReconfigModesSet')) { + logger.error( + 'Function was supposed to be bound to StateMachineManager to access enabledReconfigModesSet! Update replica set jobs will not be able to process!' + ) + return + } + const enabledReconfigModes = Array.from(this.enabledReconfigModesSet) + + // Bull serializes the job result into redis, so we have to deserialize it into JSON + let jobResult = {} + try { + jobResult = JSON.parse(resultString) || {} + } catch (e) { + logger.warn(`Failed to parse job result string: ${resultString}`) + return + } + + const { jobsToEnqueue, metricsToRecord } = jobResult + + if (jobsToEnqueue) { + // Enqueue monitoring jobs + const monitoringJobs = jobsToEnqueue[QUEUE_NAMES.STATE_MONITORING] || [] + await enqueueMonitoringJobs(monitoringJobs, monitoringQueue, logger) + + // Enqueue reconciliation jobs + const reconciliationJobs = + jobsToEnqueue[QUEUE_NAMES.STATE_RECONCILIATION] || [] + await enqueueReconciliationJobs( + reconciliationJobs, + reconciliationQueue, + enabledReconfigModes, + logger + ) + } else { + logger.info( + `No jobs to enqueue after successful completion. Result: ${resultString}` + ) + } + + // Record metrics + ;(metricsToRecord || []).forEach((metricInfo) => { + try { + const { metricName, metricType, metricValue, metricLabel } = metricInfo + const metric = prometheusRegistry.getMetric(metricName) + if (metricType === 'HISTOGRAM') { + _.isEmpty(metricLabel) + ? 
metric.observe(metricValue) + : metric.observe(metricLabel, metricValue) + } else if (metricType === 'GAUGE') { + metric.set(metricValue) + } else { + logger.error(`Unexpected metric type: ${metricType}`) + } + } catch (error) { + logger.error(`Error recording metric ${metricInfo}: ${error}`) + } + }) + } +} + +const enqueueMonitoringJobs = async (jobs, monitoringQueue, logger) => { + logger.info(`Attempting to enqueue ${jobs?.length} monitoring jobs in bulk`) + + // Sanitize and transform output into the job format that Bull expects + try { + const bulkAddResult = await monitoringQueue.addBulk( + sanitizeAndTransformMonitoringJobs(jobs) + ) + logger.info( + `Enqueued ${bulkAddResult.length} monitoring jobs in bulk after successful completion` + ) + } catch (e) { + logger.error( + `Failed to bulk-enqueue monitoring jobs after successful completion: ${e}` + ) + } +} + +// Rename properties from job output to properties that Bull expects (jobName+jobData => name+data) +const sanitizeAndTransformMonitoringJobs = (jobs, logger) => { + return jobs.map((job) => { + if (!job?.jobName || !job?.jobData) { + logger.error(`Job ${JSON.stringify(job)} is missing name or data!`) + } + return { name: job.jobName, data: job.jobData } + }) +} + +const enqueueReconciliationJobs = async ( + jobs, + reconciliationQueue, + enabledReconfigModes, + logger +) => { + logger.info( + `Attempting to enqueue ${jobs?.length} reconciliation jobs in bulk` + ) + + // Sanitize and transform output into the job format that Bull expects. Also inject extra job data + try { + const bulkAddResult = await reconciliationQueue.addBulk( + sanitizeAndTransformReconciliationJobs(jobs, enabledReconfigModes, logger) + ) + logger.info( + `Enqueued ${bulkAddResult.length} reconciliation jobs in bulk after successful completion` + ) + } catch (e) { + logger.error( + `Failed to bulk-enqueue reconciliation jobs after successful completion: ${e}` + ) + } +} + +// Rename properties from job output to properties that Bull expects (jobName+jobData => name+data) +// Also inject enabledReconfigModes into update-replica-set jobs +const sanitizeAndTransformReconciliationJobs = ( + jobs, + enabledReconfigModes, + logger +) => { + return jobs.map((job) => { + if (!job?.jobName || !job?.jobData) { + logger.error(`Job ${JSON.stringify(job)} is missing name or data!`) + } + // Inject enabledReconfigModesSet into update-replica-set jobs + if (job.jobName === JOB_NAMES.UPDATE_REPLICA_SET) { + job.jobData = { + ...job.jobData, + enabledReconfigModes + } + } + return { name: job.jobName, data: job.jobData } + }) +} diff --git a/creator-node/src/services/stateMachineManager/stateMachineUtils.js b/creator-node/src/services/stateMachineManager/stateMachineUtils.js index d1e61d34a88..fc8227af797 100644 --- a/creator-node/src/services/stateMachineManager/stateMachineUtils.js +++ b/creator-node/src/services/stateMachineManager/stateMachineUtils.js @@ -3,6 +3,11 @@ const CreatorNode = libs.CreatorNode const axios = require('axios') const retry = require('async-retry') +const { + MetricTypes, + MetricNames, + MetricLabels +} = require('../../services/prometheusMonitoring/prometheus.constants') const config = require('../../config') const { logger } = require('../../logging') const { generateTimestampAndSignature } = require('../../apiSigning') @@ -126,7 +131,56 @@ const retrieveClockValueForUserFromReplica = async (replica, wallet) => { return clockValue } +/** + * Returns an object that can be returned from any state machine job to record a histogram metric being 
observed. + * Example: to call histogram.observe('response_time', { code: '200' }, 1000), you would call this function with: + * makeHistogramToRecord('response_time', 1000, 'code', '200') + * @param {string} metricName the key of the name of the metric from prometheus.constants + * @param {number} metricValue the value to observe + * @param {string} [metricLabelName] the optional name of the label + * @param {string} [metricLabelValue] the optional value of the label + */ +const makeHistogramToRecord = ( + metricName, + metricValue, + metricLabelName = '', + metricLabelValue = '' +) => { + if (!Object.keys(MetricNames).includes(metricName)) { + throw new Error( + `Invalid metricName: ${metricName}. Options: ${JSON.stringify( + Object.keys(MetricNames) + )}` + ) + } + if (typeof metricValue !== 'number') { + throw new Error(`Invalid non-numerical metricValue: ${metricValue}`) + } + const useMetricLabel = metricLabelName && metricLabelValue + if ( + useMetricLabel && + !Object.keys(MetricLabels[metricName])?.includes(metricLabelName) + ) { + throw new Error(`Invalid metricLabelName: ${metricLabelName}`) + } + if ( + useMetricLabel && + !MetricLabels[metricName][metricLabelName]?.includes(metricLabelValue) + ) { + throw new Error(`Invalid metricLabelValue: ${metricLabelValue}`) + } + + const metric = { + metricName: MetricNames[metricName], + metricType: MetricTypes.HISTOGRAM, + metricValue, + metricLabel: useMetricLabel ? { [metricLabelName]: [metricLabelValue] } : {} + } + return metric +} + module.exports = { retrieveClockStatusesForUsersAcrossReplicaSet, - retrieveClockValueForUserFromReplica + retrieveClockValueForUserFromReplica, + makeHistogramToRecord } diff --git a/creator-node/src/services/stateMachineManager/stateReconciliation/issueSyncRequest.jobProcessor.js b/creator-node/src/services/stateMachineManager/stateReconciliation/issueSyncRequest.jobProcessor.js index 0438c16e7ae..715101d54b4 100644 --- a/creator-node/src/services/stateMachineManager/stateReconciliation/issueSyncRequest.jobProcessor.js +++ b/creator-node/src/services/stateMachineManager/stateReconciliation/issueSyncRequest.jobProcessor.js @@ -4,7 +4,10 @@ const _ = require('lodash') const config = require('../../../config') const models = require('../../../models') const Utils = require('../../../utils') -const { retrieveClockValueForUserFromReplica } = require('../stateMachineUtils') +const { + retrieveClockValueForUserFromReplica, + makeHistogramToRecord +} = require('../stateMachineUtils') const { getNewOrExistingSyncReq } = require('./stateReconciliationUtils') const SyncRequestDeDuplicator = require('./SyncRequestDeDuplicator') const SecondarySyncHealthTracker = require('./SecondarySyncHealthTracker') @@ -110,13 +113,25 @@ module.exports = async function ({ logger, syncType, syncRequestParameters }) { logger.error(`${logMsgString} || Error issuing sync request: ${e.message}`) } + const metricsToRecord = [] + // Wait until has sync has completed (within time threshold) - const additionalSyncIsRequired = await _additionalSyncIsRequired( - userWallet, - primaryClockValue, - secondaryEndpoint, - syncType, - logger + const startWaitingForCompletion = Date.now() + const { additionalSyncIsRequired, reasonForAdditionalSync } = + await _additionalSyncIsRequired( + userWallet, + primaryClockValue, + secondaryEndpoint, + syncType, + logger + ) + metricsToRecord.push( + makeHistogramToRecord( + 'ISSUE_SYNC_SECONDS_WAITING_FOR_COMPLETION_HISTOGRAM', + (Date.now() - startWaitingForCompletion) / 1000, // Metric is in seconds + 
'reason_for_additional_sync', + reasonForAdditionalSync + ) ) // Re-enqueue sync if required @@ -149,7 +164,8 @@ module.exports = async function ({ logger, syncType, syncRequestParameters }) { ? {} : { [QUEUE_NAMES.STATE_RECONCILIATION]: [additionalSyncReq] - } + }, + metricsToRecord } } @@ -206,7 +222,7 @@ const _getUserPrimaryClockValues = async (wallets) => { /** * Monitor an ongoing sync operation for a given secondaryUrl and user wallet - * Return boolean indicating if an additional sync is required + * Return boolean indicating if an additional sync is required and reason why (or 'none' if no additional sync is required) * Record SyncRequest outcomes to SecondarySyncHealthTracker */ const _additionalSyncIsRequired = async ( @@ -267,6 +283,7 @@ const _additionalSyncIsRequired = async ( * Also check whether additional sync is required */ let additionalSyncIsRequired + let reasonForAdditionalSync = 'none' if (secondaryCaughtUpToPrimary) { await SecondarySyncHealthTracker.recordSuccess( secondaryUrl, @@ -285,6 +302,7 @@ const _additionalSyncIsRequired = async ( syncType ) additionalSyncIsRequired = true + reasonForAdditionalSync = 'secondary_progressed_too_slow' logger.info( `${logMsgString} || Secondary successfully synced from clock ${initialSecondaryClock} to ${finalSecondaryClock} but hasn't caught up to Primary. Enqueuing additional syncRequest.` ) @@ -297,10 +315,11 @@ const _additionalSyncIsRequired = async ( syncType ) additionalSyncIsRequired = true + reasonForAdditionalSync = 'secondary_failed_to_progress' logger.error( `${logMsgString} || Secondary failed to progress from clock ${initialSecondaryClock}. Enqueuing additional syncRequest.` ) } - return additionalSyncIsRequired + return { additionalSyncIsRequired, reasonForAdditionalSync } } diff --git a/creator-node/test/issueSyncRequest.jobProcessor.test.js b/creator-node/test/issueSyncRequest.jobProcessor.test.js index a4b6431a39a..6effecd0c57 100644 --- a/creator-node/test/issueSyncRequest.jobProcessor.test.js +++ b/creator-node/test/issueSyncRequest.jobProcessor.test.js @@ -208,16 +208,26 @@ describe('test issueSyncRequest job processor', function () { nock(baseURL).post('/sync', data).reply(200) // Verify job outputs the correct results: no sync issued (nock will error if the wrong network req was made) - await expect( - issueSyncRequestJobProcessor({ - logger, - syncType, - syncRequestParameters - }) - ).to.eventually.be.fulfilled.and.deep.equal({ - error: {}, - jobsToEnqueue: {} + const result = await issueSyncRequestJobProcessor({ + logger, + syncType, + syncRequestParameters }) + expect(result).to.have.deep.property('error', {}) + expect(result).to.have.deep.property('jobsToEnqueue', {}) + expect(result.metricsToRecord).to.have.lengthOf(1) + expect(result.metricsToRecord[0]).to.have.deep.property( + 'metricName', + 'audius_cn_issue_sync_seconds_waiting_for_completion_histogram' + ) + expect(result.metricsToRecord[0]).to.have.deep.property('metricLabel', { + reason_for_additional_sync: 'none' + }) + expect(result.metricsToRecord[0]).to.have.deep.property( + 'metricType', + 'HISTOGRAM' + ) + expect(result.metricsToRecord[0].metricValue).to.be.a('number') expect(getNewOrExistingSyncReqStub).to.not.have.been.called }) @@ -245,16 +255,13 @@ describe('test issueSyncRequest job processor', function () { const expectedErrorMessage = `(${syncType}) User ${wallet} | Secondary: ${baseURL} || Secondary has already met SecondaryUserSyncDailyFailureCountThreshold (${failureThreshold}). 
Will not issue further syncRequests today.` // Verify job outputs the correct results: error and no sync issued (nock will error if a network req was made) - await expect( - issueSyncRequestJobProcessor({ - logger, - syncType, - syncRequestParameters - }) - ).to.eventually.be.fulfilled.and.deep.equal({ - error: { - message: expectedErrorMessage - } + const result = await issueSyncRequestJobProcessor({ + logger, + syncType, + syncRequestParameters + }) + expect(result).to.have.deep.property('error', { + message: expectedErrorMessage }) expect(logger.error).to.have.been.calledOnceWithExactly( expectedErrorMessage @@ -309,18 +316,28 @@ describe('test issueSyncRequest job processor', function () { nock(baseURL).post('/sync', data).reply(200) // Verify job outputs the correct results: an additional sync - await expect( - issueSyncRequestJobProcessor({ - logger, - syncType, - syncRequestParameters - }) - ).to.eventually.be.fulfilled.and.deep.equal({ - error: {}, - jobsToEnqueue: { - [QUEUE_NAMES.STATE_RECONCILIATION]: [expectedSyncReqToEnqueue] - } + const result = await issueSyncRequestJobProcessor({ + logger, + syncType, + syncRequestParameters }) + expect(result).to.have.deep.property('error', {}) + expect(result).to.have.deep.property('jobsToEnqueue', { + [QUEUE_NAMES.STATE_RECONCILIATION]: [expectedSyncReqToEnqueue] + }) + expect(result.metricsToRecord).to.have.lengthOf(1) + expect(result.metricsToRecord[0]).to.have.deep.property( + 'metricName', + 'audius_cn_issue_sync_seconds_waiting_for_completion_histogram' + ) + expect(result.metricsToRecord[0]).to.have.deep.property('metricLabel', { + reason_for_additional_sync: 'secondary_progressed_too_slow' + }) + expect(result.metricsToRecord[0]).to.have.deep.property( + 'metricType', + 'HISTOGRAM' + ) + expect(result.metricsToRecord[0].metricValue).to.be.a('number') expect(getNewOrExistingSyncReqStub).to.have.been.calledOnceWithExactly({ userWallet: wallet, secondaryEndpoint: baseURL, @@ -381,18 +398,28 @@ describe('test issueSyncRequest job processor', function () { nock(baseURL).post('/sync', data).reply(200) // Verify job outputs the correct results: an additional sync - await expect( - issueSyncRequestJobProcessor({ - logger, - syncType, - syncRequestParameters - }) - ).to.eventually.be.fulfilled.and.deep.equal({ - error: {}, - jobsToEnqueue: { - [QUEUE_NAMES.STATE_RECONCILIATION]: [expectedSyncReqToEnqueue] - } + const result = await issueSyncRequestJobProcessor({ + logger, + syncType, + syncRequestParameters + }) + expect(result).to.have.deep.property('error', {}) + expect(result).to.have.deep.property('jobsToEnqueue', { + [QUEUE_NAMES.STATE_RECONCILIATION]: [expectedSyncReqToEnqueue] }) + expect(result.metricsToRecord).to.have.lengthOf(1) + expect(result.metricsToRecord[0]).to.have.deep.property( + 'metricName', + 'audius_cn_issue_sync_seconds_waiting_for_completion_histogram' + ) + expect(result.metricsToRecord[0]).to.have.deep.property('metricLabel', { + reason_for_additional_sync: 'secondary_failed_to_progress' + }) + expect(result.metricsToRecord[0]).to.have.deep.property( + 'metricType', + 'HISTOGRAM' + ) + expect(result.metricsToRecord[0].metricValue).to.be.a('number') expect(getNewOrExistingSyncReqStub).to.have.been.calledOnceWithExactly({ userWallet: wallet, secondaryEndpoint: baseURL, From c1b891758ab3fd7f2618100615b1b93f1512ffb4 Mon Sep 17 00:00:00 2001 From: Theo Ilie Date: Fri, 24 Jun 2022 21:26:31 +0000 Subject: [PATCH 2/5] Clean up and truncate buckets --- .../prometheus.constants.js | 76 ++++++------------- 
.../prometheusMonitoring/prometheusUtils.js | 44 +++++++++++ .../makeOnCompleteCallback.js | 8 +- .../stateMachineManager/stateMachineUtils.js | 49 +++++------- .../issueSyncRequest.jobProcessor.js | 8 +- 5 files changed, 94 insertions(+), 91 deletions(-) create mode 100644 creator-node/src/services/prometheusMonitoring/prometheusUtils.js diff --git a/creator-node/src/services/prometheusMonitoring/prometheus.constants.js b/creator-node/src/services/prometheusMonitoring/prometheus.constants.js index dfb5f404274..9e859bf1783 100644 --- a/creator-node/src/services/prometheusMonitoring/prometheus.constants.js +++ b/creator-node/src/services/prometheusMonitoring/prometheus.constants.js @@ -1,6 +1,7 @@ const promClient = require('prom-client') const _ = require('lodash') const config = require('../../config') +const { exponentialBucketsRange } = require('./prometheusUtils') /** * For explanation of Metrics, and instructions on how to add a new metric, please see `prometheusMonitoring/README.md` @@ -9,25 +10,6 @@ const config = require('../../config') // We add a namespace prefix to differentiate internal metrics from those exported by different exporters from the same host const NamespacePrefix = 'audius_cn_' -const MetricLabels = Object.freeze({ - ISSUE_SYNC_SECONDS_WAITING_FOR_COMPLETION_HISTOGRAM: { - // The reason another sync is needed - reason_for_additional_sync: [ - 'secondary_progressed_too_slow', // The secondary sync went through, but its clock value didn't increase enough - 'secondary_failed_to_progress', // The secondary's clock value did not increase at all - 'none' // No additional sync is required -- the first sync was successful - ] - } -}) -const MetricLabelNames = Object.freeze( - Object.fromEntries( - Object.entries(MetricLabels).map(([metric, metricLabels]) => [ - metric, - Object.keys(metricLabels) - ]) - ) -) - /** * @notice Counter and Summary metric types are currently disabled, see README for details. */ @@ -42,41 +24,31 @@ let MetricNames = { SYNC_QUEUE_JOBS_TOTAL_GAUGE: 'sync_queue_jobs_total', ROUTE_POST_TRACKS_DURATION_SECONDS_HISTOGRAM: 'route_post_tracks_duration_seconds', - ISSUE_SYNC_SECONDS_WAITING_FOR_COMPLETION_HISTOGRAM: - 'issue_sync_seconds_waiting_for_completion_histogram' + ISSUE_SYNC_REQUEST_MONITORING_DURATION_SECONDS_HISTOGRAM: + 'issue_sync_request_monitoring_duration_seconds' } MetricNames = Object.freeze( _.mapValues(MetricNames, (metricName) => NamespacePrefix + metricName) ) -// Uses an implementation similar to the Golang client's ExponentialBucketsRange function: -// https://github.com/prometheus/client_golang/blob/v1.12.2/prometheus/histogram.go#L125 -// ExponentialBucketsRange creates 'count' buckets, where the lowest bucket is -// 'min' and the highest bucket is 'max'. The final +Inf bucket is not counted -// and not included in the returned array. The returned array is meant to be -// used for the buckets field of metricConfig. -// -// The function throws if 'count' is 0 or negative, if 'min' is 0 or negative. -const exponentialBucketsRange = (min, max, count) => { - if (count < 1) { - throw new Error('exponentialBucketsRange count needs a positive count') - } - if (min <= 0) { - throw new Error('ExponentialBucketsRange min needs to be greater than 0') - } - - // Formula for exponential buckets: max = min*growthFactor^(bucketCount-1) - - // We know max/min and highest bucket. 
Solve for growthFactor - const growthFactor = (max / min) ** (1 / (count - 1)) - - // Now that we know growthFactor, solve for each bucket - const buckets = [] - for (let i = 1; i <= count; i++) { - buckets.push(min * growthFactor ** (i - 1)) +const MetricLabels = Object.freeze({ + [MetricNames.ISSUE_SYNC_REQUEST_MONITORING_DURATION_SECONDS_HISTOGRAM]: { + // The reason another sync is needed + reason_for_additional_sync: [ + 'secondary_progressed_too_slow', // The secondary sync went through, but its clock value didn't increase enough + 'secondary_failed_to_progress', // The secondary's clock value did not increase at all + 'none' // No additional sync is required -- the first sync was successful + ] } - return buckets -} +}) +const MetricLabelNames = Object.freeze( + Object.fromEntries( + Object.entries(MetricLabels).map(([metric, metricLabels]) => [ + metric, + Object.keys(metricLabels) + ]) + ) +) const Metrics = Object.freeze({ [MetricNames.SYNC_QUEUE_JOBS_TOTAL_GAUGE]: { @@ -97,13 +69,15 @@ const Metrics = Object.freeze({ buckets: [0.1, 0.3, 0.5, 1, 3, 5, 10] // 0.1 to 10 seconds } }, - [MetricNames.ISSUE_SYNC_SECONDS_WAITING_FOR_COMPLETION_HISTOGRAM]: { + [MetricNames.ISSUE_SYNC_REQUEST_MONITORING_DURATION_SECONDS_HISTOGRAM]: { metricType: MetricTypes.HISTOGRAM, metricConfig: { - name: MetricNames.ISSUE_SYNC_SECONDS_WAITING_FOR_COMPLETION_HISTOGRAM, + name: MetricNames.ISSUE_SYNC_REQUEST_MONITORING_DURATION_SECONDS_HISTOGRAM, help: 'Seconds spent monitoring an outgoing sync request issued by this node to be completed (successfully or not)', labelNames: - MetricLabelNames.ISSUE_SYNC_SECONDS_WAITING_FOR_COMPLETION_HISTOGRAM, + MetricLabelNames[ + MetricNames.ISSUE_SYNC_REQUEST_MONITORING_DURATION_SECONDS_HISTOGRAM + ], // 5 buckets in the range of 1 second to max seconds before timing out a sync request buckets: exponentialBucketsRange( 1, diff --git a/creator-node/src/services/prometheusMonitoring/prometheusUtils.js b/creator-node/src/services/prometheusMonitoring/prometheusUtils.js new file mode 100644 index 00000000000..98df910060c --- /dev/null +++ b/creator-node/src/services/prometheusMonitoring/prometheusUtils.js @@ -0,0 +1,44 @@ +const _ = require('lodash') + +/** + * Creates 'count' buckets, where the lowest bucket is 'min' and the highest bucket is 'max'. + * The final +Inf bucket is not counted and not included in the returned array. + * The returned array is meant to be used for the buckets field of metricConfig. + * Throws if 'count' is 0 or negative, if 'min' is 0 or negative. 
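+ * As a rough worked example (with illustrative values min=1, max=45, count=5 and the
+ * default precision=0): growthFactor = (45/1)^(1/4) ≈ 2.59, so the returned buckets
+ * are approximately [1, 3, 7, 17, 45].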
+ * + * Uses an implementation similar to the Golang client's ExponentialBucketsRange function: + * https://github.com/prometheus/client_golang/blob/v1.12.2/prometheus/histogram.go#L125 + * + * See prom-client (JS) proposed implementation that was never completed: + * https://github.com/siimon/prom-client/issues/213 + * + * @param {number} min the lowest value for a bucket + * @param {number} max the highest value for a bucket + * @param {number} count the number of buckets to generate for values between min and max + * @param {number} [precision] the number of decimal points to round each bucket to + * @returns 'count' buckets (number[] of length 'count') for values between 'min' and 'max' + */ +const exponentialBucketsRange = (min, max, count, precision = 0) => { + if (count < 1) { + throw new Error('exponentialBucketsRange count needs a positive count') + } + if (min <= 0) { + throw new Error('ExponentialBucketsRange min needs to be greater than 0') + } + + // Formula for exponential buckets: max = min*growthFactor^(bucketCount-1) + + // We know max/min and highest bucket. Solve for growthFactor + const growthFactor = (max / min) ** (1 / (count - 1)) + + // Now that we know growthFactor, solve for each bucket + const buckets = [] + for (let i = 1; i <= count; i++) { + const bucket = min * growthFactor ** (i - 1) + buckets.push(_.round(bucket, precision)) + } + return buckets +} + +module.exports = {} +module.exports.exponentialBucketsRange = exponentialBucketsRange diff --git a/creator-node/src/services/stateMachineManager/makeOnCompleteCallback.js b/creator-node/src/services/stateMachineManager/makeOnCompleteCallback.js index c9c7cffad2b..6bb9507dd0b 100644 --- a/creator-node/src/services/stateMachineManager/makeOnCompleteCallback.js +++ b/creator-node/src/services/stateMachineManager/makeOnCompleteCallback.js @@ -1,5 +1,3 @@ -const _ = require('lodash') - const { logger: baseLogger, createChildLogger } = require('../../logging') const { QUEUE_NAMES, JOB_NAMES } = require('./stateMachineConstants') @@ -92,12 +90,10 @@ module.exports = function ( // Record metrics ;(metricsToRecord || []).forEach((metricInfo) => { try { - const { metricName, metricType, metricValue, metricLabel } = metricInfo + const { metricName, metricType, metricValue, metricLabels } = metricInfo const metric = prometheusRegistry.getMetric(metricName) if (metricType === 'HISTOGRAM') { - _.isEmpty(metricLabel) - ? metric.observe(metricValue) - : metric.observe(metricLabel, metricValue) + metric.observe(metricLabels, metricValue) } else if (metricType === 'GAUGE') { metric.set(metricValue) } else { diff --git a/creator-node/src/services/stateMachineManager/stateMachineUtils.js b/creator-node/src/services/stateMachineManager/stateMachineUtils.js index fc8227af797..803779ab07d 100644 --- a/creator-node/src/services/stateMachineManager/stateMachineUtils.js +++ b/creator-node/src/services/stateMachineManager/stateMachineUtils.js @@ -134,47 +134,34 @@ const retrieveClockValueForUserFromReplica = async (replica, wallet) => { /** * Returns an object that can be returned from any state machine job to record a histogram metric being observed. 
* Example: to call histogram.observe('response_time', { code: '200' }, 1000), you would call this function with: - * makeHistogramToRecord('response_time', 1000, 'code', '200') - * @param {string} metricName the key of the name of the metric from prometheus.constants + * makeHistogramToRecord('response_time', 1000, { code: '200' }) + * @param {string} metricName the name of the metric from prometheus.constants * @param {number} metricValue the value to observe - * @param {string} [metricLabelName] the optional name of the label - * @param {string} [metricLabelValue] the optional value of the label + * @param {string} [metricLabels] the optional mapping of metric label name => metric label value */ -const makeHistogramToRecord = ( - metricName, - metricValue, - metricLabelName = '', - metricLabelValue = '' -) => { - if (!Object.keys(MetricNames).includes(metricName)) { - throw new Error( - `Invalid metricName: ${metricName}. Options: ${JSON.stringify( - Object.keys(MetricNames) - )}` - ) +const makeHistogramToRecord = (metricName, metricValue, metricLabels = {}) => { + if (!Object.values(MetricNames).includes(metricName)) { + throw new Error(`Invalid metricName: ${metricName}`) } if (typeof metricValue !== 'number') { throw new Error(`Invalid non-numerical metricValue: ${metricValue}`) } - const useMetricLabel = metricLabelName && metricLabelValue - if ( - useMetricLabel && - !Object.keys(MetricLabels[metricName])?.includes(metricLabelName) - ) { - throw new Error(`Invalid metricLabelName: ${metricLabelName}`) - } - if ( - useMetricLabel && - !MetricLabels[metricName][metricLabelName]?.includes(metricLabelValue) - ) { - throw new Error(`Invalid metricLabelValue: ${metricLabelValue}`) + const labelNames = Object.keys(MetricLabels[metricName]) + for (const [labelName, labelValue] of Object.entries(metricLabels)) { + if (!labelNames?.includes(labelName)) { + throw new Error(`Metric label has invliad name: ${labelName}`) + } + const labelValues = MetricLabels[metricName][labelName] + if (!labelValues?.includes(labelValue)) { + throw new Error(`Metric label has invalid value: ${labelValue}`) + } } const metric = { - metricName: MetricNames[metricName], - metricType: MetricTypes.HISTOGRAM, + metricName, + metricType: 'HISTOGRAM', metricValue, - metricLabel: useMetricLabel ? 
{ [metricLabelName]: [metricLabelValue] } : {} + metricLabels } return metric } diff --git a/creator-node/src/services/stateMachineManager/stateReconciliation/issueSyncRequest.jobProcessor.js b/creator-node/src/services/stateMachineManager/stateReconciliation/issueSyncRequest.jobProcessor.js index 715101d54b4..8f764e8152e 100644 --- a/creator-node/src/services/stateMachineManager/stateReconciliation/issueSyncRequest.jobProcessor.js +++ b/creator-node/src/services/stateMachineManager/stateReconciliation/issueSyncRequest.jobProcessor.js @@ -4,6 +4,9 @@ const _ = require('lodash') const config = require('../../../config') const models = require('../../../models') const Utils = require('../../../utils') +const { + MetricNames +} = require('../../prometheusMonitoring/prometheus.constants') const { retrieveClockValueForUserFromReplica, makeHistogramToRecord @@ -127,10 +130,9 @@ module.exports = async function ({ logger, syncType, syncRequestParameters }) { ) metricsToRecord.push( makeHistogramToRecord( - 'ISSUE_SYNC_SECONDS_WAITING_FOR_COMPLETION_HISTOGRAM', + MetricNames.ISSUE_SYNC_REQUEST_MONITORING_DURATION_SECONDS_HISTOGRAM, (Date.now() - startWaitingForCompletion) / 1000, // Metric is in seconds - 'reason_for_additional_sync', - reasonForAdditionalSync + { reason_for_additional_sync: reasonForAdditionalSync } ) ) From 4287af59140288ebc8fe8be6b493a279a26ce4f0 Mon Sep 17 00:00:00 2001 From: Theo Ilie Date: Fri, 24 Jun 2022 21:56:05 +0000 Subject: [PATCH 3/5] Update test to reflect var name change --- creator-node/test/issueSyncRequest.jobProcessor.test.js | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/creator-node/test/issueSyncRequest.jobProcessor.test.js b/creator-node/test/issueSyncRequest.jobProcessor.test.js index 6effecd0c57..db4a5ca6103 100644 --- a/creator-node/test/issueSyncRequest.jobProcessor.test.js +++ b/creator-node/test/issueSyncRequest.jobProcessor.test.js @@ -218,7 +218,7 @@ describe('test issueSyncRequest job processor', function () { expect(result.metricsToRecord).to.have.lengthOf(1) expect(result.metricsToRecord[0]).to.have.deep.property( 'metricName', - 'audius_cn_issue_sync_seconds_waiting_for_completion_histogram' + 'issue_sync_request_monitoring_duration_seconds' ) expect(result.metricsToRecord[0]).to.have.deep.property('metricLabel', { reason_for_additional_sync: 'none' @@ -328,7 +328,7 @@ describe('test issueSyncRequest job processor', function () { expect(result.metricsToRecord).to.have.lengthOf(1) expect(result.metricsToRecord[0]).to.have.deep.property( 'metricName', - 'audius_cn_issue_sync_seconds_waiting_for_completion_histogram' + 'issue_sync_request_monitoring_duration_seconds' ) expect(result.metricsToRecord[0]).to.have.deep.property('metricLabel', { reason_for_additional_sync: 'secondary_progressed_too_slow' @@ -410,7 +410,7 @@ describe('test issueSyncRequest job processor', function () { expect(result.metricsToRecord).to.have.lengthOf(1) expect(result.metricsToRecord[0]).to.have.deep.property( 'metricName', - 'audius_cn_issue_sync_seconds_waiting_for_completion_histogram' + 'issue_sync_request_monitoring_duration_seconds' ) expect(result.metricsToRecord[0]).to.have.deep.property('metricLabel', { reason_for_additional_sync: 'secondary_failed_to_progress' From 9367bd585c6f7a594c4d54cdebf06f577968602e Mon Sep 17 00:00:00 2001 From: Theo Ilie Date: Fri, 24 Jun 2022 22:15:54 +0000 Subject: [PATCH 4/5] Oops forgot metric namespace in test --- creator-node/test/issueSyncRequest.jobProcessor.test.js | 6 +++--- 1 file changed, 3 
insertions(+), 3 deletions(-) diff --git a/creator-node/test/issueSyncRequest.jobProcessor.test.js b/creator-node/test/issueSyncRequest.jobProcessor.test.js index db4a5ca6103..e0f2925293a 100644 --- a/creator-node/test/issueSyncRequest.jobProcessor.test.js +++ b/creator-node/test/issueSyncRequest.jobProcessor.test.js @@ -218,7 +218,7 @@ describe('test issueSyncRequest job processor', function () { expect(result.metricsToRecord).to.have.lengthOf(1) expect(result.metricsToRecord[0]).to.have.deep.property( 'metricName', - 'issue_sync_request_monitoring_duration_seconds' + 'audius_cn_issue_sync_request_monitoring_duration_seconds' ) expect(result.metricsToRecord[0]).to.have.deep.property('metricLabel', { reason_for_additional_sync: 'none' @@ -328,7 +328,7 @@ describe('test issueSyncRequest job processor', function () { expect(result.metricsToRecord).to.have.lengthOf(1) expect(result.metricsToRecord[0]).to.have.deep.property( 'metricName', - 'issue_sync_request_monitoring_duration_seconds' + 'audius_cn_issue_sync_request_monitoring_duration_seconds' ) expect(result.metricsToRecord[0]).to.have.deep.property('metricLabel', { reason_for_additional_sync: 'secondary_progressed_too_slow' @@ -410,7 +410,7 @@ describe('test issueSyncRequest job processor', function () { expect(result.metricsToRecord).to.have.lengthOf(1) expect(result.metricsToRecord[0]).to.have.deep.property( 'metricName', - 'issue_sync_request_monitoring_duration_seconds' + 'audius_cn_issue_sync_request_monitoring_duration_seconds' ) expect(result.metricsToRecord[0]).to.have.deep.property('metricLabel', { reason_for_additional_sync: 'secondary_failed_to_progress' From 045b37d4987a67ef8150e1b5bfe2df340e8d3041 Mon Sep 17 00:00:00 2001 From: Theo Ilie Date: Fri, 24 Jun 2022 22:54:34 +0000 Subject: [PATCH 5/5] Fix test var name for real this time --- creator-node/test/issueSyncRequest.jobProcessor.test.js | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/creator-node/test/issueSyncRequest.jobProcessor.test.js b/creator-node/test/issueSyncRequest.jobProcessor.test.js index e0f2925293a..980935245c5 100644 --- a/creator-node/test/issueSyncRequest.jobProcessor.test.js +++ b/creator-node/test/issueSyncRequest.jobProcessor.test.js @@ -220,7 +220,7 @@ describe('test issueSyncRequest job processor', function () { 'metricName', 'audius_cn_issue_sync_request_monitoring_duration_seconds' ) - expect(result.metricsToRecord[0]).to.have.deep.property('metricLabel', { + expect(result.metricsToRecord[0]).to.have.deep.property('metricLabels', { reason_for_additional_sync: 'none' }) expect(result.metricsToRecord[0]).to.have.deep.property( @@ -330,7 +330,7 @@ describe('test issueSyncRequest job processor', function () { 'metricName', 'audius_cn_issue_sync_request_monitoring_duration_seconds' ) - expect(result.metricsToRecord[0]).to.have.deep.property('metricLabel', { + expect(result.metricsToRecord[0]).to.have.deep.property('metricLabels', { reason_for_additional_sync: 'secondary_progressed_too_slow' }) expect(result.metricsToRecord[0]).to.have.deep.property( @@ -412,7 +412,7 @@ describe('test issueSyncRequest job processor', function () { 'metricName', 'audius_cn_issue_sync_request_monitoring_duration_seconds' ) - expect(result.metricsToRecord[0]).to.have.deep.property('metricLabel', { + expect(result.metricsToRecord[0]).to.have.deep.property('metricLabels', { reason_for_additional_sync: 'secondary_failed_to_progress' }) expect(result.metricsToRecord[0]).to.have.deep.property(