AudiusProject · jonaylor89 · Jul 22, 2022 · Jul 21, 2022 · Jul 21, 2022 · Jul 21, 2022
diff --git a/discovery-provider/plugins/network-monitoring/src/metrics/index.ts b/discovery-provider/plugins/network-monitoring/src/metrics/index.ts
@@ -1,102 +1,121 @@
-import axios from "axios"
+import axios from "axios";
 import {
-    allUserCountGauge,
-    fullySyncedUsersCountGauge,
-    gateway,
-    generatingMetricsDurationGauge,
-    nullPrimaryUsersCountGauge,
-    partiallySyncedUsersCountGauge,
-    primaryUserCountGauge,
-    unsyncedUsersCountGauge,
-    usersWithAllFoundationNodeReplicaSetGauge,
-} from "../prometheus"
-import { getEnv } from "../utils"
+  allUserCountGauge,
+  fullySyncedUsersCountGauge,
+  gateway,
+  generatingMetricsDurationGauge,
+  nullPrimaryUsersCountGauge,
+  partiallySyncedUsersCountGauge,
+  primaryUserCountGauge,
+  unsyncedUsersCountGauge,
+  userCountGauge,
+  usersWithAllFoundationNodeReplicaSetGauge,
+} from "../prometheus";
+import { getEnv } from "../utils";
 import {
-    getPrimaryUserCount,
-    getAllUserCount,
-    getFullySyncedUsersCount,
-    getPartiallySyncedUsersCount,
-    getUnsyncedUsersCount,
-    getUsersWithNullPrimaryClock,
-    getUsersWithEntireReplicaSetInSpidSetCount,
-} from "./queries"
+  getPrimaryUserCount,
+  getAllUserCount,
+  getFullySyncedUsersCount,
+  getPartiallySyncedUsersCount,
+  getUnsyncedUsersCount,
+  getUsersWithNullPrimaryClock,
+  getUsersWithEntireReplicaSetInSpidSetCount,
+  getUserCount,
+} from "./queries";
 
 export const generateMetrics = async (run_id: number) => {
+  const { foundationNodes } = getEnv(); // passed to the replica-set query further down
 
-    const { foundationNodes } = getEnv()
+  console.log(`[${run_id}] generating metrics`);
 
-    console.log(`[${run_id}] generating metrics`)
+  const endTimer = generatingMetricsDurationGauge.startTimer();
 
-    const endTimer = generatingMetricsDurationGauge.startTimer()
+  const userCount = await getUserCount(run_id); // denominator for the Slack percentages below
 
-    const allUserCount = await getAllUserCount(run_id)
+  const allUserCount = await getAllUserCount(run_id);
 
-    const primaryUserCount = await getPrimaryUserCount(run_id)
+  const primaryUserCount = await getPrimaryUserCount(run_id);
 
-    const fullySyncedUsersCount = await getFullySyncedUsersCount(run_id)
+  const fullySyncedUsersCount = await getFullySyncedUsersCount(run_id);
 
-    const partiallySyncedUserCount = await getPartiallySyncedUsersCount(run_id)
+  const partiallySyncedUserCount = await getPartiallySyncedUsersCount(run_id);
 
-    const unsyncedUsersCount = await getUnsyncedUsersCount(run_id)
+  const unsyncedUsersCount = await getUnsyncedUsersCount(run_id);
 
-    const usersWithNullPrimaryClock = await getUsersWithNullPrimaryClock(run_id)
+  const usersWithNullPrimaryClock = await getUsersWithNullPrimaryClock(run_id);
 
-    const usersWithAllFoundationNodeReplicaSetCount = await getUsersWithEntireReplicaSetInSpidSetCount(run_id, foundationNodes)
+  const usersWithAllFoundationNodeReplicaSetCount =
+    await getUsersWithEntireReplicaSetInSpidSetCount(run_id, foundationNodes);
 
-    allUserCount.forEach(({ endpoint, count }) => {
-        allUserCountGauge.set({ endpoint, run_id }, count)
-    })
-    primaryUserCount.forEach(({ endpoint, count }) => {
-        primaryUserCountGauge.set({ endpoint, run_id }, count)
-    })
+  allUserCount.forEach(({ endpoint, count }) => {
+    allUserCountGauge.set({ endpoint, run_id }, count);
+  });
+  primaryUserCount.forEach(({ endpoint, count }) => {
+    primaryUserCountGauge.set({ endpoint, run_id }, count);
+  });
 
-    fullySyncedUsersCountGauge.set({ run_id }, fullySyncedUsersCount)
-    partiallySyncedUsersCountGauge.set({ run_id }, partiallySyncedUserCount)
-    unsyncedUsersCountGauge.set({ run_id }, unsyncedUsersCount)
-    nullPrimaryUsersCountGauge.set({ run_id }, usersWithNullPrimaryClock)
-    usersWithAllFoundationNodeReplicaSetGauge.set({ run_id }, usersWithAllFoundationNodeReplicaSetCount)
+  userCountGauge.set({ run_id }, userCount);
+  fullySyncedUsersCountGauge.set({ run_id }, fullySyncedUsersCount);
+  partiallySyncedUsersCountGauge.set({ run_id }, partiallySyncedUserCount);
+  unsyncedUsersCountGauge.set({ run_id }, unsyncedUsersCount);
+  nullPrimaryUsersCountGauge.set({ run_id }, usersWithNullPrimaryClock);
+  usersWithAllFoundationNodeReplicaSetGauge.set(
+    { run_id },
+    usersWithAllFoundationNodeReplicaSetCount
+  );
 
-    // Record duration for generating metrics and export to prometheus
-    endTimer({ run_id: run_id })
+  // Stop the duration timer for this run; the push to the prometheus gateway happens further down
+  endTimer({ run_id: run_id });
 
+  if (userCount > 0) { // no report when there are no users (also avoids dividing by zero)
     await publishSlackReport({
-        fullySyncedUsersCount: fullySyncedUsersCount,
-        partiallySyncedUsersCount: partiallySyncedUserCount,
-        unsyncedUsersCount: unsyncedUsersCount,
-        usersWithNullPrimaryClock: usersWithNullPrimaryClock
-    })
-
-    try {
-        // Finish by publishing metrics to prometheus push gateway
-        console.log(`[${run_id}] pushing metrics to gateway`);
-        await gateway.pushAdd({ jobName: 'network-monitoring' })
-    } catch (e) {
-        console.log(`[generateMetrics] error pushing metrics to pushgateway - ${(e as Error).message}`)
-    }
-
-
-    console.log(`[${run_id}] finish generating metrics`);
-}
+      fullySyncedUsersCount:
+        ((fullySyncedUsersCount / userCount) * 100).toFixed(2) + "%",
+      partiallySyncedUsersCount:
+        ((partiallySyncedUserCount / userCount) * 100).toFixed(2) + "%",
+      unsyncedUsersCount:
+        ((unsyncedUsersCount / userCount) * 100).toFixed(2) + "%",
+      usersWithNullPrimaryClock:
+        ((usersWithNullPrimaryClock / userCount) * 100).toFixed(2) + "%",
+      usersWithAllFoundationNodeReplicaSetCount:
+        ((usersWithAllFoundationNodeReplicaSetCount / userCount) * 100).toFixed(
+          2
+        ) + "%",
+    });
+  }
+
+  try {
+    // Finish by publishing metrics to prometheus push gateway
+    console.log(`[${run_id}] pushing metrics to gateway`);
+    await gateway.pushAdd({ jobName: "network-monitoring" });
+  } catch (e) { // a failed push is only logged; the function still runs to completion
+    console.log(
+      `[generateMetrics] error pushing metrics to pushgateway - ${
+        (e as Error).message
+      }`
+    );
+  }
+
+  console.log(`[${run_id}] finish generating metrics`);
+};
 
 const publishSlackReport = async (metrics: Object) => {
-
-    const { slackUrl } = getEnv()
-
-    if (slackUrl === '') {
-        return
-    }
-
-    let message = `\`\`\`${JSON.stringify(metrics, null, 2)}\`\`\`` 
-    console.log(message)
-
-    try {
-        await axios.post(
-            slackUrl,
-            {
-                text: message,
-            }, 
-        )
-    } catch (e) {
-        console.log(`Error posting to slack in slack reporter ${(e as Error).toString()}`)
-    }
-}
+  const { slackUrl } = getEnv();
+
+  // Pretty-print the metrics inside a code fence; this text is both logged and posted.
+  const fence = "```";
+  const report = `${fence}${JSON.stringify(metrics, null, 2)}${fence}`;
+  console.log(report);
+
+  // An empty Slack URL means the report is only logged.
+  if (slackUrl !== "") {
+    try {
+      await axios.post(slackUrl, { text: report });
+    } catch (err) {
+      // Posting failures are logged rather than rethrown.
+      console.log(
+        `Error posting to slack in slack reporter ${(err as Error).toString()}`
+      );
+    }
+  }
+};
diff --git a/discovery-provider/plugins/network-monitoring/src/metrics/queries.ts b/discovery-provider/plugins/network-monitoring/src/metrics/queries.ts
@@ -2,6 +2,34 @@
 import { QueryTypes } from "sequelize"
 import { sequelizeConn } from "../db"
 
+/*
+ * Metrics from the discovery DB
+ *
+ * These metrics are primarily used to make prometheus
+ * and grafana more readable/understandable
+ */ 
+
+// Number of users recorded for the given run (row count of network_monitoring_users)
+export const getUserCount = async (run_id: number): Promise<number> => {
+    const rows: unknown[] = await sequelizeConn.query(`
+    SELECT COUNT(*) as user_count
+    FROM network_monitoring_users
+    WHERE run_id = :run_id
+    `, {
+        replacements: { run_id },
+        type: QueryTypes.SELECT,
+    })
+
+    // The first row is read as { user_count: string }; a missing row counts as '0'
+    const firstRow = (rows as { user_count: string }[])[0] || { user_count: '0' }
+
+    return parseInt(firstRow.user_count)
+}
+
+/* 
+ * Core metrics
+ */
+
 export const getCidsReplicatedAtLeastOnce = async (run_id: number): Promise<{ content_node_spid: string, cid_count: number }[]> => {
 
     const cidsListResp = await sequelizeConn.query(`
@@ -59,6 +87,8 @@ export const getPrimaryUserCount = async (run_id: number): Promise<{ endpoint: s
     return primaryCount
 }
 
+// Count of users, per content node endpoint, that have that node in their replica set.
+// Unlike `getUserCount()`, which returns a single total number of users, this is broken down by endpoint.
 export const getAllUserCount = async (run_id: number): Promise<{ endpoint: string, count: number }[]> => {
     console.log(`[${run_id}] metric: all user count`);
     const userListResp: unknown[] = await sequelizeConn.query(`

diff --git a/discovery-provider/plugins/network-monitoring/src/prometheus.ts b/discovery-provider/plugins/network-monitoring/src/prometheus.ts
@@ -6,6 +6,12 @@ const { pushGatewayUrl } = getEnv()
 
 export const gateway = new client.Pushgateway(pushGatewayUrl)
 
+export const userCountGauge = new client.Gauge({
+    name: 'audius_nm_user_count',
+    help: 'the number of users on audius',
+    labelNames: ['run_id'], // samples are tagged with the monitoring run they belong to
+})
+
 export const allUserCountGauge = new client.Gauge({
     name: 'audius_nm_all_user_count',
     help: 'the count of users with this content node in their replica set',