Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[CON-265] Add percentage based metrics to network monitoring #3525

Merged
merged 3 commits into from
Jul 22, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
181 changes: 100 additions & 81 deletions discovery-provider/plugins/network-monitoring/src/metrics/index.ts
Original file line number Diff line number Diff line change
@@ -1,102 +1,121 @@
import axios from "axios"
import axios from "axios";
import {
allUserCountGauge,
fullySyncedUsersCountGauge,
gateway,
generatingMetricsDurationGauge,
nullPrimaryUsersCountGauge,
partiallySyncedUsersCountGauge,
primaryUserCountGauge,
unsyncedUsersCountGauge,
usersWithAllFoundationNodeReplicaSetGauge,
} from "../prometheus"
import { getEnv } from "../utils"
allUserCountGauge,
fullySyncedUsersCountGauge,
gateway,
generatingMetricsDurationGauge,
nullPrimaryUsersCountGauge,
partiallySyncedUsersCountGauge,
primaryUserCountGauge,
unsyncedUsersCountGauge,
userCountGauge,
usersWithAllFoundationNodeReplicaSetGauge,
} from "../prometheus";
import { getEnv } from "../utils";
import {
getPrimaryUserCount,
getAllUserCount,
getFullySyncedUsersCount,
getPartiallySyncedUsersCount,
getUnsyncedUsersCount,
getUsersWithNullPrimaryClock,
getUsersWithEntireReplicaSetInSpidSetCount,
} from "./queries"
getPrimaryUserCount,
getAllUserCount,
getFullySyncedUsersCount,
getPartiallySyncedUsersCount,
getUnsyncedUsersCount,
getUsersWithNullPrimaryClock,
getUsersWithEntireReplicaSetInSpidSetCount,
getUserCount,
} from "./queries";

/**
 * Generates and publishes the network-monitoring metrics for a single run.
 *
 * Steps visible in the body: read `foundationNodes` from `getEnv()`, start the
 * `generatingMetricsDurationGauge` timer, await each count query for `run_id`,
 * set the corresponding gauges (per-endpoint for the all/primary user counts,
 * per-run for the rest), stop the timer, post a Slack report, and finally call
 * `gateway.pushAdd` under the job name "network-monitoring". A push failure is
 * logged and not rethrown.
 *
 * NOTE(review): this listing is a rendered diff, so most statements appear
 * twice (pre-change without semicolons, post-change with them). In the
 * post-change lines the Slack report is sent only when `userCount > 0` and
 * carries each count as a percentage of `userCount` with two decimals.
 *
 * @param run_id - Identifier of the monitoring run; passed to every query and
 *   used as a gauge label.
 */
export const generateMetrics = async (run_id: number) => {
const { foundationNodes } = getEnv();

const { foundationNodes } = getEnv()
console.log(`[${run_id}] generating metrics`);

console.log(`[${run_id}] generating metrics`)
const endTimer = generatingMetricsDurationGauge.startTimer();

const endTimer = generatingMetricsDurationGauge.startTimer()
const userCount = await getUserCount(run_id);

const allUserCount = await getAllUserCount(run_id)
const allUserCount = await getAllUserCount(run_id);

const primaryUserCount = await getPrimaryUserCount(run_id)
const primaryUserCount = await getPrimaryUserCount(run_id);

const fullySyncedUsersCount = await getFullySyncedUsersCount(run_id)
const fullySyncedUsersCount = await getFullySyncedUsersCount(run_id);

const partiallySyncedUserCount = await getPartiallySyncedUsersCount(run_id)
const partiallySyncedUserCount = await getPartiallySyncedUsersCount(run_id);

const unsyncedUsersCount = await getUnsyncedUsersCount(run_id)
const unsyncedUsersCount = await getUnsyncedUsersCount(run_id);

const usersWithNullPrimaryClock = await getUsersWithNullPrimaryClock(run_id)
const usersWithNullPrimaryClock = await getUsersWithNullPrimaryClock(run_id);

const usersWithAllFoundationNodeReplicaSetCount = await getUsersWithEntireReplicaSetInSpidSetCount(run_id, foundationNodes)
const usersWithAllFoundationNodeReplicaSetCount =
await getUsersWithEntireReplicaSetInSpidSetCount(run_id, foundationNodes);

allUserCount.forEach(({ endpoint, count }) => {
allUserCountGauge.set({ endpoint, run_id }, count)
})
primaryUserCount.forEach(({ endpoint, count }) => {
primaryUserCountGauge.set({ endpoint, run_id }, count)
})
allUserCount.forEach(({ endpoint, count }) => {
allUserCountGauge.set({ endpoint, run_id }, count);
});
primaryUserCount.forEach(({ endpoint, count }) => {
primaryUserCountGauge.set({ endpoint, run_id }, count);
});

fullySyncedUsersCountGauge.set({ run_id }, fullySyncedUsersCount)
partiallySyncedUsersCountGauge.set({ run_id }, partiallySyncedUserCount)
unsyncedUsersCountGauge.set({ run_id }, unsyncedUsersCount)
nullPrimaryUsersCountGauge.set({ run_id }, usersWithNullPrimaryClock)
usersWithAllFoundationNodeReplicaSetGauge.set({ run_id }, usersWithAllFoundationNodeReplicaSetCount)
userCountGauge.set({ run_id }, userCount);
fullySyncedUsersCountGauge.set({ run_id }, fullySyncedUsersCount);
partiallySyncedUsersCountGauge.set({ run_id }, partiallySyncedUserCount);
unsyncedUsersCountGauge.set({ run_id }, unsyncedUsersCount);
nullPrimaryUsersCountGauge.set({ run_id }, usersWithNullPrimaryClock);
usersWithAllFoundationNodeReplicaSetGauge.set(
{ run_id },
usersWithAllFoundationNodeReplicaSetCount
);

// Record duration for generating metrics and export to prometheus
endTimer({ run_id: run_id })
// Record duration for generating metrics and export to prometheus
endTimer({ run_id: run_id });

if (userCount > 0) {
await publishSlackReport({
fullySyncedUsersCount: fullySyncedUsersCount,
partiallySyncedUsersCount: partiallySyncedUserCount,
unsyncedUsersCount: unsyncedUsersCount,
usersWithNullPrimaryClock: usersWithNullPrimaryClock
})

try {
// Finish by publishing metrics to prometheus push gateway
console.log(`[${run_id}] pushing metrics to gateway`);
await gateway.pushAdd({ jobName: 'network-monitoring' })
} catch (e) {
console.log(`[generateMetrics] error pushing metrics to pushgateway - ${(e as Error).message}`)
}


console.log(`[${run_id}] finish generating metrics`);
}
fullySyncedUsersCount:
((fullySyncedUsersCount / userCount) * 100).toFixed(2) + "%",
jonaylor89 marked this conversation as resolved.
Show resolved Hide resolved
partiallySyncedUsersCount:
((partiallySyncedUserCount / userCount) * 100).toFixed(2) + "%",
unsyncedUsersCount:
((unsyncedUsersCount / userCount) * 100).toFixed(2) + "%",
usersWithNullPrimaryClock:
((usersWithNullPrimaryClock / userCount) * 100).toFixed(2) + "%",
usersWithAllFoundationNodeReplicaSetCount:
((usersWithAllFoundationNodeReplicaSetCount / userCount) * 100).toFixed(
2
) + "%",
});
}

try {
// Finish by publishing metrics to prometheus push gateway
console.log(`[${run_id}] pushing metrics to gateway`);
await gateway.pushAdd({ jobName: "network-monitoring" });
} catch (e) {
console.log(
`[generateMetrics] error pushing metrics to pushgateway - ${
(e as Error).message
}`
);
}

console.log(`[${run_id}] finish generating metrics`);
};

/**
 * Posts a metrics summary to the Slack URL returned by `getEnv().slackUrl`.
 *
 * The metrics object is pretty-printed as JSON (2-space indent), wrapped in a
 * triple-backtick fence, and sent as the `text` field of an `axios.post` body.
 * Nothing is posted when `slackUrl` is the empty string. A failed post is
 * logged and not rethrown.
 *
 * NOTE(review): rendered diff — two variants of the body are interleaved
 * below. The first returns before building the message when `slackUrl` is
 * empty; the second logs the message to the console first and then returns.
 *
 * @param metrics - Object serialized with `JSON.stringify` into the message.
 */
const publishSlackReport = async (metrics: Object) => {

const { slackUrl } = getEnv()

if (slackUrl === '') {
return
}

let message = `\`\`\`${JSON.stringify(metrics, null, 2)}\`\`\``
console.log(message)

try {
await axios.post(
slackUrl,
{
text: message,
},
)
} catch (e) {
console.log(`Error posting to slack in slack reporter ${(e as Error).toString()}`)
}
}
const { slackUrl } = getEnv();

let message = `\`\`\`${JSON.stringify(metrics, null, 2)}\`\`\``;
console.log(message);

if (slackUrl === "") {
return;
}

try {
await axios.post(slackUrl, {
text: message,
});
} catch (e) {
console.log(
`Error posting to slack in slack reporter ${(e as Error).toString()}`
);
}
};
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,34 @@
import { QueryTypes } from "sequelize"
import { sequelizeConn } from "../db"

/*
* Metrics from the discovery DB
*
* These metrics are primarily used to make prometheus
* and grafana more readable/understandable
*/

/**
 * Counts the rows of `network_monitoring_users` recorded for the given run.
 *
 * @param run_id - Run identifier bound to the `:run_id` placeholder of the query.
 * @returns The `user_count` column of the first result row parsed as a base-10
 *   integer, or `0` when the query returns no row or the value cannot be
 *   parsed (missing, null, or non-numeric), so callers never receive `NaN`.
 */
export const getUserCount = async (run_id: number): Promise<number> => {
  const usersResp: unknown[] = await sequelizeConn.query(`
SELECT COUNT(*) as user_count
FROM network_monitoring_users
WHERE run_id = :run_id
`, {
    type: QueryTypes.SELECT,
    replacements: { run_id },
  });

  // The driver's value type is not guaranteed here, so treat the column as
  // unknown and stringify before parsing.
  const firstRow = usersResp[0] as { user_count?: unknown } | null | undefined;
  const rawCount = firstRow?.user_count ?? "0";

  // Explicit radix; fall back to 0 rather than letting NaN reach the gauges
  // and percentage maths downstream.
  const parsedCount = Number.parseInt(String(rawCount), 10);
  return Number.isNaN(parsedCount) ? 0 : parsedCount;
};

/*
* Core metrics
*/

export const getCidsReplicatedAtLeastOnce = async (run_id: number): Promise<{ content_node_spid: string, cid_count: number }[]> => {

const cidsListResp = await sequelizeConn.query(`
Expand Down Expand Up @@ -59,6 +87,8 @@ export const getPrimaryUserCount = async (run_id: number): Promise<{ endpoint: s
return primaryCount
}

// Count of users who have a specific content node in their replica set
// This is different from `getUserCount()` which literally just gets the number of users on Audius
export const getAllUserCount = async (run_id: number): Promise<{ endpoint: string, count: number }[]> => {
console.log(`[${run_id}] metric: all user count`);
const userListResp: unknown[] = await sequelizeConn.query(`
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,12 @@ const { pushGatewayUrl } = getEnv()

// Push gateway client built from the `pushGatewayUrl` value read from getEnv().
export const gateway = new client.Pushgateway(pushGatewayUrl)

/**
 * Gauge `audius_nm_user_count`: the number of users on Audius, labelled by
 * `run_id`. This is distinct from `audius_nm_all_user_count`, which counts
 * users that have a given content node in their replica set.
 */
export const userCountGauge = new client.Gauge({
name: 'audius_nm_user_count',
help: 'the number of users on audius',
labelNames: ['run_id'],
})

export const allUserCountGauge = new client.Gauge({
name: 'audius_nm_all_user_count',
help: 'the count of users with this content node in their replica set',
Expand Down