-
Notifications
You must be signed in to change notification settings - Fork 8.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[Security Solution][Telemetry] Concurrent telemetry requests #73558
Changes from 5 commits
3cf7b66
3a82039
93397b1
4ac318b
537aa12
5577ce4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,7 +6,7 @@ | |
|
||
import { LegacyAPICaller, CoreSetup } from '../../../../../src/core/server'; | ||
import { CollectorDependencies } from './types'; | ||
import { DetectionsUsage, fetchDetectionsUsage } from './detections'; | ||
import { DetectionsUsage, fetchDetectionsUsage, defaultDetectionsUsage } from './detections'; | ||
import { EndpointUsage, getEndpointTelemetryFromFleet } from './endpoints'; | ||
|
||
export type RegisterCollector = (deps: CollectorDependencies) => void; | ||
|
@@ -76,9 +76,14 @@ export const registerCollector: RegisterCollector = ({ | |
isReady: () => kibanaIndex.length > 0, | ||
fetch: async (callCluster: LegacyAPICaller): Promise<UsageData> => { | ||
const savedObjectsClient = await getInternalSavedObjectsClient(core); | ||
const [detections, endpoints] = await Promise.allSettled([ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ℹ️ nice use of |
||
fetchDetectionsUsage(kibanaIndex, callCluster, ml), | ||
getEndpointTelemetryFromFleet(savedObjectsClient), | ||
]); | ||
|
||
return { | ||
detections: await fetchDetectionsUsage(kibanaIndex, callCluster, ml), | ||
endpoints: await getEndpointTelemetryFromFleet(savedObjectsClient), | ||
detections: detections.status === 'fulfilled' ? detections.value : defaultDetectionsUsage, | ||
endpoints: endpoints.status === 'fulfilled' ? endpoints.value : {}, | ||
}; | ||
}, | ||
}); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -23,7 +23,10 @@ interface DetectionsMetric { | |
|
||
const isElasticRule = (tags: string[]) => tags.includes(`${INTERNAL_IMMUTABLE_KEY}:true`); | ||
|
||
const initialRulesUsage: DetectionRulesUsage = { | ||
/** | ||
* Default detection rule usage count | ||
*/ | ||
export const initialRulesUsage: DetectionRulesUsage = { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ❔ Doc comment on exports There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. will add, thanks |
||
custom: { | ||
enabled: 0, | ||
disabled: 0, | ||
|
@@ -34,7 +37,10 @@ const initialRulesUsage: DetectionRulesUsage = { | |
}, | ||
}; | ||
|
||
const initialMlJobsUsage: MlJobsUsage = { | ||
/** | ||
* Default ml job usage count | ||
*/ | ||
export const initialMlJobsUsage: MlJobsUsage = { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ❔ Docs on exports There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. added |
||
custom: { | ||
enabled: 0, | ||
disabled: 0, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -42,11 +42,14 @@ export interface AgentLocalMetadata extends AgentMetadata { | |
}; | ||
}; | ||
host: { | ||
hostname: string; | ||
id: string; | ||
name: string; | ||
}; | ||
os: { | ||
name: string; | ||
platform: string; | ||
kernel: string; | ||
version: string; | ||
full: string; | ||
}; | ||
|
@@ -78,17 +81,20 @@ export const updateEndpointOSTelemetry = ( | |
os: AgentLocalMetadata['os'], | ||
osTracker: OSTracker | ||
): OSTracker => { | ||
const updatedOSTracker = cloneDeep(osTracker); | ||
const { version: osVersion, platform: osPlatform, full: osFullName } = os; | ||
if (osFullName && osVersion) { | ||
if (updatedOSTracker[osFullName]) updatedOSTracker[osFullName].count += 1; | ||
else { | ||
updatedOSTracker[osFullName] = { | ||
full_name: osFullName, | ||
platform: osPlatform, | ||
version: osVersion, | ||
count: 1, | ||
}; | ||
let updatedOSTracker = osTracker; | ||
if (os && typeof os === 'object') { | ||
updatedOSTracker = cloneDeep(osTracker); | ||
const { version: osVersion, platform: osPlatform, full: osFullName } = os; | ||
if (osFullName && osVersion) { | ||
if (updatedOSTracker[osFullName]) updatedOSTracker[osFullName].count += 1; | ||
else { | ||
updatedOSTracker[osFullName] = { | ||
full_name: osFullName, | ||
platform: osPlatform, | ||
version: osVersion, | ||
count: 1, | ||
}; | ||
} | ||
} | ||
} | ||
|
||
|
@@ -211,46 +217,53 @@ export const getEndpointTelemetryFromFleet = async ( | |
if (!endpointAgents || endpointAgentsCount < 1) return endpointTelemetry; | ||
|
||
// Use unique hosts to prevent any potential duplicates | ||
const uniqueHostIds: Set<string> = new Set(); | ||
const uniqueHosts: Set<string> = new Set(); | ||
let osTracker: OSTracker = {}; | ||
let dailyActiveCount = 0; | ||
let policyTracker: PoliciesTelemetry = { malware: { active: 0, inactive: 0, failure: 0 } }; | ||
|
||
for (let i = 0; i < endpointAgentsCount; i += 1) { | ||
const { attributes: metadataAttributes } = endpointAgents[i]; | ||
const { last_checkin: lastCheckin, local_metadata: localMetadata } = metadataAttributes; | ||
const { host, os, elastic } = localMetadata as AgentLocalMetadata; // AgentMetadata is just an empty blob, casting for our use case | ||
|
||
if (!uniqueHostIds.has(host.id)) { | ||
uniqueHostIds.add(host.id); | ||
const agentId = elastic?.agent?.id; | ||
osTracker = updateEndpointOSTelemetry(os, osTracker); | ||
|
||
if (agentId) { | ||
let agentEvents; | ||
try { | ||
const response = await getLatestFleetEndpointEvent(soClient, agentId); | ||
agentEvents = response.saved_objects; | ||
} catch (error) { | ||
// If the request fails we do not obtain `active within last 24 hours for this agent` or policy specifics | ||
} | ||
|
||
// AgentEvents will have a max length of 1 | ||
if (agentEvents && agentEvents.length > 0) { | ||
const latestEndpointEvent = agentEvents[0]; | ||
dailyActiveCount = updateEndpointDailyActiveCount( | ||
latestEndpointEvent, | ||
lastCheckin, | ||
dailyActiveCount | ||
try { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This looks more aggressive than it is. I just moved the contents of the |
||
const { attributes: metadataAttributes } = endpointAgents[i]; | ||
const { last_checkin: lastCheckin, local_metadata: localMetadata } = metadataAttributes; | ||
const { host, os, elastic } = localMetadata as AgentLocalMetadata; | ||
|
||
// Although not perfect, the goal is to dedupe hosts to get the most recent data for a host | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. fyi @jonathan-buttner per our conversation |
||
// An agent re-installed on the same host will have all the same id, name, and kernel details | ||
// A cloned VM will have the same id, but "may" have the same name and kernel, but it's really up to the user. | ||
const compoundUniqueId = `${host?.id}-${host?.hostname}-${os?.kernel}`; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. hmm actually thinking about this more, what would happen in the scenario where a user updates their computer to a new OS version? I think the I wonder if we should just stick with There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. True, good point. Yea, the OS information would be tricky in update scenarios. I'll simplify it to just those two. Thanks! |
||
if (!uniqueHosts.has(compoundUniqueId)) { | ||
uniqueHosts.add(compoundUniqueId); | ||
const agentId = elastic?.agent?.id; | ||
osTracker = updateEndpointOSTelemetry(os, osTracker); | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ❔ Could you add a comment here about where the error was throwing? Moving the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. added |
||
if (agentId) { | ||
const { saved_objects: agentEvents } = await getLatestFleetEndpointEvent( | ||
soClient, | ||
agentId | ||
); | ||
policyTracker = updateEndpointPolicyTelemetry(latestEndpointEvent, policyTracker); | ||
|
||
// AgentEvents will have a max length of 1 | ||
if (agentEvents && agentEvents.length > 0) { | ||
const latestEndpointEvent = agentEvents[0]; | ||
dailyActiveCount = updateEndpointDailyActiveCount( | ||
latestEndpointEvent, | ||
lastCheckin, | ||
dailyActiveCount | ||
); | ||
policyTracker = updateEndpointPolicyTelemetry(latestEndpointEvent, policyTracker); | ||
} | ||
} | ||
} | ||
} catch (error) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we log an error/warning here? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. see previous comment :). I'll also check with the Telemetry team to see how they handle errors to confirm |
||
// All errors thrown in the loop would be handled here | ||
// Not logging any errors to avoid leaking any potential PII | ||
// Depending on when the error is thrown in the loop some specifics may be missing, but it allows the loop to continue | ||
} | ||
} | ||
|
||
// All unique hosts with an endpoint installed, thus all unique endpoint installs | ||
endpointTelemetry.total_installed = uniqueHostIds.size; | ||
endpointTelemetry.total_installed = uniqueHosts.size; | ||
// Set the daily active count for the endpoints | ||
endpointTelemetry.active_within_last_24_hours = dailyActiveCount; | ||
// Get the objects to populate our OS Telemetry | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If one of the requests fails should we log any errors? or will that already happen?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It doesn't already happen, we can throw a log in here, but
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That makes sense. Might be worth asking some others on the telemetry team what the general guidance is.
I guess my thinking was if we release a new stack version and we notice that we're not getting any telemetry for some reason, I believe we collect the logs of our cloud deployments so we could poke around and and least see if one of the requests is failing or something 🤷 .
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yea, as far as I can tell there aren't any patterns it sounds like. We'll just see an empty object for now if anything fails, but I think we can work with the telemetry team put in some better logic here for 7.10