Skip to content

Commit

Permalink
[profiling] Add experimental CPU profiler (#3895)
Browse files Browse the repository at this point in the history
* Add CPU profiler

* Avoid restarting profiling for last collect

Upon process exit, profiler collects profiles a last time and was
restarting all profilers before stopping them.
Avoid this useless restart by changing the profiler API and make
`profile()` methode take a restart argument.

* Bump pprof-nodejs version
  • Loading branch information
nsavoire authored and szegedi committed Jan 17, 2024
1 parent 909d99d commit 8816250
Show file tree
Hide file tree
Showing 15 changed files with 166 additions and 68 deletions.
22 changes: 14 additions & 8 deletions integration-tests/profiler.spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,8 @@ describe('profiler', () => {
const threadIdKey = strings.dedup('thread id')
const osThreadIdKey = strings.dedup('os thread id')
const threadNameValue = strings.dedup('Main Event Loop')
const nonJSThreadNameValue = strings.dedup('Non-JS threads')

for (const sample of profile.sample) {
let ts, spanId, rootSpanId, endpoint, threadName, threadId, osThreadId
for (const label of sample.label) {
Expand All @@ -205,14 +207,18 @@ describe('profiler', () => {
default: assert.fail(`Unexpected label key ${strings.dedup(label.key)} ${encoded}`)
}
}
// Timestamp must be defined and be between process start and end time
assert.isDefined(ts, encoded)
assert.isNumber(osThreadId, encoded)
assert.equal(threadId, strings.dedup('0'), encoded)
assert.isTrue(ts <= procEnd, encoded)
assert.isTrue(ts >= procStart, encoded)
// Thread name must be defined and exactly equal "Main Event Loop"
assert.equal(threadName, threadNameValue, encoded)
if (threadName !== nonJSThreadNameValue) {
// Timestamp must be defined and be between process start and end time
assert.isDefined(ts, encoded)
assert.isNumber(osThreadId, encoded)
assert.equal(threadId, strings.dedup('0'), encoded)
assert.isTrue(ts <= procEnd, encoded)
assert.isTrue(ts >= procStart, encoded)
// Thread name must be defined and exactly equal "Main Event Loop"
assert.equal(threadName, threadNameValue, encoded)
} else {
assert.equal(threadId, strings.dedup('NA'), encoded)
}
// Either all or none of span-related labels are defined
if (endpoint === undefined) {
// It is possible to catch a sample executing in tracer's startSpan so
Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@
"@datadog/native-iast-rewriter": "2.2.2",
"@datadog/native-iast-taint-tracking": "1.6.4",
"@datadog/native-metrics": "^2.0.0",
"@datadog/pprof": "4.1.0",
"@datadog/pprof": "5.0.0",
"@datadog/sketches-js": "^2.1.0",
"@opentelemetry/api": "^1.0.0",
"@opentelemetry/core": "^1.14.0",
Expand Down
4 changes: 4 additions & 0 deletions packages/dd-trace/src/profiling/config.js
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ class Config {
DD_PROFILING_HEAP_ENABLED,
DD_PROFILING_V8_PROFILER_BUG_WORKAROUND,
DD_PROFILING_WALLTIME_ENABLED,
DD_PROFILING_EXPERIMENTAL_CPU_ENABLED,
DD_PROFILING_EXPERIMENTAL_OOM_MONITORING_ENABLED,
DD_PROFILING_EXPERIMENTAL_OOM_HEAP_LIMIT_EXTENSION_SIZE,
DD_PROFILING_EXPERIMENTAL_OOM_MAX_HEAP_EXTENSION_COUNT,
Expand Down Expand Up @@ -150,6 +151,9 @@ class Config {
DD_PROFILING_EXPERIMENTAL_CODEHOTSPOTS_ENABLED, false))
logExperimentalVarDeprecation('CODEHOTSPOTS_ENABLED')

this.cpuProfilingEnabled = isTrue(coalesce(options.cpuProfilingEnabled,
DD_PROFILING_EXPERIMENTAL_CPU_ENABLED, false))

this.profilers = ensureProfilers(profilers, this)
}
}
Expand Down
3 changes: 2 additions & 1 deletion packages/dd-trace/src/profiling/exporters/file.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

const fs = require('fs')
const { promisify } = require('util')
const { threadId } = require('worker_threads')
const writeFile = promisify(fs.writeFile)

function formatDateTime (t) {
Expand All @@ -19,7 +20,7 @@ class FileExporter {
const types = Object.keys(profiles)
const dateStr = formatDateTime(end)
const tasks = types.map(type => {
return writeFile(`${this._pprofPrefix}${type}_${dateStr}.pprof`, profiles[type])
return writeFile(`${this._pprofPrefix}${type}_worker_${threadId}_${dateStr}.pprof`, profiles[type])
})

return Promise.all(tasks)
Expand Down
32 changes: 18 additions & 14 deletions packages/dd-trace/src/profiling/profiler.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
const { EventEmitter } = require('events')
const { Config } = require('./config')
const { snapshotKinds } = require('./constants')
const { threadNamePrefix } = require('./profilers/shared')

function maybeSourceMap (sourceMap, SourceMapper, debug) {
if (!sourceMap) return
Expand Down Expand Up @@ -68,7 +69,7 @@ class Profiler extends EventEmitter {
mapper,
nearOOMCallback: this._nearOOMExport.bind(this)
})
this._logger.debug(`Started ${profiler.type} profiler`)
this._logger.debug(`Started ${profiler.type} profiler in ${threadNamePrefix} thread`)
}

this._capture(this._timeoutInterval, start)
Expand Down Expand Up @@ -97,7 +98,7 @@ class Profiler extends EventEmitter {

// collect and export current profiles
// once collect returns, profilers can be safely stopped
this._collect(snapshotKinds.ON_SHUTDOWN)
this._collect(snapshotKinds.ON_SHUTDOWN, false)
this._stop()
}

Expand All @@ -108,13 +109,11 @@ class Profiler extends EventEmitter {

for (const profiler of this._config.profilers) {
profiler.stop()
this._logger.debug(`Stopped ${profiler.type} profiler`)
this._logger.debug(`Stopped ${profiler.type} profiler in ${threadNamePrefix} thread`)
}

clearTimeout(this._timer)
this._timer = undefined

return this
}

_capture (timeout, start) {
Expand All @@ -128,18 +127,21 @@ class Profiler extends EventEmitter {
}
}

async _collect (snapshotKind) {
async _collect (snapshotKind, restart = true) {
if (!this._enabled) return

const start = this._lastStart
const end = new Date()
const startDate = this._lastStart
const endDate = new Date()
const profiles = []
const encodedProfiles = {}

try {
// collect profiles synchronously so that profilers can be safely stopped asynchronously
for (const profiler of this._config.profilers) {
const profile = profiler.profile(start, end)
const profile = profiler.profile(restart, startDate, endDate)
if (!restart) {
this._logger.debug(`Stopped ${profiler.type} profiler in ${threadNamePrefix} thread`)
}
if (!profile) continue
profiles.push({ profiler, profile })
}
Expand All @@ -155,8 +157,10 @@ class Profiler extends EventEmitter {
})
}

this._capture(this._timeoutInterval, end)
await this._submit(encodedProfiles, start, end, snapshotKind)
if (restart) {
this._capture(this._timeoutInterval, endDate)
}
await this._submit(encodedProfiles, startDate, endDate, snapshotKind)
this._logger.debug('Submitted profiles')
} catch (err) {
this._logger.error(err)
Expand Down Expand Up @@ -196,10 +200,10 @@ class ServerlessProfiler extends Profiler {
this._flushAfterIntervals = this._config.flushInterval / 1000
}

async _collect (snapshotKind) {
if (this._profiledIntervals >= this._flushAfterIntervals) {
async _collect (snapshotKind, restart = true) {
if (this._profiledIntervals >= this._flushAfterIntervals || !restart) {
this._profiledIntervals = 0
await super._collect(snapshotKind)
await super._collect(snapshotKind, restart)
} else {
this._profiledIntervals += 1
this._capture(this._timeoutInterval, new Date())
Expand Down
14 changes: 10 additions & 4 deletions packages/dd-trace/src/profiling/profilers/events.js
Original file line number Diff line number Diff line change
Expand Up @@ -159,22 +159,24 @@ class EventsProfiler {
}

start () {
// if already started, do nothing
if (this._observer) return

function add (items) {
this.entries.push(...items.getEntries())
}
if (!this._observer) {
this._observer = new PerformanceObserver(add.bind(this))
}
this._observer = new PerformanceObserver(add.bind(this))
this._observer.observe({ entryTypes: Object.keys(decoratorTypes) })
}

stop () {
if (this._observer) {
this._observer.disconnect()
this._observer = undefined
}
}

profile (startDate, endDate) {
profile (restart, startDate, endDate) {
if (this.entries.length === 0) {
// No events in the period; don't produce a profile
return null
Expand Down Expand Up @@ -243,6 +245,10 @@ class EventsProfiler {
unit: stringTable.dedup(pprofValueUnit)
})

if (!restart) {
this.stop()
}

return new Profile({
sampleType: [timeValueType],
timeNanos: endDate.getTime() * MS_TO_NS,
Expand Down
6 changes: 6 additions & 0 deletions packages/dd-trace/src/profiling/profilers/shared.js
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,17 @@ function cacheThreadLabels () {
}
}

function getNonJSThreadsLabels () {
return { [THREAD_NAME_LABEL]: 'Non-JS threads', [THREAD_ID_LABEL]: 'NA', [OS_THREAD_ID_LABEL]: 'NA' }
}

module.exports = {
END_TIMESTAMP_LABEL,
THREAD_NAME_LABEL,
THREAD_ID_LABEL,
OS_THREAD_ID_LABEL,
threadNamePrefix,
eventLoopThreadName,
getNonJSThreadsLabels,
getThreadLabels: cacheThreadLabels()
}
19 changes: 17 additions & 2 deletions packages/dd-trace/src/profiling/profilers/space.js
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,12 @@ class NativeSpaceProfiler {
this._stackDepth = options.stackDepth || 64
this._pprof = undefined
this._oomMonitoring = options.oomMonitoring || {}
this._started = false
}

start ({ mapper, nearOOMCallback } = {}) {
if (this._started) return

this._mapper = mapper
this._pprof = require('@datadog/pprof')
this._pprof.heap.start(this._samplingInterval, this._stackDepth)
Expand All @@ -31,18 +34,30 @@ class NativeSpaceProfiler {
strategiesToCallbackMode(strategies, this._pprof.heap.CallbackMode)
)
}

this._started = true
}

profile () {
return this._pprof.heap.profile(undefined, this._mapper, getThreadLabels)
profile (restart) {
const profile = this._pprof.heap.profile(undefined, this._mapper, getThreadLabels)
if (!restart) {
this.stop()
}
return profile
}

encode (profile) {
return this._pprof.encode(profile)
}

stop () {
if (!this._started) return
this._pprof.heap.stop()
this._started = false
}

isStarted () {
return this._started
}
}

Expand Down
55 changes: 34 additions & 21 deletions packages/dd-trace/src/profiling/profilers/wall.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ const { HTTP_METHOD, HTTP_ROUTE, RESOURCE_NAME, SPAN_TYPE } = require('../../../
const { WEB } = require('../../../../../ext/types')
const runtimeMetrics = require('../../runtime_metrics')
const telemetryMetrics = require('../../telemetry/metrics')
const { END_TIMESTAMP_LABEL, getThreadLabels } = require('./shared')
const { END_TIMESTAMP_LABEL, getNonJSThreadsLabels, getThreadLabels } = require('./shared')

const beforeCh = dc.channel('dd-trace:storage:before')
const enterCh = dc.channel('dd-trace:storage:enter')
Expand Down Expand Up @@ -78,13 +78,15 @@ class NativeWallProfiler {
this._codeHotspotsEnabled = !!options.codeHotspotsEnabled
this._endpointCollectionEnabled = !!options.endpointCollectionEnabled
this._timelineEnabled = !!options.timelineEnabled
this._cpuProfilingEnabled = !!options.cpuProfilingEnabled
// We need to capture span data into the sample context for either code hotspots
// or endpoint collection.
this._captureSpanData = this._codeHotspotsEnabled || this._endpointCollectionEnabled
// We need to run the pprof wall profiler with sample contexts if we're either
// capturing span data or timeline is enabled (so we need sample timestamps, and for now
// timestamps require the sample contexts feature in the pprof wall profiler.)
this._withContexts = this._captureSpanData || this._timelineEnabled
// timestamps require the sample contexts feature in the pprof wall profiler), or
// cpu profiling is enabled.
this._withContexts = this._captureSpanData || this._timelineEnabled || this._cpuProfilingEnabled
this._v8ProfilerBugWorkaroundEnabled = !!options.v8ProfilerBugWorkaroundEnabled
this._mapper = undefined
this._pprof = undefined
Expand Down Expand Up @@ -131,7 +133,8 @@ class NativeWallProfiler {
sourceMapper: this._mapper,
withContexts: this._withContexts,
lineNumbers: false,
workaroundV8Bug: this._v8ProfilerBugWorkaroundEnabled
workaroundV8Bug: this._v8ProfilerBugWorkaroundEnabled,
collectCpuTime: this._cpuProfilingEnabled
})

if (this._withContexts) {
Expand Down Expand Up @@ -220,22 +223,42 @@ class NativeWallProfiler {

_stop (restart) {
if (!this._started) return

if (this._captureSpanData) {
// update last sample context if needed
this._enter()
this._lastSampleCount = 0
}
const profile = this._pprof.time.stop(restart, this._generateLabels)

if (restart) {
const v8BugDetected = this._pprof.time.v8ProfilerStuckEventLoopDetected()
if (v8BugDetected !== 0) {
this._reportV8bug(v8BugDetected === 1)
}
} else {
if (this._captureSpanData) {
beforeCh.unsubscribe(this._enter)
enterCh.unsubscribe(this._enter)
spanFinishCh.unsubscribe(this._spanFinished)
this._profilerState = undefined
this._lastSpan = undefined
this._lastStartedSpans = undefined
this._lastWebTags = undefined
}
this._started = false
}

return profile
}

_generateLabels (context) {
_generateLabels ({ node, context }) {
// check for special node that represents CPU time all non-JS threads.
// In that case only return a special thread name label since we cannot associate any timestamp/span/endpoint to it.
if (node.name === this._pprof.time.constants.NON_JS_THREADS_FUNCTION_NAME) {
return getNonJSThreadsLabels()
}

if (context == null) {
// generateLabels is also called for samples without context.
// In that case just return thread labels.
Expand Down Expand Up @@ -267,30 +290,20 @@ class NativeWallProfiler {
return labels
}

profile () {
return this._stop(true)
profile (restart) {
return this._stop(restart)
}

encode (profile) {
return this._pprof.encode(profile)
}

stop () {
if (!this._started) return

const profile = this._stop(false)
if (this._captureSpanData) {
beforeCh.unsubscribe(this._enter)
enterCh.unsubscribe(this._enter)
spanFinishCh.unsubscribe(this._spanFinished)
this._profilerState = undefined
this._lastSpan = undefined
this._lastStartedSpans = undefined
this._lastWebTags = undefined
}
this._stop(false)
}

this._started = false
return profile
isStarted () {
return this._started
}
}

Expand Down
Loading

0 comments on commit 8816250

Please sign in to comment.