Skip to content

Commit

Permalink
Metricbeat: store only top N processes by CPU/memory
Browse files Browse the repository at this point in the history
This adds the option to only report on the top N processes by CPU and/or
memory. It is useful because storing metrics about each and every process from
every host can be fairly expensive from the storage point of view. Previously
it was possible to filter processes by name, which was useful if one knew in
advance which are the most interesting processes. This adds a new option which
should be quite convenient in practice, because the number of per-process
documents gets limited while still allowing the top processes to be displayed.

Closes #4126.
  • Loading branch information
Tudor Golubenco committed May 2, 2017
1 parent 3ffbeb2 commit 5e142c0
Show file tree
Hide file tree
Showing 10 changed files with 290 additions and 26 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,7 @@ https://github.com/elastic/beats/compare/v5.1.1...master[Check the HEAD diff]
- Adding query APIs for metricsets and modules from metricbeat registry {pull}4102[4102]
- Fixing nil pointer on prometheus collector when http response is nil {pull}4119[4119]
- Add http module with json metricset. {pull}4092[4092]
- Add an option to the system module to include only the top N processes by CPU and memory. {pull}4127[4127]

*Packetbeat*
- Add `fields` and `fields_under_root` to packetbeat protocols configurations. {pull}3518[3518]
Expand Down
22 changes: 22 additions & 0 deletions metricbeat/docs/modules/system.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,25 @@ metricbeat.modules:
metricsets: ["process"]
processes: ['.*']
----

*`process.include_top_n`*:: These options allow you to filter out all processes
that are not in the top N by CPU or memory, in order to reduce the number of
documents created. If both the `by_cpu` and `by_memory` options are used, the
union of the two top-N lists is included.

*`process.include_top_n.enabled`*:: Set to false to disable the top N feature and
include all processes, regardless of the other options. The default is `true`,
but nothing is filtered unless one of the other options (`by_cpu` or `by_memory`)
is set to a non-zero value.

*`process.include_top_n.by_cpu`*:: How many processes to include from the top
by CPU. The processes are sorted by the `system.process.cpu.total.pct` field.
The default is 0.

*`process.include_top_n.by_memory`*:: How many processes to include from the top
by memory. The processes are sorted by the `system.process.memory.rss.bytes`
field. The default is 0.

*`process.cgroups.enabled`*:: When the `process` metricset is enabled, you can
use this boolean configuration option to disable cgroup metrics. By default
cgroup metrics collection is enabled.
Expand Down Expand Up @@ -100,6 +119,9 @@ metricbeat.modules:
enabled: true
period: 10s
processes: ['.*']
process.include_top_n:
by_cpu: 5 # include top 5 processes by CPU
by_memory: 5 # include top 5 processes by memory
----

[float]
Expand Down
17 changes: 17 additions & 0 deletions metricbeat/metricbeat.full.yml
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,23 @@ metricbeat.modules:
# if true, exports the CPU usage in ticks, together with the percentage values
#cpu_ticks: false

# These options allow you to filter out all processes that are not
# in the top N by CPU or memory, in order to reduce the number of documents created.
# If both the `by_cpu` and `by_memory` options are used, the union of the two
# top-N lists is included.
#process.include_top_n:
#
# Set to false to disable this feature and include all processes
#enabled: true

# How many processes to include from the top by CPU. The processes are sorted
# by the `system.process.cpu.total.pct` field.
#by_cpu: 0

# How many processes to include from the top by memory. The processes are sorted
# by the `system.process.memory.rss.bytes` field.
#by_memory: 0

# If false, cmdline of a process is not cached.
#process.cmdline.cache.enabled: true

Expand Down
3 changes: 3 additions & 0 deletions metricbeat/metricbeat.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,9 @@ metricbeat.modules:
enabled: true
period: 10s
processes: ['.*']
process.include_top_n:
by_cpu: 5 # include top 5 processes by CPU
by_memory: 5 # include top 5 processes by memory



Expand Down
17 changes: 17 additions & 0 deletions metricbeat/module/system/_meta/config.full.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,23 @@
# if true, exports the CPU usage in ticks, together with the percentage values
#cpu_ticks: false

# These options allow you to filter out all processes that are not
# in the top N by CPU or memory, in order to reduce the number of documents created.
# If both the `by_cpu` and `by_memory` options are used, the union of the two
# top-N lists is included.
#process.include_top_n:
#
# Set to false to disable this feature and include all processes
#enabled: true

# How many processes to include from the top by CPU. The processes are sorted
# by the `system.process.cpu.total.pct` field.
#by_cpu: 0

# How many processes to include from the top by memory. The processes are sorted
# by the `system.process.memory.rss.bytes` field.
#by_memory: 0

# If false, cmdline of a process is not cached.
#process.cmdline.cache.enabled: true

Expand Down
3 changes: 3 additions & 0 deletions metricbeat/module/system/_meta/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,6 @@
enabled: true
period: 10s
processes: ['.*']
process.include_top_n:
by_cpu: 5 # include top 5 processes by CPU
by_memory: 5 # include top 5 processes by memory
19 changes: 19 additions & 0 deletions metricbeat/module/system/_meta/docs.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,25 @@ metricbeat.modules:
metricsets: ["process"]
processes: ['.*']
----

*`process.include_top_n`*:: These options allow you to filter out all processes
that are not in the top N by CPU or memory, in order to reduce the number of
documents created. If both the `by_cpu` and `by_memory` options are used, the
union of the two top-N lists is included.

*`process.include_top_n.enabled`*:: Set to false to disable the top N feature and
include all processes, regardless of the other options. The default is `true`,
but nothing is filtered unless one of the other options (`by_cpu` or `by_memory`)
is set to a non-zero value.

*`process.include_top_n.by_cpu`*:: How many processes to include from the top
by CPU. The processes are sorted by the `system.process.cpu.total.pct` field.
The default is 0.

*`process.include_top_n.by_memory`*:: How many processes to include from the top
by memory. The processes are sorted by the `system.process.memory.rss.bytes`
field. The default is 0.

*`process.cgroups.enabled`*:: When the `process` metricset is enabled, you can
use this boolean configuration option to disable cgroup metrics. By default
cgroup metrics collection is enabled.
Expand Down
93 changes: 72 additions & 21 deletions metricbeat/module/system/process/helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"fmt"
"os"
"runtime"
"sort"
"strings"
"time"

Expand All @@ -20,19 +21,20 @@ import (
type ProcsMap map[int]*Process

// Process holds the data collected for a single process. The identification
// fields (Pid through Cwd) carry JSON tags; the remaining fields do not.
// NOTE(review): the field list appears twice in this listing, presumably
// because the removed and added sides of the diff are shown together.
type Process struct {
Pid int `json:"pid"`
Ppid int `json:"ppid"`
Pgid int `json:"pgid"`
Name string `json:"name"`
Username string `json:"username"`
State string `json:"state"`
CmdLine string `json:"cmdline"`
Cwd string `json:"cwd"`
Mem sigar.ProcMem
Cpu sigar.ProcTime
Ctime time.Time
FD sigar.ProcFDUsage
Env common.MapStr
Pid int `json:"pid"`
Ppid int `json:"ppid"`
Pgid int `json:"pgid"`
Name string `json:"name"`
Username string `json:"username"`
State string `json:"state"`
CmdLine string `json:"cmdline"`
Cwd string `json:"cwd"`
Mem sigar.ProcMem
Cpu sigar.ProcTime
Ctime time.Time
FD sigar.ProcFDUsage
Env common.MapStr
// cpuTotalPct is the value returned by GetProcCpuPercentage for this process,
// set in GetProcStats. It is reported as cpu.total.pct in the process event
// and is the key used when ranking processes by CPU.
cpuTotalPct float64
}

type ProcStats struct {
Expand All @@ -41,6 +43,7 @@ type ProcStats struct {
CpuTicks bool
EnvWhitelist []string
CacheCmdLine bool
IncludeTop includeTopConfig

procRegexps []match.Matcher // List of regular expressions used to whitelist processes.
envRegexps []match.Matcher // List of regular expressions used to whitelist env vars.
Expand Down Expand Up @@ -216,7 +219,7 @@ func getProcState(b byte) string {
return "unknown"
}

func (procStats *ProcStats) GetProcessEvent(process *Process, last *Process) common.MapStr {
func (procStats *ProcStats) getProcessEvent(process *Process) common.MapStr {
proc := common.MapStr{
"pid": process.Pid,
"ppid": process.Ppid,
Expand Down Expand Up @@ -248,7 +251,7 @@ func (procStats *ProcStats) GetProcessEvent(process *Process, last *Process) com

proc["cpu"] = common.MapStr{
"total": common.MapStr{
"pct": GetProcCpuPercentage(last, process),
"pct": process.cpuTotalPct,
},
"start_time": unixTimeMsToTime(process.Cpu.StartTime),
}
Expand Down Expand Up @@ -336,7 +339,7 @@ func (procStats *ProcStats) GetProcStats() ([]common.MapStr, error) {
return nil, err
}

processes := []common.MapStr{}
processes := []Process{}
newProcs := make(ProcsMap, len(pids))

for _, pid := range pids {
Expand All @@ -363,16 +366,64 @@ func (procStats *ProcStats) GetProcStats() ([]common.MapStr, error) {
}

newProcs[process.Pid] = process

last := procStats.ProcsMap[process.Pid]
proc := procStats.GetProcessEvent(process, last)
process.cpuTotalPct = GetProcCpuPercentage(last, process)
processes = append(processes, *process)
}
}
procStats.ProcsMap = newProcs

processes = procStats.includeTopProcesses(processes)
logp.Debug("processes", "Filtered top processes down to %d processes", len(processes))

procs := []common.MapStr{}
for _, process := range processes {
proc := procStats.getProcessEvent(&process)
procs = append(procs, proc)
}

return procs, nil
}

processes = append(processes, proc)
// includeTopProcesses reduces processes to the union of the top
// IncludeTop.ByCPU entries ranked by cpuTotalPct and the top
// IncludeTop.ByMemory entries ranked by Mem.Resident.
//
// The input is returned unchanged when the feature is disabled or when both
// limits are zero. Otherwise the input slice is re-sorted in place and a new
// slice is returned: the CPU picks come first (highest usage first), followed
// by the memory picks that were not already selected. A limit larger than the
// number of processes selects all of them instead of slicing out of range.
func (procStats *ProcStats) includeTopProcesses(processes []Process) []Process {
	if !procStats.IncludeTop.Enabled ||
		(procStats.IncludeTop.ByCPU == 0 && procStats.IncludeTop.ByMemory == 0) {
		return processes
	}

	result := []Process{}
	if procStats.IncludeTop.ByCPU > 0 {
		sort.Slice(processes, func(i, j int) bool {
			return processes[i].cpuTotalPct > processes[j].cpuTotalPct
		})
		// Clamp the limit: there may be fewer processes than requested.
		numProcs := procStats.IncludeTop.ByCPU
		if numProcs > len(processes) {
			numProcs = len(processes)
		}
		result = append(result, processes[:numProcs]...)
	}

	if procStats.IncludeTop.ByMemory > 0 {
		sort.Slice(processes, func(i, j int) bool {
			return processes[i].Mem.Resident > processes[j].Mem.Resident
		})
		// Clamp the limit: there may be fewer processes than requested.
		numProcs := procStats.IncludeTop.ByMemory
		if numProcs > len(processes) {
			numProcs = len(processes)
		}
		for i := range processes[:numProcs] {
			// Skip processes already selected by the CPU ranking.
			if !isProcessInSlice(result, &processes[i]) {
				result = append(result, processes[i])
			}
		}
	}

	return result
}

// isProcessInSlice reports whether processes contains an entry whose Pid
// equals the Pid of proc.
func isProcessInSlice(processes []Process, proc *Process) bool {
	for idx := 0; idx < len(processes); idx++ {
		if processes[idx].Pid != proc.Pid {
			continue
		}
		return true
	}
	return false
}

// isWhitelistedEnvVar returns true if the given variable name is a match for
Expand Down
Loading

0 comments on commit 5e142c0

Please sign in to comment.