Metricbeat: store only top N processes by CPU/memory

This adds the option to only report on the top N processes by CPU and/or memory. It is useful because storing metrics about each and every process from every host can be fairly expensive in terms of storage. Previously it was possible to filter processes by name, which was useful if one knew in advance which processes were the most interesting. This adds a new option which should be quite convenient in practice, because the number of per-process documents is limited while the top processes can still be displayed. Closes #4126.
elastic · May 2, 2017 · 5e142c0 · 5e142c0
1 parent 3ffbeb2
commit 5e142c0
Show file tree

Hide file tree

Showing 10 changed files with 290 additions and 26 deletions.
diff --git a/CHANGELOG.asciidoc b/CHANGELOG.asciidoc
@@ -152,6 +152,7 @@ https://github.com/elastic/beats/compare/v5.1.1...master[Check the HEAD diff]
 - Adding query APIs for metricsets and modules from metricbeat registry {pull}4102[4102]
 - Fixing nil pointer on prometheus collector when http response is nil {pull}4119[4119]
 - Add http module with json metricset. {pull}4092[4092]
+- Add the option to the system module to include only the top N processes by CPU and memory. {pull}4127[4127]
 
 *Packetbeat*
 - Add `fields` and `fields_under_root` to packetbeat protocols configurations. {pull}3518[3518]

diff --git a/metricbeat/docs/modules/system.asciidoc b/metricbeat/docs/modules/system.asciidoc
@@ -26,6 +26,25 @@ metricbeat.modules:
   metricsets: ["process"]
   processes: ['.*']
 ----
+
+*`process.include_top_n`*:: These options allow you to filter out all processes
+that are not in the top N by CPU or memory, in order to reduce the number of
+documents created. If both the `by_cpu` and `by_memory` options are used, the
+union of the two sets is included.
+
+*`process.include_top_n.enabled`*:: Set to false to disable the top N feature and
+include all processes, regardless of the other options. The default is `true`,
+but nothing is filtered unless one of the other options (`by_cpu` or `by_memory`)
+is set to a non-zero value.
+
+*`process.include_top_n.by_cpu`*::  How many processes to include from the top
+by CPU. The processes are sorted by the `system.process.cpu.total.pct` field.
+The default is 0.
+
+*`process.include_top_n.by_memory`*:: How many processes to include from the top
+by memory. The processes are sorted by the `system.process.memory.rss.bytes`
+field. The default is 0.
+
 *`process.cgroups.enabled`*:: When the `process` metricset is enabled, you can
 use this boolean configuration option to disable cgroup metrics. By default
 cgroup metrics collection is enabled.
@@ -100,6 +119,9 @@ metricbeat.modules:
   enabled: true
   period: 10s
   processes: ['.*']
+  process.include_top_n:
+    by_cpu: 5      # include top 5 processes by CPU
+    by_memory: 5   # include top 5 processes by memory
 ----
 
 [float]

diff --git a/metricbeat/metricbeat.full.yml b/metricbeat/metricbeat.full.yml
@@ -64,6 +64,23 @@ metricbeat.modules:
   # if true, exports the CPU usage in ticks, together with the percentage values
   #cpu_ticks: false
 
+  # These options allow you to filter out all processes that are not
+  # in the top N by CPU or memory, in order to reduce the number of documents created.
+  # If both the `by_cpu` and `by_memory` options are used, the union of the two sets
+  # is included.
+  #process.include_top_n:
+    #
+    # Set to false to disable this feature and include all processes
+    #enabled: true
+
+    # How many processes to include from the top by CPU. The processes are sorted
+    # by the `system.process.cpu.total.pct` field.
+    #by_cpu: 0
+
+    # How many processes to include from the top by memory. The processes are sorted
+    # by the `system.process.memory.rss.bytes` field.
+    #by_memory: 0
+
   # If false, cmdline of a process is not cached.
   #process.cmdline.cache.enabled: true
 

diff --git a/metricbeat/metricbeat.yml b/metricbeat/metricbeat.yml
@@ -45,6 +45,9 @@ metricbeat.modules:
   enabled: true
   period: 10s
   processes: ['.*']
+  process.include_top_n:
+    by_cpu: 5      # include top 5 processes by CPU
+    by_memory: 5   # include top 5 processes by memory
 
 
 

diff --git a/metricbeat/module/system/_meta/config.full.yml b/metricbeat/module/system/_meta/config.full.yml
@@ -36,6 +36,23 @@
   # if true, exports the CPU usage in ticks, together with the percentage values
   #cpu_ticks: false
 
+  # These options allow you to filter out all processes that are not
+  # in the top N by CPU or memory, in order to reduce the number of documents created.
+  # If both the `by_cpu` and `by_memory` options are used, the union of the two sets
+  # is included.
+  #process.include_top_n:
+    #
+    # Set to false to disable this feature and include all processes
+    #enabled: true
+
+    # How many processes to include from the top by CPU. The processes are sorted
+    # by the `system.process.cpu.total.pct` field.
+    #by_cpu: 0
+
+    # How many processes to include from the top by memory. The processes are sorted
+    # by the `system.process.memory.rss.bytes` field.
+    #by_memory: 0
+
   # If false, cmdline of a process is not cached.
   #process.cmdline.cache.enabled: true
 

diff --git a/metricbeat/module/system/_meta/config.yml b/metricbeat/module/system/_meta/config.yml
@@ -32,3 +32,6 @@
   enabled: true
   period: 10s
   processes: ['.*']
+  process.include_top_n:
+    by_cpu: 5      # include top 5 processes by CPU
+    by_memory: 5   # include top 5 processes by memory
diff --git a/metricbeat/module/system/_meta/docs.asciidoc b/metricbeat/module/system/_meta/docs.asciidoc
@@ -21,6 +21,25 @@ metricbeat.modules:
   metricsets: ["process"]
   processes: ['.*']
 ----
+
+*`process.include_top_n`*:: These options allow you to filter out all processes
+that are not in the top N by CPU or memory, in order to reduce the number of
+documents created. If both the `by_cpu` and `by_memory` options are used, the
+union of the two sets is included.
+
+*`process.include_top_n.enabled`*:: Set to false to disable the top N feature and
+include all processes, regardless of the other options. The default is `true`,
+but nothing is filtered unless one of the other options (`by_cpu` or `by_memory`)
+is set to a non-zero value.
+
+*`process.include_top_n.by_cpu`*::  How many processes to include from the top
+by CPU. The processes are sorted by the `system.process.cpu.total.pct` field.
+The default is 0.
+
+*`process.include_top_n.by_memory`*:: How many processes to include from the top
+by memory. The processes are sorted by the `system.process.memory.rss.bytes`
+field. The default is 0.
+
 *`process.cgroups.enabled`*:: When the `process` metricset is enabled, you can
 use this boolean configuration option to disable cgroup metrics. By default
 cgroup metrics collection is enabled.

diff --git a/metricbeat/module/system/process/helper.go b/metricbeat/module/system/process/helper.go
@@ -6,6 +6,7 @@ import (
 	"fmt"
 	"os"
 	"runtime"
+	"sort"
 	"strings"
 	"time"
 
@@ -20,19 +21,20 @@ import (
 type ProcsMap map[int]*Process
 
 type Process struct {
-	Pid      int    `json:"pid"`
-	Ppid     int    `json:"ppid"`
-	Pgid     int    `json:"pgid"`
-	Name     string `json:"name"`
-	Username string `json:"username"`
-	State    string `json:"state"`
-	CmdLine  string `json:"cmdline"`
-	Cwd      string `json:"cwd"`
-	Mem      sigar.ProcMem
-	Cpu      sigar.ProcTime
-	Ctime    time.Time
-	FD       sigar.ProcFDUsage
-	Env      common.MapStr
+	Pid         int    `json:"pid"`
+	Ppid        int    `json:"ppid"`
+	Pgid        int    `json:"pgid"`
+	Name        string `json:"name"`
+	Username    string `json:"username"`
+	State       string `json:"state"`
+	CmdLine     string `json:"cmdline"`
+	Cwd         string `json:"cwd"`
+	Mem         sigar.ProcMem
+	Cpu         sigar.ProcTime
+	Ctime       time.Time
+	FD          sigar.ProcFDUsage
+	Env         common.MapStr
+	cpuTotalPct float64
 }
 
 type ProcStats struct {
@@ -41,6 +43,7 @@ type ProcStats struct {
 	CpuTicks     bool
 	EnvWhitelist []string
 	CacheCmdLine bool
+	IncludeTop   includeTopConfig
 
 	procRegexps []match.Matcher // List of regular expressions used to whitelist processes.
 	envRegexps  []match.Matcher // List of regular expressions used to whitelist env vars.
@@ -216,7 +219,7 @@ func getProcState(b byte) string {
 	return "unknown"
 }
 
-func (procStats *ProcStats) GetProcessEvent(process *Process, last *Process) common.MapStr {
+func (procStats *ProcStats) getProcessEvent(process *Process) common.MapStr {
 	proc := common.MapStr{
 		"pid":      process.Pid,
 		"ppid":     process.Ppid,
@@ -248,7 +251,7 @@ func (procStats *ProcStats) GetProcessEvent(process *Process, last *Process) com
 
 	proc["cpu"] = common.MapStr{
 		"total": common.MapStr{
-			"pct": GetProcCpuPercentage(last, process),
+			"pct": process.cpuTotalPct,
 		},
 		"start_time": unixTimeMsToTime(process.Cpu.StartTime),
 	}
@@ -336,7 +339,7 @@ func (procStats *ProcStats) GetProcStats() ([]common.MapStr, error) {
 		return nil, err
 	}
 
-	processes := []common.MapStr{}
+	processes := []Process{}
 	newProcs := make(ProcsMap, len(pids))
 
 	for _, pid := range pids {
@@ -363,16 +366,64 @@ func (procStats *ProcStats) GetProcStats() ([]common.MapStr, error) {
 			}
 
 			newProcs[process.Pid] = process
-
 			last := procStats.ProcsMap[process.Pid]
-			proc := procStats.GetProcessEvent(process, last)
+			process.cpuTotalPct = GetProcCpuPercentage(last, process)
+			processes = append(processes, *process)
+		}
+	}
+	procStats.ProcsMap = newProcs
+
+	processes = procStats.includeTopProcesses(processes)
+	logp.Debug("processes", "Filtered top processes down to %d processes", len(processes))
+
+	procs := []common.MapStr{}
+	for _, process := range processes {
+		proc := procStats.getProcessEvent(&process)
+		procs = append(procs, proc)
+	}
+
+	return procs, nil
+}
 
-			processes = append(processes, proc)
+func (procStats *ProcStats) includeTopProcesses(processes []Process) []Process {
+
+	if !procStats.IncludeTop.Enabled ||
+		(procStats.IncludeTop.ByCPU == 0 && procStats.IncludeTop.ByMemory == 0) {
+
+		return processes
+	}
+
+	result := []Process{}
+	if procStats.IncludeTop.ByCPU > 0 {
+		sort.Slice(processes, func(i, j int) bool {
+			return processes[i].cpuTotalPct > processes[j].cpuTotalPct
+		})
+		result = append(result, processes[:procStats.IncludeTop.ByCPU]...)
+	}
+
+	if procStats.IncludeTop.ByMemory > 0 {
+		sort.Slice(processes, func(i, j int) bool {
+			return processes[i].Mem.Resident > processes[j].Mem.Resident
+		})
+		for _, proc := range processes[:procStats.IncludeTop.ByMemory] {
+			if !isProcessInSlice(result, &proc) {
+				result = append(result, proc)
+			}
 		}
 	}
 
-	procStats.ProcsMap = newProcs
-	return processes, nil
+	return result
+}
+
+// isProcessInSlice reports whether processes contains an entry with the same
+// PID as proc.
+func isProcessInSlice(processes []Process, proc *Process) bool {
+	for _, p := range processes {
+		if p.Pid == proc.Pid {
+			return true
+		}
+	}
+	return false
 }
 
 // isWhitelistedEnvVar returns true if the given variable name is a match for