Skip to content

Commit

Permalink
[cpu/windows] Switch to performance counters (#192)
Browse files Browse the repository at this point in the history
Add conditional switching to performance counters for `cpu`/`core`
metricset collections
This PR add an option `WithPerformanceCounter`, which uses performance
counter for CPU data collection, when enabled.
Default behaviour will be the existing implementation.

### How to use performance counters?

It's simple. Something like following:
```go
cpu, _ := metrics.New(WithPerformanceCounter())
data, err := cpu.Fetch()
```

### Why performance counters?
From elastic/beats#40926,
> On Windows, Metricbeat measures CPU use via the Windows API call
GetSystemTimes. Each metrics interval, it fetches the CPU numbers, and
compares them to the previous measurement to determine CPU load during
that interval. On most systems this includes CPU time ["including all
threads in all processes, on all
processors"](https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-getsystemtimes).
However, on systems with more than 64 cores, it returns only the data
for the [current processor
group](https://learn.microsoft.com/en-us/windows/win32/procthread/processor-groups)
of up to 64 cores.

This is a major limitation for such systems and it leads to inconsistent
data.

### Testing

Test results are reported on the linked issue.

Fixes elastic/beats#40926

### **Note:**
`WithPerformanceCounter` option is only effective for windows and is
ineffective if used by other OS'.
  • Loading branch information
VihasMakwana authored Dec 10, 2024
1 parent 1ac0416 commit 80aacba
Show file tree
Hide file tree
Showing 11 changed files with 142 additions and 23 deletions.
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ go 1.22.8

require (
github.com/docker/docker v26.1.5+incompatible
github.com/elastic/elastic-agent-libs v0.17.4-0.20241126154321-6ed75416832d
github.com/elastic/elastic-agent-libs v0.17.4
github.com/elastic/go-licenser v0.4.2
github.com/elastic/go-structform v0.0.9
github.com/elastic/go-sysinfo v1.14.1
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ github.com/docker/go-connections v0.4.0 h1:El9xVISelRB7BuFusrZozjnkIM5YnzCViNKoh
github.com/docker/go-connections v0.4.0/go.mod h1:Gbd7IOopHjR8Iph03tsViu4nIes5XhDvyHbTtUxmeec=
github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4=
github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk=
github.com/elastic/elastic-agent-libs v0.17.4-0.20241126154321-6ed75416832d h1:nY8LSeTYU1uSDAAg7WwGH/cALgdovAXLdIzV25Ky0Bo=
github.com/elastic/elastic-agent-libs v0.17.4-0.20241126154321-6ed75416832d/go.mod h1:5CR02awPrBr+tfmjBBK+JI+dMmHNQjpVY24J0wjbC7M=
github.com/elastic/elastic-agent-libs v0.17.4 h1:kWK5Kn2EQjM97yHqbeXv+cFAIti4IiI9Qj8huM+lZzE=
github.com/elastic/elastic-agent-libs v0.17.4/go.mod h1:5CR02awPrBr+tfmjBBK+JI+dMmHNQjpVY24J0wjbC7M=
github.com/elastic/go-licenser v0.4.2 h1:bPbGm8bUd8rxzSswFOqvQh1dAkKGkgAmrPxbUi+Y9+A=
github.com/elastic/go-licenser v0.4.2/go.mod h1:W8eH6FaZDR8fQGm+7FnVa7MxI1b/6dAqxz+zPB8nm5c=
github.com/elastic/go-structform v0.0.9 h1:HpcS7xljL4kSyUfDJ8cXTJC6rU5ChL1wYb6cx3HLD+o=
Expand Down
9 changes: 4 additions & 5 deletions metric/cpu/cpu.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,8 +84,8 @@ type option struct {

type OptionFunc func(*option)

// Note: WithPerformanceCounter option is only effective for windows and is ineffective if used by other OS'.
func WithPerformanceCounter() OptionFunc {
// Note: WithWindowsPerformanceCounter option is only effective for windows and is ineffective if used by other OS'.
func WithWindowsPerformanceCounter() OptionFunc {
return func(o *option) {
o.usePerformanceCounter = true
}
Expand All @@ -94,7 +94,7 @@ func WithPerformanceCounter() OptionFunc {
// Fetch collects a new sample of the CPU usage metrics.
// This will overwrite the currently stored samples.
func (m *Monitor) Fetch() (Metrics, error) {
metric, err := Get(m.Hostfs)
metric, err := Get(m)
if err != nil {
return Metrics{}, fmt.Errorf("error fetching CPU metrics: %w", err)
}
Expand All @@ -108,8 +108,7 @@ func (m *Monitor) Fetch() (Metrics, error) {
// FetchCores collects a new sample of CPU usage metrics per-core
// This will overwrite the currently stored samples.
func (m *Monitor) FetchCores() ([]Metrics, error) {

metric, err := Get(m.Hostfs)
metric, err := Get(m)
if err != nil {
return nil, fmt.Errorf("error fetching CPU metrics: %w", err)
}
Expand Down
10 changes: 9 additions & 1 deletion metric/cpu/cpu_windows.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,14 @@ func buildQuery() (*pdh.Query, error) {
if err := q.Open(); err != nil {
return nil, fmt.Errorf("failed to open query: %w", err)
}
// TODO: implement performance counters as a follow up
if err := q.AddCounter(totalKernelTimeCounter, "", "", true, true); err != nil {
return nil, fmt.Errorf("error calling AddCounter for kernel counter: %w", err)
}
if err := q.AddCounter(totalUserTimeCounter, "", "", true, true); err != nil {
return nil, fmt.Errorf("error calling AddCounter for user counter: %w", err)
}
if err := q.AddCounter(totalIdleTimeCounter, "", "", true, true); err != nil {
return nil, fmt.Errorf("error calling AddCounter for idle counter: %w", err)
}
return &q, nil
}
54 changes: 54 additions & 0 deletions metric/cpu/metric_windows_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
// Licensed to Elasticsearch B.V. under one or more contributor
// license agreements. See the NOTICE file distributed with
// this work for additional information regarding copyright
// ownership. Elasticsearch B.V. licenses this file to you under
// the Apache License, Version 2.0 (the "License"); you may
// not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

//go:build windows

package cpu

import (
"testing"

"github.com/stretchr/testify/require"

"github.com/elastic/elastic-agent-system-metrics/dev-tools/systemtests"
)

func TestCounterLength(t *testing.T) {
monitor, err := New(systemtests.DockerTestResolver())
require.NoError(t, err)
require.NoError(t, monitor.query.CollectData())

query := monitor.query
kernelRawData, err := query.GetRawCounterArray(totalKernelTimeCounter, true)
require.NoError(t, err)

idleRawData, err := query.GetRawCounterArray(totalIdleTimeCounter, true)
require.NoError(t, err)

userRawData, err := query.GetRawCounterArray(totalUserTimeCounter, true)
require.NoError(t, err)

require.Equal(t, len(kernelRawData), len(idleRawData))
require.Equal(t, len(userRawData), len(idleRawData))

for i := 0; i < len(userRawData); i++ {
require.Equal(t, userRawData[i].InstanceName, kernelRawData[i].InstanceName, "InstanceName should be equal")
}
for i := 0; i < len(kernelRawData); i++ {
require.Equal(t, kernelRawData[i].InstanceName, idleRawData[i].InstanceName, "InstanceName should be equal")
}
}
3 changes: 1 addition & 2 deletions metric/cpu/metrics_aix.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@ import (
"os"

"github.com/elastic/elastic-agent-libs/opt"
"github.com/elastic/elastic-agent-system-metrics/metric/system/resolve"
)

func init() {
Expand All @@ -58,7 +57,7 @@ func tick2msec(val uint64) uint64 {
}

// Get returns a metrics object for CPU data
func Get(_ resolve.Resolver) (CPUMetrics, error) {
func Get(m *Monitor) (CPUMetrics, error) {

totals, err := getCPUTotals()
if err != nil {
Expand Down
3 changes: 1 addition & 2 deletions metric/cpu/metrics_darwin.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,10 @@ import (
"github.com/shirou/gopsutil/v4/cpu"

"github.com/elastic/elastic-agent-libs/opt"
"github.com/elastic/elastic-agent-system-metrics/metric/system/resolve"
)

// Get is the Darwin implementation of Get
func Get(_ resolve.Resolver) (CPUMetrics, error) {
func Get(m *Monitor) (CPUMetrics, error) {
// We're using the gopsutil library here.
// The code used by both gosigar and go-sysinfo appears to be
// the same code as gopsutil, including copy-pasted comments.
Expand Down
3 changes: 1 addition & 2 deletions metric/cpu/metrics_openbsd.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,10 @@ import (
"unsafe"

"github.com/elastic/elastic-agent-libs/opt"
"github.com/elastic/elastic-agent-system-metrics/metric/system/resolve"
)

// Get is the OpenBSD implementation of get
func Get(_ resolve.Resolver) (CPUMetrics, error) {
func Get(m *Monitor) (CPUMetrics, error) {

// see man 2 sysctl
loadGlobal := [C.CPUSTATES]C.long{
Expand Down
6 changes: 3 additions & 3 deletions metric/cpu/metrics_procfs_common.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,12 @@ import (
"os"
"strconv"
"strings"

"github.com/elastic/elastic-agent-system-metrics/metric/system/resolve"
)

// Get returns a metrics object for CPU data
func Get(procfs resolve.Resolver) (CPUMetrics, error) {
func Get(m *Monitor) (CPUMetrics, error) {
procfs := m.Hostfs

path := procfs.ResolveHostFS("/proc/stat")
fd, err := os.Open(path)
defer func() {
Expand Down
6 changes: 3 additions & 3 deletions metric/cpu/metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,12 +43,12 @@ func TestMonitorSample(t *testing.T) {
}

func TestCoresMonitorSample(t *testing.T) {
cores, err := New(systemtests.DockerTestResolver())
require.NoError(t, err)

cpuMetrics, err := Get(systemtests.DockerTestResolver())
cpuMetrics, err := Get(cores)
assert.NoError(t, err, "error in Get()")

cores, err := New(systemtests.DockerTestResolver())
require.NoError(t, err)
cores.lastSample = CPUMetrics{list: make([]CPU, len(cpuMetrics.list))}
sample, err := cores.FetchCores()
require.NoError(t, err)
Expand Down
65 changes: 63 additions & 2 deletions metric/cpu/metrics_windows.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,27 @@ import (
"fmt"
"time"

"github.com/elastic/elastic-agent-libs/helpers/windows/pdh"
"github.com/elastic/elastic-agent-libs/opt"
"github.com/elastic/elastic-agent-system-metrics/metric/system/resolve"
"github.com/elastic/gosigar/sys/windows"
)

var (
processorInformationCounter = "\\Processor Information(%s)\\%s"
totalKernelTimeCounter = fmt.Sprintf(processorInformationCounter, "*", "% Privileged Time")
totalIdleTimeCounter = fmt.Sprintf(processorInformationCounter, "*", "% Idle Time")
totalUserTimeCounter = fmt.Sprintf(processorInformationCounter, "*", "% User Time")
)

// Get fetches Windows CPU system times
func Get(_ resolve.Resolver) (CPUMetrics, error) {
func Get(m *Monitor) (CPUMetrics, error) {
if m.query == nil {
return getUsingSystemTimes()
}
return getUsingPerfCounters(m.query)
}

func getUsingSystemTimes() (CPUMetrics, error) {
idle, kernel, user, err := windows.GetSystemTimes()
if err != nil {
return CPUMetrics{}, fmt.Errorf("call to GetSystemTimes failed: %w", err)
Expand Down Expand Up @@ -66,3 +80,50 @@ func Get(_ resolve.Resolver) (CPUMetrics, error) {

return globalMetrics, nil
}

func getUsingPerfCounters(query *pdh.Query) (CPUMetrics, error) {
globalMetrics := CPUMetrics{}

if err := query.CollectData(); err != nil {
return globalMetrics, err
}

kernelRawData, err := query.GetRawCounterArray(totalKernelTimeCounter, true)
if err != nil {
return globalMetrics, fmt.Errorf("error calling GetRawCounterArray for kernel counter: %w", err)
}
idleRawData, err := query.GetRawCounterArray(totalIdleTimeCounter, true)
if err != nil {
return globalMetrics, fmt.Errorf("error calling GetRawCounterArray for idle counter: %w", err)
}
userRawData, err := query.GetRawCounterArray(totalUserTimeCounter, true)
if err != nil {
return globalMetrics, fmt.Errorf("error calling GetRawCounterArray for user counter: %w", err)
}
var idle, kernel, user time.Duration
globalMetrics.list = make([]CPU, len(userRawData))
for i := 0; i < len(globalMetrics.list); i++ {
// The values returned by GetRawCounterArray are of equal length and are sorted by instance names.
// For CPU core {i}, idleRawData[i], kernelRawData[i], and userRawData[i] correspond to the idle time, kernel time, and user time, respectively.

// values returned by counter are in 100-ns intervals. Hence, convert it to millisecond.
idleTime := time.Duration(idleRawData[i].RawValue.FirstValue*100) / time.Millisecond
kernelTime := time.Duration(kernelRawData[i].RawValue.FirstValue*100) / time.Millisecond
userTime := time.Duration(userRawData[i].RawValue.FirstValue*100) / time.Millisecond

globalMetrics.list[i].Idle = opt.UintWith(uint64(idleTime))
globalMetrics.list[i].Sys = opt.UintWith(uint64(kernelTime))
globalMetrics.list[i].User = opt.UintWith(uint64(userTime))

// add the per-cpu time to track the total time spent by system
idle += idleTime
kernel += kernelTime
user += userTime
}

globalMetrics.totals.Idle = opt.UintWith(uint64(idle))
globalMetrics.totals.Sys = opt.UintWith(uint64(kernel))
globalMetrics.totals.User = opt.UintWith(uint64(user))

return globalMetrics, nil
}

0 comments on commit 80aacba

Please sign in to comment.