From 4b1953d949e57ce33b12176d5f9b7c64baa83a0e Mon Sep 17 00:00:00 2001 From: Pierre Fersing Date: Mon, 16 Oct 2017 11:55:19 +0200 Subject: [PATCH] Fix CPU system plugin that get stuck after suspend Signed-off-by: Pierre Fersing --- CHANGELOG.md | 1 + plugins/inputs/system/cpu.go | 5 ++- plugins/inputs/system/cpu_test.go | 69 +++++++++++++++++++++++++++++++ 3 files changed, 73 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index efaeb4476a7dc..444e23a72abb5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -49,6 +49,7 @@ - [#3136](https://github.com/influxdata/telegraf/issues/3136): Fix webhooks input address in use during reload. - [#3258](https://github.com/influxdata/telegraf/issues/3258): Unlock Statsd when stopping to prevent deadlock. - [#3319](https://github.com/influxdata/telegraf/issues/3319): Fix cloudwatch output requires unneeded permissions. +- [#3342](https://github.com/influxdata/telegraf/pull/3342): Fix CPU input plugin stuck after suspend on Linux. ## v1.4.3 [unreleased] diff --git a/plugins/inputs/system/cpu.go b/plugins/inputs/system/cpu.go index 55378c93e14a3..99fa451b37e3a 100644 --- a/plugins/inputs/system/cpu.go +++ b/plugins/inputs/system/cpu.go @@ -96,7 +96,8 @@ func (s *CPUStats) Gather(acc telegraf.Accumulator) error { totalDelta := total - lastTotal if totalDelta < 0 { - return fmt.Errorf("Error: current total CPU time is less than previous total CPU time") + err = fmt.Errorf("Error: current total CPU time is less than previous total CPU time") + break } if totalDelta == 0 { @@ -126,7 +127,7 @@ func (s *CPUStats) Gather(acc telegraf.Accumulator) error { s.lastStats[cts.CPU] = cts } - return nil + return err } func totalCpuTime(t cpu.TimesStat) float64 { diff --git a/plugins/inputs/system/cpu_test.go b/plugins/inputs/system/cpu_test.go index fabff8a7d47cb..b6c0c9ef1e61b 100644 --- a/plugins/inputs/system/cpu_test.go +++ b/plugins/inputs/system/cpu_test.go @@ -184,3 +184,72 @@ func TestCPUCountIncrease(t *testing.T) { 
err = cs.Gather(&acc) require.NoError(t, err) } + +// TestCPUTimesDecrease tests that telegraf continues to work after +// CPU times decrease, which seems to occur when a Linux system is suspended. +func TestCPUTimesDecrease(t *testing.T) { + var mps MockPS + defer mps.AssertExpectations(t) + var acc testutil.Accumulator + + cts := cpu.TimesStat{ + CPU: "cpu0", + User: 18, + Idle: 80, + Iowait: 2, + } + + cts2 := cpu.TimesStat{ + CPU: "cpu0", + User: 38, // increased by 20 + Idle: 40, // decreased by 40 + Iowait: 1, // decreased by 1 + } + + cts3 := cpu.TimesStat{ + CPU: "cpu0", + User: 56, // increased by 18 + Idle: 120, // increased by 80 + Iowait: 3, // increased by 2 + } + + mps.On("CPUTimes").Return([]cpu.TimesStat{cts}, nil) + + cs := NewCPUStats(&mps) + + cputags := map[string]string{ + "cpu": "cpu0", + } + + err := cs.Gather(&acc) + require.NoError(t, err) + + // Computed values are checked with delta > 0 because of floating point arithmetic + // imprecision + assertContainsTaggedFloat(t, &acc, "cpu", "time_user", 18, 0, cputags) + assertContainsTaggedFloat(t, &acc, "cpu", "time_idle", 80, 0, cputags) + assertContainsTaggedFloat(t, &acc, "cpu", "time_iowait", 2, 0, cputags) + + mps2 := MockPS{} + mps2.On("CPUTimes").Return([]cpu.TimesStat{cts2}, nil) + cs.ps = &mps2 + + // CPU times decreased. 
An error should be raised + err = cs.Gather(&acc) + require.Error(t, err) + + mps3 := MockPS{} + mps3.On("CPUTimes").Return([]cpu.TimesStat{cts3}, nil) + cs.ps = &mps3 + + err = cs.Gather(&acc) + require.NoError(t, err) + + assertContainsTaggedFloat(t, &acc, "cpu", "time_user", 56, 0, cputags) + assertContainsTaggedFloat(t, &acc, "cpu", "time_idle", 120, 0, cputags) + assertContainsTaggedFloat(t, &acc, "cpu", "time_iowait", 3, 0, cputags) + + assertContainsTaggedFloat(t, &acc, "cpu", "usage_user", 18, 0.0005, cputags) + assertContainsTaggedFloat(t, &acc, "cpu", "usage_idle", 80, 0.0005, cputags) + assertContainsTaggedFloat(t, &acc, "cpu", "usage_iowait", 2, 0.0005, cputags) +}