From 4e8128dfe049ed08abb9f880fa64389dea073453 Mon Sep 17 00:00:00 2001 From: Nicolas Ruflin Date: Tue, 21 Feb 2017 10:40:48 +0100 Subject: [PATCH] Fix docker hanging when container killed (#3612) No timeout was passed to the docker client. It seems in case of a killed container it can happen that the connection is hanging. To interrupt this connection, the timeout from the metricset is passed to the client. That means in case info for a container cannot be fetched, it will timeout. This change requires that the docker module is not run with a timeout of 3s seconds, which indirectly means a period of 3s. The reason is that already the http request waits ~2s for the response. So if 1s is set as timeout, all requests will timeout. Further changes: * Containers without names will be ignored, as these are containers for which the data could not be fetched. * Period was set to 1s by default instead of the period as document. This was changed. * Add documentation node about minimal period. Closes https://github.com/elastic/beats/issues/3610 The issue with this PR was introduce in 5.2.1 by fixing the memory leak. Before go routines just piled up, but now they caused filebeat to hang. This needs also backport to 5.2.2 (cherry picked from commit 99f17d66d79ccb772ed376f255bbdf79be320fac) --- CHANGELOG.asciidoc | 4 ++++ metricbeat/docs/modules/system.asciidoc | 3 +++ metricbeat/mb/builders.go | 5 +++++ metricbeat/mb/mb.go | 1 - metricbeat/mb/mb_test.go | 6 +++--- metricbeat/module/docker/cpu/cpu.go | 2 +- metricbeat/module/docker/diskio/diskio.go | 2 +- metricbeat/module/docker/docker.go | 21 ++++++++++++-------- metricbeat/module/docker/memory/memory.go | 2 +- metricbeat/module/docker/network/network.go | 2 +- metricbeat/module/system/_meta/docs.asciidoc | 3 +++ metricbeat/tests/system/test_docker.py | 13 ++++++------ 12 files changed, 42 insertions(+), 22 deletions(-) diff --git a/CHANGELOG.asciidoc b/CHANGELOG.asciidoc index 42486e80a82..4f755970603 100644 --- a/CHANGELOG.asciidoc +++ b/CHANGELOG.asciidoc @@ -34,6 +34,10 @@ https://github.com/elastic/beats/compare/v5.2.0...master[Check the HEAD diff] *Metricbeat* +- Fix go routine leak in docker module. {pull}3492[3492] +- Fix bug docker module hanging when docker container killed. {issue}3610[3610] +- Set timeout to period instead of 1s by default as documented. + *Packetbeat* *Winlogbeat* diff --git a/metricbeat/docs/modules/system.asciidoc b/metricbeat/docs/modules/system.asciidoc index 97116990035..d1faf62c95e 100644 --- a/metricbeat/docs/modules/system.asciidoc +++ b/metricbeat/docs/modules/system.asciidoc @@ -48,6 +48,9 @@ metricbeat.modules: cpu_ticks: true ---- +It is strongly recommended to not run docker metricsets with a period smaller then 3 seconds. The request to the docker +API already takes up to 2s seconds. Otherwise all the requests would timeout and no data is reported. + [float] === Dashboard diff --git a/metricbeat/mb/builders.go b/metricbeat/mb/builders.go index 921ae5eec3d..90213315f3d 100644 --- a/metricbeat/mb/builders.go +++ b/metricbeat/mb/builders.go @@ -89,6 +89,11 @@ func newBaseModuleFromConfig(rawConfig *common.Config) (BaseModule, error) { return baseModule, err } + // If timeout is not set, timeout is set to the same value as period + if baseModule.config.Timeout == 0 { + baseModule.config.Timeout = baseModule.config.Period + } + baseModule.name = strings.ToLower(baseModule.config.Module) err = mustNotContainDuplicates(baseModule.config.Hosts) diff --git a/metricbeat/mb/mb.go b/metricbeat/mb/mb.go index 4b1c36f3b07..ce6c4abf9e9 100644 --- a/metricbeat/mb/mb.go +++ b/metricbeat/mb/mb.go @@ -183,7 +183,6 @@ func (c ModuleConfig) GoString() string { return c.String() } var defaultModuleConfig = ModuleConfig{ Enabled: true, Period: time.Second * 10, - Timeout: time.Second, } // DefaultModuleConfig returns a ModuleConfig with the default values populated. diff --git a/metricbeat/mb/mb_test.go b/metricbeat/mb/mb_test.go index a0ba3120c6c..cd605d0785a 100644 --- a/metricbeat/mb/mb_test.go +++ b/metricbeat/mb/mb_test.go @@ -61,7 +61,7 @@ func TestModuleConfig(t *testing.T) { MetricSets: []string{"test"}, Enabled: true, Period: time.Second * 10, - Timeout: time.Second, + Timeout: 0, }, }, { @@ -124,7 +124,7 @@ func TestModuleConfigDefaults(t *testing.T) { assert.Equal(t, true, mc.Enabled) assert.Equal(t, time.Second*10, mc.Period) - assert.Equal(t, time.Second, mc.Timeout) + assert.Equal(t, time.Second*0, mc.Timeout) assert.Empty(t, mc.Hosts) } @@ -249,7 +249,7 @@ func TestNewBaseModuleFromModuleConfigStruct(t *testing.T) { assert.Equal(t, moduleName, baseModule.Config().Module) assert.Equal(t, true, baseModule.Config().Enabled) assert.Equal(t, time.Second*10, baseModule.Config().Period) - assert.Equal(t, time.Second, baseModule.Config().Timeout) + assert.Equal(t, time.Second*10, baseModule.Config().Timeout) assert.Empty(t, baseModule.Config().Hosts) } diff --git a/metricbeat/module/docker/cpu/cpu.go b/metricbeat/module/docker/cpu/cpu.go index b945c039117..333ff88d385 100644 --- a/metricbeat/module/docker/cpu/cpu.go +++ b/metricbeat/module/docker/cpu/cpu.go @@ -44,7 +44,7 @@ func New(base mb.BaseMetricSet) (mb.MetricSet, error) { // Fetch returns a list of docker CPU stats. func (m *MetricSet) Fetch() ([]common.MapStr, error) { - stats, err := docker.FetchStats(m.dockerClient) + stats, err := docker.FetchStats(m.dockerClient, m.Module().Config().Timeout) if err != nil { return nil, err } diff --git a/metricbeat/module/docker/diskio/diskio.go b/metricbeat/module/docker/diskio/diskio.go index a0e36b5dd96..81b9c86e19b 100644 --- a/metricbeat/module/docker/diskio/diskio.go +++ b/metricbeat/module/docker/diskio/diskio.go @@ -46,7 +46,7 @@ func New(base mb.BaseMetricSet) (mb.MetricSet, error) { // Fetch creates list of events with diskio stats for all containers. func (m *MetricSet) Fetch() ([]common.MapStr, error) { - stats, err := docker.FetchStats(m.dockerClient) + stats, err := docker.FetchStats(m.dockerClient, m.Module().Config().Timeout) if err != nil { return nil, err } diff --git a/metricbeat/module/docker/docker.go b/metricbeat/module/docker/docker.go index 79aa42988da..9b2b5a1c328 100644 --- a/metricbeat/module/docker/docker.go +++ b/metricbeat/module/docker/docker.go @@ -7,6 +7,8 @@ import ( "github.com/elastic/beats/metricbeat/mb" "github.com/elastic/beats/metricbeat/mb/parse" + "time" + "github.com/fsouza/go-dockerclient" ) @@ -58,7 +60,7 @@ func NewDockerClient(endpoint string, config Config) (*docker.Client, error) { } // FetchStats returns a list of running containers with all related stats inside -func FetchStats(client *docker.Client) ([]Stat, error) { +func FetchStats(client *docker.Client, timeout time.Duration) ([]Stat, error) { containers, err := client.ListContainers(docker.ListContainersOptions{}) if err != nil { return nil, err @@ -67,24 +69,27 @@ func FetchStats(client *docker.Client) ([]Stat, error) { var wg sync.WaitGroup containersList := make([]Stat, 0, len(containers)) - queue := make(chan Stat, 1) + statsQueue := make(chan Stat, 1) wg.Add(len(containers)) for _, container := range containers { go func(container docker.APIContainers) { defer wg.Done() - queue <- exportContainerStats(client, &container) + statsQueue <- exportContainerStats(client, &container, timeout) }(container) } go func() { wg.Wait() - close(queue) + close(statsQueue) }() // This will break after the queue has been drained and queue is closed. - for container := range queue { - containersList = append(containersList, container) + for stat := range statsQueue { + // If names is empty, there is not data inside + if len(stat.Container.Names) != 0 { + containersList = append(containersList, stat) + } } return containersList, err @@ -95,7 +100,7 @@ func FetchStats(client *docker.Client) ([]Stat, error) { // This is currently very inefficient as docker calculates the average for each request, // means each request will take at least 2s: https://github.com/docker/docker/blob/master/cli/command/container/stats_helpers.go#L148 // Getting all stats at once is implemented here: https://github.com/docker/docker/pull/25361 -func exportContainerStats(client *docker.Client, container *docker.APIContainers) Stat { +func exportContainerStats(client *docker.Client, container *docker.APIContainers, timeout time.Duration) Stat { var wg sync.WaitGroup var event Stat @@ -105,7 +110,7 @@ func exportContainerStats(client *docker.Client, container *docker.APIContainers ID: container.ID, Stats: statsC, Stream: false, - Timeout: -1, + Timeout: timeout, } wg.Add(2) diff --git a/metricbeat/module/docker/memory/memory.go b/metricbeat/module/docker/memory/memory.go index 511828825c2..85a3f7b8f9c 100644 --- a/metricbeat/module/docker/memory/memory.go +++ b/metricbeat/module/docker/memory/memory.go @@ -44,7 +44,7 @@ func New(base mb.BaseMetricSet) (mb.MetricSet, error) { // Fetch creates a list of memory events for each container. func (m *MetricSet) Fetch() ([]common.MapStr, error) { - stats, err := docker.FetchStats(m.dockerClient) + stats, err := docker.FetchStats(m.dockerClient, m.Module().Config().Timeout) if err != nil { return nil, err } diff --git a/metricbeat/module/docker/network/network.go b/metricbeat/module/docker/network/network.go index 8c8aa13d8b5..2597d8abe4a 100644 --- a/metricbeat/module/docker/network/network.go +++ b/metricbeat/module/docker/network/network.go @@ -46,7 +46,7 @@ func New(base mb.BaseMetricSet) (mb.MetricSet, error) { // Fetch methods creates a list of network events for each container. func (m *MetricSet) Fetch() ([]common.MapStr, error) { - stats, err := docker.FetchStats(m.dockerClient) + stats, err := docker.FetchStats(m.dockerClient, m.Module().Config().Timeout) if err != nil { return nil, err } diff --git a/metricbeat/module/system/_meta/docs.asciidoc b/metricbeat/module/system/_meta/docs.asciidoc index 037adcdaf5e..bcf32a6c379 100644 --- a/metricbeat/module/system/_meta/docs.asciidoc +++ b/metricbeat/module/system/_meta/docs.asciidoc @@ -43,6 +43,9 @@ metricbeat.modules: cpu_ticks: true ---- +It is strongly recommended to not run docker metricsets with a period smaller then 3 seconds. The request to the docker +API already takes up to 2s seconds. Otherwise all the requests would timeout and no data is reported. + [float] === Dashboard diff --git a/metricbeat/tests/system/test_docker.py b/metricbeat/tests/system/test_docker.py index 9be39ad51e6..69019b0ae52 100644 --- a/metricbeat/tests/system/test_docker.py +++ b/metricbeat/tests/system/test_docker.py @@ -14,7 +14,7 @@ def test_container_fields(self): "name": "docker", "metricsets": ["container"], "hosts": ["unix:///var/run/docker.sock"], - "period": "1s", + "period": "10s", }]) proc = self.start_beat() @@ -40,7 +40,7 @@ def test_cpu_fields(self): "name": "docker", "metricsets": ["cpu"], "hosts": ["unix:///var/run/docker.sock"], - "period": "1s" + "period": "10s" }]) proc = self.start_beat() @@ -70,7 +70,7 @@ def test_diskio_fields(self): "name": "docker", "metricsets": ["diskio"], "hosts": ["unix:///var/run/docker.sock"], - "period": "1s" + "period": "10s" }]) proc = self.start_beat() @@ -97,7 +97,7 @@ def test_info_fields(self): "name": "docker", "metricsets": ["info"], "hosts": ["unix:///var/run/docker.sock"], - "period": "1s" + "period": "10s" }]) proc = self.start_beat() @@ -122,7 +122,7 @@ def test_memory_fields(self): "name": "docker", "metricsets": ["memory"], "hosts": ["unix:///var/run/docker.sock"], - "period": "1s" + "period": "10s" }]) proc = self.start_beat() @@ -148,7 +148,7 @@ def test_network_fields(self): "name": "docker", "metricsets": ["network"], "hosts": ["unix:///var/run/docker.sock"], - "period": "1s" + "period": "10s" }]) proc = self.start_beat() @@ -157,6 +157,7 @@ def test_network_fields(self): # Ensure no errors or warnings exist in the log. log = self.get_log() + self.assertNotRegexpMatches(log.replace("WARN EXPERIMENTAL", ""), "ERR|WARN") output = self.read_output_json()