Cherry-pick #8090 to 6.4: Improve monitoring reporter #8144

Merged 1 commit into 6.4 on Aug 30, 2018
1 change: 1 addition & 0 deletions CHANGELOG.asciidoc
@@ -40,6 +40,7 @@ https://github.com/elastic/beats/compare/v6.4.0...6.4[Check the HEAD diff]
- Remove unix-like permission checks on Windows, so files can be opened. {issue}7849[7849]
- Deregister pipeline loader callback when inputsRunner is stopped. {pull}7893[7893]
- Replace index patterns in TSVB visualizations. {pull}7929[7929]
- Add backoff support to x-pack monitoring outputs. {issue}7966[7966]

*Auditbeat*

11 changes: 11 additions & 0 deletions auditbeat/auditbeat.reference.yml
@@ -1143,6 +1143,17 @@ logging.files:
# The default is 50.
#bulk_max_size: 50

# The number of seconds to wait before trying to reconnect to Elasticsearch
# after a network error. After waiting backoff.init seconds, the Beat
# tries to reconnect. If the attempt fails, the backoff timer is increased
# exponentially up to backoff.max. After a successful connection, the backoff
# timer is reset. The default is 1s.
#backoff.init: 1s

# The maximum number of seconds to wait before attempting to connect to
# Elasticsearch after a network error. The default is 60s.
#backoff.max: 60s

# Configure HTTP request timeout before failing a request to Elasticsearch.
#timeout: 90

11 changes: 11 additions & 0 deletions filebeat/filebeat.reference.yml
@@ -1803,6 +1803,17 @@ logging.files:
# The default is 50.
#bulk_max_size: 50

# The number of seconds to wait before trying to reconnect to Elasticsearch
# after a network error. After waiting backoff.init seconds, the Beat
# tries to reconnect. If the attempt fails, the backoff timer is increased
# exponentially up to backoff.max. After a successful connection, the backoff
# timer is reset. The default is 1s.
#backoff.init: 1s

# The maximum number of seconds to wait before attempting to connect to
# Elasticsearch after a network error. The default is 60s.
#backoff.max: 60s

# Configure HTTP request timeout before failing a request to Elasticsearch.
#timeout: 90

11 changes: 11 additions & 0 deletions heartbeat/heartbeat.reference.yml
@@ -1250,6 +1250,17 @@ logging.files:
# The default is 50.
#bulk_max_size: 50

# The number of seconds to wait before trying to reconnect to Elasticsearch
# after a network error. After waiting backoff.init seconds, the Beat
# tries to reconnect. If the attempt fails, the backoff timer is increased
# exponentially up to backoff.max. After a successful connection, the backoff
# timer is reset. The default is 1s.
#backoff.init: 1s

# The maximum number of seconds to wait before attempting to connect to
# Elasticsearch after a network error. The default is 60s.
#backoff.max: 60s

# Configure HTTP request timeout before failing a request to Elasticsearch.
#timeout: 90

11 changes: 11 additions & 0 deletions libbeat/_meta/config.reference.yml
@@ -1036,6 +1036,17 @@ logging.files:
# The default is 50.
#bulk_max_size: 50

# The number of seconds to wait before trying to reconnect to Elasticsearch
# after a network error. After waiting backoff.init seconds, the Beat
# tries to reconnect. If the attempt fails, the backoff timer is increased
# exponentially up to backoff.max. After a successful connection, the backoff
# timer is reset. The default is 1s.
#backoff.init: 1s

# The maximum number of seconds to wait before attempting to connect to
# Elasticsearch after a network error. The default is 60s.
#backoff.max: 60s

# Configure HTTP request timeout before failing a request to Elasticsearch.
#timeout: 90

24 changes: 23 additions & 1 deletion libbeat/docs/monitoring/shared-monitor-config.asciidoc
@@ -39,6 +39,21 @@ configuration option contains the following fields:
The maximum number of metrics to bulk in a single {es} bulk API index request.
The default is `50`. For more information, see <<elasticsearch-output>>.

[float]
==== `backoff.init`

The number of seconds to wait before trying to reconnect to Elasticsearch after
a network error. After waiting `backoff.init` seconds, {beatname_uc} tries to
reconnect. If the attempt fails, the backoff timer is increased exponentially up
to `backoff.max`. After a successful connection, the backoff timer is reset. The
default is 1s.

[float]
==== `backoff.max`

The maximum number of seconds to wait before attempting to connect to
Elasticsearch after a network error. The default is 60s.
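
For illustration, a minimal sketch of how these settings could be tuned in a Beat's configuration file. The `xpack.monitoring.elasticsearch` namespace is assumed from the reference configs in this change; the host and duration values are placeholders, not recommendations:

```yaml
# Illustrative values only; the defaults are backoff.init: 1s and backoff.max: 60s.
xpack.monitoring.elasticsearch:
  hosts: ["https://monitoring.example.com:9200"]
  backoff.init: 2s   # wait 2s after the first network error
  backoff.max: 120s  # cap the exponentially growing wait at 2 minutes
```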

[float]
==== `compression_level`

@@ -79,10 +94,17 @@ The password that {beatname_uc} uses to authenticate with the {es} instances for
shipping monitoring data.

[float]
==== `period`
==== `metrics.period`

The time interval (in seconds) when metrics are sent to the {es} cluster. A new
snapshot of {beatname_uc} metrics is generated and scheduled for publishing each
period. The default value is 10 * time.Second.

[float]
==== `state.period`

The time interval (in seconds) when state information is sent to the {es} cluster. A new
snapshot of {beatname_uc} state is generated and scheduled for publishing each
period. The default value is 60 * time.Second.
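
As a quick sketch, both reporting intervals can be overridden under the same (assumed) namespace; the values below are arbitrary examples, not defaults:

```yaml
# Illustrative values; the defaults are metrics.period: 10s and state.period: 60s.
xpack.monitoring.elasticsearch:
  metrics.period: 5s   # publish a metrics snapshot every 5 seconds
  state.period: 120s   # publish state information every 2 minutes
```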

[float]
10 changes: 10 additions & 0 deletions libbeat/monitoring/report/elasticsearch/config.go
@@ -42,6 +42,12 @@ type config struct {
BulkMaxSize int `config:"bulk_max_size" validate:"min=0"`
BufferSize int `config:"buffer_size"`
Tags []string `config:"tags"`
Backoff backoff `config:"backoff"`
}

type backoff struct {
Init time.Duration
Max time.Duration
}

var defaultConfig = config{
@@ -61,4 +67,8 @@ var defaultConfig = config{
BulkMaxSize: 50,
BufferSize: 50,
Tags: nil,
Backoff: backoff{
Init: 1 * time.Second,
Max: 60 * time.Second,
},
}
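
The defaults above follow the exponential-backoff-with-reset behavior described in the config comments: start at `Init`, double after each failed attempt up to `Max`, and reset after a successful connection. A standalone, illustrative Go sketch of that behavior (not the libbeat implementation used by the reporter):

```go
package main

import (
	"fmt"
	"time"
)

// backoffTimer is a hypothetical helper mirroring the documented behavior.
type backoffTimer struct {
	init, max time.Duration
	current   time.Duration
}

// next returns the wait before the following reconnect attempt:
// init on the first failure, then doubling, capped at max.
func (b *backoffTimer) next() time.Duration {
	if b.current == 0 {
		b.current = b.init
	} else if b.current < b.max {
		b.current *= 2
		if b.current > b.max {
			b.current = b.max
		}
	}
	return b.current
}

// reset is called after a successful connection.
func (b *backoffTimer) reset() { b.current = 0 }

func main() {
	b := &backoffTimer{init: 1 * time.Second, max: 60 * time.Second}
	for i := 0; i < 8; i++ {
		fmt.Println(b.next()) // 1s, 2s, 4s, 8s, 16s, 32s, 1m0s, 1m0s
	}
	b.reset() // a successful connection starts the sequence over
}
```
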
35 changes: 22 additions & 13 deletions libbeat/monitoring/report/elasticsearch/elasticsearch.go
@@ -54,7 +54,8 @@ type reporter struct {
// pipeline
pipeline *pipeline.Pipeline
client beat.Client
out outputs.Group

out []outputs.NetworkClient
}

var debugf = logp.MakeDebug("monitoring")
@@ -104,22 +105,21 @@ func makeReporter(beat beat.Info, cfg *common.Config) (report.Reporter, error) {
params[k] = v
}

out := outputs.Group{
Clients: nil,
BatchSize: windowSize,
Retry: 0, // no retry. on error drop events
}

hosts, err := outputs.ReadHostList(cfg)
if err != nil {
return nil, err
}
if len(hosts) == 0 {
return nil, errors.New("empty hosts list")
}

var clients []outputs.NetworkClient
for _, host := range hosts {
client, err := makeClient(host, params, proxyURL, tlsConfig, &config)
if err != nil {
return nil, err
}
out.Clients = append(out.Clients, client)
clients = append(clients, client)
}

queueFactory := func(e queue.Eventer) (queue.Queue, error) {
@@ -131,18 +131,27 @@ func makeReporter(beat beat.Info, cfg *common.Config) (report.Reporter, error) {

monitoring := monitoring.Default.GetRegistry("xpack.monitoring")

outClient := outputs.NewFailoverClient(clients)
outClient = outputs.WithBackoff(outClient, config.Backoff.Init, config.Backoff.Max)

pipeline, err := pipeline.New(
beat,
monitoring,
queueFactory, out, pipeline.Settings{
queueFactory,
outputs.Group{
Clients: []outputs.Client{outClient},
BatchSize: windowSize,
Retry: 0, // no retry. Drop event on error.
},
pipeline.Settings{
WaitClose: 0,
WaitCloseMode: pipeline.NoWaitOnClose,
})
if err != nil {
return nil, err
}

client, err := pipeline.Connect()
pipeConn, err := pipeline.Connect()
if err != nil {
pipeline.Close()
return nil, err
@@ -154,8 +163,8 @@ func makeReporter(beat beat.Info, cfg *common.Config) (report.Reporter, error) {
tags: config.Tags,
checkRetry: checkRetry,
pipeline: pipeline,
client: client,
out: out,
client: pipeConn,
out: clients,
}
go r.initLoop(config)
return r, nil
@@ -175,7 +184,7 @@ func (r *reporter) initLoop(c config) {

for {
// Select one configured endpoint by random and check if xpack is available
client := r.out.Clients[rand.Intn(len(r.out.Clients))].(outputs.NetworkClient)
client := r.out[rand.Intn(len(r.out))]
err := client.Connect()
if err == nil {
closing(client)
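
The change above wraps the per-host clients in a failover client and layers backoff on top of it via `outputs.NewFailoverClient` and `outputs.WithBackoff`. A rough, conceptual Go sketch of the failover idea, under the assumption that each client exposes a `Connect` method; the real logic lives in libbeat's `outputs` package and may differ in detail:

```go
package main

import (
	"fmt"
	"math/rand"
)

// networkClient is a stand-in for the interface the real clients satisfy.
type networkClient interface {
	Connect() error
}

// failoverClient connects to one randomly chosen endpoint and falls back
// to the remaining ones on error. Conceptual only.
type failoverClient struct {
	clients []networkClient
}

func (f *failoverClient) Connect() error {
	if len(f.clients) == 0 {
		return fmt.Errorf("no monitoring clients configured")
	}
	// Try every endpoint once, starting from a random one, and stop at
	// the first successful connection.
	start := rand.Intn(len(f.clients))
	var lastErr error
	for i := 0; i < len(f.clients); i++ {
		c := f.clients[(start+i)%len(f.clients)]
		if err := c.Connect(); err != nil {
			lastErr = err
			continue
		}
		return nil
	}
	return lastErr
}

// fakeClient simulates an endpoint that either accepts or refuses connections.
type fakeClient struct{ ok bool }

func (c fakeClient) Connect() error {
	if !c.ok {
		return fmt.Errorf("connection refused")
	}
	return nil
}

func main() {
	f := &failoverClient{clients: []networkClient{fakeClient{false}, fakeClient{true}}}
	fmt.Println(f.Connect()) // <nil>: one of the endpoints accepted the connection
}
```
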
11 changes: 11 additions & 0 deletions metricbeat/metricbeat.reference.yml
@@ -1710,6 +1710,17 @@ logging.files:
# The default is 50.
#bulk_max_size: 50

# The number of seconds to wait before trying to reconnect to Elasticsearch
# after a network error. After waiting backoff.init seconds, the Beat
# tries to reconnect. If the attempt fails, the backoff timer is increased
# exponentially up to backoff.max. After a successful connection, the backoff
# timer is reset. The default is 1s.
#backoff.init: 1s

# The maximum number of seconds to wait before attempting to connect to
# Elasticsearch after a network error. The default is 60s.
#backoff.max: 60s

# Configure HTTP request timeout before failing a request to Elasticsearch.
#timeout: 90

11 changes: 11 additions & 0 deletions packetbeat/packetbeat.reference.yml
@@ -1513,6 +1513,17 @@ logging.files:
# The default is 50.
#bulk_max_size: 50

# The number of seconds to wait before trying to reconnect to Elasticsearch
# after a network error. After waiting backoff.init seconds, the Beat
# tries to reconnect. If the attempt fails, the backoff timer is increased
# exponentially up to backoff.max. After a successful connection, the backoff
# timer is reset. The default is 1s.
#backoff.init: 1s

# The maximum number of seconds to wait before attempting to connect to
# Elasticsearch after a network error. The default is 60s.
#backoff.max: 60s

# Configure HTTP request timeout before failing a request to Elasticsearch.
#timeout: 90

11 changes: 11 additions & 0 deletions winlogbeat/winlogbeat.reference.yml
@@ -1065,6 +1065,17 @@ logging.files:
# The default is 50.
#bulk_max_size: 50

# The number of seconds to wait before trying to reconnect to Elasticsearch
# after a network error. After waiting backoff.init seconds, the Beat
# tries to reconnect. If the attempt fails, the backoff timer is increased
# exponentially up to backoff.max. After a successful connection, the backoff
# timer is reset. The default is 1s.
#backoff.init: 1s

# The maximum number of seconds to wait before attempting to connect to
# Elasticsearch after a network error. The default is 60s.
#backoff.max: 60s

# Configure HTTP request timeout before failing a request to Elasticsearch.
#timeout: 90
