Allow raft TrailingLogs to be configured. #6186

Merged 3 commits on Jul 23, 2019
3 changes: 3 additions & 0 deletions agent/agent.go
@@ -1162,6 +1162,9 @@ func (a *Agent) consulConfig() (*consul.Config, error) {
if a.config.RaftSnapshotInterval != 0 {
base.RaftConfig.SnapshotInterval = a.config.RaftSnapshotInterval
}
if a.config.RaftTrailingLogs != 0 {
base.RaftConfig.TrailingLogs = uint64(a.config.RaftTrailingLogs)
}
if a.config.ACLMasterToken != "" {
base.ACLMasterToken = a.config.ACLMasterToken
}
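The guard above only overrides hashicorp/raft's default when the option is explicitly set; zero means "leave the default alone". A minimal sketch of the same pattern applied directly against the raft library (`applyTrailingLogs` is an illustrative helper, not Consul code, and 25000 is an arbitrary example value):

```go
package main

import "github.com/hashicorp/raft"

// applyTrailingLogs treats zero as "not configured" and keeps the library
// default in place, mirroring the guard in agent.go above.
func applyTrailingLogs(cfg *raft.Config, trailing int) {
	if trailing != 0 {
		cfg.TrailingLogs = uint64(trailing)
	}
}

func main() {
	cfg := raft.DefaultConfig()
	applyTrailingLogs(cfg, 25000) // arbitrary example value
	_ = cfg
}
```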
12 changes: 11 additions & 1 deletion agent/agent_test.go
@@ -3925,7 +3925,7 @@ func TestAgent_ReloadConfigTLSConfigFailure(t *testing.T) {
require.Len(t, tlsConf.RootCAs.Subjects(), 1)
}

func TestAgent_consulConfig(t *testing.T) {
func TestAgent_consulConfig_AutoEncryptAllowTLS(t *testing.T) {
t.Parallel()
dataDir := testutil.TempDir(t, "agent") // we manage the data dir
defer os.RemoveAll(dataDir)
@@ -3941,3 +3941,13 @@ func TestAgent_consulConfig(t *testing.T) {
defer a.Shutdown()
require.True(t, a.consulConfig().AutoEncryptAllowTLS)
}

func TestAgent_consulConfig_RaftTrailingLogs(t *testing.T) {
t.Parallel()
hcl := `
raft_trailing_logs = 812345
`
a := NewTestAgent(t, t.Name(), hcl)
defer a.Shutdown()
require.Equal(t, uint64(812345), a.consulConfig().RaftConfig.TrailingLogs)
}
1 change: 1 addition & 0 deletions agent/config/builder.go
@@ -862,6 +862,7 @@ func (b *Builder) Build() (rt RuntimeConfig, err error) {
RaftProtocol: b.intVal(c.RaftProtocol),
RaftSnapshotThreshold: b.intVal(c.RaftSnapshotThreshold),
RaftSnapshotInterval: b.durationVal("raft_snapshot_interval", c.RaftSnapshotInterval),
RaftTrailingLogs: b.intVal(c.RaftTrailingLogs),
ReconnectTimeoutLAN: b.durationVal("reconnect_timeout", c.ReconnectTimeoutLAN),
ReconnectTimeoutWAN: b.durationVal("reconnect_timeout_wan", c.ReconnectTimeoutWAN),
RejoinAfterLeave: b.boolVal(c.RejoinAfterLeave),
1 change: 1 addition & 0 deletions agent/config/config.go
@@ -239,6 +239,7 @@ type Config struct {
RaftProtocol *int `json:"raft_protocol,omitempty" hcl:"raft_protocol" mapstructure:"raft_protocol"`
RaftSnapshotThreshold *int `json:"raft_snapshot_threshold,omitempty" hcl:"raft_snapshot_threshold" mapstructure:"raft_snapshot_threshold"`
RaftSnapshotInterval *string `json:"raft_snapshot_interval,omitempty" hcl:"raft_snapshot_interval" mapstructure:"raft_snapshot_interval"`
RaftTrailingLogs *int `json:"raft_trailing_logs,omitempty" hcl:"raft_trailing_logs" mapstructure:"raft_trailing_logs"`
ReconnectTimeoutLAN *string `json:"reconnect_timeout,omitempty" hcl:"reconnect_timeout" mapstructure:"reconnect_timeout"`
ReconnectTimeoutWAN *string `json:"reconnect_timeout_wan,omitempty" hcl:"reconnect_timeout_wan" mapstructure:"reconnect_timeout_wan"`
RejoinAfterLeave *bool `json:"rejoin_after_leave,omitempty" hcl:"rejoin_after_leave" mapstructure:"rejoin_after_leave"`
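The pointer type matters here: it lets the builder distinguish "option absent from the config file" from an explicit zero. A sketch of the nil-coalescing that `b.intVal` in builder.go appears to perform (an illustrative reconstruction inferred from its call sites, not the verified Consul source):

```go
package config

// intVal collapses a nil *int (option absent) to 0; downstream code such as
// the agent.go guard above then interprets 0 as "use the default".
func intVal(v *int) int {
	if v == nil {
		return 0
	}
	return *v
}
```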
16 changes: 16 additions & 0 deletions agent/config/runtime.go
@@ -965,6 +965,22 @@ type RuntimeConfig struct {
// hcl: raft_snapshot_interval = duration
RaftSnapshotInterval time.Duration

// RaftTrailingLogs sets the number of log entries that will be left in the
// log store after a snapshot. This must be large enough that a follower can
// transfer and restore an entire snapshot of the state before this many new
// entries have been appended. In the vast majority of cases the default is plenty
// but if there is a sustained high write throughput coupled with a huge
// multi-gigabyte snapshot setting this higher may be necessary to allow
// followers time to reload from snapshot without becoming unhealthy. If it's
// too low then followers are unable to ever recover from a restart and will
// enter a loop of constantly downloading full snapshots and never catching
// up. If you need to change this you should reconsider your usage of Consul
// as it is not designed to store multiple-gigabyte data sets with high write
// throughput. Defaults to 10000.
//
// hcl: raft_trailing_logs = int
RaftTrailingLogs int

// ReconnectTimeoutLAN specifies the amount of time to wait to reconnect with
// another agent before deciding it's permanently gone. This can be used to
// control the time it takes to reap failed nodes from the cluster.
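To make the doc comment's "large enough" requirement concrete, a back-of-the-envelope sketch (an illustration, not Consul code; the 2x headroom and the example rates are assumptions):

```go
package main

import "fmt"

// estimateTrailingLogs: a follower must transfer and restore a snapshot
// before TrailingLogs new entries are appended, so a safe setting should
// comfortably exceed writeRate * snapshotSeconds.
func estimateTrailingLogs(writesPerSec, snapshotSeconds float64) uint64 {
	const headroom = 2.0 // assumed 2x safety margin
	return uint64(writesPerSec * snapshotSeconds * headroom)
}

func main() {
	// E.g. 500 commits/s with a snapshot that takes 60s to ship and restore:
	// 500 * 60 * 2 = 60000, well above the 10000 default, so such a cluster
	// would need this raised for followers to ever catch up after a restart.
	fmt.Println(estimateTrailingLogs(500, 60))
}
```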
4 changes: 4 additions & 0 deletions agent/config/runtime_test.go
@@ -3298,6 +3298,7 @@ func TestFullConfig(t *testing.T) {
"raft_protocol": 19016,
"raft_snapshot_threshold": 16384,
"raft_snapshot_interval": "30s",
"raft_trailing_logs": 83749,
"reconnect_timeout": "23739s",
"reconnect_timeout_wan": "26694s",
"recursors": [ "63.38.39.58", "92.49.18.18" ],
@@ -3881,6 +3882,7 @@
raft_protocol = 19016
raft_snapshot_threshold = 16384
raft_snapshot_interval = "30s"
raft_trailing_logs = 83749
reconnect_timeout = "23739s"
reconnect_timeout_wan = "26694s"
recursors = [ "63.38.39.58", "92.49.18.18" ]
@@ -4532,6 +4534,7 @@
RaftProtocol: 19016,
RaftSnapshotThreshold: 16384,
RaftSnapshotInterval: 30 * time.Second,
RaftTrailingLogs: 83749,
ReconnectTimeoutLAN: 23739 * time.Second,
ReconnectTimeoutWAN: 26694 * time.Second,
RejoinAfterLeave: true,
@@ -5353,6 +5356,7 @@ func TestSanitize(t *testing.T) {
"RaftProtocol": 0,
"RaftSnapshotInterval": "0s",
"RaftSnapshotThreshold": 0,
"RaftTrailingLogs": 0,
"ReconnectTimeoutLAN": "0s",
"ReconnectTimeoutWAN": "0s",
"RejoinAfterLeave": false,
60 changes: 40 additions & 20 deletions website/source/docs/agent/options.html.md
@@ -407,21 +407,6 @@ will exit with an error at startup.
[Raft Protocol Version Compatibility](/docs/upgrade-specific.html#raft-protocol-version-compatibility)
for more details.

* <a name="_raft_snapshot_threshold"></a><a href="#_raft_snapshot_threshold">`-raft-snapshot-threshold`</a> - This controls the
minimum number of raft commit entries between snapshots that are saved to disk. This is a low-level parameter that should
rarely need to be changed. Very busy clusters experiencing excessive disk IO may increase this value to reduce disk IO, and minimize
the chances of all servers taking snapshots at the same time. Increasing this trades off disk IO for disk space since the log will
grow much larger and the space in the raft.db file can't be reclaimed till the next snapshot. Servers may take longer to recover from
crashes or failover if this is increased significantly as more logs will need to be replayed. In Consul 1.1.0 and later this
defaults to 16384, and in prior versions it was set to 8192.

* <a name="_raft_snapshot_interval"></a><a href="#_raft_snapshot_interval">`-raft-snapshot-interval`</a> - This controls how often servers
check if they need to save a snapshot to disk. This is a low-level parameter that should rarely need to be changed. Very busy clusters
experiencing excessive disk IO may increase this value to reduce disk IO, and minimize the chances of all servers taking snapshots at the same time.
Increasing this trades off disk IO for disk space since the log will grow much larger and the space in the raft.db file can't be reclaimed
till the next snapshot. Servers may take longer to recover from crashes or failover if this is increased significantly as more logs
will need to be replayed. In Consul 1.1.0 and later this defaults to `30s`, and in prior versions it was set to `5s`.

* <a name="_recursor"></a><a href="#_recursor">`-recursor`</a> - Specifies the address of an upstream DNS
server. This option may be provided multiple times, and is functionally
equivalent to the [`recursors` configuration option](#recursors).
@@ -1431,11 +1416,46 @@ default will automatically work with some tooling.
* <a name="raft_protocol"></a><a href="#raft_protocol">`raft_protocol`</a> Equivalent to the
[`-raft-protocol` command-line flag](#_raft_protocol).

* <a name="raft_snapshot_threshold"></a><a href="#raft_snapshot_threshold">`raft_snapshot_threshold`</a> Equivalent to the
[`-raft-snapshot-threshold` command-line flag](#_raft_snapshot_threshold).

* <a name="raft_snapshot_interval"></a><a href="#raft_snapshot_interval">`raft_snapshot_interval`</a> Equivalent to the
[`-raft-snapshot-interval` command-line flag](#_raft_snapshot_interval).
<!-- Note the extra _ anchors are here because we used to erroneously list these as
command line flags even though they are not actually defined as valid flags and can
only be set in config file. Duplicating the anchor preserves any existing external links
to the old fragment -->
* <a name="raft_snapshot_threshold"></a><a name="_raft_snapshot_threshold"></a>
<a href="#raft_snapshot_threshold">`raft_snapshot_threshold`</a> This controls
the minimum number of raft commit entries between snapshots that are saved to
disk. This is a low-level parameter that should rarely need to be changed.
Very busy clusters experiencing excessive disk IO may increase this value to
reduce disk IO, and minimize the chances of all servers taking snapshots at
the same time. Increasing this trades off disk IO for disk space since the log
will grow much larger and the space in the raft.db file can't be reclaimed
till the next snapshot. Servers may take longer to recover from crashes or
failover if this is increased significantly as more logs will need to be
replayed. In Consul 1.1.0 and later this defaults to 16384, and in prior
versions it was set to 8192.

* <a name="raft_snapshot_interval"></a><a name="_raft_snapshot_interval"></a> <a
href="#raft_snapshot_interval">`raft_snapshot_interval`</a> This controls how
often servers check if they need to save a snapshot to disk. This is a
low-level parameter that should rarely need to be changed. Very busy clusters
experiencing excessive disk IO may increase this value to reduce disk IO, and
minimize the chances of all servers taking snapshots at the same time.
Increasing this trades off disk IO for disk space since the log will grow much
larger and the space in the raft.db file can't be reclaimed till the next
snapshot. Servers may take longer to recover from crashes or failover if this
is increased significantly as more logs will need to be replayed. In Consul
1.1.0 and later this defaults to `30s`, and in prior versions it was set to
`5s`.
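
For intuition, the threshold and interval combine roughly as below. This is a simplified sketch of hashicorp/raft's snapshot loop (the names and structure are assumptions for illustration; the real loop differs in detail):

```go
package raftsketch

import (
	"math/rand"
	"time"
)

// snapshotLoop: the interval controls how often the check fires -- randomized
// into [interval, 2*interval) so servers don't all snapshot at once -- while
// the threshold gates whether a snapshot is actually taken.
func snapshotLoop(interval time.Duration, threshold uint64,
	committedSinceLast func() uint64, takeSnapshot func()) {
	for {
		time.Sleep(interval + time.Duration(rand.Int63n(int64(interval))))
		if committedSinceLast() >= threshold {
			takeSnapshot()
		}
	}
}
```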

* <a name="raft_trailing_logs"></a><a
href="#raft_trailing_logs">`raft_trailing_logs`</a> - This controls how many
log entries are left in the log store on disk after a snapshot is made. This
should only be adjusted when followers cannot catch up to the leader due to a
very large snapshot size that and high write throughput causing log truncation
before an snapshot can be fully installed. If you need to use this to recover
a cluster, consider reducing write throughput or the amount of data stored on
Consul as it is likely under a load it is not designed to handle. The default
value is 10000 which is suitable for all normal workloads. Added in Consul
1.5.4.

* <a name="reap"></a><a href="#reap">`reap`</a> This controls Consul's automatic reaping of child processes,
which is useful if Consul is running as PID 1 in a Docker container. If this isn't specified, then Consul will