diff --git a/agent/agent.go b/agent/agent.go
index e9d99c07d23c..e7a5235d4895 100644
--- a/agent/agent.go
+++ b/agent/agent.go
@@ -1204,6 +1204,7 @@ func (a *Agent) consulConfig() (*consul.Config, error) {
base.AutopilotConfig.CleanupDeadServers = a.config.AutopilotCleanupDeadServers
base.AutopilotConfig.LastContactThreshold = a.config.AutopilotLastContactThreshold
base.AutopilotConfig.MaxTrailingLogs = uint64(a.config.AutopilotMaxTrailingLogs)
+ base.AutopilotConfig.MinQuorum = a.config.AutopilotMinQuorum
base.AutopilotConfig.ServerStabilizationTime = a.config.AutopilotServerStabilizationTime
base.AutopilotConfig.RedundancyZoneTag = a.config.AutopilotRedundancyZoneTag
base.AutopilotConfig.DisableUpgradeMigration = a.config.AutopilotDisableUpgradeMigration
diff --git a/agent/config/builder.go b/agent/config/builder.go
index d62225fee2cf..06340eb455c5 100644
--- a/agent/config/builder.go
+++ b/agent/config/builder.go
@@ -728,6 +728,7 @@ func (b *Builder) Build() (rt RuntimeConfig, err error) {
AutopilotDisableUpgradeMigration: b.boolVal(c.Autopilot.DisableUpgradeMigration),
AutopilotLastContactThreshold: b.durationVal("autopilot.last_contact_threshold", c.Autopilot.LastContactThreshold),
AutopilotMaxTrailingLogs: b.intVal(c.Autopilot.MaxTrailingLogs),
+ AutopilotMinQuorum: b.uintVal(c.Autopilot.MinQuorum),
AutopilotRedundancyZoneTag: b.stringVal(c.Autopilot.RedundancyZoneTag),
AutopilotServerStabilizationTime: b.durationVal("autopilot.server_stabilization_time", c.Autopilot.ServerStabilizationTime),
AutopilotUpgradeVersionTag: b.stringVal(c.Autopilot.UpgradeVersionTag),
@@ -1444,6 +1445,17 @@ func (b *Builder) intVal(v *int) int {
return b.intValWithDefault(v, 0)
}
+func (b *Builder) uintVal(v *uint) uint {
+ return b.uintValWithDefault(v, 0)
+}
+
+func (b *Builder) uintValWithDefault(v *uint, defaultVal uint) uint {
+ if v == nil {
+ return defaultVal
+ }
+ return *v
+}
+
func (b *Builder) uint64ValWithDefault(v *uint64, defaultVal uint64) uint64 {
if v == nil {
return defaultVal
diff --git a/agent/config/config.go b/agent/config/config.go
index b713cbf5bdd6..9f8aff81cefe 100644
--- a/agent/config/config.go
+++ b/agent/config/config.go
@@ -362,6 +362,7 @@ type Autopilot struct {
DisableUpgradeMigration *bool `json:"disable_upgrade_migration,omitempty" hcl:"disable_upgrade_migration" mapstructure:"disable_upgrade_migration"`
LastContactThreshold *string `json:"last_contact_threshold,omitempty" hcl:"last_contact_threshold" mapstructure:"last_contact_threshold"`
MaxTrailingLogs *int `json:"max_trailing_logs,omitempty" hcl:"max_trailing_logs" mapstructure:"max_trailing_logs"`
+ MinQuorum *uint `json:"min_quorum,omitempty" hcl:"min_quorum" mapstructure:"min_quorum"`
RedundancyZoneTag *string `json:"redundancy_zone_tag,omitempty" hcl:"redundancy_zone_tag" mapstructure:"redundancy_zone_tag"`
ServerStabilizationTime *string `json:"server_stabilization_time,omitempty" hcl:"server_stabilization_time" mapstructure:"server_stabilization_time"`
UpgradeVersionTag *string `json:"upgrade_version_tag,omitempty" hcl:"upgrade_version_tag" mapstructure:"upgrade_version_tag"`
diff --git a/agent/config/runtime.go b/agent/config/runtime.go
index 1b21e3ff815a..db244e6a733f 100644
--- a/agent/config/runtime.go
+++ b/agent/config/runtime.go
@@ -196,6 +196,12 @@ type RuntimeConfig struct {
// hcl: autopilot { max_trailing_logs = int }
AutopilotMaxTrailingLogs int
+ // AutopilotMinQuorum sets the minimum number of servers required in a cluster
+ // before autopilot can prune dead servers.
+ //
+ //hcl: autopilot { min_quorum = int }
+ AutopilotMinQuorum uint
+
// AutopilotRedundancyZoneTag is the Meta tag to use for separating servers
// into zones for redundancy. If left blank, this feature will be disabled.
// (Enterprise-only)
diff --git a/agent/config/runtime_test.go b/agent/config/runtime_test.go
index 29df09e81d98..e93eaca39c97 100644
--- a/agent/config/runtime_test.go
+++ b/agent/config/runtime_test.go
@@ -3520,6 +3520,7 @@ func TestFullConfig(t *testing.T) {
"disable_upgrade_migration": true,
"last_contact_threshold": "12705s",
"max_trailing_logs": 17849,
+ "min_quorum": 3,
"redundancy_zone_tag": "3IsufDJf",
"server_stabilization_time": "23057s",
"upgrade_version_tag": "W9pDwFAL"
@@ -4117,6 +4118,7 @@ func TestFullConfig(t *testing.T) {
disable_upgrade_migration = true
last_contact_threshold = "12705s"
max_trailing_logs = 17849
+ min_quorum = 3
redundancy_zone_tag = "3IsufDJf"
server_stabilization_time = "23057s"
upgrade_version_tag = "W9pDwFAL"
@@ -4819,6 +4821,7 @@ func TestFullConfig(t *testing.T) {
AutopilotDisableUpgradeMigration: true,
AutopilotLastContactThreshold: 12705 * time.Second,
AutopilotMaxTrailingLogs: 17849,
+ AutopilotMinQuorum: 3,
AutopilotRedundancyZoneTag: "3IsufDJf",
AutopilotServerStabilizationTime: 23057 * time.Second,
AutopilotUpgradeVersionTag: "W9pDwFAL",
@@ -5703,6 +5706,7 @@ func TestSanitize(t *testing.T) {
"AutopilotDisableUpgradeMigration": false,
"AutopilotLastContactThreshold": "0s",
"AutopilotMaxTrailingLogs": 0,
+ "AutopilotMinQuorum": 0,
"AutopilotRedundancyZoneTag": "",
"AutopilotServerStabilizationTime": "0s",
"AutopilotUpgradeVersionTag": "",
diff --git a/agent/consul/autopilot/autopilot.go b/agent/consul/autopilot/autopilot.go
index 39d094f590f8..782b333ef0ce 100644
--- a/agent/consul/autopilot/autopilot.go
+++ b/agent/consul/autopilot/autopilot.go
@@ -234,7 +234,7 @@ func (a *Autopilot) pruneDeadServers() error {
// Only do removals if a minority of servers will be affected.
peers := NumPeers(raftConfig)
- if removalCount < peers/2 {
+ if peers-removalCount >= int(conf.MinQuorum) && removalCount < peers/2 {
for _, node := range failed {
a.logger.Printf("[INFO] autopilot: Attempting removal of failed server node %q", node.Name)
go serfLAN.RemoveFailedNode(node.Name)
diff --git a/agent/consul/autopilot/structs.go b/agent/consul/autopilot/structs.go
index 54a5025ace6b..4e83a1d3827b 100644
--- a/agent/consul/autopilot/structs.go
+++ b/agent/consul/autopilot/structs.go
@@ -20,6 +20,10 @@ type Config struct {
// be behind before being considered unhealthy.
MaxTrailingLogs uint64
+ // MinQuorum sets the minimum number of servers required in a cluster
+ // before autopilot can prune dead servers.
+ MinQuorum uint
+
// ServerStabilizationTime is the minimum amount of time a server must be
// in a stable, healthy state before it can be added to the cluster. Only
// applicable with Raft protocol version 3 or higher.
diff --git a/agent/consul/autopilot_test.go b/agent/consul/autopilot_test.go
index 35a914eb2e6d..d362d601a7bc 100644
--- a/agent/consul/autopilot_test.go
+++ b/agent/consul/autopilot_test.go
@@ -369,3 +369,97 @@ func TestAutopilot_PromoteNonVoter(t *testing.T) {
}
})
}
+
+func TestAutopilot_BootstrapExpect(t *testing.T) {
+ dc := "dc1"
+ closeMap := make(map[string]chan struct{})
+ conf := func(c *Config) {
+ c.Datacenter = dc
+ c.Bootstrap = false
+ c.BootstrapExpect = 3
+ c.AutopilotConfig.MinQuorum = 3
+ c.RaftConfig.ProtocolVersion = raft.ProtocolVersion(2)
+ c.AutopilotInterval = 100 * time.Millisecond
+ //Let us know when a server is actually gone
+ ch := make(chan struct{})
+ c.NotifyShutdown = func() {
+ t.Logf("%v is shutdown", c.NodeName)
+ close(ch)
+ }
+ closeMap[c.NodeName] = ch
+ }
+ dir1, s1 := testServerWithConfig(t, conf)
+ defer os.RemoveAll(dir1)
+ defer s1.Shutdown()
+
+ dir2, s2 := testServerWithConfig(t, conf)
+ defer os.RemoveAll(dir2)
+ defer s2.Shutdown()
+
+ dir3, s3 := testServerWithConfig(t, conf)
+ defer os.RemoveAll(dir3)
+ defer s3.Shutdown()
+
+ dir4, s4 := testServerWithConfig(t, conf)
+ defer os.RemoveAll(dir4)
+ defer s4.Shutdown()
+
+ servers := map[string]*Server{s1.config.NodeName: s1,
+ s2.config.NodeName: s2,
+ s3.config.NodeName: s3,
+ s4.config.NodeName: s4}
+
+ // Try to join
+ joinLAN(t, s2, s1)
+ joinLAN(t, s3, s1)
+ joinLAN(t, s4, s1)
+
+ //Differentiate between leader and server
+ findStatus := func(leader bool) *Server {
+ for _, mem := range servers {
+ if mem.IsLeader() && leader {
+ return mem
+ }
+ if !leader && !mem.IsLeader() {
+ return mem
+ }
+ }
+
+ t.Fatalf("no members set")
+ return nil
+ }
+
+ for _, s := range servers {
+ testrpc.WaitForLeader(t, s.RPC, dc)
+ retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 4)) })
+ }
+
+ // Have autopilot take one into left
+ dead := findStatus(false)
+ dead.Shutdown()
+ <-closeMap[dead.config.NodeName]
+ retry.Run(t, func(r *retry.R) {
+ leader := findStatus(true)
+ for _, m := range leader.LANMembers() {
+ if m.Name == dead.config.NodeName && m.Status != serf.StatusLeft {
+ r.Fatalf("%v should be left, got %v", m.Name, m.Status.String())
+ }
+ }
+ })
+
+ delete(servers, dead.config.NodeName)
+ //Autopilot should not take this one into left
+ dead = findStatus(false)
+ dead.Shutdown()
+ <-closeMap[dead.config.NodeName]
+
+ retry.Run(t, func(r *retry.R) {
+ leader := findStatus(true)
+ for _, m := range leader.LANMembers() {
+ if m.Name == dead.config.NodeName && m.Status != serf.StatusFailed {
+ r.Fatalf("%v should be failed, got %v", m.Name, m.Status.String())
+ }
+ }
+ })
+
+}
diff --git a/agent/consul/operator_autopilot_endpoint_test.go b/agent/consul/operator_autopilot_endpoint_test.go
index 4728f8436a11..a2a279237716 100644
--- a/agent/consul/operator_autopilot_endpoint_test.go
+++ b/agent/consul/operator_autopilot_endpoint_test.go
@@ -11,7 +11,7 @@ import (
"github.com/hashicorp/consul/agent/structs"
"github.com/hashicorp/consul/sdk/testutil/retry"
"github.com/hashicorp/consul/testrpc"
- "github.com/hashicorp/net-rpc-msgpackrpc"
+ msgpackrpc "github.com/hashicorp/net-rpc-msgpackrpc"
"github.com/hashicorp/raft"
)
@@ -116,6 +116,7 @@ func TestOperator_Autopilot_SetConfiguration(t *testing.T) {
Datacenter: "dc1",
Config: autopilot.Config{
CleanupDeadServers: true,
+ MinQuorum: 3,
},
}
var reply *bool
@@ -130,7 +131,7 @@ func TestOperator_Autopilot_SetConfiguration(t *testing.T) {
if err != nil {
t.Fatal(err)
}
- if !config.CleanupDeadServers {
+ if !config.CleanupDeadServers && config.MinQuorum != 3 {
t.Fatalf("bad: %#v", config)
}
}
diff --git a/agent/operator_endpoint.go b/agent/operator_endpoint.go
index e46685e793b2..9be097ab2208 100644
--- a/agent/operator_endpoint.go
+++ b/agent/operator_endpoint.go
@@ -201,6 +201,7 @@ func (s *HTTPServer) OperatorAutopilotConfiguration(resp http.ResponseWriter, re
CleanupDeadServers: reply.CleanupDeadServers,
LastContactThreshold: api.NewReadableDuration(reply.LastContactThreshold),
MaxTrailingLogs: reply.MaxTrailingLogs,
+ MinQuorum: reply.MinQuorum,
ServerStabilizationTime: api.NewReadableDuration(reply.ServerStabilizationTime),
RedundancyZoneTag: reply.RedundancyZoneTag,
DisableUpgradeMigration: reply.DisableUpgradeMigration,
@@ -226,6 +227,7 @@ func (s *HTTPServer) OperatorAutopilotConfiguration(resp http.ResponseWriter, re
CleanupDeadServers: conf.CleanupDeadServers,
LastContactThreshold: conf.LastContactThreshold.Duration(),
MaxTrailingLogs: conf.MaxTrailingLogs,
+ MinQuorum: conf.MinQuorum,
ServerStabilizationTime: conf.ServerStabilizationTime.Duration(),
RedundancyZoneTag: conf.RedundancyZoneTag,
DisableUpgradeMigration: conf.DisableUpgradeMigration,
diff --git a/api/operator_autopilot.go b/api/operator_autopilot.go
index b179406dc12c..ec777f7128d8 100644
--- a/api/operator_autopilot.go
+++ b/api/operator_autopilot.go
@@ -25,6 +25,10 @@ type AutopilotConfiguration struct {
// be behind before being considered unhealthy.
MaxTrailingLogs uint64
+ // MinQuorum sets the minimum number of servers allowed in a cluster before
+ // autopilot can prune dead servers.
+ MinQuorum uint
+
// ServerStabilizationTime is the minimum amount of time a server must be
// in a stable, healthy state before it can be added to the cluster. Only
// applicable with Raft protocol version 3 or higher.
diff --git a/command/operator/autopilot/get/operator_autopilot_get.go b/command/operator/autopilot/get/operator_autopilot_get.go
index ed0c897f5616..d8943d84edbc 100644
--- a/command/operator/autopilot/get/operator_autopilot_get.go
+++ b/command/operator/autopilot/get/operator_autopilot_get.go
@@ -58,6 +58,7 @@ func (c *cmd) Run(args []string) int {
c.UI.Output(fmt.Sprintf("CleanupDeadServers = %v", config.CleanupDeadServers))
c.UI.Output(fmt.Sprintf("LastContactThreshold = %v", config.LastContactThreshold.String()))
c.UI.Output(fmt.Sprintf("MaxTrailingLogs = %v", config.MaxTrailingLogs))
+ c.UI.Output(fmt.Sprintf("MinQuorum = %v", config.MinQuorum))
c.UI.Output(fmt.Sprintf("ServerStabilizationTime = %v", config.ServerStabilizationTime.String()))
c.UI.Output(fmt.Sprintf("RedundancyZoneTag = %q", config.RedundancyZoneTag))
c.UI.Output(fmt.Sprintf("DisableUpgradeMigration = %v", config.DisableUpgradeMigration))
diff --git a/command/operator/autopilot/set/operator_autopilot_set.go b/command/operator/autopilot/set/operator_autopilot_set.go
index c17c7bb2ea6a..f61d2ff98f75 100644
--- a/command/operator/autopilot/set/operator_autopilot_set.go
+++ b/command/operator/autopilot/set/operator_autopilot_set.go
@@ -25,6 +25,7 @@ type cmd struct {
// flags
cleanupDeadServers flags.BoolValue
maxTrailingLogs flags.UintValue
+ minQuorum flags.UintValue
lastContactThreshold flags.DurationValue
serverStabilizationTime flags.DurationValue
redundancyZoneTag flags.StringValue
@@ -40,6 +41,9 @@ func (c *cmd) init() {
c.flags.Var(&c.maxTrailingLogs, "max-trailing-logs",
"Controls the maximum number of log entries that a server can trail the "+
"leader by before being considered unhealthy.")
+ c.flags.Var(&c.minQuorum, "min-quorum",
+ "Sets the minimum number of of servers required in a cluster before autopilot "+
+ "is allowed to prune dead servers.")
c.flags.Var(&c.lastContactThreshold, "last-contact-threshold",
"Controls the maximum amount of time a server can go without contact "+
"from the leader before being considered unhealthy. Must be a duration value "+
@@ -94,6 +98,7 @@ func (c *cmd) Run(args []string) int {
c.redundancyZoneTag.Merge(&conf.RedundancyZoneTag)
c.disableUpgradeMigration.Merge(&conf.DisableUpgradeMigration)
c.upgradeVersionTag.Merge(&conf.UpgradeVersionTag)
+ c.minQuorum.Merge(&conf.MinQuorum)
trailing := uint(conf.MaxTrailingLogs)
c.maxTrailingLogs.Merge(&trailing)
diff --git a/command/operator/autopilot/set/operator_autopilot_set_test.go b/command/operator/autopilot/set/operator_autopilot_set_test.go
index 0670217c8e89..2f6167a3e2fc 100644
--- a/command/operator/autopilot/set/operator_autopilot_set_test.go
+++ b/command/operator/autopilot/set/operator_autopilot_set_test.go
@@ -34,6 +34,7 @@ func TestOperatorAutopilotSetConfigCommand(t *testing.T) {
"-max-trailing-logs=99",
"-last-contact-threshold=123ms",
"-server-stabilization-time=123ms",
+ "-min-quorum=3",
}
code := c.Run(args)
@@ -65,4 +66,7 @@ func TestOperatorAutopilotSetConfigCommand(t *testing.T) {
if reply.ServerStabilizationTime != 123*time.Millisecond {
t.Fatalf("bad: %#v", reply)
}
+ if reply.MinQuorum != 3 {
+ t.Fatalf("bad: %#v", reply)
+ }
}
diff --git a/website/source/api/operator/autopilot.html.md b/website/source/api/operator/autopilot.html.md
index 7139bd4bcf7c..f25fe037e765 100644
--- a/website/source/api/operator/autopilot.html.md
+++ b/website/source/api/operator/autopilot.html.md
@@ -108,6 +108,9 @@ The table below shows this endpoint's support for
- `MaxTrailingLogs` `(int: 250)` specifies the maximum number of log entries
that a server can trail the leader by before being considered unhealthy.
+- `MinQuorum` `int: 0` - specifies the minimum number of servers needed before
+ Autopilot can prune dead servers.
+
- `ServerStabilizationTime` `(string: "10s")` - Specifies the minimum amount of
time a server must be stable in the 'healthy' state before being added to the
cluster. Only takes effect if all servers are running Raft protocol version 3
@@ -134,6 +137,7 @@ The table below shows this endpoint's support for
"CleanupDeadServers": true,
"LastContactThreshold": "200ms",
"MaxTrailingLogs": 250,
+ "MinQuorum": 3,
"ServerStabilizationTime": "10s",
"RedundancyZoneTag": "",
"DisableUpgradeMigration": false,
diff --git a/website/source/docs/agent/options.html.md b/website/source/docs/agent/options.html.md
index dc7dd634c74d..86740dd68b68 100644
--- a/website/source/docs/agent/options.html.md
+++ b/website/source/docs/agent/options.html.md
@@ -782,6 +782,9 @@ default will automatically work with some tooling.
the maximum number of log entries that a server can trail the leader by before being considered unhealthy. Defaults
to 250.
+ * `min_quorum` - Sets the minimum number of servers necessary in a cluster
+ before autopilot can prune dead servers. There is no default.
+
* `server_stabilization_time` -
Controls the minimum amount of time a server must be stable in the 'healthy' state before being added to the
cluster. Only takes effect if all servers are running Raft protocol version 3 or higher. Must be a duration value