Skip to content

Commit

Permalink
Set MinQuorum variable in Autopilot (#6654)
Browse files Browse the repository at this point in the history
* Add MinQuorum to Autopilot
  • Loading branch information
schristoff authored Oct 29, 2019
1 parent 64d099c commit 5e1c6e9
Show file tree
Hide file tree
Showing 16 changed files with 149 additions and 3 deletions.
1 change: 1 addition & 0 deletions agent/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -1204,6 +1204,7 @@ func (a *Agent) consulConfig() (*consul.Config, error) {
base.AutopilotConfig.CleanupDeadServers = a.config.AutopilotCleanupDeadServers
base.AutopilotConfig.LastContactThreshold = a.config.AutopilotLastContactThreshold
base.AutopilotConfig.MaxTrailingLogs = uint64(a.config.AutopilotMaxTrailingLogs)
base.AutopilotConfig.MinQuorum = a.config.AutopilotMinQuorum
base.AutopilotConfig.ServerStabilizationTime = a.config.AutopilotServerStabilizationTime
base.AutopilotConfig.RedundancyZoneTag = a.config.AutopilotRedundancyZoneTag
base.AutopilotConfig.DisableUpgradeMigration = a.config.AutopilotDisableUpgradeMigration
Expand Down
12 changes: 12 additions & 0 deletions agent/config/builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -728,6 +728,7 @@ func (b *Builder) Build() (rt RuntimeConfig, err error) {
AutopilotDisableUpgradeMigration: b.boolVal(c.Autopilot.DisableUpgradeMigration),
AutopilotLastContactThreshold: b.durationVal("autopilot.last_contact_threshold", c.Autopilot.LastContactThreshold),
AutopilotMaxTrailingLogs: b.intVal(c.Autopilot.MaxTrailingLogs),
AutopilotMinQuorum: b.uintVal(c.Autopilot.MinQuorum),
AutopilotRedundancyZoneTag: b.stringVal(c.Autopilot.RedundancyZoneTag),
AutopilotServerStabilizationTime: b.durationVal("autopilot.server_stabilization_time", c.Autopilot.ServerStabilizationTime),
AutopilotUpgradeVersionTag: b.stringVal(c.Autopilot.UpgradeVersionTag),
Expand Down Expand Up @@ -1444,6 +1445,17 @@ func (b *Builder) intVal(v *int) int {
return b.intValWithDefault(v, 0)
}

func (b *Builder) uintVal(v *uint) uint {
return b.uintValWithDefault(v, 0)
}

func (b *Builder) uintValWithDefault(v *uint, defaultVal uint) uint {
if v == nil {
return defaultVal
}
return *v
}

func (b *Builder) uint64ValWithDefault(v *uint64, defaultVal uint64) uint64 {
if v == nil {
return defaultVal
Expand Down
1 change: 1 addition & 0 deletions agent/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -362,6 +362,7 @@ type Autopilot struct {
DisableUpgradeMigration *bool `json:"disable_upgrade_migration,omitempty" hcl:"disable_upgrade_migration" mapstructure:"disable_upgrade_migration"`
LastContactThreshold *string `json:"last_contact_threshold,omitempty" hcl:"last_contact_threshold" mapstructure:"last_contact_threshold"`
MaxTrailingLogs *int `json:"max_trailing_logs,omitempty" hcl:"max_trailing_logs" mapstructure:"max_trailing_logs"`
MinQuorum *uint `json:"min_quorum,omitempty" hcl:"min_quorum" mapstructure:"min_quorum"`
RedundancyZoneTag *string `json:"redundancy_zone_tag,omitempty" hcl:"redundancy_zone_tag" mapstructure:"redundancy_zone_tag"`
ServerStabilizationTime *string `json:"server_stabilization_time,omitempty" hcl:"server_stabilization_time" mapstructure:"server_stabilization_time"`
UpgradeVersionTag *string `json:"upgrade_version_tag,omitempty" hcl:"upgrade_version_tag" mapstructure:"upgrade_version_tag"`
Expand Down
6 changes: 6 additions & 0 deletions agent/config/runtime.go
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,12 @@ type RuntimeConfig struct {
// hcl: autopilot { max_trailing_logs = int }
AutopilotMaxTrailingLogs int

// AutopilotMinQuorum sets the minimum number of servers required in a cluster
// before autopilot can prune dead servers.
//
//hcl: autopilot { min_quorum = int }
AutopilotMinQuorum uint

// AutopilotRedundancyZoneTag is the Meta tag to use for separating servers
// into zones for redundancy. If left blank, this feature will be disabled.
// (Enterprise-only)
Expand Down
4 changes: 4 additions & 0 deletions agent/config/runtime_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3520,6 +3520,7 @@ func TestFullConfig(t *testing.T) {
"disable_upgrade_migration": true,
"last_contact_threshold": "12705s",
"max_trailing_logs": 17849,
"min_quorum": 3,
"redundancy_zone_tag": "3IsufDJf",
"server_stabilization_time": "23057s",
"upgrade_version_tag": "W9pDwFAL"
Expand Down Expand Up @@ -4117,6 +4118,7 @@ func TestFullConfig(t *testing.T) {
disable_upgrade_migration = true
last_contact_threshold = "12705s"
max_trailing_logs = 17849
min_quorum = 3
redundancy_zone_tag = "3IsufDJf"
server_stabilization_time = "23057s"
upgrade_version_tag = "W9pDwFAL"
Expand Down Expand Up @@ -4819,6 +4821,7 @@ func TestFullConfig(t *testing.T) {
AutopilotDisableUpgradeMigration: true,
AutopilotLastContactThreshold: 12705 * time.Second,
AutopilotMaxTrailingLogs: 17849,
AutopilotMinQuorum: 3,
AutopilotRedundancyZoneTag: "3IsufDJf",
AutopilotServerStabilizationTime: 23057 * time.Second,
AutopilotUpgradeVersionTag: "W9pDwFAL",
Expand Down Expand Up @@ -5703,6 +5706,7 @@ func TestSanitize(t *testing.T) {
"AutopilotDisableUpgradeMigration": false,
"AutopilotLastContactThreshold": "0s",
"AutopilotMaxTrailingLogs": 0,
"AutopilotMinQuorum": 0,
"AutopilotRedundancyZoneTag": "",
"AutopilotServerStabilizationTime": "0s",
"AutopilotUpgradeVersionTag": "",
Expand Down
2 changes: 1 addition & 1 deletion agent/consul/autopilot/autopilot.go
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,7 @@ func (a *Autopilot) pruneDeadServers() error {

// Only do removals if a minority of servers will be affected.
peers := NumPeers(raftConfig)
if removalCount < peers/2 {
if peers-removalCount >= int(conf.MinQuorum) && removalCount < peers/2 {
for _, node := range failed {
a.logger.Printf("[INFO] autopilot: Attempting removal of failed server node %q", node.Name)
go serfLAN.RemoveFailedNode(node.Name)
Expand Down
4 changes: 4 additions & 0 deletions agent/consul/autopilot/structs.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@ type Config struct {
// be behind before being considered unhealthy.
MaxTrailingLogs uint64

// MinQuorum sets the minimum number of servers required in a cluster
// before autopilot can prune dead servers.
MinQuorum uint

// ServerStabilizationTime is the minimum amount of time a server must be
// in a stable, healthy state before it can be added to the cluster. Only
// applicable with Raft protocol version 3 or higher.
Expand Down
94 changes: 94 additions & 0 deletions agent/consul/autopilot_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -369,3 +369,97 @@ func TestAutopilot_PromoteNonVoter(t *testing.T) {
}
})
}

func TestAutopilot_BootstrapExpect(t *testing.T) {
dc := "dc1"
closeMap := make(map[string]chan struct{})
conf := func(c *Config) {
c.Datacenter = dc
c.Bootstrap = false
c.BootstrapExpect = 3
c.AutopilotConfig.MinQuorum = 3
c.RaftConfig.ProtocolVersion = raft.ProtocolVersion(2)
c.AutopilotInterval = 100 * time.Millisecond
//Let us know when a server is actually gone
ch := make(chan struct{})
c.NotifyShutdown = func() {
t.Logf("%v is shutdown", c.NodeName)
close(ch)
}
closeMap[c.NodeName] = ch
}
dir1, s1 := testServerWithConfig(t, conf)
defer os.RemoveAll(dir1)
defer s1.Shutdown()

dir2, s2 := testServerWithConfig(t, conf)
defer os.RemoveAll(dir2)
defer s2.Shutdown()

dir3, s3 := testServerWithConfig(t, conf)
defer os.RemoveAll(dir3)
defer s3.Shutdown()

dir4, s4 := testServerWithConfig(t, conf)
defer os.RemoveAll(dir4)
defer s4.Shutdown()

servers := map[string]*Server{s1.config.NodeName: s1,
s2.config.NodeName: s2,
s3.config.NodeName: s3,
s4.config.NodeName: s4}

// Try to join
joinLAN(t, s2, s1)
joinLAN(t, s3, s1)
joinLAN(t, s4, s1)

//Differentiate between leader and server
findStatus := func(leader bool) *Server {
for _, mem := range servers {
if mem.IsLeader() && leader {
return mem
}
if !leader && !mem.IsLeader() {
return mem
}
}

t.Fatalf("no members set")
return nil
}

for _, s := range servers {
testrpc.WaitForLeader(t, s.RPC, dc)
retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 4)) })
}

// Have autopilot take one into left
dead := findStatus(false)
dead.Shutdown()
<-closeMap[dead.config.NodeName]
retry.Run(t, func(r *retry.R) {
leader := findStatus(true)
for _, m := range leader.LANMembers() {
if m.Name == dead.config.NodeName && m.Status != serf.StatusLeft {
r.Fatalf("%v should be left, got %v", m.Name, m.Status.String())
}
}
})

delete(servers, dead.config.NodeName)
//Autopilot should not take this one into left
dead = findStatus(false)
dead.Shutdown()
<-closeMap[dead.config.NodeName]

retry.Run(t, func(r *retry.R) {
leader := findStatus(true)
for _, m := range leader.LANMembers() {
if m.Name == dead.config.NodeName && m.Status != serf.StatusFailed {
r.Fatalf("%v should be failed, got %v", m.Name, m.Status.String())
}
}
})

}
5 changes: 3 additions & 2 deletions agent/consul/operator_autopilot_endpoint_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ import (
"github.com/hashicorp/consul/agent/structs"
"github.com/hashicorp/consul/sdk/testutil/retry"
"github.com/hashicorp/consul/testrpc"
"github.com/hashicorp/net-rpc-msgpackrpc"
msgpackrpc "github.com/hashicorp/net-rpc-msgpackrpc"
"github.com/hashicorp/raft"
)

Expand Down Expand Up @@ -116,6 +116,7 @@ func TestOperator_Autopilot_SetConfiguration(t *testing.T) {
Datacenter: "dc1",
Config: autopilot.Config{
CleanupDeadServers: true,
MinQuorum: 3,
},
}
var reply *bool
Expand All @@ -130,7 +131,7 @@ func TestOperator_Autopilot_SetConfiguration(t *testing.T) {
if err != nil {
t.Fatal(err)
}
if !config.CleanupDeadServers {
if !config.CleanupDeadServers && config.MinQuorum != 3 {
t.Fatalf("bad: %#v", config)
}
}
Expand Down
2 changes: 2 additions & 0 deletions agent/operator_endpoint.go
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,7 @@ func (s *HTTPServer) OperatorAutopilotConfiguration(resp http.ResponseWriter, re
CleanupDeadServers: reply.CleanupDeadServers,
LastContactThreshold: api.NewReadableDuration(reply.LastContactThreshold),
MaxTrailingLogs: reply.MaxTrailingLogs,
MinQuorum: reply.MinQuorum,
ServerStabilizationTime: api.NewReadableDuration(reply.ServerStabilizationTime),
RedundancyZoneTag: reply.RedundancyZoneTag,
DisableUpgradeMigration: reply.DisableUpgradeMigration,
Expand All @@ -226,6 +227,7 @@ func (s *HTTPServer) OperatorAutopilotConfiguration(resp http.ResponseWriter, re
CleanupDeadServers: conf.CleanupDeadServers,
LastContactThreshold: conf.LastContactThreshold.Duration(),
MaxTrailingLogs: conf.MaxTrailingLogs,
MinQuorum: conf.MinQuorum,
ServerStabilizationTime: conf.ServerStabilizationTime.Duration(),
RedundancyZoneTag: conf.RedundancyZoneTag,
DisableUpgradeMigration: conf.DisableUpgradeMigration,
Expand Down
4 changes: 4 additions & 0 deletions api/operator_autopilot.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@ type AutopilotConfiguration struct {
// be behind before being considered unhealthy.
MaxTrailingLogs uint64

// MinQuorum sets the minimum number of servers allowed in a cluster before
// autopilot can prune dead servers.
MinQuorum uint

// ServerStabilizationTime is the minimum amount of time a server must be
// in a stable, healthy state before it can be added to the cluster. Only
// applicable with Raft protocol version 3 or higher.
Expand Down
1 change: 1 addition & 0 deletions command/operator/autopilot/get/operator_autopilot_get.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ func (c *cmd) Run(args []string) int {
c.UI.Output(fmt.Sprintf("CleanupDeadServers = %v", config.CleanupDeadServers))
c.UI.Output(fmt.Sprintf("LastContactThreshold = %v", config.LastContactThreshold.String()))
c.UI.Output(fmt.Sprintf("MaxTrailingLogs = %v", config.MaxTrailingLogs))
c.UI.Output(fmt.Sprintf("MinQuorum = %v", config.MinQuorum))
c.UI.Output(fmt.Sprintf("ServerStabilizationTime = %v", config.ServerStabilizationTime.String()))
c.UI.Output(fmt.Sprintf("RedundancyZoneTag = %q", config.RedundancyZoneTag))
c.UI.Output(fmt.Sprintf("DisableUpgradeMigration = %v", config.DisableUpgradeMigration))
Expand Down
5 changes: 5 additions & 0 deletions command/operator/autopilot/set/operator_autopilot_set.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ type cmd struct {
// flags
cleanupDeadServers flags.BoolValue
maxTrailingLogs flags.UintValue
minQuorum flags.UintValue
lastContactThreshold flags.DurationValue
serverStabilizationTime flags.DurationValue
redundancyZoneTag flags.StringValue
Expand All @@ -40,6 +41,9 @@ func (c *cmd) init() {
c.flags.Var(&c.maxTrailingLogs, "max-trailing-logs",
"Controls the maximum number of log entries that a server can trail the "+
"leader by before being considered unhealthy.")
c.flags.Var(&c.minQuorum, "min-quorum",
"Sets the minimum number of of servers required in a cluster before autopilot "+
"is allowed to prune dead servers.")
c.flags.Var(&c.lastContactThreshold, "last-contact-threshold",
"Controls the maximum amount of time a server can go without contact "+
"from the leader before being considered unhealthy. Must be a duration value "+
Expand Down Expand Up @@ -94,6 +98,7 @@ func (c *cmd) Run(args []string) int {
c.redundancyZoneTag.Merge(&conf.RedundancyZoneTag)
c.disableUpgradeMigration.Merge(&conf.DisableUpgradeMigration)
c.upgradeVersionTag.Merge(&conf.UpgradeVersionTag)
c.minQuorum.Merge(&conf.MinQuorum)

trailing := uint(conf.MaxTrailingLogs)
c.maxTrailingLogs.Merge(&trailing)
Expand Down
4 changes: 4 additions & 0 deletions command/operator/autopilot/set/operator_autopilot_set_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ func TestOperatorAutopilotSetConfigCommand(t *testing.T) {
"-max-trailing-logs=99",
"-last-contact-threshold=123ms",
"-server-stabilization-time=123ms",
"-min-quorum=3",
}

code := c.Run(args)
Expand Down Expand Up @@ -65,4 +66,7 @@ func TestOperatorAutopilotSetConfigCommand(t *testing.T) {
if reply.ServerStabilizationTime != 123*time.Millisecond {
t.Fatalf("bad: %#v", reply)
}
if reply.MinQuorum != 3 {
t.Fatalf("bad: %#v", reply)
}
}
4 changes: 4 additions & 0 deletions website/source/api/operator/autopilot.html.md
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,9 @@ The table below shows this endpoint's support for
- `MaxTrailingLogs` `(int: 250)` specifies the maximum number of log entries
that a server can trail the leader by before being considered unhealthy.

- `MinQuorum` `int: 0` - specifies the minimum number of servers needed before
Autopilot can prune dead servers.

- `ServerStabilizationTime` `(string: "10s")` - Specifies the minimum amount of
time a server must be stable in the 'healthy' state before being added to the
cluster. Only takes effect if all servers are running Raft protocol version 3
Expand All @@ -134,6 +137,7 @@ The table below shows this endpoint's support for
"CleanupDeadServers": true,
"LastContactThreshold": "200ms",
"MaxTrailingLogs": 250,
"MinQuorum": 3,
"ServerStabilizationTime": "10s",
"RedundancyZoneTag": "",
"DisableUpgradeMigration": false,
Expand Down
3 changes: 3 additions & 0 deletions website/source/docs/agent/options.html.md
Original file line number Diff line number Diff line change
Expand Up @@ -782,6 +782,9 @@ default will automatically work with some tooling.
the maximum number of log entries that a server can trail the leader by before being considered unhealthy. Defaults
to 250.

* <a name="min_quorum"></a><a href="#min_quorum">`min_quorum`</a> - Sets the minimum number of servers necessary in a cluster
before autopilot can prune dead servers. There is no default.

* <a name="server_stabilization_time"></a><a href="#server_stabilization_time">`server_stabilization_time`</a> -
Controls the minimum amount of time a server must be stable in the 'healthy' state before being added to the
cluster. Only takes effect if all servers are running Raft protocol version 3 or higher. Must be a duration value
Expand Down

0 comments on commit 5e1c6e9

Please sign in to comment.