Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Persist check state for TTL checks #1009

Merged
merged 7 commits into from
Jun 8, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
97 changes: 95 additions & 2 deletions command/agent/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
"regexp"
"strconv"
"sync"
"time"

"github.com/hashicorp/consul/consul"
"github.com/hashicorp/consul/consul/structs"
Expand All @@ -23,7 +24,8 @@ const (
servicesDir = "services"

// Path to save local agent checks
checksDir = "checks"
checksDir = "checks"
checkStateDir = "checks/state"

// The ID of the faux health checks for maintenance mode
serviceMaintCheckPrefix = "_service_maintenance"
Expand Down Expand Up @@ -757,6 +759,13 @@ func (a *Agent) AddCheck(check *structs.HealthCheck, chkType *CheckType, persist
TTL: chkType.TTL,
Logger: a.logger,
}

// Restore persisted state, if any
if err := a.loadCheckState(check); err != nil {
a.logger.Printf("[WARN] agent: failed restoring state for check %q: %s",
check.CheckID, err)
}

ttl.Start()
a.checkTTLs[check.CheckID] = ttl

Expand Down Expand Up @@ -842,7 +851,12 @@ func (a *Agent) RemoveCheck(checkID string, persist bool) error {
delete(a.checkTTLs, checkID)
}
if persist {
return a.purgeCheck(checkID)
if err := a.purgeCheck(checkID); err != nil {
return err
}
if err := a.purgeCheckState(checkID); err != nil {
return err
}
}
log.Printf("[DEBUG] agent: removed check %q", checkID)
return nil
Expand All @@ -861,9 +875,88 @@ func (a *Agent) UpdateCheck(checkID, status, output string) error {

// Set the status through CheckTTL to reset the TTL
check.SetStatus(status, output)

// Always persist the state for TTL checks
if err := a.persistCheckState(check, status, output); err != nil {
return fmt.Errorf("failed persisting state for check %q: %s", checkID, err)
}

return nil
}

// persistCheckState is used to record the check status into the data dir.
// This allows the state to be restored on a later agent start. Currently
// only useful for TTL based checks.
//
// The state is JSON-encoded together with an expiration timestamp (the
// current time plus the check's TTL, as Unix seconds) and stored in the
// check state directory under the data dir, in a file named by
// stringHash(check.CheckID). The data is first written to a temporary file
// in the same directory and then renamed over the final path, so a crash
// or write error part-way through cannot leave a truncated state file at
// the path that loadCheckState reads.
//
// Returns an error if encoding, directory creation, or any file operation
// fails.
func (a *Agent) persistCheckState(check *CheckTTL, status, output string) error {
	// Create the persisted state
	state := persistedCheckState{
		CheckID: check.CheckID,
		Status:  status,
		Output:  output,
		Expires: time.Now().Add(check.TTL).Unix(),
	}

	// Encode the state
	buf, err := json.Marshal(state)
	if err != nil {
		return err
	}

	// Create the state dir if it doesn't exist
	dir := filepath.Join(a.config.DataDir, checkStateDir)
	if err := os.MkdirAll(dir, 0700); err != nil {
		return fmt.Errorf("failed creating check state dir %q: %s", dir, err)
	}

	// Write the state to a temporary file next to the final location. The
	// temp file lives in the same directory so the rename below never has
	// to cross a filesystem boundary. ioutil.TempFile creates the file
	// with mode 0600, matching the permissions of the final state file.
	file := filepath.Join(dir, stringHash(check.CheckID))
	tmp, err := ioutil.TempFile(dir, filepath.Base(file)+".tmp")
	if err != nil {
		return fmt.Errorf("failed creating temp file for %q: %s", file, err)
	}
	tmpName := tmp.Name()

	_, err = tmp.Write(buf)
	if closeErr := tmp.Close(); err == nil {
		// Only surface the close error if the write itself succeeded
		err = closeErr
	}
	if err != nil {
		os.Remove(tmpName) // best-effort cleanup of the partial temp file
		return fmt.Errorf("failed writing file %q: %s", tmpName, err)
	}

	// Move the fully written file into place
	if err := os.Rename(tmpName, file); err != nil {
		os.Remove(tmpName) // best-effort cleanup
		return fmt.Errorf("failed writing file %q: %s", file, err)
	}

	return nil
}

// loadCheckState is used to restore the persisted state of a check.
//
// The state file for check.CheckID is read from the check state directory
// under the data dir. If it exists, decodes cleanly, and has not yet
// expired, check.Output and check.Status are overwritten with the persisted
// values. Otherwise the check is left untouched:
//
//   - a missing state file is not an error;
//   - a state file that cannot be decoded is logged and purged, so a
//     corrupt file does not keep failing on every later registration;
//   - an expired state file is purged.
//
// Returns an error if the file exists but cannot be read, or if purging
// fails.
func (a *Agent) loadCheckState(check *structs.HealthCheck) error {
	// Try to read the persisted state for this check
	file := filepath.Join(a.config.DataDir, checkStateDir, stringHash(check.CheckID))
	buf, err := ioutil.ReadFile(file)
	if err != nil {
		if os.IsNotExist(err) {
			return nil
		}
		return fmt.Errorf("failed reading file %q: %s", file, err)
	}

	// Decode the state data. Undecodable state can never be restored, so
	// remove it rather than leaving it behind to fail again.
	var p persistedCheckState
	if err := json.Unmarshal(buf, &p); err != nil {
		a.logger.Printf("[WARN] agent: failed decoding check state for %q, purging: %s",
			check.CheckID, err)
		return a.purgeCheckState(check.CheckID)
	}

	// Check if the state has expired
	if time.Now().Unix() >= p.Expires {
		a.logger.Printf("[DEBUG] agent: check state expired for %q, not restoring", check.CheckID)
		return a.purgeCheckState(check.CheckID)
	}

	// Restore the fields from the state
	check.Output = p.Output
	check.Status = p.Status
	return nil
}

// purgeCheckState removes the persisted state file of the given check from
// the data dir. A state file that does not exist is treated as already
// purged and is not reported as an error.
func (a *Agent) purgeCheckState(checkID string) error {
	statePath := filepath.Join(a.config.DataDir, checkStateDir, stringHash(checkID))
	if err := os.Remove(statePath); err != nil && !os.IsNotExist(err) {
		return err
	}
	return nil
}

// Stats is used to get various debugging state from the sub-systems
func (a *Agent) Stats() map[string]map[string]string {
toString := func(v uint64) string {
Expand Down
186 changes: 186 additions & 0 deletions command/agent/agent_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -459,6 +459,49 @@ func TestAgent_AddCheck_MissingService(t *testing.T) {
}
}

// TestAgent_AddCheck_RestoreState verifies that registering a TTL check
// restores the status and output previously persisted for the same check ID.
func TestAgent_AddCheck_RestoreState(t *testing.T) {
	dir, agent := makeAgent(t, nextConfig())
	defer os.RemoveAll(dir)
	defer agent.Shutdown()

	// Seed the data dir with passing state for "baz" that is valid for a minute
	persisted := &CheckTTL{CheckID: "baz", TTL: time.Minute}
	if err := agent.persistCheckState(persisted, structs.HealthPassing, "yup"); err != nil {
		t.Fatalf("err: %s", err)
	}

	// Register a TTL check with the same ID and no explicit initial status
	definition := &structs.HealthCheck{
		Node:    "foo",
		CheckID: "baz",
		Name:    "baz check 1",
	}
	ttlType := &CheckType{TTL: time.Minute}
	if err := agent.AddCheck(definition, ttlType, false, ""); err != nil {
		t.Fatalf("err: %s", err)
	}

	// The registered check must carry the persisted status and output
	restored, found := agent.state.Checks()["baz"]
	if !found {
		t.Fatalf("missing check")
	}
	if restored.Status != structs.HealthPassing || restored.Output != "yup" {
		t.Fatalf("bad: %#v", restored)
	}
}

func TestAgent_RemoveCheck(t *testing.T) {
dir, agent := makeAgent(t, nextConfig())
defer os.RemoveAll(dir)
Expand Down Expand Up @@ -1349,3 +1392,146 @@ func TestAgent_loadChecks_checkFails(t *testing.T) {
t.Fatalf("should have purged check")
}
}

func TestAgent_persistCheckState(t *testing.T) {
config := nextConfig()
dir, agent := makeAgent(t, config)
defer os.RemoveAll(dir)
defer agent.Shutdown()

// Create the TTL check to persist
check := &CheckTTL{
CheckID: "check1",
TTL: 10 * time.Minute,
}

// Persist some check state for the check
err := agent.persistCheckState(check, structs.HealthCritical, "nope")
if err != nil {
t.Fatalf("err: %s", err)
}

// Check the persisted file exists and has the content
file := filepath.Join(agent.config.DataDir, checkStateDir, stringHash("check1"))
buf, err := ioutil.ReadFile(file)
if err != nil {
t.Fatalf("err: %s", err)
}

// Decode the state
var p persistedCheckState
if err := json.Unmarshal(buf, &p); err != nil {
t.Fatalf("err: %s", err)
}

// Check the fields
if p.CheckID != "check1" {
t.Fatalf("bad: %#v", p)
}
if p.Output != "nope" {
t.Fatalf("bad: %#v", p)
}
if p.Status != structs.HealthCritical {
t.Fatalf("bad: %#v", p)
}

// Check the expiration time was set
if p.Expires < time.Now().Unix() {
t.Fatalf("bad: %#v", p)
}
}

// TestAgent_loadCheckState verifies both outcomes of loading persisted
// check state: expired state is ignored and purged from disk, while
// unexpired state overwrites the check's status and output.
func TestAgent_loadCheckState(t *testing.T) {
	config := nextConfig()
	dir, agent := makeAgent(t, config)
	defer os.RemoveAll(dir)
	defer agent.Shutdown()

	// Create a check whose state will expire immediately
	check := &CheckTTL{
		CheckID: "check1",
		TTL:     0,
	}

	// Persist the check state
	err := agent.persistCheckState(check, structs.HealthPassing, "yup")
	if err != nil {
		t.Fatalf("err: %s", err)
	}

	// Try to load the state
	health := &structs.HealthCheck{
		CheckID: "check1",
		Status:  structs.HealthCritical,
	}
	if err := agent.loadCheckState(health); err != nil {
		t.Fatalf("err: %s", err)
	}

	// Should not have restored the status due to expiration
	if health.Status != structs.HealthCritical {
		t.Fatalf("bad: %#v", health)
	}
	if health.Output != "" {
		t.Fatalf("bad: %#v", health)
	}

	// Should have purged the state. The path must be built from
	// checkStateDir, which is where persistCheckState writes; looking
	// anywhere else would make this assertion pass without a purge.
	file := filepath.Join(agent.config.DataDir, checkStateDir, stringHash("check1"))
	if _, err := os.Stat(file); !os.IsNotExist(err) {
		t.Fatalf("should have purged state")
	}

	// Set a TTL which will not expire before we check it
	check.TTL = time.Minute
	err = agent.persistCheckState(check, structs.HealthPassing, "yup")
	if err != nil {
		t.Fatalf("err: %s", err)
	}

	// Try to load
	if err := agent.loadCheckState(health); err != nil {
		t.Fatalf("err: %s", err)
	}

	// Should have restored
	if health.Status != structs.HealthPassing {
		t.Fatalf("bad: %#v", health)
	}
	if health.Output != "yup" {
		t.Fatalf("bad: %#v", health)
	}
}

func TestAgent_purgeCheckState(t *testing.T) {
config := nextConfig()
dir, agent := makeAgent(t, config)
defer os.RemoveAll(dir)
defer agent.Shutdown()

// No error if the state does not exist
if err := agent.purgeCheckState("check1"); err != nil {
t.Fatalf("err: %s", err)
}

// Persist some state to the data dir
check := &CheckTTL{
CheckID: "check1",
TTL: time.Minute,
}
err := agent.persistCheckState(check, structs.HealthPassing, "yup")
if err != nil {
t.Fatalf("err: %s", err)
}

// Purge the check state
if err := agent.purgeCheckState("check1"); err != nil {
t.Fatalf("err: %s", err)
}

// Removed the file
file := filepath.Join(agent.config.DataDir, checkStateDir, stringHash("check1"))
if _, err := os.Stat(file); !os.IsNotExist(err) {
t.Fatalf("should have removed file")
}
}
11 changes: 11 additions & 0 deletions command/agent/check.go
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,17 @@ type persistedCheck struct {
Token string
}

// persistedCheckState is used to persist the current state of a given
// check. This is different from the check definition, and includes an
// expiration timestamp which is used to determine staleness on later
// agent restarts.
type persistedCheckState struct {
	// CheckID is the ID of the check this state belongs to.
	CheckID string

	// Output is the last output reported for the check.
	Output string

	// Status is the last status reported for the check.
	Status string

	// Expires is the Unix time, in seconds, at or after which this state
	// is considered expired and is no longer restored.
	Expires int64
}

// CheckHTTP is used to periodically make an HTTP request to
// determine the health of a given check.
// The check is passing if the response code is 2XX.
Expand Down
5 changes: 4 additions & 1 deletion website/source/docs/agent/checks.html.markdown
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,10 @@ There are three different kinds of checks:
set to the failed state. This mechanism, conceptually similar to a dead man's switch,
relies on the application to directly report its health. For example, a healthy app
can periodically `PUT` a status update to the HTTP endpoint; if the app fails, the TTL will
expire and the health check enters a critical state.
expire and the health check enters a critical state. TTL checks also persist
their last known status to disk. This allows the Consul agent to restore the
last known status of the check across restarts. Persisted check status remains
valid until the TTL, measured from the most recent status update, expires.

## Check Definition

Expand Down