Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cleans up check logging. #3683

Merged
merged 1 commit into from
Nov 10, 2017
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 23 additions & 30 deletions agent/checks/check.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,16 +103,13 @@ func (c *CheckMonitor) check() {
// Create the command
var cmd *osexec.Cmd
var err error
var cmdDisplay string
if len(c.ScriptArgs) > 0 {
cmdDisplay = fmt.Sprintf("%v", c.ScriptArgs)
cmd, err = exec.Subprocess(c.ScriptArgs)
} else {
cmdDisplay = c.Script
cmd, err = exec.Script(c.Script)
}
if err != nil {
c.Logger.Printf("[ERR] agent: failed to setup invoke '%s': %s", cmdDisplay, err)
c.Logger.Printf("[ERR] agent: Check %q failed to setup: %s", c.CheckID, err)
c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, err.Error())
return
}
Expand All @@ -129,14 +126,13 @@ func (c *CheckMonitor) check() {
outputStr = fmt.Sprintf("Captured %d of %d bytes\n...\n%s",
output.Size(), output.TotalWritten(), outputStr)
}
c.Logger.Printf("[DEBUG] agent: Check '%s' script '%s' output: %s",
c.CheckID, cmdDisplay, outputStr)
c.Logger.Printf("[TRACE] agent: Check %q output: %s", c.CheckID, outputStr)
return outputStr
}

// Start the check
if err := cmd.Start(); err != nil {
c.Logger.Printf("[ERR] agent: failed to invoke '%s': %s", cmdDisplay, err)
c.Logger.Printf("[ERR] agent: Check %q failed to invoke: %s", c.CheckID, err)
c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, err.Error())
return
}
Expand All @@ -154,11 +150,11 @@ func (c *CheckMonitor) check() {
select {
case <-time.After(timeout):
if err := exec.KillCommandSubtree(cmd); err != nil {
c.Logger.Printf("[WARN] Failed to kill check '%s' after timeout: %v", cmdDisplay, err)
c.Logger.Printf("[WARN] Check %q failed to kill after timeout: %s", c.CheckID, err)
}

msg := fmt.Sprintf("Timed out (%s) running check", timeout.String())
c.Logger.Printf("[WARN] %s '%s'", msg, cmdDisplay)
c.Logger.Printf("[WARN] Check %q: %s", c.CheckID, msg)

outputStr := truncateAndLogOutput()
if len(outputStr) > 0 {
Expand All @@ -178,7 +174,7 @@ func (c *CheckMonitor) check() {
// Check if the check passed
outputStr := truncateAndLogOutput()
if err == nil {
c.Logger.Printf("[DEBUG] agent: Check '%v' is passing", c.CheckID)
c.Logger.Printf("[DEBUG] agent: Check %q is passing", c.CheckID)
c.Notify.UpdateCheck(c.CheckID, api.HealthPassing, outputStr)
return
}
Expand All @@ -189,15 +185,15 @@ func (c *CheckMonitor) check() {
if status, ok := exitErr.Sys().(syscall.WaitStatus); ok {
code := status.ExitStatus()
if code == 1 {
c.Logger.Printf("[WARN] agent: Check '%v' is now warning", c.CheckID)
c.Logger.Printf("[WARN] agent: Check %q is now warning", c.CheckID)
c.Notify.UpdateCheck(c.CheckID, api.HealthWarning, outputStr)
return
}
}
}

// Set the health as critical
c.Logger.Printf("[WARN] agent: Check '%v' is now critical", c.CheckID)
c.Logger.Printf("[WARN] agent: Check %q is now critical", c.CheckID)
c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, outputStr)
}

Expand Down Expand Up @@ -247,7 +243,7 @@ func (c *CheckTTL) run() {
for {
select {
case <-c.timer.C:
c.Logger.Printf("[WARN] agent: Check '%v' missed TTL, is now critical",
c.Logger.Printf("[WARN] agent: Check %q missed TTL, is now critical",
c.CheckID)
c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, c.getExpiredOutput())

Expand All @@ -273,8 +269,7 @@ func (c *CheckTTL) getExpiredOutput() string {
// SetStatus is used to update the status of the check,
// and to renew the TTL. If expired, TTL is restarted.
func (c *CheckTTL) SetStatus(status, output string) {
c.Logger.Printf("[DEBUG] agent: Check '%v' status is now %v",
c.CheckID, status)
c.Logger.Printf("[DEBUG] agent: Check %q status is now %s", c.CheckID, status)
c.Notify.UpdateCheck(c.CheckID, status, output)

// Store the last output so we can retain it if the TTL expires.
Expand Down Expand Up @@ -358,7 +353,6 @@ func (c *CheckHTTP) Stop() {
func (c *CheckHTTP) run() {
// Get the randomized initial pause time
initialPauseTime := lib.RandomStagger(c.Interval)
c.Logger.Printf("[DEBUG] agent: pausing %v before first HTTP request of %s", initialPauseTime, c.HTTP)
next := time.After(initialPauseTime)
for {
select {
Expand All @@ -380,7 +374,7 @@ func (c *CheckHTTP) check() {

req, err := http.NewRequest(method, c.HTTP, nil)
if err != nil {
c.Logger.Printf("[WARN] agent: http request failed '%s': %s", c.HTTP, err)
c.Logger.Printf("[WARN] agent: Check %q HTTP request failed: %s", c.CheckID, err)
c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, err.Error())
return
}
Expand All @@ -405,7 +399,7 @@ func (c *CheckHTTP) check() {

resp, err := c.httpClient.Do(req)
if err != nil {
c.Logger.Printf("[WARN] agent: http request failed '%s': %s", c.HTTP, err)
c.Logger.Printf("[WARN] agent: Check %q HTTP request failed: %s", c.CheckID, err)
c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, err.Error())
return
}
Expand All @@ -414,27 +408,27 @@ func (c *CheckHTTP) check() {
// Read the response into a circular buffer to limit the size
output, _ := circbuf.NewBuffer(BufSize)
if _, err := io.Copy(output, resp.Body); err != nil {
c.Logger.Printf("[WARN] agent: Check '%v': Get error while reading body: %s", c.CheckID, err)
c.Logger.Printf("[WARN] agent: Check %q error while reading body: %s", c.CheckID, err)
}

// Format the response body
result := fmt.Sprintf("HTTP GET %s: %s Output: %s", c.HTTP, resp.Status, output.String())

if resp.StatusCode >= 200 && resp.StatusCode <= 299 {
// PASSING (2xx)
c.Logger.Printf("[DEBUG] agent: Check '%v' is passing", c.CheckID)
c.Logger.Printf("[DEBUG] agent: Check %q is passing", c.CheckID)
c.Notify.UpdateCheck(c.CheckID, api.HealthPassing, result)

} else if resp.StatusCode == 429 {
// WARNING
// 429 Too Many Requests (RFC 6585)
// The user has sent too many requests in a given amount of time.
c.Logger.Printf("[WARN] agent: Check '%v' is now warning", c.CheckID)
c.Logger.Printf("[WARN] agent: Check %q is now warning", c.CheckID)
c.Notify.UpdateCheck(c.CheckID, api.HealthWarning, result)

} else {
// CRITICAL
c.Logger.Printf("[WARN] agent: Check '%v' is now critical", c.CheckID)
c.Logger.Printf("[WARN] agent: Check %q is now critical", c.CheckID)
c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, result)
}
}
Expand Down Expand Up @@ -496,7 +490,6 @@ func (c *CheckTCP) Stop() {
func (c *CheckTCP) run() {
// Get the randomized initial pause time
initialPauseTime := lib.RandomStagger(c.Interval)
c.Logger.Printf("[DEBUG] agent: pausing %v before first socket connection of %s", initialPauseTime, c.TCP)
next := time.After(initialPauseTime)
for {
select {
Expand All @@ -513,12 +506,12 @@ func (c *CheckTCP) run() {
func (c *CheckTCP) check() {
conn, err := c.dialer.Dial(`tcp`, c.TCP)
if err != nil {
c.Logger.Printf("[WARN] agent: socket connection failed '%s': %s", c.TCP, err)
c.Logger.Printf("[WARN] agent: Check %q socket connection failed: %s", c.CheckID, err)
c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, err.Error())
return
}
conn.Close()
c.Logger.Printf("[DEBUG] agent: Check '%v' is passing", c.CheckID)
c.Logger.Printf("[DEBUG] agent: Check %q is passing", c.CheckID)
c.Notify.UpdateCheck(c.CheckID, api.HealthPassing, fmt.Sprintf("TCP connect %s: Success", c.TCP))
}

Expand Down Expand Up @@ -585,7 +578,7 @@ func (c *CheckDocker) check() {
var out string
status, b, err := c.doCheck()
if err != nil {
c.Logger.Printf("[DEBUG] agent: Check '%s': %s", c.CheckID, err)
c.Logger.Printf("[DEBUG] agent: Check %q: %s", c.CheckID, err)
out = err.Error()
} else {
// out is already limited to CheckBufSize since we're getting a
Expand All @@ -595,11 +588,11 @@ func (c *CheckDocker) check() {
if int(b.TotalWritten()) > len(out) {
out = fmt.Sprintf("Captured %d of %d bytes\n...\n%s", len(out), b.TotalWritten(), out)
}
c.Logger.Printf("[DEBUG] agent: Check '%s' script '%s' output: %s", c.CheckID, c.Script, out)
c.Logger.Printf("[TRACE] agent: Check %q output: %s", c.CheckID, out)
}

if status == api.HealthCritical {
c.Logger.Printf("[WARN] agent: Check '%v' is now critical", c.CheckID)
c.Logger.Printf("[WARN] agent: Check %q is now critical", c.CheckID)
}

c.Notify.UpdateCheck(c.CheckID, status, out)
Expand Down Expand Up @@ -632,10 +625,10 @@ func (c *CheckDocker) doCheck() (string, *circbuf.Buffer, error) {
case 0:
return api.HealthPassing, buf, nil
case 1:
c.Logger.Printf("[DEBUG] Check failed with exit code: %d", exitCode)
c.Logger.Printf("[DEBUG] Check %q failed with exit code: %d", c.CheckID, exitCode)
return api.HealthWarning, buf, nil
default:
c.Logger.Printf("[DEBUG] Check failed with exit code: %d", exitCode)
c.Logger.Printf("[DEBUG] Check %q failed with exit code: %d", c.CheckID, exitCode)
return api.HealthCritical, buf, nil
}
}