Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support for maximum size for Output of checks #5233

Merged
merged 14 commits into from
Jun 26, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 27 additions & 14 deletions agent/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -915,6 +915,7 @@ func (a *Agent) consulConfig() (*consul.Config, error) {
base.CoordinateUpdateBatchSize = a.config.ConsulCoordinateUpdateBatchSize
base.CoordinateUpdateMaxBatches = a.config.ConsulCoordinateUpdateMaxBatches
base.CoordinateUpdatePeriod = a.config.ConsulCoordinateUpdatePeriod
base.CheckOutputMaxSize = a.config.CheckOutputMaxSize

base.RaftConfig.HeartbeatTimeout = a.config.ConsulRaftHeartbeatTimeout
base.RaftConfig.LeaderLeaseTimeout = a.config.ConsulRaftLeaderLeaseTimeout
Expand Down Expand Up @@ -971,6 +972,9 @@ func (a *Agent) consulConfig() (*consul.Config, error) {
if a.config.Bootstrap {
base.Bootstrap = true
}
if a.config.CheckOutputMaxSize > 0 {
base.CheckOutputMaxSize = a.config.CheckOutputMaxSize
}
if a.config.RejoinAfterLeave {
base.RejoinAfterLeave = true
}
Expand Down Expand Up @@ -2239,6 +2243,13 @@ func (a *Agent) addCheck(check *structs.HealthCheck, chkType *structs.CheckType,

// Check if already registered
if chkType != nil {
maxOutputSize := a.config.CheckOutputMaxSize
if maxOutputSize == 0 {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This may be unnecessary. How could a.config.CheckOutputMaxSize end up as 0? The config builder has a default value of 4096 and returns an error if the input is < 1.

maxOutputSize = checks.DefaultBufSize
}
if chkType.OutputMaxSize > 0 && maxOutputSize > chkType.OutputMaxSize {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As you mention in your description, this will only override the agent check if the check's OutputMaxSize is less than the agent's.

Could you expand more on why we shouldn't allow the check's max to override the agent's max?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@freddygv I wanted to be as conservative as possible (meaning, that for instance, if someone is storing those results in a DB and assumed size will never be more than 4k, we won't break this assumption), but allowing large infrastructure to reduce the size of their checks since it might be hugely impacting when checks are varying a lot (for instance, the normal result if checks if 'OK', while the broken checks returns large stack traces), ask @orarnon for instance who had this issue (we did had it as well on very large clusters in the past)

Increasing this value is trivial and could be done in another patch if someone is resquesting it, but if you really want, we can remove this limit in this PR as well.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@pierresouchay Thanks. Can you please expand on the need for the check-specific max output size? Would adding a max for the agent alone be sufficient?

The implementation would be simpler if we only set this max once. Additionally, there is a UX mismatch where someone can discard output at the agent level, but at the check level the max size can only be brought down to 1.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, the reason for that is serialization of payload: if the user specify 0, it means no value has been specified in payload (for instance for a user not setting OutputMaxSize in the check definition and thus preserving backward compatibility.
As you pointed out in a previous comment, if the user completely want to disable checks output, it can be done by disabling completely check output.

maxOutputSize = chkType.OutputMaxSize
}
switch {

case chkType.IsTTL():
Expand All @@ -2248,10 +2259,11 @@ func (a *Agent) addCheck(check *structs.HealthCheck, chkType *structs.CheckType,
}

ttl := &checks.CheckTTL{
Notify: a.State,
CheckID: check.CheckID,
TTL: chkType.TTL,
Logger: a.logger,
Notify: a.State,
CheckID: check.CheckID,
TTL: chkType.TTL,
Logger: a.logger,
OutputMaxSize: maxOutputSize,
}

// Restore persisted state, if any
Expand Down Expand Up @@ -2285,6 +2297,7 @@ func (a *Agent) addCheck(check *structs.HealthCheck, chkType *structs.CheckType,
Interval: chkType.Interval,
Timeout: chkType.Timeout,
Logger: a.logger,
OutputMaxSize: maxOutputSize,
TLSClientConfig: tlsClientConfig,
}
http.Start()
Expand Down Expand Up @@ -2352,7 +2365,7 @@ func (a *Agent) addCheck(check *structs.HealthCheck, chkType *structs.CheckType,
}

if a.dockerClient == nil {
dc, err := checks.NewDockerClient(os.Getenv("DOCKER_HOST"), checks.BufSize)
dc, err := checks.NewDockerClient(os.Getenv("DOCKER_HOST"), int64(maxOutputSize))
if err != nil {
a.logger.Printf("[ERR] agent: error creating docker client: %s", err)
return err
Expand Down Expand Up @@ -2387,14 +2400,14 @@ func (a *Agent) addCheck(check *structs.HealthCheck, chkType *structs.CheckType,
check.CheckID, checks.MinInterval)
chkType.Interval = checks.MinInterval
}

monitor := &checks.CheckMonitor{
Notify: a.State,
CheckID: check.CheckID,
ScriptArgs: chkType.ScriptArgs,
Interval: chkType.Interval,
Timeout: chkType.Timeout,
Logger: a.logger,
Notify: a.State,
CheckID: check.CheckID,
ScriptArgs: chkType.ScriptArgs,
Interval: chkType.Interval,
Timeout: chkType.Timeout,
Logger: a.logger,
OutputMaxSize: maxOutputSize,
}
monitor.Start()
a.checkMonitors[check.CheckID] = monitor
Expand Down Expand Up @@ -2866,7 +2879,7 @@ func (a *Agent) updateTTLCheck(checkID types.CheckID, status, output string) err
}

// Set the status through CheckTTL to reset the TTL.
check.SetStatus(status, output)
outputTruncated := check.SetStatus(status, output)

// We don't write any files in dev mode so bail here.
if a.config.DataDir == "" {
Expand All @@ -2875,7 +2888,7 @@ func (a *Agent) updateTTLCheck(checkID types.CheckID, status, output string) err

// Persist the state so the TTL check can come up in a good state after
// an agent restart, especially with long TTL values.
if err := a.persistCheckState(check, status, output); err != nil {
if err := a.persistCheckState(check, status, outputTruncated); err != nil {
return fmt.Errorf("failed persisting state for check %q: %s", checkID, err)
}

Expand Down
7 changes: 0 additions & 7 deletions agent/agent_endpoint.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ import (

"github.com/hashicorp/consul/acl"
cachetype "github.com/hashicorp/consul/agent/cache-types"
"github.com/hashicorp/consul/agent/checks"
"github.com/hashicorp/consul/agent/debug"
"github.com/hashicorp/consul/agent/local"
"github.com/hashicorp/consul/agent/structs"
Expand Down Expand Up @@ -715,12 +714,6 @@ func (s *HTTPServer) AgentCheckUpdate(resp http.ResponseWriter, req *http.Reques
return nil, nil
}

total := len(update.Output)
if total > checks.BufSize {
update.Output = fmt.Sprintf("%s ... (captured %d of %d bytes)",
update.Output[:checks.BufSize], checks.BufSize, total)
}

checkID := types.CheckID(strings.TrimPrefix(req.URL.Path, "/v1/agent/check/update/"))

// Get the provided token, if any, and vet against any ACL policies.
Expand Down
10 changes: 5 additions & 5 deletions agent/agent_endpoint_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ import (
"time"

"github.com/hashicorp/consul/acl"
"github.com/hashicorp/consul/agent/checks"
"github.com/hashicorp/consul/agent/config"
"github.com/hashicorp/consul/agent/connect"
"github.com/hashicorp/consul/agent/debug"
Expand Down Expand Up @@ -2299,7 +2298,8 @@ func TestAgent_FailCheck_ACLDeny(t *testing.T) {

func TestAgent_UpdateCheck(t *testing.T) {
t.Parallel()
a := NewTestAgent(t, t.Name(), "")
maxChecksSize := 256
a := NewTestAgent(t, t.Name(), fmt.Sprintf("check_output_max_size=%d", maxChecksSize))
defer a.Shutdown()
testrpc.WaitForTestAgent(t, a.RPC, "dc1")

Expand Down Expand Up @@ -2340,7 +2340,7 @@ func TestAgent_UpdateCheck(t *testing.T) {
t.Run("log output limit", func(t *testing.T) {
args := checkUpdate{
Status: api.HealthPassing,
Output: strings.Repeat("-= bad -=", 5*checks.BufSize),
Output: strings.Repeat("-= bad -=", 5*maxChecksSize),
}
req, _ := http.NewRequest("PUT", "/v1/agent/check/update/test", jsonReader(args))
resp := httptest.NewRecorder()
Expand All @@ -2359,8 +2359,8 @@ func TestAgent_UpdateCheck(t *testing.T) {
// rough check that the output buffer was cut down so this test
// isn't super brittle.
state := a.State.Checks()["test"]
if state.Status != api.HealthPassing || len(state.Output) > 2*checks.BufSize {
t.Fatalf("bad: %v", state)
if state.Status != api.HealthPassing || len(state.Output) > 2*maxChecksSize {
t.Fatalf("bad: %v, (len:=%d)", state, len(state.Output))
}
})

Expand Down
18 changes: 16 additions & 2 deletions agent/agent_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1405,15 +1405,16 @@ func TestAgent_updateTTLCheck(t *testing.T) {
t.Parallel()
a := NewTestAgent(t, t.Name(), "")
defer a.Shutdown()

checkBufSize := 100
health := &structs.HealthCheck{
Node: "foo",
CheckID: "mem",
Name: "memory util",
Status: api.HealthCritical,
}
chk := &structs.CheckType{
TTL: 15 * time.Second,
TTL: 15 * time.Second,
OutputMaxSize: checkBufSize,
}

// Add check and update it.
Expand All @@ -1433,6 +1434,19 @@ func TestAgent_updateTTLCheck(t *testing.T) {
if status.Output != "foo" {
t.Fatalf("bad: %v", status)
}

if err := a.updateTTLCheck("mem", api.HealthCritical, strings.Repeat("--bad-- ", 5*checkBufSize)); err != nil {
t.Fatalf("err: %v", err)
}

// Ensure we have a check mapping.
status = a.State.Checks()["mem"]
if status.Status != api.HealthCritical {
t.Fatalf("bad: %v", status)
}
if len(status.Output) > checkBufSize*2 {
t.Fatalf("bad: %v", len(status.Output))
}
}

func TestAgent_PersistService(t *testing.T) {
Expand Down
44 changes: 30 additions & 14 deletions agent/checks/check.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,10 @@ const (
// Otherwise we risk fork bombing a system.
MinInterval = time.Second

// BufSize is the maximum size of the captured
// check output. Prevents an enormous buffer
// DefaultBufSize is the maximum size of the captured
// check output by defaut. Prevents an enormous buffer
// from being captured
BufSize = 4 * 1024 // 4KB
DefaultBufSize = 4 * 1024 // 4KB

// UserAgent is the value of the User-Agent header
// for HTTP health checks.
Expand All @@ -56,13 +56,14 @@ type CheckNotifier interface {
// determine the health of a given check. It is compatible with
// nagios plugins and expects the output in the same format.
type CheckMonitor struct {
Notify CheckNotifier
CheckID types.CheckID
Script string
ScriptArgs []string
Interval time.Duration
Timeout time.Duration
Logger *log.Logger
Notify CheckNotifier
CheckID types.CheckID
Script string
ScriptArgs []string
Interval time.Duration
Timeout time.Duration
Logger *log.Logger
OutputMaxSize int

stop bool
stopCh chan struct{}
Expand Down Expand Up @@ -122,7 +123,7 @@ func (c *CheckMonitor) check() {
}

// Collect the output
output, _ := circbuf.NewBuffer(BufSize)
output, _ := circbuf.NewBuffer(int64(c.OutputMaxSize))
cmd.Stdout = output
cmd.Stderr = output
exec.SetSysProcAttr(cmd)
Expand Down Expand Up @@ -222,12 +223,17 @@ type CheckTTL struct {
stop bool
stopCh chan struct{}
stopLock sync.Mutex

OutputMaxSize int
}

// Start is used to start a check ttl, runs until Stop()
func (c *CheckTTL) Start() {
c.stopLock.Lock()
defer c.stopLock.Unlock()
if c.OutputMaxSize < 1 {
c.OutputMaxSize = DefaultBufSize
}
c.stop = false
c.stopCh = make(chan struct{})
c.timer = time.NewTimer(c.TTL)
Expand Down Expand Up @@ -275,16 +281,22 @@ func (c *CheckTTL) getExpiredOutput() string {

// SetStatus is used to update the status of the check,
// and to renew the TTL. If expired, TTL is restarted.
func (c *CheckTTL) SetStatus(status, output string) {
// output is returned (might be truncated)
func (c *CheckTTL) SetStatus(status, output string) string {
c.Logger.Printf("[DEBUG] agent: Check %q status is now %s", c.CheckID, status)
total := len(output)
if total > c.OutputMaxSize {
output = fmt.Sprintf("%s ... (captured %d of %d bytes)",
output[:c.OutputMaxSize], c.OutputMaxSize, total)
}
c.Notify.UpdateCheck(c.CheckID, status, output)

// Store the last output so we can retain it if the TTL expires.
c.lastOutputLock.Lock()
c.lastOutput = output
c.lastOutputLock.Unlock()

c.timer.Reset(c.TTL)
return output
}

// CheckHTTP is used to periodically make an HTTP request to
Expand All @@ -303,6 +315,7 @@ type CheckHTTP struct {
Timeout time.Duration
Logger *log.Logger
TLSClientConfig *tls.Config
OutputMaxSize int

httpClient *http.Client
stop bool
Expand Down Expand Up @@ -339,6 +352,9 @@ func (c *CheckHTTP) Start() {
} else if c.Interval < 10*time.Second {
c.httpClient.Timeout = c.Interval
}
if c.OutputMaxSize < 1 {
c.OutputMaxSize = DefaultBufSize
}
}

c.stop = false
Expand Down Expand Up @@ -413,7 +429,7 @@ func (c *CheckHTTP) check() {
defer resp.Body.Close()

// Read the response into a circular buffer to limit the size
output, _ := circbuf.NewBuffer(BufSize)
output, _ := circbuf.NewBuffer(int64(c.OutputMaxSize))
if _, err := io.Copy(output, resp.Body); err != nil {
c.Logger.Printf("[WARN] agent: Check %q error while reading body: %s", c.CheckID, err)
}
Expand Down
Loading