From 924c657949a2cd7d76d33a5e10d0b7b15e1f4618 Mon Sep 17 00:00:00 2001 From: Andrew Kroh Date: Fri, 26 Jan 2018 13:55:06 -0500 Subject: [PATCH] Catch and log panics When a Beat exits due to panic the stack trace is written to stderr. For Beats running under service managers where the stdout/err are discarded this makes it problematic to debug (or even detect a panic). This commit modifies the beat runner to catch (aka `recover`) the panicking main goroutine, log it with logp, flush the logger, then exit. This ensures that the panic is present in the log output. Under normal circumstances (no error) the final log message from every Beat will be ` stopped.`. When there is a panic a l`evel=fatal` log line will be written that contains the panic message and a stack trace. For example: ``` { "level": "info", "timestamp": "2018-01-26T13:36:59.708-0500", "caller": "runtime/asm_amd64.s:510", "message": "auditbeat stopped." } { "level": "fatal", "timestamp": "2018-01-26T13:36:59.709-0500", "logger": "auditbeat", "caller": "instance/beat.go:134", "message": "Failed due to panic.", "panic": "Oh no, big problem!", "stack": "github.com/elastic/beats/libbeat/cmd/instance.Run.func1.1\n\t/Users/akroh/go/src/github.com/elastic/beats/libbeat/cmd/instance/beat.go:134\nruntime.call32\n\t/Users/akroh/.gvm/versions/go1.9.2.darwin.amd64/src/runtime/asm_amd64.s:509\nruntime.gopanic\n\t/Users/akroh/.gvm/versions/go1.9.2.darwin.amd64/src/runtime/panic.go:491\ngithub.com/elastic/beats/libbeat/cmd/instance.(*Beat).launch\n\t/Users/akroh/go/src/github.com/elastic/beats/libbeat/cmd/instance/beat.go:315\ngithub.com/elastic/beats/libbeat/cmd/instance.Run.func1\n\t/Users/akroh/go/src/github.com/elastic/beats/libbeat/cmd/instance/beat.go:142\ngithub.com/elastic/beats/libbeat/cmd/instance.Run\n\t/Users/akroh/go/src/github.com/elastic/beats/libbeat/cmd/instance/beat.go:143\ngithub.com/elastic/beats/libbeat/cmd.genRunCmd.func1\n\t/Users/akroh/go/src/github.com/elastic/beats/libbeat/cmd/run.go:19\ngithub.com/elastic/beats/vendor/github.com/spf13/cobra.(*Command).execute\n\t/Users/akroh/go/src/github.com/elastic/beats/vendor/github.com/spf13/cobra/command.go:704\ngithub.com/elastic/beats/vendor/github.com/spf13/cobra.(*Command).ExecuteC\n\t/Users/akroh/go/src/github.com/elastic/beats/vendor/github.com/spf13/cobra/command.go:785\ngithub.com/elastic/beats/vendor/github.com/spf13/cobra.(*Command).Execute\n\t/Users/akroh/go/src/github.com/elastic/beats/vendor/github.com/spf13/cobra/command.go:738\nmain.main\n\t/Users/akroh/go/src/github.com/elastic/beats/auditbeat/main.go:14\nruntime.main\n\t/Users/akroh/.gvm/versions/go1.9.2.darwin.amd64/src/runtime/proc.go:195" } ``` Log that panic object seperately --- CHANGELOG.asciidoc | 1 + libbeat/cmd/instance/beat.go | 9 ++++++++- libbeat/logp/core_test.go | 7 ++++--- libbeat/logp/global.go | 4 ++-- libbeat/logp/logger.go | 19 +++++++++++++++++++ 5 files changed, 34 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.asciidoc b/CHANGELOG.asciidoc index 497fc4f9d194..230619df5654 100644 --- a/CHANGELOG.asciidoc +++ b/CHANGELOG.asciidoc @@ -146,6 +146,7 @@ https://github.com/elastic/beats/compare/v6.0.0-beta2...master[Check the HEAD di - Add the abilility for the add_docker_metadata process to enrich based on process ID. {pull}6100[6100] - The `add_docker_metadata` and `add_kubernetes_metadata` processors are now GA, instead of Beta. {pull}6105[6105] - The node name can be discovered automatically by machine-id matching when beat deployed outside kubernetes cluster. {pull}6146[6146] +- Panics will be written to the logger before exiting. {pull}6199[6199] *Auditbeat* diff --git a/libbeat/cmd/instance/beat.go b/libbeat/cmd/instance/beat.go index 4621776fcbc5..c1df52914322 100644 --- a/libbeat/cmd/instance/beat.go +++ b/libbeat/cmd/instance/beat.go @@ -15,6 +15,7 @@ import ( "time" "github.com/satori/go.uuid" + "go.uber.org/zap" "github.com/elastic/beats/libbeat/api" "github.com/elastic/beats/libbeat/beat" @@ -128,6 +129,12 @@ func initRand() { // XXX Move this as a *Beat method? func Run(name, idxPrefix, version string, bt beat.Creator) error { return handleError(func() error { + defer func() { + if r := recover(); r != nil { + logp.NewLogger(name).Fatalw("Failed due to panic.", + "panic", r, zap.Stack("stack")) + } + }() b, err := NewBeat(name, idxPrefix, version) if err != nil { return err @@ -248,6 +255,7 @@ func (b *Beat) createBeater(bt beat.Creator) (beat.Beater, error) { func (b *Beat) launch(bt beat.Creator) error { defer logp.Sync() + defer logp.Info("%s stopped.", b.Info.Beat) err := b.Init() if err != nil { @@ -299,7 +307,6 @@ func (b *Beat) launch(bt beat.Creator) error { } logp.Info("%s start running.", b.Info.Beat) - defer logp.Info("%s stopped.", b.Info.Beat) if b.Config.HTTP.Enabled() { api.Start(b.Config.HTTP, b.Info) diff --git a/libbeat/logp/core_test.go b/libbeat/logp/core_test.go index f9fa1766b502..3e2950e15235 100644 --- a/libbeat/logp/core_test.go +++ b/libbeat/logp/core_test.go @@ -148,7 +148,7 @@ func TestLoggerLevel(t *testing.T) { } func TestRecover(t *testing.T) { - const recoveryExplanation = "Something went wrong." + const recoveryExplanation = "Something went wrong" const cause = "unexpected condition" DevelopmentSetup(ToObserverOutput()) @@ -160,8 +160,9 @@ func TestRecover(t *testing.T) { assert.Equal(t, zap.ErrorLevel, log.Level) assert.Equal(t, "logp/core_test.go", strings.Split(log.Caller.TrimmedPath(), ":")[0]) - assert.Contains(t, log.Message, recoveryExplanation) - assert.Contains(t, log.Message, cause) + assert.Contains(t, log.Message, recoveryExplanation+ + ". Recovering, but please report this.") + assert.Contains(t, log.ContextMap(), "panic") } }() diff --git a/libbeat/logp/global.go b/libbeat/logp/global.go index 068c532ca720..174810412027 100644 --- a/libbeat/logp/global.go +++ b/libbeat/logp/global.go @@ -83,8 +83,8 @@ func WTF(format string, v ...interface{}) { // Recover stops a panicking goroutine and logs an Error. func Recover(msg string) { if r := recover(); r != nil { - msg := fmt.Sprintf("%s. Recovering, but please report this: %s.", msg, r) + msg := fmt.Sprintf("%s. Recovering, but please report this.", msg) globalLogger().WithOptions(zap.AddCallerSkip(2)). - Error(msg, zap.Stack("stack")) + Error(msg, zap.Any("panic", r), zap.Stack("stack")) } } diff --git a/libbeat/logp/logger.go b/libbeat/logp/logger.go index 04cfc91a0ecd..9bdd2731b79c 100644 --- a/libbeat/logp/logger.go +++ b/libbeat/logp/logger.go @@ -51,6 +51,11 @@ func (l *Logger) Error(args ...interface{}) { l.sugar.Error(args...) } +// Fatal uses fmt.Sprint to construct and log a message, then calls os.Exit(1). +func (l *Logger) Fatal(args ...interface{}) { + l.sugar.Fatal(args...) +} + // Panic uses fmt.Sprint to construct and log a message, then panics. func (l *Logger) Panic(args ...interface{}) { l.sugar.Panic(args...) @@ -84,6 +89,11 @@ func (l *Logger) Errorf(format string, args ...interface{}) { l.sugar.Errorf(format, args...) } +// Fatalf uses fmt.Sprintf to log a templated message, then calls os.Exit(1). +func (l *Logger) Fatalf(format string, args ...interface{}) { + l.sugar.Fatalf(format, args...) +} + // Panicf uses fmt.Sprintf to log a templated message, then panics. func (l *Logger) Panicf(format string, args ...interface{}) { l.sugar.Panicf(format, args...) @@ -129,6 +139,15 @@ func (l *Logger) Errorw(msg string, keysAndValues ...interface{}) { l.sugar.Errorw(msg, keysAndValues...) } +// Fatalw logs a message with some additional context, then calls os.Exit(1). +// The additional context is added in the form of key-value pairs. The optimal +// way to write the value to the log message will be inferred by the value's +// type. To explicitly specify a type you can pass a Field such as +// logp.Stringer. +func (l *Logger) Fatalw(msg string, keysAndValues ...interface{}) { + l.sugar.Fatalw(msg, keysAndValues...) +} + // Panicw logs a message with some additional context, then panics. The // additional context is added in the form of key-value pairs. The optimal way // to write the value to the log message will be inferred by the value's type.