Add support for filebeat multiline handling
split processing into 3 layers:
- input layer
- line processing layer
- event publisher layer (for loop driver)

Input layer is responsible for reading files and
forwarding errors if appropriate.
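
A rough sketch of how such layers can compose (for orientation only; the type and
method names below are assumptions, not the identifiers introduced by this commit):

[source,go]
-------------------------------------------------------------------------------------
package reader

import "time"

// Line is one decoded log line plus the bookkeeping the layers pass along.
type Line struct {
	Ts      time.Time // time the line was read
	Content []byte    // decoded line content
	Bytes   int       // raw bytes consumed from the file
}

// LineReader is the contract each layer implements: the multiline layer
// wraps the plain line-processing layer, which wraps the file-reading
// input layer; the event publisher drives the outermost reader in a loop.
type LineReader interface {
	Next() (Line, error)
}
-------------------------------------------------------------------------------------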

new multiline system tests:
- elasticsearch log with java exception
- c-style log tests
- max_lines test
- max_bytes test
- timeout test

added asciidocs for multiline
urso committed Dec 29, 2015
1 parent eae32b3 commit 0d147f7
Showing 20 changed files with 1,249 additions and 246 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.asciidoc
@@ -42,6 +42,7 @@ https://github.com/elastic/beats/compare/1.0.0...master[Check the HEAD diff]
- group all cpu usage per core statistics and export them optionally if cpu_per_core is configured {pull}496[496]

*Filebeat*
- Add multiline support for combining multiple related lines into one event. {issue}461[461]

*Winlogbeat*

17 changes: 13 additions & 4 deletions filebeat/config/config.go
@@ -67,9 +67,19 @@ type HarvesterConfig struct {
BackoffFactor int `yaml:"backoff_factor"`
MaxBackoff string `yaml:"max_backoff"`
MaxBackoffDuration time.Duration
ForceCloseFiles bool `yaml:"force_close_files"`
ExcludeLines []string `yaml:"exclude_lines"`
IncludeLines []string `yaml:"include_lines"`
ForceCloseFiles bool `yaml:"force_close_files"`
ExcludeLines []string `yaml:"exclude_lines"`
IncludeLines []string `yaml:"include_lines"`
MaxBytes *int `yaml:"max_bytes"`
Multiline *MultilineConfig `yaml:"multiline"`
}

type MultilineConfig struct {
Pattern string `yaml:"pattern"`
Negate bool `yaml:"negate"`
Match string `yaml:"match"`
MaxLines *int `yaml:"max_lines"`
Timeout string `yaml:"timeout"`
}

const (
@@ -157,5 +167,4 @@ func (config *Config) FetchConfigs() {
if len(config.Filebeat.Prospectors) == 0 {
log.Fatalf("No paths given. What files do you want me to watch?")
}

}
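
A minimal sketch of how these new fields might be validated and turned into runtime
values; the `compileMultiline` helper below is hypothetical, not code from this commit:

[source,go]
-------------------------------------------------------------------------------------
package config

import (
	"fmt"
	"regexp"
	"time"
)

// compileMultiline checks a MultilineConfig and compiles its raw string
// fields into the values the reader layers would consume.
func compileMultiline(c *MultilineConfig) (*regexp.Regexp, time.Duration, error) {
	if c.Match != "after" && c.Match != "before" {
		return nil, 0, fmt.Errorf("multiline.match must be \"after\" or \"before\", got %q", c.Match)
	}
	pattern, err := regexp.Compile(c.Pattern)
	if err != nil {
		return nil, 0, err
	}
	timeout := 5 * time.Second // documented default
	if c.Timeout != "" {
		if timeout, err = time.ParseDuration(c.Timeout); err != nil {
			return nil, 0, err
		}
	}
	return pattern, timeout, nil
}
-------------------------------------------------------------------------------------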
4 changes: 3 additions & 1 deletion filebeat/crawler/prospector.go
@@ -152,6 +152,8 @@ func (p *Prospector) logRun(spoolChan chan *input.FileEvent) {
// Seed last scan time
p.lastscan = time.Now()

logp.Debug("prospector", "exclude_files: %s", p.ProspectorConfig.ExcludeFiles)

// Now let's do one quick scan to pick up new files
for _, path := range p.ProspectorConfig.Paths {
p.scan(path, spoolChan)
@@ -242,7 +244,7 @@ func (p *Prospector) isFileExcluded(file string) bool {
func (p *Prospector) scan(path string, output chan *input.FileEvent) {

logp.Debug("prospector", "scan path %s", path)
logp.Debug("prospector", "exclude_files: %s", p.ProspectorConfig.ExcludeFiles)

// Evaluate the path as a wildcards/shell glob
matches, err := filepath.Glob(path)
if err != nil {
47 changes: 46 additions & 1 deletion filebeat/docs/configuration.asciidoc
@@ -93,7 +93,7 @@ If both `include_lines` and `exclude_lines` are defined, then include_lines is c

===== exclude_files

A list of regular expressions to match the files to be ignored. By default no file is excluded.
A list of regular expressions to match the files to be ignored. By default no file is excluded.

[source,yaml]
-------------------------------------------------------------------------------------
@@ -148,6 +148,51 @@ document. The default value is `log`.

The buffer size every harvester uses when fetching the file. The default is 16384.

===== max_bytes

Maximum number of bytes a single log event can have. All bytes after `max_bytes` are discarded and not sent.
This is especially useful for multiline log messages, which can get large. The default is 10MB (10485760).
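
For example, to cap events at 5MB instead of the default (an illustrative value):

[source,yaml]
-------------------------------------------------------------------------------------
max_bytes: 5242880
-------------------------------------------------------------------------------------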

===== multiline

Multiline can be used for log messages spanning multiple lines. This is common for Java stack traces
or C line continuations. The following example merges every line that does not start with a `[` into
the preceding line that does, so that, for example, a Java stack trace becomes a single event.

[source,yaml]
-------------------------------------------------------------------------------------
multiline:
  pattern: ^\[
  match: after
  negate: true
-------------------------------------------------------------------------------------

====== pattern

The regexp pattern that has to be matched. The example pattern matches all lines starting with `[`.

====== negate

Defines whether the match of the pattern set under `pattern` is negated. The default is `false`.

====== match

Match must be set to `after` or `before`. With `after`, consecutive lines that match the pattern
(or that do not match it, when `negate` is set) are appended to the line before them; with `before`,
such lines are prepended to the line that follows them.

NOTE: `after` is equivalent to `previous` and `before` is equivalent to `next` in https://www.elastic.co/guide/en/logstash/current/plugins-codecs-multiline.html[Logstash].
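
For example, to join C-style continuation lines (a trailing backslash) to the line that follows
them, `before` can be used. A sketch; the pattern must be adapted to the actual logs:

[source,yaml]
-------------------------------------------------------------------------------------
multiline:
  pattern: '\\$'
  match: before
  negate: false
-------------------------------------------------------------------------------------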

====== max_lines

The maximum number of lines that are combined into one event. If there are more than `max_lines`,
the additional lines are discarded. The default is 500.

====== timeout

After the defined timeout, a multiline event is sent even if no new line matching the pattern was
found to start a new event. The default is 5s.
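
Putting the options together, a configuration for the Java stack trace case with explicit limits
might look like this (the limits shown are the defaults):

[source,yaml]
-------------------------------------------------------------------------------------
multiline:
  pattern: ^\[
  negate: true
  match: after
  max_lines: 500
  timeout: 5s
-------------------------------------------------------------------------------------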


===== tail_files

33 changes: 31 additions & 2 deletions filebeat/etc/beat.yml
@@ -32,11 +32,11 @@ filebeat:

# Exclude lines. A list of regular expressions to match. It drops the lines that are
# matching any regular expression from the list. The include_lines is called before
# exclude_lines. By default, no lines are dropped.
# exclude_lines. By default, no lines are dropped.
# exclude_lines: ["^DBG"]

# Include lines. A list of regular expressions to match. It exports the lines that are
# matching any regular expression from the list. The include_lines is called before
# matching any regular expression from the list. The include_lines is called before
# exclude_lines. By default, all the lines are exported.
# include_lines: ["^ERR", "^WARN"]

@@ -73,6 +73,35 @@ filebeat:
# Defines the buffer size every harvester uses when fetching the file
#harvester_buffer_size: 16384

# Maximum number of bytes a single log event can have
# All bytes after max_bytes are discarded and not sent. The default is 10MB.
# This is especially useful for multiline log messages which can get large.
#max_bytes: 10485760

# Multiline can be used for log messages spanning multiple lines. This is common
# for Java stack traces or C line continuations
#multiline:

# The regexp pattern that has to be matched. The example pattern matches all lines starting with [
#pattern: ^\[

# Defines whether the pattern set under pattern should be negated. Default is false.
#negate: false

# Match can be set to "after" or "before". It is used to define if lines should be appended to a pattern
# that was (not) matched before or after, or as long as a pattern is not matched, based on negate.
# Note: After is equivalent to previous and before is equivalent to next in Logstash
#match: after

# The maximum number of lines that are combined into one event.
# In case there are more than max_lines, the additional lines are discarded.
# Default is 500
#max_lines: 500

# After the defined timeout, a multiline event is sent even if no new pattern was found to start a new event
# Default is 5s.
#timeout: 5s

# Setting tail_files to true means filebeat starts reading new files at the end
# instead of the beginning. If this is used in combination with log rotation
# this can mean that the first entries of a new file are skipped.
33 changes: 31 additions & 2 deletions filebeat/etc/filebeat.yml
@@ -32,11 +32,11 @@ filebeat:

# Exclude lines. A list of regular expressions to match. It drops the lines that are
# matching any regular expression from the list. The include_lines is called before
# exclude_lines. By default, no lines are dropped.
# exclude_lines. By default, no lines are dropped.
# exclude_lines: ["^DBG"]

# Include lines. A list of regular expressions to match. It exports the lines that are
# matching any regular expression from the list. The include_lines is called before
# matching any regular expression from the list. The include_lines is called before
# exclude_lines. By default, all the lines are exported.
# include_lines: ["^ERR", "^WARN"]

@@ -73,6 +73,35 @@ filebeat:
# Defines the buffer size every harvester uses when fetching the file
#harvester_buffer_size: 16384

# Maximum number of bytes a single log event can have
# All bytes after max_bytes are discarded and not sent. The default is 10MB.
# This is especially useful for multiline log messages which can get large.
#max_bytes: 10485760

# Multiline can be used for log messages spanning multiple lines. This is common
# for Java stack traces or C line continuations
#multiline:

# The regexp pattern that has to be matched. The example pattern matches all lines starting with [
#pattern: ^\[

# Defines whether the pattern set under pattern should be negated. Default is false.
#negate: false

# Match can be set to "after" or "before". It is used to define if lines should be appended to a pattern
# that was (not) matched before or after, or as long as a pattern is not matched, based on negate.
# Note: After is equivalent to previous and before is equivalent to next in Logstash
#match: after

# The maximum number of lines that are combined into one event.
# In case there are more than max_lines, the additional lines are discarded.
# Default is 500
#max_lines: 500

# After the defined timeout, a multiline event is sent even if no new pattern was found to start a new event
# Default is 5s.
#timeout: 5s

# Setting tail_files to true means filebeat starts reading new files at the end
# instead of the beginning. If this is used in combination with log rotation
# this can mean that the first entries of a new file are skipped.
2 changes: 0 additions & 2 deletions filebeat/harvester/harvester.go
@@ -17,7 +17,6 @@ import (
"io"
"os"
"regexp"
"time"

"github.com/elastic/beats/filebeat/config"
"github.com/elastic/beats/filebeat/harvester/encoding"
@@ -33,7 +32,6 @@ type Harvester struct {
SpoolerChan chan *input.FileEvent
encoding encoding.EncodingFactory
file FileSource /* the file being watched */
backoff time.Duration
ExcludeLinesRegexp []*regexp.Regexp
IncludeLinesRegexp []*regexp.Regexp
}