diff --git a/CHANGELOG.asciidoc b/CHANGELOG.asciidoc index d6875cc5c2b..6b4ef7f0305 100644 --- a/CHANGELOG.asciidoc +++ b/CHANGELOG.asciidoc @@ -27,6 +27,7 @@ https://github.com/elastic/beats/compare/1.0.0...master[Check the HEAD diff] *Topbeat* *Filebeat* +- Add exclude_files configuration option {pull}563[563] *Winlogbeat* diff --git a/filebeat/config/config.go b/filebeat/config/config.go index 5174d8f793e..db45ea305b2 100644 --- a/filebeat/config/config.go +++ b/filebeat/config/config.go @@ -4,6 +4,7 @@ import ( "log" "os" "path/filepath" + "regexp" "time" "github.com/elastic/beats/libbeat/cfgfile" @@ -49,6 +50,8 @@ type ProspectorConfig struct { ScanFrequency string `yaml:"scan_frequency"` ScanFrequencyDuration time.Duration Harvester HarvesterConfig `yaml:",inline"` + ExcludeFiles []string `yaml:"exclude_files"` + ExcludeFilesRegexp []*regexp.Regexp } type HarvesterConfig struct { diff --git a/filebeat/crawler/prospector.go b/filebeat/crawler/prospector.go index b92eae18995..dfadc0d3f43 100644 --- a/filebeat/crawler/prospector.go +++ b/filebeat/crawler/prospector.go @@ -52,6 +52,10 @@ func (p *Prospector) setupProspectorConfig() error { if err != nil { return err } + config.ExcludeFilesRegexp, err = harvester.InitRegexps(config.ExcludeFiles) + if err != nil { + return err + } // Init File Stat list p.prospectorList = make(map[string]harvester.FileStat) @@ -219,11 +223,26 @@ func (p *Prospector) stdinRun(spoolChan chan *input.FileEvent) { } } +func (p *Prospector) isFileExcluded(file string) bool { + + config := &p.ProspectorConfig + + if len(config.ExcludeFilesRegexp) > 0 { + + if harvester.MatchAnyRegexps(config.ExcludeFilesRegexp, file) { + return true + } + } + + return false +} + // Scans the specific path which can be a glob (/**/**/*.log) // For all found files it is checked if a harvester should be started func (p *Prospector) scan(path string, output chan *input.FileEvent) { logp.Debug("prospector", "scan path %s", path) + logp.Debug("prospector", "exclude_files: %s", p.ProspectorConfig.ExcludeFiles) // Evaluate the path as a wildcards/shell glob matches, err := filepath.Glob(path) if err != nil { @@ -237,6 +256,12 @@ func (p *Prospector) scan(path string, output chan *input.FileEvent) { for _, file := range matches { logp.Debug("prospector", "Check file for harvesting: %s", file) + // check if the file is in the exclude_files list + if p.isFileExcluded(file) { + logp.Debug("prospector", "Exclude file: %s", file) + continue + } + // Stat the file, following any symlinks. fileinfo, err := os.Stat(file) diff --git a/filebeat/crawler/prospector_test.go b/filebeat/crawler/prospector_test.go index e186908b5e0..8ffe9a733f7 100644 --- a/filebeat/crawler/prospector_test.go +++ b/filebeat/crawler/prospector_test.go @@ -171,3 +171,23 @@ func TestProspectorInitInputTypeWrong(t *testing.T) { assert.Nil(t, err) assert.Equal(t, "log", prospector.ProspectorConfig.Harvester.InputType) } + +func TestProspectorFileExclude(t *testing.T) { + + prospectorConfig := config.ProspectorConfig{ + ExcludeFiles: []string{"\\.gz$"}, + Harvester: config.HarvesterConfig{ + BufferSize: 0, + }, + } + + prospector := Prospector{ + ProspectorConfig: prospectorConfig, + } + + prospector.Init() + + assert.True(t, prospector.isFileExcluded("/tmp/log/logw.gz")) + assert.False(t, prospector.isFileExcluded("/tmp/log/logw.log")) + +} diff --git a/filebeat/docs/configuration.asciidoc b/filebeat/docs/configuration.asciidoc index 6ee7c7b1156..521e1debe25 100644 --- a/filebeat/docs/configuration.asciidoc +++ b/filebeat/docs/configuration.asciidoc @@ -91,6 +91,16 @@ If both `include_lines` and `exclude_lines` are defined, then include_lines is c exclude_lines: ["^DBG"] ------------------------------------------------------------------------------------- +===== exclude_files + +A list of regular expressions to match the files to be ignored. By default no file is excluded. + +[source,yaml] +------------------------------------------------------------------------------------- + exclude_files: [".gz$"] +------------------------------------------------------------------------------------- +To ignore all the files with the `gz` extension. + [[configuration-fields]] ===== fields diff --git a/filebeat/etc/beat.yml b/filebeat/etc/beat.yml index de8de4cedc6..64821d7da29 100644 --- a/filebeat/etc/beat.yml +++ b/filebeat/etc/beat.yml @@ -40,6 +40,10 @@ filebeat: # exclude_lines. By default, all the lines are exported. # include_lines: ["^ERR", "^WARN"] + # Exclude files. A list of regular expressions to match. Filebeat drops the files that + # are matching any regular expression from the list. By default, no files are dropped. + # exclude_files: [".gz$"] + # Optional additional fields. These field can be freely picked # to add additional information to the crawled log files for filtering #fields: diff --git a/filebeat/etc/filebeat.yml b/filebeat/etc/filebeat.yml index 8139278c161..34c3dc5e824 100644 --- a/filebeat/etc/filebeat.yml +++ b/filebeat/etc/filebeat.yml @@ -40,6 +40,10 @@ filebeat: # exclude_lines. By default, all the lines are exported. # include_lines: ["^ERR", "^WARN"] + # Exclude files. A list of regular expressions to match. Filebeat drops the files that + # are matching any regular expression from the list. By default, no files are dropped. + # exclude_files: [".gz$"] + # Optional additional fields. These field can be freely picked # to add additional information to the crawled log files for filtering #fields: diff --git a/filebeat/harvester/log.go b/filebeat/harvester/log.go index ab1699eddca..e7efa22a3e4 100644 --- a/filebeat/harvester/log.go +++ b/filebeat/harvester/log.go @@ -5,7 +5,6 @@ import ( "fmt" "io" "os" - "regexp" "time" "github.com/elastic/beats/filebeat/config" @@ -332,35 +331,6 @@ func (h *Harvester) handleReadlineError(lastTimeRead time.Time, err error) error func (h *Harvester) Stop() { } -func InitRegexps(exprs []string) ([]*regexp.Regexp, error) { - - result := []*regexp.Regexp{} - - for _, exp := range exprs { - - rexp, err := regexp.CompilePOSIX(exp) - if err != nil { - logp.Err("Fail to compile the regexp %s: %s", exp, err) - return nil, err - } - result = append(result, rexp) - } - return result, nil -} - -func MatchAnyRegexps(regexps []*regexp.Regexp, text string) bool { - - for _, rexp := range regexps { - if rexp.MatchString(text) { - // drop line - return true - - } - } - - return false -} - const maxConsecutiveEmptyReads = 100 // timedReader keeps track of last time bytes have been read from underlying diff --git a/filebeat/harvester/util.go b/filebeat/harvester/util.go index 54b0c871588..2bc9d137f48 100644 --- a/filebeat/harvester/util.go +++ b/filebeat/harvester/util.go @@ -1,9 +1,11 @@ package harvester import ( + "regexp" + "time" + "github.com/elastic/beats/filebeat/harvester/encoding" "github.com/elastic/beats/libbeat/logp" - "time" ) // isLine checks if the given byte array is a line, means has a line ending \n @@ -60,3 +62,32 @@ func readlineString(bytes []byte, size int) (string, int, error) { s := string(bytes)[:len(bytes)-lineEndingChars(bytes)] return s, size, nil } + +// InitRegexps initializes a list of compiled regular expressions. +func InitRegexps(exprs []string) ([]*regexp.Regexp, error) { + + result := []*regexp.Regexp{} + + for _, exp := range exprs { + + rexp, err := regexp.CompilePOSIX(exp) + if err != nil { + logp.Err("Fail to compile the regexp %s: %s", exp, err) + return nil, err + } + result = append(result, rexp) + } + return result, nil +} + +// MatchAnyRegexps checks if the text matches any of the regular expressions +func MatchAnyRegexps(regexps []*regexp.Regexp, text string) bool { + + for _, rexp := range regexps { + if rexp.MatchString(text) { + return true + } + } + + return false +} diff --git a/filebeat/harvester/util_test.go b/filebeat/harvester/util_test.go new file mode 100644 index 00000000000..00364531db9 --- /dev/null +++ b/filebeat/harvester/util_test.go @@ -0,0 +1,19 @@ +package harvester + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestMatchAnyRegexps(t *testing.T) { + + patterns := []string{"\\.gz$"} + + regexps, err := InitRegexps(patterns) + + assert.Nil(t, err) + + assert.Equal(t, MatchAnyRegexps(regexps, "/var/log/log.gz"), true) + +} diff --git a/filebeat/tests/system/config/filebeat.yml.j2 b/filebeat/tests/system/config/filebeat.yml.j2 index f1df92b6b3f..69ea994aa65 100644 --- a/filebeat/tests/system/config/filebeat.yml.j2 +++ b/filebeat/tests/system/config/filebeat.yml.j2 @@ -29,7 +29,10 @@ filebeat: {% if exclude_lines %} exclude_lines: {{exclude_lines}} {% endif %} - + {% if exclude_files %} + exclude_files: {{exclude_files}} + {% endif %} + spool_size: idle_timeout: 0.1s registry_file: {{ fb.working_dir + '/' }}{{ registryFile|default(".filebeat")}} diff --git a/filebeat/tests/system/test_prospector.py b/filebeat/tests/system/test_prospector.py index dbdbfe6305e..efd862b649d 100644 --- a/filebeat/tests/system/test_prospector.py +++ b/filebeat/tests/system/test_prospector.py @@ -105,7 +105,6 @@ def test_stdin(self): lambda: self.output_has(lines=iterations1+iterations2), max_timeout=15) - proc.kill_and_wait() objs = self.read_output() @@ -140,3 +139,38 @@ def test_rotating_ignore_older_larger_write_rate(self): max_timeout=15) proc.kill_and_wait() + + def test_exclude_files(self): + + self.render_config_template( + path=os.path.abspath(self.working_dir) + "/log/*", + exclude_files=[".gz$"] + ) + os.mkdir(self.working_dir + "/log/") + + testfile = self.working_dir + "/log/test.gz" + file = open(testfile, 'w') + file.write("line in gz file\n") + file.close() + + testfile = self.working_dir + "/log/test.log" + file = open(testfile, 'w') + file.write("line in log file\n") + file.close() + + filebeat = self.start_filebeat() + + self.wait_until( + lambda: self.output_has(lines=1), + max_timeout=15) + + # TODO: Find better solution when filebeat did crawl the file + # Idea: Special flag to filebeat so that filebeat is only doing and + # crawl and then finishes + filebeat.kill_and_wait() + + output = self.read_output() + + # Check that output file has the same number of lines as the log file + assert 1 == len(output) + assert output[0]["message"] == "line in log file"