Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support for Grok expressions in Beats #5790

Closed
wants to merge 12 commits into from
67 changes: 67 additions & 0 deletions libbeat/docs/processors-using.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ The supported processors are:
* <<add-cloud-metadata,`add_cloud_metadata`>>
* <<add-locale,`add_locale`>>
* <<decode-json-fields,`decode_json_fields`>>
* <<grok, `grok`>>
* <<drop-event,`drop_event`>>
* <<drop-fields,`drop_fields`>>
* <<include-fields,`include_fields`>>
Expand Down Expand Up @@ -448,6 +449,72 @@ is treated as if the field was not set at all.
exist in the event are overwritten by keys from the decoded JSON object. The
default value is false.

[[grok]]
=== Grok
The `grok` processor extracts fields from a string field using a provided regular expression. It is similar to the grok processor in Elasticsearch
and Logstash (Go regular expressions are used instead of Ruby or Java, so there are minor differences). A Grok expression extracts fields using
the syntax `%{regular_expression_name:extracted_field_name}`. For instance, in order to extract the timestamp and IP address from

-------
"2012-03-04T22:33:01.003Z,127.0.0.1"
-------

one would use the Grok expression
------
%{TIMESTAMP_ISO8601:timestamp},%{IP:client_ip}?
------

One can also use named capturing groups directly in the expression, with the Go syntax `(?P<name>...)`.

The syntax of this processor is

[source,yaml]
-------
processors:
- grok:
field: "field_name"
patterns: ["pattern1", "pattern2", ...]
additional_pattern_definitions:
custom_pattern1: custom_pattern_definition
custom_pattern2: custom_pattern_definition2
when:
condition
-------

For example, in order to parse a Windows firewall pfirewall.log file, one can use the following configuration
[source,yaml]
-------
processors:
- grok:
field: "message"
patterns: ["%{TIMESTAMP_ISO8601:timestamp} %{WORD:action} %{WORD:protocol} (?:%{IP:source_ip}|[-]) (?:%{IP:destination_ip}|-) (?:%{INT:source_port}|-) (?:%{INT:destination_port}|-) (?:%{INT:size}|-) (?:-|%{WORD:tcp_flags}) (?:-|%{WORD:tcp_syn}) (?:-|%{WORD:tcp_ack}) (?:-|%{WORD:tcp_win}) (?:-|%{WORD:icmp_type}) (?:-|%{WORD:icmp_code}) (?:-|%{WORD:info}) (?:-|%{WORD:direction})"]
-------

that would turn the next event
-------
{"message": "2015-11-22 04:14:00 DROP TCP 10.31.42.53 10.0.0.1 52209 359 52 S 3190407656 0 8192 - - - RECEIVE" }
-------
into
-------
{
"message": "2015-11-22 04:14:00 DROP TCP 10.31.42.53 10.0.0.1 52209 359 52 S 3190407656 0 8192 - - - RECEIVE",
"timestamp": "2015-11-22 04:14:00",
"action": "DROP",
"protocol": "TCP",
"source_ip": "10.31.42.53",
"destination_ip": "10.0.0.1",
"source_port": "52209",
"destination_port": "359",
"size": "52",
"tcp_flags": "S",
"tcp_syn": "3190407656",
"tcp_ack": "0",
"tcp_win": "8192",
"direction": "RECEIVE"
}
-------


[[drop-event]]
=== Drop events

Expand Down
175 changes: 175 additions & 0 deletions libbeat/processors/actions/grok.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
package actions

import (
"fmt"
"regexp"

"github.com/elastic/beats/libbeat/beat"
"github.com/elastic/beats/libbeat/common"
"github.com/elastic/beats/libbeat/logp"
"github.com/elastic/beats/libbeat/processors"
)

// grok is the processor state: the event field to read and the compiled
// expressions that Run tries against that field's value.
type grok struct {
// Field is the key of the event value that Run reads.
Field string
// Patterns holds the compiled expressions; Run tries them in slice order
// and stops at the first one that matches.
Patterns []*regexp.Regexp
}

func init() {
processors.RegisterPlugin("grok",
configChecked(newGrok,
requireFields("field", "patterns"),
allowedFields("field", "patterns", "additional_pattern_definitions", "when")))
}

// newGrok builds a grok processor from its configuration.
//
// Each entry of "patterns" is expanded with grokExpandPattern (using
// "additional_pattern_definitions" as custom pattern names), anchored at the
// start of the input with '^' unless it already begins with one, and compiled.
// All patterns are processed so that every failure is logged; if any pattern
// fails to expand or compile, an error is returned and no processor is built.
func newGrok(c *common.Config) (processors.Processor, error) {
	type config struct {
		Field                        string            `config:"field"`
		Patterns                     []string          `config:"patterns"`
		AdditionalPatternDefinitions map[string]string `config:"additional_pattern_definitions"`
	}

	var myconfig config
	if err := c.Unpack(&myconfig); err != nil {
		logp.Warn("Error unpacking config for grok")
		return nil, fmt.Errorf("fail to unpack the grok configuration: %s", err)
	}

	regexps := make([]*regexp.Regexp, len(myconfig.Patterns))
	errInRegexps := false

	for i, pattern := range myconfig.Patterns {
		expandedPattern, err := grokExpandPattern(pattern, []string{}, myconfig.AdditionalPatternDefinitions)
		if err != nil {
			logp.Warn("Error compiling regular expression: `%s', %s", pattern, err)
			errInRegexps = true
			// On failure the expansion is the empty string; there is nothing
			// to compile, and indexing it would panic.
			continue
		}

		// Anchor at the beginning of the field value. The length check keeps
		// an empty pattern from causing an out-of-range index.
		patternStart := expandedPattern
		if len(expandedPattern) == 0 || expandedPattern[0] != '^' {
			patternStart = "^" + expandedPattern
		}

		regexps[i], err = regexp.Compile(patternStart)
		if err != nil {
			logp.Warn("Error compiling regular expression: `%s', %s", pattern, err)
			logp.Warn("Pattern exanded: %s", expandedPattern)
			errInRegexps = true
		}
	}
	if errInRegexps {
		return nil, fmt.Errorf("Error compiling regexps")
	}
	return grok{Field: myconfig.Field, Patterns: regexps}, nil
}

// Run applies the first matching pattern to the configured field.
//
// If the field is missing or is not a string, the event is returned
// untouched. Otherwise the patterns are tried in order; for the first one
// that matches, every named group that participated in the match is written
// to the event under the group's name. The event is always returned with a
// nil error.
func (g grok) Run(event *beat.Event) (*beat.Event, error) {
	raw, err := event.GetValue(g.Field)
	if err != nil {
		return event, nil
	}
	text, isString := raw.(string)
	if !isString {
		return event, nil
	}

	for _, re := range g.Patterns {
		loc := re.FindStringSubmatchIndex(text)
		if loc == nil {
			continue
		}
		for idx, name := range re.SubexpNames() {
			// Skip unnamed groups and groups that did not take part in the match.
			if len(name) == 0 || loc[2*idx] < 0 {
				continue
			}
			event.PutValue(name, text[loc[2*idx]:loc[2*idx+1]])
		}
		// Only the first matching pattern is applied.
		break
	}

	return event, nil
}

// String renders the processor as "grok={field:<Field>, patterns = [ p1, p2]}",
// listing the compiled expressions in order.
func (g grok) String() string {
	out := "grok={field:" + g.Field + ", patterns = [ "
	for idx, re := range g.Patterns {
		if idx != 0 {
			out += ", "
		}
		out += re.String()
	}
	return out + "]}"
}

// grokRegexp matches a grok reference of the form %{NAME} or %{NAME:field}.
// Group 1 is the pattern name, group 2 (optional) the target field name.
var grokRegexp = regexp.MustCompile(`%\{(\w+)(?::(\w+))?\}`)

// grokExpandPattern replaces every %{NAME} / %{NAME:field} reference in
// pattern with the definition found by grokSearchPattern. References with a
// field name become named capturing groups, the others non-capturing groups.
// Text between references is copied verbatim. A pattern without references
// is returned unchanged.
//
// knownGrokNames and customPatterns are passed through to grokSearchPattern.
// All references are resolved even after a failure so that every error is
// reported; if any failed, the result is "" and a single combined error.
func grokExpandPattern(pattern string, knownGrokNames []string, customPatterns map[string]string) (string, error) {
	refs := grokRegexp.FindAllStringSubmatchIndex(pattern, -1)
	if refs == nil {
		return pattern, nil
	}

	var (
		expanded []byte
		failures []error
	)
	cursor := 0
	for _, ref := range refs {
		definition, err := grokSearchPattern(pattern[ref[2]:ref[3]], knownGrokNames, customPatterns)
		if err != nil {
			failures = append(failures, err)
			continue
		}
		if len(failures) > 0 {
			// The output is discarded once an error occurred; keep going only
			// to collect the remaining errors.
			cursor = ref[1]
			continue
		}

		if len(ref) >= 6 && ref[4] >= 0 && ref[5] >= 0 {
			definition = namedMatch(definition, pattern[ref[4]:ref[5]])
		} else {
			definition = unNamedMatch(definition)
		}

		// Literal text between the previous reference and this one.
		if ref[0] > cursor {
			expanded = append(expanded, pattern[cursor:ref[0]]...)
		}
		expanded = append(expanded, definition...)
		cursor = ref[1]
	}

	if len(failures) != 0 {
		return "", fmt.Errorf("Error parsing grok pattern: %v", failures)
	}
	// Literal text after the last reference.
	if cursor < len(pattern) {
		expanded = append(expanded, pattern[cursor:]...)
	}
	return string(expanded), nil
}

// grokSearchPattern resolves patternName to its fully expanded definition.
//
// customPatterns is consulted first, then the built-in patterns. The name is
// rejected if it already appears in knownGrokNames (a pattern referring to
// itself, directly or indirectly) or if it is defined in neither table.
// The definition is expanded via grokExpandPattern with patternName added to
// the list of names in use.
func grokSearchPattern(patternName string, knownGrokNames []string, customPatterns map[string]string) (string, error) {
	for _, seen := range knownGrokNames {
		if seen == patternName {
			return "", fmt.Errorf("detected recursion in grok name '%s'", patternName)
		}
	}

	builtins := getGrokBuiltinPattern()
	definition, found := customPatterns[patternName]
	if !found {
		definition, found = builtins[patternName]
	}
	if !found {
		return "", fmt.Errorf("unknown grok name '%s'", patternName)
	}

	inUse := append(knownGrokNames, patternName)
	return grokExpandPattern(definition, inUse, customPatterns)
}

// namedMatch wraps pattern in a named capturing group called name.
func namedMatch(pattern string, name string) string {
	opening := "(?P<" + name + ">"
	return opening + pattern + ")"
}

// unNamedMatch wraps pattern in a non-capturing group.
func unNamedMatch(pattern string) string {
	const opening = "(?:"
	wrapped := opening + pattern
	return wrapped + ")"
}
124 changes: 124 additions & 0 deletions libbeat/processors/actions/grok_builin_patterns.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
package actions

import "sync"

// getGrokBuiltinPatternOnce returns a newly built table of the built-in grok
// patterns, keyed by pattern name. A definition may refer to other patterns
// with %{NAME} or %{NAME:field}.
//
// Go's regexp package does not support atomic groups "(?>", look-ahead
// "(?!" / "(?=" or look-behind "(?<!" / "(?<=". Definitions that used atomic
// groups (QUOTEDSTRING, WINPATH) use plain non-capturing groups instead, and
// the look-around assertions of IPV4 and TIME are dropped (the original
// expressions are kept as comments next to them).
func getGrokBuiltinPatternOnce() map[string]string {
	return map[string]string{

		"USERNAME": `[a-zA-Z0-9._-]+`,
		"USER": `%{USERNAME}`,
		"EMAILLOCALPART": `[a-zA-Z0-9_][a-zA-Z0-9_.+-=:]+`,
		"EMAILADDRESS": `%{EMAILLOCALPART}@%{HOSTNAME}`,
		"HTTPDUSER": `%{EMAILADDRESS}|%{USER}`,
		"INT": `(?:[+-]?(?:[0-9]+))`,
		"BASE10NUM": `(?:[+-]?(?:(?:[0-9]+(?:\.[0-9]+)?)|(?:\.[0-9]+)))`,
		"NUMBER": `(?:%{BASE10NUM})`,
		"BASE16NUM": `(?:[+-]?(?:0x)?(?:[0-9A-Fa-f]+))`,
		"BASE16FLOAT": `\b(?:[+-]?(?:0x)?(?:(?:[0-9A-Fa-f]+(?:\.[0-9A-Fa-f]*)?)|(?:\.[0-9A-Fa-f]+)))\b`,

		"POSINT": `\b(?:[1-9][0-9]*)\b`,
		"NONNEGINT": `\b(?:[0-9]+)\b`,
		"WORD": `\b\w+\b`,
		"NOTSPACE": `\S+`,
		"SPACE": `\s*`,
		"DATA": `.*?`,
		"GREEDYDATA": `.*`,
		// Upstream uses atomic groups "(?>" here; replaced by "(?:" because
		// Go's regexp rejects atomic groups.
		"QUOTEDSTRING": `(?:(?:"(?:\\.|[^\\"]+)+"|""|(?:'(?:\\.|[^\\']+)+')|''|(?:` + "`" + `(?:\\.|[^\\` + "`" + `]+)+` + "`" + `)|` + "``" + `))`,
		"UUID": `[A-Fa-f0-9]{8}-(?:[A-Fa-f0-9]{4}-){3}[A-Fa-f0-9]{12}`,

		// Networking
		"MAC": `(?:%{CISCOMAC}|%{WINDOWSMAC}|%{COMMONMAC})`,
		"CISCOMAC": `(?:(?:[A-Fa-f0-9]{4}\.){2}[A-Fa-f0-9]{4})`,
		"WINDOWSMAC": `(?:(?:[A-Fa-f0-9]{2}-){5}[A-Fa-f0-9]{2})`,
		"COMMONMAC": `(?:(?:[A-Fa-f0-9]{2}:){5}[A-Fa-f0-9]{2})`,
		"IPV6": `((([0-9A-Fa-f]{1,4}:){7}([0-9A-Fa-f]{1,4}|:))|(([0-9A-Fa-f]{1,4}:){6}(:[0-9A-Fa-f]{1,4}|((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){5}(((:[0-9A-Fa-f]{1,4}){1,2})|:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){4}(((:[0-9A-Fa-f]{1,4}){1,3})|((:[0-9A-Fa-f]{1,4})?:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){3}(((:[0-9A-Fa-f]{1,4}){1,4})|((:[0-9A-Fa-f]{1,4}){0,2}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){2}(((:[0-9A-Fa-f]{1,4}){1,5})|((:[0-9A-Fa-f]{1,4}){0,3}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){1}(((:[0-9A-Fa-f]{1,4}){1,6})|((:[0-9A-Fa-f]{1,4}){0,4}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(:(((:[0-9A-Fa-f]{1,4}){1,7})|((:[0-9A-Fa-f]{1,4}){0,5}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:)))(%.+)?`,
		// Upstream definition, ending in a negative look-ahead that Go cannot compile:
		// "IPV4": `(?:(?:[0-1]?[0-9]{1,2}|2[0-4][0-9]|25[0-5])[.](?:[0-1]?[0-9]{1,2}|2[0-4][0-9]|25[0-5])[.](?:[0-1]?[0-9]{1,2}|2[0-4][0-9]|25[0-5])[.](?:[0-1]?[0-9]{1,2}|2[0-4][0-9]|25[0-5]))(?![0-9])`,
		"IPV4": `(?:(?:[0-1]?[0-9]{1,2}|2[0-4][0-9]|25[0-5])[.](?:[0-1]?[0-9]{1,2}|2[0-4][0-9]|25[0-5])[.](?:[0-1]?[0-9]{1,2}|2[0-4][0-9]|25[0-5])[.](?:[0-1]?[0-9]{1,2}|2[0-4][0-9]|25[0-5]))`,
		"IP": `(?:%{IPV6}|%{IPV4})`,
		"HOSTNAME": `\b(?:[0-9A-Za-z][0-9A-Za-z-]{0,62})(?:\.(?:[0-9A-Za-z][0-9A-Za-z-]{0,62}))*(\.?|\b)`,
		"IPORHOST": `(?:%{IP}|%{HOSTNAME})`,
		"HOSTPORT": `%{IPORHOST}:%{POSINT}`,

		// paths
		"PATH": `(?:%{UNIXPATH}|%{WINPATH})`,
		"UNIXPATH": `(/([\w_%!$@:.,~-]+|\\.)*)+`,
		"TTY": `(?:/dev/(pts|tty([pq])?)(\w+)?/?(?:[0-9]+))`,
		// Upstream starts with an atomic group "(?>"; replaced by "(?:"
		// because Go's regexp rejects atomic groups.
		"WINPATH": `(?:[A-Za-z]+:|\\)(?:\\[^\\?*]*)+`,
		"URIPROTO": `[A-Za-z]+(\+[A-Za-z+]+)?`,
		"URIHOST": `%{IPORHOST}(?::%{POSINT:port})?`,
		// uripath comes loosely from RFC1738, but mostly from what Firefox
		// doesn't turn into %XX
		"URIPATH": `(?:/[A-Za-z0-9$.+!*'(){},~:;=@#%_\-]*)+`,
		//URIPARAM \?(?:[A-Za-z0-9]+(?:=(?:[^&]*))?(?:&(?:[A-Za-z0-9]+(?:=(?:[^&]*))?)?)*)?
		"URIPARAM": `\?[A-Za-z0-9$.+!*'|(){},~@#%&/=:;_?\-\[\]<>]*`,
		"URIPATHPARAM": `%{URIPATH}(?:%{URIPARAM})?`,
		"URI": `%{URIPROTO}://(?:%{USER}(?::[^@]*)?@)?(?:%{URIHOST})?(?:%{URIPATHPARAM})?`,

		// Months: January, Feb, 3, 03, 12, December
		"MONTH": `\b(?:Jan(?:uary|uar)?|Feb(?:ruary|ruar)?|M(?:a|ä)?r(?:ch|z)?|Apr(?:il)?|Ma(?:y|i)?|Jun(?:e|i)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|O(?:c|k)?t(?:ober)?|Nov(?:ember)?|De(?:c|z)(?:ember)?)\b`,
		"MONTHNUM": `(?:0?[1-9]|1[0-2])`,
		"MONTHNUM2": `(?:0[1-9]|1[0-2])`,
		"MONTHDAY": `(?:(?:0[1-9])|(?:[12][0-9])|(?:3[01])|[1-9])`,

		// Days: Monday, Tue, Thu, etc...
		"DAY": `(?:Mon(?:day)?|Tue(?:sday)?|Wed(?:nesday)?|Thu(?:rsday)?|Fri(?:day)?|Sat(?:urday)?|Sun(?:day)?)`,

		// Years?
		"YEAR": `(?:\d\d){1,2}`,
		"HOUR": `(?:2[0123]|[01]?[0-9])`,
		"MINUTE": `(?:[0-5][0-9])`,
		// '60' is a leap second in most time standards and thus is valid.
		"SECOND": `(?:(?:[0-5]?[0-9]|60)(?:[:.,][0-9]+)?)`,
		// Upstream definition, with look-behind/look-ahead that Go cannot compile:
		// "TIME": `(?<![0-9])%{HOUR}:%{MINUTE}(?::%{SECOND})(?![0-9])`
		// Be careful about possible side effects of the removed look-arounds.
		"TIME": `%{HOUR}:%{MINUTE}(?::%{SECOND})`,
		// datestamp is YYYY/MM/DD-HH:MM:SS.UUUU (or something like it)
		"DATE_US": `%{MONTHNUM}[/-]%{MONTHDAY}[/-]%{YEAR}`,
		"DATE_EU": `%{MONTHDAY}[./-]%{MONTHNUM}[./-]%{YEAR}`,
		"ISO8601_TIMEZONE": `(?:Z|[+-]%{HOUR}(?::?%{MINUTE}))`,
		"ISO8601_SECOND": `(?:%{SECOND}|60)`,
		"ISO8601_HOUR": `(?:2[0123]|[01][0-9])`,
		"TIMESTAMP_ISO8601": `%{YEAR}-%{MONTHNUM}-%{MONTHDAY}[T ]%{ISO8601_HOUR}:?%{MINUTE}(?::?%{SECOND})?%{ISO8601_TIMEZONE}?`,
		"DATE": `%{DATE_US}|%{DATE_EU}`,
		"DATESTAMP": `%{DATE}[- ]%{TIME}`,
		"TZ": `(?:[PMCE][SD]T|UTC)`,
		"DATESTAMP_RFC822": `%{DAY} %{MONTH} %{MONTHDAY} %{YEAR} %{TIME} %{TZ}`,
		"DATESTAMP_RFC2822": `%{DAY}, %{MONTHDAY} %{MONTH} %{YEAR} %{TIME} %{ISO8601_TIMEZONE}`,
		"DATESTAMP_OTHER": `%{DAY} %{MONTH} %{MONTHDAY} %{TIME} %{TZ} %{YEAR}`,
		"DATESTAMP_EVENTLOG": `%{YEAR}%{MONTHNUM2}%{MONTHDAY}%{HOUR}%{MINUTE}%{SECOND}`,
		"HTTPDERROR_DATE": `%{DAY} %{MONTH} %{MONTHDAY} %{TIME} %{YEAR}`,

		// Syslog Dates: Month Day HH:MM:SS
		"SYSLOGTIMESTAMP": `%{MONTH} +%{MONTHDAY} %{TIME}`,
		"PROG": `[\x21-\x5a\x5c\x5e-\x7e]+`,
		"SYSLOGPROG": `%{PROG:program}(?:\[%{POSINT:pid}\])?`,
		"SYSLOGHOST": `%{IPORHOST}`,
		"SYSLOGFACILITY": `<%{NONNEGINT:facility}.%{NONNEGINT:priority}>`,
		"HTTPDATE": `%{MONTHDAY}/%{MONTH}/%{YEAR}:%{TIME} %{INT}`,

		// Shortcuts
		"QS": `%{QUOTEDSTRING}`,

		// Log formats
		"SYSLOGBASE": `%{SYSLOGTIMESTAMP:timestamp} (?:%{SYSLOGFACILITY} )?%{SYSLOGHOST:logsource} %{SYSLOGPROG}:`,
		"COMMONAPACHELOG": `%{IPORHOST:clientip} %{HTTPDUSER:ident} %{USER:auth} \[%{HTTPDATE:timestamp}\] "(?:%{WORD:verb} %{NOTSPACE:request}(?: HTTP/%{NUMBER:httpversion})?|%{DATA:rawrequest})" %{NUMBER:response} (?:%{NUMBER:bytes}|-)`,
		"COMBINEDAPACHELOG": `%{COMMONAPACHELOG} %{QS:referrer} %{QS:agent}`,
		"HTTPD20_ERRORLOG": `\[%{HTTPDERROR_DATE:timestamp}\] \[%{LOGLEVEL:loglevel}\] (?:\[client %{IPORHOST:clientip}\] ){0,1}%{GREEDYDATA:errormsg}`,
		"HTTPD24_ERRORLOG": `\[%{HTTPDERROR_DATE:timestamp}\] \[%{WORD:module}:%{LOGLEVEL:loglevel}\] \[pid %{POSINT:pid}:tid %{NUMBER:tid}\]( \(%{POSINT:proxy_errorcode}\)%{DATA:proxy_errormessage}:)?( \[client %{IPORHOST:client}:%{POSINT:clientport}\])? %{DATA:errorcode}: %{GREEDYDATA:message}`,
		"HTTPD_ERRORLOG": `%{HTTPD20_ERRORLOG}|%{HTTPD24_ERRORLOG}`,

		// Log Levels
		"LOGLEVEL": `([Aa]lert|ALERT|[Tt]race|TRACE|[Dd]ebug|DEBUG|[Nn]otice|NOTICE|[Ii]nfo|INFO|[Ww]arn?(?:ing)?|WARN?(?:ING)?|[Ee]rr?(?:or)?|ERR?(?:OR)?|[Cc]rit?(?:ical)?|CRIT?(?:ICAL)?|[Ff]atal|FATAL|[Ss]evere|SEVERE|EMERG(?:ENCY)?|[Ee]merg(?:ency)?)`,
	}
}

var (
	// grokBuiltinPatterns caches the built-in pattern table; it is written
	// exactly once, guarded by grokBuiltinPatternsOnce.
	grokBuiltinPatterns map[string]string
	// grokBuiltinPatternsOnce must live at package level: a sync.Once
	// declared inside the function would be new on every call and would
	// therefore rebuild the table each time.
	grokBuiltinPatternsOnce sync.Once
)

// getGrokBuiltinPattern returns the shared table of built-in grok patterns,
// building it on first use. Every call returns the same map, so callers must
// treat it as read-only.
func getGrokBuiltinPattern() map[string]string {
	grokBuiltinPatternsOnce.Do(func() { grokBuiltinPatterns = getGrokBuiltinPatternOnce() })
	return grokBuiltinPatterns
}
Loading