Skip to content

Commit

Permalink
docs
Browse files Browse the repository at this point in the history
  • Loading branch information
yuanbohan committed Jun 9, 2024
1 parent 0a47c64 commit 0a696f0
Show file tree
Hide file tree
Showing 3 changed files with 108 additions and 25 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ version = "0.1.1"
edition = "2021"
readme = "README.md"
description = "Rust port of elastic Grok processor"
repository = "https://github.com/GreptimeTeam/promql-parser"
repository = "https://github.com/yuanbohan/grok-rs"
authors = ["yuanbohan"]
keywords = ["grok", "elastic", "logstash", "ETL"]
license = "Apache-2.0"
Expand Down
16 changes: 14 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,16 @@
[![Version](https://img.shields.io/crates/v/grok-rs?label=grok-rs)](https://crates.io/crates/grok-rs)
[![codecov](https://codecov.io/gh/yuanbohan/grok-rs/graph/badge.svg?token=1T8WSFV6BX)](https://codecov.io/gh/yuanbohan/grok-rs)

# grok
# grok_rs

Rust port of Elastic Grok processor, inspired by [grok-go][grok-go] and [grok][grok]
the `grok_rs` is a rust port of Elastic Grok processor, inspired by [grok-go][grok-go] and [grok][grok]

## Usage

```toml
[dependencies]
grok-rs = "0.1"
```

## Example

Expand Down Expand Up @@ -92,10 +99,15 @@ the output is:
}
```

## Notice

`grok_rs` is based on [regex][regex] crate, so lacks several features that are not known how to implement efficiently. This includes, but is not limited to, look-around and backreferences. In exchange, all regex searches in this crate have worst case `O(m * n)` time complexity, where `m` is proportional to the size of the regex and `n` is proportional to the size of the string being searched.

## Elastic Grok compliance

This crate declares compatible with [elastic grok patterns v8.14.0][grok-patterns], which is tagged at 2024-06-05.

[grok-patterns]: https://github.com/elastic/elasticsearch/tree/v8.14.0/libs/grok/src/main/resources/patterns/ecs-v1
[grok-go]: https://github.com/elastic/go-grok
[grok]: https://github.com/daschl/grok
[regex]: https://docs.rs/regex/latest/regex
115 changes: 93 additions & 22 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,40 @@
//! grok-rs is a Rust implementation of the [Grok](https://www.elastic.co/guide/en/elasticsearch/reference/current/grok-processor.html)
//! pattern matching library.
//!
//! The captured group can be renamed based on the alias, and the value can be converted to the specified type.
//! The supported types are:
//! - int
//! - long
//! - float
//! - double
//! - bool
//!
//! # Usage
//!
//! Initiate a Grok instance with the default patterns, or add custom patterns,then compile the your pattern,
//! and parse the input string based on the pattern.
//!
//! ```
//! use std::collections::HashMap;
//! use grok_rs::{Grok, Value};
//!
//! let mut grok = Grok::default();
//! grok.add_pattern("NAME", r"[A-z0-9._-]+");
//! let pattern = grok.compile("%{NAME}", false).unwrap();
//! let expected: HashMap<String, Value> = [("NAME", "admin")]
//! .into_iter()
//! .map(|(k, v)| (k.to_string(), Value::String(v.to_string())))
//! .collect();
//!
//! assert_eq!(expected, pattern.parse("admin").unwrap());
//! assert_eq!(expected, pattern.parse("admin user").unwrap());
//! ```
use std::{
collections::HashMap,
fs::File,
io::{BufRead, BufReader},
};

use glob::glob;
use lazy_static::lazy_static;
use regex::Regex;

const MAX_RECURSION: i32 = 1024;
Expand All @@ -31,7 +60,7 @@ const GROK_PATTERN: &str = r"(?x)
fn load_patterns() -> HashMap<String, String> {
let mut patterns = HashMap::new();

for line in glob("src/patterns/*")
for line in glob::glob("src/patterns/*")
.unwrap()
.map(|e| File::open(e.unwrap()).unwrap())
.flat_map(|f| BufReader::new(f).lines())
Expand All @@ -47,7 +76,7 @@ fn load_patterns() -> HashMap<String, String> {
patterns
}

lazy_static! {
lazy_static::lazy_static! {
static ref GROK_REGEX: Regex = Regex::new(GROK_PATTERN).unwrap();
static ref DEFAULT_PATTERNS: HashMap<String, String> = load_patterns();
}
Expand All @@ -60,17 +89,42 @@ pub enum Value {
String(String),
}

type AliasType = (String, Option<String>);

#[derive(Debug)]
pub struct Pattern {
regex: Regex,
alias: HashMap<String, (String, Option<String>)>,
alias: HashMap<String, AliasType>,
}

impl Pattern {
fn new(regex: Regex, alias: HashMap<String, (String, Option<String>)>) -> Self {
fn new(regex: Regex, alias: HashMap<String, AliasType>) -> Self {
Self { regex, alias }
}

/// parse the input string based on the pattern, and rename the captured group based on alias.
/// - if type is specified, then the value will be converted to the specified type.
/// - if the type is not supported, then the value will be kept as string.
/// - if the value can't be converted to the specified type, then an error will be returned.
/// - if the value can't be captured, then an empty map will be returned.
///
/// # Example
/// ```
/// use std::collections::HashMap;
/// use grok_rs::{Grok, Value};
///
/// let grok = Grok::default();
/// let pattern = grok.compile("%{USERNAME}", false).unwrap();
/// let result = pattern.parse("admin admin@example.com").unwrap();
/// let expected = [("USERNAME", "admin")]
/// .into_iter()
/// .map(|(k, v)| (k.to_string(), Value::String(v.to_string())))
/// .collect::<HashMap<String, Value>>();
/// assert_eq!(expected, result);
///
/// let empty = pattern.parse("✅").unwrap();
/// assert!(empty.is_empty());
/// ```
pub fn parse(&self, s: &str) -> Result<HashMap<String, Value>, String> {
let mut map = HashMap::new();
let names = self.regex.capture_names().flatten().collect::<Vec<_>>();
Expand All @@ -86,19 +140,16 @@ impl Pattern {
match self.alias.get(name) {
Some((alias, type_)) => {
let value = match type_ {
Some(type_) => match type_.as_str() {
"int" | "long" => Value::Int(
value.parse::<i64>().map_err(|e| format!("{e}: {value}"))?,
),
"float" | "double" => Value::Float(
value.parse::<f64>().map_err(|e| format!("{e}: {value}"))?,
),
"bool" | "boolean" => Value::Bool(
value.parse::<bool>().map_err(|e| format!("{e}: {value}"))?,
),
_ => Value::String(value),
},
None => Value::String(value),
Some(type_) if type_.eq("int") || type_.eq("long") => Value::Int(
value.parse::<i64>().map_err(|e| format!("{e}: {value}"))?,
),
Some(type_) if type_.eq("float") || type_.eq("double") => Value::Float(
value.parse::<f64>().map_err(|e| format!("{e}: {value}"))?,
),
Some(type_) if type_.eq("bool") || type_.eq("boolean") => Value::Bool(
value.parse::<bool>().map_err(|e| format!("{e}: {value}"))?,
),
_ => Value::String(value),
};
map.insert(alias.clone(), value);
}
Expand All @@ -119,13 +170,33 @@ pub struct Grok {
}

impl Grok {
/// add a custom pattern, if the pattern is already defined, then it will be overwritten.
/// # Example
/// ```
/// use grok_rs::Grok;
///
/// let mut grok = Grok::default();
/// grok.add_pattern("NAME", r"[A-z0-9._-]+");
/// ```
pub fn add_pattern<T: Into<String>>(&mut self, name: T, pattern: T) {
self.patterns.insert(name.into(), pattern.into());
}

/// if named_capture_only is true, then pattern without alias won't be captured. e.g.
/// if pattern is "%{USERNAME} %{EMAILADDRESS:email}" and named_capture_only is true,
/// then only email will be captured.
/// Compile the pattern, and return a Pattern.
/// - if `named_capture_only` is true, then the unnamed capture group will be ignored.
/// - if the pattern is invalid or not found , then an error will be returned.
///
/// Due to the compile process is heavy, it's recommended compile the pattern once and reuse it.
///
/// # Example
///
/// the USERNAME will be ignored because `named_capture_only` is true.
///
/// ```
/// use grok_rs::Grok;
/// let grok = Grok::default();
/// let pattern = grok.compile("%{USERNAME} %{EMAILADDRESS:email}", true).unwrap();
/// ```
pub fn compile(&self, s: &str, named_capture_only: bool) -> Result<Pattern, String> {
let mut alias_map = HashMap::new();
let mut haystack = s.to_string();
Expand Down

0 comments on commit 0a696f0

Please sign in to comment.