diff --git a/README.md b/README.md
index 41cebcb..76dfdfd 100644
--- a/README.md
+++ b/README.md
@@ -81,6 +81,8 @@ Available parameters are:
 - `[DRAIN]/sim_th` - similarity threshold (default 0.4)
 - `[DRAIN]/depth` - depth of all leaf nodes (default 4)
 - `[DRAIN]/max_children` - max number of children of an internal node (default 100)
+- `[DRAIN]/extra_delimiters` - delimiters to apply when splitting log message into words (in addition to whitespace) (default none).
+  Format is a Python list e.g. `['_', ':']`.
 - `[MASKING]/masking` - parameters masking - in json format (default "")
 - `[SNAPSHOT]/snapshot_interval_minutes` - time interval for new snapshots (default 1)
 - `[SNAPSHOT]/compress_state` - whether to compress the state before saving it. This can be useful when using Kafka persistence.
@@ -192,6 +194,9 @@ Our project welcomes external contributions. Please refer to [CONTRIBUTING.md](C
 
 ## Change Log
 
+##### v0.8.6
+* Added `extra_delimiters` configuration option to Drain
+
 ##### v0.8.5
 * Profiler improvements
 
diff --git a/drain3/drain.py b/drain3/drain.py
index af9077c..29542cb 100644
--- a/drain3/drain.py
+++ b/drain3/drain.py
@@ -32,13 +32,19 @@ def __init__(self, key, depth):
 
 
 class Drain:
-    def __init__(self, depth=4, sim_th=0.4, max_children=100, profiler: Profiler = NullProfiler()):
+    def __init__(self,
+                 depth=4,
+                 sim_th=0.4,
+                 max_children=100,
+                 extra_delimiters=(),
+                 profiler: Profiler = NullProfiler()):
         """
         Attributes
         ----------
             depth : depth of all leaf nodes (nodes that contain log clusters)
             sim_th : similarity threshold
             max_children : max number of children of an internal node
+            extra_delimiters: delimiters to apply when splitting log message into words (in addition to whitespace).
         """
         self.depth = depth - 2  # number of prefix tokens in each tree path (exclude root and leaf node)
         self.sim_th = sim_th
@@ -46,6 +52,7 @@ def __init__(self, depth=4, sim_th=0.4, max_children=100, profiler: Profiler = N
         self.clusters = []
         self.root_node = Node("(ROOT)", 0)
         self.profiler = profiler
+        self.extra_delimiters = extra_delimiters
 
     @staticmethod
     def has_numbers(s):
@@ -231,6 +238,9 @@ def num_to_cluster_id(num):
 
     def add_log_message(self, content: str):
         content = content.strip()
+        for delimiter in self.extra_delimiters:
+            content = content.replace(delimiter, " ")
+
         content_tokens = content.split()
 
         if self.profiler:
diff --git a/drain3/template_miner.py b/drain3/template_miner.py
index f901033..b0e7806 100644
--- a/drain3/template_miner.py
+++ b/drain3/template_miner.py
@@ -4,6 +4,7 @@
 Author_email: david.ohana@ibm.com, moshikh@il.ibm.com, eranra@il.ibm.com
 License     : MIT
 """
+import ast
 import base64
 import configparser
 import logging
@@ -37,10 +38,15 @@ def __init__(self, persistence_handler: PersistenceHandler = None):
         self.persistence_handler = persistence_handler
         self.snapshot_interval_seconds = self.config.getint('SNAPSHOT', 'snapshot_interval_minutes', fallback=5) * 60
         self.compress_state = self.config.getboolean('SNAPSHOT', 'compress_state', fallback=True)
+
+        extra_delimiters = self.config.get('DRAIN', 'extra_delimiters', fallback="[]")
+        extra_delimiters = ast.literal_eval(extra_delimiters)
+
         self.drain = Drain(
             sim_th=self.config.getfloat('DRAIN', 'sim_th', fallback=0.4),
             depth=self.config.getint('DRAIN', 'depth', fallback=4),
             max_children=self.config.getint('DRAIN', 'max_children', fallback=100),
+            extra_delimiters=extra_delimiters,
             profiler=self.profiler
         )
         self.masker = LogMasker(self.config)
diff --git a/examples/drain3.ini b/examples/drain3.ini
index e826179..cc99fed 100644
--- a/examples/drain3.ini
+++ b/examples/drain3.ini
@@ -17,7 +17,7 @@ masking = [
 sim_th = 0.4
 depth = 4
 max_children = 100
-
+extra_delimiters = ["_"]
 
 [PROFILING]
 enabled = True
diff --git a/setup.py b/setup.py
index 9ed2de8..fd842dd 100644
--- a/setup.py
+++ b/setup.py
@@ -9,7 +9,7 @@
 setup(
     name='drain3',
     packages=['drain3'],
-    version="0.8.5",
+    version="0.8.6",
     license='MIT',
     description="Persistent & streaming log template miner",
     long_description=long_description,
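
For context (not part of the diff): a minimal usage sketch of the new `extra_delimiters` option via the `Drain` class directly, assuming the package is installed with this patch applied. The log lines and the choice of "_" as a delimiter are illustrative only.

    from drain3.drain import Drain

    # Treat "_" as a word separator in addition to whitespace
    # (equivalent to `extra_delimiters = ["_"]` in drain3.ini, parsed via ast.literal_eval).
    drain = Drain(extra_delimiters=("_",))

    # "user_alice" / "user_bob" are now tokenized as ["user", "alice"] / ["user", "bob"],
    # so the differing name becomes a wildcard token instead of splitting the
    # messages into two clusters.
    drain.add_log_message("connected user_alice from 10.0.0.1")
    drain.add_log_message("connected user_bob from 10.0.0.2")

    for cluster in drain.clusters:
        print(cluster)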