Added Hoeffding tree and EFDT
Marco Heyden committed Feb 22, 2024
1 parent da38b49 commit 905a706
Showing 5 changed files with 339 additions and 1 deletion.
4 changes: 3 additions & 1 deletion src/capymoa/learner/classifier/__init__.py
@@ -1,3 +1,5 @@
from .classifiers import AdaptiveRandomForest, OnlineBagging, AdaptiveRandomForest
from .efdt import EFDT
from .hoeffding_tree import HoeffdingTree

__all__ = ["AdaptiveRandomForest", "OnlineBagging", "AdaptiveRandomForest"]
__all__ = ["AdaptiveRandomForest", "OnlineBagging", "AdaptiveRandomForest", "CapyEFDT", "HoeffdingTree"]
137 changes: 137 additions & 0 deletions src/capymoa/learner/classifier/efdt.py
@@ -0,0 +1,137 @@
from __future__ import annotations

import inspect

from capymoa.learner import MOAClassifier
import moa.classifiers.trees as moa_trees
from capymoa.stream import Schema


class EFDT(MOAClassifier):
"""Extremely Fast Decision Tree (EFDT) classifier.
Also referred to as the Hoeffding AnyTime Tree (HATT) classifier. In practice,
despite the name, EFDTs are typically slower than a vanilla Hoeffding Tree
to process data. The speed differences come from the mechanism of split
re-evaluation present in EFDT. Nonetheless, EFDT has theoretical properties
that ensure it converges faster than the vanilla Hoeffding Tree to the structure
that would be created by a batch decision tree model (such as Classification and
Regression Trees - CART). Keep in mind that such propositions hold when processing
a stationary data stream. When dealing with non-stationary data, EFDT is somewhat
robust to concept drifts as it continually revisits and updates its internal
decision tree structure. Still, in such cases, the Hoeffind Adaptive Tree might
be a better option, as it was specifically designed to handle non-stationarity.
Parameters
----------
grace_period
Number of instances a leaf should observe between split attempts.
min_samples_reevaluate
Number of instances a node should observe before reevaluating the best split.
split_criterion
Split criterion to use.<br/>
- 'gini' - Gini<br/>
- 'info_gain' - Information Gain<br/>
- 'hellinger' - Hellinger Distance<br/>
confidence
Significance level (delta) used to compute the Hoeffding bound. Values closer
to zero imply longer split decision delays.
tie_threshold
Threshold below which a split will be forced to break ties.
leaf_prediction
Prediction mechanism used at leaves.<br/>
- 'mc' - Majority Class<br/>
- 'nb' - Naive Bayes<br/>
- 'nba' - Naive Bayes Adaptive<br/>
nb_threshold
Number of instances a leaf should observe before allowing Naive Bayes.
numeric_attribute_observer
The Splitter or Attribute Observer (AO) used to monitor the class statistics of numeric
features and perform splits. Splitters are available in the `tree.splitter` module.
Different splitters are available for classification and regression tasks. Classification
and regression splitters can be distinguished by their property `is_target_class`.
This is an advanced option. Special care must be taken when choosing different splitters.
By default, a Gaussian-based observer is used if `numeric_attribute_observer` is `None`.
binary_split
If True, only allow binary splits.
min_branch_fraction
The minimum percentage of observed data required for branches resulting from split
candidates. To validate a split candidate, at least two resulting branches must have
a percentage of samples greater than `min_branch_fraction`. This criterion prevents
unnecessary splits when the majority of instances are concentrated in a single branch.
max_share_to_split
Only perform a split in a leaf if the proportion of elements in the majority class is
smaller than this parameter value. This parameter avoids performing splits when most
of the data belongs to a single class.
max_byte_size
The max size of the tree, in bytes.
memory_estimate_period
Interval (number of processed instances) between memory consumption checks.
stop_mem_management
If True, stop growing as soon as memory limit is hit.
remove_poor_attrs
If True, disable poor attributes to reduce memory usage.
disable_prepruning
If True, disable merit-based tree pre-pruning.
"""

def __init__(
self,
schema: Schema | None = None,
random_seed: int = 0,
grace_period: int = 200,
min_samples_reevaluate: int = 20,
split_criterion: str = "info_gain",
confidence: float = 1e-7,
tie_threshold: float = 0.05,
leaf_prediction: str = "nba",
nb_threshold: int = 0,
numeric_attribute_observer: str | None = None,
binary_split: bool = False,
min_branch_fraction: float = 0.01,
max_share_to_split: float = 0.99,
max_byte_size: float = 33554433,
memory_estimate_period: int = 1000000,
stop_mem_management: bool = False,
remove_poor_attrs: bool = False,
disable_prepruning: bool = True,
):
# Example configuration string:
# "trees.EFDT -R 2001 -m 33554433 -n FIMTDDNumericAttributeClassObserver -e 10003000 -g 201 -s GiniSplitCriterion -c 0.002 -t 0.051 -b -z -r -p -l NB -q 1"

mappings = {
"grace_period": "-g",
"min_samples_reevaluate": "-R",
"max_byte_size": "-m",
"numeric_attribute_observer": "-n",
"memory_estimate_period": "-e",
"split_criterion": "-s",
"confidence": "-c",
"tie_threshold": "-t",
"binary_split": "-b",
"stop_mem_management": "-z",
"remove_poor_attrs": "-r",
"disable_prepruning": "-p",
"leaf_prediction": "-l",
"nb_threshold": "-q"
}

config_str = ""
parameters = inspect.signature(self.__init__).parameters
for key in mappings:
if key not in parameters:
continue
this_parameter = parameters[key]
default_value = this_parameter.default
set_value = locals()[key]
is_bool = type(set_value) == bool
default_value = default_value if type(default_value) != bool else int(default_value)
set_value = set_value if type(set_value) != bool else int(set_value)
str_extension = f"{mappings[key]} {set_value if not is_bool else ''} "
config_str += str_extension if set_value != default_value else ""

super(EFDT, self).__init__(moa_learner=moa_trees.EFDT,
schema=schema,
CLI=config_str,
random_seed=random_seed)
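A minimal usage sketch for the new learner (not part of this commit): it assumes the prequential-style stream API used elsewhere in capymoa (`has_more_instances()`, `next_instance()`, `train()`, `predict()`) and an instance `y_index` attribute holding the true class index; those names are assumptions taken from other capymoa examples and the tests below.

from capymoa.datasets.datasets import ElectricityTiny
from capymoa.learner.classifier import EFDT

stream = ElectricityTiny()
learner = EFDT(schema=stream.schema, grace_period=200, leaf_prediction="nba")

correct = total = 0
while stream.has_more_instances():
    instance = stream.next_instance()
    # Test-then-train (prequential): predict before using the label for training.
    prediction = learner.predict(instance)          # assumed to return a class index
    learner.train(instance)
    correct += int(prediction == instance.y_index)  # y_index assumed: true class index
    total += 1

print(f"prequential accuracy: {correct / total:.3f}")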
120 changes: 120 additions & 0 deletions src/capymoa/learner/classifier/hoeffding_tree.py
@@ -0,0 +1,120 @@
from __future__ import annotations

import inspect

from capymoa.learner import MOAClassifier
import moa.classifiers.trees as moa_trees
from capymoa.stream import Schema


class HoeffdingTree(MOAClassifier):
"""Hoeffding Tree classifier.
Parameters
----------
grace_period
Number of instances a leaf should observe between split attempts.
split_criterion
Split criterion to use.<br/>
- 'gini' - Gini<br/>
- 'info_gain' - Information Gain<br/>
- 'hellinger' - Hellinger Distance<br/>
confidence
Significance level (delta) used to compute the Hoeffding bound. Values closer
to zero imply longer split decision delays.
tie_threshold
Threshold below which a split will be forced to break ties.
leaf_prediction
Prediction mechanism used at leaves.<br/>
- 'mc' - Majority Class<br/>
- 'nb' - Naive Bayes<br/>
- 'nba' - Naive Bayes Adaptive<br/>
nb_threshold
Number of instances a leaf should observe before allowing Naive Bayes.
numeric_attribute_observer
The Splitter or Attribute Observer (AO) used to monitor the class statistics of numeric
features and perform splits. Splitters are available in the `tree.splitter` module.
Different splitters are available for classification and regression tasks. Classification
and regression splitters can be distinguished by their property `is_target_class`.
This is an advanced option. Special care must be taken when choosing different splitters.
By default, a Gaussian-based observer is used if `numeric_attribute_observer` is `None`.
binary_split
If True, only allow binary splits.
min_branch_fraction
The minimum percentage of observed data required for branches resulting from split
candidates. To validate a split candidate, at least two resulting branches must have
a percentage of samples greater than `min_branch_fraction`. This criterion prevents
unnecessary splits when the majority of instances are concentrated in a single branch.
max_share_to_split
Only perform a split in a leaf if the proportion of elements in the majority class is
smaller than this parameter value. This parameter avoids performing splits when most
of the data belongs to a single class.
max_byte_size
The max size of the tree, in bytes.
memory_estimate_period
Interval (number of processed instances) between memory consumption checks.
stop_mem_management
If True, stop growing as soon as memory limit is hit.
remove_poor_attrs
If True, disable poor attributes to reduce memory usage.
disable_prepruning
If True, disable merit-based tree pre-pruning.
"""

def __init__(
self,
schema: Schema | None = None,
random_seed: int = 0,
grace_period: int = 200,
split_criterion: str = "info_gain",
confidence: float = 1e-7,
tie_threshold: float = 0.05,
leaf_prediction: str = "nba",
nb_threshold: int = 0,
numeric_attribute_observer: str | None = None,
binary_split: bool = False,
min_branch_fraction: float = 0.01,
max_share_to_split: float = 0.99,
max_byte_size: float = 33554433,
memory_estimate_period: int = 1000000,
stop_mem_management: bool = False,
remove_poor_attrs: bool = False,
disable_prepruning: bool = True,
):
# Example configuration string:
# "trees.EFDT -R 2001 -m 33554433 -n FIMTDDNumericAttributeClassObserver -e 10003000 -g 201 -s GiniSplitCriterion -c 0.002 -t 0.051 -b -z -r -p -l NB -q 1"

mappings = {
"grace_period": "-g",
"max_byte_size": "-m",
"numeric_attribute_observer": "-n",
"memory_estimate_period": "-e",
"split_criterion": "-s",
"confidence": "-c",
"tie_threshold": "-t",
"binary_split": "-b",
"stop_mem_management": "-z",
"remove_poor_attrs": "-r",
"disable_prepruning": "-p",
"leaf_prediction": "-l",
"nb_threshold": "-q"
}

config_str = ""
parameters = inspect.signature(self.__init__).parameters
for key in mappings:
if key not in parameters:
continue
this_parameter = parameters[key]
default_value = this_parameter.default
set_value = locals()[key]
is_bool = type(set_value) == bool
default_value = default_value if type(default_value) != bool else int(default_value)
set_value = set_value if type(set_value) != bool else int(set_value)
str_extension = f"{mappings[key]} {set_value if not is_bool else ''} "
config_str += str_extension if set_value != default_value else ""

super(HoeffdingTree, self).__init__(moa_learner=moa_trees.HoeffdingTree,
schema=schema,
CLI=config_str,
random_seed=random_seed)
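To clarify the option-mapping loop shared by both classes, here is a standalone sketch with hypothetical defaults and values; it mirrors the logic (skip defaults, bare flags for booleans) without depending on MOA or capymoa.

defaults = {"grace_period": 200, "confidence": 1e-7, "binary_split": False}
mappings = {"grace_period": "-g", "confidence": "-c", "binary_split": "-b"}
chosen   = {"grace_period": 201, "confidence": 1e-7, "binary_split": True}

config_str = ""
for key, flag in mappings.items():
    set_value, default_value = chosen[key], defaults[key]
    if set_value == default_value:
        continue                       # keep MOA's own default, emit nothing
    if isinstance(set_value, bool):
        config_str += f"{flag} "       # boolean options are bare flags
    else:
        config_str += f"{flag} {set_value} "

print(config_str)  # "-g 201 -b "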
40 changes: 40 additions & 0 deletions tests/test_EFDT.py
@@ -0,0 +1,40 @@
from capymoa.datasets.datasets import ElectricityTiny, CovtypeTiny
from capymoa.learner.classifier import EFDT
from test_utility.ssl_helpers import assert_ssl_evaluation
import pytest


@pytest.mark.parametrize(
"stream, expectation",
[
(ElectricityTiny(), 46.0),
(CovtypeTiny(), 46.0),
],
ids=["ElectricityTiny", "CovtypeTiny"]
)
def test_EFDT(stream, expectation):
# Non-default values are used for the hyperparameters to exercise the CLI option mapping
learner = EFDT(
schema=stream.schema,
grace_period=201,
min_samples_reevaluate=21,
# split_criterion="gini",
confidence=1e-3,
tie_threshold=0.055,
# leaf_prediction="mc",
nb_threshold=1,
# numeric_attribute_observer="FIMTDDNumericAttributeClassObserver",
binary_split=True,
min_branch_fraction=0.02,
max_share_to_split=0.98,
max_byte_size=33554434,
memory_estimate_period=1000001,
stop_mem_management=True,
remove_poor_attrs=True,
disable_prepruning=False,
)
assert_ssl_evaluation(
learner,
stream,
expectation,
)
39 changes: 39 additions & 0 deletions tests/test_HT.py
@@ -0,0 +1,39 @@
from capymoa.datasets.datasets import ElectricityTiny, CovtypeTiny
from capymoa.learner.classifier import HoeffdingTree
from test_utility.ssl_helpers import assert_ssl_evaluation
import pytest


@pytest.mark.parametrize(
"stream, expectation",
[
(ElectricityTiny(), 46.0),
(CovtypeTiny(), 46.0),
],
ids=["ElectricityTiny", "CovtypeTiny"]
)
def test_HT(stream, expectation):
# Non-default values are used for the hyperparameters to exercise the CLI option mapping
learner = HoeffdingTree(
schema=stream.schema,
grace_period=201,
# split_criterion="gini",
confidence=1e-3,
tie_threshold=0.055,
# leaf_prediction="mc",
nb_threshold=1,
# numeric_attribute_observer="FIMTDDNumericAttributeClassObserver",
binary_split=True,
min_branch_fraction=0.02,
max_share_to_split=0.98,
max_byte_size=33554434,
memory_estimate_period=1000001,
stop_mem_management=True,
remove_poor_attrs=True,
disable_prepruning=False,
)
assert_ssl_evaluation(
learner,
stream,
expectation,
)
