-
Notifications
You must be signed in to change notification settings - Fork 25
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Marco Heyden
committed
Feb 22, 2024
1 parent
da38b49
commit 905a706
Showing
5 changed files
with
339 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,5 @@ | ||
from .classifiers import AdaptiveRandomForest, OnlineBagging, AdaptiveRandomForest | ||
from .efdt import EFDT | ||
from .hoeffding_tree import HoeffdingTree | ||
|
||
__all__ = ["AdaptiveRandomForest", "OnlineBagging", "AdaptiveRandomForest"] | ||
# Public API of the package. Every entry must be a name imported above,
# otherwise `from <package> import *` raises AttributeError.
__all__ = ["AdaptiveRandomForest", "OnlineBagging", "EFDT", "HoeffdingTree"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,137 @@ | ||
from __future__ import annotations | ||
|
||
import inspect | ||
|
||
from capymoa.learner import MOAClassifier | ||
import moa.classifiers.trees as moa_trees | ||
from capymoa.stream import Schema | ||
|
||
|
||
class EFDT(MOAClassifier):
    """Extremely Fast Decision Tree (EFDT) classifier.

    Also referred to as the Hoeffding AnyTime Tree (HATT) classifier. In practice,
    despite the name, EFDTs are typically slower than a vanilla Hoeffding Tree
    to process data. The speed differences come from the mechanism of split
    re-evaluation present in EFDT. Nonetheless, EFDT has theoretical properties
    that ensure it converges faster than the vanilla Hoeffding Tree to the structure
    that would be created by a batch decision tree model (such as Classification and
    Regression Trees - CART). Keep in mind that such propositions hold when processing
    a stationary data stream. When dealing with non-stationary data, EFDT is somewhat
    robust to concept drifts as it continually revisits and updates its internal
    decision tree structure. Still, in such cases, the Hoeffding Adaptive Tree might
    be a better option, as it was specifically designed to handle non-stationarity.

    This class is a thin wrapper: it translates its keyword arguments into a MOA
    command-line configuration string and hands it to ``MOAClassifier`` together
    with ``moa.classifiers.trees.EFDT``.

    Parameters
    ----------
    schema
        Stream schema, forwarded unchanged to ``MOAClassifier``.
    random_seed
        Random seed, forwarded unchanged to ``MOAClassifier``.
    grace_period
        Number of instances a leaf should observe between split attempts (``-g``).
    min_samples_reevaluate
        Number of instances a node should observe before reevaluating the best
        split (``-R``).
    split_criterion
        Split criterion to use (``-s``). The value is forwarded verbatim and only
        when it differs from the default.
        NOTE(review): no name translation is done here; the example configuration
        below uses a MOA class name (``GiniSplitCriterion``) - confirm which
        spellings MOA accepts.
    confidence
        Significance level to calculate the Hoeffding bound (``-c``). The
        significance level is given by `1 - delta`. Values closer to zero imply
        longer split decision delays.
    tie_threshold
        Threshold below which a split will be forced to break ties (``-t``).
    leaf_prediction
        Prediction mechanism used at leaves (``-l``). The value is forwarded
        verbatim and only when it differs from the default.
        NOTE(review): the example configuration below uses ``NB`` - confirm which
        spellings MOA accepts.
    nb_threshold
        Number of instances a leaf should observe before allowing Naive Bayes
        (``-q``).
    numeric_attribute_observer
        Name of the numeric attribute observer passed to MOA (``-n``), e.g.
        ``"FIMTDDNumericAttributeClassObserver"``. ``None`` emits no option.
    binary_split
        If True, only allow binary splits (``-b``).
    min_branch_fraction
        The minimum percentage of observed data required for branches resulting
        from split candidates.
        NOTE: no MOA option is mapped to this parameter, so it currently has no
        effect.
    max_share_to_split
        Only perform a split in a leaf if the proportion of elements in the
        majority class is smaller than this parameter value.
        NOTE: no MOA option is mapped to this parameter, so it currently has no
        effect.
    max_byte_size
        The max size of the tree, in bytes (``-m``).
    memory_estimate_period
        Interval (number of processed instances) between memory consumption
        checks (``-e``).
    stop_mem_management
        If True, stop growing as soon as memory limit is hit (``-z``).
    remove_poor_attrs
        If True, disable poor attributes to reduce memory usage (``-r``).
    disable_prepruning
        If True, disable merit-based tree pre-pruning (``-p``).
    """

    def __init__(
        self,
        schema: Schema | None = None,
        random_seed: int = 0,
        grace_period: int = 200,
        min_samples_reevaluate: int = 20,
        split_criterion: str = "info_gain",
        confidence: float = 1e-7,
        tie_threshold: float = 0.05,
        leaf_prediction: str = "nba",
        nb_threshold: int = 0,
        numeric_attribute_observer: str | None = None,
        binary_split: bool = False,
        min_branch_fraction: float = 0.01,
        max_share_to_split: float = 0.99,
        max_byte_size: float = 33554433,
        memory_estimate_period: int = 1000000,
        stop_mem_management: bool = False,
        remove_poor_attrs: bool = False,
        disable_prepruning: bool = True,
    ):
        # Snapshot the call arguments before any other local name exists.
        arguments = dict(locals())

        # Example configuration string:
        # "trees.EFDT -R 2001 -m 33554433 -n FIMTDDNumericAttributeClassObserver -e 10003000 -g 201 -s GiniSplitCriterion -c 0.002 -t 0.051 -b -z -r -p -l NB -q 1"

        # Constructor argument -> MOA command-line flag.
        mappings = {
            "grace_period": "-g",
            "min_samples_reevaluate": "-R",
            "max_byte_size": "-m",
            "numeric_attribute_observer": "-n",
            "memory_estimate_period": "-e",
            "split_criterion": "-s",
            "confidence": "-c",
            "tie_threshold": "-t",
            "binary_split": "-b",
            "stop_mem_management": "-z",
            "remove_poor_attrs": "-r",
            "disable_prepruning": "-p",
            "leaf_prediction": "-l",
            "nb_threshold": "-q",
        }

        # Use this class's own signature so that a subclass overriding
        # __init__ cannot change which defaults are compared against.
        declared = inspect.signature(EFDT.__init__).parameters

        cli_parts = []
        for key, flag in mappings.items():
            value = arguments[key]
            if isinstance(value, bool):
                # A MOA flag carries no value: its presence switches the
                # option on. Emit it exactly when the argument is True,
                # regardless of what the Python-side default is.
                if value:
                    cli_parts.append(flag)
                continue
            # Valued options are only sent when the caller departs from the
            # declared default; otherwise MOA's own default stays in effect.
            if value != declared[key].default:
                cli_parts.append(f"{flag} {value}")
        config_str = " ".join(cli_parts)

        super().__init__(
            moa_learner=moa_trees.EFDT,
            schema=schema,
            CLI=config_str,
            random_seed=random_seed,
        )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,120 @@ | ||
from __future__ import annotations | ||
|
||
import inspect | ||
|
||
from capymoa.learner import MOAClassifier | ||
import moa.classifiers.trees as moa_trees | ||
from capymoa.stream import Schema | ||
|
||
|
||
class HoeffdingTree(MOAClassifier):
    """Hoeffding Tree classifier.

    This class is a thin wrapper: it translates its keyword arguments into a MOA
    command-line configuration string and hands it to ``MOAClassifier`` together
    with ``moa.classifiers.trees.HoeffdingTree``.

    Parameters
    ----------
    schema
        Stream schema, forwarded unchanged to ``MOAClassifier``.
    random_seed
        Random seed, forwarded unchanged to ``MOAClassifier``.
    grace_period
        Number of instances a leaf should observe between split attempts (``-g``).
    split_criterion
        Split criterion to use (``-s``). The value is forwarded verbatim and only
        when it differs from the default.
        NOTE(review): no name translation is done here; the example configuration
        below uses a MOA class name (``GiniSplitCriterion``) - confirm which
        spellings MOA accepts.
    confidence
        Significance level to calculate the Hoeffding bound (``-c``). The
        significance level is given by `1 - delta`. Values closer to zero imply
        longer split decision delays.
    tie_threshold
        Threshold below which a split will be forced to break ties (``-t``).
    leaf_prediction
        Prediction mechanism used at leaves (``-l``). The value is forwarded
        verbatim and only when it differs from the default.
        NOTE(review): the example configuration below uses ``NB`` - confirm which
        spellings MOA accepts.
    nb_threshold
        Number of instances a leaf should observe before allowing Naive Bayes
        (``-q``).
    numeric_attribute_observer
        Name of the numeric attribute observer passed to MOA (``-n``), e.g.
        ``"FIMTDDNumericAttributeClassObserver"``. ``None`` emits no option.
    binary_split
        If True, only allow binary splits (``-b``).
    min_branch_fraction
        The minimum percentage of observed data required for branches resulting
        from split candidates.
        NOTE: no MOA option is mapped to this parameter, so it currently has no
        effect.
    max_share_to_split
        Only perform a split in a leaf if the proportion of elements in the
        majority class is smaller than this parameter value.
        NOTE: no MOA option is mapped to this parameter, so it currently has no
        effect.
    max_byte_size
        The max size of the tree, in bytes (``-m``).
    memory_estimate_period
        Interval (number of processed instances) between memory consumption
        checks (``-e``).
    stop_mem_management
        If True, stop growing as soon as memory limit is hit (``-z``).
    remove_poor_attrs
        If True, disable poor attributes to reduce memory usage (``-r``).
    disable_prepruning
        If True, disable merit-based tree pre-pruning (``-p``).
    """

    def __init__(
        self,
        schema: Schema | None = None,
        random_seed: int = 0,
        grace_period: int = 200,
        split_criterion: str = "info_gain",
        confidence: float = 1e-7,
        tie_threshold: float = 0.05,
        leaf_prediction: str = "nba",
        nb_threshold: int = 0,
        numeric_attribute_observer: str | None = None,
        binary_split: bool = False,
        min_branch_fraction: float = 0.01,
        max_share_to_split: float = 0.99,
        max_byte_size: float = 33554433,
        memory_estimate_period: int = 1000000,
        stop_mem_management: bool = False,
        remove_poor_attrs: bool = False,
        disable_prepruning: bool = True,
    ):
        # Snapshot the call arguments before any other local name exists.
        arguments = dict(locals())

        # Example configuration string:
        # "trees.HoeffdingTree -m 33554433 -n FIMTDDNumericAttributeClassObserver -e 10003000 -g 201 -s GiniSplitCriterion -c 0.002 -t 0.051 -b -z -r -p -l NB -q 1"

        # Constructor argument -> MOA command-line flag.
        mappings = {
            "grace_period": "-g",
            "max_byte_size": "-m",
            "numeric_attribute_observer": "-n",
            "memory_estimate_period": "-e",
            "split_criterion": "-s",
            "confidence": "-c",
            "tie_threshold": "-t",
            "binary_split": "-b",
            "stop_mem_management": "-z",
            "remove_poor_attrs": "-r",
            "disable_prepruning": "-p",
            "leaf_prediction": "-l",
            "nb_threshold": "-q",
        }

        # Use this class's own signature so that a subclass overriding
        # __init__ cannot change which defaults are compared against.
        declared = inspect.signature(HoeffdingTree.__init__).parameters

        cli_parts = []
        for key, flag in mappings.items():
            value = arguments[key]
            if isinstance(value, bool):
                # A MOA flag carries no value: its presence switches the
                # option on. Emit it exactly when the argument is True,
                # regardless of what the Python-side default is.
                if value:
                    cli_parts.append(flag)
                continue
            # Valued options are only sent when the caller departs from the
            # declared default; otherwise MOA's own default stays in effect.
            if value != declared[key].default:
                cli_parts.append(f"{flag} {value}")
        config_str = " ".join(cli_parts)

        super().__init__(
            moa_learner=moa_trees.HoeffdingTree,
            schema=schema,
            CLI=config_str,
            random_seed=random_seed,
        )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
from capymoa.datasets.datasets import ElectricityTiny, CovtypeTiny | ||
from capymoa.learner.classifier import EFDT | ||
from test_utility.ssl_helpers import assert_ssl_evaluation | ||
import pytest | ||
|
||
|
||
@pytest.mark.parametrize(
    "stream, expectation",
    [
        pytest.param(ElectricityTiny(), 46.0, id="ElectricityTiny"),
        pytest.param(CovtypeTiny(), 46.0, id="CovtypeTiny"),
    ],
)
def test_EFDT(stream, expectation):
    """Build an EFDT with explicitly supplied options and evaluate it on ``stream``.

    ``expectation`` is handed to ``assert_ssl_evaluation`` as the reference
    value for the run (presumably an accuracy figure - confirm against the
    helper).
    """
    # split_criterion, leaf_prediction and numeric_attribute_observer are
    # deliberately not passed, so they keep their constructor defaults.
    options = {
        "grace_period": 201,
        "min_samples_reevaluate": 21,
        "confidence": 1e-3,
        "tie_threshold": 0.055,
        "nb_threshold": 1,
        "binary_split": True,
        "min_branch_fraction": 0.02,
        "max_share_to_split": 0.98,
        "max_byte_size": 33554434,
        "memory_estimate_period": 1000001,
        "stop_mem_management": True,
        "remove_poor_attrs": True,
        "disable_prepruning": False,
    }
    learner = EFDT(schema=stream.schema, **options)
    assert_ssl_evaluation(learner, stream, expectation)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
from capymoa.datasets.datasets import ElectricityTiny, CovtypeTiny | ||
from capymoa.learner.classifier import HoeffdingTree | ||
from test_utility.ssl_helpers import assert_ssl_evaluation | ||
import pytest | ||
|
||
|
||
@pytest.mark.parametrize(
    "stream, expectation",
    [
        pytest.param(ElectricityTiny(), 46.0, id="ElectricityTiny"),
        pytest.param(CovtypeTiny(), 46.0, id="CovtypeTiny"),
    ],
)
def test_HT(stream, expectation):
    """Build a HoeffdingTree with explicitly supplied options and evaluate it on ``stream``.

    ``expectation`` is handed to ``assert_ssl_evaluation`` as the reference
    value for the run (presumably an accuracy figure - confirm against the
    helper).
    """
    # split_criterion, leaf_prediction and numeric_attribute_observer are
    # deliberately not passed, so they keep their constructor defaults.
    options = {
        "grace_period": 201,
        "confidence": 1e-3,
        "tie_threshold": 0.055,
        "nb_threshold": 1,
        "binary_split": True,
        "min_branch_fraction": 0.02,
        "max_share_to_split": 0.98,
        "max_byte_size": 33554434,
        "memory_estimate_period": 1000001,
        "stop_mem_management": True,
        "remove_poor_attrs": True,
        "disable_prepruning": False,
    }
    learner = HoeffdingTree(schema=stream.schema, **options)
    assert_ssl_evaluation(learner, stream, expectation)