From 80f7007786d76d2b82eade82a3db2f14d34d6102 Mon Sep 17 00:00:00 2001
From: nuwangunasekara
Date: Wed, 1 May 2024 23:01:59 +1200
Subject: [PATCH] feat: add SGBT

---
 src/capymoa/classifier/__init__.py |  2 +
 src/capymoa/classifier/_sgbt.py    | 94 ++++++++++++++++++++++++++++++
 tests/test_classifiers.py          |  5 +-
 3 files changed, 100 insertions(+), 1 deletion(-)
 create mode 100644 src/capymoa/classifier/_sgbt.py

diff --git a/src/capymoa/classifier/__init__.py b/src/capymoa/classifier/__init__.py
index 46bf077b..4a1efb16 100644
--- a/src/capymoa/classifier/__init__.py
+++ b/src/capymoa/classifier/__init__.py
@@ -6,6 +6,7 @@
 from ._passive_aggressive_classifier import PassiveAggressiveClassifier
 from ._sgd_classifier import SGDClassifier
 from ._knn import KNN
+from ._sgbt import SGBT
 
 __all__ = [
     "AdaptiveRandomForest",
@@ -17,4 +18,5 @@
     "KNN",
     "PassiveAggressiveClassifier",
     "SGDClassifier",
+    "SGBT",
 ]
diff --git a/src/capymoa/classifier/_sgbt.py b/src/capymoa/classifier/_sgbt.py
new file mode 100644
index 00000000..98349560
--- /dev/null
+++ b/src/capymoa/classifier/_sgbt.py
@@ -0,0 +1,94 @@
+from __future__ import annotations
+from typing import Union
+
+from capymoa.base import (
+    MOAClassifier,
+)
+from capymoa.stream import Schema
+from capymoa._utils import build_cli_str_from_mapping_and_locals
+
+from moa.classifiers.meta import StreamingGradientBoostedTrees as _MOA_SGBT
+
+
+class SGBT(MOAClassifier):
+    """Streaming Gradient Boosted Trees (SGBT) Classifier
+
+    Streaming Gradient Boosted Trees (SGBT), which is trained using weighted squared loss elicited
+    in XGBoost. SGBT exploits trees with a replacement strategy to detect and recover from drifts,
+    thus enabling the ensemble to adapt without sacrificing the predictive performance.
+
+    Reference:
+
+    `Gradient boosted trees for evolving data streams.
+    Nuwan Gunasekara, Bernhard Pfahringer, Heitor Murilo Gomes, Albert Bifet.
+    Machine Learning, Springer, 2024.
+    <https://doi.org/10.1007/s10994-024-06517-y>`_
+
+    Example usages:
+
+    >>> from capymoa.datasets import ElectricityTiny
+    >>> from capymoa.classifier import SGBT
+    >>> from capymoa.evaluation import prequential_evaluation
+    >>> stream = ElectricityTiny()
+    >>> schema = stream.get_schema()
+    >>> learner = SGBT(schema)
+    >>> results = prequential_evaluation(stream, learner, max_instances=1000)
+    >>> results["cumulative"].accuracy()
+    86.3
+    >>> stream = ElectricityTiny()
+    >>> schema = stream.get_schema()
+    >>> learner = SGBT(schema, base_learner='meta.AdaptiveRandomForestRegressor -s 10', boosting_iterations=10)
+    >>> results = prequential_evaluation(stream, learner, max_instances=1000)
+    >>> results["cumulative"].accuracy()
+    86.8
+    """
+
+    def __init__(
+        self,
+        schema: Schema | None = None,
+        random_seed: int = 0,
+        base_learner: str = 'trees.FIMTDD -s VarianceReductionSplitCriterion -g 25 -c 0.05 -e -p',
+        boosting_iterations: int = 100,
+        percentage_of_features: int = 75,
+        learning_rate: float = 0.0125,
+        disable_one_hot: bool = False,
+        multiply_hessian_by: int = 1,
+        skip_training: int = 1,
+        use_squared_loss: bool = False,
+    ):
+        """Streaming Gradient Boosted Trees (SGBT) Classifier
+
+        :param schema: The schema of the stream.
+        :param random_seed: The random seed passed to the MOA learner.
+        :param base_learner: The base learner to be trained. Default FIMTDD -s VarianceReductionSplitCriterion -g 25 -c 0.05 -e -p.
+        :param boosting_iterations: The number of boosting iterations.
+        :param percentage_of_features: The percentage of features to use.
+        :param learning_rate: The learning rate.
+        :param disable_one_hot: Whether to disable one-hot encoding for regressors that support nominal attributes.
+        :param multiply_hessian_by: Multiply the hessian by this parameter to generate weights for multiple iterations.
+        :param skip_training: Skip training of 1/skip_training instances. skip_training=1 means no skipping is performed (train on all instances).
+        :param use_squared_loss: Whether to use squared loss for classification.
+        """
+
+        mapping = {
+            "base_learner": "-l",
+            "boosting_iterations": "-s",
+            "percentage_of_features": "-m",
+            "learning_rate": "-L",
+            "disable_one_hot": "-H",
+            "multiply_hessian_by": "-M",
+            "skip_training": "-S",
+            "use_squared_loss": "-K",
+            "random_seed": "-r",
+        }
+
+        assert isinstance(base_learner, str), \
+            "Only MOA CLI strings are supported for SGBT base_learner, at the moment."
+
+        config_str = build_cli_str_from_mapping_and_locals(mapping, locals())
+        super().__init__(
+            moa_learner=_MOA_SGBT,
+            schema=schema,
+            CLI=config_str,
+            random_seed=random_seed,
+        )
diff --git a/tests/test_classifiers.py b/tests/test_classifiers.py
index 54adbac0..d8fa59f9 100644
--- a/tests/test_classifiers.py
+++ b/tests/test_classifiers.py
@@ -5,7 +5,8 @@
     AdaptiveRandomForest,
     OnlineBagging,
     NaiveBayes,
-    KNN
+    KNN,
+    SGBT
 )
 from capymoa.base import Classifier
 from capymoa.base import MOAClassifier
@@ -38,6 +39,7 @@
         (partial(KNN), 81.6, 74.0, None),
         (partial(PassiveAggressiveClassifier), 84.7, 81.0, None),
         (partial(SGDClassifier), 84.7, 83.0, None),
+        (partial(SGBT), 88.75, 88.0, None),
     ],
     ids=[
         "OnlineBagging",
@@ -49,6 +51,7 @@
         "KNN",
         "PassiveAggressiveClassifier",
         "SGDClassifier",
+        "SGBT",
     ],
 )
 def test_classifiers(