From a4c206af1f8f397fbb6d9fb84aaa48407ce6674e Mon Sep 17 00:00:00 2001
From: Anton Lee <anton.j.15.le@gmail.com>
Date: Thu, 11 Apr 2024 16:49:46 +1200
Subject: [PATCH] Add `PassiveAggressiveClassifier`

---
 docs/conf.py                               |   1 +
 src/capymoa/learner/classifier/__init__.py |  10 +-
 src/capymoa/learner/classifier/sklearn.py  | 111 +++++++++++++++++++++
 tests/test_classifiers.py                  |  15 ++-
 4 files changed, 131 insertions(+), 6 deletions(-)
 create mode 100644 src/capymoa/learner/classifier/sklearn.py

diff --git a/docs/conf.py b/docs/conf.py
index d70e43e1..30d2a854 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -38,6 +38,7 @@
     'wiki': ('https://en.wikipedia.org/wiki/%s', ''),
     'moa-api': ('https://javadoc.io/doc/nz.ac.waikato.cms.moa/moa/latest/%s', ''),
     'doi': ('https://doi.org/%s', ''),
+    'sklearn': ('https://scikit-learn.org/stable/modules/generated/sklearn.%s.html', 'sklearn.%s'),
 }
 
 
diff --git a/src/capymoa/learner/classifier/__init__.py b/src/capymoa/learner/classifier/__init__.py
index 7f13556a..ee4b3c6e 100644
--- a/src/capymoa/learner/classifier/__init__.py
+++ b/src/capymoa/learner/classifier/__init__.py
@@ -1,5 +1,13 @@
 from .classifiers import AdaptiveRandomForest, OnlineBagging, AdaptiveRandomForest
 from .efdt import EFDT
+from .sklearn import PassiveAggressiveClassifier
 from .hoeffding_tree import HoeffdingTree
 
-__all__ = ["AdaptiveRandomForest", "OnlineBagging", "AdaptiveRandomForest", "EFDT", "HoeffdingTree"]
+__all__ = [
+    "AdaptiveRandomForest",
+    "OnlineBagging",
+    "AdaptiveRandomForest",
+    "EFDT",
+    "HoeffdingTree",
+    "PassiveAggressiveClassifier",
+]
diff --git a/src/capymoa/learner/classifier/sklearn.py b/src/capymoa/learner/classifier/sklearn.py
new file mode 100644
index 00000000..f212bfc6
--- /dev/null
+++ b/src/capymoa/learner/classifier/sklearn.py
@@ -0,0 +1,111 @@
+from typing import Optional, Dict, Union, Literal
+from capymoa.learner.learners import Classifier
+from sklearn.linear_model import (
+    PassiveAggressiveClassifier as skPassiveAggressiveClassifier,
+)
+from capymoa.stream.instance import Instance, LabeledInstance
+from capymoa.stream.stream import Schema
+from capymoa.type_alias import LabelIndex, LabelProbabilities
+import numpy as np
+
+
+class PassiveAggressiveClassifier(Classifier):
+    """Streaming Passive Aggressive Classifier
+
+    This wraps :sklearn:`linear_model.PassiveAggressiveClassifier` for
+    ease of use in the streaming context. Some options are missing because
+    they are not relevant in the streaming context.
+
+    `Online Passive-Aggressive Algorithms K. Crammer, O. Dekel, J. Keshat, S.
+    Shalev-Shwartz, Y. Singer - JMLR (2006)
+    <http://jmlr.csail.mit.edu/papers/volume7/crammer06a/crammer06a.pdf>`_
+
+    >>> from capymoa.datasets import ElectricityTiny
+    >>> from capymoa.learner.classifier import PassiveAggressiveClassifier
+    >>> from capymoa.evaluation import prequential_evaluation
+    >>> stream = ElectricityTiny()
+    >>> schema = stream.get_schema()
+    >>> learner = PassiveAggressiveClassifier(schema)
+    >>> results = prequential_evaluation(stream, learner, max_instances=1000, optimise=False)
+    >>> results["cumulative"].accuracy()
+    84.3
+    """
+
+    sklearner: skPassiveAggressiveClassifier
+    """The underlying scikit-learn object. See: :sklearn:`linear_model.PassiveAggressiveClassifier`"""
+
+    def __init__(
+        self,
+        schema: Schema,
+        max_step_size: float = 1.0,
+        fit_intercept: bool = True,
+        loss: str = "hinge",
+        n_jobs: Optional[int] = None,
+        class_weight: Union[Dict[int, float], None, Literal["balanced"]] = None,
+        average: bool = False,
+        random_seed=1,
+    ):
+        """Construct a passive aggressive classifier.
+
+        :param schema: Stream schema
+        :param max_step_size: Maximum step size (regularization).
+        :param fit_intercept: Whether the intercept should be estimated or not.
+            If False, the data is assumed to be already centered.
+        :param loss: The loss function to be used: hinge: equivalent to PA-I in
+            the reference paper. squared_hinge: equivalent to PA-II in the reference paper.
+        :param n_jobs: The number of CPUs to use to do the OVA (One Versus All,
+            for multi-class problems) computation. None means 1 unless in a
+            ``joblib.parallel_backend`` context. -1 means using all processors.
+        :param class_weight: Preset for the ``sklearner.class_weight`` fit parameter.
+
+            Weights associated with classes. If not given, all classes are
+            supposed to have weight one.
+
+            The “balanced” mode uses the values of y to automatically adjust
+            weights inversely proportional to class frequencies in the input
+            data as ``n_samples / (n_classes * np.bincount(y))``.
+        :param average: When set to True, computes the averaged SGD weights and
+            stores the result in the ``sklearner.coef_`` attribute. If set to an int greater
+            than 1, averaging will begin once the total number of samples
+            seen reaches average. So ``average=10`` will begin averaging after
+            seeing 10 samples.
+        :param random_seed: Seed for the random number generator.
+        """
+
+        super().__init__(schema, random_seed)
+
+        self.sklearner = skPassiveAggressiveClassifier(
+            C=max_step_size,
+            fit_intercept=fit_intercept,
+            early_stopping=False,
+            shuffle=False,
+            verbose=0,
+            loss=loss,
+            n_jobs=n_jobs,
+            warm_start=False,
+            class_weight=class_weight,
+            average=average,
+            random_state=random_seed,
+        )
+        self._classes = schema.get_label_indexes()
+        self._is_fitted = False
+
+    def __str__(self):
+        return str(self.sklearner)
+
+    def train(self, instance: LabeledInstance):
+        x = instance.x.reshape(1, -1)
+        y = np.array(instance.y_index).reshape(1)
+        self.sklearner.partial_fit(x, y, classes=self._classes)
+        self._is_fitted = True
+
+    def predict(self, instance: Instance) -> Optional[LabelIndex]:
+        if not self._is_fitted:
+            return None
+        x = instance.x.reshape(1, -1)
+        return self.sklearner.predict(x).item()
+
+    def predict_proba(self, instance: Instance) -> LabelProbabilities:
+        proba = np.zeros(len(self._classes))
+        proba[self.predict(instance)] = 1
+        return proba
diff --git a/tests/test_classifiers.py b/tests/test_classifiers.py
index 544085c0..d976d1e0 100644
--- a/tests/test_classifiers.py
+++ b/tests/test_classifiers.py
@@ -6,6 +6,8 @@
 import pytest
 from functools import partial
 
+from capymoa.learner.classifier.sklearn import PassiveAggressiveClassifier
+
 
 @pytest.mark.parametrize(
     "learner_constructor,accuracy,win_accuracy",
@@ -13,14 +15,16 @@
         (partial(OnlineBagging, ensemble_size=5), 84.6, 89.0),
         (partial(AdaptiveRandomForest), 89.6, 91.0),
         (partial(HoeffdingTree), 73.85, 73.0),
-        (partial(EFDT), 82.7, 82.0)
+        (partial(EFDT), 82.7, 82.0),
+        (partial(PassiveAggressiveClassifier), 84.7, 81.0),
     ],
     ids=[
         "OnlineBagging",
         "AdaptiveRandomForest",
         "HoeffdingTree",
-        "EFDT"
-    ]
+        "EFDT",
+        "PassiveAggressiveClassifier",
+    ],
 )
 def test_on_tiny(learner_constructor, accuracy, win_accuracy):
     """Test on tiny is a fast running simple test to check if a learner's
@@ -32,7 +36,9 @@ def test_on_tiny(learner_constructor, accuracy, win_accuracy):
     """
     stream = ElectricityTiny()
     evaluator = ClassificationEvaluator(schema=stream.get_schema())
-    win_evaluator = ClassificationWindowedEvaluator(schema=stream.get_schema(), window_size=100)
+    win_evaluator = ClassificationWindowedEvaluator(
+        schema=stream.get_schema(), window_size=100
+    )
     learner = learner_constructor(schema=stream.get_schema())
 
     while stream.has_more_instances():
@@ -44,4 +50,3 @@ def test_on_tiny(learner_constructor, accuracy, win_accuracy):
 
     assert evaluator.accuracy() == pytest.approx(accuracy, abs=0.1)
     assert win_evaluator.accuracy() == pytest.approx(win_accuracy, abs=0.1)
-