feat: clustering, 4 wrappers, notebook and documentation

adaptive-machine-learning · Aug 22, 2024 · b5c8f28 · b5c8f28
1 parent dc68be6
commit b5c8f28
Show file tree

Hide file tree

Showing 9 changed files with 505 additions and 157 deletions.
diff --git a/notebooks/clustering.ipynb b/notebooks/clustering.ipynb
diff --git a/src/capymoa/base.py b/src/capymoa/base.py
@@ -536,6 +536,32 @@ def score_instance(self, instance):
 ##############################################################
 ######################### Clustering #########################
 ##############################################################
+class ClusteringResult:
+    """Container for a clustering result: the centers, weights, radii, and ids of its clusters.
+    
+    IDs may be unavailable for most MOA implementations; the MOA-backed results built in this module pass an empty list."""
+
+    def __init__(self, centers, weights, radii, ids):
+        self._centers = centers
+        self._weights = weights
+        self._radii = radii
+        self._ids = ids
+
+    def get_centers(self):
+        return self._centers
+
+    def get_weights(self):
+        return self._weights
+
+    def get_radii(self):
+        return self._radii
+
+    def get_ids(self):
+        return self._ids
+
+    def __str__(self) -> str:
+        return f"Centers: {self._centers}, Weights: {self._weights}, Radii: {self._radii}, IDs: {self._ids}"
+
 class Clusterer(ABC):
     def __init__(self, schema: Schema, random_seed=1):
         self.random_seed = random_seed
@@ -551,6 +577,46 @@ def __str__(self):
     def train(self, instance: Instance):
         pass
 
+    @abstractmethod
+    def implements_micro_clusters(self) -> bool:
+        pass
+
+    @abstractmethod
+    def implements_macro_clusters(self) -> bool:
+        pass
+
+    @abstractmethod
+    def _get_micro_clusters_centers(self):
+        pass
+
+    @abstractmethod
+    def _get_micro_clusters_radii(self):
+        pass
+
+    @abstractmethod
+    def _get_micro_clusters_weights(self):
+        pass
+
+    @abstractmethod
+    def _get_clusters_centers(self):
+        pass
+
+    @abstractmethod
+    def _get_clusters_radii(self):
+        pass
+
+    @abstractmethod
+    def _get_clusters_weights(self):
+        pass
+
+    @abstractmethod
+    def get_clustering_result(self):
+        pass
+
+    @abstractmethod
+    def get_micro_clustering_result(self):
+        pass
+
     # @abstractmethod
     # def predict(self, instance: Instance) -> Optional[LabelIndex]:
     #     pass
@@ -606,46 +672,59 @@ def CLI_help(self):
     def train(self, instance):
         self.moa_learner.trainOnInstance(instance.java_instance.getData())
 
-    def get_micro_clusters_centers(self):
+    def _get_micro_clusters_centers(self):
         ret = []
         for c in self.moa_learner.getMicroClusteringResult().getClustering():
             java_array = c.getCenter()[:-1]
             python_array = [java_array[i] for i in range(len(java_array))]  # Convert to Python list
             ret.append(python_array)
         return ret
 
-    def get_micro_clusters_radii(self):
+    def _get_micro_clusters_radii(self):
         ret = []
         for c in self.moa_learner.getMicroClusteringResult().getClustering():
             ret.append(c.getRadius())
         return ret
 
-    def get_micro_clusters_weights(self):
+    def _get_micro_clusters_weights(self):
         ret = []
         for c in self.moa_learner.getMicroClusteringResult().getClustering():
             ret.append(c.getWeight())
         return ret
 
-    def get_clusters_centers(self):
+    def _get_clusters_centers(self):
         ret = []
         for c in self.moa_learner.getClusteringResult().getClustering():
             java_array = c.getCenter()[:-1]
             python_array = [java_array[i] for i in range(len(java_array))]  # Convert to Python list
             ret.append(python_array)
         return ret
 
-    def get_clusters_radii(self):
+    def _get_clusters_radii(self):
         ret = []
         for c in self.moa_learner.getClusteringResult().getClustering():
             ret.append(c.getRadius())
         return ret
 
-    def get_clusters_weights(self):
+    def _get_clusters_weights(self):
         ret = []
         for c in self.moa_learner.getClusteringResult().getClustering():
             ret.append(c.getWeight())
         return ret
 
+    def get_clustering_result(self):
+        if self.implements_macro_clusters():
+            # Macro-clusters are supported here; clusterers without them fall through to the empty result below instead of raising ValueError.
+            return ClusteringResult(self._get_clusters_centers(), self._get_clusters_weights(), self._get_clusters_radii(), [])
+        else:
+            return ClusteringResult([], [], [], [])
+
+    def get_micro_clustering_result(self):
+        if self.implements_micro_clusters():
+            return ClusteringResult(self._get_micro_clusters_centers(), self._get_micro_clusters_weights(), self._get_micro_clusters_radii(), [])
+        else:
+            return ClusteringResult([], [], [], [])
+
 
     # def predict(self, instance):
     #     return Utils.maxIndex(

diff --git a/src/capymoa/clusterers/__init__.py b/src/capymoa/clusterers/__init__.py
@@ -0,0 +1,13 @@
+from ._clustream import Clustream
+from ._clustream_with_kmeans import Clustream_with_kmeans
+from ._clustree import ClusTree
+from ._denstream_with_dbscan import Denstream_with_dbscan
+# from ._dstream import Dstream
+
+__all__ = [
+    "Clustream",
+    "Clustream_with_kmeans",
+    "ClusTree",
+    "Denstream_with_dbscan",
+    # "Dstream"
+]
diff --git a/src/capymoa/clusterers/_clustream.py b/src/capymoa/clusterers/_clustream.py
@@ -0,0 +1,57 @@
+from capymoa.base import MOAClusterer, ClusteringResult
+import os
+import typing
+from moa.clusterers.clustream import Clustream as _MOA_Clustream
+from capymoa.stream import Schema
+from capymoa._utils import build_cli_str_from_mapping_and_locals
+# import numpy as np
+
+class Clustream(MOAClusterer):
+    """
+    Clustream clustering algorithm without Macro-clustering.
+    """
+    def __init__(
+        self,
+        schema: typing.Union[Schema, None] = None,
+        time_window: int = 1000,
+        max_num_kernels: int = 100,
+        kernel_radi_factor: float = 2
+    ):
+        """Clustream clusterer.
+
+        :param schema: The schema of the stream.
+        :param time_window: The size of the time window.
+        :param max_num_kernels: Maximum number of micro kernels to use.
+        :param kernel_radi_factor: Multiplier for the kernel radius
+        """
+
+        mapping = {
+            "time_window": "-h",
+            "max_num_kernels": "-k",
+            "kernel_radi_factor": "-t"
+        }
+
+        config_str = build_cli_str_from_mapping_and_locals(mapping, locals())
+        self.moa_learner = _MOA_Clustream()
+        super(Clustream, self).__init__(
+            schema=schema,
+            CLI=config_str,
+            moa_learner=self.moa_learner
+        )
+
+    def implements_micro_clusters(self) -> bool:
+        return True
+
+    def implements_macro_clusters(self) -> bool:
+        return False
+
+    # def predict(self, X):
+    #     clusters = self.get_micro_clustering_result()
+    #     min_dist = np.inf
+    #     closest_center = None
+    #     for center in clusters.get_centers():
+    #         if np.linalg.norm(center - X) < min_dist:
+    #             min_dist = np.linalg.norm(center - X)
+    #             closest_center = center
+    #     print(closest_center)
+    #     return closest_center
diff --git a/src/capymoa/clusterers/_clustream_with_kmeans.py b/src/capymoa/clusterers/_clustream_with_kmeans.py
@@ -0,0 +1,62 @@
+from capymoa.base import MOAClusterer
+import typing
+from moa.clusterers.clustream import WithKmeans as _MOA_Clustream_WKM
+from capymoa.stream import Schema
+from capymoa._utils import build_cli_str_from_mapping_and_locals
+# import numpy as np
+
+class Clustream_with_kmeans(MOAClusterer):
+    """
+    Clustream clustering algorithm with k-means macro-clustering (offline step).
+    """
+    def __init__(
+        self,
+        schema: typing.Union[Schema, None] = None,
+        time_window: int = 1000,
+        max_num_kernels: int = 100,
+        kernel_radi_factor: float = 2,
+        k_option: int = 5
+    ):
+        """Clustream clusterer with K-means offline clustering.
+
+        :param schema: The schema of the stream.
+        :param time_window: The size of the time window.
+        :param max_num_kernels: Maximum number of micro kernels to use.
+        :param kernel_radi_factor: Multiplier for the kernel radius
+        :param k_option: Number of clusters to use in the k-means offline step
+        """
+
+        mapping = {
+            "time_window": "-h",
+            "max_num_kernels": "-m",
+            "kernel_radi_factor": "-t",
+            "k_option": "-k"
+        }
+
+        config_str = build_cli_str_from_mapping_and_locals(mapping, locals())
+        self.moa_learner = _MOA_Clustream_WKM()
+        super(Clustream_with_kmeans, self).__init__(
+            schema=schema,
+            CLI=config_str,
+            moa_learner=self.moa_learner
+        )
+
+    def implements_micro_clusters(self) -> bool:
+        return True
+
+    def implements_macro_clusters(self) -> bool:
+        return True
+
+    # def predict(self, X):
+    #     clusters = self.get_micro_clustering_result()
+    #     min_dist = np.inf
+    #     closest_center = None
+    #     for center in clusters.get_centers():
+    #         if np.linalg.norm(center - X) < min_dist:
+    #             min_dist = np.linalg.norm(center - X)
+    #             closest_center = center
+    #     print(closest_center)
+    #     return closest_center
+
+    def __str__(self):
+        return "Clustream with KMeans"
diff --git a/src/capymoa/clusterers/_clustree.py b/src/capymoa/clusterers/_clustree.py
@@ -0,0 +1,57 @@
+from capymoa.base import MOAClusterer, ClusteringResult
+import os
+import typing
+from moa.clusterers.clustree import ClusTree as _MOA_ClusTree
+from capymoa.stream import Schema
+from capymoa._utils import build_cli_str_from_mapping_and_locals
+# import numpy as np
+
+class ClusTree(MOAClusterer):
+    """
+    ClusTree clustering algorithm without Macro-clustering.
+    """
+    def __init__(
+        self,
+        schema: typing.Union[Schema, None] = None,
+        horizon: int = 1000,
+        max_height: int = 8,
+        breadth_first_strategy: bool = False
+    ):
+        """ClusTree clusterer.
+
+        :param schema: The schema of the stream
+        :param horizon: The size of the time window
+        :param max_height: The maximum height of the tree
+        :param breadth_first_strategy: Whether to use breadth-first strategy
+        """
+
+        mapping = {
+            "horizon": "-h",
+            "max_height": "-H",
+            "breadth_first_strategy": "-B"
+        }
+
+        config_str = build_cli_str_from_mapping_and_locals(mapping, locals())
+        self.moa_learner = _MOA_ClusTree()
+        super(ClusTree, self).__init__(
+            schema=schema,
+            CLI=config_str,
+            moa_learner=self.moa_learner
+        )
+
+    def implements_micro_clusters(self) -> bool:
+        return True
+
+    def implements_macro_clusters(self) -> bool:
+        return False
+
+    # def predict(self, X):
+    #     clusters = self.get_micro_clustering_result()
+    #     min_dist = np.inf
+    #     closest_center = None
+    #     for center in clusters.get_centers():
+    #         if np.linalg.norm(center - X) < min_dist:
+    #             min_dist = np.linalg.norm(center - X)
+    #             closest_center = center
+    #     print(closest_center)
+    #     return closest_center