Skip to content

Commit

Permalink
feat: clustering, 4 wrappers, notebook and documentation
Browse files Browse the repository at this point in the history
  • Loading branch information
cassales authored and hmgomes committed Aug 22, 2024
1 parent dc68be6 commit b5c8f28
Show file tree
Hide file tree
Showing 9 changed files with 505 additions and 157 deletions.
129 changes: 38 additions & 91 deletions notebooks/clustering.ipynb

Large diffs are not rendered by default.

91 changes: 85 additions & 6 deletions src/capymoa/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -536,6 +536,32 @@ def score_instance(self, instance):
##############################################################
######################### Clustering #########################
##############################################################
class ClusteringResult:
    """Plain container describing a set of clusters.

    It bundles four parallel collections: the cluster centers, their weights,
    their radii and their ids. The values are stored as given; no copying or
    validation is performed.

    IDs might not be available for most MOA implementations, so ``ids`` is
    optional and defaults to an empty list.
    """

    def __init__(self, centers, weights, radii, ids=None):
        """Store the cluster structure.

        :param centers: The cluster centers.
        :param weights: The cluster weights.
        :param radii: The cluster radii.
        :param ids: The cluster ids. When omitted (or ``None``) a fresh empty
            list is stored, so instances never share a default list.
        """
        self._centers = centers
        self._weights = weights
        self._radii = radii
        self._ids = [] if ids is None else ids

    def get_centers(self):
        """Return the centers passed at construction time."""
        return self._centers

    def get_weights(self):
        """Return the weights passed at construction time."""
        return self._weights

    def get_radii(self):
        """Return the radii passed at construction time."""
        return self._radii

    def get_ids(self):
        """Return the ids (an empty list when none were supplied)."""
        return self._ids

    def __str__(self) -> str:
        return f"Centers: {self._centers}, Weights: {self._weights}, Radii: {self._radii}, IDs: {self._ids}"

class Clusterer(ABC):
def __init__(self, schema: Schema, random_seed=1):
self.random_seed = random_seed
Expand All @@ -551,6 +577,46 @@ def __str__(self):
def train(self, instance: Instance):
    """Update the clusterer with a single instance.

    This base version has an empty body; the learning step is supplied by
    subclasses.

    :param instance: The instance to learn from.
    """

@abstractmethod
def implements_micro_clusters(self) -> bool:
    """Report whether this clusterer provides micro-clusters.

    Abstract hook: every concrete clusterer has to override it.
    """

@abstractmethod
def implements_macro_clusters(self) -> bool:
    """Report whether this clusterer provides macro-clusters.

    Abstract hook: every concrete clusterer has to override it.
    """

@abstractmethod
def _get_micro_clusters_centers(self):
    """Return the centers of the micro-clusters.

    Abstract hook: every concrete clusterer has to override it.
    """

@abstractmethod
def _get_micro_clusters_radii(self):
    """Return the radii of the micro-clusters.

    Abstract hook: every concrete clusterer has to override it.
    """

@abstractmethod
def _get_micro_clusters_weights(self):
    """Return the weights of the micro-clusters.

    Abstract hook: every concrete clusterer has to override it.
    """

@abstractmethod
def _get_clusters_centers(self):
    """Return the centers of the (macro) clusters.

    Abstract hook: every concrete clusterer has to override it.
    """

@abstractmethod
def _get_clusters_radii(self):
    """Return the radii of the (macro) clusters.

    Abstract hook: every concrete clusterer has to override it.
    """

@abstractmethod
def _get_clusters_weights(self):
    """Return the weights of the (macro) clusters.

    Abstract hook: every concrete clusterer has to override it.
    """

@abstractmethod
def get_clustering_result(self):
    """Return the current (macro) clustering of this clusterer.

    Abstract hook: every concrete clusterer has to override it.
    """

@abstractmethod
def get_micro_clustering_result(self):
    """Return the current micro-clustering of this clusterer.

    Abstract hook: every concrete clusterer has to override it.
    """

# @abstractmethod
# def predict(self, instance: Instance) -> Optional[LabelIndex]:
# pass
Expand Down Expand Up @@ -606,46 +672,59 @@ def CLI_help(self):
def train(self, instance):
    """Feed one instance to the wrapped MOA learner.

    :param instance: Object exposing ``java_instance.getData()``; the value
        returned by that call is what MOA's ``trainOnInstance`` receives.
    """
    java_data = instance.java_instance.getData()
    self.moa_learner.trainOnInstance(java_data)

def get_micro_clusters_centers(self):
def _get_micro_clusters_centers(self):
    """Return the center of every micro-cluster as a plain Python list.

    The last element of each center reported by MOA is dropped before the
    conversion (presumably the class-attribute slot -- TODO confirm).

    :return: One list of coordinates per micro-cluster, in MOA's order.
    """
    micro_clusters = self.moa_learner.getMicroClusteringResult().getClustering()
    # list() walks the sliced Java array and copies it into a Python list.
    return [list(cluster.getCenter()[:-1]) for cluster in micro_clusters]

def get_micro_clusters_radii(self):
def _get_micro_clusters_radii(self):
    """Return the radius of every micro-cluster.

    :return: One ``getRadius()`` value per micro-cluster, in MOA's order.
    """
    micro_clusters = self.moa_learner.getMicroClusteringResult().getClustering()
    return [cluster.getRadius() for cluster in micro_clusters]

def get_micro_clusters_weights(self):
def _get_micro_clusters_weights(self):
    """Return the weight of every micro-cluster.

    :return: One ``getWeight()`` value per micro-cluster, in MOA's order.
    """
    micro_clusters = self.moa_learner.getMicroClusteringResult().getClustering()
    return [cluster.getWeight() for cluster in micro_clusters]

def get_clusters_centers(self):
def _get_clusters_centers(self):
    """Return the center of every (macro) cluster as a plain Python list.

    The last element of each center reported by MOA is dropped before the
    conversion (presumably the class-attribute slot -- TODO confirm).

    :return: One list of coordinates per cluster, in MOA's order.
    """
    macro_clusters = self.moa_learner.getClusteringResult().getClustering()
    # list() walks the sliced Java array and copies it into a Python list.
    return [list(cluster.getCenter()[:-1]) for cluster in macro_clusters]

def get_clusters_radii(self):
def _get_clusters_radii(self):
    """Return the radius of every (macro) cluster.

    :return: One ``getRadius()`` value per cluster, in MOA's order.
    """
    macro_clusters = self.moa_learner.getClusteringResult().getClustering()
    return [cluster.getRadius() for cluster in macro_clusters]

def get_clusters_weights(self):
def _get_clusters_weights(self):
    """Return the weight of every (macro) cluster.

    :return: One ``getWeight()`` value per cluster, in MOA's order.
    """
    macro_clusters = self.moa_learner.getClusteringResult().getClustering()
    return [cluster.getWeight() for cluster in macro_clusters]

def get_clustering_result(self):
    """Package the macro-clusters of the MOA learner into a ``ClusteringResult``.

    When the clusterer reports that it has no macro-clusters, a result made
    of four empty lists is returned instead. The ids are always left empty.
    """
    if not self.implements_macro_clusters():
        return ClusteringResult([], [], [], [])
    centers = self._get_clusters_centers()
    weights = self._get_clusters_weights()
    radii = self._get_clusters_radii()
    return ClusteringResult(centers, weights, radii, [])

def get_micro_clustering_result(self):
    """Package the micro-clusters of the MOA learner into a ``ClusteringResult``.

    When the clusterer reports that it has no micro-clusters, a result made
    of four empty lists is returned instead. The ids are always left empty.
    """
    if not self.implements_micro_clusters():
        return ClusteringResult([], [], [], [])
    centers = self._get_micro_clusters_centers()
    weights = self._get_micro_clusters_weights()
    radii = self._get_micro_clusters_radii()
    return ClusteringResult(centers, weights, radii, [])


# def predict(self, instance):
# return Utils.maxIndex(
Expand Down
13 changes: 13 additions & 0 deletions src/capymoa/clusterers/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from ._clustream import Clustream
from ._clustream_with_kmeans import Clustream_with_kmeans
from ._clustree import ClusTree
from ._denstream_with_dbscan import Denstream_with_dbscan
# from ._dstream import Dstream

# Names exported by ``from capymoa.clusterers import *``.
# "Dstream" stays commented out to match its disabled import above.
__all__ = [
"Clustream",
"Clustream_with_kmeans",
"ClusTree",
"Denstream_with_dbscan",
# "Dstream"
]
57 changes: 57 additions & 0 deletions src/capymoa/clusterers/_clustream.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
from capymoa.base import MOAClusterer, ClusteringResult
import os
import typing
from moa.clusterers.clustream import Clustream as _MOA_Clustream
from capymoa.stream import Schema
from capymoa._utils import build_cli_str_from_mapping_and_locals
# import numpy as np

class Clustream(MOAClusterer):
    """CluStream stream clusterer that exposes micro-clusters only.

    Wraps MOA's ``moa.clusterers.clustream.Clustream``; this wrapper reports
    no macro-clustering.
    """

    def __init__(
        self,
        schema: typing.Union[Schema, None] = None,
        time_window: int = 1000,
        max_num_kernels: int = 100,
        kernel_radi_factor: float = 2
    ):
        """Create the MOA learner and configure it through its CLI string.

        :param schema: The schema of the stream.
        :param time_window: The size of the time window.
        :param max_num_kernels: Maximum number of micro kernels to use.
        :param kernel_radi_factor: Multiplier for the kernel radius
        """
        # Constructor argument name -> MOA command-line flag.
        mapping = {
            "time_window": "-h",
            "max_num_kernels": "-k",
            "kernel_radi_factor": "-t",
        }

        # locals() is handed to the helper as-is, so no extra local variables
        # are introduced before this call.
        config_str = build_cli_str_from_mapping_and_locals(mapping, locals())
        self.moa_learner = _MOA_Clustream()
        super(Clustream, self).__init__(
            schema=schema, CLI=config_str, moa_learner=self.moa_learner
        )

    def implements_micro_clusters(self) -> bool:
        """Micro-clusters are available for this clusterer."""
        return True

    def implements_macro_clusters(self) -> bool:
        """Macro-clusters are not available for this clusterer."""
        return False

    # NOTE: a nearest-center ``predict`` (closest micro-cluster center by
    # Euclidean distance) was sketched here but is not implemented.
62 changes: 62 additions & 0 deletions src/capymoa/clusterers/_clustream_with_kmeans.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
from capymoa.base import MOAClusterer
import typing
from moa.clusterers.clustream import WithKmeans as _MOA_Clustream_WKM
from capymoa.stream import Schema
from capymoa._utils import build_cli_str_from_mapping_and_locals
# import numpy as np

class Clustream_with_kmeans(MOAClusterer):
    """CluStream stream clusterer with a k-means macro-clustering step.

    Wraps MOA's ``moa.clusterers.clustream.WithKmeans``. Both micro-clusters
    and macro-clusters are reported by this wrapper.
    """

    def __init__(
        self,
        schema: typing.Union[Schema, None] = None,
        time_window: int = 1000,
        max_num_kernels: int = 100,
        kernel_radi_factor: float = 2,
        k_option: int = 5
    ):
        """Clustream clusterer with K-means offline clustering.

        :param schema: The schema of the stream.
        :param time_window: The size of the time window.
        :param max_num_kernels: Maximum number of micro kernels to use.
        :param kernel_radi_factor: Multiplier for the kernel radius
        :param k_option: Number of clusters to use in the k-means offline step
        """
        # Constructor argument name -> MOA command-line flag. Note that "-k"
        # is the k-means cluster count here, and "-m" the number of kernels.
        mapping = {
            "time_window": "-h",
            "max_num_kernels": "-m",
            "kernel_radi_factor": "-t",
            "k_option": "-k"
        }

        # locals() is handed to the helper as-is, so no extra local variables
        # are introduced before this call.
        config_str = build_cli_str_from_mapping_and_locals(mapping, locals())
        self.moa_learner = _MOA_Clustream_WKM()
        super(Clustream_with_kmeans, self).__init__(
            schema=schema,
            CLI=config_str,
            moa_learner=self.moa_learner
        )

    def implements_micro_clusters(self) -> bool:
        """Micro-clusters are available for this clusterer."""
        return True

    def implements_macro_clusters(self) -> bool:
        """Macro-clusters are available for this clusterer."""
        return True

    # NOTE: a nearest-center ``predict`` (closest micro-cluster center by
    # Euclidean distance) was sketched here but is not implemented.

    def __str__(self):
        return "Clustream with KMeans"
57 changes: 57 additions & 0 deletions src/capymoa/clusterers/_clustree.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
from capymoa.base import MOAClusterer, ClusteringResult
import os
import typing
from moa.clusterers.clustree import ClusTree as _MOA_ClusTree
from capymoa.stream import Schema
from capymoa._utils import build_cli_str_from_mapping_and_locals
# import numpy as np

class ClusTree(MOAClusterer):
    """ClusTree clustering algorithm without Macro-clustering.

    Wraps MOA's ``moa.clusterers.clustree.ClusTree``; only micro-clusters are
    reported by this wrapper.
    """

    def __init__(
        self,
        schema: typing.Union[Schema, None] = None,
        horizon: int = 1000,
        max_height: int = 8,
        breadth_first_strategy: bool = False
    ):
        """ClusTree clusterer.

        :param schema: The schema of the stream
        :param horizon: The size of the time window
        :param max_height: The maximum height of the tree
        :param breadth_first_strategy: Whether to use breadth-first strategy
        """
        # Constructor argument name -> MOA command-line flag.
        mapping = {
            "horizon": "-h",
            "max_height": "-H",
            "breadth_first_strategy": "-B"
        }

        # locals() is handed to the helper as-is, so no extra local variables
        # are introduced before this call.
        config_str = build_cli_str_from_mapping_and_locals(mapping, locals())
        self.moa_learner = _MOA_ClusTree()
        super(ClusTree, self).__init__(
            schema=schema,
            CLI=config_str,
            moa_learner=self.moa_learner
        )

    def implements_micro_clusters(self) -> bool:
        """Micro-clusters are available for this clusterer."""
        return True

    def implements_macro_clusters(self) -> bool:
        """Macro-clusters are not available for this clusterer."""
        return False

    # NOTE: a nearest-center ``predict`` (closest micro-cluster center by
    # Euclidean distance) was sketched here but is not implemented.
Loading

0 comments on commit b5c8f28

Please sign in to comment.