[FIX] Outlier detection: keep instance ids, make thread safe #5427

Merged: 1 commit, May 12, 2021
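What the PR changes, for context: the old `_Transformer` called the model itself, which stored the preprocessed table on `self._cached_data` (shared mutable state, so not thread safe) and rebuilt the output with `Table.from_numpy`, which discards the original instance ids. The new code builds the output with `data.transform(domain)` and routes the expensive `data_to_model_domain` step through `SharedComputeValue`, so it runs once per table and no state lives on the model. A minimal sketch of that pattern, with illustrative class names that are not part of the PR:

```python
import numpy as np
from Orange.data import ContinuousVariable, Domain, Table
from Orange.data.util import SharedComputeValue


class _SharedStep:
    """The expensive, table-wide step (stands in for data_to_model_domain)."""
    def __call__(self, data: Table) -> Table:
        return data  # placeholder: real code would preprocess here


class _ColumnFromShared(SharedComputeValue):
    """Per-variable step: derives one column from the shared result only."""
    def compute(self, data: Table, shared_data: Table) -> np.ndarray:
        return shared_data.X[:, 0]  # placeholder: real code would predict here


shared = _SharedStep()
var = ContinuousVariable("derived", compute_value=_ColumnFromShared(shared))
data = Table("iris")
out = data.transform(Domain(data.domain.attributes, metas=(var,)))
# out keeps data.ids, and the shared step runs once per transform even if
# several variables reuse the same `shared` instance.
```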
74 changes: 37 additions & 37 deletions Orange/classification/outlier_detection.py
@@ -9,43 +9,51 @@
from sklearn.svm import OneClassSVM

from Orange.base import SklLearner, SklModel
from Orange.data import Table, Domain, DiscreteVariable, ContinuousVariable, \
Variable
from Orange.data.util import get_unique_names
from Orange.data import Table, Domain, DiscreteVariable, ContinuousVariable
from Orange.data.util import get_unique_names, SharedComputeValue
from Orange.preprocess import AdaptiveNormalize
from Orange.util import wrap_callback, dummy_callback
from Orange.util import dummy_callback

__all__ = ["LocalOutlierFactorLearner", "IsolationForestLearner",
"EllipticEnvelopeLearner", "OneClassSVMLearner"]


class _CachedTransform:
# to be used with SharedComputeValue
def __init__(self, model):
self.model = model

def __call__(self, data):
return self.model.data_to_model_domain(data)


class _OutlierModel(SklModel):
def __init__(self, skl_model):
super().__init__(skl_model)
self._cached_data = None
self.outlier_var = None
self.cached_transform = _CachedTransform(self)

def predict(self, X: np.ndarray) -> np.ndarray:
pred = self.skl_model.predict(X)
pred[pred == -1] = 0
return pred[:, None]

def new_domain(self, data: Table) -> Domain:
assert self.outlier_var is not None
return Domain(data.domain.attributes, data.domain.class_vars,
data.domain.metas + (self.outlier_var,))

def __call__(self, data: Table, progress_callback: Callable = None) \
-> Table:
assert isinstance(data, Table)
assert self.outlier_var is not None

domain = Domain(data.domain.attributes, data.domain.class_vars,
data.domain.metas + (self.outlier_var,))
domain = self.new_domain(data)
if progress_callback is None:
progress_callback = dummy_callback
progress_callback(0, "Preprocessing...")
self._cached_data = self.data_to_model_domain(
data, wrap_callback(progress_callback, end=0.1))
progress_callback(0.1, "Predicting...")
metas = np.hstack((data.metas, self.predict(self._cached_data.X)))
progress_callback(0, "Predicting...")
new_table = data.transform(domain)
progress_callback(1)
return Table.from_numpy(domain, data.X, data.Y, metas)
return new_table


class _OutlierLearner(SklLearner):
@@ -64,27 +72,17 @@ def _fit_model(self, data: Table) -> _OutlierModel:
compute_value=transformer
)

transformer.variable = variable
model.outlier_var = variable
return model


class _Transformer:
class _Transformer(SharedComputeValue):
def __init__(self, model: _OutlierModel):
super().__init__(model.cached_transform)
self._model = model
self._variable = None

@property
def variable(self) -> Variable:
return self._variable

@variable.setter
def variable(self, var: Variable):
self._variable = var

def __call__(self, data: Table) -> np.ndarray:
assert isinstance(self._variable, Variable)
return self._model(data).get_column_view(self._variable)[0]
def compute(self, data: Table, shared_data: Table) -> np.ndarray:
return self._model.predict(shared_data.X)[:, 0]


class OneClassSVMLearner(_OutlierLearner):
@@ -142,13 +140,16 @@ def mahalanobis(self, observations: np.ndarray) -> np.ndarray:
"""
return self.skl_model.mahalanobis(observations)[:, None]

def __call__(self, data: Table, progress_callback: Callable = None) \
-> Table:
pred = super().__call__(data, progress_callback)
domain = Domain(pred.domain.attributes, pred.domain.class_vars,
pred.domain.metas + (self.mahal_var,))
metas = np.hstack((pred.metas, self.mahalanobis(self._cached_data.X)))
return Table.from_numpy(domain, pred.X, pred.Y, metas)
def new_domain(self, data: Table) -> Domain:
assert self.mahal_var is not None
domain = super().new_domain(data)
return Domain(domain.attributes, domain.class_vars,
domain.metas + (self.mahal_var,))


class _TransformerMahalanobis(_Transformer):
def compute(self, data: Table, shared_data: Table) -> np.ndarray:
return self._model.mahalanobis(shared_data.X)[:, 0]


class EllipticEnvelopeLearner(_OutlierLearner):
@@ -166,13 +167,12 @@ def _fit_model(self, data: Table) -> EllipticEnvelopeClassifier:
domain = data.domain
model = super()._fit_model(data.transform(Domain(domain.attributes)))

transformer = _Transformer(model)
transformer = _TransformerMahalanobis(model)
names = [v.name for v in domain.variables + domain.metas]
variable = ContinuousVariable(
get_unique_names(names, "Mahalanobis"),
compute_value=transformer
)

transformer.variable = variable
model.mahal_var = variable
return model
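A hedged usage sketch mirroring the updated tests in the next file; the random data, learner defaults, and contamination value are assumptions for illustration, not part of the PR:

```python
import numpy as np
from Orange.data import Table
from Orange.classification.outlier_detection import EllipticEnvelopeLearner

rng = np.random.RandomState(42)
data = Table.from_numpy(None, rng.randn(100, 4))          # unlabelled table
model = EllipticEnvelopeLearner(contamination=0.1)(data)  # fit the detector
annotated = model(data)   # adds "Outlier" and "Mahalanobis" meta columns
np.testing.assert_array_equal(annotated.ids, data.ids)    # instance ids are kept
outliers = annotated.get_column_view(model.outlier_var)[0]
mahal = annotated.get_column_view(model.mahal_var)[0]
```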
12 changes: 9 additions & 3 deletions Orange/classification/tests/test_outlier_detection.py
@@ -3,7 +3,7 @@
import pickle
import tempfile
import unittest
from unittest.mock import Mock
from unittest.mock import Mock, patch

import numpy as np

@@ -36,6 +36,7 @@ def assert_table_equal(self, table1, table2):
np.testing.assert_array_equal(table1.metas, table2.metas)

def assert_table_appended_outlier(self, table1, table2, offset=1):
np.testing.assert_array_equal(table1.ids, table2.ids)
np.testing.assert_array_equal(table1.X, table2.X)
np.testing.assert_array_equal(table1.Y, table2.Y)
np.testing.assert_array_equal(table1.metas, table2.metas[:, :-offset])
@@ -47,7 +48,6 @@ def assert_table_appended_outlier(self, table1, table2, offset=1):
self.assertEqual(table2.domain.metas[-offset].name, "Outlier")
self.assertIsNotNone(table2.domain.metas[-offset].compute_value)


class TestOneClassSVMLearner(_TestDetector):
def test_OneClassSVM(self):
np.random.seed(42)
@@ -128,12 +128,19 @@ def test_EllipticEnvelope(self):
def test_mahalanobis(self):
n = len(self.X_all)
pred = self.model(self.X_all)

y_pred = pred[:, self.model.outlier_var].metas
y_mahal = pred[:, self.model.mahal_var].metas
y_mahal, y_pred = zip(*sorted(zip(y_mahal, y_pred), reverse=True))
self.assertTrue(all(i == 0 for i in y_pred[:int(self.cont * n)]))
self.assertTrue(all(i == 1 for i in y_pred[int(self.cont * n):]))

def test_single_data_to_model_domain(self):
with patch.object(self.model, "data_to_model_domain",
wraps=self.model.data_to_model_domain) as call:
self.model(self.X_all)
self.assertEqual(call.call_count, 1)

def test_EllipticEnvelope_ignores_y(self):
domain = Domain((ContinuousVariable("x1"), ContinuousVariable("x2")),
(ContinuousVariable("y1"), ContinuousVariable("y2")))
@@ -231,7 +238,6 @@ def test_transformer(self):
detect = self.detector(self.iris)
pred = detect(self.iris)
var = pred.domain.metas[0]
self.assertIs(var, var.compute_value.variable)
np.testing.assert_array_equal(pred[:, "Outlier"].metas.ravel(),
var.compute_value(self.iris))
