From fa18777b6cccc6f7b00c1cb573fcce3b6f03f3f3 Mon Sep 17 00:00:00 2001 From: Ajda Pretnar Date: Fri, 1 Mar 2019 16:11:40 +0100 Subject: [PATCH 1/9] Sparse Jaccard --- Orange/distance/base.py | 50 +++++++++++++++++++++++++++++++++++++ Orange/distance/distance.py | 6 ++--- 2 files changed, 53 insertions(+), 3 deletions(-) diff --git a/Orange/distance/base.py b/Orange/distance/base.py index ee84001d712..89b1b97c5d7 100644 --- a/Orange/distance/base.py +++ b/Orange/distance/base.py @@ -1,5 +1,6 @@ import numpy as np import sklearn.metrics as skl_metrics +from scipy.sparse import issparse, csr_matrix from Orange.data import Table, Domain, Instance, RowInstance from Orange.misc import DistMatrix @@ -13,6 +14,7 @@ # TODO this *private* function is called from several widgets to prepare # data for calling the below classes. After we (mostly) stopped relying # on sklearn.metrics, this is (mostly) unnecessary + def _preprocess(table, impute=True): """Remove categorical attributes and impute missing values.""" if not len(table): @@ -499,3 +501,51 @@ def __call__(self, e1, e2=None, axis=1, impute=False): else: dist_matrix = DistMatrix(dist) return dist_matrix + +class SparseJaccard: + """ + Fallback for `Jaccard` on sparse data or raw numpy arrays. If data is + sparse, data normalized with intersection/union. Sklearn's function can't + handle discrete or missing data and normalization. + """ + + def __call__(self, e1, e2=None, axis=1, impute=False): + x1 = _orange_to_numpy(e1) + x2 = _orange_to_numpy(e2) + if axis == 0: + x1 = x1.T + if x2 is not None: + x2 = x2.T + if issparse(x1): + dist = self.sparse_jaccard(x1, x2) + else: + dist = skl_metrics.pairwise.pairwise_distances(x1, + x2, + metric="jaccard") + if impute and np.isnan(dist).any(): + dist = np.nan_to_num(dist) + if isinstance(e1, (Table, RowInstance)): + dist_matrix = DistMatrix(dist, e1, e2, axis) + else: + dist_matrix = DistMatrix(dist) + return dist_matrix + + def sparse_jaccard(self, x1, x2=None): + symmetric = x2 is None + if symmetric: + x2 = x1 + x1 = csr_matrix(x1) + x1.eliminate_zeros() + x2 = csr_matrix(x2) + x2.eliminate_zeros() + n, m = x1.shape[0], x2.shape[0] + matrix = np.zeros((n, m)) + for i in range(n): + xi_ind = set(x1[i].indices) + for j in range(i if symmetric else m): + jacc = 1 - len(xi_ind.intersection(x2[j].indices))\ + / len(set(x1[i].indices).union(x1[j].indices)) + matrix[i, j] = jacc + if symmetric: + matrix[j, i] = jacc + return matrix diff --git a/Orange/distance/distance.py b/Orange/distance/distance.py index fd4f01cf250..feb669c6d7c 100644 --- a/Orange/distance/distance.py +++ b/Orange/distance/distance.py @@ -11,7 +11,7 @@ from Orange.statistics import util from .base import (Distance, DistanceModel, FittedDistance, FittedDistanceModel, - SklDistance, _orange_to_numpy) + SklDistance, _orange_to_numpy, SparseJaccard) class EuclideanRowsModel(FittedDistanceModel): """ @@ -416,9 +416,9 @@ def compute_distances(self, x1, x2): class Jaccard(FittedDistance): - supports_sparse = False + supports_sparse = True supports_discrete = True - fallback = SklDistance('jaccard') + fallback = SparseJaccard() ModelType = JaccardModel def fit_rows(self, attributes, x, n_vals): From e7913977d9771713c15c83d019affcd3ab6be46a Mon Sep 17 00:00:00 2001 From: Ajda Pretnar Date: Mon, 4 Mar 2019 09:59:51 +0100 Subject: [PATCH 2/9] Do not remove nonbinary for sparse --- Orange/widgets/unsupervised/owdistances.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/Orange/widgets/unsupervised/owdistances.py b/Orange/widgets/unsupervised/owdistances.py index ead54502ae5..91690a45907 100644 --- a/Orange/widgets/unsupervised/owdistances.py +++ b/Orange/widgets/unsupervised/owdistances.py @@ -132,14 +132,18 @@ def _fix_discrete(): def _fix_nonbinary(): nonlocal data if metric is distance.Jaccard: - nbinary = sum(a.is_discrete and len(a.values) == 2 - for a in data.domain.attributes) - if not nbinary: - self.Error.no_binary_features() - return False - elif nbinary < len(data.domain.attributes): - self.Warning.ignoring_nonbinary() - data = distance.remove_nonbinary_features(data) + if issparse(data.X): + # do not remove non-binary + return True + else: + nbinary = sum(a.is_discrete and len(a.values) == 2 + for a in data.domain.attributes) + if not nbinary: + self.Error.no_binary_features() + return False + elif nbinary < len(data.domain.attributes): + self.Warning.ignoring_nonbinary() + data = distance.remove_nonbinary_features(data) return True def _fix_missing(): From 70467ea1cc50326bda6b73139e5944bfea159131 Mon Sep 17 00:00:00 2001 From: Ajda Pretnar Date: Mon, 4 Mar 2019 14:38:05 +0100 Subject: [PATCH 3/9] Test sparse Jaccard --- Orange/distance/tests/test_distance.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/Orange/distance/tests/test_distance.py b/Orange/distance/tests/test_distance.py index ef7b691651e..7c55f1d2a01 100644 --- a/Orange/distance/tests/test_distance.py +++ b/Orange/distance/tests/test_distance.py @@ -27,11 +27,13 @@ def test_no_data(self): def test_sparse(self): """Test sparse support in distances.""" - sparse_iris = csr_matrix(Table('iris').X) if not self.Distance.supports_sparse: - self.assertRaises(TypeError, self.Distance, sparse_iris) + self.assertRaises(TypeError, self.Distance, self.sparse_data) else: - self.Distance(sparse_iris) + # check the result is the same as for dense + dist_numpy = self.Distance(self.dense_X) + dist_sparse = self.Distance(self.sparse_data) + np.testing.assert_allclose(dist_sparse, dist_numpy) class CommonFittedTests(CommonTests): @@ -144,6 +146,12 @@ def setUp(self): self.mixed_data = self.data = Table.from_numpy( self.domain, np.hstack((self.cont_data.X[:3], self.disc_data.X))) + self.dense_X = np.array([[1, 0, 2], + [-1, 5, 0], + [0, 1, 1], + [7, 0, 0]]) + self.sparse_data = Table(csr_matrix(self.dense_X)) + # Correct results in these tests were computed manually or with Excel; @@ -838,6 +846,12 @@ def setUp(self): [1, 0, 1], [1, 0, 0]]) + self.dense_X = np.array([[1, 0, 2], + [-1, 5, 0], + [0, 1, 1], + [7, 0, 0]]) + self.sparse_data = Table(csr_matrix(self.dense_X)) + def test_jaccard_rows(self): assert_almost_equal = np.testing.assert_almost_equal From 7ba8c0b12fedaabd1acc55b8ce16e77aade8ce40 Mon Sep 17 00:00:00 2001 From: Ajda Pretnar Date: Mon, 4 Mar 2019 15:04:37 +0100 Subject: [PATCH 4/9] Disable check for Jaccard --- Orange/widgets/unsupervised/owdistances.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Orange/widgets/unsupervised/owdistances.py b/Orange/widgets/unsupervised/owdistances.py index 91690a45907..9f9ebe03908 100644 --- a/Orange/widgets/unsupervised/owdistances.py +++ b/Orange/widgets/unsupervised/owdistances.py @@ -119,7 +119,9 @@ def _check_sparse(): def _fix_discrete(): nonlocal data if data.domain.has_discrete_attributes() and ( - issparse(data.X) and getattr(metric, "fallback", None) + issparse(data.X) and getattr(metric, "fallback", + None) and metric is not + distance.Jaccard or not metric.supports_discrete or self.axis == 1 and metric is not distance.Jaccard): if not data.domain.has_continuous_attributes(): From 551effc45154da84bc60cb76e73f0854e750da5a Mon Sep 17 00:00:00 2001 From: janezd Date: Tue, 12 Mar 2019 23:02:03 +0100 Subject: [PATCH 5/9] Distances: Support numpy arrays without fallbacks --- Orange/distance/base.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/Orange/distance/base.py b/Orange/distance/base.py index 89b1b97c5d7..b43efa0d2c1 100644 --- a/Orange/distance/base.py +++ b/Orange/distance/base.py @@ -275,8 +275,9 @@ def __init__(self, attributes, axis=1, impute=False): self.attributes = attributes def __call__(self, e1, e2=None): - if e1.domain.attributes != self.attributes or \ - e2 is not None and e2.domain.attributes != self.attributes: + if self.attributes is not None and ( + e1.domain.attributes != self.attributes + or e2 is not None and e2.domain.attributes != self.attributes): raise ValueError("mismatching domains") return super().__call__(e1, e2) @@ -350,12 +351,17 @@ def fit(self, data): Prepare the data on attributes, call `fit_cols` or `fit_rows` and return the resulting model. """ - attributes = data.domain.attributes x = _orange_to_numpy(data) - n_vals = np.fromiter( - (len(attr.values) if attr.is_discrete else 0 - for attr in attributes), - dtype=np.int32, count=len(attributes)) + if hasattr(data, "domain"): + attributes = data.domain.attributes + n_vals = np.fromiter( + (len(attr.values) if attr.is_discrete else 0 + for attr in attributes), + dtype=np.int32, count=len(attributes)) + else: + assert isinstance(x, np.ndarray) + attributes = None + n_vals = np.zeros(x.shape[1], dtype=np.int32) return [self.fit_cols, self.fit_rows][self.axis](attributes, x, n_vals) def fit_cols(self, attributes, x, n_vals): From d962655dbb60c9941c1c16d2983741ea8b4d2adf Mon Sep 17 00:00:00 2001 From: janezd Date: Tue, 12 Mar 2019 23:02:49 +0100 Subject: [PATCH 6/9] Jaccard distance: Move from a fallback to its own class --- Orange/distance/base.py | 49 -------------------------- Orange/distance/distance.py | 36 ++++++++++++++++--- Orange/distance/tests/test_distance.py | 27 ++++++-------- Orange/tests/test_distances.py | 42 +++++++++++----------- 4 files changed, 61 insertions(+), 93 deletions(-) diff --git a/Orange/distance/base.py b/Orange/distance/base.py index b43efa0d2c1..83e2ae5cd90 100644 --- a/Orange/distance/base.py +++ b/Orange/distance/base.py @@ -1,6 +1,5 @@ import numpy as np import sklearn.metrics as skl_metrics -from scipy.sparse import issparse, csr_matrix from Orange.data import Table, Domain, Instance, RowInstance from Orange.misc import DistMatrix @@ -507,51 +506,3 @@ def __call__(self, e1, e2=None, axis=1, impute=False): else: dist_matrix = DistMatrix(dist) return dist_matrix - -class SparseJaccard: - """ - Fallback for `Jaccard` on sparse data or raw numpy arrays. If data is - sparse, data normalized with intersection/union. Sklearn's function can't - handle discrete or missing data and normalization. - """ - - def __call__(self, e1, e2=None, axis=1, impute=False): - x1 = _orange_to_numpy(e1) - x2 = _orange_to_numpy(e2) - if axis == 0: - x1 = x1.T - if x2 is not None: - x2 = x2.T - if issparse(x1): - dist = self.sparse_jaccard(x1, x2) - else: - dist = skl_metrics.pairwise.pairwise_distances(x1, - x2, - metric="jaccard") - if impute and np.isnan(dist).any(): - dist = np.nan_to_num(dist) - if isinstance(e1, (Table, RowInstance)): - dist_matrix = DistMatrix(dist, e1, e2, axis) - else: - dist_matrix = DistMatrix(dist) - return dist_matrix - - def sparse_jaccard(self, x1, x2=None): - symmetric = x2 is None - if symmetric: - x2 = x1 - x1 = csr_matrix(x1) - x1.eliminate_zeros() - x2 = csr_matrix(x2) - x2.eliminate_zeros() - n, m = x1.shape[0], x2.shape[0] - matrix = np.zeros((n, m)) - for i in range(n): - xi_ind = set(x1[i].indices) - for j in range(i if symmetric else m): - jacc = 1 - len(xi_ind.intersection(x2[j].indices))\ - / len(set(x1[i].indices).union(x1[j].indices)) - matrix[i, j] = jacc - if symmetric: - matrix[j, i] = jacc - return matrix diff --git a/Orange/distance/distance.py b/Orange/distance/distance.py index feb669c6d7c..3f27632b5ba 100644 --- a/Orange/distance/distance.py +++ b/Orange/distance/distance.py @@ -3,6 +3,7 @@ import numpy as np from scipy import stats +from scipy import sparse as sp import sklearn.metrics as skl_metrics from sklearn.utils.extmath import row_norms, safe_sparse_dot from sklearn.metrics import pairwise_distances @@ -11,7 +12,7 @@ from Orange.statistics import util from .base import (Distance, DistanceModel, FittedDistance, FittedDistanceModel, - SklDistance, _orange_to_numpy, SparseJaccard) + SklDistance, _orange_to_numpy) class EuclideanRowsModel(FittedDistanceModel): """ @@ -395,6 +396,9 @@ def compute_distances(self, x1, x2): compute distances between rows without missing values, and a slower loop for those with missing values. """ + if sp.issparse(x1): + return self.sparse_jaccard(x1, x2) + nonzeros1 = np.not_equal(x1, 0).view(np.int8) if self.axis == 1: nans1 = _distance.any_nan_row(x1) @@ -414,11 +418,30 @@ def compute_distances(self, x1, x2): return _distance.jaccard_cols( nonzeros1, x1, nans1, self.ps) + def sparse_jaccard(self, x1, x2=None): + symmetric = x2 is None + if symmetric: + x2 = x1 + x1 = sp.csr_matrix(x1) + x1.eliminate_zeros() + x2 = sp.csr_matrix(x2) + x2.eliminate_zeros() + n, m = x1.shape[0], x2.shape[0] + matrix = np.zeros((n, m)) + for i in range(n): + xi_ind = set(x1[i].indices) + for j in range(i if symmetric else m): + jacc = 1 - len(xi_ind.intersection(x2[j].indices))\ + / len(set(x1[i].indices).union(x1[j].indices)) + matrix[i, j] = jacc + if symmetric: + matrix[j, i] = jacc + return matrix + class Jaccard(FittedDistance): supports_sparse = True supports_discrete = True - fallback = SparseJaccard() ModelType = JaccardModel def fit_rows(self, attributes, x, n_vals): @@ -426,9 +449,12 @@ def fit_rows(self, attributes, x, n_vals): Return a model for computation of Jaccard values. The model stores frequencies of non-zero values per each column. """ - ps = np.fromiter( - (_distance.p_nonzero(x[:, col]) for col in range(len(n_vals))), - dtype=np.double, count=len(n_vals)) + if sp.issparse(x): + ps = None # wrong! + else: + ps = np.fromiter( + (_distance.p_nonzero(x[:, col]) for col in range(len(n_vals))), + dtype=np.double, count=len(n_vals)) return JaccardModel(attributes, self.axis, self.impute, ps) fit_cols = fit_rows diff --git a/Orange/distance/tests/test_distance.py b/Orange/distance/tests/test_distance.py index 7c55f1d2a01..e056619fbf1 100644 --- a/Orange/distance/tests/test_distance.py +++ b/Orange/distance/tests/test_distance.py @@ -27,13 +27,18 @@ def test_no_data(self): def test_sparse(self): """Test sparse support in distances.""" + domain = Domain([ContinuousVariable(c) for c in "abc"]) + dense_data = Table.from_list( + domain, [[1, 0, 2], [-1, 5, 0], [0, 1, 1], [7, 0, 0]]) + sparse_data = Table(domain, csr_matrix(dense_data.X)) + if not self.Distance.supports_sparse: - self.assertRaises(TypeError, self.Distance, self.sparse_data) + self.assertRaises(TypeError, self.Distance, sparse_data) else: - # check the result is the same as for dense - dist_numpy = self.Distance(self.dense_X) - dist_sparse = self.Distance(self.sparse_data) - np.testing.assert_allclose(dist_sparse, dist_numpy) + # check the result is the same for sparse and dense + dist_dense = self.Distance(dense_data) + dist_sparse = self.Distance(sparse_data) + np.testing.assert_allclose(dist_sparse, dist_dense) class CommonFittedTests(CommonTests): @@ -146,12 +151,6 @@ def setUp(self): self.mixed_data = self.data = Table.from_numpy( self.domain, np.hstack((self.cont_data.X[:3], self.disc_data.X))) - self.dense_X = np.array([[1, 0, 2], - [-1, 5, 0], - [0, 1, 1], - [7, 0, 0]]) - self.sparse_data = Table(csr_matrix(self.dense_X)) - # Correct results in these tests were computed manually or with Excel; @@ -846,12 +845,6 @@ def setUp(self): [1, 0, 1], [1, 0, 0]]) - self.dense_X = np.array([[1, 0, 2], - [-1, 5, 0], - [0, 1, 1], - [7, 0, 0]]) - self.sparse_data = Table(csr_matrix(self.dense_X)) - def test_jaccard_rows(self): assert_almost_equal = np.testing.assert_almost_equal diff --git a/Orange/tests/test_distances.py b/Orange/tests/test_distances.py index 571ad274e32..24e914db49a 100644 --- a/Orange/tests/test_distances.py +++ b/Orange/tests/test_distances.py @@ -9,7 +9,6 @@ import scipy.spatial import scipy.stats from scipy.sparse import csr_matrix -from sklearn.exceptions import DataConversionWarning from Orange.data import (Table, Domain, ContinuousVariable, DiscreteVariable, StringVariable, Instance) @@ -500,27 +499,26 @@ def test_jaccard_distance_many_examples(self): [0., 0., 0.5]])) def test_jaccard_distance_numpy(self): - with self.assertWarns(DataConversionWarning): - np.testing.assert_almost_equal( - self.dist(self.titanic[0].x, self.titanic[2].x, axis=1), - np.array([[0.5]])) - np.testing.assert_almost_equal( - self.dist(self.titanic.X), - np.array([[0., 0., 0.5, 0.5], - [0., 0., 0.5, 0.5], - [0.5, 0.5, 0., 0.], - [0.5, 0.5, 0., 0.]])) - np.testing.assert_almost_equal( - self.dist(self.titanic[2].x, self.titanic[:3].X), - np.array([[0.5, 0.5, 0.]])) - np.testing.assert_almost_equal( - self.dist(self.titanic[:2].X, self.titanic[3].x), - np.array([[0.5], - [0.5]])) - np.testing.assert_almost_equal( - self.dist(self.titanic[:2].X, self.titanic[:3].X), - np.array([[0., 0., 0.5], - [0., 0., 0.5]])) + np.testing.assert_almost_equal( + self.dist(self.titanic[0].x, self.titanic[2].x, axis=1), + np.array([[0.5]])) + np.testing.assert_almost_equal( + self.dist(self.titanic.X), + np.array([[0., 0., 0.5, 0.5], + [0., 0., 0.5, 0.5], + [0.5, 0.5, 0., 0.], + [0.5, 0.5, 0., 0.]])) + np.testing.assert_almost_equal( + self.dist(self.titanic[2].x, self.titanic[:3].X), + np.array([[0.5, 0.5, 0.]])) + np.testing.assert_almost_equal( + self.dist(self.titanic[:2].X, self.titanic[3].x), + np.array([[0.5], + [0.5]])) + np.testing.assert_almost_equal( + self.dist(self.titanic[:2].X, self.titanic[:3].X), + np.array([[0., 0., 0.5], + [0., 0., 0.5]])) # noinspection PyTypeChecker From 5cf9802ccebda2f94d82d280ad073bce3b55e2ef Mon Sep 17 00:00:00 2001 From: janezd Date: Wed, 13 Mar 2019 17:07:12 +0100 Subject: [PATCH 7/9] Cosine distance: Fix clipping --- Orange/distance/distance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Orange/distance/distance.py b/Orange/distance/distance.py index 3f27632b5ba..a1005fa75d8 100644 --- a/Orange/distance/distance.py +++ b/Orange/distance/distance.py @@ -369,7 +369,7 @@ def prepare_data(x): data1 = prepare_data(x1) data2 = data1 if x2 is None else prepare_data(x2) dist = safe_sparse_dot(data1, data2.T) - np.clip(dist, 0, 1, out=dist) + np.clip(dist, -1, 1, out=dist) if x2 is None: diag = np.diag_indices_from(dist) dist[diag] = np.where(np.isnan(dist[diag]), np.nan, 1.0) From 9d84d8516f25c0d04808d7d2745594c88223d28a Mon Sep 17 00:00:00 2001 From: Ajda Pretnar Date: Thu, 14 Mar 2019 15:39:52 +0100 Subject: [PATCH 8/9] Section code and extend fitter --- Orange/distance/distance.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/Orange/distance/distance.py b/Orange/distance/distance.py index a1005fa75d8..609f6b6f9b1 100644 --- a/Orange/distance/distance.py +++ b/Orange/distance/distance.py @@ -386,6 +386,12 @@ def __init__(self, attributes, axis, impute, ps): self.ps = ps def compute_distances(self, x1, x2): + if sp.issparse(x1): + return self._compute_sparse(x1, x2) + else: + return self._compute_dense(x1, x2) + + def _compute_dense(self, x1, x2): """ The method uses a function implemented in Cython. Data (`x1` and `x2`) is accompanied by two tables. One is a 2-d table in which elements of @@ -396,9 +402,6 @@ def compute_distances(self, x1, x2): compute distances between rows without missing values, and a slower loop for those with missing values. """ - if sp.issparse(x1): - return self.sparse_jaccard(x1, x2) - nonzeros1 = np.not_equal(x1, 0).view(np.int8) if self.axis == 1: nans1 = _distance.any_nan_row(x1) @@ -418,7 +421,7 @@ def compute_distances(self, x1, x2): return _distance.jaccard_cols( nonzeros1, x1, nans1, self.ps) - def sparse_jaccard(self, x1, x2=None): + def _compute_sparse(self, x1, x2=None): symmetric = x2 is None if symmetric: x2 = x1 @@ -450,7 +453,7 @@ def fit_rows(self, attributes, x, n_vals): frequencies of non-zero values per each column. """ if sp.issparse(x): - ps = None # wrong! + ps = x.getnnz(axis=0) else: ps = np.fromiter( (_distance.p_nonzero(x[:, col]) for col in range(len(n_vals))), From a30e68885317e020a8f0ed3fd8050f328cb0b019 Mon Sep 17 00:00:00 2001 From: janezd Date: Fri, 15 Mar 2019 14:51:35 +0100 Subject: [PATCH 9/9] OWDistance: Minor reformatting --- Orange/widgets/unsupervised/owdistances.py | 37 ++++++++++------------ 1 file changed, 16 insertions(+), 21 deletions(-) diff --git a/Orange/widgets/unsupervised/owdistances.py b/Orange/widgets/unsupervised/owdistances.py index 9f9ebe03908..18aac47daff 100644 --- a/Orange/widgets/unsupervised/owdistances.py +++ b/Orange/widgets/unsupervised/owdistances.py @@ -118,12 +118,11 @@ def _check_sparse(): def _fix_discrete(): nonlocal data - if data.domain.has_discrete_attributes() and ( - issparse(data.X) and getattr(metric, "fallback", - None) and metric is not - distance.Jaccard - or not metric.supports_discrete - or self.axis == 1 and metric is not distance.Jaccard): + if data.domain.has_discrete_attributes() \ + and metric is not distance.Jaccard \ + and (issparse(data.X) and getattr(metric, "fallback", None) + or not metric.supports_discrete + or self.axis == 1): if not data.domain.has_continuous_attributes(): self.Error.no_continuous_features() return False @@ -133,19 +132,15 @@ def _fix_discrete(): def _fix_nonbinary(): nonlocal data - if metric is distance.Jaccard: - if issparse(data.X): - # do not remove non-binary - return True - else: - nbinary = sum(a.is_discrete and len(a.values) == 2 - for a in data.domain.attributes) - if not nbinary: - self.Error.no_binary_features() - return False - elif nbinary < len(data.domain.attributes): - self.Warning.ignoring_nonbinary() - data = distance.remove_nonbinary_features(data) + if metric is distance.Jaccard and not issparse(data.X): + nbinary = sum(a.is_discrete and len(a.values) == 2 + for a in data.domain.attributes) + if not nbinary: + self.Error.no_binary_features() + return False + elif nbinary < len(data.domain.attributes): + self.Warning.ignoring_nonbinary() + data = distance.remove_nonbinary_features(data) return True def _fix_missing(): @@ -157,11 +152,11 @@ def _fix_missing(): self.clear_messages() if data is None: - return + return None for check in (_check_sparse, _fix_discrete, _fix_missing, _fix_nonbinary): if not check(): - return + return None try: if metric.supports_normalization and self.normalized_dist: return metric(data, axis=1 - self.axis, impute=True,