diff --git a/Orange/distance/base.py b/Orange/distance/base.py
index ee84001d712..83e2ae5cd90 100644
--- a/Orange/distance/base.py
+++ b/Orange/distance/base.py
@@ -13,6 +13,7 @@
 # TODO this *private* function is called from several widgets to prepare
 # data for calling the below classes. After we (mostly) stopped relying
 # on sklearn.metrics, this is (mostly) unnecessary
+
 def _preprocess(table, impute=True):
     """Remove categorical attributes and impute missing values."""
     if not len(table):
@@ -273,8 +274,9 @@ def __init__(self, attributes, axis=1, impute=False):
         self.attributes = attributes
 
     def __call__(self, e1, e2=None):
-        if e1.domain.attributes != self.attributes or \
-                e2 is not None and e2.domain.attributes != self.attributes:
+        if self.attributes is not None and (
+                e1.domain.attributes != self.attributes
+                or e2 is not None and e2.domain.attributes != self.attributes):
             raise ValueError("mismatching domains")
         return super().__call__(e1, e2)
 
@@ -348,12 +350,17 @@ def fit(self, data):
         Prepare the data on attributes, call `fit_cols` or `fit_rows` and
         return the resulting model.
         """
-        attributes = data.domain.attributes
         x = _orange_to_numpy(data)
-        n_vals = np.fromiter(
-            (len(attr.values) if attr.is_discrete else 0
-             for attr in attributes),
-            dtype=np.int32, count=len(attributes))
+        if hasattr(data, "domain"):
+            attributes = data.domain.attributes
+            n_vals = np.fromiter(
+                (len(attr.values) if attr.is_discrete else 0
+                 for attr in attributes),
+                dtype=np.int32, count=len(attributes))
+        else:
+            assert isinstance(x, np.ndarray)
+            attributes = None
+            n_vals = np.zeros(x.shape[1], dtype=np.int32)
         return [self.fit_cols, self.fit_rows][self.axis](attributes, x, n_vals)
 
     def fit_cols(self, attributes, x, n_vals):
diff --git a/Orange/distance/distance.py b/Orange/distance/distance.py
index fd4f01cf250..609f6b6f9b1 100644
--- a/Orange/distance/distance.py
+++ b/Orange/distance/distance.py
@@ -3,6 +3,7 @@
 
 import numpy as np
 from scipy import stats
+from scipy import sparse as sp
 import sklearn.metrics as skl_metrics
 from sklearn.utils.extmath import row_norms, safe_sparse_dot
 from sklearn.metrics import pairwise_distances
@@ -368,7 +369,7 @@ def prepare_data(x):
         data1 = prepare_data(x1)
         data2 = data1 if x2 is None else prepare_data(x2)
         dist = safe_sparse_dot(data1, data2.T)
-        np.clip(dist, 0, 1, out=dist)
+        np.clip(dist, -1, 1, out=dist)
         if x2 is None:
             diag = np.diag_indices_from(dist)
             dist[diag] = np.where(np.isnan(dist[diag]), np.nan, 1.0)
@@ -385,6 +386,12 @@ def __init__(self, attributes, axis, impute, ps):
         self.ps = ps
 
     def compute_distances(self, x1, x2):
+        if sp.issparse(x1):
+            return self._compute_sparse(x1, x2)
+        else:
+            return self._compute_dense(x1, x2)
+
+    def _compute_dense(self, x1, x2):
         """
         The method uses a function implemented in Cython. Data (`x1` and `x2`)
         is accompanied by two tables. One is a 2-d table in which elements of
@@ -414,11 +421,30 @@ def compute_distances(self, x1, x2):
             return _distance.jaccard_cols(
                 nonzeros1, x1, nans1, self.ps)
 
+    def _compute_sparse(self, x1, x2=None):
+        symmetric = x2 is None
+        if symmetric:
+            x2 = x1
+        x1 = sp.csr_matrix(x1)
+        x1.eliminate_zeros()
+        x2 = sp.csr_matrix(x2)
+        x2.eliminate_zeros()
+        n, m = x1.shape[0], x2.shape[0]
+        matrix = np.zeros((n, m))
+        for i in range(n):
+            xi_ind = set(x1[i].indices)
+            for j in range(i if symmetric else m):
+                jacc = 1 - len(xi_ind.intersection(x2[j].indices))\
+                    / len(xi_ind.union(x2[j].indices))
+                matrix[i, j] = jacc
+                if symmetric:
+                    matrix[j, i] = jacc
+        return matrix
+
 
 class Jaccard(FittedDistance):
-    supports_sparse = False
+    supports_sparse = True
     supports_discrete = True
-    fallback = SklDistance('jaccard')
     ModelType = JaccardModel
 
     def fit_rows(self, attributes, x, n_vals):
@@ -426,9 +452,12 @@ def fit_rows(self, attributes, x, n_vals):
         Return a model for computation of Jaccard values. The model stores
         frequencies of non-zero values per each column.
         """
-        ps = np.fromiter(
-            (_distance.p_nonzero(x[:, col]) for col in range(len(n_vals))),
-            dtype=np.double, count=len(n_vals))
+        if sp.issparse(x):
+            ps = x.getnnz(axis=0)
+        else:
+            ps = np.fromiter(
+                (_distance.p_nonzero(x[:, col]) for col in range(len(n_vals))),
+                dtype=np.double, count=len(n_vals))
         return JaccardModel(attributes, self.axis, self.impute, ps)
 
     fit_cols = fit_rows
diff --git a/Orange/distance/tests/test_distance.py b/Orange/distance/tests/test_distance.py
index ef7b691651e..e056619fbf1 100644
--- a/Orange/distance/tests/test_distance.py
+++ b/Orange/distance/tests/test_distance.py
@@ -27,11 +27,18 @@ def test_no_data(self):
 
     def test_sparse(self):
         """Test sparse support in distances."""
-        sparse_iris = csr_matrix(Table('iris').X)
+        domain = Domain([ContinuousVariable(c) for c in "abc"])
+        dense_data = Table.from_list(
+            domain, [[1, 0, 2], [-1, 5, 0], [0, 1, 1], [7, 0, 0]])
+        sparse_data = Table(domain, csr_matrix(dense_data.X))
+
         if not self.Distance.supports_sparse:
-            self.assertRaises(TypeError, self.Distance, sparse_iris)
+            self.assertRaises(TypeError, self.Distance, sparse_data)
         else:
-            self.Distance(sparse_iris)
+            # check the result is the same for sparse and dense
+            dist_dense = self.Distance(dense_data)
+            dist_sparse = self.Distance(sparse_data)
+            np.testing.assert_allclose(dist_sparse, dist_dense)
 
 
 class CommonFittedTests(CommonTests):
diff --git a/Orange/tests/test_distances.py b/Orange/tests/test_distances.py
index 571ad274e32..24e914db49a 100644
--- a/Orange/tests/test_distances.py
+++ b/Orange/tests/test_distances.py
@@ -9,7 +9,6 @@
 import scipy.spatial
 import scipy.stats
 from scipy.sparse import csr_matrix
-from sklearn.exceptions import DataConversionWarning
 
 from Orange.data import (Table, Domain, ContinuousVariable,
                          DiscreteVariable, StringVariable, Instance)
@@ -500,27 +499,26 @@ def test_jaccard_distance_many_examples(self):
                       [0., 0., 0.5]]))
 
     def test_jaccard_distance_numpy(self):
-        with self.assertWarns(DataConversionWarning):
-            np.testing.assert_almost_equal(
-                self.dist(self.titanic[0].x, self.titanic[2].x, axis=1),
-                np.array([[0.5]]))
-            np.testing.assert_almost_equal(
-                self.dist(self.titanic.X),
-                np.array([[0., 0., 0.5, 0.5],
-                          [0., 0., 0.5, 0.5],
-                          [0.5, 0.5, 0., 0.],
-                          [0.5, 0.5, 0., 0.]]))
-            np.testing.assert_almost_equal(
-                self.dist(self.titanic[2].x, self.titanic[:3].X),
-                np.array([[0.5, 0.5, 0.]]))
-            np.testing.assert_almost_equal(
-                self.dist(self.titanic[:2].X, self.titanic[3].x),
-                np.array([[0.5],
-                          [0.5]]))
-            np.testing.assert_almost_equal(
-                self.dist(self.titanic[:2].X, self.titanic[:3].X),
-                np.array([[0., 0., 0.5],
-                          [0., 0., 0.5]]))
+        np.testing.assert_almost_equal(
+            self.dist(self.titanic[0].x, self.titanic[2].x, axis=1),
+            np.array([[0.5]]))
+        np.testing.assert_almost_equal(
+            self.dist(self.titanic.X),
+            np.array([[0., 0., 0.5, 0.5],
+                      [0., 0., 0.5, 0.5],
+                      [0.5, 0.5, 0., 0.],
+                      [0.5, 0.5, 0., 0.]]))
+        np.testing.assert_almost_equal(
+            self.dist(self.titanic[2].x, self.titanic[:3].X),
+            np.array([[0.5, 0.5, 0.]]))
+        np.testing.assert_almost_equal(
+            self.dist(self.titanic[:2].X, self.titanic[3].x),
+            np.array([[0.5],
+                      [0.5]]))
+        np.testing.assert_almost_equal(
+            self.dist(self.titanic[:2].X, self.titanic[:3].X),
+            np.array([[0., 0., 0.5],
+                      [0., 0., 0.5]]))
 
 
 # noinspection PyTypeChecker
diff --git a/Orange/widgets/unsupervised/owdistances.py b/Orange/widgets/unsupervised/owdistances.py
index ead54502ae5..18aac47daff 100644
--- a/Orange/widgets/unsupervised/owdistances.py
+++ b/Orange/widgets/unsupervised/owdistances.py
@@ -118,10 +118,11 @@ def _check_sparse():
 
         def _fix_discrete():
             nonlocal data
-            if data.domain.has_discrete_attributes() and (
-                    issparse(data.X) and getattr(metric, "fallback", None)
-                    or not metric.supports_discrete
-                    or self.axis == 1 and metric is not distance.Jaccard):
+            if data.domain.has_discrete_attributes() \
+                    and metric is not distance.Jaccard \
+                    and (issparse(data.X) and getattr(metric, "fallback", None)
+                         or not metric.supports_discrete
+                         or self.axis == 1):
                 if not data.domain.has_continuous_attributes():
                     self.Error.no_continuous_features()
                     return False
@@ -131,7 +132,7 @@
 
         def _fix_nonbinary():
             nonlocal data
-            if metric is distance.Jaccard:
+            if metric is distance.Jaccard and not issparse(data.X):
                 nbinary = sum(a.is_discrete and len(a.values) == 2
                               for a in data.domain.attributes)
                 if not nbinary:
@@ -151,11 +152,11 @@ def _fix_missing():
 
         self.clear_messages()
         if data is None:
-            return
+            return None
         for check in (_check_sparse, _fix_discrete, _fix_missing,
                       _fix_nonbinary):
             if not check():
-                return
+                return None
         try:
             if metric.supports_normalization and self.normalized_dist:
                 return metric(data, axis=1 - self.axis, impute=True,
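
Note on the new `JaccardModel._compute_sparse`: for two rows, the Jaccard distance is 1 - |A ∩ B| / |A ∪ B|, where A and B are the sets of columns with nonzero values in each row. The following is a minimal standalone sketch of that computation in plain numpy/scipy, outside Orange; the `sparse_jaccard` name and the 0.0 convention for two all-zero rows are illustrative assumptions, not part of the patch.

    import numpy as np
    from scipy import sparse as sp
    from scipy.spatial.distance import cdist

    def sparse_jaccard(x1, x2=None):
        # Row-wise Jaccard distances; any nonzero entry counts as set membership.
        symmetric = x2 is None
        if symmetric:
            x2 = x1
        x1, x2 = sp.csr_matrix(x1), sp.csr_matrix(x2)
        x1.eliminate_zeros()
        x2.eliminate_zeros()
        dist = np.zeros((x1.shape[0], x2.shape[0]))
        for i in range(x1.shape[0]):
            row_i = set(x1[i].indices)          # nonzero columns of row i
            for j in range(i if symmetric else x2.shape[0]):
                row_j = set(x2[j].indices)
                union = row_i | row_j
                d = (1 - len(row_i & row_j) / len(union)) if union else 0.0
                dist[i, j] = d
                if symmetric:
                    dist[j, i] = d              # mirror into the upper triangle
        return dist

    # Sanity check against scipy's dense boolean Jaccard, on the same data
    # the new test uses.
    x = np.array([[1, 0, 2], [-1, 5, 0], [0, 1, 1], [7, 0, 0]], dtype=float)
    assert np.allclose(sparse_jaccard(sp.csr_matrix(x)),
                       cdist(x != 0, x != 0, metric="jaccard"))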
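
With `supports_sparse = True` and the `SklDistance('jaccard')` fallback removed, Jaccard distances can be computed on sparse tables directly. A usage sketch mirroring the new test, assuming the patch above is applied (variable names are illustrative):

    import numpy as np
    from scipy.sparse import csr_matrix
    from Orange.data import Domain, ContinuousVariable, Table
    from Orange.distance import Jaccard

    domain = Domain([ContinuousVariable(c) for c in "abc"])
    dense = Table.from_list(
        domain, [[1, 0, 2], [-1, 5, 0], [0, 1, 1], [7, 0, 0]])
    sparse = Table(domain, csr_matrix(dense.X))

    # The sparse path should produce the same row-wise distances as the dense one.
    np.testing.assert_allclose(Jaccard(sparse), Jaccard(dense))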