biolab · janezd · Mar 15, 2019 · Mar 1, 2019 · Mar 4, 2019 · Mar 4, 2019
diff --git a/Orange/distance/base.py b/Orange/distance/base.py
@@ -13,6 +13,7 @@
 # TODO this *private* function is called from several widgets to prepare
 # data for calling the below classes. After we (mostly) stopped relying
 # on sklearn.metrics, this is (mostly) unnecessary
+
 def _preprocess(table, impute=True):
     """Remove categorical attributes and impute missing values."""
     if not len(table):
@@ -273,8 +274,9 @@ def __init__(self, attributes, axis=1, impute=False):
         self.attributes = attributes
 
     def __call__(self, e1, e2=None):
-        if e1.domain.attributes != self.attributes or \
-                    e2 is not None and e2.domain.attributes != self.attributes:
+        if self.attributes is not None and (
+                e1.domain.attributes != self.attributes
+                or e2 is not None and e2.domain.attributes != self.attributes):
             raise ValueError("mismatching domains")
         return super().__call__(e1, e2)
 
@@ -348,12 +350,17 @@ def fit(self, data):
         Prepare the data on attributes, call `fit_cols` or `fit_rows` and
         return the resulting model.
         """
-        attributes = data.domain.attributes
         x = _orange_to_numpy(data)
-        n_vals = np.fromiter(
-            (len(attr.values) if attr.is_discrete else 0
-             for attr in attributes),
-            dtype=np.int32, count=len(attributes))
+        if hasattr(data, "domain"):
+            attributes = data.domain.attributes
+            n_vals = np.fromiter(
+                (len(attr.values) if attr.is_discrete else 0
+                 for attr in attributes),
+                dtype=np.int32, count=len(attributes))
+        else:
+            assert isinstance(x, np.ndarray)
+            attributes = None
+            n_vals = np.zeros(x.shape[1], dtype=np.int32)
         return [self.fit_cols, self.fit_rows][self.axis](attributes, x, n_vals)
 
     def fit_cols(self, attributes, x, n_vals):

diff --git a/Orange/distance/distance.py b/Orange/distance/distance.py
@@ -3,6 +3,7 @@
 
 import numpy as np
 from scipy import stats
+from scipy import sparse as sp
 import sklearn.metrics as skl_metrics
 from sklearn.utils.extmath import row_norms, safe_sparse_dot
 from sklearn.metrics import pairwise_distances
@@ -368,7 +369,7 @@ def prepare_data(x):
             data1 = prepare_data(x1)
             data2 = data1 if x2 is None else prepare_data(x2)
             dist = safe_sparse_dot(data1, data2.T)
-            np.clip(dist, 0, 1, out=dist)
+            np.clip(dist, -1, 1, out=dist)
             if x2 is None:
                 diag = np.diag_indices_from(dist)
                 dist[diag] = np.where(np.isnan(dist[diag]), np.nan, 1.0)
@@ -385,6 +386,12 @@ def __init__(self, attributes, axis, impute, ps):
         self.ps = ps
 
     def compute_distances(self, x1, x2):
+        if sp.issparse(x1):
+            return self._compute_sparse(x1, x2)
+        else:
+            return self._compute_dense(x1, x2)
+
+    def _compute_dense(self, x1, x2):
         """
         The method uses a function implemented in Cython. Data (`x1` and `x2`)
         is accompanied by two tables. One is a 2-d table in which elements of
@@ -414,21 +421,43 @@ def compute_distances(self, x1, x2):
             return _distance.jaccard_cols(
                 nonzeros1, x1, nans1, self.ps)
 
+    def _compute_sparse(self, x1, x2=None):
+        symmetric = x2 is None
+        if symmetric:
+            x2 = x1
+        x1 = sp.csr_matrix(x1)
+        x1.eliminate_zeros()
+        x2 = sp.csr_matrix(x2)
+        x2.eliminate_zeros()
+        n, m = x1.shape[0], x2.shape[0]
+        matrix = np.zeros((n, m))
+        for i in range(n):
+            xi_ind = set(x1[i].indices)
+            for j in range(i if symmetric else m):
+                jacc = 1 - len(xi_ind.intersection(x2[j].indices))\
+                           / len(set(x1[i].indices).union(x1[j].indices))
+                matrix[i, j] = jacc
+                if symmetric:
+                    matrix[j, i] = jacc
+        return matrix
+
 
 class Jaccard(FittedDistance):
-    supports_sparse = False
+    supports_sparse = True
     supports_discrete = True
-    fallback = SklDistance('jaccard')
     ModelType = JaccardModel
 
     def fit_rows(self, attributes, x, n_vals):
         """
         Return a model for computation of Jaccard values. The model stores
         frequencies of non-zero values per each column.
         """
-        ps = np.fromiter(
-            (_distance.p_nonzero(x[:, col]) for col in range(len(n_vals))),
-            dtype=np.double, count=len(n_vals))
+        if sp.issparse(x):
+            ps = x.getnnz(axis=0)
+        else:
+            ps = np.fromiter(
+                (_distance.p_nonzero(x[:, col]) for col in range(len(n_vals))),
+                dtype=np.double, count=len(n_vals))
         return JaccardModel(attributes, self.axis, self.impute, ps)
 
     fit_cols = fit_rows

diff --git a/Orange/distance/tests/test_distance.py b/Orange/distance/tests/test_distance.py
@@ -27,11 +27,18 @@ def test_no_data(self):
 
     def test_sparse(self):
         """Test sparse support in distances."""
-        sparse_iris = csr_matrix(Table('iris').X)
+        domain = Domain([ContinuousVariable(c) for c in "abc"])
+        dense_data = Table.from_list(
+            domain, [[1, 0, 2], [-1, 5, 0], [0, 1, 1], [7, 0, 0]])
+        sparse_data = Table(domain, csr_matrix(dense_data.X))
+
         if not self.Distance.supports_sparse:
-            self.assertRaises(TypeError, self.Distance, sparse_iris)
+            self.assertRaises(TypeError, self.Distance, sparse_data)
         else:
-            self.Distance(sparse_iris)
+            # check the result is the same for sparse and dense
+            dist_dense = self.Distance(dense_data)
+            dist_sparse = self.Distance(sparse_data)
+            np.testing.assert_allclose(dist_sparse, dist_dense)
 
 
 class CommonFittedTests(CommonTests):

diff --git a/Orange/tests/test_distances.py b/Orange/tests/test_distances.py
@@ -9,7 +9,6 @@
 import scipy.spatial
 import scipy.stats
 from scipy.sparse import csr_matrix
-from sklearn.exceptions import DataConversionWarning
 
 from Orange.data import (Table, Domain, ContinuousVariable,
                          DiscreteVariable, StringVariable, Instance)
@@ -500,27 +499,26 @@ def test_jaccard_distance_many_examples(self):
                       [0., 0., 0.5]]))
 
     def test_jaccard_distance_numpy(self):
-        with self.assertWarns(DataConversionWarning):
-            np.testing.assert_almost_equal(
-                self.dist(self.titanic[0].x, self.titanic[2].x, axis=1),
-                np.array([[0.5]]))
-            np.testing.assert_almost_equal(
-                self.dist(self.titanic.X),
-                np.array([[0., 0., 0.5, 0.5],
-                          [0., 0., 0.5, 0.5],
-                          [0.5, 0.5, 0., 0.],
-                          [0.5, 0.5, 0., 0.]]))
-            np.testing.assert_almost_equal(
-                self.dist(self.titanic[2].x, self.titanic[:3].X),
-                np.array([[0.5, 0.5, 0.]]))
-            np.testing.assert_almost_equal(
-                self.dist(self.titanic[:2].X, self.titanic[3].x),
-                np.array([[0.5],
-                          [0.5]]))
-            np.testing.assert_almost_equal(
-                self.dist(self.titanic[:2].X, self.titanic[:3].X),
-                np.array([[0., 0., 0.5],
-                          [0., 0., 0.5]]))
+        np.testing.assert_almost_equal(
+            self.dist(self.titanic[0].x, self.titanic[2].x, axis=1),
+            np.array([[0.5]]))
+        np.testing.assert_almost_equal(
+            self.dist(self.titanic.X),
+            np.array([[0., 0., 0.5, 0.5],
+                      [0., 0., 0.5, 0.5],
+                      [0.5, 0.5, 0., 0.],
+                      [0.5, 0.5, 0., 0.]]))
+        np.testing.assert_almost_equal(
+            self.dist(self.titanic[2].x, self.titanic[:3].X),
+            np.array([[0.5, 0.5, 0.]]))
+        np.testing.assert_almost_equal(
+            self.dist(self.titanic[:2].X, self.titanic[3].x),
+            np.array([[0.5],
+                      [0.5]]))
+        np.testing.assert_almost_equal(
+            self.dist(self.titanic[:2].X, self.titanic[:3].X),
+            np.array([[0., 0., 0.5],
+                      [0., 0., 0.5]]))
 
 
 # noinspection PyTypeChecker

diff --git a/Orange/widgets/unsupervised/owdistances.py b/Orange/widgets/unsupervised/owdistances.py
@@ -118,10 +118,11 @@ def _check_sparse():
 
         def _fix_discrete():
             nonlocal data
-            if data.domain.has_discrete_attributes() and (
-                    issparse(data.X) and getattr(metric, "fallback", None)
-                    or not metric.supports_discrete
-                    or self.axis == 1 and metric is not distance.Jaccard):
+            if data.domain.has_discrete_attributes() \
+                    and metric is not distance.Jaccard \
+                    and (issparse(data.X) and getattr(metric, "fallback", None)
+                         or not metric.supports_discrete
+                         or self.axis == 1):
                 if not data.domain.has_continuous_attributes():
                     self.Error.no_continuous_features()
                     return False
@@ -131,7 +132,7 @@ def _fix_discrete():
 
         def _fix_nonbinary():
             nonlocal data
-            if metric is distance.Jaccard:
+            if metric is distance.Jaccard and not issparse(data.X):
                 nbinary = sum(a.is_discrete and len(a.values) == 2
                               for a in data.domain.attributes)
                 if not nbinary:
@@ -151,11 +152,11 @@ def _fix_missing():
 
         self.clear_messages()
         if data is None:
-            return
+            return None
         for check in (_check_sparse,
                       _fix_discrete, _fix_missing, _fix_nonbinary):
             if not check():
-                return
+                return None
         try:
             if metric.supports_normalization and self.normalized_dist:
                 return metric(data, axis=1 - self.axis, impute=True,