From fa18777b6cccc6f7b00c1cb573fcce3b6f03f3f3 Mon Sep 17 00:00:00 2001
From: Ajda Pretnar <ajdapretnar@gmail.com>
Date: Fri, 1 Mar 2019 16:11:40 +0100
Subject: [PATCH 1/9] Sparse Jaccard

---
 Orange/distance/base.py     | 50 +++++++++++++++++++++++++++++++++++++
 Orange/distance/distance.py |  6 ++---
 2 files changed, 53 insertions(+), 3 deletions(-)

diff --git a/Orange/distance/base.py b/Orange/distance/base.py
index ee84001d712..89b1b97c5d7 100644
--- a/Orange/distance/base.py
+++ b/Orange/distance/base.py
@@ -1,5 +1,6 @@
 import numpy as np
 import sklearn.metrics as skl_metrics
+from scipy.sparse import issparse, csr_matrix
 
 from Orange.data import Table, Domain, Instance, RowInstance
 from Orange.misc import DistMatrix
@@ -13,6 +14,7 @@
 # TODO this *private* function is called from several widgets to prepare
 # data for calling the below classes. After we (mostly) stopped relying
 # on sklearn.metrics, this is (mostly) unnecessary
+
 def _preprocess(table, impute=True):
     """Remove categorical attributes and impute missing values."""
     if not len(table):
@@ -499,3 +501,51 @@ def __call__(self, e1, e2=None, axis=1, impute=False):
         else:
             dist_matrix = DistMatrix(dist)
         return dist_matrix
+
+class SparseJaccard:
+    """
+    Fallback for `Jaccard` on sparse data or raw numpy arrays. If data is
+    sparse, data normalized with intersection/union. Sklearn's function can't
+    handle discrete or missing data and normalization.
+    """
+
+    def __call__(self, e1, e2=None, axis=1, impute=False):
+        x1 = _orange_to_numpy(e1)
+        x2 = _orange_to_numpy(e2)
+        if axis == 0:
+            x1 = x1.T
+            if x2 is not None:
+                x2 = x2.T
+        if issparse(x1):
+            dist = self.sparse_jaccard(x1, x2)
+        else:
+            dist = skl_metrics.pairwise.pairwise_distances(x1,
+                                                           x2,
+                                                           metric="jaccard")
+        if impute and np.isnan(dist).any():
+            dist = np.nan_to_num(dist)
+        if isinstance(e1, (Table, RowInstance)):
+            dist_matrix = DistMatrix(dist, e1, e2, axis)
+        else:
+            dist_matrix = DistMatrix(dist)
+        return dist_matrix
+
+    def sparse_jaccard(self, x1, x2=None):
+        symmetric = x2 is None
+        if symmetric:
+            x2 = x1
+        x1 = csr_matrix(x1)
+        x1.eliminate_zeros()
+        x2 = csr_matrix(x2)
+        x2.eliminate_zeros()
+        n, m = x1.shape[0], x2.shape[0]
+        matrix = np.zeros((n, m))
+        for i in range(n):
+            xi_ind = set(x1[i].indices)
+            for j in range(i if symmetric else m):
+                jacc = 1 - len(xi_ind.intersection(x2[j].indices))\
+                           / len(set(x1[i].indices).union(x1[j].indices))
+                matrix[i, j] = jacc
+                if symmetric:
+                    matrix[j, i] = jacc
+        return matrix
diff --git a/Orange/distance/distance.py b/Orange/distance/distance.py
index fd4f01cf250..feb669c6d7c 100644
--- a/Orange/distance/distance.py
+++ b/Orange/distance/distance.py
@@ -11,7 +11,7 @@
 from Orange.statistics import util
 
 from .base import (Distance, DistanceModel, FittedDistance, FittedDistanceModel,
-                   SklDistance, _orange_to_numpy)
+                   SklDistance, _orange_to_numpy, SparseJaccard)
 
 class EuclideanRowsModel(FittedDistanceModel):
     """
@@ -416,9 +416,9 @@ def compute_distances(self, x1, x2):
 
 
 class Jaccard(FittedDistance):
-    supports_sparse = False
+    supports_sparse = True
     supports_discrete = True
-    fallback = SklDistance('jaccard')
+    fallback = SparseJaccard()
     ModelType = JaccardModel
 
     def fit_rows(self, attributes, x, n_vals):

From e7913977d9771713c15c83d019affcd3ab6be46a Mon Sep 17 00:00:00 2001
From: Ajda Pretnar <ajdapretnar@gmail.com>
Date: Mon, 4 Mar 2019 09:59:51 +0100
Subject: [PATCH 2/9] Do not remove nonbinary for sparse

---
 Orange/widgets/unsupervised/owdistances.py | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/Orange/widgets/unsupervised/owdistances.py b/Orange/widgets/unsupervised/owdistances.py
index ead54502ae5..91690a45907 100644
--- a/Orange/widgets/unsupervised/owdistances.py
+++ b/Orange/widgets/unsupervised/owdistances.py
@@ -132,14 +132,18 @@ def _fix_discrete():
         def _fix_nonbinary():
             nonlocal data
             if metric is distance.Jaccard:
-                nbinary = sum(a.is_discrete and len(a.values) == 2
-                              for a in data.domain.attributes)
-                if not nbinary:
-                    self.Error.no_binary_features()
-                    return False
-                elif nbinary < len(data.domain.attributes):
-                    self.Warning.ignoring_nonbinary()
-                    data = distance.remove_nonbinary_features(data)
+                if issparse(data.X):
+                    # do not remove non-binary
+                    return True
+                else:
+                    nbinary = sum(a.is_discrete and len(a.values) == 2
+                                  for a in data.domain.attributes)
+                    if not nbinary:
+                        self.Error.no_binary_features()
+                        return False
+                    elif nbinary < len(data.domain.attributes):
+                        self.Warning.ignoring_nonbinary()
+                        data = distance.remove_nonbinary_features(data)
             return True
 
         def _fix_missing():

From 70467ea1cc50326bda6b73139e5944bfea159131 Mon Sep 17 00:00:00 2001
From: Ajda Pretnar <ajdapretnar@gmail.com>
Date: Mon, 4 Mar 2019 14:38:05 +0100
Subject: [PATCH 3/9] Test sparse Jaccard

---
 Orange/distance/tests/test_distance.py | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/Orange/distance/tests/test_distance.py b/Orange/distance/tests/test_distance.py
index ef7b691651e..7c55f1d2a01 100644
--- a/Orange/distance/tests/test_distance.py
+++ b/Orange/distance/tests/test_distance.py
@@ -27,11 +27,13 @@ def test_no_data(self):
 
     def test_sparse(self):
         """Test sparse support in distances."""
-        sparse_iris = csr_matrix(Table('iris').X)
         if not self.Distance.supports_sparse:
-            self.assertRaises(TypeError, self.Distance, sparse_iris)
+            self.assertRaises(TypeError, self.Distance, self.sparse_data)
         else:
-            self.Distance(sparse_iris)
+            # check the result is the same as for dense
+            dist_numpy = self.Distance(self.dense_X)
+            dist_sparse = self.Distance(self.sparse_data)
+            np.testing.assert_allclose(dist_sparse, dist_numpy)
 
 
 class CommonFittedTests(CommonTests):
@@ -144,6 +146,12 @@ def setUp(self):
         self.mixed_data = self.data = Table.from_numpy(
             self.domain, np.hstack((self.cont_data.X[:3], self.disc_data.X)))
 
+        self.dense_X = np.array([[1, 0, 2],
+                                 [-1, 5, 0],
+                                 [0, 1, 1],
+                                 [7, 0, 0]])
+        self.sparse_data = Table(csr_matrix(self.dense_X))
+
 
 
 # Correct results in these tests were computed manually or with Excel;
@@ -838,6 +846,12 @@ def setUp(self):
              [1, 0, 1],
              [1, 0, 0]])
 
+        self.dense_X = np.array([[1, 0, 2],
+                      [-1, 5, 0],
+                      [0, 1, 1],
+                      [7, 0, 0]])
+        self.sparse_data = Table(csr_matrix(self.dense_X))
+
     def test_jaccard_rows(self):
         assert_almost_equal = np.testing.assert_almost_equal
 

From 7ba8c0b12fedaabd1acc55b8ce16e77aade8ce40 Mon Sep 17 00:00:00 2001
From: Ajda Pretnar <ajdapretnar@gmail.com>
Date: Mon, 4 Mar 2019 15:04:37 +0100
Subject: [PATCH 4/9] Disable check for Jaccard

---
 Orange/widgets/unsupervised/owdistances.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/Orange/widgets/unsupervised/owdistances.py b/Orange/widgets/unsupervised/owdistances.py
index 91690a45907..9f9ebe03908 100644
--- a/Orange/widgets/unsupervised/owdistances.py
+++ b/Orange/widgets/unsupervised/owdistances.py
@@ -119,7 +119,9 @@ def _check_sparse():
         def _fix_discrete():
             nonlocal data
             if data.domain.has_discrete_attributes() and (
-                    issparse(data.X) and getattr(metric, "fallback", None)
+                    issparse(data.X) and getattr(metric, "fallback",
+                                                 None) and metric is not
+                                distance.Jaccard
                     or not metric.supports_discrete
                     or self.axis == 1 and metric is not distance.Jaccard):
                 if not data.domain.has_continuous_attributes():

From 551effc45154da84bc60cb76e73f0854e750da5a Mon Sep 17 00:00:00 2001
From: janezd <janez.demsar@fri.uni-lj.si>
Date: Tue, 12 Mar 2019 23:02:03 +0100
Subject: [PATCH 5/9] Distances: Support numpy arrays without fallbacks

---
 Orange/distance/base.py | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/Orange/distance/base.py b/Orange/distance/base.py
index 89b1b97c5d7..b43efa0d2c1 100644
--- a/Orange/distance/base.py
+++ b/Orange/distance/base.py
@@ -275,8 +275,9 @@ def __init__(self, attributes, axis=1, impute=False):
         self.attributes = attributes
 
     def __call__(self, e1, e2=None):
-        if e1.domain.attributes != self.attributes or \
-                    e2 is not None and e2.domain.attributes != self.attributes:
+        if self.attributes is not None and (
+                e1.domain.attributes != self.attributes
+                or e2 is not None and e2.domain.attributes != self.attributes):
             raise ValueError("mismatching domains")
         return super().__call__(e1, e2)
 
@@ -350,12 +351,17 @@ def fit(self, data):
         Prepare the data on attributes, call `fit_cols` or `fit_rows` and
         return the resulting model.
         """
-        attributes = data.domain.attributes
         x = _orange_to_numpy(data)
-        n_vals = np.fromiter(
-            (len(attr.values) if attr.is_discrete else 0
-             for attr in attributes),
-            dtype=np.int32, count=len(attributes))
+        if hasattr(data, "domain"):
+            attributes = data.domain.attributes
+            n_vals = np.fromiter(
+                (len(attr.values) if attr.is_discrete else 0
+                 for attr in attributes),
+                dtype=np.int32, count=len(attributes))
+        else:
+            assert isinstance(x, np.ndarray)
+            attributes = None
+            n_vals = np.zeros(x.shape[1], dtype=np.int32)
         return [self.fit_cols, self.fit_rows][self.axis](attributes, x, n_vals)
 
     def fit_cols(self, attributes, x, n_vals):

From d962655dbb60c9941c1c16d2983741ea8b4d2adf Mon Sep 17 00:00:00 2001
From: janezd <janez.demsar@fri.uni-lj.si>
Date: Tue, 12 Mar 2019 23:02:49 +0100
Subject: [PATCH 6/9] Jaccard distance: Move from a fallback to its own class

---
 Orange/distance/base.py                | 49 --------------------------
 Orange/distance/distance.py            | 36 ++++++++++++++++---
 Orange/distance/tests/test_distance.py | 27 ++++++--------
 Orange/tests/test_distances.py         | 42 +++++++++++-----------
 4 files changed, 61 insertions(+), 93 deletions(-)

diff --git a/Orange/distance/base.py b/Orange/distance/base.py
index b43efa0d2c1..83e2ae5cd90 100644
--- a/Orange/distance/base.py
+++ b/Orange/distance/base.py
@@ -1,6 +1,5 @@
 import numpy as np
 import sklearn.metrics as skl_metrics
-from scipy.sparse import issparse, csr_matrix
 
 from Orange.data import Table, Domain, Instance, RowInstance
 from Orange.misc import DistMatrix
@@ -507,51 +506,3 @@ def __call__(self, e1, e2=None, axis=1, impute=False):
         else:
             dist_matrix = DistMatrix(dist)
         return dist_matrix
-
-class SparseJaccard:
-    """
-    Fallback for `Jaccard` on sparse data or raw numpy arrays. If data is
-    sparse, data normalized with intersection/union. Sklearn's function can't
-    handle discrete or missing data and normalization.
-    """
-
-    def __call__(self, e1, e2=None, axis=1, impute=False):
-        x1 = _orange_to_numpy(e1)
-        x2 = _orange_to_numpy(e2)
-        if axis == 0:
-            x1 = x1.T
-            if x2 is not None:
-                x2 = x2.T
-        if issparse(x1):
-            dist = self.sparse_jaccard(x1, x2)
-        else:
-            dist = skl_metrics.pairwise.pairwise_distances(x1,
-                                                           x2,
-                                                           metric="jaccard")
-        if impute and np.isnan(dist).any():
-            dist = np.nan_to_num(dist)
-        if isinstance(e1, (Table, RowInstance)):
-            dist_matrix = DistMatrix(dist, e1, e2, axis)
-        else:
-            dist_matrix = DistMatrix(dist)
-        return dist_matrix
-
-    def sparse_jaccard(self, x1, x2=None):
-        symmetric = x2 is None
-        if symmetric:
-            x2 = x1
-        x1 = csr_matrix(x1)
-        x1.eliminate_zeros()
-        x2 = csr_matrix(x2)
-        x2.eliminate_zeros()
-        n, m = x1.shape[0], x2.shape[0]
-        matrix = np.zeros((n, m))
-        for i in range(n):
-            xi_ind = set(x1[i].indices)
-            for j in range(i if symmetric else m):
-                jacc = 1 - len(xi_ind.intersection(x2[j].indices))\
-                           / len(set(x1[i].indices).union(x1[j].indices))
-                matrix[i, j] = jacc
-                if symmetric:
-                    matrix[j, i] = jacc
-        return matrix
diff --git a/Orange/distance/distance.py b/Orange/distance/distance.py
index feb669c6d7c..3f27632b5ba 100644
--- a/Orange/distance/distance.py
+++ b/Orange/distance/distance.py
@@ -3,6 +3,7 @@
 
 import numpy as np
 from scipy import stats
+from scipy import sparse as sp
 import sklearn.metrics as skl_metrics
 from sklearn.utils.extmath import row_norms, safe_sparse_dot
 from sklearn.metrics import pairwise_distances
@@ -11,7 +12,7 @@
 from Orange.statistics import util
 
 from .base import (Distance, DistanceModel, FittedDistance, FittedDistanceModel,
-                   SklDistance, _orange_to_numpy, SparseJaccard)
+                   SklDistance, _orange_to_numpy)
 
 class EuclideanRowsModel(FittedDistanceModel):
     """
@@ -395,6 +396,9 @@ def compute_distances(self, x1, x2):
         compute distances between rows without missing values, and a slower
         loop for those with missing values.
         """
+        if sp.issparse(x1):
+            return self.sparse_jaccard(x1, x2)
+
         nonzeros1 = np.not_equal(x1, 0).view(np.int8)
         if self.axis == 1:
             nans1 = _distance.any_nan_row(x1)
@@ -414,11 +418,30 @@ def compute_distances(self, x1, x2):
             return _distance.jaccard_cols(
                 nonzeros1, x1, nans1, self.ps)
 
+    def sparse_jaccard(self, x1, x2=None):
+        symmetric = x2 is None
+        if symmetric:
+            x2 = x1
+        x1 = sp.csr_matrix(x1)
+        x1.eliminate_zeros()
+        x2 = sp.csr_matrix(x2)
+        x2.eliminate_zeros()
+        n, m = x1.shape[0], x2.shape[0]
+        matrix = np.zeros((n, m))
+        for i in range(n):
+            xi_ind = set(x1[i].indices)
+            for j in range(i if symmetric else m):
+                # Row i of x1 is compared with row j of x2; an empty union
+                # (two all-zero rows) yields 0 instead of a division by zero.
+                union = len(xi_ind.union(x2[j].indices))
+                common = len(xi_ind.intersection(x2[j].indices))
+                matrix[i, j] = 1 - common / union if union else 0
+        return matrix + matrix.T if symmetric else matrix  # mirror j < i
+
 
 class Jaccard(FittedDistance):
     supports_sparse = True
     supports_discrete = True
-    fallback = SparseJaccard()
     ModelType = JaccardModel
 
     def fit_rows(self, attributes, x, n_vals):
@@ -426,9 +449,12 @@ def fit_rows(self, attributes, x, n_vals):
         Return a model for computation of Jaccard values. The model stores
         frequencies of non-zero values per each column.
         """
-        ps = np.fromiter(
-            (_distance.p_nonzero(x[:, col]) for col in range(len(n_vals))),
-            dtype=np.double, count=len(n_vals))
+        if sp.issparse(x):
+            ps = None  # wrong!
+        else:
+            ps = np.fromiter(
+                (_distance.p_nonzero(x[:, col]) for col in range(len(n_vals))),
+                dtype=np.double, count=len(n_vals))
         return JaccardModel(attributes, self.axis, self.impute, ps)
 
     fit_cols = fit_rows
diff --git a/Orange/distance/tests/test_distance.py b/Orange/distance/tests/test_distance.py
index 7c55f1d2a01..e056619fbf1 100644
--- a/Orange/distance/tests/test_distance.py
+++ b/Orange/distance/tests/test_distance.py
@@ -27,13 +27,18 @@ def test_no_data(self):
 
     def test_sparse(self):
         """Test sparse support in distances."""
+        domain = Domain([ContinuousVariable(c) for c in "abc"])
+        dense_data = Table.from_list(
+            domain, [[1, 0, 2], [-1, 5, 0], [0, 1, 1], [7, 0, 0]])
+        sparse_data = Table(domain, csr_matrix(dense_data.X))
+
         if not self.Distance.supports_sparse:
-            self.assertRaises(TypeError, self.Distance, self.sparse_data)
+            self.assertRaises(TypeError, self.Distance, sparse_data)
         else:
-            # check the result is the same as for dense
-            dist_numpy = self.Distance(self.dense_X)
-            dist_sparse = self.Distance(self.sparse_data)
-            np.testing.assert_allclose(dist_sparse, dist_numpy)
+            # check the result is the same for sparse and dense
+            dist_dense = self.Distance(dense_data)
+            dist_sparse = self.Distance(sparse_data)
+            np.testing.assert_allclose(dist_sparse, dist_dense)
 
 
 class CommonFittedTests(CommonTests):
@@ -146,12 +151,6 @@ def setUp(self):
         self.mixed_data = self.data = Table.from_numpy(
             self.domain, np.hstack((self.cont_data.X[:3], self.disc_data.X)))
 
-        self.dense_X = np.array([[1, 0, 2],
-                                 [-1, 5, 0],
-                                 [0, 1, 1],
-                                 [7, 0, 0]])
-        self.sparse_data = Table(csr_matrix(self.dense_X))
-
 
 
 # Correct results in these tests were computed manually or with Excel;
@@ -846,12 +845,6 @@ def setUp(self):
              [1, 0, 1],
              [1, 0, 0]])
 
-        self.dense_X = np.array([[1, 0, 2],
-                      [-1, 5, 0],
-                      [0, 1, 1],
-                      [7, 0, 0]])
-        self.sparse_data = Table(csr_matrix(self.dense_X))
-
     def test_jaccard_rows(self):
         assert_almost_equal = np.testing.assert_almost_equal
 
diff --git a/Orange/tests/test_distances.py b/Orange/tests/test_distances.py
index 571ad274e32..24e914db49a 100644
--- a/Orange/tests/test_distances.py
+++ b/Orange/tests/test_distances.py
@@ -9,7 +9,6 @@
 import scipy.spatial
 import scipy.stats
 from scipy.sparse import csr_matrix
-from sklearn.exceptions import DataConversionWarning
 
 from Orange.data import (Table, Domain, ContinuousVariable,
                          DiscreteVariable, StringVariable, Instance)
@@ -500,27 +499,26 @@ def test_jaccard_distance_many_examples(self):
                       [0., 0., 0.5]]))
 
     def test_jaccard_distance_numpy(self):
-        with self.assertWarns(DataConversionWarning):
-            np.testing.assert_almost_equal(
-                self.dist(self.titanic[0].x, self.titanic[2].x, axis=1),
-                np.array([[0.5]]))
-            np.testing.assert_almost_equal(
-                self.dist(self.titanic.X),
-                np.array([[0., 0., 0.5, 0.5],
-                          [0., 0., 0.5, 0.5],
-                          [0.5, 0.5, 0., 0.],
-                          [0.5, 0.5, 0., 0.]]))
-            np.testing.assert_almost_equal(
-                self.dist(self.titanic[2].x, self.titanic[:3].X),
-                np.array([[0.5, 0.5, 0.]]))
-            np.testing.assert_almost_equal(
-                self.dist(self.titanic[:2].X, self.titanic[3].x),
-                np.array([[0.5],
-                          [0.5]]))
-            np.testing.assert_almost_equal(
-                self.dist(self.titanic[:2].X, self.titanic[:3].X),
-                np.array([[0., 0., 0.5],
-                          [0., 0., 0.5]]))
+        np.testing.assert_almost_equal(
+            self.dist(self.titanic[0].x, self.titanic[2].x, axis=1),
+            np.array([[0.5]]))
+        np.testing.assert_almost_equal(
+            self.dist(self.titanic.X),
+            np.array([[0., 0., 0.5, 0.5],
+                      [0., 0., 0.5, 0.5],
+                      [0.5, 0.5, 0., 0.],
+                      [0.5, 0.5, 0., 0.]]))
+        np.testing.assert_almost_equal(
+            self.dist(self.titanic[2].x, self.titanic[:3].X),
+            np.array([[0.5, 0.5, 0.]]))
+        np.testing.assert_almost_equal(
+            self.dist(self.titanic[:2].X, self.titanic[3].x),
+            np.array([[0.5],
+                      [0.5]]))
+        np.testing.assert_almost_equal(
+            self.dist(self.titanic[:2].X, self.titanic[:3].X),
+            np.array([[0., 0., 0.5],
+                      [0., 0., 0.5]]))
 
 
 # noinspection PyTypeChecker

From 5cf9802ccebda2f94d82d280ad073bce3b55e2ef Mon Sep 17 00:00:00 2001
From: janezd <janez.demsar@fri.uni-lj.si>
Date: Wed, 13 Mar 2019 17:07:12 +0100
Subject: [PATCH 7/9] Cosine distance: Fix clipping

---
 Orange/distance/distance.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Orange/distance/distance.py b/Orange/distance/distance.py
index 3f27632b5ba..a1005fa75d8 100644
--- a/Orange/distance/distance.py
+++ b/Orange/distance/distance.py
@@ -369,7 +369,7 @@ def prepare_data(x):
             data1 = prepare_data(x1)
             data2 = data1 if x2 is None else prepare_data(x2)
             dist = safe_sparse_dot(data1, data2.T)
-            np.clip(dist, 0, 1, out=dist)
+            np.clip(dist, -1, 1, out=dist)
             if x2 is None:
                 diag = np.diag_indices_from(dist)
                 dist[diag] = np.where(np.isnan(dist[diag]), np.nan, 1.0)

From 9d84d8516f25c0d04808d7d2745594c88223d28a Mon Sep 17 00:00:00 2001
From: Ajda Pretnar <ajdapretnar@gmail.com>
Date: Thu, 14 Mar 2019 15:39:52 +0100
Subject: [PATCH 8/9] Section code and extend fitter

---
 Orange/distance/distance.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/Orange/distance/distance.py b/Orange/distance/distance.py
index a1005fa75d8..609f6b6f9b1 100644
--- a/Orange/distance/distance.py
+++ b/Orange/distance/distance.py
@@ -386,6 +386,12 @@ def __init__(self, attributes, axis, impute, ps):
         self.ps = ps
 
     def compute_distances(self, x1, x2):
+        """Compute distances with the sparse or the dense implementation."""
+        if sp.issparse(x1):
+            return self._compute_sparse(x1, x2)
+        return self._compute_dense(x1, x2)
+
+    def _compute_dense(self, x1, x2):
         """
         The method uses a function implemented in Cython. Data (`x1` and `x2`)
         is accompanied by two tables. One is a 2-d table in which elements of
@@ -396,9 +402,6 @@ def compute_distances(self, x1, x2):
         compute distances between rows without missing values, and a slower
         loop for those with missing values.
         """
-        if sp.issparse(x1):
-            return self.sparse_jaccard(x1, x2)
-
         nonzeros1 = np.not_equal(x1, 0).view(np.int8)
         if self.axis == 1:
             nans1 = _distance.any_nan_row(x1)
@@ -418,7 +421,7 @@ def compute_distances(self, x1, x2):
             return _distance.jaccard_cols(
                 nonzeros1, x1, nans1, self.ps)
 
-    def sparse_jaccard(self, x1, x2=None):
+    def _compute_sparse(self, x1, x2=None):
         symmetric = x2 is None
         if symmetric:
             x2 = x1
@@ -450,7 +453,7 @@ def fit_rows(self, attributes, x, n_vals):
         frequencies of non-zero values per each column.
         """
         if sp.issparse(x):
-            ps = None  # wrong!
+            ps = x.getnnz(axis=0)
         else:
             ps = np.fromiter(
                 (_distance.p_nonzero(x[:, col]) for col in range(len(n_vals))),

From a30e68885317e020a8f0ed3fd8050f328cb0b019 Mon Sep 17 00:00:00 2001
From: janezd <janez.demsar@fri.uni-lj.si>
Date: Fri, 15 Mar 2019 14:51:35 +0100
Subject: [PATCH 9/9] OWDistance: Minor reformatting

---
 Orange/widgets/unsupervised/owdistances.py | 37 ++++++++++------------
 1 file changed, 16 insertions(+), 21 deletions(-)

diff --git a/Orange/widgets/unsupervised/owdistances.py b/Orange/widgets/unsupervised/owdistances.py
index 9f9ebe03908..18aac47daff 100644
--- a/Orange/widgets/unsupervised/owdistances.py
+++ b/Orange/widgets/unsupervised/owdistances.py
@@ -118,12 +118,11 @@ def _check_sparse():
 
         def _fix_discrete():
             nonlocal data
-            if data.domain.has_discrete_attributes() and (
-                    issparse(data.X) and getattr(metric, "fallback",
-                                                 None) and metric is not
-                                distance.Jaccard
-                    or not metric.supports_discrete
-                    or self.axis == 1 and metric is not distance.Jaccard):
+            if data.domain.has_discrete_attributes() \
+                    and metric is not distance.Jaccard \
+                    and (issparse(data.X) and getattr(metric, "fallback", None)
+                         or not metric.supports_discrete
+                         or self.axis == 1):
                 if not data.domain.has_continuous_attributes():
                     self.Error.no_continuous_features()
                     return False
@@ -133,19 +132,15 @@ def _fix_discrete():
 
         def _fix_nonbinary():
             nonlocal data
-            if metric is distance.Jaccard:
-                if issparse(data.X):
-                    # do not remove non-binary
-                    return True
-                else:
-                    nbinary = sum(a.is_discrete and len(a.values) == 2
-                                  for a in data.domain.attributes)
-                    if not nbinary:
-                        self.Error.no_binary_features()
-                        return False
-                    elif nbinary < len(data.domain.attributes):
-                        self.Warning.ignoring_nonbinary()
-                        data = distance.remove_nonbinary_features(data)
+            if metric is distance.Jaccard and not issparse(data.X):
+                nbinary = sum(a.is_discrete and len(a.values) == 2
+                              for a in data.domain.attributes)
+                if not nbinary:
+                    self.Error.no_binary_features()
+                    return False
+                elif nbinary < len(data.domain.attributes):
+                    self.Warning.ignoring_nonbinary()
+                    data = distance.remove_nonbinary_features(data)
             return True
 
         def _fix_missing():
@@ -157,11 +152,11 @@ def _fix_missing():
 
         self.clear_messages()
         if data is None:
-            return
+            return None
         for check in (_check_sparse,
                       _fix_discrete, _fix_missing, _fix_nonbinary):
             if not check():
-                return
+                return None
         try:
             if metric.supports_normalization and self.normalized_dist:
                 return metric(data, axis=1 - self.axis, impute=True,