Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] Support sparse Jaccard #3657

Merged
merged 9 commits into from
Mar 15, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 14 additions & 7 deletions Orange/distance/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# TODO this *private* function is called from several widgets to prepare
# data for calling the below classes. After we (mostly) stopped relying
# on sklearn.metrics, this is (mostly) unnecessary

def _preprocess(table, impute=True):
"""Remove categorical attributes and impute missing values."""
if not len(table):
Expand Down Expand Up @@ -273,8 +274,9 @@ def __init__(self, attributes, axis=1, impute=False):
self.attributes = attributes

def __call__(self, e1, e2=None):
if e1.domain.attributes != self.attributes or \
e2 is not None and e2.domain.attributes != self.attributes:
if self.attributes is not None and (
e1.domain.attributes != self.attributes
or e2 is not None and e2.domain.attributes != self.attributes):
raise ValueError("mismatching domains")
return super().__call__(e1, e2)

Expand Down Expand Up @@ -348,12 +350,17 @@ def fit(self, data):
Prepare the data on attributes, call `fit_cols` or `fit_rows` and
return the resulting model.
"""
attributes = data.domain.attributes
x = _orange_to_numpy(data)
n_vals = np.fromiter(
(len(attr.values) if attr.is_discrete else 0
for attr in attributes),
dtype=np.int32, count=len(attributes))
if hasattr(data, "domain"):
attributes = data.domain.attributes
n_vals = np.fromiter(
(len(attr.values) if attr.is_discrete else 0
for attr in attributes),
dtype=np.int32, count=len(attributes))
else:
assert isinstance(x, np.ndarray)
attributes = None
n_vals = np.zeros(x.shape[1], dtype=np.int32)
return [self.fit_cols, self.fit_rows][self.axis](attributes, x, n_vals)

def fit_cols(self, attributes, x, n_vals):
Expand Down
41 changes: 35 additions & 6 deletions Orange/distance/distance.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import numpy as np
from scipy import stats
from scipy import sparse as sp
import sklearn.metrics as skl_metrics
from sklearn.utils.extmath import row_norms, safe_sparse_dot
from sklearn.metrics import pairwise_distances
Expand Down Expand Up @@ -368,7 +369,7 @@ def prepare_data(x):
data1 = prepare_data(x1)
data2 = data1 if x2 is None else prepare_data(x2)
dist = safe_sparse_dot(data1, data2.T)
np.clip(dist, 0, 1, out=dist)
np.clip(dist, -1, 1, out=dist)
if x2 is None:
diag = np.diag_indices_from(dist)
dist[diag] = np.where(np.isnan(dist[diag]), np.nan, 1.0)
Expand All @@ -385,6 +386,12 @@ def __init__(self, attributes, axis, impute, ps):
self.ps = ps

def compute_distances(self, x1, x2):
if sp.issparse(x1):
return self._compute_sparse(x1, x2)
else:
return self._compute_dense(x1, x2)

def _compute_dense(self, x1, x2):
"""
The method uses a function implemented in Cython. Data (`x1` and `x2`)
is accompanied by two tables. One is a 2-d table in which elements of
Expand Down Expand Up @@ -414,21 +421,43 @@ def compute_distances(self, x1, x2):
return _distance.jaccard_cols(
nonzeros1, x1, nans1, self.ps)

def _compute_sparse(self, x1, x2=None):
symmetric = x2 is None
if symmetric:
x2 = x1
x1 = sp.csr_matrix(x1)
x1.eliminate_zeros()
x2 = sp.csr_matrix(x2)
x2.eliminate_zeros()
n, m = x1.shape[0], x2.shape[0]
matrix = np.zeros((n, m))
for i in range(n):
xi_ind = set(x1[i].indices)
for j in range(i if symmetric else m):
jacc = 1 - len(xi_ind.intersection(x2[j].indices))\
/ len(set(x1[i].indices).union(x1[j].indices))
matrix[i, j] = jacc
if symmetric:
matrix[j, i] = jacc
return matrix


class Jaccard(FittedDistance):
supports_sparse = False
supports_sparse = True
supports_discrete = True
fallback = SklDistance('jaccard')
ModelType = JaccardModel

def fit_rows(self, attributes, x, n_vals):
"""
Return a model for computation of Jaccard values. The model stores
frequencies of non-zero values per each column.
"""
ps = np.fromiter(
(_distance.p_nonzero(x[:, col]) for col in range(len(n_vals))),
dtype=np.double, count=len(n_vals))
if sp.issparse(x):
ps = x.getnnz(axis=0)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If I understand the Cython code correctly, this should be ok. @janezd

else:
ps = np.fromiter(
(_distance.p_nonzero(x[:, col]) for col in range(len(n_vals))),
dtype=np.double, count=len(n_vals))
return JaccardModel(attributes, self.axis, self.impute, ps)

fit_cols = fit_rows
Expand Down
13 changes: 10 additions & 3 deletions Orange/distance/tests/test_distance.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,18 @@ def test_no_data(self):

def test_sparse(self):
"""Test sparse support in distances."""
sparse_iris = csr_matrix(Table('iris').X)
domain = Domain([ContinuousVariable(c) for c in "abc"])
dense_data = Table.from_list(
domain, [[1, 0, 2], [-1, 5, 0], [0, 1, 1], [7, 0, 0]])
sparse_data = Table(domain, csr_matrix(dense_data.X))

if not self.Distance.supports_sparse:
self.assertRaises(TypeError, self.Distance, sparse_iris)
self.assertRaises(TypeError, self.Distance, sparse_data)
else:
self.Distance(sparse_iris)
# check the result is the same for sparse and dense
dist_dense = self.Distance(dense_data)
dist_sparse = self.Distance(sparse_data)
np.testing.assert_allclose(dist_sparse, dist_dense)


class CommonFittedTests(CommonTests):
Expand Down
42 changes: 20 additions & 22 deletions Orange/tests/test_distances.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
import scipy.spatial
import scipy.stats
from scipy.sparse import csr_matrix
from sklearn.exceptions import DataConversionWarning

from Orange.data import (Table, Domain, ContinuousVariable,
DiscreteVariable, StringVariable, Instance)
Expand Down Expand Up @@ -500,27 +499,26 @@ def test_jaccard_distance_many_examples(self):
[0., 0., 0.5]]))

def test_jaccard_distance_numpy(self):
with self.assertWarns(DataConversionWarning):
np.testing.assert_almost_equal(
self.dist(self.titanic[0].x, self.titanic[2].x, axis=1),
np.array([[0.5]]))
np.testing.assert_almost_equal(
self.dist(self.titanic.X),
np.array([[0., 0., 0.5, 0.5],
[0., 0., 0.5, 0.5],
[0.5, 0.5, 0., 0.],
[0.5, 0.5, 0., 0.]]))
np.testing.assert_almost_equal(
self.dist(self.titanic[2].x, self.titanic[:3].X),
np.array([[0.5, 0.5, 0.]]))
np.testing.assert_almost_equal(
self.dist(self.titanic[:2].X, self.titanic[3].x),
np.array([[0.5],
[0.5]]))
np.testing.assert_almost_equal(
self.dist(self.titanic[:2].X, self.titanic[:3].X),
np.array([[0., 0., 0.5],
[0., 0., 0.5]]))
np.testing.assert_almost_equal(
self.dist(self.titanic[0].x, self.titanic[2].x, axis=1),
np.array([[0.5]]))
np.testing.assert_almost_equal(
self.dist(self.titanic.X),
np.array([[0., 0., 0.5, 0.5],
[0., 0., 0.5, 0.5],
[0.5, 0.5, 0., 0.],
[0.5, 0.5, 0., 0.]]))
np.testing.assert_almost_equal(
self.dist(self.titanic[2].x, self.titanic[:3].X),
np.array([[0.5, 0.5, 0.]]))
np.testing.assert_almost_equal(
self.dist(self.titanic[:2].X, self.titanic[3].x),
np.array([[0.5],
[0.5]]))
np.testing.assert_almost_equal(
self.dist(self.titanic[:2].X, self.titanic[:3].X),
np.array([[0., 0., 0.5],
[0., 0., 0.5]]))


# noinspection PyTypeChecker
Expand Down
15 changes: 8 additions & 7 deletions Orange/widgets/unsupervised/owdistances.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,10 +118,11 @@ def _check_sparse():

def _fix_discrete():
nonlocal data
if data.domain.has_discrete_attributes() and (
issparse(data.X) and getattr(metric, "fallback", None)
or not metric.supports_discrete
or self.axis == 1 and metric is not distance.Jaccard):
if data.domain.has_discrete_attributes() \
and metric is not distance.Jaccard \
and (issparse(data.X) and getattr(metric, "fallback", None)
or not metric.supports_discrete
or self.axis == 1):
if not data.domain.has_continuous_attributes():
self.Error.no_continuous_features()
return False
Expand All @@ -131,7 +132,7 @@ def _fix_discrete():

def _fix_nonbinary():
nonlocal data
if metric is distance.Jaccard:
if metric is distance.Jaccard and not issparse(data.X):
nbinary = sum(a.is_discrete and len(a.values) == 2
for a in data.domain.attributes)
if not nbinary:
Expand All @@ -151,11 +152,11 @@ def _fix_missing():

self.clear_messages()
if data is None:
return
return None
for check in (_check_sparse,
_fix_discrete, _fix_missing, _fix_nonbinary):
if not check():
return
return None
try:
if metric.supports_normalization and self.normalized_dist:
return metric(data, axis=1 - self.axis, impute=True,
Expand Down