from scipy import sparse as sp
from sklearn.metrics import pairwise_distances
from sklearn.utils import check_array


def _prob_dist(a):
    """Scale `a` so that its entries sum to one, mimicking a probability
    distribution.

    Parameters
    ----------
    a : dense array or scipy sparse matrix of non-negative values

    Returns
    -------
    `a` divided by the sum of its entries.

    Raises
    ------
    ValueError
        If the entries sum to zero; the division would otherwise fill the
        result (and every distance computed from it) with NaN.
    """
    total = np.sum(a)
    if total == 0:
        raise ValueError(
            "Bhattacharyya distance requires vectors with a non-zero sum")
    return a / total


def check_non_negative(a):
    """Raise `ValueError` for infinities, NaNs and negative values in `a`.

    Parameters
    ----------
    a : dense array or scipy sparse matrix
    """
    # check_array is called only for its validation; its return value is
    # deliberately ignored.
    check_array(a,
                accept_sparse=True, accept_large_sparse=True, ensure_2d=False)
    if a.min() < 0:
        raise ValueError("Bhattacharyya distance requires non-negative values")


def _bhattacharyya(a, b):
    """Return the Bhattacharyya distance between vectors `a` and `b`.

    Both vectors are validated, normalized to sum to one, and the distance
    is computed as ``-log(sum(sqrt(a * b)))``.

    Note: this is not a real metric; it does not obey the triangle
    inequality.

    Raises
    ------
    ValueError
        If either vector contains NaN, infinite or negative values, or sums
        to zero.
    """
    check_non_negative(a)
    check_non_negative(b)
    p = _prob_dist(a)
    q = _prob_dist(b)
    # Sparse matrices need an explicit element-wise product; `*` is reserved
    # for the dense case.
    overlap = p.multiply(q) if sp.issparse(p) else p * q
    coefficient = np.sum(np.sqrt(overlap))
    # Rounding can push the coefficient marginally above 1 (e.g. when
    # comparing a vector with itself), which would give a tiny negative
    # "distance"; clamp it at zero.
    return max(0.0, -np.log(coefficient))


class Bhattacharyya(Distance):
    """Bhattacharyya distance between rows or columns of non-negative data."""

    # Capability flags; they are interpreted by the `Distance` base class,
    # which is defined outside this excerpt.
    supports_discrete = False
    supports_sparse = True

    def fit(self, data):
        """Return a `BhattacharyyaModel` for this instance's axis and
        imputation setting.

        Nothing is estimated from `data`; the argument is not used.
        """
        return BhattacharyyaModel(self.axis, self.impute)


class BhattacharyyaModel(DistanceModel):
    """Model computing pairwise Bhattacharyya distances."""

    def compute_distances(self, x1, x2):
        """Return the matrix of distances between vectors of `x1` and `x2`.

        Rows are compared when ``self.axis == 1``; otherwise the inputs are
        transposed so that columns are compared. If `x2` is None, `x1` is
        compared with itself.
        """
        if self.axis != 1:
            x1 = x1.T
            if x2 is not None:
                x2 = x2.T
        if x2 is None:
            # Alias only after the optional transposition, so that both
            # arguments are the very same object, as in the axis == 1 case.
            x2 = x1
        return pairwise_distances(x1, x2, metric=_bhattacharyya)
class TestBhattacharyya(TestCase):
    """Tests of the Bhattacharyya distance on dense, sparse and invalid input."""

    @classmethod
    def setUpClass(cls):
        # The distance class itself is stored; tests call it via `self.dist`.
        cls.dist = Bhattacharyya

    def test_dense_array(self):
        """Distances between the first three iris rows match reference values."""
        # Also checks normalization
        iris = Table('iris')
        expected = np.array([[0, 4.48049499e-04, 2.07117086e-05],
                             [4.48049499e-04, 0, 3.65052724e-04],
                             [2.07117086e-05, 3.65052724e-04, 0]])
        computed = self.dist(iris.X[:3])
        np.testing.assert_array_almost_equal(computed, expected)

    def test_sparse_array(self):
        """Two rows of a CSR matrix give the expected distance."""
        rows = csr_matrix([[0.5, 0.5], [0, 0.5]])
        first, second = rows[0], rows[1]
        self.assertAlmostEqual(
            self.dist(first, second), 0.3465735902799726, delta=1e-5)

    def test_columns(self):
        """With axis=0 the distance is computed between columns."""
        table = np.array([[0.5, 0.2], [0.5, 0.8]])
        expected = np.array([[0, 0.05268025782891318],
                             [0.05268025782891318, 0]])
        computed = self.dist(table, axis=0)
        np.testing.assert_array_almost_equal(computed, expected)

    def test_negative_input(self):
        """NaN and negative values raise ValueError, dense or sparse."""
        ones = np.array([1, 1])

        with_nan = np.array([0, np.nan])
        with self.assertRaises(ValueError):
            self.dist(with_nan, ones)

        negative = np.array([0, -1.0])
        with self.assertRaises(ValueError):
            self.dist(negative, ones)

        with self.assertRaises(ValueError):
            self.dist(csr_matrix(negative), csr_matrix(ones))


if __name__ == '__main__':
    unittest.main()
def test_negative_values_bhattacharyya(self):
    """The widget shows `distances_value_error` when the Bhattacharyya
    metric is selected and the data contains a negative value.
    """
    # METRICS holds (name, metric) pairs. `list.index` raises ValueError if
    # Bhattacharyya is ever missing, instead of silently leaving the last
    # metric selected.
    metric_classes = [metric for _, metric in METRICS]
    self.widget.metric_idx = metric_classes.index(distance.Bhattacharyya)

    # Flip the sign of one value to make the data invalid for this metric.
    self.iris.X[0, 0] *= -1
    try:
        self.send_signal(self.widget.Inputs.data, self.iris)
        self.assertTrue(self.widget.Error.distances_value_error.is_shown())
    finally:
        # Restore the fixture even when the check above fails, so a failure
        # here cannot leak modified data into whatever else uses self.iris.
        self.iris.X[0, 0] *= -1