Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] Bhattacharyya distance #4111

Merged
merged 5 commits into from
Nov 1, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Orange/distance/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from .distance import (Distance, DistanceModel,
Euclidean, Manhattan, Cosine, Jaccard,
SpearmanR, SpearmanRAbsolute, PearsonR, PearsonRAbsolute,
Mahalanobis, MahalanobisDistance, Hamming)
Mahalanobis, MahalanobisDistance, Hamming, Bhattacharyya)

from .base import (
_preprocess, remove_discrete_features, remove_nonbinary_features, impute)
44 changes: 44 additions & 0 deletions Orange/distance/distance.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from scipy import stats
from scipy import sparse as sp
import sklearn.metrics as skl_metrics
from sklearn.utils import check_array
from sklearn.utils.extmath import row_norms, safe_sparse_dot
from sklearn.metrics import pairwise_distances

Expand Down Expand Up @@ -645,6 +646,49 @@ def fit(self, _):
return PearsonModel(True, self.axis, self.impute)


def _prob_dist(a):
# Makes the vector sum to one, as to mimic probability distribution.
return a / np.sum(a)


def check_non_negative(a):
# Raise an exception for infinities, nans and negative values
check_array(a,
accept_sparse=True, accept_large_sparse=True, ensure_2d=False)
if a.min() < 0:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@AndrejaKovacic, would this be OK, too?

Dense arrays also have a method min, so there's no need for if. Also, I think it is better to not change the exception message raised by check_array so the caller is informed that, for instance, there are nan values in the data.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it's more informative this way.

raise ValueError("Bhattcharyya distance requires non-negative values")


def _bhattacharyya(a, b):
    """Return the Bhattacharyya distance between two non-negative vectors.

    Both inputs are validated with `check_non_negative` and rescaled with
    `_prob_dist` so that each sums to one. The distance is the negative
    logarithm of the Bhattacharyya coefficient, i.e. of the sum of the
    square roots of the element-wise products.

    Not a real metric: it does not obey the triangle inequality.

    Raises whatever `check_non_negative` raises for invalid input.
    """
    check_non_negative(a)
    check_non_negative(b)
    a = _prob_dist(a)
    b = _prob_dist(b)
    # Sparse matrices need .multiply for an element-wise product; `*` on
    # them would not be element-wise.
    product = a.multiply(b) if sp.issparse(a) else a * b
    coefficient = np.sum(np.sqrt(product))
    # For two probability distributions the coefficient cannot exceed 1,
    # but floating-point rounding may push it slightly above, which would
    # produce a tiny negative "distance". Clamp it. The argument order
    # matters: min(nan, 1.0) keeps the nan instead of hiding it.
    return -np.log(min(coefficient, 1.0))


class Bhattacharyya(Distance):
    """Bhattacharyya distance; fitting yields a `BhattacharyyaModel`."""

    supports_sparse = True
    supports_discrete = False

    def fit(self, data):
        """Return a model configured with this instance's `axis` and
        `impute`; `data` itself is not used."""
        model = BhattacharyyaModel(self.axis, self.impute)
        return model


class BhattacharyyaModel(DistanceModel):
    """Distance model computing pairwise Bhattacharyya distances."""

    def compute_distances(self, x1, x2):
        """Return pairwise distances between `x1` and `x2`.

        When `x2` is None, `x1` is compared with itself. For `axis == 1`
        the inputs are passed as given; otherwise both are transposed
        before being handed to `pairwise_distances`.
        """
        other = x1 if x2 is None else x2
        if self.axis != 1:
            x1, other = x1.T, other.T
        return pairwise_distances(x1, other, _bhattacharyya)


class Mahalanobis(Distance):
supports_sparse = False
supports_missing = False
Expand Down
76 changes: 58 additions & 18 deletions Orange/tests/test_distances.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# pylint: disable=missing-docstring

from unittest import TestCase
import unittest
import pickle

import numpy as np
Expand All @@ -14,7 +15,8 @@
DiscreteVariable, StringVariable, Instance)
from Orange.distance import (Euclidean, SpearmanR, SpearmanRAbsolute,
PearsonR, PearsonRAbsolute, Manhattan, Cosine,
Jaccard, _preprocess, MahalanobisDistance)
Jaccard, _preprocess, MahalanobisDistance,
Bhattacharyya)
from Orange.distance.distance import _spearmanr2, _corrcoef2
from Orange.misc import DistMatrix
from Orange.tests import named_file, test_filename
Expand Down Expand Up @@ -91,20 +93,20 @@ def test_from_file(self):
self.assertEqual(m.axis, 1)

with named_file(
"""3 axis=1 symmetric
0.12 3.45 6.78
9.01 2.34 5.67
8.90""") as name:
"""3 axis=1 symmetric
0.12 3.45 6.78
9.01 2.34 5.67
8.90""") as name:
m = DistMatrix.from_file(name)
np.testing.assert_almost_equal(m, np.array([[0.12, 9.01, 8.90],
[9.01, 2.34, 0],
[8.90, 0, 0]]))

with named_file(
"""3 row_labels
starič 0.12 3.45 6.78
aleš 9.01 2.34 5.67
anže 8.90""", encoding="utf-8""") as name:
"""3 row_labels
starič 0.12 3.45 6.78
aleš 9.01 2.34 5.67
anže 8.90""", encoding="utf-8""") as name:
m = DistMatrix.from_file(name)
np.testing.assert_almost_equal(m, np.array([[0.12, 9.01, 8.90],
[9.01, 2.34, 0],
Expand Down Expand Up @@ -150,10 +152,10 @@ def assertErrorMsg(content, msg):

def test_save(self):
with named_file(
"""3 axis=1 row_labels
danny 0.12 3.45 6.78
eve 9.01 2.34 5.67
frank 8.90""") as name:
"""3 axis=1 row_labels
danny 0.12 3.45 6.78
eve 9.01 2.34 5.67
frank 8.90""") as name:
m = DistMatrix.from_file(name)
m.save(name)
m = DistMatrix.from_file(name)
Expand All @@ -167,11 +169,11 @@ def test_save(self):
self.assertEqual(m.axis, 1)

with named_file(
"""3 axis=0 asymmetric col_labels row_labels
ann bert chad
danny 0.12 3.45 6.78
eve 9.01 2.34 5.67
frank 8.90 1.23 4.56""") as name:
"""3 axis=0 asymmetric col_labels row_labels
ann bert chad
danny 0.12 3.45 6.78
eve 9.01 2.34 5.67
frank 8.90 1.23 4.56""") as name:
m = DistMatrix.from_file(name)
m.save(name)
m = DistMatrix.from_file(name)
Expand Down Expand Up @@ -943,6 +945,41 @@ def test_dimensions(self):
mah(xt[0], xt[1])


class TestBhattacharyya(TestCase):
    """Bhattacharyya distance on dense, sparse and invalid input."""

    @classmethod
    def setUpClass(cls):
        cls.dist = Bhattacharyya

    def test_dense_array(self):
        # Also checks normalization
        rows = Table('iris').X[:3]
        expected = np.array(
            [[0, 4.48049499e-04, 2.07117086e-05],
             [4.48049499e-04, 0, 3.65052724e-04],
             [2.07117086e-05, 3.65052724e-04, 0]])
        np.testing.assert_array_almost_equal(self.dist(rows), expected)

    def test_sparse_array(self):
        matrix = csr_matrix([[0.5, 0.5], [0, 0.5]])
        computed = self.dist(matrix[0], matrix[1])
        self.assertAlmostEqual(computed, 0.3465735902799726, delta=1e-5)

    def test_columns(self):
        columns = np.array([[0.5, 0.2], [0.5, 0.8]])
        off_diagonal = 0.05268025782891318
        expected = np.array([[0, off_diagonal],
                             [off_diagonal, 0]])
        np.testing.assert_array_almost_equal(
            self.dist(columns, axis=0), expected)

    def test_negative_input(self):
        a = np.array([0, np.nan])
        b = np.array([1, 1])
        # nan in dense input
        with self.assertRaises(ValueError):
            self.dist(a, b)
        # negative value in dense input
        a[1] = -1
        with self.assertRaises(ValueError):
            self.dist(a, b)
        # negative value in sparse input
        sparse_a = csr_matrix(a)
        sparse_b = csr_matrix(b)
        with self.assertRaises(ValueError):
            self.dist(sparse_a, sparse_b)


class TestDistances(TestCase):
@classmethod
def setUpClass(cls):
Expand Down Expand Up @@ -982,3 +1019,6 @@ def test_distance_to_instance(self):
iris = Table('iris')
inst = Instance(iris.domain, np.concatenate((iris[1].x, iris[1].y)))
self.assertEqual(Euclidean(iris[1], inst), 0)

if __name__ == '__main__':
unittest.main()
3 changes: 2 additions & 1 deletion Orange/widgets/unsupervised/owdistances.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@
("Absolute Spearman", distance.SpearmanRAbsolute),
("Pearson", distance.PearsonR),
("Absolute Pearson", distance.PearsonRAbsolute),
("Hamming", distance.Hamming)
("Hamming", distance.Hamming),
('Bhattacharyya', distance.Bhattacharyya)
]


Expand Down
9 changes: 9 additions & 0 deletions Orange/widgets/unsupervised/tests/test_owdistances.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,3 +103,12 @@ def test_too_big_array(self):
def test_migrates_normalized_dist(self):
    # A widget created from stored settings holding only `metric_idx`
    # must end up with a falsy `normalized_dist`.
    settings = {"metric_idx": 0}
    widget = self.create_widget(OWDistances, stored_settings=settings)
    self.assertFalse(widget.normalized_dist)

def test_negative_values_bhattacharyya(self):
    # Sending data with a negative value while the Bhattacharyya metric
    # is selected must make the widget show `distances_value_error`.

    # Look the metric up explicitly; `index` raises ValueError if
    # Bhattacharyya is missing from METRICS, instead of silently
    # leaving some other metric selected.
    metrics = [metric for _, metric in METRICS]
    self.widget.metric_idx = metrics.index(distance.Bhattacharyya)

    # The fixture is modified in place (presumably shared between tests),
    # so undo the sign flip even if the assertion below fails.
    self.iris.X[0, 0] *= -1
    try:
        self.send_signal(self.widget.Inputs.data, self.iris)
        self.assertTrue(self.widget.Error.distances_value_error.is_shown())
    finally:
        self.iris.X[0, 0] *= -1
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,9 @@ Distances work well with Orange add-ons, too. The distance matrix can be fed to
- [Pearson](https://en.wikipedia.org/wiki/Pearson_product-moment_correlation_coefficient) (linear correlation between the values, remapped as a distance in a [0, 1] interval)
- [Pearson absolute](https://en.wikipedia.org/wiki/Pearson_product-moment_correlation_coefficient) (linear correlation between the absolute values, remapped as a distance in a [0, 1] interval)
- [Hamming](https://en.wikipedia.org/wiki/Hamming_distance) (the number of features at which the corresponding values are different)
- [Bhattacharyya](https://en.wikipedia.org/wiki/Bhattacharyya_distance) (similarity between two probability distributions; not a true distance, as it does not obey the triangle inequality)

Normalize the features. Normalization is always done column-wise.
Normalize the features. Normalization is always done column-wise. Values are zero-centered and scaled.
In case of missing values, the widget automatically imputes the average value of the row or the column.
The widget works for both numeric and categorical data. In case of categorical data, the distance is 0 if the two values are the same ('green' and 'green') and 1 if they are not ('green' and 'blue').
3. Tick *Apply Automatically* to automatically commit changes to other widgets. Alternatively, press '*Apply*'.
Expand Down