Skip to content

Commit

Permalink
Merge pull request #6205 from markotoplak/stats-nan-mean-variance
Browse files Browse the repository at this point in the history
Weighted variance computation for sparse and dense arrays
  • Loading branch information
markotoplak authored Nov 18, 2022
2 parents af8124a + f0ee0d6 commit 7d18600
Show file tree
Hide file tree
Showing 3 changed files with 130 additions and 1 deletion.
4 changes: 4 additions & 0 deletions Orange/statistics/distribution.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,9 +318,13 @@ def sample(self, size=None, replace=True):
return np.random.choice(self[0, :], size, replace, normalized[1, :])

def mean(self):
if len(self[0]) == 0:
return np.nan
return np.average(np.asarray(self[0]), weights=np.asarray(self[1]))

def variance(self):
if len(self[0]) == 0:
return np.nan
mean = self.mean()
return np.dot((self[0] - mean) ** 2, self[1]) / np.sum(self[1])

Expand Down
39 changes: 39 additions & 0 deletions Orange/statistics/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -476,6 +476,45 @@ def nanmean(x, axis=None, weights=None):
return means


def nan_mean_var(x, axis=None, weights=None):
"""
Computes means and variance of dense and sparse matrices.
Supports weights. Based on mean_variance_axis.
"""
if axis is None:
raise NotImplementedError("axis=None is not supported")

if not sp.issparse(x):
if weights is None:
means = bn.nanmean(x, axis=axis)
variances = bn.nanvar(x, axis=axis)
else:
if axis == 0:
weights = weights.reshape(-1, 1)
elif axis == 1:
weights = weights.reshape(1, -1)
else:
raise NotImplementedError

nanw = ~np.isnan(x) * weights # do not divide by non-used weights
wsum = np.sum(nanw, axis=axis)
means = bn.nansum(x * weights, axis=axis) / wsum

if axis == 0:
mr = means.reshape(1, -1)
elif axis == 1:
mr = means.reshape(-1, 1)

variances = bn.nansum(((x - mr) ** 2) * weights, axis=axis) / wsum
else:
# mean_variance_axis is picky regarding the input type
if weights is not None:
weights = weights.astype(float)
means, variances = mean_variance_axis(x, axis=axis, weights=weights)

return means, variances


def nanvar(x, axis=None, ddof=0):
""" Equivalent of np.nanvar that supports sparse or dense matrices. """
def nanvar_sparse(x):
Expand Down
88 changes: 87 additions & 1 deletion Orange/tests/test_statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,13 @@
from scipy.sparse import csr_matrix, issparse, lil_matrix, csc_matrix, \
SparseEfficiencyWarning

from Orange.data import Table, Domain, ContinuousVariable
from Orange.data.util import assure_array_dense
from Orange.statistics.distribution import get_distributions_for_columns
from Orange.statistics.util import bincount, countnans, contingency, digitize, \
mean, nanmax, nanmean, nanmedian, nanmin, nansum, nanunique, stats, std, \
unique, var, nanstd, nanvar, nanmode, nan_to_num, FDR, isnan, any_nan, \
all_nan
all_nan, nan_mean_var
from sklearn.utils import check_random_state


Expand Down Expand Up @@ -418,6 +420,90 @@ def test_weights_axis_1(self, array):
)


class TestNanMeanVar(unittest.TestCase):
def setUp(self):
self.x = np.array([[0, 1, 5],
[3, 4, np.nan],
[2, np.nan, np.nan],
[np.nan, np.nan, np.nan]])
self.means0 = [5/3, 5/2, 5/1]
self.means1 = [6/3, 7/2, 2/1, np.nan]
self.vars0 = [14/9, 9/4, 0]
self.vars1 = [14/3, 1/4, 0, np.nan]
self.w0 = np.array([4, 3, 2, 1])
self.w1 = np.array([1, 2, 3])
self.means0w = [13/9, 16/7, 20/4]
self.means1w = [17/6, 11/3, 2/1, np.nan]
self.vars0w = [1.8024691358024691, 2.2040816326530615, 0]
self.vars1w = [4.805555555555556, 0.22222222222222224, 0, np.nan]

@dense_sparse
def test_axis_none(self, array):
with self.assertRaises(NotImplementedError):
nan_mean_var(array(self.x))

@staticmethod
def compare_correct(x, weights, means_correct, vars_correct, axis=None):
means, variances = nan_mean_var(x, axis=axis, weights=weights)
np.testing.assert_almost_equal(means_correct, means)
np.testing.assert_almost_equal(vars_correct, variances)

@staticmethod
def compare_unweighted(x, axis=None):
means, variances = nan_mean_var(x, axis=axis)
np.testing.assert_almost_equal(nanmean(x, axis=axis), means)
np.testing.assert_almost_equal(nanvar(x, axis=axis), variances)

@staticmethod
def compare_distributions(x, dense, weights=None, axis=None):
def from_distributions(x, weights):
if axis == 1:
x = x.T
domain = Domain(ContinuousVariable(str(i)) for i in range(x.shape[1]))
table = Table.from_numpy(domain, x, W=weights)
dists = get_distributions_for_columns(table, list(range(x.shape[1])))
means = [d.mean() for d in dists]
variances = [d.variance() for d in dists]
return means, variances

means, variances = nan_mean_var(x, axis=axis, weights=weights)
md, vd = from_distributions(dense, weights)
np.testing.assert_almost_equal(md, means)
np.testing.assert_almost_equal(vd, variances)

@dense_sparse
def test_axis_0(self, array):
self.compare_correct(array(self.x), None, self.means0, self.vars0, axis=0)
self.compare_unweighted(array(self.x), axis=0)
self.compare_distributions(array(self.x), self.x, axis=0)

self.compare_correct(array(self.x.T), None, self.means1, self.vars1, axis=0)
self.compare_unweighted(array(self.x.T), axis=0)
self.compare_distributions(array(self.x.T), self.x.T, axis=0)

self.compare_correct(array(self.x), self.w0, self.means0w, self.vars0w, axis=0)
self.compare_distributions(array(self.x), self.x, weights=self.w0, axis=0)

self.compare_correct(array(self.x.T), self.w1, self.means1w, self.vars1w, axis=0)
self.compare_distributions(array(self.x.T), self.x.T, weights=self.w1, axis=0)

@dense_sparse
def test_axis_1(self, array):
self.compare_correct(array(self.x), None, self.means1, self.vars1, axis=1)
self.compare_unweighted(array(self.x), axis=1)
self.compare_distributions(array(self.x), self.x, axis=1)

self.compare_correct(array(self.x.T), None, self.means0, self.vars0, axis=1)
self.compare_unweighted(array(self.x.T), axis=1)
self.compare_distributions(array(self.x.T), self.x.T, axis=1)

self.compare_correct(array(self.x), self.w1, self.means1w, self.vars1w, axis=1)
self.compare_distributions(array(self.x), self.x, weights=self.w1, axis=1)

self.compare_correct(array(self.x.T), self.w0, self.means0w, self.vars0w, axis=1)
self.compare_distributions(array(self.x.T), self.x.T, weights=self.w0, axis=1)


class TestDigitize(unittest.TestCase):
def setUp(self):
# pylint: disable=bad-whitespace
Expand Down

0 comments on commit 7d18600

Please sign in to comment.