Merge pull request #6205 from markotoplak/stats-nan-mean-variance

Weighted variance computation for sparse and dense arrays
biolab · Nov 18, 2022 · 7d18600 · 7d18600
2 parents af8124a + f0ee0d6
commit 7d18600
Show file tree

Hide file tree

Showing 3 changed files with 130 additions and 1 deletion.
diff --git a/Orange/statistics/distribution.py b/Orange/statistics/distribution.py
@@ -318,9 +318,13 @@ def sample(self, size=None, replace=True):
         return np.random.choice(self[0, :], size, replace, normalized[1, :])
 
     def mean(self):
+        if len(self[0]) == 0:
+            return np.nan
         return np.average(np.asarray(self[0]), weights=np.asarray(self[1]))
 
     def variance(self):
+        if len(self[0]) == 0:
+            return np.nan
         mean = self.mean()
         return np.dot((self[0] - mean) ** 2, self[1]) / np.sum(self[1])
 

diff --git a/Orange/statistics/util.py b/Orange/statistics/util.py
@@ -476,6 +476,45 @@ def nanmean(x, axis=None, weights=None):
     return means
 
 
+def nan_mean_var(x, axis=None, weights=None):
+    """
+    Computes means and variance of dense and sparse matrices.
+    Supports weights. Based on mean_variance_axis.
+    """
+    if axis is None:
+        raise NotImplementedError("axis=None is not supported")
+
+    if not sp.issparse(x):
+        if weights is None:
+            means = bn.nanmean(x, axis=axis)
+            variances = bn.nanvar(x, axis=axis)
+        else:
+            if axis == 0:
+                weights = weights.reshape(-1, 1)
+            elif axis == 1:
+                weights = weights.reshape(1, -1)
+            else:
+                raise NotImplementedError
+
+            nanw = ~np.isnan(x) * weights  # do not divide by non-used weights
+            wsum = np.sum(nanw, axis=axis)
+            means = bn.nansum(x * weights, axis=axis) / wsum
+
+            if axis == 0:
+                mr = means.reshape(1, -1)
+            elif axis == 1:
+                mr = means.reshape(-1, 1)
+
+            variances = bn.nansum(((x - mr) ** 2) * weights, axis=axis) / wsum
+    else:
+        # mean_variance_axis is picky regarding the input type
+        if weights is not None:
+            weights = weights.astype(float)
+        means, variances = mean_variance_axis(x, axis=axis, weights=weights)
+
+    return means, variances
+
+
 def nanvar(x, axis=None, ddof=0):
     """ Equivalent of np.nanvar that supports sparse or dense matrices. """
     def nanvar_sparse(x):

diff --git a/Orange/tests/test_statistics.py b/Orange/tests/test_statistics.py
@@ -9,11 +9,13 @@
 from scipy.sparse import csr_matrix, issparse, lil_matrix, csc_matrix, \
     SparseEfficiencyWarning
 
+from Orange.data import Table, Domain, ContinuousVariable
 from Orange.data.util import assure_array_dense
+from Orange.statistics.distribution import get_distributions_for_columns
 from Orange.statistics.util import bincount, countnans, contingency, digitize, \
     mean, nanmax, nanmean, nanmedian, nanmin, nansum, nanunique, stats, std, \
     unique, var, nanstd, nanvar, nanmode, nan_to_num, FDR, isnan, any_nan, \
-    all_nan
+    all_nan, nan_mean_var
 from sklearn.utils import check_random_state
 
 
@@ -418,6 +420,90 @@ def test_weights_axis_1(self, array):
         )
 
 
+class TestNanMeanVar(unittest.TestCase):
+    def setUp(self):
+        self.x = np.array([[0, 1, 5],
+                           [3, 4, np.nan],
+                           [2, np.nan, np.nan],
+                           [np.nan, np.nan, np.nan]])
+        self.means0 = [5/3, 5/2, 5/1]
+        self.means1 = [6/3, 7/2, 2/1, np.nan]
+        self.vars0 = [14/9, 9/4, 0]
+        self.vars1 = [14/3, 1/4, 0, np.nan]
+        self.w0 = np.array([4, 3, 2, 1])
+        self.w1 = np.array([1, 2, 3])
+        self.means0w = [13/9, 16/7, 20/4]
+        self.means1w = [17/6, 11/3, 2/1, np.nan]
+        self.vars0w = [1.8024691358024691, 2.2040816326530615, 0]
+        self.vars1w = [4.805555555555556, 0.22222222222222224, 0, np.nan]
+
+    @dense_sparse
+    def test_axis_none(self, array):
+        with self.assertRaises(NotImplementedError):
+            nan_mean_var(array(self.x))
+
+    @staticmethod
+    def compare_correct(x, weights, means_correct, vars_correct, axis=None):
+        means, variances = nan_mean_var(x, axis=axis, weights=weights)
+        np.testing.assert_almost_equal(means_correct, means)
+        np.testing.assert_almost_equal(vars_correct, variances)
+
+    @staticmethod
+    def compare_unweighted(x, axis=None):
+        means, variances = nan_mean_var(x, axis=axis)
+        np.testing.assert_almost_equal(nanmean(x, axis=axis), means)
+        np.testing.assert_almost_equal(nanvar(x, axis=axis), variances)
+
+    @staticmethod
+    def compare_distributions(x, dense, weights=None, axis=None):
+        def from_distributions(x, weights):
+            if axis == 1:
+                x = x.T
+            domain = Domain(ContinuousVariable(str(i)) for i in range(x.shape[1]))
+            table = Table.from_numpy(domain, x, W=weights)
+            dists = get_distributions_for_columns(table, list(range(x.shape[1])))
+            means = [d.mean() for d in dists]
+            variances = [d.variance() for d in dists]
+            return means, variances
+
+        means, variances = nan_mean_var(x, axis=axis, weights=weights)
+        md, vd = from_distributions(dense, weights)
+        np.testing.assert_almost_equal(md, means)
+        np.testing.assert_almost_equal(vd, variances)
+
+    @dense_sparse
+    def test_axis_0(self, array):
+        self.compare_correct(array(self.x), None, self.means0, self.vars0, axis=0)
+        self.compare_unweighted(array(self.x), axis=0)
+        self.compare_distributions(array(self.x), self.x, axis=0)
+
+        self.compare_correct(array(self.x.T), None, self.means1, self.vars1, axis=0)
+        self.compare_unweighted(array(self.x.T), axis=0)
+        self.compare_distributions(array(self.x.T), self.x.T, axis=0)
+
+        self.compare_correct(array(self.x), self.w0, self.means0w, self.vars0w, axis=0)
+        self.compare_distributions(array(self.x), self.x, weights=self.w0, axis=0)
+
+        self.compare_correct(array(self.x.T), self.w1, self.means1w, self.vars1w, axis=0)
+        self.compare_distributions(array(self.x.T), self.x.T, weights=self.w1, axis=0)
+
+    @dense_sparse
+    def test_axis_1(self, array):
+        self.compare_correct(array(self.x), None, self.means1, self.vars1, axis=1)
+        self.compare_unweighted(array(self.x), axis=1)
+        self.compare_distributions(array(self.x), self.x, axis=1)
+
+        self.compare_correct(array(self.x.T), None, self.means0, self.vars0, axis=1)
+        self.compare_unweighted(array(self.x.T), axis=1)
+        self.compare_distributions(array(self.x.T), self.x.T, axis=1)
+
+        self.compare_correct(array(self.x), self.w1, self.means1w, self.vars1w, axis=1)
+        self.compare_distributions(array(self.x), self.x, weights=self.w1, axis=1)
+
+        self.compare_correct(array(self.x.T), self.w0, self.means0w, self.vars0w, axis=1)
+        self.compare_distributions(array(self.x.T), self.x.T, weights=self.w0, axis=1)
+
+
 class TestDigitize(unittest.TestCase):
     def setUp(self):
         # pylint: disable=bad-whitespace