Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Statistics.countnans: Fix sparse implementation and add axis support #2558

Closed
Closed
Show file tree
Hide file tree
Changes from 12 commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
b39db6e
Statistics.countnans: Fix sparse implementation and add axis support
pavlin-policar Sep 4, 2017
1d2bee0
Statistics.bincount: Fix sparse implementation
pavlin-policar Sep 8, 2017
ef2ba73
Statistics.tests: Implement dense_sparse decorator
pavlin-policar Sep 8, 2017
ee8634b
Statistics.countnans: Support 2d weights for sparse matrices
pavlin-policar Sep 8, 2017
941bd2b
Statistics.digitize: Move tests to own class and use dense_sparse dec…
pavlin-policar Sep 8, 2017
ea74b94
Statistics.bincount: Add weight support to sparse, add docstring
pavlin-policar Sep 8, 2017
ab5cc8b
Statistics: Implement sparse_count_zeros
pavlin-policar Sep 8, 2017
b4eb25a
Statistics.countnans: Add dtype param support to sparse
pavlin-policar Sep 8, 2017
ca4c80f
Table._compute_distributions: Fix parameter ordering to bincount call
pavlin-policar Sep 8, 2017
09ddc33
Statistics.sparse_has_zeros: Make public
pavlin-policar Sep 9, 2017
0057143
Table._compute_distributions: Correctly count zeros in sparse continu…
pavlin-policar Sep 9, 2017
a21af1a
DomainDistribution: Change tests to check for true zero counts
pavlin-policar Sep 9, 2017
d7d91c8
TestNormalize: Fix failing test due to previous handling of zeros in …
pavlin-policar Sep 9, 2017
afa3df8
Statistics.countnans: Fix copy=False param from coo.tocsr call
pavlin-policar Sep 9, 2017
6f12808
Pylint: Add pylint ignores to more human-friendly formatted matrices
pavlin-policar Sep 9, 2017
dd516a7
Statistics.countnans: Support csc_matrices
pavlin-policar Oct 20, 2017
e515f30
Statistics: Rename sparse_zeros to sparse_implicit_zeros
pavlin-policar Oct 20, 2017
e4206e2
Statistics.tests: Inject explicit zeros into dense_sparse decorator
pavlin-policar Oct 20, 2017
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 39 additions & 22 deletions Orange/data/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@
)
from Orange.data.util import SharedComputeValue, vstack, hstack
from Orange.statistics.util import bincount, countnans, contingency, \
stats as fast_stats
stats as fast_stats, sparse_has_zeros, sparse_count_zeros, \
sparse_zero_weights
from Orange.util import flatten

__all__ = ["dataset_dirs", "get_sample_datasets_dir", "RowInstance", "Table"]
Expand Down Expand Up @@ -1384,42 +1385,58 @@ def _compute_distributions(self, columns=None):
columns = range(len(self.domain.variables))
else:
columns = [self.domain.index(var) for var in columns]

distributions = []
if sp.issparse(self.X):
self.X = self.X.tocsc()

W = self.W.ravel() if self.has_weights() else None

for col in columns:
var = self.domain[col]
variable = self.domain[col]

# Select the correct data column from X, Y or metas
if 0 <= col < self.X.shape[1]:
m = self.X[:, col]
x = self.X[:, col]
elif col < 0:
m = self.metas[:, col * (-1) - 1]
if np.issubdtype(m.dtype, np.dtype(object)):
m = m.astype(float)
x = self.metas[:, col * (-1) - 1]
if np.issubdtype(x.dtype, np.dtype(object)):
x = x.astype(float)
else:
m = self._Y[:, col - self.X.shape[1]]
if var.is_discrete:
dist, unknowns = bincount(m, len(var.values) - 1, W)
elif not m.shape[0]:
x = self._Y[:, col - self.X.shape[1]]

if variable.is_discrete:
dist, unknowns = bincount(x, weights=W, max_val=len(variable.values) - 1)
elif not x.shape[0]:
dist, unknowns = np.zeros((2, 0)), 0
else:
if W is not None:
unknowns = countnans(m, W)
if sp.issparse(m):
arg_sort = np.argsort(m.data)
ranks = m.indices[arg_sort]
vals = np.vstack((m.data[arg_sort], W[ranks]))
if sp.issparse(x):
arg_sort = np.argsort(x.data)
ranks = x.indices[arg_sort]
vals = np.vstack((x.data[arg_sort], W[ranks]))
else:
ranks = np.argsort(m)
vals = np.vstack((m[ranks], W[ranks]))
ranks = np.argsort(x)
vals = np.vstack((x[ranks], W[ranks]))
else:
unknowns = countnans(m.astype(float))
if sp.issparse(m):
m = m.data
vals = np.ones((2, m.shape[0]))
vals[0, :] = m
x_values = x.data if sp.issparse(x) else x
vals = np.ones((2, x_values.shape[0]))
vals[0, :] = x_values
vals[0, :].sort()

dist = np.array(_valuecount.valuecount(vals))
# If sparse, then 0s will not be counted with `valuecount`, so
# we have to add them to the result manually.
if sp.issparse(x) and sparse_has_zeros(x):
if W is not None:
zero_weights = sparse_zero_weights(x, W).sum()
else:
zero_weights = sparse_count_zeros(x)
zero_vec = [0, zero_weights]
dist = np.insert(dist, np.searchsorted(dist[0], 0), zero_vec, axis=1)
# Since `countnans` assumes vector shape to be (1, n) and `x`
# shape is (n, 1), we pass the transpose
unknowns = countnans(x.T, W)
distributions.append((dist, unknowns))

return distributions
Expand Down
190 changes: 143 additions & 47 deletions Orange/statistics/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,80 +10,181 @@
import bottleneck as bn


def _count_nans_per_row_sparse(X, weights, dtype=None):
    """Count the number of nans (undefined) values per row.

    Parameters
    ----------
    X : sp.spmatrix
        The sparse matrix whose nan entries are counted. It is iterated
        row-wise, so a csr layout is assumed — callers convert beforehand;
        TODO confirm all call sites pass csr.
    weights : np.ndarray or None
        Optional weights. A 1d array holds one weight per row; a 2d array of
        the same shape as `X` weights individual entries.
    dtype : dtype, optional
        The dtype of the resulting array. Note `np.dtype(None)` is float64,
        so passing `None` yields a float result.

    Returns
    -------
    np.ndarray
        One (possibly weighted) nan count per row of `X`.

    """
    if weights is not None:
        # Work in coo format so every stored value has explicit row/col
        # indices, then keep only the nan entries.
        X = X.tocoo(copy=False)
        nonzero_mask = np.isnan(X.data)
        nan_rows, nan_cols = X.row[nonzero_mask], X.col[nonzero_mask]

        if weights.ndim == 1:
            data_weights = weights[nan_rows]
        else:
            data_weights = weights[nan_rows, nan_cols]

        # Build a sparse matrix holding the weight of each nan entry, then
        # sum it up row by row.
        w = sp.coo_matrix((data_weights, (nan_rows, nan_cols)), shape=X.shape)
        w = w.tocsr(copy=False)

        return np.fromiter((np.sum(row.data) for row in w), dtype=dtype)

    return np.fromiter((np.isnan(row.data).sum() for row in X), dtype=dtype)


def sparse_count_zeros(x):
    """Count the number of implicit zeros in a sparse matrix.

    Explicitly stored zeros are included in `x.nnz` and are therefore *not*
    counted by this function.

    Parameters
    ----------
    x : sp.spmatrix

    Returns
    -------
    int
        The number of entries of `x` that are not explicitly stored.

    Raises
    ------
    TypeError
        If `x` is not a scipy sparse matrix.

    """
    if not sp.issparse(x):
        raise TypeError('The matrix provided was not sparse.')
    return np.prod(x.shape) - x.nnz


def sparse_has_zeros(x):
    """Check if a sparse matrix contains any implicit zeros.

    Explicitly stored zeros are included in `x.nnz` and therefore do not
    make this function return True on their own.

    Parameters
    ----------
    x : sp.spmatrix

    Returns
    -------
    bool
        True if at least one entry of `x` is not explicitly stored.

    Raises
    ------
    TypeError
        If `x` is not a scipy sparse matrix.

    """
    if not sp.issparse(x):
        raise TypeError('The matrix provided was not sparse.')
    return np.prod(x.shape) != x.nnz


def bincount(X, max_val=None, weights=None, minlength=None):
def sparse_zero_weights(x, weights):
    """Extract the weight values of all implicit zeros in a sparse matrix.

    Parameters
    ----------
    x : sp.spmatrix
        Expected to be a vector, shaped either (1, n) or (n, 1), since the
        flat `x.indices` are matched directly against positions in `weights`.
    weights : np.ndarray
        A 1d array with one weight per entry of the vector `x`.

    Returns
    -------
    np.ndarray
        The weights at the positions where `x` holds an implicit zero.
        Explicitly stored zeros appear in `x.indices` and are excluded.

    Raises
    ------
    TypeError
        If `x` is not a scipy sparse matrix.
    NotImplementedError
        If `weights` is not one-dimensional.

    """
    if not sp.issparse(x):
        raise TypeError('The matrix provided was not sparse.')

    if weights.ndim == 1:
        n_items = np.prod(x.shape)
        # Positions that carry no stored value are the implicit zeros.
        zero_indices = np.setdiff1d(np.arange(n_items), x.indices, assume_unique=True)
        return weights[zero_indices]
    else:
        # Can easily be implemented using a coo_matrix.
        # NOTE: the original raised `NotImplemented`, which is not an
        # exception class and itself fails with a TypeError when raised.
        raise NotImplementedError(
            'Computing zero weights on an n-dimensional weight matrix is '
            'not implemented'
        )


def bincount(x, weights=None, max_val=None, minlength=None):
    """Return counts of values in array x.

    Works kind of like np.bincount(), except that it also supports floating
    arrays with nans.

    Parameters
    ----------
    x : array_like or sp.spmatrix, 1 dimension, nonnegative ints
        Input array.
    weights : array_like, optional
        Weights, array of the same shape as x.
    max_val : int, optional
        Indicates the maximum value we expect to find in x and sets the result
        array size accordingly. E.g. if we set `max_val=2` yet the largest
        value in x is 1, the result will contain a bin for the value 2, and
        will be set to 0. See examples for usage.
    minlength : int, optional
        A minimum number of bins for the output array. See numpy docs for info.

    Returns
    -------
    Tuple[np.ndarray, int]
        Returns the bincounts and the number of NaN values.

    Examples
    --------
    In case `max_val` is provided, the return shape includes bins for these
    values as well, even if they do not appear in the data. However, this will
    not truncate the bincount if values larger than `max_val` are found.
    >>> bincount([0, 0, 1, 1, 2], max_val=4)
    (array([ 2.,  2.,  1.,  0.,  0.]), 0.0)
    >>> bincount([0, 1, 2, 3, 4], max_val=2)
    (array([ 1.,  1.,  1.,  1.,  1.]), 0.0)

    """
    # Store the original matrix before any manipulation to check for sparse
    x_original = x
    if sp.issparse(x):
        if weights is not None:
            # Sum of the weights of all *implicit* zeros; explicitly stored
            # values keep their own weights through `weights[x.indices]`.
            # NOTE(review): `x.indices` assumes a csr/csc vector — confirm
            # callers do not pass coo here.
            zero_weights = sparse_zero_weights(x, weights).sum()
            weights = weights[x.indices]
        else:
            zero_weights = sparse_count_zeros(x)

        x = x.data

    x = np.asanyarray(x)
    if x.dtype.kind == 'f' and bn.anynan(x):
        nonnan = ~np.isnan(x)
        x = x[nonnan]
        if weights is not None:
            nans = (~nonnan * weights).sum(axis=0)
            weights = weights[nonnan]
        else:
            nans = (~nonnan).sum(axis=0)
    else:
        nans = 0. if x.ndim == 1 else np.zeros(x.shape[1], dtype=float)

    if minlength is None and max_val is not None:
        minlength = max_val + 1

    if minlength is not None and minlength <= 0:
        bc = np.array([])
    else:
        bc = np.bincount(
            x.astype(np.int32, copy=False), weights=weights, minlength=minlength
        ).astype(float)
    # Since sparse matrices only store non-zero values, the implicit zeros
    # were not seen by `np.bincount` above, so add them to the zero bin.
    # Use `+=` rather than `=` so that explicitly stored zeros, which were
    # already counted above, are not discarded.
    if sp.issparse(x_original):
        bc[0] += zero_weights

    return bc, nans


def countnans(x, weights=None, axis=None, dtype=None, keepdims=False):
    """
    Count the undefined elements in an array along given axis.

    Parameters
    ----------
    x : array_like or sp.spmatrix
    weights : array_like, optional
        Weights to weight the nans with, before or after counting (depending
        on the weights shape).
    axis : int, optional
        Axis along which to count; None counts over the whole array. Only
        axes 0 and 1 are supported for sparse input.
    dtype : dtype, optional
        The data type of the returned array.
    keepdims : bool, optional
        Passed through to `np.sum`; only meaningful for dense input.

    Returns
    -------
    Union[np.ndarray, float]

    """
    if not sp.issparse(x):
        x = np.asanyarray(x)
        isnan = np.isnan(x)
        # Element-wise weights are applied before summing; per-axis weights
        # (different shape) are applied to the counts afterwards.
        if weights is not None and weights.shape == x.shape:
            isnan = isnan * weights

        counts = isnan.sum(axis=axis, dtype=dtype, keepdims=keepdims)
        if weights is not None and weights.shape != x.shape:
            counts = counts * weights
    else:
        assert axis in [None, 0, 1], 'Only axis 0 and 1 are currently supported'
        # To have consistent behaviour with dense matrices, raise error when
        # `axis=1` and the array is 1d (e.g. [[1 2 3]])
        if x.shape[0] == 1 and axis == 1:
            raise ValueError('Axis %d is out of bounds' % axis)

        # The helper counts per row, so transpose when counting per column
        # (axis None/0).
        arr = x if axis == 1 else x.T

        if weights is not None:
            weights = weights if axis == 1 else weights.T

        arr = arr.tocsr()
        counts = _count_nans_per_row_sparse(arr, weights, dtype=dtype)

        # We want a scalar value if `axis=None` or if the sparse matrix is
        # actually a vector (e.g. [[1 2 3]]), but has `ndim=2` due to scipy
        # implementation
        if axis is None or x.shape[0] == 1:
            counts = counts.sum(dtype=dtype)

    return counts


Expand Down Expand Up @@ -234,17 +335,12 @@ def weighted_mean():
X.shape[0] - nans))


def _sparse_has_zeros(x):
""" Check if sparse matrix contains any implicit zeros. """
return np.prod(x.shape) != x.nnz


def _nan_min_max(x, func, axis=0):
if not sp.issparse(x):
return func(x, axis=axis)
if axis is None:
extreme = func(x.data, axis=axis) if x.nnz else float('nan')
if _sparse_has_zeros(x):
if sparse_has_zeros(x):
extreme = func([0, extreme])
return extreme
if axis == 0:
Expand All @@ -257,7 +353,7 @@ def _nan_min_max(x, func, axis=0):
for row in x:
values = row.data
extreme = func(values) if values.size else float('nan')
if _sparse_has_zeros(row):
if sparse_has_zeros(row):
extreme = func([0, extreme])
r.append(extreme)
return np.array(r)
Expand Down Expand Up @@ -323,7 +419,7 @@ def unique(x, return_counts=False):
if not sp.issparse(x):
return np.unique(x, return_counts=return_counts)

implicit_zeros = np.prod(x.shape) - x.nnz
implicit_zeros = sparse_count_zeros(x)
explicit_zeros = not np.all(x.data)
r = np.unique(x.data, return_counts=return_counts)
if not implicit_zeros:
Expand Down
Loading