Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Statistics.countnans: Fix sparse implementation and add axis support #2558

Closed
Closed
Show file tree
Hide file tree
Changes from 12 commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
b39db6e
Statistics.countnans: Fix sparse implementation and add axis support
pavlin-policar Sep 4, 2017
1d2bee0
Statistics.bincount: Fix sparse implementation
pavlin-policar Sep 8, 2017
ef2ba73
Statistics.tests: Implement dense_sparse decorator
pavlin-policar Sep 8, 2017
ee8634b
Statistics.countnans: Support 2d weights for sparse matrices
pavlin-policar Sep 8, 2017
941bd2b
Statistics.digitize: Move tests to own class and use dense_sparse dec…
pavlin-policar Sep 8, 2017
ea74b94
Statistics.bincount: Add weight support to sparse, add docstring
pavlin-policar Sep 8, 2017
ab5cc8b
Statistics: Implement sparse_count_zeros
pavlin-policar Sep 8, 2017
b4eb25a
Statistics.countnans: Add dtype param support to sparse
pavlin-policar Sep 8, 2017
ca4c80f
Table._compute_distributions: Fix parameter ordering to bincount call
pavlin-policar Sep 8, 2017
09ddc33
Statistics.sparse_has_zeros: Make public
pavlin-policar Sep 9, 2017
0057143
Table._compute_distributions: Correctly count zeros in sparse continu…
pavlin-policar Sep 9, 2017
a21af1a
DomainDistribution: Change tests to check for true zero counts
pavlin-policar Sep 9, 2017
d7d91c8
TestNormalize: Fix failing test due to previous handling of zeros in …
pavlin-policar Sep 9, 2017
afa3df8
Statistics.countnans: Fix copy=False param from coo.tocsr call
pavlin-policar Sep 9, 2017
6f12808
Pylint: Add pylint ignores to more human-friendly formatted matrices
pavlin-policar Sep 9, 2017
dd516a7
Statistics.countnans: Support csc_matrices
pavlin-policar Oct 20, 2017
e515f30
Statistics: Rename sparse_zeros to sparse_implicit_zeros
pavlin-policar Oct 20, 2017
e4206e2
Statistics.tests: Inject explicit zeros into dense_sparse decorator
pavlin-policar Oct 20, 2017
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 39 additions & 22 deletions Orange/data/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@
)
from Orange.data.util import SharedComputeValue, vstack, hstack
from Orange.statistics.util import bincount, countnans, contingency, \
stats as fast_stats
stats as fast_stats, sparse_has_zeros, sparse_count_zeros, \
sparse_zero_weights
from Orange.util import flatten

__all__ = ["dataset_dirs", "get_sample_datasets_dir", "RowInstance", "Table"]
Expand Down Expand Up @@ -1384,42 +1385,58 @@ def _compute_distributions(self, columns=None):
columns = range(len(self.domain.variables))
else:
columns = [self.domain.index(var) for var in columns]

distributions = []
if sp.issparse(self.X):
self.X = self.X.tocsc()

W = self.W.ravel() if self.has_weights() else None

for col in columns:
var = self.domain[col]
variable = self.domain[col]

# Select the correct data column from X, Y or metas
if 0 <= col < self.X.shape[1]:
m = self.X[:, col]
x = self.X[:, col]
elif col < 0:
m = self.metas[:, col * (-1) - 1]
if np.issubdtype(m.dtype, np.dtype(object)):
m = m.astype(float)
x = self.metas[:, col * (-1) - 1]
if np.issubdtype(x.dtype, np.dtype(object)):
x = x.astype(float)
else:
m = self._Y[:, col - self.X.shape[1]]
if var.is_discrete:
dist, unknowns = bincount(m, len(var.values) - 1, W)
elif not m.shape[0]:
x = self._Y[:, col - self.X.shape[1]]

if variable.is_discrete:
dist, unknowns = bincount(x, weights=W, max_val=len(variable.values) - 1)
elif not x.shape[0]:
dist, unknowns = np.zeros((2, 0)), 0
else:
if W is not None:
unknowns = countnans(m, W)
if sp.issparse(m):
arg_sort = np.argsort(m.data)
ranks = m.indices[arg_sort]
vals = np.vstack((m.data[arg_sort], W[ranks]))
if sp.issparse(x):
arg_sort = np.argsort(x.data)
ranks = x.indices[arg_sort]
vals = np.vstack((x.data[arg_sort], W[ranks]))
else:
ranks = np.argsort(m)
vals = np.vstack((m[ranks], W[ranks]))
ranks = np.argsort(x)
vals = np.vstack((x[ranks], W[ranks]))
else:
unknowns = countnans(m.astype(float))
if sp.issparse(m):
m = m.data
vals = np.ones((2, m.shape[0]))
vals[0, :] = m
x_values = x.data if sp.issparse(x) else x
vals = np.ones((2, x_values.shape[0]))
vals[0, :] = x_values
vals[0, :].sort()

dist = np.array(_valuecount.valuecount(vals))
# If sparse, then 0s will not be counted with `valuecount`, so
# we have to add them to the result manually.
if sp.issparse(x) and sparse_has_zeros(x):
if W is not None:
zero_weights = sparse_zero_weights(x, W).sum()
else:
zero_weights = sparse_count_zeros(x)
zero_vec = [0, zero_weights]
dist = np.insert(dist, np.searchsorted(dist[0], 0), zero_vec, axis=1)
# Since `countnans` assumes vector shape to be (1, n) and `x`
# shape is (n, 1), we pass the transpose
unknowns = countnans(x.T, W)
distributions.append((dist, unknowns))

return distributions
Expand Down
190 changes: 143 additions & 47 deletions Orange/statistics/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,80 +10,181 @@
import bottleneck as bn


def _count_nans_per_row_sparse(X, weights, dtype=None):
    """Count the number of nans (undefined) values per row.

    Parameters
    ----------
    X : sp.spmatrix
        The sparse matrix whose nan entries are counted. It is iterated
        row-wise, so a csr layout is assumed — callers convert beforehand;
        TODO confirm all call sites pass csr.
    weights : np.ndarray or None
        Optional weights. A 1d array holds one weight per row; a 2d array of
        the same shape as `X` weights individual entries.
    dtype : dtype, optional
        The dtype of the resulting array. Note `np.dtype(None)` is float64,
        so passing `None` yields a float result.

    Returns
    -------
    np.ndarray
        One (possibly weighted) nan count per row of `X`.

    """
    if weights is not None:
        # Work in coo format so every stored value has explicit row/col
        # indices, then keep only the nan entries.
        X = X.tocoo(copy=False)
        nonzero_mask = np.isnan(X.data)
        nan_rows, nan_cols = X.row[nonzero_mask], X.col[nonzero_mask]

        if weights.ndim == 1:
            data_weights = weights[nan_rows]
        else:
            data_weights = weights[nan_rows, nan_cols]

        # Build a sparse matrix holding the weight of each nan entry, then
        # sum it up row by row.
        w = sp.coo_matrix((data_weights, (nan_rows, nan_cols)), shape=X.shape)
        w = w.tocsr(copy=False)

        return np.fromiter((np.sum(row.data) for row in w), dtype=dtype)

    return np.fromiter((np.isnan(row.data).sum() for row in X), dtype=dtype)


def sparse_count_zeros(x):
    """Count the number of implicit zeros in a sparse matrix.

    Explicitly stored zeros are included in `x.nnz` and are therefore *not*
    counted by this function.

    Parameters
    ----------
    x : sp.spmatrix

    Returns
    -------
    int
        The number of entries of `x` that are not explicitly stored.

    Raises
    ------
    TypeError
        If `x` is not a scipy sparse matrix.

    """
    if not sp.issparse(x):
        raise TypeError('The matrix provided was not sparse.')
    return np.prod(x.shape) - x.nnz


def sparse_has_zeros(x):
    """Check if a sparse matrix contains any implicit zeros.

    Explicitly stored zeros are included in `x.nnz` and therefore do not
    make this function return True on their own.

    Parameters
    ----------
    x : sp.spmatrix

    Returns
    -------
    bool
        True if at least one entry of `x` is not explicitly stored.

    Raises
    ------
    TypeError
        If `x` is not a scipy sparse matrix.

    """
    if not sp.issparse(x):
        raise TypeError('The matrix provided was not sparse.')
    return np.prod(x.shape) != x.nnz


def bincount(X, max_val=None, weights=None, minlength=None):
def sparse_zero_weights(x, weights):
    """Extract the weight values of all implicit zeros in a sparse matrix.

    Parameters
    ----------
    x : sp.spmatrix
        Expected to be a vector, shaped either (1, n) or (n, 1), since the
        flat `x.indices` are matched directly against positions in `weights`.
    weights : np.ndarray
        A 1d array with one weight per entry of the vector `x`.

    Returns
    -------
    np.ndarray
        The weights at the positions where `x` holds an implicit zero.
        Explicitly stored zeros appear in `x.indices` and are excluded.

    Raises
    ------
    TypeError
        If `x` is not a scipy sparse matrix.
    NotImplementedError
        If `weights` is not one-dimensional.

    """
    if not sp.issparse(x):
        raise TypeError('The matrix provided was not sparse.')

    if weights.ndim == 1:
        n_items = np.prod(x.shape)
        # Positions that carry no stored value are the implicit zeros.
        zero_indices = np.setdiff1d(np.arange(n_items), x.indices, assume_unique=True)
        return weights[zero_indices]
    else:
        # Can easily be implemented using a coo_matrix.
        # NOTE: the original raised `NotImplemented`, which is not an
        # exception class and itself fails with a TypeError when raised.
        raise NotImplementedError(
            'Computing zero weights on an n-dimensional weight matrix is '
            'not implemented'
        )


def bincount(x, weights=None, max_val=None, minlength=None):
    """Return counts of values in array x.

    Works kind of like np.bincount(), except that it also supports floating
    arrays with nans.

    Parameters
    ----------
    x : array_like or sp.spmatrix, 1 dimension, nonnegative ints
        Input array.
    weights : array_like, optional
        Weights, array of the same shape as x.
    max_val : int, optional
        Indicates the maximum value we expect to find in x and sets the result
        array size accordingly. E.g. if we set `max_val=2` yet the largest
        value in x is 1, the result will contain a bin for the value 2, and
        will be set to 0. See examples for usage.
    minlength : int, optional
        A minimum number of bins for the output array. See numpy docs for info.

    Returns
    -------
    Tuple[np.ndarray, int]
        Returns the bincounts and the number of NaN values.

    Examples
    --------
    In case `max_val` is provided, the return shape includes bins for these
    values as well, even if they do not appear in the data. However, this will
    not truncate the bincount if values larger than `max_val` are found.
    >>> bincount([0, 0, 1, 1, 2], max_val=4)
    (array([ 2.,  2.,  1.,  0.,  0.]), 0.0)
    >>> bincount([0, 1, 2, 3, 4], max_val=2)
    (array([ 1.,  1.,  1.,  1.,  1.]), 0.0)

    """
    # Store the original matrix before any manipulation to check for sparse
    x_original = x
    if sp.issparse(x):
        if weights is not None:
            # Sum of the weights of all *implicit* zeros; explicitly stored
            # values keep their own weights through `weights[x.indices]`.
            # NOTE(review): `x.indices` assumes a csr/csc vector — confirm
            # callers do not pass coo here.
            zero_weights = sparse_zero_weights(x, weights).sum()
            weights = weights[x.indices]
        else:
            zero_weights = sparse_count_zeros(x)

        x = x.data

    x = np.asanyarray(x)
    if x.dtype.kind == 'f' and bn.anynan(x):
        nonnan = ~np.isnan(x)
        x = x[nonnan]
        if weights is not None:
            nans = (~nonnan * weights).sum(axis=0)
            weights = weights[nonnan]
        else:
            nans = (~nonnan).sum(axis=0)
    else:
        nans = 0. if x.ndim == 1 else np.zeros(x.shape[1], dtype=float)

    if minlength is None and max_val is not None:
        minlength = max_val + 1

    if minlength is not None and minlength <= 0:
        bc = np.array([])
    else:
        bc = np.bincount(
            x.astype(np.int32, copy=False), weights=weights, minlength=minlength
        ).astype(float)
    # Since sparse matrices only store non-zero values, the implicit zeros
    # were not seen by `np.bincount` above, so add them to the zero bin.
    # Use `+=` rather than `=` so that explicitly stored zeros, which were
    # already counted above, are not discarded.
    if sp.issparse(x_original):
        bc[0] += zero_weights

    return bc, nans


def countnans(x, weights=None, axis=None, dtype=None, keepdims=False):
    """
    Count the undefined elements in an array along given axis.

    Parameters
    ----------
    x : array_like or sp.spmatrix
    weights : array_like, optional
        Weights to weight the nans with, before or after counting (depending
        on the weights shape).
    axis : int, optional
        Axis along which to count; None counts over the whole array. Only
        axes 0 and 1 are supported for sparse input.
    dtype : dtype, optional
        The data type of the returned array.
    keepdims : bool, optional
        Passed through to `np.sum`; only meaningful for dense input.

    Returns
    -------
    Union[np.ndarray, float]

    """
    if not sp.issparse(x):
        x = np.asanyarray(x)
        isnan = np.isnan(x)
        # Element-wise weights are applied before summing; per-axis weights
        # (different shape) are applied to the counts afterwards.
        if weights is not None and weights.shape == x.shape:
            isnan = isnan * weights

        counts = isnan.sum(axis=axis, dtype=dtype, keepdims=keepdims)
        if weights is not None and weights.shape != x.shape:
            counts = counts * weights
    else:
        assert axis in [None, 0, 1], 'Only axis 0 and 1 are currently supported'
        # To have consistent behaviour with dense matrices, raise error when
        # `axis=1` and the array is 1d (e.g. [[1 2 3]])
        if x.shape[0] == 1 and axis == 1:
            raise ValueError('Axis %d is out of bounds' % axis)

        # The helper counts per row, so transpose when counting per column
        # (axis None/0).
        arr = x if axis == 1 else x.T

        if weights is not None:
            weights = weights if axis == 1 else weights.T

        arr = arr.tocsr()
        counts = _count_nans_per_row_sparse(arr, weights, dtype=dtype)

        # We want a scalar value if `axis=None` or if the sparse matrix is
        # actually a vector (e.g. [[1 2 3]]), but has `ndim=2` due to scipy
        # implementation
        if axis is None or x.shape[0] == 1:
            counts = counts.sum(dtype=dtype)

    return counts


Expand Down Expand Up @@ -234,17 +335,12 @@ def weighted_mean():
X.shape[0] - nans))


def _sparse_has_zeros(x):
""" Check if sparse matrix contains any implicit zeros. """
return np.prod(x.shape) != x.nnz


def _nan_min_max(x, func, axis=0):
if not sp.issparse(x):
return func(x, axis=axis)
if axis is None:
extreme = func(x.data, axis=axis) if x.nnz else float('nan')
if _sparse_has_zeros(x):
if sparse_has_zeros(x):
extreme = func([0, extreme])
return extreme
if axis == 0:
Expand All @@ -257,7 +353,7 @@ def _nan_min_max(x, func, axis=0):
for row in x:
values = row.data
extreme = func(values) if values.size else float('nan')
if _sparse_has_zeros(row):
if sparse_has_zeros(row):
extreme = func([0, extreme])
r.append(extreme)
return np.array(r)
Expand Down Expand Up @@ -323,7 +419,7 @@ def unique(x, return_counts=False):
if not sp.issparse(x):
return np.unique(x, return_counts=return_counts)

implicit_zeros = np.prod(x.shape) - x.nnz
implicit_zeros = sparse_count_zeros(x)
explicit_zeros = not np.all(x.data)
r = np.unique(x.data, return_counts=return_counts)
if not implicit_zeros:
Expand Down
Loading