-
-
Notifications
You must be signed in to change notification settings - Fork 1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Statistics.countnans: Fix sparse implementation and add axis support #2558
Changes from 12 commits
b39db6e
1d2bee0
ef2ba73
ee8634b
941bd2b
ea74b94
ab5cc8b
b4eb25a
ca4c80f
09ddc33
0057143
a21af1a
d7d91c8
afa3df8
6f12808
dd516a7
e515f30
e4206e2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,80 +10,181 @@ | |
import bottleneck as bn | ||
|
||
|
||
def _count_nans_per_row_sparse(X, weights): | ||
def _count_nans_per_row_sparse(X, weights, dtype=None): | ||
""" Count the number of nans (undefined) values per row. """ | ||
items_per_row = 1 if X.ndim == 1 else X.shape[1] | ||
counts = np.ones(X.shape[0]) * items_per_row | ||
nnz_per_row = np.bincount(X.indices, minlength=len(counts)) | ||
counts -= nnz_per_row | ||
if weights is not None: | ||
counts *= weights | ||
return np.sum(counts) | ||
X = X.tocoo(copy=False) | ||
nonzero_mask = np.isnan(X.data) | ||
nan_rows, nan_cols = X.row[nonzero_mask], X.col[nonzero_mask] | ||
|
||
if weights.ndim == 1: | ||
data_weights = weights[nan_rows] | ||
else: | ||
data_weights = weights[nan_rows, nan_cols] | ||
|
||
w = sp.coo_matrix((data_weights, (nan_rows, nan_cols)), shape=X.shape) | ||
w = w.tocsr(copy=False) | ||
|
||
return np.fromiter((np.sum(row.data) for row in w), dtype=dtype) | ||
|
||
return np.fromiter((np.isnan(row.data).sum() for row in X), dtype=dtype) | ||
|
||
|
||
def sparse_count_zeros(x): | ||
""" Count the number of implicit zeros in a sparse matrix. """ | ||
if not sp.issparse(x): | ||
raise TypeError('The matrix provided was not sparse.') | ||
return np.prod(x.shape) - x.nnz | ||
|
||
|
||
def sparse_has_zeros(x): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Should we perhaps rename this to `sparse_count_implicit_zeros` to make sure explicit zeros aren't counted? |
||
""" Check if sparse matrix contains any implicit zeros. """ | ||
if not sp.issparse(x): | ||
raise TypeError('The matrix provided was not sparse.') | ||
return np.prod(x.shape) != x.nnz | ||
|
||
|
||
def bincount(X, max_val=None, weights=None, minlength=None): | ||
def sparse_zero_weights(x, weights): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this only meant to be used when both There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Adding a check here causes more problems than it's worth, because this can be called in two ways that are valid:
|
||
""" Extract the weight values of all zeros in a sparse matrix. """ | ||
if not sp.issparse(x): | ||
raise TypeError('The matrix provided was not sparse.') | ||
|
||
if weights.ndim == 1: | ||
n_items = np.prod(x.shape) | ||
zero_indices = np.setdiff1d(np.arange(n_items), x.indices, assume_unique=True) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does |
||
return weights[zero_indices] | ||
else: | ||
# Can easily be implemented using a coo_matrix | ||
raise NotImplemented( | ||
'Computing zero weights on ndimensinal weight matrix is not implemented' | ||
) | ||
|
||
|
||
def bincount(x, weights=None, max_val=None, minlength=None): | ||
"""Return counts of values in array X. | ||
|
||
Works kind of like np.bincount(), except that it also supports floating | ||
arrays with nans. | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Would you be willing to also document the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've added thorough documentation. |
||
Parameters | ||
---------- | ||
x : array_like, 1 dimension, nonnegative ints | ||
Input array. | ||
weights : array_like, optional | ||
Weights, array of the same shape as x. | ||
max_val : int, optional | ||
Indicates the maximum value we expect to find in X and sets the result | ||
array size accordingly. E.g. if we set `max_val=2` yet the largest | ||
value in X is 1, the result will contain a bin for the value 2, and | ||
will be set to 0. See examples for usage. | ||
minlength : int, optional | ||
A minimum number of bins for the output array. See numpy docs for info. | ||
|
||
Returns | ||
------- | ||
Tuple[np.ndarray, int] | ||
Returns the bincounts and the number of NaN values. | ||
|
||
Examples | ||
-------- | ||
In case `max_val` is provided, the return shape includes bins for these | ||
values as well, even if they do not appear in the data. However, this will | ||
not truncate the bincount if values larger than `max_val` are found. | ||
>>> bincount([0, 0, 1, 1, 2], max_val=4) | ||
(array([ 2., 2., 1., 0., 0.]), 0.0) | ||
>>> bincount([0, 1, 2, 3, 4], max_val=2) | ||
(array([ 1., 1., 1., 1., 1.]), 0.0) | ||
|
||
""" | ||
if sp.issparse(X): | ||
minlength = max_val + 1 | ||
bin_weights = weights[X.indices] if weights is not None else None | ||
return (np.bincount(X.data.astype(int), | ||
weights=bin_weights, | ||
minlength=minlength, ), | ||
_count_nans_per_row_sparse(X, weights)) | ||
|
||
X = np.asanyarray(X) | ||
if X.dtype.kind == 'f' and bn.anynan(X): | ||
nonnan = ~np.isnan(X) | ||
X = X[nonnan] | ||
# Store the original matrix before any manipulation to check for sparse | ||
x_original = x | ||
if sp.issparse(x): | ||
if weights is not None: | ||
zero_weights = sparse_zero_weights(x, weights).sum() | ||
weights = weights[x.indices] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does |
||
else: | ||
zero_weights = sparse_count_zeros(x) | ||
|
||
x = x.data | ||
|
||
x = np.asanyarray(x) | ||
if x.dtype.kind == 'f' and bn.anynan(x): | ||
nonnan = ~np.isnan(x) | ||
x = x[nonnan] | ||
if weights is not None: | ||
nans = (~nonnan * weights).sum(axis=0) | ||
weights = weights[nonnan] | ||
else: | ||
nans = (~nonnan).sum(axis=0) | ||
else: | ||
nans = 0. if X.ndim == 1 else np.zeros(X.shape[1], dtype=float) | ||
nans = 0. if x.ndim == 1 else np.zeros(x.shape[1], dtype=float) | ||
|
||
if minlength is None and max_val is not None: | ||
minlength = max_val + 1 | ||
bc = np.array([]) if minlength is not None and minlength <= 0 else \ | ||
np.bincount(X.astype(np.int32, copy=False), | ||
weights=weights, minlength=minlength).astype(float) | ||
|
||
if minlength is not None and minlength <= 0: | ||
bc = np.array([]) | ||
else: | ||
bc = np.bincount( | ||
x.astype(np.int32, copy=False), weights=weights, minlength=minlength | ||
).astype(float) | ||
# Since `csr_matrix.values` only contain non-zero values, we must count | ||
# those separately and set the appropriate bin | ||
if sp.issparse(x_original): | ||
bc[0] = zero_weights | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This should probably be |
||
|
||
return bc, nans | ||
|
||
|
||
def countnans(X, weights=None, axis=None, dtype=None, keepdims=False): | ||
def countnans(x, weights=None, axis=None, dtype=None, keepdims=False): | ||
""" | ||
Count the undefined elements in arr along given axis. | ||
Count the undefined elements in an array along given axis. | ||
|
||
Parameters | ||
---------- | ||
X : array_like | ||
weights : array_like | ||
x : array_like | ||
weights : array_like, optional | ||
Weights to weight the nans with, before or after counting (depending | ||
on the weights shape). | ||
axis : int, optional | ||
dtype : dtype, optional | ||
The data type of the returned array. | ||
|
||
Returns | ||
------- | ||
counts | ||
Union[np.ndarray, float] | ||
|
||
""" | ||
if not sp.issparse(X): | ||
X = np.asanyarray(X) | ||
isnan = np.isnan(X) | ||
if weights is not None and weights.shape == X.shape: | ||
if not sp.issparse(x): | ||
x = np.asanyarray(x) | ||
isnan = np.isnan(x) | ||
if weights is not None and weights.shape == x.shape: | ||
isnan = isnan * weights | ||
|
||
counts = isnan.sum(axis=axis, dtype=dtype, keepdims=keepdims) | ||
if weights is not None and weights.shape != X.shape: | ||
if weights is not None and weights.shape != x.shape: | ||
counts = counts * weights | ||
else: | ||
if any(attr is not None for attr in [axis, dtype]) or \ | ||
keepdims is not False: | ||
raise ValueError('Arguments axis, dtype and keepdims' | ||
'are not yet supported on sparse data!') | ||
assert axis in [None, 0, 1], 'Only axis 0 and 1 are currently supported' | ||
# To have consistent behaviour with dense matrices, raise error when | ||
# `axis=1` and the array is 1d (e.g. [[1 2 3]]) | ||
if x.shape[0] == 1 and axis == 1: | ||
raise ValueError('Axis %d is out of bounds' % axis) | ||
|
||
arr = x if axis == 1 else x.T | ||
|
||
if weights is not None: | ||
weights = weights if axis == 1 else weights.T | ||
|
||
arr = arr.tocsr() | ||
counts = _count_nans_per_row_sparse(arr, weights, dtype=dtype) | ||
|
||
# We want a scalar value if `axis=None` or if the sparse matrix is | ||
# actually a vector (e.g. [[1 2 3]]), but has `ndim=2` due to scipy | ||
# implementation | ||
if axis is None or x.shape[0] == 1: | ||
counts = counts.sum(dtype=dtype) | ||
|
||
counts = _count_nans_per_row_sparse(X, weights) | ||
return counts | ||
|
||
|
||
|
@@ -234,17 +335,12 @@ def weighted_mean(): | |
X.shape[0] - nans)) | ||
|
||
|
||
def _sparse_has_zeros(x): | ||
""" Check if sparse matrix contains any implicit zeros. """ | ||
return np.prod(x.shape) != x.nnz | ||
|
||
|
||
def _nan_min_max(x, func, axis=0): | ||
if not sp.issparse(x): | ||
return func(x, axis=axis) | ||
if axis is None: | ||
extreme = func(x.data, axis=axis) if x.nnz else float('nan') | ||
if _sparse_has_zeros(x): | ||
if sparse_has_zeros(x): | ||
extreme = func([0, extreme]) | ||
return extreme | ||
if axis == 0: | ||
|
@@ -257,7 +353,7 @@ def _nan_min_max(x, func, axis=0): | |
for row in x: | ||
values = row.data | ||
extreme = func(values) if values.size else float('nan') | ||
if _sparse_has_zeros(row): | ||
if sparse_has_zeros(row): | ||
extreme = func([0, extreme]) | ||
r.append(extreme) | ||
return np.array(r) | ||
|
@@ -323,7 +419,7 @@ def unique(x, return_counts=False): | |
if not sp.issparse(x): | ||
return np.unique(x, return_counts=return_counts) | ||
|
||
implicit_zeros = np.prod(x.shape) - x.nnz | ||
implicit_zeros = sparse_count_zeros(x) | ||
explicit_zeros = not np.all(x.data) | ||
r = np.unique(x.data, return_counts=return_counts) | ||
if not implicit_zeros: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should we perhaps rename this to `sparse_count_implicit_zeros` to make sure explicit zeros aren't counted?