-
-
Notifications
You must be signed in to change notification settings - Fork 18.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
is_bool_dtype for ExtensionArrays #22667
Changes from all commits
1f87ddd
47da6d3
9d4eab6
35f0575
412bd22
20b2add
27b8b68
c94d235
b9c45bd
4d09509
d8bd054
29b1370
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1619,6 +1619,11 @@ def is_bool_dtype(arr_or_dtype): | |
------- | ||
boolean : Whether or not the array or dtype is of a boolean dtype. | ||
|
||
Notes | ||
----- | ||
An ExtensionArray is considered boolean when the ``_is_boolean`` | ||
attribute is set to True. | ||
|
||
Examples | ||
-------- | ||
>>> is_bool_dtype(str) | ||
|
@@ -1635,6 +1640,8 @@ def is_bool_dtype(arr_or_dtype): | |
False | ||
>>> is_bool_dtype(np.array([True, False])) | ||
True | ||
>>> is_bool_dtype(pd.Categorical([True, False])) | ||
True | ||
""" | ||
|
||
if arr_or_dtype is None: | ||
|
@@ -1645,6 +1652,13 @@ def is_bool_dtype(arr_or_dtype): | |
# this isn't even a dtype | ||
return False | ||
|
||
if isinstance(arr_or_dtype, (ABCCategorical, ABCCategoricalIndex)): | ||
arr_or_dtype = arr_or_dtype.dtype | ||
|
||
if isinstance(arr_or_dtype, CategoricalDtype): | ||
arr_or_dtype = arr_or_dtype.categories | ||
# now we use the special definition for Index | ||
|
||
if isinstance(arr_or_dtype, ABCIndexClass): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Could be `elif` here. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Has to be an `if` (the block above rebinds `arr_or_dtype` to an Index that must still reach this check). |
||
|
||
# TODO(jreback) | ||
|
@@ -1653,6 +1667,9 @@ def is_bool_dtype(arr_or_dtype): | |
# guess this | ||
return (arr_or_dtype.is_object and | ||
arr_or_dtype.inferred_type == 'boolean') | ||
elif is_extension_array_dtype(arr_or_dtype): | ||
dtype = getattr(arr_or_dtype, 'dtype', arr_or_dtype) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Should use `is_bool_dtype`. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is `is_bool_dtype`. |
||
return dtype._is_boolean | ||
|
||
return issubclass(tipo, np.bool_) | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,108 @@ | ||
"""Rudimentary Apache Arrow-backed ExtensionArray. | ||
|
||
At the moment, just a boolean array / type is implemented. | ||
Eventually, we'll want to parametrize the type and support | ||
multiple dtypes. Not all methods are implemented yet, and the | ||
current implementation is not efficient. | ||
""" | ||
import copy | ||
jorisvandenbossche marked this conversation as resolved.
Show resolved
Hide resolved
|
||
import itertools | ||
|
||
import numpy as np | ||
import pyarrow as pa | ||
import pandas as pd | ||
from pandas.api.extensions import ( | ||
ExtensionDtype, ExtensionArray, take, register_extension_dtype | ||
) | ||
|
||
|
||
@register_extension_dtype
class ArrowBoolDtype(ExtensionDtype):
    """ExtensionDtype for the Arrow-backed boolean array in this module.

    The dtype is registered with pandas under the string name
    ``'arrow_bool'`` and reports itself as boolean through the
    ``_is_boolean`` attribute.
    """

    type = np.bool_
    kind = 'b'
    name = 'arrow_bool'
    na_value = pa.NULL

    @classmethod
    def construct_from_string(cls, string):
        """Return an instance when *string* equals ``cls.name``.

        Raises
        ------
        TypeError
            If *string* is anything other than ``cls.name``.
        """
        if string == cls.name:
            return cls()
        else:
            raise TypeError("Cannot construct a '{}' from "
                            "'{}'".format(cls, string))

    @classmethod
    def construct_array_type(cls):
        """Return the array class associated with this dtype."""
        return ArrowBoolArray

    # Must be a property: consumers read ``dtype._is_boolean`` as an
    # attribute.  As a plain method the lookup yields a bound method, which
    # is truthy regardless of what the method would actually return.
    @property
    def _is_boolean(self):
        """True, because arrays of this dtype hold boolean values."""
        return True
|
||
|
||
class ArrowBoolArray(ExtensionArray):
    """Rudimentary boolean ExtensionArray wrapping a ``pyarrow.ChunkedArray``.

    The Arrow data is kept in ``self._data``.  Most operations round-trip
    through ``to_pandas()``, so the implementation is simple rather than
    efficient.
    """

    def __init__(self, values):
        """Wrap *values*, which must be a boolean ``pyarrow.ChunkedArray``.

        Raises
        ------
        ValueError
            If *values* is not a ``pyarrow.ChunkedArray``.
        """
        if not isinstance(values, pa.ChunkedArray):
            raise ValueError(
                "values must be a pyarrow.ChunkedArray, got "
                "'{}'".format(type(values).__name__))

        assert values.type == pa.bool_()
        self._data = values
        self._dtype = ArrowBoolDtype()

    def __repr__(self):
        return "ArrowBoolArray({})".format(repr(self._data))

    @classmethod
    def from_scalars(cls, values):
        """Build an array from an iterable of scalar values."""
        arr = pa.chunked_array([pa.array(np.asarray(values))])
        return cls(arr)

    @classmethod
    def from_array(cls, arr):
        """Build an array from a single ``pyarrow.Array`` (one chunk)."""
        assert isinstance(arr, pa.Array)
        return cls(pa.chunked_array([arr]))

    @classmethod
    def _from_sequence(cls, scalars, dtype=None, copy=False):
        """Construct from a sequence; *dtype* and *copy* are ignored."""
        return cls.from_scalars(scalars)

    def __getitem__(self, item):
        """Return a scalar for a scalar key, otherwise a new array.

        Non-scalar keys (slices, masks, index arrays) are re-wrapped so the
        result keeps this array type instead of leaking the converted
        pandas/NumPy object.
        """
        selected = self._data.to_pandas()[item]
        if pd.api.types.is_scalar(item):
            return selected
        return type(self).from_scalars(selected)

    def __len__(self):
        return len(self._data)

    @property
    def dtype(self):
        """The ``ArrowBoolDtype`` instance created in ``__init__``."""
        return self._dtype

    @property
    def nbytes(self):
        """Sum of the sizes of every non-None buffer across all chunks."""
        return sum(x.size for chunk in self._data.chunks
                   for x in chunk.buffers()
                   if x is not None)

    def isna(self):
        """Return ``pd.isna`` evaluated on the converted data."""
        return pd.isna(self._data.to_pandas())

    def take(self, indices, allow_fill=False, fill_value=None):
        """Take elements by position, delegating to the pandas ``take``.

        When *allow_fill* is set and no *fill_value* is given, the dtype's
        ``na_value`` is used as the fill value.
        """
        data = self._data.to_pandas()

        if allow_fill and fill_value is None:
            fill_value = self.dtype.na_value

        result = take(data, indices, fill_value=fill_value,
                      allow_fill=allow_fill)
        return self._from_sequence(result, dtype=self.dtype)

    def copy(self, deep=False):
        """Return a copy of this array (deep-copied data when *deep*).

        The copied Arrow data is wrapped in a new instance so callers get
        the same array type back rather than a bare ``pyarrow.ChunkedArray``.
        """
        # ``copy`` here is the stdlib module: method names are not visible
        # as bare names inside a method body.
        if deep:
            duplicate = copy.deepcopy(self._data)
        else:
            duplicate = copy.copy(self._data)
        return type(self)(duplicate)

    # The signature takes ``cls``, so this has to be bound as a classmethod;
    # without the decorator ``cls`` would receive an instance.
    @classmethod
    def _concat_same_type(cls, to_concat):
        """Concatenate several arrays by chaining their Arrow chunks."""
        chunks = list(itertools.chain.from_iterable(x._data.chunks
                                                    for x in to_concat))
        arr = pa.chunked_array(chunks)
        return cls(arr)
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
import numpy as np | ||
import pytest | ||
import pandas as pd | ||
import pandas.util.testing as tm | ||
from pandas.tests.extension import base | ||
|
||
pytest.importorskip('pyarrow', minversion="0.10.0") | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Whoa, I didn't realize this worked now :> |
||
|
||
from .bool import ArrowBoolDtype, ArrowBoolArray | ||
|
||
|
||
@pytest.fixture
def dtype():
    """Provide a fresh ``ArrowBoolDtype`` instance to each test."""
    arrow_dtype = ArrowBoolDtype()
    return arrow_dtype
|
||
|
||
@pytest.fixture
def data():
    """Provide a length-100 ``ArrowBoolArray`` of random booleans."""
    random_bools = np.random.randint(0, 2, size=100, dtype=bool)
    return ArrowBoolArray.from_scalars(random_bools)
|
||
|
||
class BaseArrowTests(object):
    """Empty mixin shared by the Arrow extension test classes."""
|
||
|
||
class TestDtype(BaseArrowTests, base.BaseDtypeTests):
    """Run the shared dtype tests against ``ArrowBoolDtype``."""

    def test_array_type_with_arg(self, data, dtype):
        # Skipped unconditionally; the skip reason points at GH-22666.
        pytest.skip("GH-22666")
|
||
|
||
class TestInterface(BaseArrowTests, base.BaseInterfaceTests):
    """Run the shared interface tests against ``ArrowBoolArray``."""

    def test_repr(self, data):
        # ``pytest.skip`` raises on its own, so no ``raise`` is needed; this
        # also matches how the sibling test classes skip.
        pytest.skip("TODO")
|
||
|
||
class TestConstructors(BaseArrowTests, base.BaseConstructorsTests):
    """Run the shared constructor tests against ``ArrowBoolArray``."""

    def test_from_dtype(self, data):
        # Skipped unconditionally; the skip reason points at GH-22666.
        pytest.skip("GH-22666")
|
||
|
||
def test_is_bool_dtype(data):
    """The Arrow array counts as boolean and works as a boolean mask."""
    assert pd.api.types.is_bool_dtype(data)
    assert pd.core.common.is_bool_indexer(data)

    # Masking with the extension array must match masking with its
    # NumPy conversion.
    positions = pd.Series(range(len(data)))
    masked_by_array = positions[data]
    masked_by_ndarray = positions[np.asarray(data)]
    tm.assert_series_equal(masked_by_array, masked_by_ndarray)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this `if` clause necessary? E.g., an EA type cannot match `key.dtype == np.object_` (which actually should be
`is_object_dtype`).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It may be redundant with the
`is_array_like(key)`
check. But IIRC the tests here were fairly light, and I don't want to risk breaking the old behavior.