From bfefc96913a0bafeb19172b95c30e5dc09d4ad48 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 8 Nov 2018 14:36:48 -0600 Subject: [PATCH 01/36] added array --- doc/source/api.rst | 50 +++++++++++++ doc/source/whatsnew/v0.24.0.txt | 12 ++++ pandas/core/api.py | 19 ++++- pandas/core/arrays/__init__.py | 1 + pandas/core/arrays/array_.py | 52 ++++++++++++++ pandas/core/arrays/interval.py | 2 + pandas/core/dtypes/dtypes.py | 9 ++- pandas/tests/api/test_api.py | 10 ++- pandas/tests/arrays/test_array.py | 93 +++++++++++++++++++++++++ pandas/tests/arrays/test_period.py | 12 +++- pandas/tests/extension/decimal/array.py | 2 + 11 files changed, 255 insertions(+), 7 deletions(-) create mode 100644 pandas/core/arrays/array_.py create mode 100644 pandas/tests/arrays/test_array.py diff --git a/doc/source/api.rst b/doc/source/api.rst index 665649aead33c..201d932ea9730 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -702,6 +702,19 @@ strings and apply several methods to it. These can be accessed like Series.dt Index.str + +.. _api.arrays: + +Arrays +------ + +Pandas and third-party libraries can extend NumPy's type system (see :ref:`extending.extension-types`). + +.. autosummary:: + :toctree: generated/ + + array + .. _api.categorical: Categorical @@ -790,6 +803,42 @@ following usable methods and properties: Series.cat.as_ordered Series.cat.as_unordered +.. _api.interval: + +Interval +~~~~~~~~ + +:class:`IntervalArray` is an array for storing data representing intervals. +The scalar type is a :class:`Interval`. These may be stored in a :class:`Series` +or as a :class:`IntervalIndex`. :class:`IntervalArray` can be closed on the +left, right, or both, or neither sides. + +.. currentmodule:: pandas + +.. autosummary:: + :toctree: generated/ + :template: autosummary/class_without_autosummary.rst + + IntervalArray + +.. 
autosummary:: + :toctree: generated/ + + IntervalArray.from_arrays + IntervalArray.from_tuples + IntervalArray.from_breaks + IntervalArray.contains + IntervalArray.left + IntervalArray.right + IntervalArray.mid + IntervalArray.closed + IntervalArray.length + IntervalArray.values + IntervalArray.is_non_overlapping_monotonic + IntervalArray.set_closed + IntervalArray.overlaps + IntervalArray.to_tuples + Plotting ~~~~~~~~ @@ -1675,6 +1724,7 @@ IntervalIndex Components IntervalIndex.get_indexer IntervalIndex.set_closed IntervalIndex.overlaps + IntervalArray.to_tuples .. _api.multiindex: diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 5fefb9e3e405c..dab55611f4c8d 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -101,6 +101,18 @@ Reduction and groupby operations such as 'sum' work. The Integer NA support currently uses the captilized dtype version, e.g. ``Int8`` as compared to the traditional ``int8``. This may be changed at a future date. +.. _whatsnew_0240.enhancements.array: + +A new top-level method :func:`array` has been added for creating arrays (:issue:`22860`). +This can be used to create NumPy arrays, or any :ref:`extension array `, including +extension arrays registered by :ref:`3rd party libraries `. + +.. ipython:: python + + pd.array([1, 2, np.nan], dtype='Int64') + pd.array(['a', 'b', 'c'], dtype='category') + pd.array([1, 2]) + .. 
_whatsnew_0240.enhancements.read_html: ``read_html`` Enhancements diff --git a/pandas/core/api.py b/pandas/core/api.py index ad35b647ac458..afc929c39086c 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -4,9 +4,26 @@ import numpy as np +from pandas.core.arrays import IntervalArray +from pandas.core.arrays.integer import ( + Int8Dtype, + Int16Dtype, + Int32Dtype, + Int64Dtype, + UInt8Dtype, + UInt16Dtype, + UInt32Dtype, + UInt64Dtype, +) from pandas.core.algorithms import factorize, unique, value_counts from pandas.core.dtypes.missing import isna, isnull, notna, notnull -from pandas.core.arrays import Categorical +from pandas.core.dtypes.dtypes import ( + CategoricalDtype, + PeriodDtype, + IntervalDtype, + DatetimeTZDtype, +) +from pandas.core.arrays import Categorical, array from pandas.core.groupby import Grouper from pandas.io.formats.format import set_eng_float_format from pandas.core.index import (Index, CategoricalIndex, Int64Index, diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py index ea8837332633a..850b0a028fb5b 100644 --- a/pandas/core/arrays/__init__.py +++ b/pandas/core/arrays/__init__.py @@ -1,3 +1,4 @@ +from .array_ import array # noqa from .base import (ExtensionArray, # noqa ExtensionOpsMixin, ExtensionScalarOpsMixin) diff --git a/pandas/core/arrays/array_.py b/pandas/core/arrays/array_.py new file mode 100644 index 0000000000000..9da54f780fc9a --- /dev/null +++ b/pandas/core/arrays/array_.py @@ -0,0 +1,52 @@ +import numpy as np + +from pandas.core.dtypes.dtypes import registry +from pandas.core.dtypes.common import is_extension_array_dtype +from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries + + +def array(data, dtype=None, copy=False): + """ + Create an array. + + Parameters + ---------- + data : Sequence[object] + A sequence of scalar instances for `dtype`. The underlying + array will be extracted from a Series or Index object. 
+ + dtype : Union[str, np.dtype, ExtensionDtype], optional + The dtype to use for the array. This may be a NumPy + dtype, or an extension type registered with pandas using + :meth:`pandas.api.extensions.register_extension_dtype`. + + By default, the dtype will be inferred from the data + with :meth:`numpy.array`. + + copy : bool, default False + Whether to copy the data. + + Returns + ------- + Array : Union[ndarray, ExtensionArray] + + Examples + -------- + >>> pd.array([1, 2]) + array([1, 2]) + + >>> pd.array(['a', 'b', 'a'], dtype='category') + [a, b, a] + Categories (2, object): [a, b] + """ + if isinstance(data, (ABCSeries, ABCIndexClass)): + data = data._values + + # this returns None for not-found dtypes. + dtype = registry.find(dtype) or dtype + + if is_extension_array_dtype(dtype): + cls = dtype.construct_array_type() + return cls._from_sequence(data, dtype=dtype, copy=copy) + + return np.array(data, dtype=dtype, copy=copy) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 94be29893d2b9..a791a81b2d943 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -81,7 +81,9 @@ from_arrays from_tuples from_breaks +overlaps set_closed +to_tuples %(extra_methods)s\ %(examples)s\ diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 4dfefdec031b2..50df5c196a8bf 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -39,7 +39,12 @@ class Registry(object): Registry for dtype inference The registry allows one to map a string repr of a extension - dtype to an extenstion dtype. + dtype to an extension dtype. The string alias can be used in several + places, including + + * Series and Index constructors + * :meth:`pandas.array` + * :meth:`pandas.Series.astype` Multiple extension types can be registered. These are tried in order. 
@@ -592,6 +597,7 @@ def __eq__(self, other): str(self.tz) == str(other.tz)) +@register_extension_dtype class PeriodDtype(ExtensionDtype, PandasExtensionDtype): """ A Period duck-typed class, suitable for holding a period with freq dtype. @@ -854,4 +860,3 @@ def is_dtype(cls, dtype): _pandas_registry = Registry() _pandas_registry.register(DatetimeTZDtype) -_pandas_registry.register(PeriodDtype) diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index da894a0881400..041bd3a3d9e68 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -45,7 +45,13 @@ class TestPDApi(Base): 'Period', 'PeriodIndex', 'RangeIndex', 'UInt64Index', 'Series', 'SparseArray', 'SparseDataFrame', 'SparseDtype', 'SparseSeries', 'Timedelta', - 'TimedeltaIndex', 'Timestamp', 'Interval', 'IntervalIndex'] + 'TimedeltaIndex', 'Timestamp', 'Interval', 'IntervalIndex', + 'IntervalArray', + 'CategoricalDtype', 'PeriodDtype', 'IntervalDtype', + 'DatetimeTZDtype', + 'Int8Dtype', 'Int16Dtype', 'Int32Dtype', 'Int64Dtype', + 'UInt8Dtype', 'UInt16Dtype', 'UInt32Dtype', 'UInt64Dtype', + ] # these are already deprecated; awaiting removal deprecated_classes = ['TimeGrouper'] @@ -57,7 +63,7 @@ class TestPDApi(Base): modules = ['np', 'datetime'] # top-level functions - funcs = ['bdate_range', 'concat', 'crosstab', 'cut', + funcs = ['array', 'bdate_range', 'concat', 'crosstab', 'cut', 'date_range', 'interval_range', 'eval', 'factorize', 'get_dummies', 'infer_freq', 'isna', 'isnull', 'lreshape', diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py new file mode 100644 index 0000000000000..e0dbf5f9f658f --- /dev/null +++ b/pandas/tests/arrays/test_array.py @@ -0,0 +1,93 @@ +import decimal + +import numpy as np +import pytest + +import pandas as pd +import pandas.util.testing as tm +from pandas.core.dtypes.dtypes import registry +from pandas.api.extensions import register_extension_dtype +from pandas.core.arrays import period_array, 
integer_array +from pandas.tests.extension.decimal import ( + to_decimal, DecimalArray, DecimalDtype +) + + +@pytest.mark.parametrize("data, dtype, expected", [ + ([1, 2], None, np.array([1, 2])), + ([1, 2], object, np.array([1, 2], dtype=object)), + ([1, 2], 'float32', np.array([1., 2.0], dtype=np.dtype('float32'))), + ([pd.Period('2000', 'D'), pd.Period('2001', 'D')], 'Period[D]', + period_array(['2000', '2001'], freq='D')), + ([pd.Period('2000', 'D')], pd.PeriodDtype('D'), + period_array(['2000'], freq='D')), + ([1, 2], 'datetime64[ns]', np.array([1, 2], dtype='datetime64[ns]')), + (['a', 'b'], 'category', pd.Categorical(['a', 'b'])), + (['a', 'b'], pd.CategoricalDtype(None, ordered=True), + pd.Categorical(['a', 'b'], ordered=True)), + ([pd.Interval(1, 2), pd.Interval(3, 4)], 'interval', + pd.IntervalArray.from_tuples([(1, 2), (3, 4)])), + ([0, 1], 'Sparse[int]', pd.SparseArray([0, 1])), + ([1, None], 'Int16', integer_array([1, None], dtype='Int16')), + + # "3rd party" EAs work + ([decimal.Decimal(0), decimal.Decimal(1)], 'decimal', to_decimal([0, 1])), +]) +def test_array(data, dtype, expected): + result = pd.array(data, dtype=dtype) + tm.assert_equal(result, expected) + + +# --------------------------------------------------------------------------- +# A couple dummy classes to ensure that Series and Indexes are unboxed before +# getting to the EA classes. 
+ + +@register_extension_dtype +class DecimalDtype2(DecimalDtype): + name = 'decimal2' + + @classmethod + def construct_array_type(cls): + return DecimalArray2 + + +class DecimalArray2(DecimalArray): + @classmethod + def _from_sequence(cls, scalars, dtype=None, copy=False): + if isinstance(scalars, (pd.Series, pd.Index)): + raise TypeError + + return super(DecimalArray2, cls)._from_sequence( + scalars, dtype=dtype, copy=copy + ) + + +@pytest.mark.parametrize("box", [pd.Series, pd.Index]) +def test_array_unboxes(box): + data = box([decimal.Decimal('1'), decimal.Decimal('2')]) + # make sure it works + with pytest.raises(TypeError): + DecimalArray2._from_sequence(data) + + result = pd.array(data, dtype='decimal2') + expected = DecimalArray2._from_sequence(data.values) + tm.assert_equal(result, expected) + + +@pytest.fixture +def registry_without_decimal(): + idx = registry.dtypes.index(DecimalDtype) + registry.dtypes.pop(idx) + yield + registry.dtypes.append(DecimalDtype) + + +def test_array_not_registered(registry_without_decimal): + # check we aren't on it + assert registry.find('decimal') is None + data = [decimal.Decimal('1'), decimal.Decimal('2')] + + result = pd.array(data, dtype=DecimalDtype) + expected = DecimalArray._from_sequence(data) + tm.assert_equal(result, expected) diff --git a/pandas/tests/arrays/test_period.py b/pandas/tests/arrays/test_period.py index 0125729048cdd..c67f622ac6bce 100644 --- a/pandas/tests/arrays/test_period.py +++ b/pandas/tests/arrays/test_period.py @@ -5,15 +5,23 @@ from pandas._libs.tslibs.period import IncompatibleFrequency from pandas.core.dtypes.common import pandas_dtype -from pandas.core.dtypes.dtypes import PeriodDtype +from pandas.core.dtypes.dtypes import PeriodDtype, registry import pandas as pd from pandas.core.arrays import PeriodArray, period_array import pandas.util.testing as tm + # ---------------------------------------------------------------------------- -# Constructors +# Dtype + +def test_registered(): + 
assert PeriodDtype in registry.dtypes + result = registry.find("Period[D]") + expected = PeriodDtype("D") + assert result == expected +# ---------------------------------------------------------------------------- # period_array diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 3c8905c578c4f..c682020766cc0 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -8,9 +8,11 @@ from pandas.core.dtypes.base import ExtensionDtype import pandas as pd +from pandas.api.extensions import register_extension_dtype from pandas.core.arrays import ExtensionArray, ExtensionScalarOpsMixin +@register_extension_dtype class DecimalDtype(ExtensionDtype): type = decimal.Decimal name = 'decimal' From dcb7931e2a1febb419ccec9957436ca2a2bb572f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 9 Nov 2018 09:46:05 -0600 Subject: [PATCH 02/36] update registry test --- pandas/tests/dtypes/test_dtypes.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index 2927442f9b6ee..33b5bd7a705b2 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -788,12 +788,12 @@ def test_update_dtype_errors(self, bad_dtype): @pytest.mark.parametrize( 'dtype', - [CategoricalDtype, IntervalDtype]) + [CategoricalDtype, IntervalDtype, PeriodDtype]) def test_registry(dtype): assert dtype in registry.dtypes -@pytest.mark.parametrize('dtype', [DatetimeTZDtype, PeriodDtype]) +@pytest.mark.parametrize('dtype', [DatetimeTZDtype]) def test_pandas_registry(dtype): assert dtype not in registry.dtypes assert dtype in _pandas_registry.dtypes @@ -805,6 +805,7 @@ def test_pandas_registry(dtype): ('interval', IntervalDtype()), ('interval[int64]', IntervalDtype()), ('interval[datetime64[ns]]', IntervalDtype('datetime64[ns]')), + ('period[D]', PeriodDtype('D')), ('category', CategoricalDtype())]) def 
test_registry_find(dtype, expected): assert registry.find(dtype) == expected @@ -812,8 +813,7 @@ def test_registry_find(dtype, expected): @pytest.mark.parametrize( 'dtype, expected', - [('period[D]', PeriodDtype('D')), - ('datetime64[ns, US/Eastern]', DatetimeTZDtype('ns', 'US/Eastern'))]) + [('datetime64[ns, US/Eastern]', DatetimeTZDtype('ns', 'US/Eastern'))]) def test_pandas_registry_find(dtype, expected): assert _pandas_registry.find(dtype) == expected From a6356491641ac7bc9082f28c70fb552e7dc6c653 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 9 Nov 2018 09:50:38 -0600 Subject: [PATCH 03/36] update doc examples --- pandas/core/arrays/array_.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/array_.py b/pandas/core/arrays/array_.py index 9da54f780fc9a..3df1cd032d326 100644 --- a/pandas/core/arrays/array_.py +++ b/pandas/core/arrays/array_.py @@ -1,7 +1,7 @@ import numpy as np -from pandas.core.dtypes.dtypes import registry from pandas.core.dtypes.common import is_extension_array_dtype +from pandas.core.dtypes.dtypes import registry from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries @@ -32,12 +32,29 @@ def array(data, dtype=None, copy=False): Examples -------- + If a dtype is not specified, `data` is passed through to + :meth:`numpy.array`, and an ndarray is returned. + >>> pd.array([1, 2]) array([1, 2]) + Or the NumPy dtype can be specified + + >>> pd.array([1, 2], dtype=np.int32) + array([1, 2], dtype=int32) + + You can use the string alias for `dtype` + >>> pd.array(['a', 'b', 'a'], dtype='category') [a, b, a] Categories (2, object): [a, b] + + Or specify the actual dtype + + >>> pd.array(['a', 'b', 'a'], + ... 
dtype=pd.CategoricalDtype(['a', 'b', 'c'], ordered=True)) + [a, b, a] + Categories (3, object): [a < b < c] """ if isinstance(data, (ABCSeries, ABCIndexClass)): data = data._values From fb0d8bc8f5a6e54439f30fbbc6e71a714ffcdfc2 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 9 Nov 2018 10:16:17 -0600 Subject: [PATCH 04/36] wip --- doc/source/api.rst | 29 ++++++++++++++++++++++++++++- pandas/arrays/__init__.py | 17 +++++++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) create mode 100644 pandas/arrays/__init__.py diff --git a/doc/source/api.rst b/doc/source/api.rst index 201d932ea9730..e4cac3e924dc2 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -803,7 +803,20 @@ following usable methods and properties: Series.cat.as_ordered Series.cat.as_unordered -.. _api.interval: +.. _api.arrays.integerna: + +Integer-NA +~~~~~~~~~~ + +:class:`arrays.IntegerArray` can hold integer data, potentially with missing +values. + +.. autosummary:: + :toctree: generated/ + + IntegerArray + +.. _api.arrays.interval: Interval ~~~~~~~~ @@ -839,6 +852,20 @@ left, right, or both, or neither sides. IntervalArray.overlaps IntervalArray.to_tuples +.. _api.arrays.period: + +.. autosummary:: + :toctree: generated/ + + PeriodArray + +.. _api.arrays.sparse: + +.. autosummary:: + :toctree: generated/ + + SparseArray + Plotting ~~~~~~~~ diff --git a/pandas/arrays/__init__.py b/pandas/arrays/__init__.py new file mode 100644 index 0000000000000..472ba80bdd909 --- /dev/null +++ b/pandas/arrays/__init__.py @@ -0,0 +1,17 @@ +""" +All of pandas' ExtensionArrays and ExtensionDtypes. + +See :ref:`extending.extension-types` for more. 
+""" +from pandas.core.arrays import ( + IntervalArray, PeriodArray, Categorical, SparseArray, IntegerArray, +) + + +__all__ = [ + 'Categorical', + 'IntegerArray', + 'IntervalArray', + 'PeriodArray', + 'SparseArray', +] From fe06de4ef51f6a8c78888b8e69c42efa6aa107a6 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 9 Nov 2018 16:27:27 -0600 Subject: [PATCH 05/36] inference --- pandas/core/arrays/array_.py | 15 +++++++++++++++ pandas/tests/arrays/test_array.py | 28 ++++++++++++++++++++++------ 2 files changed, 37 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/array_.py b/pandas/core/arrays/array_.py index 3df1cd032d326..2b2941e582ad0 100644 --- a/pandas/core/arrays/array_.py +++ b/pandas/core/arrays/array_.py @@ -1,5 +1,7 @@ import numpy as np +from pandas._libs import lib, tslibs + from pandas.core.dtypes.common import is_extension_array_dtype from pandas.core.dtypes.dtypes import registry from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries @@ -56,6 +58,8 @@ def array(data, dtype=None, copy=False): [a, b, a] Categories (3, object): [a < b < c] """ + from pandas.core.arrays import period_array + if isinstance(data, (ABCSeries, ABCIndexClass)): data = data._values @@ -66,4 +70,15 @@ def array(data, dtype=None, copy=False): cls = dtype.construct_array_type() return cls._from_sequence(data, dtype=dtype, copy=copy) + if dtype is None: + inferred_dtype = lib.infer_dtype(data) + if inferred_dtype == 'period': + try: + return period_array(data) + except tslibs.IncompatibleFrequency: + pass # we return an array below. 
+ + # TODO(DatetimeArray): handle this type + # TODO(BooleanArray): handle this type + return np.array(data, dtype=dtype, copy=copy) diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index e0dbf5f9f658f..cfbe69d24341d 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -1,16 +1,16 @@ import decimal import numpy as np -import pytest -import pandas as pd -import pandas.util.testing as tm from pandas.core.dtypes.dtypes import registry + +import pandas as pd from pandas.api.extensions import register_extension_dtype -from pandas.core.arrays import period_array, integer_array +from pandas.core.arrays import integer_array, period_array from pandas.tests.extension.decimal import ( - to_decimal, DecimalArray, DecimalDtype -) + DecimalArray, DecimalDtype, to_decimal) +import pandas.util.testing as tm +import pytest @pytest.mark.parametrize("data, dtype, expected", [ @@ -38,6 +38,22 @@ def test_array(data, dtype, expected): tm.assert_equal(result, expected) +@pytest.mark.parametrize('data, expected', [ + ([pd.Period("2000", "D"), pd.Period("2001", "D")], + period_array(["2000", "2001"], freq="D")), +]) +def test_array_inference(data, expected): + result = pd.array(data) + tm.assert_equal(result, expected) + + +def test_array_inference_period_fails(): + data = [pd.Period("2000", "D"), pd.Period("2001", "A")] + result = pd.array(data) + expected = np.array(data, dtype=object) + tm.assert_numpy_array_equal(result, expected) + + # --------------------------------------------------------------------------- # A couple dummy classes to ensure that Series and Indexes are unboxed before # getting to the EA classes. 
From 72f7f067f4f9c6aeda05dd57b8b536f80de53417 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 9 Nov 2018 16:37:57 -0600 Subject: [PATCH 06/36] ia updates --- doc/source/api.rst | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) diff --git a/doc/source/api.rst b/doc/source/api.rst index e4cac3e924dc2..1f1a5b7cda820 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -830,35 +830,22 @@ left, right, or both, or neither sides. .. autosummary:: :toctree: generated/ - :template: autosummary/class_without_autosummary.rst IntervalArray -.. autosummary:: - :toctree: generated/ - - IntervalArray.from_arrays - IntervalArray.from_tuples - IntervalArray.from_breaks - IntervalArray.contains - IntervalArray.left - IntervalArray.right - IntervalArray.mid - IntervalArray.closed - IntervalArray.length - IntervalArray.values - IntervalArray.is_non_overlapping_monotonic - IntervalArray.set_closed - IntervalArray.overlaps - IntervalArray.to_tuples - .. _api.arrays.period: +Period +~~~~~~ + .. autosummary:: :toctree: generated/ PeriodArray +Sparse +~~~~~~ + .. _api.arrays.sparse: .. autosummary:: @@ -1982,6 +1969,8 @@ Methods PeriodIndex.strftime PeriodIndex.to_timestamp +.. 
api.scalars: + Scalars ------- From c02e183aef01ae903fe9e13460c6c6fb66953f71 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 9 Nov 2018 21:17:27 -0600 Subject: [PATCH 07/36] test fixup --- pandas/tests/arrays/test_array.py | 2 +- pandas/tests/arrays/test_period.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index cfbe69d24341d..7a74f7638f206 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -27,7 +27,7 @@ pd.Categorical(['a', 'b'], ordered=True)), ([pd.Interval(1, 2), pd.Interval(3, 4)], 'interval', pd.IntervalArray.from_tuples([(1, 2), (3, 4)])), - ([0, 1], 'Sparse[int]', pd.SparseArray([0, 1])), + ([0, 1], 'Sparse[int64]', pd.SparseArray([0, 1], dtype='int64')), ([1, None], 'Int16', integer_array([1, None], dtype='Int16')), # "3rd party" EAs work diff --git a/pandas/tests/arrays/test_period.py b/pandas/tests/arrays/test_period.py index c67f622ac6bce..807680cb8f292 100644 --- a/pandas/tests/arrays/test_period.py +++ b/pandas/tests/arrays/test_period.py @@ -1,5 +1,4 @@ import numpy as np -import pytest from pandas._libs.tslibs import iNaT from pandas._libs.tslibs.period import IncompatibleFrequency @@ -10,11 +9,12 @@ import pandas as pd from pandas.core.arrays import PeriodArray, period_array import pandas.util.testing as tm - +import pytest # ---------------------------------------------------------------------------- # Dtype + def test_registered(): assert PeriodDtype in registry.dtypes result = registry.find("Period[D]") From a2d314674f0f07a65596a4c5465601ef966b9dba Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 9 Nov 2018 21:27:07 -0600 Subject: [PATCH 08/36] isort --- pandas/tests/arrays/test_period.py | 2 +- setup.cfg | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/arrays/test_period.py b/pandas/tests/arrays/test_period.py index 807680cb8f292..1b5cbbc17d3cb 100644 --- 
a/pandas/tests/arrays/test_period.py +++ b/pandas/tests/arrays/test_period.py @@ -1,4 +1,5 @@ import numpy as np +import pytest from pandas._libs.tslibs import iNaT from pandas._libs.tslibs.period import IncompatibleFrequency @@ -9,7 +10,6 @@ import pandas as pd from pandas.core.arrays import PeriodArray, period_array import pandas.util.testing as tm -import pytest # ---------------------------------------------------------------------------- # Dtype diff --git a/setup.cfg b/setup.cfg index 4726a0ddb2fb2..2e07182196d5b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -90,7 +90,7 @@ known_post_core=pandas.tseries,pandas.io,pandas.plotting sections=FUTURE,STDLIB,THIRDPARTY,PRE_CORE,DTYPES,FIRSTPARTY,POST_CORE,LOCALFOLDER known_first_party=pandas -known_third_party=Cython,numpy,python-dateutil,pytz,pyarrow +known_third_party=Cython,numpy,python-dateutil,pytz,pyarrow,pytest multi_line_output=4 force_grid_wrap=0 combine_as_imports=True From 37901b0d68880bc089304d3f596df519d5ff9684 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 10 Nov 2018 06:17:18 -0600 Subject: [PATCH 09/36] fixups --- pandas/core/arrays/array_.py | 2 ++ pandas/tests/arrays/test_array.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/array_.py b/pandas/core/arrays/array_.py index 2b2941e582ad0..b21362fc572e6 100644 --- a/pandas/core/arrays/array_.py +++ b/pandas/core/arrays/array_.py @@ -11,6 +11,8 @@ def array(data, dtype=None, copy=False): """ Create an array. + .. 
versionadded:: 0.24.0 + Parameters ---------- data : Sequence[object] diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index 7a74f7638f206..530ad061ee2de 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -1,6 +1,7 @@ import decimal import numpy as np +import pytest from pandas.core.dtypes.dtypes import registry @@ -10,7 +11,6 @@ from pandas.tests.extension.decimal import ( DecimalArray, DecimalDtype, to_decimal) import pandas.util.testing as tm -import pytest @pytest.mark.parametrize("data, dtype, expected", [ From 9401dd30e776325581d88c2be5f035e1d983f3cb Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 12 Nov 2018 06:58:02 -0600 Subject: [PATCH 10/36] wip --- doc/source/api.rst | 11 ++++++++++- doc/source/whatsnew/v0.24.0.txt | 8 ++++++++ pandas/core/arrays/array_.py | 17 +++++++++++++++-- pandas/tests/extension/base/constructors.py | 11 +++++++++++ 4 files changed, 44 insertions(+), 3 deletions(-) diff --git a/doc/source/api.rst b/doc/source/api.rst index 1f1a5b7cda820..cc496fa883081 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -824,7 +824,8 @@ Interval :class:`IntervalArray` is an array for storing data representing intervals. The scalar type is a :class:`Interval`. These may be stored in a :class:`Series` or as a :class:`IntervalIndex`. :class:`IntervalArray` can be closed on the -left, right, or both, or neither sides. +``'left'``, ``'right'``, or ``'both'``, or ``'neither'`` sides. +See :ref:`indexing.intervallindex` for more. .. currentmodule:: pandas @@ -838,6 +839,10 @@ left, right, or both, or neither sides. Period ~~~~~~ +Periods represent a span of time (e.g. the year 2000, or the hour from 11:00 to 12:00 +on January 1st, 2000). A collection of :class:`Period` objects with a common frequency +can be collected in a :class:`PeriodArray`. See :ref:`timeseries.periods` for more. + .. 
autosummary:: :toctree: generated/ @@ -846,6 +851,10 @@ Period Sparse ~~~~~~ +Sparse data may be stored and operated on more efficiently when there is a single value +that's often repeated. :class:`SparseArray` is a container for this type of data. +See :ref:`sparse` for more. + .. _api.arrays.sparse: .. autosummary:: diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index b4464146376e4..21912e2d1393b 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -113,6 +113,14 @@ extension arrays registered by :ref:`3rd party libraries ` pd.array(['a', 'b', 'c'], dtype='category') pd.array([1, 2]) +Notice that the default return value, if no ``dtype`` is specified, is a NumPy array. So +the first example of ``[1, 2, np.nan]`` will return a floating-point NumPy array, +since ``NaN`` is a float. + +.. ipython:: python + + pd.array([1, 2, np.nan]) + .. _whatsnew_0240.enhancements.read_html: ``read_html`` Enhancements diff --git a/pandas/core/arrays/array_.py b/pandas/core/arrays/array_.py index b21362fc572e6..ef703f939da03 100644 --- a/pandas/core/arrays/array_.py +++ b/pandas/core/arrays/array_.py @@ -37,7 +37,7 @@ def array(data, dtype=None, copy=False): Examples -------- If a dtype is not specified, `data` is passed through to - :meth:`numpy.array`, and an ndarray is returned. + :meth:`numpy.array`, and an ``ndarray`` is returned. >>> pd.array([1, 2]) array([1, 2]) @@ -59,6 +59,19 @@ def array(data, dtype=None, copy=False): ... dtype=pd.CategoricalDtype(['a', 'b', 'c'], ordered=True)) [a, b, a] Categories (3, object): [a < b < c] + + Because omitting the `dtype` passes the data through to NumPy, + a mixture of valid integers and NA will return a floating-point + NumPy array. 
+ + >>> pd.array([1, 2, np.nan]) + array([ 1., 2., nan]) + + To use pandas' nullable :class:`pandas.arrays.IntegerArray`, specify + the dtype: + + >>> pd.array([1, 2, np.nan], dtype='Int64') + IntegerArray([1, 2, nan], dtype='Int64') """ from pandas.core.arrays import period_array @@ -66,7 +79,7 @@ def array(data, dtype=None, copy=False): data = data._values # this returns None for not-found dtypes. - dtype = registry.find(dtype) or dtype + dtype = dtype or registry.find(dtype) if is_extension_array_dtype(dtype): cls = dtype.construct_array_type() diff --git a/pandas/tests/extension/base/constructors.py b/pandas/tests/extension/base/constructors.py index 3b966cd8d4774..a7601635b3b4e 100644 --- a/pandas/tests/extension/base/constructors.py +++ b/pandas/tests/extension/base/constructors.py @@ -1,5 +1,7 @@ import pytest +from pandas.core.dtypes.dtypes import registry + import pandas as pd from pandas.core.internals import ExtensionBlock @@ -55,3 +57,12 @@ def test_from_dtype(self, data): result = pd.Series(list(data), dtype=str(dtype)) self.assert_series_equal(result, expected) + + def test_pandas_array(self, data): + if registry.find(data.dtype) is not None: + # they've registered, so pd.array should work. 
+ result = pd.array(data, dtype=data.dtype) + expected = type(data)._from_sequence(data, dtype=data.dtype) + self.assert_extension_array_equal(result, expected) + else: + raise pytest.skip("dtype not registered.") From 838ce5eb5a6ddefaae815c30abb5b9e7aff3a0cc Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 12 Nov 2018 07:10:20 -0600 Subject: [PATCH 11/36] dtype from ea --- pandas/core/arrays/array_.py | 17 ++++++++++++++--- pandas/tests/extension/base/constructors.py | 18 +++++++++--------- 2 files changed, 23 insertions(+), 12 deletions(-) diff --git a/pandas/core/arrays/array_.py b/pandas/core/arrays/array_.py index ef703f939da03..9879827c1d91d 100644 --- a/pandas/core/arrays/array_.py +++ b/pandas/core/arrays/array_.py @@ -24,8 +24,16 @@ def array(data, dtype=None, copy=False): dtype, or an extension type registered with pandas using :meth:`pandas.api.extensions.register_extension_dtype`. - By default, the dtype will be inferred from the data - with :meth:`numpy.array`. + If not specified, there are two possibilities: + + 1. When `data` is a :class:`Series`, :class:`Index`, or + :class:`ExtensionArray`, the `dtype` will be taken + from the data. + 2. Otherwise, pandas will attempt to infer the `dtype` + from the data. + + In particular, note that when `data` is a NumPy, ``data.dtype`` + is ignored. copy : bool, default False Whether to copy the data. @@ -73,11 +81,14 @@ def array(data, dtype=None, copy=False): >>> pd.array([1, 2, np.nan], dtype='Int64') IntegerArray([1, 2, nan], dtype='Int64') """ - from pandas.core.arrays import period_array + from pandas.core.arrays import period_array, ExtensionArray if isinstance(data, (ABCSeries, ABCIndexClass)): data = data._values + if dtype is None and isinstance(data, ExtensionArray): + dtype = data.dtype + # this returns None for not-found dtypes. 
dtype = dtype or registry.find(dtype) diff --git a/pandas/tests/extension/base/constructors.py b/pandas/tests/extension/base/constructors.py index a7601635b3b4e..30c49a326be69 100644 --- a/pandas/tests/extension/base/constructors.py +++ b/pandas/tests/extension/base/constructors.py @@ -1,7 +1,6 @@ +import numpy as np import pytest -from pandas.core.dtypes.dtypes import registry - import pandas as pd from pandas.core.internals import ExtensionBlock @@ -59,10 +58,11 @@ def test_from_dtype(self, data): self.assert_series_equal(result, expected) def test_pandas_array(self, data): - if registry.find(data.dtype) is not None: - # they've registered, so pd.array should work. - result = pd.array(data, dtype=data.dtype) - expected = type(data)._from_sequence(data, dtype=data.dtype) - self.assert_extension_array_equal(result, expected) - else: - raise pytest.skip("dtype not registered.") + # pd.array(extension_array) should be idempotent... + result = pd.array(data) + self.assert_extension_array_equal(result, data) + + def test_pandas_array_dtype(self, data): + # ... but specifying dtype will override idempotency + result = pd.array(data, dtype=object) + self.assert_equal(result, np.asarray(data, dtype=object)) From 5260b990816c545d6be5d2f7e37fdfa49cf141e3 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 12 Nov 2018 07:16:44 -0600 Subject: [PATCH 12/36] series, index tests --- pandas/core/arrays/array_.py | 3 ++- pandas/tests/arrays/test_array.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/array_.py b/pandas/core/arrays/array_.py index 9879827c1d91d..00aaf57535a68 100644 --- a/pandas/core/arrays/array_.py +++ b/pandas/core/arrays/array_.py @@ -90,7 +90,8 @@ def array(data, dtype=None, copy=False): dtype = data.dtype # this returns None for not-found dtypes. 
- dtype = dtype or registry.find(dtype) + if dtype is not None: + dtype = registry.find(dtype) or dtype if is_extension_array_dtype(dtype): cls = dtype.construct_array_type() diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index 530ad061ee2de..1163cd73ba7af 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -29,7 +29,8 @@ pd.IntervalArray.from_tuples([(1, 2), (3, 4)])), ([0, 1], 'Sparse[int64]', pd.SparseArray([0, 1], dtype='int64')), ([1, None], 'Int16', integer_array([1, None], dtype='Int16')), - + (pd.Series([1, 2]), None, np.array([1, 2], dtype=np.int64)), + (pd.Index([1, 2]), None, np.array([1, 2], dtype=np.int64)), # "3rd party" EAs work ([decimal.Decimal(0), decimal.Decimal(1)], 'decimal', to_decimal([0, 1])), ]) From cf07c808b6dced16630bd2eb9d19723d007b0d08 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 12 Nov 2018 07:19:52 -0600 Subject: [PATCH 13/36] added ndarray case --- pandas/tests/arrays/test_array.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index 1163cd73ba7af..7300fa49f2c8f 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -17,6 +17,7 @@ ([1, 2], None, np.array([1, 2])), ([1, 2], object, np.array([1, 2], dtype=object)), ([1, 2], 'float32', np.array([1., 2.0], dtype=np.dtype('float32'))), + (np.array([1, 2]), None, np.array([1, 2])), ([pd.Period('2000', 'D'), pd.Period('2001', 'D')], 'Period[D]', period_array(['2000', '2001'], freq='D')), ([pd.Period('2000', 'D')], pd.PeriodDtype('D'), From fe40189a5cef8c208ce95cd15b8be89d5950fa14 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 17 Nov 2018 13:05:59 -0600 Subject: [PATCH 14/36] added test for a 2d array --- pandas/tests/arrays/test_array.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index 
7300fa49f2c8f..ad99fbc25aa9a 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -34,6 +34,9 @@ (pd.Index([1, 2]), None, np.array([1, 2], dtype=np.int64)), # "3rd party" EAs work ([decimal.Decimal(0), decimal.Decimal(1)], 'decimal', to_decimal([0, 1])), + # 2D ndarrays pass through + (np.array([[1, 2], [3, 4]]), None, np.array([[1, 2], [3, 4]])), + ([[1, 2], [3, 4]], None, np.array([[1, 2, ], [3, 4]])), ]) def test_array(data, dtype, expected): result = pd.array(data, dtype=dtype) From 7eb9d082b0f01b724b749dc18f432db0f22e5f59 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 17 Nov 2018 13:10:26 -0600 Subject: [PATCH 15/36] TST: test for Series[EA] --- pandas/core/arrays/array_.py | 4 +++- pandas/tests/arrays/test_array.py | 4 ++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/array_.py b/pandas/core/arrays/array_.py index 00aaf57535a68..50256d53482b8 100644 --- a/pandas/core/arrays/array_.py +++ b/pandas/core/arrays/array_.py @@ -6,6 +6,8 @@ from pandas.core.dtypes.dtypes import registry from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries +from pandas import compat + def array(data, dtype=None, copy=False): """ @@ -90,7 +92,7 @@ def array(data, dtype=None, copy=False): dtype = data.dtype # this returns None for not-found dtypes. 
- if dtype is not None: + if isinstance(dtype, compat.string_types): dtype = registry.find(dtype) or dtype if is_extension_array_dtype(dtype): diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index ad99fbc25aa9a..29cb1e9a43523 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -32,6 +32,10 @@ ([1, None], 'Int16', integer_array([1, None], dtype='Int16')), (pd.Series([1, 2]), None, np.array([1, 2], dtype=np.int64)), (pd.Index([1, 2]), None, np.array([1, 2], dtype=np.int64)), + # Series[EA] returns the EA + (pd.Series(pd.Categorical(['a', 'b'], categories=['a', 'b', 'c'])), + None, + pd.Categorical(['a', 'b'], categories=['a', 'b', 'c'])), # "3rd party" EAs work ([decimal.Decimal(0), decimal.Decimal(1)], 'decimal', to_decimal([0, 1])), # 2D ndarrays pass through From 1ca14fe5f4e30514c8342c82575756ceaa38ba6f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 20 Nov 2018 08:57:25 -0600 Subject: [PATCH 16/36] Added test for period -> category --- pandas/core/arrays/array_.py | 4 ++-- pandas/tests/arrays/test_array.py | 5 +++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/array_.py b/pandas/core/arrays/array_.py index 50256d53482b8..d376e6f844049 100644 --- a/pandas/core/arrays/array_.py +++ b/pandas/core/arrays/array_.py @@ -34,8 +34,8 @@ def array(data, dtype=None, copy=False): 2. Otherwise, pandas will attempt to infer the `dtype` from the data. - In particular, note that when `data` is a NumPy, ``data.dtype`` - is ignored. + In particular, note that when `data` is a NumPy array, + ``data.dtype`` is ignored. copy : bool, default False Whether to copy the data. 
diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index 29cb1e9a43523..0049483b4854d 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -41,6 +41,11 @@ # 2D ndarrays pass through (np.array([[1, 2], [3, 4]]), None, np.array([[1, 2], [3, 4]])), ([[1, 2], [3, 4]], None, np.array([[1, 2, ], [3, 4]])), + # pass an ExtensionArray, but a different dtype + (period_array(['2000', '2001'], freq='D'), + 'category', + pd.Categorical([pd.Period('2000', 'D'), pd.Period('2001', 'D')])), + ]) def test_array(data, dtype, expected): result = pd.array(data, dtype=dtype) From 4473899f215b3137c56993a1e2a6df330c3b283d Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 20 Nov 2018 08:59:54 -0600 Subject: [PATCH 17/36] copy --- pandas/core/arrays/array_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/array_.py b/pandas/core/arrays/array_.py index d376e6f844049..f395f7a00727f 100644 --- a/pandas/core/arrays/array_.py +++ b/pandas/core/arrays/array_.py @@ -103,7 +103,7 @@ def array(data, dtype=None, copy=False): inferred_dtype = lib.infer_dtype(data) if inferred_dtype == 'period': try: - return period_array(data) + return period_array(data, copy=copy) except tslibs.IncompatibleFrequency: pass # we return an array below. From 382f57d9cd39ce2f77ba935ba46dc2b230e42798 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 20 Nov 2018 09:50:32 -0600 Subject: [PATCH 18/36] prefix for arrays --- doc/source/api.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/api.rst b/doc/source/api.rst index 534a5a620ea29..e0cf09377977e 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -814,7 +814,7 @@ values. .. autosummary:: :toctree: generated/ - IntegerArray + arrays.IntegerArray .. _api.arrays.interval: @@ -846,7 +846,7 @@ can be collected in a :class:`PeriodArray`. See :ref:`timeseries.periods` for mo .. 
autosummary:: :toctree: generated/ - PeriodArray + arrays.PeriodArray Sparse ~~~~~~ From dd76a2b49c1cfe5930646c54596f3c68d6ffea1e Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 20 Nov 2018 09:52:35 -0600 Subject: [PATCH 19/36] Added arrays --- pandas/__init__.py | 1 + pandas/tests/api/test_api.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/__init__.py b/pandas/__init__.py index e86ed86fda74f..427157acb433f 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -49,6 +49,7 @@ from pandas.io.api import * from pandas.util._tester import test import pandas.testing +import pandas.arrays # use the closest tagged version if possible from ._version import get_versions diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 041bd3a3d9e68..07cf358c765b3 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -28,7 +28,7 @@ class TestPDApi(Base): ignored = ['tests', 'locale', 'conftest'] # top-level sub-packages - lib = ['api', 'compat', 'core', 'errors', 'pandas', + lib = ['api', 'arrays', 'compat', 'core', 'errors', 'pandas', 'plotting', 'test', 'testing', 'tseries', 'util', 'options', 'io'] From 53669507875222c628ccf1781a4eb9003b4d0a02 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 21 Nov 2018 06:02:46 -0600 Subject: [PATCH 20/36] update docstring --- pandas/core/arrays/array_.py | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/array_.py b/pandas/core/arrays/array_.py index f395f7a00727f..79e447b0285e6 100644 --- a/pandas/core/arrays/array_.py +++ b/pandas/core/arrays/array_.py @@ -18,12 +18,15 @@ def array(data, dtype=None, copy=False): Parameters ---------- data : Sequence[object] - A sequence of scalar instances for `dtype`. The underlying - array will be extracted from a Series or Index object. + A sequence of objects. The scalars inside `data` should + be instances of the scalar type for `dtype`. 
+ + When `data` is an Index or Series, the underlying array + will be extracted from `data`. dtype : Union[str, np.dtype, ExtensionDtype], optional The dtype to use for the array. This may be a NumPy - dtype, or an extension type registered with pandas using + dtype or an extension type registered with pandas using :meth:`pandas.api.extensions.register_extension_dtype`. If not specified, there are two possibilities: @@ -34,15 +37,28 @@ def array(data, dtype=None, copy=False): 2. Otherwise, pandas will attempt to infer the `dtype` from the data. - In particular, note that when `data` is a NumPy array, - ``data.dtype`` is ignored. + Note that when `data` is a NumPy array, ``data.dtype`` is + ignored. copy : bool, default False Whether to copy the data. Returns ------- - Array : Union[ndarray, ExtensionArray] + array : Union[numpy.ndarray, ExtensionArray] + + Notes + ----- + Omitting the `dtype` argument means pandas will attempt to infer the + best array type from the values in the data. As new array types are + added by pandas and 3rd party libraries, the best array type may + change. We recommend specifying `dtype` to ensure that the correct + array type is constructed. + + See Also + -------- + numpy.array : construct a NumPy array + Series : construct a pandas Series Examples -------- From c818a8f355695939a5bd87970c2738c29ce663f9 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 21 Nov 2018 06:29:36 -0600 Subject: [PATCH 21/36] docstring order --- pandas/core/arrays/array_.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/array_.py b/pandas/core/arrays/array_.py index 79e447b0285e6..1468fc3036540 100644 --- a/pandas/core/arrays/array_.py +++ b/pandas/core/arrays/array_.py @@ -55,11 +55,6 @@ def array(data, dtype=None, copy=False): change. We recommend specifying `dtype` to ensure that the correct array type is constructed. 
- See Also - -------- - numpy.array : construct a NumPy array - Series : construct a pandas Series - Examples -------- If a dtype is not specified, `data` is passed through to @@ -98,6 +93,11 @@ def array(data, dtype=None, copy=False): >>> pd.array([1, 2, np.nan], dtype='Int64') IntegerArray([1, 2, nan], dtype='Int64') + + See Also + -------- + numpy.array : construct a NumPy array + Series : construct a pandas Series """ from pandas.core.arrays import period_array, ExtensionArray From ba8b80715633a45afdd17d29790c3a1feefd968f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 21 Nov 2018 06:31:02 -0600 Subject: [PATCH 22/36] Revert "docstring order" This reverts commit c818a8f355695939a5bd87970c2738c29ce663f9. --- pandas/core/arrays/array_.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/array_.py b/pandas/core/arrays/array_.py index 1468fc3036540..79e447b0285e6 100644 --- a/pandas/core/arrays/array_.py +++ b/pandas/core/arrays/array_.py @@ -55,6 +55,11 @@ def array(data, dtype=None, copy=False): change. We recommend specifying `dtype` to ensure that the correct array type is constructed. 
+ See Also + -------- + numpy.array : construct a NumPy array + Series : construct a pandas Series + Examples -------- If a dtype is not specified, `data` is passed through to @@ -93,11 +98,6 @@ def array(data, dtype=None, copy=False): >>> pd.array([1, 2, np.nan], dtype='Int64') IntegerArray([1, 2, nan], dtype='Int64') - - See Also - -------- - numpy.array : construct a NumPy array - Series : construct a pandas Series """ from pandas.core.arrays import period_array, ExtensionArray From 77cd782140b0f184302d72a3e2a0a204df712a62 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 21 Nov 2018 14:02:09 -0600 Subject: [PATCH 23/36] Updates * interval inference * copy by default * doc clarifications --- doc/source/whatsnew/v0.24.0.rst | 12 ++++---- pandas/core/arrays/array_.py | 46 +++++++++++++++++++++++-------- pandas/tests/arrays/test_array.py | 26 +++++++++++++++-- 3 files changed, 65 insertions(+), 19 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index eba574108e715..6fb4726161bc0 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -109,8 +109,9 @@ Reduction and groupby operations such as 'sum' work. .. _whatsnew_0240.enhancements.array: A new top-level method :func:`array` has been added for creating arrays (:issue:`22860`). -This can be used to create NumPy arrays, or any :ref:`extension array `, including -extension arrays registered by :ref:`3rd party libraries `. +This can be used to create any :ref:`extension array `, including +extension arrays registered by :ref:`3rd party libraries `, or to +create NumPy arrays. .. ipython:: python @@ -118,9 +119,10 @@ extension arrays registered by :ref:`3rd party libraries ` pd.array(['a', 'b', 'c'], dtype='category') pd.array([1, 2]) -Notice that the default return value, if no ``dtype`` is specified, is a NumPy array. So -the first example of ``[1, 2, np.nan]`` will return a floating-point NumPy array, -since ``NaN`` is a float. 
+Notice that the default return value, if no ``dtype`` is specified, the type of +array is inferred from the data. In particular, note that the first example of +``[1, 2, np.nan]`` will return a floating-point NumPy array, since ``NaN`` +is a float. .. ipython:: python diff --git a/pandas/core/arrays/array_.py b/pandas/core/arrays/array_.py index 79e447b0285e6..7bca7d2c17b80 100644 --- a/pandas/core/arrays/array_.py +++ b/pandas/core/arrays/array_.py @@ -9,7 +9,11 @@ from pandas import compat -def array(data, dtype=None, copy=False): +def array(data, # type: Sequence[object] + dtype=None, # type: Optional[Union[str, np.dtype, ExtensionDtype]] + copy=True, # type: bool + ): + # type: (...) -> Union[str, np.dtype, ExtensionDtype] """ Create an array. @@ -17,14 +21,14 @@ def array(data, dtype=None, copy=False): Parameters ---------- - data : Sequence[object] - A sequence of objects. The scalars inside `data` should - be instances of the scalar type for `dtype`. + data : Sequence of objects. + The scalars inside `data` should be instances of the + scalar type for `dtype`. When `data` is an Index or Series, the underlying array will be extracted from `data`. - dtype : Union[str, np.dtype, ExtensionDtype], optional + dtype : str, np.dtype, or ExtensionDtype, optional The dtype to use for the array. This may be a NumPy dtype or an extension type registered with pandas using :meth:`pandas.api.extensions.register_extension_dtype`. @@ -38,10 +42,14 @@ def array(data, dtype=None, copy=False): from the data. Note that when `data` is a NumPy array, ``data.dtype`` is - ignored. + *not* used for inferring the array type. This is because + NumPy cannot represent all the types of data that can be + held in extension arrays. - copy : bool, default False - Whether to copy the data. + copy : bool, default True + Whether to copy the data, even if not necessary. Depending + on the type of `data`, creating the new array may require + copying data, even if ``copy=False``. 
Returns ------- @@ -52,8 +60,11 @@ def array(data, dtype=None, copy=False): Omitting the `dtype` argument means pandas will attempt to infer the best array type from the values in the data. As new array types are added by pandas and 3rd party libraries, the best array type may - change. We recommend specifying `dtype` to ensure that the correct - array type is constructed. + change. We recommend specifying `dtype` to ensure that + + 1. the correct array type for the data is returned + 2. the returned array type doesn't change as new extension types + are added by pandas and third-party libraries See Also -------- @@ -99,7 +110,9 @@ def array(data, dtype=None, copy=False): >>> pd.array([1, 2, np.nan], dtype='Int64') IntegerArray([1, 2, nan], dtype='Int64') """ - from pandas.core.arrays import period_array, ExtensionArray + from pandas.core.arrays import ( + period_array, ExtensionArray, IntervalArray + ) if isinstance(data, (ABCSeries, ABCIndexClass)): data = data._values @@ -121,7 +134,16 @@ def array(data, dtype=None, copy=False): try: return period_array(data, copy=copy) except tslibs.IncompatibleFrequency: - pass # we return an array below. + # We may have a mixture of frequencies. + # We choose to return an ndarray, rather than raising. + pass + elif inferred_dtype == 'interval': + try: + return IntervalArray(data, copy=copy) + except ValueError: + # We may have a mixture of `closed` here. + # We choose to return an ndarray, rather than raising. 
+ pass # TODO(DatetimeArray): handle this type # TODO(BooleanArray): handle this type diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index 0049483b4854d..35d6f2f853e2b 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -52,17 +52,39 @@ def test_array(data, dtype, expected): tm.assert_equal(result, expected) +def test_array_copy(): + a = np.array([1, 2]) + # default is to copy + b = pd.array(a) + assert np.shares_memory(a, b) is False + + # copy=True + b = pd.array(a, copy=True) + assert np.shares_memory(a, b) is False + + # copy=False + b = pd.array(a, copy=False) + assert a is b + + @pytest.mark.parametrize('data, expected', [ ([pd.Period("2000", "D"), pd.Period("2001", "D")], period_array(["2000", "2001"], freq="D")), + ([pd.Interval(0, 1), pd.Interval(1, 2)], + pd.IntervalArray.from_breaks([0, 1, 2])), ]) def test_array_inference(data, expected): result = pd.array(data) tm.assert_equal(result, expected) -def test_array_inference_period_fails(): - data = [pd.Period("2000", "D"), pd.Period("2001", "A")] +@pytest.mark.parametrize('data', [ + # mix of frequencies + [pd.Period("2000", "D"), pd.Period("2001", "A")], + # mix of closed + [pd.Interval(0, 1, closed='left'), pd.Interval(1, 2, closed='right')], +]) +def test_array_inference_fails(data): result = pd.array(data) expected = np.array(data, dtype=object) tm.assert_numpy_array_equal(result, expected) From 5eff701eb0424e56f06a69ddc0a5c0bf3f33a459 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 27 Nov 2018 05:30:51 -0600 Subject: [PATCH 24/36] Add docs for the types we infer --- pandas/core/arrays/array_.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pandas/core/arrays/array_.py b/pandas/core/arrays/array_.py index 7bca7d2c17b80..5015158237279 100644 --- a/pandas/core/arrays/array_.py +++ b/pandas/core/arrays/array_.py @@ -46,6 +46,17 @@ def array(data, # type: Sequence[object] NumPy cannot represent all the types 
of data that can be held in extension arrays. + Currently, pandas will infer an extension dtype for sequences of + + ========================== ================================== + scalar type Array Type + ========================== ================================== + * :class:`pandas.Interval` :class:`pandas.IntervalArray` + * :class:`pandas.Period` :class:`pandas.arrays.PeriodArray` + ========================== ================================== + + For all other cases, NumPy's usual inference rules will be used. + copy : bool, default True Whether to copy the data, even if not necessary. Depending on the type of `data`, creating the new array may require From 94064009258259cd9821bab2c07db95002108408 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 27 Nov 2018 10:14:02 -0600 Subject: [PATCH 25/36] API: disallow string alias for NumPy --- pandas/core/arrays/array_.py | 35 +++++++++++++++++++++++++++++-- pandas/tests/arrays/test_array.py | 11 ++++++++-- 2 files changed, 42 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/array_.py b/pandas/core/arrays/array_.py index 5015158237279..b363b95c07aef 100644 --- a/pandas/core/arrays/array_.py +++ b/pandas/core/arrays/array_.py @@ -57,6 +57,22 @@ def array(data, # type: Sequence[object] For all other cases, NumPy's usual inference rules will be used. + To avoid *future* breaking changing, pandas will not pass through + string aliases like ``dtype="int32"`` through to NumPy. + + >>> pd.array([1, 2, 3], dtype="int32") + Traceback (most recent call last): + ... + ValueError: Ambiguous dtype 'int32'... + + In a future version of pandas, or with a different set of 3rd-party + extension types registered, the meaning of the string alias + ``"int32"`` may change. To avoid this ambiguity, pandas requires that + an actual NumPy dtype be passed instead. 
+ + >>> pd.array([1, 2, 3], dtype=np.dtype("int32")) + array([1, 2, 3], dtype=int32) + copy : bool, default True Whether to copy the data, even if not necessary. Depending on the type of `data`, creating the new array may require @@ -70,7 +86,7 @@ def array(data, # type: Sequence[object] ----- Omitting the `dtype` argument means pandas will attempt to infer the best array type from the values in the data. As new array types are - added by pandas and 3rd party libraries, the best array type may + added by pandas and 3rd party libraries, the "best" array type may change. We recommend specifying `dtype` to ensure that 1. the correct array type for the data is returned @@ -92,7 +108,7 @@ def array(data, # type: Sequence[object] Or the NumPy dtype can be specified - >>> pd.array([1, 2], dtype=np.int32) + >>> pd.array([1, 2], dtype=np.dtype("int32")) array([1, 2], dtype=int32) You can use the string alias for `dtype` @@ -120,6 +136,13 @@ def array(data, # type: Sequence[object] >>> pd.array([1, 2, np.nan], dtype='Int64') IntegerArray([1, 2, nan], dtype='Int64') + + Pandas will infer an ExtensionArray for some types of data: + + >>> pd.array([pd.Period('2000', freq="D"), pd.Period("2000", freq="D")]) + + ['2000-01-01', '2000-01-01'] + Length: 2, dtype: period[D] """ from pandas.core.arrays import ( period_array, ExtensionArray, IntervalArray @@ -159,4 +182,12 @@ def array(data, # type: Sequence[object] # TODO(DatetimeArray): handle this type # TODO(BooleanArray): handle this type + if isinstance(dtype, compat.string_types): + msg = ( + "Ambiguous dtype '{dtype}'. 'pandas.array' will not pass string " + "aliases to NumPy. If you want a NumPy array, specify " + "'dtype=numpy.dtype(\"{dtype}\")'." 
+ ).format(dtype=dtype) + raise ValueError(msg) + return np.array(data, dtype=dtype, copy=copy) diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index 35d6f2f853e2b..33ff2616dbdca 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -16,13 +16,15 @@ @pytest.mark.parametrize("data, dtype, expected", [ ([1, 2], None, np.array([1, 2])), ([1, 2], object, np.array([1, 2], dtype=object)), - ([1, 2], 'float32', np.array([1., 2.0], dtype=np.dtype('float32'))), + ([1, 2], np.dtype('float32'), + np.array([1., 2.0], dtype=np.dtype('float32'))), (np.array([1, 2]), None, np.array([1, 2])), ([pd.Period('2000', 'D'), pd.Period('2001', 'D')], 'Period[D]', period_array(['2000', '2001'], freq='D')), ([pd.Period('2000', 'D')], pd.PeriodDtype('D'), period_array(['2000'], freq='D')), - ([1, 2], 'datetime64[ns]', np.array([1, 2], dtype='datetime64[ns]')), + ([1, 2], np.dtype('datetime64[ns]'), + np.array([1, 2], dtype='datetime64[ns]')), (['a', 'b'], 'category', pd.Categorical(['a', 'b'])), (['a', 'b'], pd.CategoricalDtype(None, ordered=True), pd.Categorical(['a', 'b'], ordered=True)), @@ -90,6 +92,11 @@ def test_array_inference_fails(data): tm.assert_numpy_array_equal(result, expected) +def test_numpy_string_alias_raises(): + match = "Ambiguous dtype 'int32'.*dtype=numpy.dtype.\"int32\"." + with pytest.raises(ValueError, match=match): + pd.array([1, 2], dtype='int32') + # --------------------------------------------------------------------------- # A couple dummy classes to ensure that Series and Indexes are unboxed before # getting to the EA classes. 
From ea3a1185c61ee3c6bb83c593523f858f5ca6cd72 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 28 Nov 2018 16:25:02 -0600 Subject: [PATCH 26/36] Wrap long error message --- pandas/core/arrays/array_.py | 4 +++- pandas/tests/arrays/test_array.py | 6 +++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/array_.py b/pandas/core/arrays/array_.py index b363b95c07aef..5d6722dfe96a7 100644 --- a/pandas/core/arrays/array_.py +++ b/pandas/core/arrays/array_.py @@ -1,3 +1,5 @@ +import textwrap + import numpy as np from pandas._libs import lib, tslibs @@ -188,6 +190,6 @@ def array(data, # type: Sequence[object] "aliases to NumPy. If you want a NumPy array, specify " "'dtype=numpy.dtype(\"{dtype}\")'." ).format(dtype=dtype) - raise ValueError(msg) + raise ValueError('\n'.join(textwrap.wrap(msg))) return np.array(data, dtype=dtype, copy=copy) diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index 33ff2616dbdca..b7e1f1eab00f5 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -1,4 +1,5 @@ import decimal +import re import numpy as np import pytest @@ -93,7 +94,10 @@ def test_array_inference_fails(data): def test_numpy_string_alias_raises(): - match = "Ambiguous dtype 'int32'.*dtype=numpy.dtype.\"int32\"." 
+ match = re.compile( + r"Ambiguous dtype 'int32'.*dtype=numpy.dtype.\"int32\".", + flags=re.DOTALL, + ) with pytest.raises(ValueError, match=match): pd.array([1, 2], dtype='int32') From fb814fc5460c348678be3eb5f8b0fc3ab47448cf Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 29 Nov 2018 06:37:02 -0600 Subject: [PATCH 27/36] updates * Removed raising on strings * Docstring passes --- pandas/core/arrays/array_.py | 43 ++++++++++++++---------------------- 1 file changed, 16 insertions(+), 27 deletions(-) diff --git a/pandas/core/arrays/array_.py b/pandas/core/arrays/array_.py index 5d6722dfe96a7..72b7a3994166d 100644 --- a/pandas/core/arrays/array_.py +++ b/pandas/core/arrays/array_.py @@ -1,5 +1,3 @@ -import textwrap - import numpy as np from pandas._libs import lib, tslibs @@ -23,7 +21,7 @@ def array(data, # type: Sequence[object] Parameters ---------- - data : Sequence of objects. + data : Sequence of objects The scalars inside `data` should be instances of the scalar type for `dtype`. @@ -59,22 +57,21 @@ def array(data, # type: Sequence[object] For all other cases, NumPy's usual inference rules will be used. - To avoid *future* breaking changing, pandas will not pass through - string aliases like ``dtype="int32"`` through to NumPy. + To avoid *future* breaking changing, pandas recommends using actual + dtypes, and not string aliases, for `dtype`. In other words, use - >>> pd.array([1, 2, 3], dtype="int32") - Traceback (most recent call last): - ... - ValueError: Ambiguous dtype 'int32'... + >>> pd.array([1, 2, 3], dtype=np.dtype("int32")) + array([1, 2, 3], dtype=int32) - In a future version of pandas, or with a different set of 3rd-party - extension types registered, the meaning of the string alias - ``"int32"`` may change. To avoid this ambiguity, pandas requires that - an actual NumPy dtype be passed instead. 
+ rather than - >>> pd.array([1, 2, 3], dtype=np.dtype("int32")) + >>> pd.array([1, 2, 3], dtype="int32") array([1, 2, 3], dtype=int32) + If and when pandas switches to a different backend for storing arrays, + the meaning of the string aliases will change, while the actual + dtypes will be unambiguous. + copy : bool, default True Whether to copy the data, even if not necessary. Depending on the type of `data`, creating the new array may require @@ -84,6 +81,11 @@ def array(data, # type: Sequence[object] ------- array : Union[numpy.ndarray, ExtensionArray] + See Also + -------- + numpy.array : Construct a NumPy array. + Series : Construct a pandas Series. + Notes ----- Omitting the `dtype` argument means pandas will attempt to infer the @@ -95,11 +97,6 @@ def array(data, # type: Sequence[object] 2. the returned array type doesn't change as new extension types are added by pandas and third-party libraries - See Also - -------- - numpy.array : construct a NumPy array - Series : construct a pandas Series - Examples -------- If a dtype is not specified, `data` is passed through to @@ -184,12 +181,4 @@ def array(data, # type: Sequence[object] # TODO(DatetimeArray): handle this type # TODO(BooleanArray): handle this type - if isinstance(dtype, compat.string_types): - msg = ( - "Ambiguous dtype '{dtype}'. 'pandas.array' will not pass string " - "aliases to NumPy. If you want a NumPy array, specify " - "'dtype=numpy.dtype(\"{dtype}\")'." 
- ).format(dtype=dtype) - raise ValueError('\n'.join(textwrap.wrap(msg))) - return np.array(data, dtype=dtype, copy=copy) From a6f6d293e3346c42c697ace0c9fcae851cbcd383 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 29 Nov 2018 08:26:38 -0600 Subject: [PATCH 28/36] removed old test --- pandas/tests/arrays/test_array.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index b7e1f1eab00f5..b114afe656f02 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -1,5 +1,4 @@ import decimal -import re import numpy as np import pytest @@ -15,25 +14,37 @@ @pytest.mark.parametrize("data, dtype, expected", [ + # Basic NumPy defaults. ([1, 2], None, np.array([1, 2])), ([1, 2], object, np.array([1, 2], dtype=object)), ([1, 2], np.dtype('float32'), np.array([1., 2.0], dtype=np.dtype('float32'))), (np.array([1, 2]), None, np.array([1, 2])), + # String alias passes through to NumPy + ([1, 2], 'float32', np.array([1, 2], dtype='float32')), + # Period alias ([pd.Period('2000', 'D'), pd.Period('2001', 'D')], 'Period[D]', period_array(['2000', '2001'], freq='D')), + # Period dtype ([pd.Period('2000', 'D')], pd.PeriodDtype('D'), period_array(['2000'], freq='D')), + # Datetime (naive) ([1, 2], np.dtype('datetime64[ns]'), np.array([1, 2], dtype='datetime64[ns]')), + # TODO(DatetimeArray): add here + # Category (['a', 'b'], 'category', pd.Categorical(['a', 'b'])), (['a', 'b'], pd.CategoricalDtype(None, ordered=True), pd.Categorical(['a', 'b'], ordered=True)), + # Interval ([pd.Interval(1, 2), pd.Interval(3, 4)], 'interval', pd.IntervalArray.from_tuples([(1, 2), (3, 4)])), + # Sparse ([0, 1], 'Sparse[int64]', pd.SparseArray([0, 1], dtype='int64')), + # IntegerNA ([1, None], 'Int16', integer_array([1, None], dtype='Int16')), (pd.Series([1, 2]), None, np.array([1, 2], dtype=np.int64)), + # Index (pd.Index([1, 2]), None, np.array([1, 2], 
dtype=np.int64)), # Series[EA] returns the EA (pd.Series(pd.Categorical(['a', 'b'], categories=['a', 'b', 'c'])), @@ -48,7 +59,6 @@ (period_array(['2000', '2001'], freq='D'), 'category', pd.Categorical([pd.Period('2000', 'D'), pd.Period('2001', 'D')])), - ]) def test_array(data, dtype, expected): result = pd.array(data, dtype=dtype) @@ -93,14 +103,6 @@ def test_array_inference_fails(data): tm.assert_numpy_array_equal(result, expected) -def test_numpy_string_alias_raises(): - match = re.compile( - r"Ambiguous dtype 'int32'.*dtype=numpy.dtype.\"int32\".", - flags=re.DOTALL, - ) - with pytest.raises(ValueError, match=match): - pd.array([1, 2], dtype='int32') - # --------------------------------------------------------------------------- # A couple dummy classes to ensure that Series and Indexes are unboxed before # getting to the EA classes. From 86b81b51381c9a63d2b1392648efa61fec073d90 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 29 Nov 2018 11:04:30 -0600 Subject: [PATCH 29/36] formatting --- pandas/tests/arrays/test_array.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index b114afe656f02..a7b4c661bce4e 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -20,41 +20,54 @@ ([1, 2], np.dtype('float32'), np.array([1., 2.0], dtype=np.dtype('float32'))), (np.array([1, 2]), None, np.array([1, 2])), + # String alias passes through to NumPy ([1, 2], 'float32', np.array([1, 2], dtype='float32')), + # Period alias ([pd.Period('2000', 'D'), pd.Period('2001', 'D')], 'Period[D]', period_array(['2000', '2001'], freq='D')), + # Period dtype ([pd.Period('2000', 'D')], pd.PeriodDtype('D'), period_array(['2000'], freq='D')), + # Datetime (naive) ([1, 2], np.dtype('datetime64[ns]'), np.array([1, 2], dtype='datetime64[ns]')), # TODO(DatetimeArray): add here + # Category (['a', 'b'], 'category', pd.Categorical(['a', 'b'])), (['a', 'b'], 
pd.CategoricalDtype(None, ordered=True), pd.Categorical(['a', 'b'], ordered=True)), + # Interval ([pd.Interval(1, 2), pd.Interval(3, 4)], 'interval', pd.IntervalArray.from_tuples([(1, 2), (3, 4)])), + # Sparse ([0, 1], 'Sparse[int64]', pd.SparseArray([0, 1], dtype='int64')), + # IntegerNA ([1, None], 'Int16', integer_array([1, None], dtype='Int16')), (pd.Series([1, 2]), None, np.array([1, 2], dtype=np.int64)), + # Index (pd.Index([1, 2]), None, np.array([1, 2], dtype=np.int64)), + # Series[EA] returns the EA (pd.Series(pd.Categorical(['a', 'b'], categories=['a', 'b', 'c'])), None, pd.Categorical(['a', 'b'], categories=['a', 'b', 'c'])), + # "3rd party" EAs work ([decimal.Decimal(0), decimal.Decimal(1)], 'decimal', to_decimal([0, 1])), + # 2D ndarrays pass through (np.array([[1, 2], [3, 4]]), None, np.array([[1, 2], [3, 4]])), ([[1, 2], [3, 4]], None, np.array([[1, 2, ], [3, 4]])), + # pass an ExtensionArray, but a different dtype (period_array(['2000', '2001'], freq='D'), 'category', From 000967d19ce2c87244bea94c97dbd17cd26c1c22 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 10 Dec 2018 12:36:44 -0600 Subject: [PATCH 30/36] Raise on scalars --- pandas/core/arrays/array_.py | 6 ++++++ pandas/tests/arrays/test_array.py | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/pandas/core/arrays/array_.py b/pandas/core/arrays/array_.py index 72b7a3994166d..5c06138da5f83 100644 --- a/pandas/core/arrays/array_.py +++ b/pandas/core/arrays/array_.py @@ -147,6 +147,12 @@ def array(data, # type: Sequence[object] period_array, ExtensionArray, IntervalArray ) + if lib.is_scalar(data): + msg = ( + "Cannot pass scalar '{}' to 'pandas.array'." 
+ ) + raise ValueError(msg.format(data)) + if isinstance(data, (ABCSeries, ABCIndexClass)): data = data._values diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index a7b4c661bce4e..e9063f355185c 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -169,3 +169,9 @@ def test_array_not_registered(registry_without_decimal): result = pd.array(data, dtype=DecimalDtype) expected = DecimalArray._from_sequence(data) tm.assert_equal(result, expected) + + +def test_scalar_raises(): + with pytest.raises(ValueError, + match="Cannot pass scalar '1'"): + pd.array(1) From faf114dee724359ad0c3ac6af258a679e1af3f31 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 11 Dec 2018 08:15:12 -0600 Subject: [PATCH 31/36] docs on raising --- doc/source/whatsnew/v0.24.0.rst | 2 +- pandas/core/arrays/array_.py | 19 +++++++++++++++++-- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 0904bd55f62dc..1fdbeb956d885 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -160,7 +160,7 @@ Reduction and groupby operations such as 'sum' work. .. _whatsnew_0240.enhancements.array: -A new top-level method :func:`array` has been added for creating arrays (:issue:`22860`). +A new top-level method :func:`array` has been added for creating 1-dimensional arrays (:issue:`22860`). This can be used to create any :ref:`extension array `, including extension arrays registered by :ref:`3rd party libraries `, or to create NumPy arrays. diff --git a/pandas/core/arrays/array_.py b/pandas/core/arrays/array_.py index 5c06138da5f83..5aab4708f55ae 100644 --- a/pandas/core/arrays/array_.py +++ b/pandas/core/arrays/array_.py @@ -23,7 +23,8 @@ def array(data, # type: Sequence[object] ---------- data : Sequence of objects The scalars inside `data` should be instances of the - scalar type for `dtype`. + scalar type for `dtype`. 
It's expected that `data` + represents a 1-dimensional array of data. When `data` is an Index or Series, the underlying array will be extracted from `data`. @@ -81,6 +82,11 @@ def array(data, # type: Sequence[object] ------- array : Union[numpy.ndarray, ExtensionArray] + Raises + ------ + ValueError + When `data` is not 1-dimensional. + See Also -------- numpy.array : Construct a NumPy array. @@ -134,7 +140,9 @@ def array(data, # type: Sequence[object] the dtype: >>> pd.array([1, 2, np.nan], dtype='Int64') - IntegerArray([1, 2, nan], dtype='Int64') + + [1, 2, NaN] + Length: 3, dtype: Int64 Pandas will infer an ExtensionArray for some types of data: @@ -142,6 +150,13 @@ def array(data, # type: Sequence[object] ['2000-01-01', '2000-01-01'] Length: 2, dtype: period[D] + + A ValueError is raised when the input has the wrong dimensionality. + + >>> pd.array(1) + Traceback (most recent call last): + ... + ValueError: Cannot pass scalar '1' to 'pandas.array'. """ from pandas.core.arrays import ( period_array, ExtensionArray, IntervalArray From 932e119c3383400e78d733f713acd76ae837b765 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 28 Dec 2018 13:06:54 -0600 Subject: [PATCH 32/36] Updates for PandasArray --- doc/source/whatsnew/v0.24.0.rst | 21 ++++++++++--- pandas/core/arrays/array_.py | 51 +++++++++++++++++++----------- pandas/tests/arrays/test_array.py | 52 +++++++++++++++++-------------- 3 files changed, 77 insertions(+), 47 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 4a30ffc634e99..cc7c55aad794f 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -165,18 +165,29 @@ Reduction and groupby operations such as 'sum' work. A new top-level method :func:`array` has been added for creating 1-dimensional arrays (:issue:`22860`). This can be used to create any :ref:`extension array `, including -extension arrays registered by :ref:`3rd party libraries `, or to -create NumPy arrays. 
+extension arrays registered by :ref:`3rd party libraries `. .. ipython:: python pd.array([1, 2, np.nan], dtype='Int64') pd.array(['a', 'b', 'c'], dtype='category') - pd.array([1, 2]) -Notice that the default return value, if no ``dtype`` is specified, the type of +Passing data for which there isn't a dedicated extension type (e.g. float, integer, etc.) +will return a new :class:`arrays.PandasArray`, which is just a thin (no-copy) +wrapper around a :class:`numpy.ndarray` that satisfies the extension array interface. + +.. ipython:: python + + pd.array([1, 2, 3]) + +On its own, a :class:`arrays.PandasArray` isn't a very useful object. +But if you need to write low-level code that works generically for any +:class:`~pandas.api.extensions.ExtensionArray`, :class:`arrays.PandasArray` +satisfies that need. + +Notice that by default, if no ``dtype`` is specified, the dtype of the returned array is inferred from the data. In particular, note that the first example of -``[1, 2, np.nan]`` will return a floating-point NumPy array, since ``NaN`` +``[1, 2, np.nan]`` would have returned a floating-point array, since ``NaN`` is a float. .. ipython:: python diff --git a/pandas/core/arrays/array_.py b/pandas/core/arrays/array_.py index 5aab4708f55ae..2e3f5e407ec57 100644 --- a/pandas/core/arrays/array_.py +++ b/pandas/core/arrays/array_.py @@ -13,7 +13,7 @@ def array(data, # type: Sequence[object] dtype=None, # type: Optional[Union[str, np.dtype, ExtensionDtype]] copy=True, # type: bool ): - # type: (...) -> Union[str, np.dtype, ExtensionDtype] + # type: (...) -> ExtensionArray """ Create an array. @@ -58,20 +58,27 @@ def array(data, # type: Sequence[object] For all other cases, NumPy's usual inference rules will be used. - To avoid *future* breaking changing, pandas recommends using actual - dtypes, and not string aliases, for `dtype`.
In other words, use + To avoid *future* breaking changes, when the underlying memory + representation of the returned array matters, we recommend specifying + the `dtype` as a concrete object rather than a string alias or + allowing it to be inferred. For example, a future version of pandas + or a 3rd-party library may include a dedicated ExtensionArray for + string data. In this event, the following would no longer return a + :class:`PandasArray` backed by a NumPy array. - >>> pd.array([1, 2, 3], dtype=np.dtype("int32")) - array([1, 2, 3], dtype=int32) + >>> pd.array(['a', 'b'], dtype=str) + + ['a', 'b'] + Length: 2, dtype: str32 - rather than + This would instead return the new ExtensionArray dedicated for string + data. If you really need the new array to be backed by a NumPy array, + specify that in the dtype. - >>> pd.array([1, 2, 3], dtype="int32") - array([1, 2, 3], dtype=int32) - - If and when pandas switches to a different backend for storing arrays, - the meaning of the string aliases will change, while the actual - dtypes will be unambiguous. + >>> pd.array(['a', 'b'], dtype=np.dtype(" + ['a', 'b'] + Length: 2, dtype: str32 copy : bool, default True Whether to copy the data, even if not necessary. Depending @@ -80,7 +87,7 @@ def array(data, # type: Sequence[object] Returns ------- - array : Union[numpy.ndarray, ExtensionArray] + array : ExtensionArray Raises ------ @@ -109,12 +116,16 @@ def array(data, # type: Sequence[object] :meth:`numpy.array`, and an ``ndarray`` is returned. >>> pd.array([1, 2]) - array([1, 2]) + + [1, 2] + Length: 2, dtype: int64 Or the NumPy dtype can be specified >>> pd.array([1, 2], dtype=np.dtype("int32")) - array([1, 2], dtype=int32) + + [1, 2] + Length: 2, dtype: int32 You can use the string alias for `dtype` @@ -134,7 +145,9 @@ def array(data, # type: Sequence[object] NumPy array. 
>>> pd.array([1, 2, np.nan]) - array([ 1., 2., nan]) + + [1.0, 2.0, nan] + Length: 3, dtype: float64 To use pandas' nullable :class:`pandas.arrays.IntegerArray`, specify the dtype: @@ -159,7 +172,7 @@ def array(data, # type: Sequence[object] ValueError: Cannot pass scalar '1' to 'pandas.array'. """ from pandas.core.arrays import ( - period_array, ExtensionArray, IntervalArray + period_array, ExtensionArray, IntervalArray, PandasArray ) if lib.is_scalar(data): @@ -202,4 +215,6 @@ def array(data, # type: Sequence[object] # TODO(DatetimeArray): handle this type # TODO(BooleanArray): handle this type - return np.array(data, dtype=dtype, copy=copy) + result = np.array(data, dtype=dtype, copy=copy) + result = PandasArray(result) + return result diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index e9063f355185c..76ef85b0317ad 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -7,7 +7,7 @@ import pandas as pd from pandas.api.extensions import register_extension_dtype -from pandas.core.arrays import integer_array, period_array +from pandas.core.arrays import PandasArray, integer_array, period_array from pandas.tests.extension.decimal import ( DecimalArray, DecimalDtype, to_decimal) import pandas.util.testing as tm @@ -15,14 +15,14 @@ @pytest.mark.parametrize("data, dtype, expected", [ # Basic NumPy defaults. 
- ([1, 2], None, np.array([1, 2])), - ([1, 2], object, np.array([1, 2], dtype=object)), + ([1, 2], None, PandasArray(np.array([1, 2]))), + ([1, 2], object, PandasArray(np.array([1, 2], dtype=object))), ([1, 2], np.dtype('float32'), - np.array([1., 2.0], dtype=np.dtype('float32'))), - (np.array([1, 2]), None, np.array([1, 2])), + PandasArray(np.array([1., 2.0], dtype=np.dtype('float32')))), + (np.array([1, 2]), None, PandasArray(np.array([1, 2]))), # String alias passes through to NumPy - ([1, 2], 'float32', np.array([1, 2], dtype='float32')), + ([1, 2], 'float32', PandasArray(np.array([1, 2], dtype='float32'))), # Period alias ([pd.Period('2000', 'D'), pd.Period('2001', 'D')], 'Period[D]', @@ -34,7 +34,7 @@ # Datetime (naive) ([1, 2], np.dtype('datetime64[ns]'), - np.array([1, 2], dtype='datetime64[ns]')), + PandasArray(np.array([1, 2], dtype='datetime64[ns]'))), # TODO(DatetimeArray): add here # Category @@ -51,10 +51,10 @@ # IntegerNA ([1, None], 'Int16', integer_array([1, None], dtype='Int16')), - (pd.Series([1, 2]), None, np.array([1, 2], dtype=np.int64)), + (pd.Series([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))), # Index - (pd.Index([1, 2]), None, np.array([1, 2], dtype=np.int64)), + (pd.Index([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))), # Series[EA] returns the EA (pd.Series(pd.Categorical(['a', 'b'], categories=['a', 'b', 'c'])), @@ -64,10 +64,6 @@ # "3rd party" EAs work ([decimal.Decimal(0), decimal.Decimal(1)], 'decimal', to_decimal([0, 1])), - # 2D ndarrays pass through - (np.array([[1, 2], [3, 4]]), None, np.array([[1, 2], [3, 4]])), - ([[1, 2], [3, 4]], None, np.array([[1, 2, ], [3, 4]])), - # pass an ExtensionArray, but a different dtype (period_array(['2000', '2001'], freq='D'), 'category', @@ -82,15 +78,15 @@ def test_array_copy(): a = np.array([1, 2]) # default is to copy b = pd.array(a) - assert np.shares_memory(a, b) is False + assert np.shares_memory(a, b._ndarray) is False # copy=True b = pd.array(a, copy=True) 
- assert np.shares_memory(a, b) is False + assert np.shares_memory(a, b._ndarray) is False # copy=False b = pd.array(a, copy=False) - assert a is b + assert np.shares_memory(a, b._ndarray) is True @pytest.mark.parametrize('data, expected', [ @@ -112,10 +108,24 @@ def test_array_inference(data, expected): ]) def test_array_inference_fails(data): result = pd.array(data) - expected = np.array(data, dtype=object) - tm.assert_numpy_array_equal(result, expected) + expected = PandasArray(np.array(data, dtype=object)) + tm.assert_extension_array_equal(result, expected) + + +@pytest.mark.parametrize("data", [ + np.array([[1, 2], [3, 4]]), + [[1, 2], [3, 4]], +]) +def test_nd_raises(data): + with pytest.raises(ValueError, match='PandasArray must be 1-dimensional'): + pd.array(data) +def test_scalar_raises(): + with pytest.raises(ValueError, + match="Cannot pass scalar '1'"): + pd.array(1) + # --------------------------------------------------------------------------- # A couple dummy classes to ensure that Series and Indexes are unboxed before # getting to the EA classes. @@ -169,9 +179,3 @@ def test_array_not_registered(registry_without_decimal): result = pd.array(data, dtype=DecimalDtype) expected = DecimalArray._from_sequence(data) tm.assert_equal(result, expected) - - -def test_scalar_raises(): - with pytest.raises(ValueError, - match="Cannot pass scalar '1'"): - pd.array(1) From 45d07eb71173adda026a1a88b3846ed50923ba18 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 28 Dec 2018 13:21:38 -0600 Subject: [PATCH 33/36] update docstring --- pandas/core/arrays/array_.py | 59 +++++++++++++++++++++--------------- 1 file changed, 35 insertions(+), 24 deletions(-) diff --git a/pandas/core/arrays/array_.py b/pandas/core/arrays/array_.py index 2e3f5e407ec57..c8ae88a33ff64 100644 --- a/pandas/core/arrays/array_.py +++ b/pandas/core/arrays/array_.py @@ -58,28 +58,6 @@ def array(data, # type: Sequence[object] For all other cases, NumPy's usual inference rules will be used. 
- To avoid *future* breaking changes, when the underlying memory - representation of the returned array matters, we recommend specifying - the `dtype` as a concrete object rather than a string alias or - allowing it to be inferred. For example, a future version of pandas - or a 3rd-party library may include a dedicated ExtensionArray for - string data. In this event, the following would no longer return a - :class:`PandasArray` backed by a NumPy array. - - >>> pd.array(['a', 'b'], dtype=str) - - ['a', 'b'] - Length: 2, dtype: str32 - - This would instead return the new ExtensionArray dedicated for string - data. If you really need the new array to be backed by a NumPy array, - specify that in the dtype. - - >>> pd.array(['a', 'b'], dtype=np.dtype(" - ['a', 'b'] - Length: 2, dtype: str32 - copy : bool, default True Whether to copy the data, even if not necessary. Depending on the type of `data`, creating the new array may require @@ -97,7 +75,9 @@ def array(data, # type: Sequence[object] See Also -------- numpy.array : Construct a NumPy array. + arrays.PandasArray : ExtensionArray wrapping a NumPy array. Series : Construct a pandas Series. + Index : Construct a pandas Index. Notes ----- @@ -110,10 +90,40 @@ def array(data, # type: Sequence[object] 2. the returned array type doesn't change as new extension types are added by pandas and third-party libraries + Additionally, if the underlying memory representation of the returned + array matters, we recommend specifying the `dtype` as a concrete object + rather than a string alias or allowing it to be inferred. For example, + a future version of pandas or a 3rd-party library may include a + dedicated ExtensionArray for string data. In this event, the following + would no longer return a :class:`arrays.PandasArray` backed by a NumPy + array. + + >>> pd.array(['a', 'b'], dtype=str) + + ['a', 'b'] + Length: 2, dtype: str32 + + This would instead return the new ExtensionArray dedicated for string + data. 
If you really need the new array to be backed by a NumPy array, + specify that in the dtype. + + >>> pd.array(['a', 'b'], dtype=np.dtype("<U1")) + <PandasArray> + ['a', 'b'] + Length: 2, dtype: str32 + + Or use the dedicated constructor for the array you're expecting, and + wrap that in a PandasArray + + >>> pd.array(np.array(['a', 'b'], dtype='<U1')) + <PandasArray> + ['a', 'b'] + Length: 2, dtype: str32 + Examples -------- If a dtype is not specified, `data` is passed through to - :meth:`numpy.array`, and an ``ndarray`` is returned. + :meth:`numpy.array`, and a :class:`arrays.PandasArray` is returned. >>> pd.array([1, 2]) @@ -164,7 +174,8 @@ def array(data, # type: Sequence[object] ['2000-01-01', '2000-01-01'] Length: 2, dtype: period[D] - A ValueError is raised when the input has the wrong dimensionality. + `data` must be 1-dimensional. A ValueError is raised when the input + has the wrong dimensionality. >>> pd.array(1) Traceback (most recent call last): From d1aba7351ff649d5e4dcc4f4dd1887c516f100c4 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 28 Dec 2018 13:47:43 -0600 Subject: [PATCH 34/36] Updates * doc ref * use extract_array * use PandasArray._from_sequence --- doc/source/whatsnew/v0.24.0.rst | 4 +++- pandas/core/arrays/array_.py | 10 +++------- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index cc7c55aad794f..473bbedabe183 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -165,7 +165,9 @@ Reduction and groupby operations such as 'sum' work. A new top-level method :func:`array` has been added for creating 1-dimensional arrays (:issue:`22860`). This can be used to create any :ref:`extension array `, including -extension arrays registered by :ref:`3rd party libraries `. +extension arrays registered by :ref:`3rd party libraries `. + +See :ref:`Dtypes ` for more on extension arrays. ..
ipython:: python diff --git a/pandas/core/arrays/array_.py b/pandas/core/arrays/array_.py index c8ae88a33ff64..173ed7d191ac9 100644 --- a/pandas/core/arrays/array_.py +++ b/pandas/core/arrays/array_.py @@ -1,10 +1,7 @@ -import numpy as np - from pandas._libs import lib, tslibs from pandas.core.dtypes.common import is_extension_array_dtype from pandas.core.dtypes.dtypes import registry -from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries from pandas import compat @@ -185,6 +182,7 @@ def array(data, # type: Sequence[object] from pandas.core.arrays import ( period_array, ExtensionArray, IntervalArray, PandasArray ) + from pandas.core.internals.arrays import extract_array if lib.is_scalar(data): msg = ( @@ -192,8 +190,7 @@ def array(data, # type: Sequence[object] ) raise ValueError(msg.format(data)) - if isinstance(data, (ABCSeries, ABCIndexClass)): - data = data._values + data = extract_array(data, extract_numpy=True) if dtype is None and isinstance(data, ExtensionArray): dtype = data.dtype @@ -226,6 +223,5 @@ def array(data, # type: Sequence[object] # TODO(DatetimeArray): handle this type # TODO(BooleanArray): handle this type - result = np.array(data, dtype=dtype, copy=copy) - result = PandasArray(result) + result = PandasArray._from_sequence(data, dtype=dtype, copy=copy) return result From 1f3bb508dae496f3044f2516d8b137c6779c0ec4 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 28 Dec 2018 14:52:58 -0600 Subject: [PATCH 35/36] fixed test expected --- pandas/tests/extension/base/constructors.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/tests/extension/base/constructors.py b/pandas/tests/extension/base/constructors.py index 30c49a326be69..9c719b1304629 100644 --- a/pandas/tests/extension/base/constructors.py +++ b/pandas/tests/extension/base/constructors.py @@ -64,5 +64,6 @@ def test_pandas_array(self, data): def test_pandas_array_dtype(self, data): # ... 
but specifying dtype will override idempotency - result = pd.array(data, dtype=object) - self.assert_equal(result, np.asarray(data, dtype=object)) + result = pd.array(data, dtype=np.dtype(object)) + expected = pd.arrays.PandasArray(np.asarray(data, dtype=object)) + self.assert_equal(result, expected) From c8d3960df2bd86184b3b5ec9734b6a0cacc0b0ae Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 28 Dec 2018 16:13:24 -0600 Subject: [PATCH 36/36] doc lint --- pandas/core/arrays/period.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index d45cd9402d45b..2c7ee5b277a90 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -115,6 +115,11 @@ class PeriodArray(dtl.DatetimeLikeArrayMixin, copy : bool, default False Whether to copy the ordinals before storing. + See Also + -------- + period_array : Create a new PeriodArray. + pandas.PeriodIndex : Immutable Index for period data. + Notes ----- There are two components to a PeriodArray @@ -127,11 +132,6 @@ class PeriodArray(dtl.DatetimeLikeArrayMixin, The `freq` indicates the span covered by each element of the array. All elements in the PeriodArray have the same `freq`. - - See Also - -------- - period_array : Create a new PeriodArray. - pandas.PeriodIndex : Immutable Index for period data. """ # array priority higher than numpy scalars __array_priority__ = 1000