From 3464e95064ad1c1d4ac9d37e3d381215165a8ffe Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 17 Apr 2020 09:24:35 +0200 Subject: [PATCH 01/13] initial find_common_type/_get_common_type + tests for IntegerDtype --- pandas/core/arrays/integer.py | 13 +++++++++-- pandas/core/dtypes/base.py | 27 ++++++++++++++++++++++ pandas/core/dtypes/cast.py | 7 +++++- pandas/core/dtypes/concat.py | 24 ++++++++++++++++--- pandas/tests/arrays/integer/test_concat.py | 26 +++++++++++++++++++++ 5 files changed, 91 insertions(+), 6 deletions(-) create mode 100644 pandas/tests/arrays/integer/test_concat.py diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 5605b3fbc5dfa..1f434f0d18db0 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -1,11 +1,11 @@ import numbers -from typing import TYPE_CHECKING, Tuple, Type, Union +from typing import TYPE_CHECKING, List, Optional, Tuple, Type, Union import warnings import numpy as np from pandas._libs import lib, missing as libmissing -from pandas._typing import ArrayLike +from pandas._typing import ArrayLike, DtypeObj from pandas.compat import set_function_name from pandas.util._decorators import cache_readonly @@ -95,6 +95,15 @@ def construct_array_type(cls) -> Type["IntegerArray"]: """ return IntegerArray + def _get_common_type(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]: + # for now only handle other integer types + if not all(isinstance(t, _IntegerDtype) for t in dtypes): + return None + np_dtype = np.find_common_type([t.numpy_dtype for t in dtypes], []) + if np.issubdtype(np_dtype, np.integer): + return _dtypes[str(np_dtype)] + return None + def __from_arrow__( self, array: Union["pyarrow.Array", "pyarrow.ChunkedArray"] ) -> "IntegerArray": diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index a4f0ccc2016c0..a168b97b529e4 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -6,6 +6,7 @@ import numpy as np +from 
pandas._typing import DtypeObj from pandas.errors import AbstractMethodError from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries @@ -322,3 +323,29 @@ def _is_boolean(self) -> bool: bool """ return False + + def _get_common_type(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]: + """ + Return the common dtype, if one exists. + + Used in `find_common_type` implementation. This is for example used + to determine the resulting dtype in a concat operation. + + If no common dtype exists, return None. If all dtypes in the list + will return None, then the common dtype will be "object" dtype. + + Parameters + ---------- + dtypes : list of dtypes + The dtypes for which to determine a common dtype. This is a list + of np.dtype or ExtensionDtype instances. + + Returns + ------- + Common dtype (np.dtype or ExtensionDtype) or None + """ + if len(set(dtypes)) == 1: + # only itself + return self + else: + return None diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 7dda6850ba4f7..b9264f9697bb3 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1474,7 +1474,12 @@ def find_common_type(types): return first if any(isinstance(t, ExtensionDtype) for t in types): - return np.object + for t in types: + if isinstance(t, ExtensionDtype): + res = t._get_common_type(types) + if res is not None: + return res + return np.dtype("object") # take lowest unit if all(is_datetime64_dtype(t) for t in types): diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 257c4fe3c6d30..a474790475d73 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -4,6 +4,7 @@ import numpy as np +from pandas.core.dtypes.cast import find_common_type from pandas.core.dtypes.common import ( is_bool_dtype, is_categorical_dtype, @@ -17,6 +18,9 @@ ) from pandas.core.dtypes.generic import ABCCategoricalIndex, ABCRangeIndex, ABCSeries +from pandas.core.arrays import ExtensionArray +from 
pandas.core.construction import array + def get_dtype_kinds(l): """ @@ -99,9 +103,23 @@ def is_nonempty(x) -> bool: single_dtype = len({x.dtype for x in to_concat}) == 1 any_ea = any(is_extension_array_dtype(x.dtype) for x in to_concat) - if any_ea and single_dtype and axis == 0: - cls = type(to_concat[0]) - return cls._concat_same_type(to_concat) + if any_ea and axis == 0: + if not single_dtype: + target_dtype = find_common_type([x.dtype for x in to_concat]) + + def cast(arr, dtype): + if is_extension_array_dtype(dtype): + if isinstance(arr, np.ndarray): + return array(arr, dtype=dtype, copy=False) + return arr.astype(dtype, copy=False) + + to_concat = [cast(arr, target_dtype) for arr in to_concat] + + if isinstance(to_concat[0], ExtensionArray): + cls = type(to_concat[0]) + return cls._concat_same_type(to_concat) + else: + np.concatenate(to_concat) elif "category" in typs: # this must be prior to concat_datetime, diff --git a/pandas/tests/arrays/integer/test_concat.py b/pandas/tests/arrays/integer/test_concat.py new file mode 100644 index 0000000000000..3ace35700bd3e --- /dev/null +++ b/pandas/tests/arrays/integer/test_concat.py @@ -0,0 +1,26 @@ +import pytest + +import pandas as pd +import pandas._testing as tm + + +@pytest.mark.parametrize( + "to_concat_dtypes, result_dtype", + [ + (["Int64", "Int64"], "Int64"), + (["UInt64", "UInt64"], "UInt64"), + (["Int8", "Int8"], "Int8"), + (["Int8", "Int16"], "Int16"), + (["UInt8", "Int8"], "Int16"), + (["Int32", "UInt32"], "Int64"), + # this still gives object (awaiting float extension dtype) + (["Int64", "UInt64"], "object"), + ], +) +def test_concat_series(to_concat_dtypes, result_dtype): + + result = pd.concat([pd.Series([1, 2, pd.NA], dtype=t) for t in to_concat_dtypes]) + expected = pd.concat([pd.Series([1, 2, pd.NA], dtype=object)] * 2).astype( + result_dtype + ) + tm.assert_series_equal(result, expected) From b1d9d682393061560fd218b54aeb6fc82bfd573a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 
17 Apr 2020 11:40:26 +0200 Subject: [PATCH 02/13] handle categoricals --- pandas/core/arrays/categorical.py | 4 +- pandas/core/dtypes/base.py | 4 ++ pandas/core/dtypes/concat.py | 66 ++++++----------------------- pandas/core/dtypes/dtypes.py | 19 ++++++++- pandas/core/internals/concat.py | 5 ++- pandas/tests/reshape/test_concat.py | 6 +-- 6 files changed, 45 insertions(+), 59 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index b3fb3459891e0..5ba670fc87762 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2349,9 +2349,9 @@ def _can_hold_na(self): @classmethod def _concat_same_type(self, to_concat): - from pandas.core.dtypes.concat import concat_categorical + from pandas.core.dtypes.concat import union_categoricals - return concat_categorical(to_concat) + return union_categoricals(to_concat) def isin(self, values): """ diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index a168b97b529e4..d7fb5a45152c6 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -344,6 +344,10 @@ def _get_common_type(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]: ------- Common dtype (np.dtype or ExtensionDtype) or None """ + # QUESTIONS: + # - do we guarantee that `dtypes` is already deduplicated? 
(list of uniques) + # - do we call this method if `len(dtypes) == 1`, or does this method + # need to handle that case if len(set(dtypes)) == 1: # only itself return self diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index a474790475d73..35ee1064d1189 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -108,8 +108,21 @@ def is_nonempty(x) -> bool: target_dtype = find_common_type([x.dtype for x in to_concat]) def cast(arr, dtype): + if ( + is_categorical_dtype(arr.dtype) + and isinstance(dtype, np.dtype) + and np.issubdtype(dtype, np.integer) + ): + # problem case: categorical of int -> gives int as result dtype, + # but categorical can contain NAs -> fall back to object dtype + try: + return arr.astype(dtype, copy=False) + except ValueError: + return arr.astype(object, copy=False) + if is_extension_array_dtype(dtype): if isinstance(arr, np.ndarray): + # numpy's astype cannot handle ExtensionDtypes return array(arr, dtype=dtype, copy=False) return arr.astype(dtype, copy=False) @@ -119,12 +132,7 @@ def cast(arr, dtype): cls = type(to_concat[0]) return cls._concat_same_type(to_concat) else: - np.concatenate(to_concat) - - elif "category" in typs: - # this must be prior to concat_datetime, - # to support Categorical + datetime-like - return concat_categorical(to_concat, axis=axis) + return np.concatenate(to_concat) elif _contains_datetime or "timedelta" in typs or _contains_period: return concat_datetime(to_concat, axis=axis, typs=typs) @@ -154,52 +162,6 @@ def cast(arr, dtype): return np.concatenate(to_concat, axis=axis) -def concat_categorical(to_concat, axis: int = 0): - """ - Concatenate an object/categorical array of arrays, each of which is a - single dtype - - Parameters - ---------- - to_concat : array of arrays - axis : int - Axis to provide concatenation in the current implementation this is - always 0, e.g. 
we only have 1D categoricals - - Returns - ------- - Categorical - A single array, preserving the combined dtypes - """ - # we could have object blocks and categoricals here - # if we only have a single categoricals then combine everything - # else its a non-compat categorical - categoricals = [x for x in to_concat if is_categorical_dtype(x.dtype)] - - # validate the categories - if len(categoricals) != len(to_concat): - pass - else: - # when all categories are identical - first = to_concat[0] - if all(first.is_dtype_equal(other) for other in to_concat[1:]): - return union_categoricals(categoricals) - - # extract the categoricals & coerce to object if needed - to_concat = [ - x._internal_get_values() - if is_categorical_dtype(x.dtype) - else np.asarray(x).ravel() - if not is_datetime64tz_dtype(x) - else np.asarray(x.astype(object)) - for x in to_concat - ] - result = concat_compat(to_concat) - if axis == 1: - result = result.reshape(1, len(result)) - return result - - def union_categoricals( to_union, sort_categories: bool = False, ignore_order: bool = False ): diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 8fe2b3c60d6d0..ede82039e427b 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -21,7 +21,7 @@ from pandas._libs.interval import Interval from pandas._libs.tslibs import NaT, Period, Timestamp, timezones -from pandas._typing import Ordered +from pandas._typing import DtypeObj, Ordered from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.generic import ABCCategoricalIndex, ABCDateOffset, ABCIndexClass @@ -640,6 +640,23 @@ def _is_boolean(self) -> bool: return is_bool_dtype(self.categories) + def _get_common_type(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]: + + # check if we have all categorical dtype with identical categories + if all(isinstance(x, CategoricalDtype) for x in dtypes): + first = dtypes[0] + if all(first == other for other in dtypes[1:]): + return first + + # 
extract the categories' dtype + non_cat_dtypes = [ + x.categories.dtype if isinstance(x, CategoricalDtype) else x for x in dtypes + ] + # TODO should categorical always give an answer? + from pandas.core.dtypes.cast import find_common_type + + return find_common_type(non_cat_dtypes) + @register_extension_dtype class DatetimeTZDtype(PandasExtensionDtype): diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index c8f4ec14545c7..18d9caff5c7c2 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -24,6 +24,7 @@ from pandas.core.dtypes.missing import isna import pandas.core.algorithms as algos +from pandas.core.arrays import ExtensionArray from pandas.core.internals.blocks import make_block from pandas.core.internals.managers import BlockManager @@ -65,13 +66,15 @@ def concatenate_block_managers( blk = join_units[0].block vals = [ju.block.values for ju in join_units] - if not blk.is_extension or blk.is_datetimetz or blk.is_categorical: + if not blk.is_extension or blk.is_datetimetz: # datetimetz and categorical can have the same type but multiple # dtypes, concatting does not necessarily preserve dtype values = concat_compat(vals, axis=blk.ndim - 1) else: # TODO(EA2D): special-casing not needed with 2D EAs values = concat_compat(vals) + if not isinstance(values, ExtensionArray): + values = values.reshape(1, len(values)) b = make_block(values, placement=placement, ndim=blk.ndim) else: diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index bccae2c4c2772..22f15db03a772 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -610,11 +610,11 @@ def test_concat_categorical_3elem_coercion(self): s2 = pd.Series([2, 1, 2], dtype="category") s3 = pd.Series([1, 2, 1, 2, np.nan]) - exp = pd.Series([1, 2, np.nan, 2, 1, 2, 1, 2, 1, 2, np.nan], dtype="object") + exp = pd.Series([1, 2, np.nan, 2, 1, 2, 1, 2, 1, 2, np.nan], dtype="float") 
tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp) tm.assert_series_equal(s1.append([s2, s3], ignore_index=True), exp) - exp = pd.Series([1, 2, 1, 2, np.nan, 1, 2, np.nan, 2, 1, 2], dtype="object") + exp = pd.Series([1, 2, 1, 2, np.nan, 1, 2, np.nan, 2, 1, 2], dtype="float") tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp) tm.assert_series_equal(s3.append([s1, s2], ignore_index=True), exp) @@ -698,7 +698,7 @@ def test_concat_categorical_coercion_nan(self): s1 = pd.Series([1, np.nan], dtype="category") s2 = pd.Series([np.nan, np.nan]) - exp = pd.Series([1, np.nan, np.nan, np.nan], dtype="object") + exp = pd.Series([1, np.nan, np.nan, np.nan], dtype="float") tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) From bb398e78df218e16704806cd28da883203908c30 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 17 Apr 2020 14:12:08 +0200 Subject: [PATCH 03/13] handle sparse --- pandas/core/arrays/sparse/array.py | 22 +------ pandas/core/arrays/sparse/dtype.py | 26 +++++++- pandas/core/dtypes/base.py | 1 + pandas/core/dtypes/concat.py | 83 ++++++++---------------- pandas/tests/internals/test_internals.py | 2 +- 5 files changed, 54 insertions(+), 80 deletions(-) diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 620e157ee54ec..ff564ffb5f4a2 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -952,27 +952,7 @@ def copy(self): @classmethod def _concat_same_type(cls, to_concat): - fill_values = [x.fill_value for x in to_concat] - - fill_value = fill_values[0] - - # np.nan isn't a singleton, so we may end up with multiple - # NaNs here, so we ignore tha all NA case too. - if not (len(set(fill_values)) == 1 or isna(fill_values).all()): - warnings.warn( - "Concatenating sparse arrays with multiple fill " - f"values: '{fill_values}'. 
Picking the first and " - "converting the rest.", - PerformanceWarning, - stacklevel=6, - ) - keep = to_concat[0] - to_concat2 = [keep] - - for arr in to_concat[1:]: - to_concat2.append(cls(np.asarray(arr), fill_value=fill_value)) - - to_concat = to_concat2 + fill_value = to_concat[0].fill_value values = [] length = 0 diff --git a/pandas/core/arrays/sparse/dtype.py b/pandas/core/arrays/sparse/dtype.py index afa11586fda04..24f356b4160ab 100644 --- a/pandas/core/arrays/sparse/dtype.py +++ b/pandas/core/arrays/sparse/dtype.py @@ -1,11 +1,13 @@ """Sparse Dtype""" import re -from typing import TYPE_CHECKING, Any, Tuple, Type +from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Type +import warnings import numpy as np -from pandas._typing import Dtype +from pandas._typing import Dtype, DtypeObj +from pandas.errors import PerformanceWarning from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.cast import astype_nansafe @@ -352,3 +354,23 @@ def _subtype_with_str(self): if isinstance(self.fill_value, str): return type(self.fill_value) return self.subtype + + def _get_common_type(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]: + + fill_values = [x.fill_value for x in dtypes if isinstance(x, SparseDtype)] + fill_value = fill_values[0] + + # np.nan isn't a singleton, so we may end up with multiple + # NaNs here, so we ignore tha all NA case too. + if not (len(set(fill_values)) == 1 or isna(fill_values).all()): + warnings.warn( + "Concatenating sparse arrays with multiple fill " + f"values: '{fill_values}'. 
Picking the first and " + "converting the rest.", + PerformanceWarning, + stacklevel=6, + ) + + # TODO also handle non-numpy other dtypes + np_dtypes = [x.subtype if isinstance(x, SparseDtype) else x for x in dtypes] + return SparseDtype(np.find_common_type(np_dtypes, []), fill_value=fill_value) diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index d7fb5a45152c6..e85c5ae4d9804 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -348,6 +348,7 @@ def _get_common_type(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]: # - do we guarantee that `dtypes` is already deduplicated? (list of uniques) # - do we call this method if `len(dtypes) == 1`, or does this method # need to handle that case + # - does this method need to handle "non-fully-initialized" dtypes? if len(set(dtypes)) == 1: # only itself return self diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 35ee1064d1189..08c98d15b557d 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -4,6 +4,8 @@ import numpy as np +from pandas._typing import ArrayLike, DtypeObj + from pandas.core.dtypes.cast import find_common_type from pandas.core.dtypes.common import ( is_bool_dtype, @@ -62,6 +64,30 @@ def get_dtype_kinds(l): return typs +def _cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike: + """ + Helper function for `arr.astype(common_type)` but handling all special + cases. 
+ """ + if ( + is_categorical_dtype(arr.dtype) + and isinstance(dtype, np.dtype) + and np.issubdtype(dtype, np.integer) + ): + # problem case: categorical of int -> gives int as result dtype, + # but categorical can contain NAs -> fall back to object dtype + try: + return arr.astype(dtype, copy=False) + except ValueError: + return arr.astype(object, copy=False) + + if is_extension_array_dtype(dtype): + if isinstance(arr, np.ndarray): + # numpy's astype cannot handle ExtensionDtypes + return array(arr, dtype=dtype, copy=False) + return arr.astype(dtype, copy=False) + + def concat_compat(to_concat, axis: int = 0): """ provide concatenation of an array of arrays each of which is a single @@ -106,27 +132,7 @@ def is_nonempty(x) -> bool: if any_ea and axis == 0: if not single_dtype: target_dtype = find_common_type([x.dtype for x in to_concat]) - - def cast(arr, dtype): - if ( - is_categorical_dtype(arr.dtype) - and isinstance(dtype, np.dtype) - and np.issubdtype(dtype, np.integer) - ): - # problem case: categorical of int -> gives int as result dtype, - # but categorical can contain NAs -> fall back to object dtype - try: - return arr.astype(dtype, copy=False) - except ValueError: - return arr.astype(object, copy=False) - - if is_extension_array_dtype(dtype): - if isinstance(arr, np.ndarray): - # numpy's astype cannot handle ExtensionDtypes - return array(arr, dtype=dtype, copy=False) - return arr.astype(dtype, copy=False) - - to_concat = [cast(arr, target_dtype) for arr in to_concat] + to_concat = [_cast_to_common_type(arr, target_dtype) for arr in to_concat] if isinstance(to_concat[0], ExtensionArray): cls = type(to_concat[0]) @@ -137,10 +143,6 @@ def cast(arr, dtype): elif _contains_datetime or "timedelta" in typs or _contains_period: return concat_datetime(to_concat, axis=axis, typs=typs) - # these are mandated to handle empties as well - elif "sparse" in typs: - return _concat_sparse(to_concat, axis=axis, typs=typs) - elif any_ea and axis == 1: to_concat = 
[np.atleast_2d(x.astype("object")) for x in to_concat] return np.concatenate(to_concat, axis=axis) @@ -394,34 +396,3 @@ def _wrap_datetimelike(arr): if isinstance(arr, np.ndarray) and arr.dtype.kind in ["m", "M"]: arr = pd_array(arr) return arr - - -def _concat_sparse(to_concat, axis=0, typs=None): - """ - provide concatenation of an sparse/dense array of arrays each of which is a - single dtype - - Parameters - ---------- - to_concat : array of arrays - axis : axis to provide concatenation - typs : set of to_concat dtypes - - Returns - ------- - a single array, preserving the combined dtypes - """ - from pandas.core.arrays import SparseArray - - fill_values = [x.fill_value for x in to_concat if isinstance(x, SparseArray)] - fill_value = fill_values[0] - - # TODO: Fix join unit generation so we aren't passed this. - to_concat = [ - x - if isinstance(x, SparseArray) - else SparseArray(x.squeeze(), fill_value=fill_value) - for x in to_concat - ] - - return SparseArray._concat_same_type(to_concat) diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index f1d4c865a0ced..7ebf5e5ecefcd 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -584,7 +584,7 @@ def test_interleave_dtype(self, mgr_string, dtype): mgr = create_mgr("a: complex") assert mgr.as_array().dtype == "complex" mgr = create_mgr("a: f8; b: category") - assert mgr.as_array().dtype == "object" + assert mgr.as_array().dtype == "f8" mgr = create_mgr("a: M8[ns]; b: category") assert mgr.as_array().dtype == "object" mgr = create_mgr("a: M8[ns]; b: bool") From 83fdc9136704e39055c0483ddbd7754ac7a2de23 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 17 Apr 2020 14:26:58 +0200 Subject: [PATCH 04/13] handle non-initialized CategoricalDtype in find_common_type --- pandas/core/dtypes/dtypes.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/pandas/core/dtypes/dtypes.py 
b/pandas/core/dtypes/dtypes.py index ede82039e427b..dc311f0128b2c 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -641,13 +641,22 @@ def _is_boolean(self) -> bool: return is_bool_dtype(self.categories) def _get_common_type(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]: - # check if we have all categorical dtype with identical categories if all(isinstance(x, CategoricalDtype) for x in dtypes): first = dtypes[0] if all(first == other for other in dtypes[1:]): return first + # special case non-initialized categorical + # TODO we should figure out the expected return value in general + non_init_cats = [ + isinstance(x, CategoricalDtype) and x.categories is None for x in dtypes + ] + if all(non_init_cats): + return self + elif any(non_init_cats): + return None + # extract the categories' dtype non_cat_dtypes = [ x.categories.dtype if isinstance(x, CategoricalDtype) else x for x in dtypes From 7f2ac2ae43789caf15afd3d77c8e8bca95324031 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 17 Apr 2020 15:51:15 +0200 Subject: [PATCH 05/13] handle datetimelike special case --- pandas/core/dtypes/concat.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 08c98d15b557d..0b8d2efbfd298 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -81,6 +81,16 @@ def _cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike: except ValueError: return arr.astype(object, copy=False) + if ( + isinstance(arr, np.ndarray) + and arr.dtype.kind in ["m", "M"] + and dtype is np.dtype("object") + ): + # wrap datetime-likes in EA to ensure astype(object) gives Timestamp/Timedelta + # this can happen when concat_compat is called directly on arrays (when arrays + # are not coming from Index/Series._values), eg in BlockManager.quantile + arr = array(arr) + if is_extension_array_dtype(dtype): if isinstance(arr, np.ndarray): # 
numpy's astype cannot handle ExtensionDtypes @@ -123,7 +133,6 @@ def is_nonempty(x) -> bool: typs = get_dtype_kinds(to_concat) _contains_datetime = any(typ.startswith("datetime") for typ in typs) - _contains_period = any(typ.startswith("period") for typ in typs) all_empty = not len(non_empties) single_dtype = len({x.dtype for x in to_concat}) == 1 @@ -140,7 +149,7 @@ def is_nonempty(x) -> bool: else: return np.concatenate(to_concat) - elif _contains_datetime or "timedelta" in typs or _contains_period: + elif _contains_datetime or "timedelta" in typs: return concat_datetime(to_concat, axis=axis, typs=typs) elif any_ea and axis == 1: From d0f90def968f752ced92cbd7fd8103f8627906b9 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 17 Apr 2020 19:47:28 +0200 Subject: [PATCH 06/13] update docstring and comment --- pandas/core/dtypes/base.py | 5 +++-- pandas/core/dtypes/concat.py | 2 +- pandas/core/internals/concat.py | 4 +--- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index e85c5ae4d9804..867662f450e8d 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -331,8 +331,9 @@ def _get_common_type(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]: Used in `find_common_type` implementation. This is for example used to determine the resulting dtype in a concat operation. - If no common dtype exists, return None. If all dtypes in the list - will return None, then the common dtype will be "object" dtype. + If no common dtype exists, return None (which gives the other dtypes + the chance to determine a common dtype). If all dtypes in the list + return None, then the common dtype will be "object" dtype. 
Parameters ---------- diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 0b8d2efbfd298..82b2795582ff1 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -66,7 +66,7 @@ def get_dtype_kinds(l): def _cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike: """ - Helper function for `arr.astype(common_type)` but handling all special + Helper function for `arr.astype(common_dtype)` but handling all special cases. """ if ( diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 18d9caff5c7c2..df139915d1593 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -66,9 +66,7 @@ def concatenate_block_managers( blk = join_units[0].block vals = [ju.block.values for ju in join_units] - if not blk.is_extension or blk.is_datetimetz: - # datetimetz and categorical can have the same type but multiple - # dtypes, concatting does not necessarily preserve dtype + if not blk.is_extension: values = concat_compat(vals, axis=blk.ndim - 1) else: # TODO(EA2D): special-casing not needed with 2D EAs From 2d5fcb0a8cfe26c89eb6e25db2d4b195a1be427a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 17 Apr 2020 19:50:05 +0200 Subject: [PATCH 07/13] ignore mypy --- pandas/core/arrays/integer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 1f434f0d18db0..44fcec0d68c91 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -99,7 +99,9 @@ def _get_common_type(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]: # for now only handle other integer types if not all(isinstance(t, _IntegerDtype) for t in dtypes): return None - np_dtype = np.find_common_type([t.numpy_dtype for t in dtypes], []) + np_dtype = np.find_common_type( + [t.numpy_dtype for t in dtypes], [] # type: ignore + ) if np.issubdtype(np_dtype, np.integer): return _dtypes[str(np_dtype)] 
return None From fc98b652e48e73d7bb6526618469a1550ccbbfcf Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 1 May 2020 15:07:14 +0200 Subject: [PATCH 08/13] common_type -> commong_dtype --- pandas/core/arrays/integer.py | 2 +- pandas/core/arrays/sparse/dtype.py | 2 +- pandas/core/dtypes/base.py | 2 +- pandas/core/dtypes/cast.py | 2 +- pandas/core/dtypes/dtypes.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 72f98e413c280..743267534bfaa 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -96,7 +96,7 @@ def construct_array_type(cls) -> Type["IntegerArray"]: """ return IntegerArray - def _get_common_type(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]: + def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]: # for now only handle other integer types if not all(isinstance(t, _IntegerDtype) for t in dtypes): return None diff --git a/pandas/core/arrays/sparse/dtype.py b/pandas/core/arrays/sparse/dtype.py index 24f356b4160ab..156a90f6ce600 100644 --- a/pandas/core/arrays/sparse/dtype.py +++ b/pandas/core/arrays/sparse/dtype.py @@ -355,7 +355,7 @@ def _subtype_with_str(self): return type(self.fill_value) return self.subtype - def _get_common_type(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]: + def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]: fill_values = [x.fill_value for x in dtypes if isinstance(x, SparseDtype)] fill_value = fill_values[0] diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index 867662f450e8d..1c1a9dffd0288 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -324,7 +324,7 @@ def _is_boolean(self) -> bool: """ return False - def _get_common_type(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]: + def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]: """ Return the common dtype, if one exists. 
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 2eef3ad3ed7c2..7203187e630d6 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1453,7 +1453,7 @@ def find_common_type(types): if any(isinstance(t, ExtensionDtype) for t in types): for t in types: if isinstance(t, ExtensionDtype): - res = t._get_common_type(types) + res = t._get_common_dtype(types) if res is not None: return res return np.dtype("object") diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index dc311f0128b2c..ceed7e29e4a35 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -640,7 +640,7 @@ def _is_boolean(self) -> bool: return is_bool_dtype(self.categories) - def _get_common_type(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]: + def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]: # check if we have all categorical dtype with identical categories if all(isinstance(x, CategoricalDtype) for x in dtypes): first = dtypes[0] From b0725917f6b4a5da6f42b83128019ae1a2377791 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 1 May 2020 15:18:39 +0200 Subject: [PATCH 09/13] ensure deduplicated list of dtypes is passed --- pandas/core/dtypes/base.py | 5 ----- pandas/core/dtypes/cast.py | 3 +++ 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index 1c1a9dffd0288..0f15b0fe03bb1 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -345,11 +345,6 @@ def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]: ------- Common dtype (np.dtype or ExtensionDtype) or None """ - # QUESTIONS: - # - do we guarantee that `dtypes` is already deduplicated? (list of uniques) - # - do we call this method if `len(dtypes) == 1`, or does this method - # need to handle that case - # - does this method need to handle "non-fully-initialized" dtypes? 
if len(set(dtypes)) == 1: # only itself return self diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 7203187e630d6..977993a5a2cea 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1450,6 +1450,9 @@ def find_common_type(types): if all(is_dtype_equal(first, t) for t in types[1:]): return first + # get unique types (dict.fromkeys is used as order-preserving set()) + types = list(dict.fromkeys(types).keys()) + if any(isinstance(t, ExtensionDtype) for t in types): for t in types: if isinstance(t, ExtensionDtype): From 91c984a55e8974a257473d3694a0e982a5564cb7 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 1 May 2020 15:24:45 +0200 Subject: [PATCH 10/13] add very basic base extension test --- pandas/tests/extension/base/dtype.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/tests/extension/base/dtype.py b/pandas/tests/extension/base/dtype.py index ee4e199fbfe45..65e32d716a4db 100644 --- a/pandas/tests/extension/base/dtype.py +++ b/pandas/tests/extension/base/dtype.py @@ -112,3 +112,10 @@ def test_construct_from_string_wrong_type_raises(self, dtype): match="'construct_from_string' expects a string, got ", ): type(dtype).construct_from_string(0) + + def test_get_common_dtype(self, dtype): + # in practice we will not typically call this with a 1-length list + # (we shortcut to just use that dtype as the common dtype), but + # still testing as good practice to have this working (and it is the + # only case we can test in general) + assert dtype._get_common_dtype([dtype]) == dtype From 2a2b9d58667c8cded0bba92dacf74f01f43e69f2 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 1 May 2020 15:35:54 +0200 Subject: [PATCH 11/13] document API change --- doc/source/whatsnew/v1.1.0.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 7ad7e8f5a27b0..e4ef752a33635 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ 
b/doc/source/whatsnew/v1.1.0.rst @@ -241,6 +241,9 @@ Backwards incompatible API changes - :meth:`DataFrame.at` and :meth:`Series.at` will raise a ``TypeError`` instead of a ``ValueError`` if an incompatible key is passed, and ``KeyError`` if a missing key is passed, matching the behavior of ``.loc[]`` (:issue:`31722`) - Passing an integer dtype other than ``int64`` to ``np.array(period_index, dtype=...)`` will now raise ``TypeError`` instead of incorrectly using ``int64`` (:issue:`32255`) - Passing an invalid ``fill_value`` to :meth:`Categorical.take` raises a ``ValueError`` instead of ``TypeError`` (:issue:`33660`) +- Combining a ``Categorical`` that has integer categories and contains missing values + with a float dtype column in operations such as :func:`concat` or :meth:`~DataFrame.append` + will now result in a float column instead of an object dtyped column (:issue:`33607`) ``MultiIndex.get_indexer`` interprets `method` argument differently ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ From 8893165c688c1d6881e0f6455ca4aa10ccb64eac Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 1 May 2020 15:43:07 +0200 Subject: [PATCH 12/13] update EA interface docs --- pandas/core/arrays/base.py | 7 ++++++- pandas/core/dtypes/base.py | 6 ++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 7447d593a7ff0..bd903d9b1fae3 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1004,7 +1004,7 @@ def _concat_same_type( cls, to_concat: Sequence["ExtensionArray"] ) -> "ExtensionArray": """ - Concatenate multiple array. + Concatenate multiple arrays of this dtype. Parameters ---------- @@ -1014,6 +1014,11 @@ def _concat_same_type( ------- ExtensionArray """ + # Implementer note: this method will only be called with a sequence of + # ExtensionArrays of this class and with the same dtype as self.
This + # should allow "easy" concatenation (no upcasting needed), and result + # in a new ExtensionArray of the same dtype. + # Note: this strict behaviour is only guaranteed starting with pandas 1.1 raise AbstractMethodError(cls) # The _can_hold_na attribute is set to True so that pandas internals diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index 0f15b0fe03bb1..2d81dd4d884a3 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -34,11 +34,12 @@ class ExtensionDtype: * type * name - The following attributes influence the behavior of the dtype in + The following attributes and methods influence the behavior of the dtype in pandas operations * _is_numeric * _is_boolean + * _get_common_dtype Optionally one can override construct_array_type for construction with the name of this dtype via the Registry. See @@ -333,7 +334,8 @@ def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]: If no common dtype exists, return None (which gives the other dtypes the chance to determine a common dtype). If all dtypes in the list - return None, then the common dtype will be "object" dtype. + return None, then the common dtype will be "object" dtype (this means + it is never necessary to return "object" dtype from this method itself).
Parameters ---------- From e19e3ef93d95a5d96f47812371d47c36af814245 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 1 May 2020 15:46:23 +0200 Subject: [PATCH 13/13] add type annotation on find_common_type --- pandas/core/dtypes/cast.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 977993a5a2cea..ad307fd99ec9c 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -3,7 +3,7 @@ """ from datetime import date, datetime, timedelta -from typing import TYPE_CHECKING, Any, Optional, Tuple, Type +from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Type import numpy as np @@ -1423,7 +1423,7 @@ def maybe_cast_to_datetime(value, dtype, errors: str = "raise"): return value -def find_common_type(types): +def find_common_type(types: List[DtypeObj]) -> DtypeObj: """ Find a common data type among the given dtypes.