ENH: general concat with ExtensionArrays through find_common_type #33607

Merged: 14 commits, May 2, 2020
Changes from 2 commits
4 changes: 2 additions & 2 deletions pandas/core/arrays/categorical.py
@@ -2349,9 +2349,9 @@ def _can_hold_na(self):

    @classmethod
    def _concat_same_type(self, to_concat):
-        from pandas.core.dtypes.concat import concat_categorical
+        from pandas.core.dtypes.concat import union_categoricals

-        return concat_categorical(to_concat)
+        return union_categoricals(to_concat)
Member:
IIRC only a relatively small part of the logic of concat_categorical/union_categoricals is actually needed here. I'd prefer for that to live here and for union_categoricals to call it, rather than the other way around (since union_categoricals handles a lot of cases). Could be considered orthogonally to this PR.

Member Author:

> I'd prefer for that to live here and for union_categoricals to call it, rather than the other way around

Yes, that's indeed a good idea (union_categoricals does a lot more that isn't needed here)

Member:

are you planning to update this, or is that a topic for a separate PR?
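As context for the change above: `union_categoricals` (public under `pandas.api.types`) combines categoricals by taking the union of their categories. A minimal usage sketch, not part of the diff:

```python
import pandas as pd
from pandas.api.types import union_categoricals

# combine two categoricals, taking the union of their categories
a = pd.Categorical(["a", "b"])
b = pd.Categorical(["b", "c"])
combined = union_categoricals([a, b])
print(list(combined))             # ['a', 'b', 'b', 'c']
print(list(combined.categories))  # ['a', 'b', 'c']
```

It also handles sorting and ordered-categorical cases, which is the "a lot of cases" the reviewer refers to.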


    def isin(self, values):
        """
13 changes: 11 additions & 2 deletions pandas/core/arrays/integer.py
@@ -1,11 +1,11 @@
import numbers
-from typing import TYPE_CHECKING, Tuple, Type, Union
+from typing import TYPE_CHECKING, List, Optional, Tuple, Type, Union
import warnings

import numpy as np

from pandas._libs import lib, missing as libmissing
-from pandas._typing import ArrayLike
+from pandas._typing import ArrayLike, DtypeObj
from pandas.compat import set_function_name
from pandas.util._decorators import cache_readonly
@@ -95,6 +95,15 @@ def construct_array_type(cls) -> Type["IntegerArray"]:
        """
        return IntegerArray

    def _get_common_type(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]:
Member:

should this be common_type or common_dtype? We've been loose about this distinction so far and I think it has caused ambiguity

Member Author:

I don't care that much. I mainly used "type", because it is meant to be used in find_common_type.

(that find_common_type name is inspired by the numpy function, and that one actually handles both dtypes and scalar types, which I assume is the reason for the name. The pandas version, though, doesn't really make the distinction, so it could have been named "find_common_dtype")

Member Author:

Renamed to "common_dtype" instead of "common_type". The internal function that uses this is still find_common_type, but that name from numpy is actually a misnomer here, since we are only dealing with dtypes, and not scalar types.

Member:

Thanks for indulging me on this nitpick

Contributor:

why is this a private method on the Dtype? get_common_type (or get_common_dtype) seems fine

        # for now only handle other integer types
        if not all(isinstance(t, _IntegerDtype) for t in dtypes):
            return None
        np_dtype = np.find_common_type([t.numpy_dtype for t in dtypes], [])
        if np.issubdtype(np_dtype, np.integer):
            return _dtypes[str(np_dtype)]
        return None
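Once this hook is wired into concat, the promotion becomes observable from the user side. A behavior sketch, not part of the diff (dtype strings as used by the nullable integer arrays):

```python
import pandas as pd

# Int8 and Int16 promote to Int16, following numpy's integer promotion rules
res_a = pd.concat([pd.Series([1, 2], dtype="Int8"), pd.Series([3], dtype="Int16")])
print(res_a.dtype)  # Int16

# a signed/unsigned mix promotes to the next wider signed type
res_b = pd.concat([pd.Series([1], dtype="UInt8"), pd.Series([2], dtype="Int8")])
print(res_b.dtype)  # Int16
```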

    def __from_arrow__(
        self, array: Union["pyarrow.Array", "pyarrow.ChunkedArray"]
    ) -> "IntegerArray":
31 changes: 31 additions & 0 deletions pandas/core/dtypes/base.py
@@ -6,6 +6,7 @@

import numpy as np

+from pandas._typing import DtypeObj
from pandas.errors import AbstractMethodError

from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries
@@ -322,3 +323,33 @@ def _is_boolean(self) -> bool:
        bool
        """
        return False

    def _get_common_type(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]:
Contributor:

Mmm can we keep the return type as ExtensionDtype? Do you envision cases where we'd like to return a plain NumPy dtype?

Oh... I suppose tz-naive DatetimeArray might break this, since it wants to return a NumPy dtype...

Member Author:

Yes, that was my first thought as well. But right now, e.g. Categorical can end up with any kind of numpy dtype (depending on the dtype of its categories).

As long as not all dtypes have an EA version yet, I don't think it is feasible to require ExtensionDtype here

"""
Return the common dtype, if one exists.

Used in `find_common_type` implementation. This is for example used
to determine the resulting dtype in a concat operation.

If no common dtype exists, return None. If all dtypes in the list
will return None, then the common dtype will be "object" dtype.

Parameters
----------
dtypes : list of dtypes
The dtypes for which to determine a common dtype. This is a list
of np.dtype or ExtensionDtype instances.

Returns
-------
Common dtype (np.dtype or ExtensionDtype) or None
jorisvandenbossche marked this conversation as resolved.
Show resolved Hide resolved
"""
# QUESTIONS:
# - do we guarantee that `dtypes` is already deduplicated? (list of uniques)
# - do we call this method if `len(dtypes) == 1`, or does this method
# need to handle that case
if len(set(dtypes)) == 1:
# only itself
return self
else:
return None
7 changes: 6 additions & 1 deletion pandas/core/dtypes/cast.py
@@ -1474,7 +1474,12 @@ def find_common_type(types):
        return first

    if any(isinstance(t, ExtensionDtype) for t in types):
-        return np.object
+        for t in types:
+            if isinstance(t, ExtensionDtype):
+                res = t._get_common_type(types)
+                if res is not None:
+                    return res
+        return np.dtype("object")

    # take lowest unit
    if all(is_datetime64_dtype(t) for t in types):
88 changes: 34 additions & 54 deletions pandas/core/dtypes/concat.py
@@ -4,6 +4,7 @@

import numpy as np

+from pandas.core.dtypes.cast import find_common_type
from pandas.core.dtypes.common import (
    is_bool_dtype,
    is_categorical_dtype,
@@ -17,6 +18,9 @@
)
from pandas.core.dtypes.generic import ABCCategoricalIndex, ABCRangeIndex, ABCSeries

+from pandas.core.arrays import ExtensionArray
+from pandas.core.construction import array


def get_dtype_kinds(l):
    """
@@ -99,14 +103,36 @@ def is_nonempty(x) -> bool:
    single_dtype = len({x.dtype for x in to_concat}) == 1
    any_ea = any(is_extension_array_dtype(x.dtype) for x in to_concat)

-    if any_ea and single_dtype and axis == 0:
-        cls = type(to_concat[0])
-        return cls._concat_same_type(to_concat)
-
-    elif "category" in typs:
-        # this must be prior to concat_datetime,
-        # to support Categorical + datetime-like
-        return concat_categorical(to_concat, axis=axis)
+    if any_ea and axis == 0:
+        if not single_dtype:
+            target_dtype = find_common_type([x.dtype for x in to_concat])
+
+            def cast(arr, dtype):
+                if (
+                    is_categorical_dtype(arr.dtype)
+                    and isinstance(dtype, np.dtype)
+                    and np.issubdtype(dtype, np.integer)
+                ):
+                    # problem case: categorical of int -> gives int as result dtype,
+                    # but categorical can contain NAs -> fall back to object dtype
+                    try:
+                        return arr.astype(dtype, copy=False)
+                    except ValueError:
+                        return arr.astype(object, copy=False)
Member Author:

This complication is to try to preserve some of the value-dependent behaviour of Categorical (in case of integer categories: missing values present or not?)

E.g. when concatting an integer categorical with an integer series:

pd.concat([pd.Series([1, 2], dtype="category"), pd.Series([3, 4])])
# -> results in int dtype
pd.concat([pd.Series([1, None], dtype="category"), pd.Series([3, 4])])
# -> results in object dtype

Currently, when concatting, a Categorical with integer categories gets converted to int numpy array if no missing values are present, but object numpy array if missing values are present (to preserve the integers)


+                if is_extension_array_dtype(dtype):
+                    if isinstance(arr, np.ndarray):
+                        # numpy's astype cannot handle ExtensionDtypes
+                        return array(arr, dtype=dtype, copy=False)
+                return arr.astype(dtype, copy=False)
+
+        to_concat = [cast(arr, target_dtype) for arr in to_concat]
+
+        if isinstance(to_concat[0], ExtensionArray):
+            cls = type(to_concat[0])
+            return cls._concat_same_type(to_concat)
+        else:
+            return np.concatenate(to_concat)

    elif _contains_datetime or "timedelta" in typs or _contains_period:
        return concat_datetime(to_concat, axis=axis, typs=typs)
Member:

if we do the DTA/TDA casting above, and do isinstance(obj, ExtensionArray) checks, can all of the dt64/td64 cases be handled by the EA code above?

Member Author:

I don't think so, because they are not using ExtensionDtype.

@@ -136,52 +162,6 @@ def is_nonempty(x) -> bool:
    return np.concatenate(to_concat, axis=axis)


-def concat_categorical(to_concat, axis: int = 0):
-    """
-    Concatenate an object/categorical array of arrays, each of which is a
-    single dtype
-
-    Parameters
-    ----------
-    to_concat : array of arrays
-    axis : int
-        Axis to provide concatenation in the current implementation this is
-        always 0, e.g. we only have 1D categoricals
-
-    Returns
-    -------
-    Categorical
-        A single array, preserving the combined dtypes
-    """
-    # we could have object blocks and categoricals here
-    # if we only have a single categoricals then combine everything
-    # else its a non-compat categorical
-    categoricals = [x for x in to_concat if is_categorical_dtype(x.dtype)]
-
-    # validate the categories
-    if len(categoricals) != len(to_concat):
-        pass
-    else:
-        # when all categories are identical
-        first = to_concat[0]
-        if all(first.is_dtype_equal(other) for other in to_concat[1:]):
-            return union_categoricals(categoricals)
-
-    # extract the categoricals & coerce to object if needed
-    to_concat = [
-        x._internal_get_values()
-        if is_categorical_dtype(x.dtype)
-        else np.asarray(x).ravel()
-        if not is_datetime64tz_dtype(x)
-        else np.asarray(x.astype(object))
-        for x in to_concat
-    ]
-    result = concat_compat(to_concat)
-    if axis == 1:
-        result = result.reshape(1, len(result))
-    return result


def union_categoricals(
    to_union, sort_categories: bool = False, ignore_order: bool = False
):
19 changes: 18 additions & 1 deletion pandas/core/dtypes/dtypes.py
@@ -21,7 +21,7 @@

from pandas._libs.interval import Interval
from pandas._libs.tslibs import NaT, Period, Timestamp, timezones
-from pandas._typing import Ordered
+from pandas._typing import DtypeObj, Ordered

from pandas.core.dtypes.base import ExtensionDtype
from pandas.core.dtypes.generic import ABCCategoricalIndex, ABCDateOffset, ABCIndexClass
@@ -640,6 +640,23 @@ def _is_boolean(self) -> bool:

        return is_bool_dtype(self.categories)

    def _get_common_type(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]:

        # check if we have all categorical dtype with identical categories
        if all(isinstance(x, CategoricalDtype) for x in dtypes):
            first = dtypes[0]
            if all(first == other for other in dtypes[1:]):
                return first

        # extract the categories' dtype
        non_cat_dtypes = [
            x.categories.dtype if isinstance(x, CategoricalDtype) else x for x in dtypes
        ]
        # TODO should categorical always give an answer?
        from pandas.core.dtypes.cast import find_common_type

        return find_common_type(non_cat_dtypes)
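The observable effect of this hook through pd.concat can be sketched as follows (a behavior sketch, not part of the diff):

```python
import pandas as pd

# identical categorical dtypes: the concat result stays categorical
c1 = pd.Series(["a", "b"], dtype=pd.CategoricalDtype(["a", "b"]))
c2 = pd.Series(["b", "a"], dtype=pd.CategoricalDtype(["a", "b"]))
same = pd.concat([c1, c2])
print(same.dtype)  # category

# different categories: fall back to the common dtype of the categories
# themselves (object for these string categories)
d1 = pd.Series(["a", "b"], dtype="category")
d2 = pd.Series(["b", "c"], dtype="category")
diff = pd.concat([d1, d2])
print(diff.dtype)  # object
```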


@register_extension_dtype
class DatetimeTZDtype(PandasExtensionDtype):
5 changes: 4 additions & 1 deletion pandas/core/internals/concat.py
@@ -24,6 +24,7 @@
from pandas.core.dtypes.missing import isna

import pandas.core.algorithms as algos
+from pandas.core.arrays import ExtensionArray
from pandas.core.internals.blocks import make_block
from pandas.core.internals.managers import BlockManager

@@ -65,13 +66,15 @@ def concatenate_block_managers(
            blk = join_units[0].block
            vals = [ju.block.values for ju in join_units]

-            if not blk.is_extension or blk.is_datetimetz or blk.is_categorical:
+            if not blk.is_extension or blk.is_datetimetz:
                # datetimetz and categorical can have the same type but multiple
                # dtypes, concatting does not necessarily preserve dtype
                values = concat_compat(vals, axis=blk.ndim - 1)
            else:
                # TODO(EA2D): special-casing not needed with 2D EAs
                values = concat_compat(vals)
+                if not isinstance(values, ExtensionArray):
+                    values = values.reshape(1, len(values))

            b = make_block(values, placement=placement, ndim=blk.ndim)
        else:
26 changes: 26 additions & 0 deletions pandas/tests/arrays/integer/test_concat.py
@@ -0,0 +1,26 @@
import pytest

import pandas as pd
import pandas._testing as tm


@pytest.mark.parametrize(
    "to_concat_dtypes, result_dtype",
    [
        (["Int64", "Int64"], "Int64"),
        (["UInt64", "UInt64"], "UInt64"),
        (["Int8", "Int8"], "Int8"),
        (["Int8", "Int16"], "Int16"),
        (["UInt8", "Int8"], "Int16"),
        (["Int32", "UInt32"], "Int64"),
        # this still gives object (awaiting float extension dtype)
        (["Int64", "UInt64"], "object"),
    ],
)
def test_concat_series(to_concat_dtypes, result_dtype):

    result = pd.concat([pd.Series([1, 2, pd.NA], dtype=t) for t in to_concat_dtypes])
    expected = pd.concat([pd.Series([1, 2, pd.NA], dtype=object)] * 2).astype(
        result_dtype
    )
    tm.assert_series_equal(result, expected)
6 changes: 3 additions & 3 deletions pandas/tests/reshape/test_concat.py
@@ -610,11 +610,11 @@ def test_concat_categorical_3elem_coercion(self):
        s2 = pd.Series([2, 1, 2], dtype="category")
        s3 = pd.Series([1, 2, 1, 2, np.nan])

-        exp = pd.Series([1, 2, np.nan, 2, 1, 2, 1, 2, 1, 2, np.nan], dtype="object")
+        exp = pd.Series([1, 2, np.nan, 2, 1, 2, 1, 2, 1, 2, np.nan], dtype="float")
        tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp)
        tm.assert_series_equal(s1.append([s2, s3], ignore_index=True), exp)

-        exp = pd.Series([1, 2, 1, 2, np.nan, 1, 2, np.nan, 2, 1, 2], dtype="object")
+        exp = pd.Series([1, 2, 1, 2, np.nan, 1, 2, np.nan, 2, 1, 2], dtype="float")
        tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp)
        tm.assert_series_equal(s3.append([s1, s2], ignore_index=True), exp)

@@ -698,7 +698,7 @@ def test_concat_categorical_coercion_nan(self):
        s1 = pd.Series([1, np.nan], dtype="category")
        s2 = pd.Series([np.nan, np.nan])

-        exp = pd.Series([1, np.nan, np.nan, np.nan], dtype="object")
+        exp = pd.Series([1, np.nan, np.nan, np.nan], dtype="float")
        tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
        tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)
