REF: Index.__new__ #38665

Merged · 8 commits · Dec 29, 2020
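This PR splits `Index.__new__` into two classmethods: `_dtype_to_subclass`, which picks the concrete Index subclass for a given dtype, and `_ensure_array`, which validates the backing ndarray and copies it if requested. The aim is a pure refactor; the dtype-driven dispatch users see stays the same. A minimal sketch of that dispatch (illustrative only, written against pandas of this era — `Int64Index` and `Float64Index` were removed in later versions):

import numpy as np
import pandas as pd

# The dtype of the input decides which Index subclass the constructor returns.
print(type(pd.Index(np.array([1, 2, 3]))))                  # Int64Index
print(type(pd.Index(np.array([1.5, 2.5]))))                 # Float64Index
print(type(pd.Index(np.array(["2020"], dtype="M8[ns]"))))   # DatetimeIndex
print(type(pd.Index(["a", "b"], dtype="category")))         # CategoricalIndex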
125 changes: 87 additions & 38 deletions pandas/core/indexes/base.py
@@ -66,6 +66,12 @@
validate_all_hashable,
)
from pandas.core.dtypes.concat import concat_compat
from pandas.core.dtypes.dtypes import (
CategoricalDtype,
DatetimeTZDtype,
IntervalDtype,
PeriodDtype,
)
from pandas.core.dtypes.generic import (
ABCDatetimeIndex,
ABCMultiIndex,
@@ -331,12 +337,6 @@ def __new__(

# index-like
elif isinstance(data, (np.ndarray, Index, ABCSeries)):
# Delay import for perf. https://github.com/pandas-dev/pandas/pull/31423
from pandas.core.indexes.numeric import (
Float64Index,
Int64Index,
UInt64Index,
)

if dtype is not None:
# we need to avoid having numpy coerce
@@ -347,42 +347,31 @@
data = _maybe_cast_with_dtype(data, dtype, copy)
dtype = data.dtype # TODO: maybe not for object?

# maybe coerce to a sub-class
if is_signed_integer_dtype(data.dtype):
return Int64Index(data, copy=copy, dtype=dtype, name=name)
elif is_unsigned_integer_dtype(data.dtype):
return UInt64Index(data, copy=copy, dtype=dtype, name=name)
elif is_float_dtype(data.dtype):
return Float64Index(data, copy=copy, dtype=dtype, name=name)
elif issubclass(data.dtype.type, bool) or is_bool_dtype(data):
subarr = data.astype("object")
if data.dtype.kind in ["i", "u", "f"]:
# maybe coerce to a sub-class
arr = data
else:
subarr = com.asarray_tuplesafe(data, dtype=object)

# asarray_tuplesafe does not always copy underlying data,
# so need to make sure that this happens
if copy:
subarr = subarr.copy()
arr = com.asarray_tuplesafe(data, dtype=object)

if dtype is None:
new_data, new_dtype = _maybe_cast_data_without_dtype(subarr)
if new_dtype is not None:
if dtype is None:
new_data = _maybe_cast_data_without_dtype(arr)
new_dtype = new_data.dtype
return cls(
new_data, dtype=new_dtype, copy=False, name=name, **kwargs
new_data, dtype=new_dtype, copy=copy, name=name, **kwargs
)

klass = cls._dtype_to_subclass(arr.dtype)
arr = klass._ensure_array(arr, dtype, copy)
if kwargs:
raise TypeError(f"Unexpected keyword arguments {repr(set(kwargs))}")
if subarr.ndim > 1:
# GH#13601, GH#20285, GH#27125
raise ValueError("Index data must be 1-dimensional")
return cls._simple_new(subarr, name)
return klass._simple_new(arr, name)

elif data is None or is_scalar(data):
elif is_scalar(data):
raise cls._scalar_data_error(data)
elif hasattr(data, "__array__"):
return Index(np.asarray(data), dtype=dtype, copy=copy, name=name, **kwargs)
else:

if tupleize_cols and is_list_like(data):
# GH21470: convert iterable to list before determining if empty
if is_iterator(data):
@@ -400,6 +389,64 @@ def __new__(
subarr = com.asarray_tuplesafe(data, dtype=object)
return Index(subarr, dtype=dtype, copy=copy, name=name, **kwargs)

@classmethod
def _ensure_array(cls, data, dtype, copy: bool):
"""
Ensure we have a valid array to pass to _simple_new.
"""
if data.ndim > 1:
# GH#13601, GH#20285, GH#27125
raise ValueError("Index data must be 1-dimensional")
if copy:
# asarray_tuplesafe does not always copy underlying data,
# so need to make sure that this happens
data = data.copy()
return data

@classmethod
def _dtype_to_subclass(cls, dtype: DtypeObj):
# Delay import for perf. https://github.com/pandas-dev/pandas/pull/31423

if isinstance(dtype, DatetimeTZDtype) or dtype == np.dtype("M8[ns]"):
from pandas import DatetimeIndex

return DatetimeIndex
if dtype == "m8[ns]":
from pandas import TimedeltaIndex

return TimedeltaIndex
if isinstance(dtype, CategoricalDtype):
from pandas import CategoricalIndex

return CategoricalIndex
if isinstance(dtype, IntervalDtype):
from pandas import IntervalIndex

return IntervalIndex
if isinstance(dtype, PeriodDtype):
from pandas import PeriodIndex

return PeriodIndex

if is_float_dtype(dtype):
from pandas import Float64Index

return Float64Index
if is_unsigned_integer_dtype(dtype):
from pandas import UInt64Index

return UInt64Index
if is_signed_integer_dtype(dtype):
from pandas import Int64Index

return Int64Index

if dtype == object:
# NB: assuming away MultiIndex
return Index

raise NotImplementedError(dtype)

"""
NOTE for new Index creation:

@@ -6048,25 +6095,27 @@ def _maybe_cast_data_without_dtype(subarr):
TimedeltaArray,
)

assert subarr.dtype == object, subarr.dtype
inferred = lib.infer_dtype(subarr, skipna=False)

if inferred == "integer":
try:
data = _try_convert_to_int_array(subarr, False, None)
return data, data.dtype
return data
except ValueError:
pass

return subarr, object
return subarr

elif inferred in ["floating", "mixed-integer-float", "integer-na"]:
# TODO: Returns IntegerArray for integer-na case in the future
return subarr, np.float64
data = np.asarray(subarr).astype(np.float64)
Contributor: copy=False

Member Author: updated+green
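# (Illustrative note, not a line from the diff: per the exchange above, the
#  astype call was presumably updated to
#  np.asarray(subarr).astype(np.float64, copy=False) to avoid an extra copy.)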

return data

elif inferred == "interval":
try:
data = IntervalArray._from_sequence(subarr, copy=False)
return data, data.dtype
return data
except ValueError:
# GH27172: mixed closed Intervals --> object dtype
pass
@@ -6077,7 +6126,7 @@ def _maybe_cast_data_without_dtype(subarr):
if inferred.startswith("datetime"):
try:
data = DatetimeArray._from_sequence(subarr, copy=False)
return data, data.dtype
return data
except (ValueError, OutOfBoundsDatetime):
# GH 27011
# If we have mixed timezones, just send it
@@ -6086,15 +6135,15 @@

elif inferred.startswith("timedelta"):
data = TimedeltaArray._from_sequence(subarr, copy=False)
return data, data.dtype
return data
elif inferred == "period":
try:
data = PeriodArray._from_sequence(subarr)
return data, data.dtype
return data
except IncompatibleFrequency:
pass

return subarr, subarr.dtype
return subarr


def _try_convert_to_int_array(
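For context on what `_maybe_cast_data_without_dtype` decides, a sketch (not part of the diff; pandas 1.x-era behavior): when no dtype is passed, object-dtype input is type-inferred and routed to the matching specialized index.

import pandas as pd

# Inference on untyped (object) input picks a specialized index where possible.
print(type(pd.Index([1, 2, 3])))                      # Int64Index   (inferred "integer")
print(type(pd.Index([1.5, 2.0])))                     # Float64Index (inferred "floating")
print(type(pd.Index([pd.Timestamp("2020-01-01")])))   # DatetimeIndex (inferred "datetime")
print(type(pd.Index([pd.Period("2020", freq="A")])))  # PeriodIndex  (inferred "period")
print(type(pd.Index(["a", 1, 2.5])))                  # Index, object dtype (mixed input)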
15 changes: 12 additions & 3 deletions pandas/core/indexes/numeric.py
@@ -46,11 +46,20 @@ class NumericIndex(Index):
_can_hold_strings = False

def __new__(cls, data=None, dtype=None, copy=False, name=None):
cls._validate_dtype(dtype)
name = maybe_extract_name(name, data, cls)

# Coerce to ndarray if not already ndarray or Index
subarr = cls._ensure_array(data, dtype, copy)
return cls._simple_new(subarr, name=name)

@classmethod
def _ensure_array(cls, data, dtype, copy: bool):
"""
Ensure we have a valid array to pass to _simple_new.
"""
cls._validate_dtype(dtype)

if not isinstance(data, (np.ndarray, Index)):
# Coerce to ndarray if not already ndarray or Index
Contributor: can use _ensure_array on L81

Member Author: L81 is inside _ensure_array

Contributor: oh I see, I guess I meant: can you use the super class version?

Member Author: there's really only one line that gets shared, I don't think it's worth it (for now at least)
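For illustration, a generic sketch of the reuse pattern being discussed here (made-up names, not pandas code): the subclass override delegates the one shared step to the base class via super().

import numpy as np


class BaseIndexLike:
    @classmethod
    def _ensure_array(cls, data, copy: bool):
        # shared validation: 1-D only, defensive copy on request
        if data.ndim > 1:
            raise ValueError("data must be 1-dimensional")
        return data.copy() if copy else data


class NumericIndexLike(BaseIndexLike):
    @classmethod
    def _ensure_array(cls, data, copy: bool):
        data = np.asarray(data, dtype="float64")     # subclass-specific coercion
        return super()._ensure_array(data, copy)     # reuse the shared step


print(NumericIndexLike._ensure_array([1, 2, 3], copy=True))   # [1. 2. 3.]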

if is_scalar(data):
raise cls._scalar_data_error(data)

@@ -74,7 +83,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None):
raise ValueError("Index data must be 1-dimensional")

subarr = np.asarray(subarr)
return cls._simple_new(subarr, name=name)
return subarr

@classmethod
def _validate_dtype(cls, dtype: Dtype) -> None:
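Finally, a small sketch (again not from the PR) exercising the checks that `_ensure_array` centralizes: higher-dimensional input is rejected, and copy=True detaches the index from the caller's buffer.

import numpy as np
import pandas as pd

try:
    pd.Index(np.arange(6).reshape(2, 3))
except ValueError as err:
    print(err)                      # Index data must be 1-dimensional

arr = np.array([1, 2, 3])
idx = pd.Index(arr, copy=True)
arr[0] = 99
print(idx[0])                       # 1 -- the Index kept its own copy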