Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DataFrame.columns = ... retains RangeIndex & set dtype #15129

Merged
merged 12 commits into from
Mar 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 65 additions & 31 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1777,7 +1777,7 @@ def _concat(

# Reassign index and column names
if objs[0]._data.multiindex:
out._set_column_names_like(objs[0])
out._set_columns_like(objs[0]._data)
else:
out.columns = names
if not ignore_index:
Expand Down Expand Up @@ -2215,7 +2215,11 @@ def from_dict(
next(iter(data.values())), (cudf.Series, cupy.ndarray)
):
result = cls(data).T
result.columns = columns
result.columns = (
columns
if columns is not None
else range(len(result._data))
)
if dtype is not None:
result = result.astype(dtype)
return result
Expand Down Expand Up @@ -2619,39 +2623,69 @@ def columns(self):
@columns.setter # type: ignore
@_cudf_nvtx_annotate
def columns(self, columns):
if isinstance(columns, cudf.BaseIndex):
columns = columns.to_pandas()
if columns is None:
columns = pd.Index(range(len(self._data.columns)))
is_multiindex = isinstance(columns, pd.MultiIndex)

if isinstance(columns, (Series, cudf.Index, ColumnBase)):
columns = pd.Index(columns.to_numpy(), tupleize_cols=is_multiindex)
elif not isinstance(columns, pd.Index):
columns = pd.Index(columns, tupleize_cols=is_multiindex)
multiindex = False
rangeindex = False
label_dtype = None
vyasr marked this conversation as resolved.
Show resolved Hide resolved
level_names = None
if isinstance(columns, (pd.MultiIndex, cudf.MultiIndex)):
multiindex = True
if isinstance(columns, cudf.MultiIndex):
pd_columns = columns.to_pandas()
else:
pd_columns = columns
if pd_columns.nunique(dropna=False) != len(pd_columns):
raise ValueError("Duplicate column names are not allowed")
level_names = list(pd_columns.names)
elif isinstance(columns, (cudf.BaseIndex, ColumnBase, Series)):
level_names = (getattr(columns, "name", None),)
rangeindex = isinstance(columns, cudf.RangeIndex)
columns = as_column(columns)
if columns.distinct_count(dropna=False) != len(columns):
raise ValueError("Duplicate column names are not allowed")
pd_columns = pd.Index(columns.to_pandas())
label_dtype = pd_columns.dtype
else:
pd_columns = pd.Index(columns)
if pd_columns.nunique(dropna=False) != len(pd_columns):
raise ValueError("Duplicate column names are not allowed")
rangeindex = isinstance(pd_columns, pd.RangeIndex)
level_names = (pd_columns.name,)
label_dtype = pd_columns.dtype

if not len(columns) == len(self._data.names):
if len(pd_columns) != len(self._data.names):
raise ValueError(
f"Length mismatch: expected {len(self._data.names)} elements, "
f"got {len(columns)} elements"
f"got {len(pd_columns)} elements"
)

self._set_column_names(columns, is_multiindex, columns.names)

def _set_column_names(self, names, multiindex=False, level_names=None):
data = dict(zip(names, self._data.columns))
if len(names) != len(data):
raise ValueError("Duplicate column names are not allowed")

self._data = ColumnAccessor(
data,
data=dict(zip(pd_columns, self._data.columns)),
multiindex=multiindex,
level_names=level_names,
label_dtype=label_dtype,
rangeindex=rangeindex,
verify=False,
)

def _set_column_names_like(self, other):
self._set_column_names(
other._data.names, other._data.multiindex, other._data.level_names
def _set_columns_like(self, other: ColumnAccessor) -> None:
"""
Modify self with the column properties of other.
vyasr marked this conversation as resolved.
Show resolved Hide resolved

* Whether .columns is a MultiIndex/RangeIndex
* The possible .columns.dtype
* The .columns.names/name (depending on if it's a MultiIndex)
"""
if len(self._data.names) != len(other.names):
raise ValueError(
f"Length mismatch: expected {len(other)} elements, "
f"got {len(self)} elements"
)
self._data = ColumnAccessor(
data=dict(zip(other.names, self._data.columns)),
multiindex=other.multiindex,
level_names=other.level_names,
label_dtype=other.label_dtype,
verify=False,
)

@_cudf_nvtx_annotate
Expand Down Expand Up @@ -3023,7 +3057,7 @@ def where(self, cond, other=None, inplace=False):
"Array conditional must be same shape as self"
)
# Setting `self` column names to `cond` as it has no column names.
cond._set_column_names_like(self)
cond._set_columns_like(self._data)

# If other was provided, process that next.
if isinstance(other, DataFrame):
Expand Down Expand Up @@ -6347,7 +6381,7 @@ def mode(self, axis=0, numeric_only=False, dropna=True):
if isinstance(df, Series):
df = df.to_frame()

df._set_column_names_like(data_df)
df._set_columns_like(data_df._data)

return df

Expand Down Expand Up @@ -6458,7 +6492,7 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs):
)
else:
result_df = DataFrame(result).set_index(self.index)
result_df._set_column_names_like(prepared)
result_df._set_columns_like(prepared._data)
return result_df

@_cudf_nvtx_annotate
Expand Down Expand Up @@ -7082,7 +7116,7 @@ def cov(self, **kwargs):
cov = cupy.cov(self.values, rowvar=False)
cols = self._data.to_pandas_index()
df = DataFrame(cupy.asfortranarray(cov)).set_index(cols)
df._set_column_names_like(self)
df._set_columns_like(self._data)
return df

def corr(self, method="pearson", min_periods=None):
Expand Down Expand Up @@ -7118,7 +7152,7 @@ def corr(self, method="pearson", min_periods=None):
corr = cupy.corrcoef(values, rowvar=False)
cols = self._data.to_pandas_index()
df = DataFrame(cupy.asfortranarray(corr)).set_index(cols)
df._set_column_names_like(self)
df._set_columns_like(self._data)
return df

@_cudf_nvtx_annotate
Expand Down Expand Up @@ -7455,7 +7489,7 @@ def _from_columns_like_self(
index_names,
override_dtypes=override_dtypes,
)
result._set_column_names_like(self)
result._set_columns_like(self._data)
return result

@_cudf_nvtx_annotate
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -2587,7 +2587,7 @@ def sort_index(
isinstance(self, cudf.core.dataframe.DataFrame)
and self._data.multiindex
):
out._set_column_names_like(self)
out._set_columns_like(self._data)
elif (ascending and idx.is_monotonic_increasing) or (
not ascending and idx.is_monotonic_decreasing
):
Expand All @@ -2607,7 +2607,7 @@ def sort_index(
isinstance(self, cudf.core.dataframe.DataFrame)
and self._data.multiindex
):
out._set_column_names_like(self)
out._set_columns_like(self._data)
if ignore_index:
out = out.reset_index(drop=True)
else:
Expand Down
53 changes: 53 additions & 0 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import contextlib
import datetime
import decimal
import functools
import io
import operator
import random
Expand Down Expand Up @@ -10727,6 +10728,9 @@ def test_init_from_2_categoricalindex_series_diff_categories():
)
result = cudf.DataFrame([s1, s2])
expected = pd.DataFrame([s1.to_pandas(), s2.to_pandas()])
# TODO: Remove once https://github.com/pandas-dev/pandas/issues/57592
# is addressed
expected.columns = result.columns
assert_eq(result, expected, check_dtype=False)


Expand Down Expand Up @@ -10863,6 +10867,55 @@ def test_dataframe_duplicate_index_reindex():
)


def test_dataframe_columns_set_none_raises():
    """Assigning ``None`` to ``DataFrame.columns`` raises ``TypeError``."""
    frame = cudf.DataFrame({"a": [0]})
    with pytest.raises(TypeError):
        frame.columns = None


@pytest.mark.parametrize(
    "columns",
    [
        cudf.RangeIndex(1, name="foo"),
        pd.RangeIndex(1, name="foo"),
        range(1),
    ],
)
def test_dataframe_columns_set_rangeindex(columns):
    """Range-like labels assigned to ``.columns`` come back as a RangeIndex.

    The name of the assigned object, when it has one, must be retained.
    """
    frame = cudf.DataFrame([1], columns=["a"])
    frame.columns = columns
    # A plain ``range`` has no ``name`` attribute, so fall back to None.
    label_name = getattr(columns, "name", None)
    pd.testing.assert_index_equal(
        frame.columns,
        pd.RangeIndex(1, name=label_name),
        exact=True,
    )


@pytest.mark.parametrize("klass", [cudf.MultiIndex, pd.MultiIndex])
def test_dataframe_columns_set_multiindex(klass):
    """A cudf or pandas MultiIndex assigned to ``.columns`` is preserved."""
    new_labels = klass.from_arrays([[10]], names=["foo"])
    frame = cudf.DataFrame([1], columns=["a"])
    frame.columns = new_labels
    pd.testing.assert_index_equal(
        frame.columns,
        pd.MultiIndex.from_arrays([[10]], names=["foo"]),
        exact=True,
    )


@pytest.mark.parametrize(
    "klass",
    [
        functools.partial(cudf.Index, name="foo"),
        functools.partial(cudf.Series, name="foo"),
        functools.partial(pd.Index, name="foo"),
        functools.partial(pd.Series, name="foo"),
        np.array,
    ],
)
def test_dataframe_columns_set_preserve_type(klass):
    """The dtype and name of labels assigned to ``.columns`` are retained."""
    frame = cudf.DataFrame([1], columns=["a"])
    columns = klass([10], dtype="int8")
    frame.columns = columns
    # ``np.array`` results carry no ``name``; the expected name is then None.
    label_name = getattr(columns, "name", None)
    expected = pd.Index([10], dtype="int8", name=label_name)
    pd.testing.assert_index_equal(frame.columns, expected)


@pytest.mark.parametrize(
"scalar",
[
Expand Down
Loading