FIX-#3764: Ensure df.loc with a scalar out of bounds appends to df (#3765)

Co-authored-by: Devin Petersohn <devin-petersohn@users.noreply.github.com>
Co-authored-by: Bill Wang <billiam@ponder.io>
Co-authored-by: Vasily Litvinov <fam1ly.n4me@yandex.ru>
4 people committed Oct 25, 2022
1 parent f563a62 commit 11ba481
Showing 3 changed files with 170 additions and 27 deletions.
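The fix targets pandas' setting-with-enlargement semantics (issue #3764): assigning through .loc with a label that does not exist should append a new row or column rather than fail. A minimal sketch of the behavior the new tests exercise, assuming a working Modin engine:

import modin.pandas as pd

df = pd.DataFrame([[1, 2, 3], [4, 5, 6]])
df.loc[2] = df.loc[1]   # label 2 is out of bounds, so a new row is appended
df.loc[:, "a"] = 1      # column "a" does not exist, so a new column is appended
print(df.shape)         # (3, 4)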
@@ -1227,6 +1227,7 @@ def compute_part_size(indexer, remote_part, part_idx, axis):
         for row_idx, row_values in enumerate(row_partitions_list):
             row_blk_idx, row_internal_idx = row_values
             col_position_counter = 0
+            row_offset = 0
             for col_idx, col_values in enumerate(col_partitions_list):
                 col_blk_idx, col_internal_idx = col_values
                 remote_part = partition_copy[row_blk_idx, col_blk_idx]
160 changes: 133 additions & 27 deletions modin/pandas/indexing.py
@@ -764,32 +764,130 @@ def _loc(df):
                 new_query_compiler=self.df._default_to_pandas(_loc)._query_compiler
             )
             return
-        row_loc, col_loc, _ = self._parse_row_and_column_locators(key)
-        if isinstance(row_loc, list) and len(row_loc) == 1:
-            if row_loc[0] not in self.qc.index:
-                index = self.qc.index.insert(len(self.qc.index), row_loc[0])
-                self.qc = self.qc.reindex(labels=index, axis=0)
-                self.df._update_inplace(new_query_compiler=self.qc)
+        row_loc, col_loc, ndims = self._parse_row_and_column_locators(key)
+        append_axis = self._check_missing_loc(row_loc, col_loc)
+        if ndims >= 1 and append_axis is not None:
+            # We enter this codepath if we're either appending a row or a column
+            if append_axis:
+                # Appending at least one new column
+                if is_scalar(col_loc):
+                    col_loc = [col_loc]
+                self._setitem_with_new_columns(row_loc, col_loc, item)
+            else:
+                # Appending at most one new row
+                if is_scalar(row_loc) or len(row_loc) == 1:
+                    index = self.qc.index.insert(len(self.qc.index), row_loc)
+                    self.qc = self.qc.reindex(labels=index, axis=0, fill_value=0)
+                    self.df._update_inplace(new_query_compiler=self.qc)
+                self._set_item_existing_loc(row_loc, col_loc, item)
+        else:
+            self._set_item_existing_loc(row_loc, col_loc, item)
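For the row case, the new branch enlarges the index first and then assigns into the now-existing location. A rough plain-pandas equivalent of those two steps (fill_value=0 mirrors the reindex call above):

import pandas

df = pandas.DataFrame([[1, 2, 3], [4, 5, 6]])
new_label = 5
if new_label not in df.index:
    # enlarge the index; the new row is temporarily filled with 0
    df = df.reindex(df.index.insert(len(df.index), new_label), fill_value=0)
# ordinary assignment into a location that now exists
df.loc[new_label] = [7, 8, 9]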

+    def _setitem_with_new_columns(self, row_loc, col_loc, item):
+        """
+        Assign `item` value to dataset located by `row_loc` and `col_loc` with new columns.
+
+        Parameters
+        ----------
+        row_loc : scalar, slice, list, array or tuple
+            Row locator.
+        col_loc : scalar, slice, list, array or tuple
+            Columns locator.
+        item : modin.pandas.DataFrame, modin.pandas.Series or scalar
+            Value that should be assigned to located dataset.
+        """
+        exist_items = item
+        common_label_loc = np.isin(col_loc, self.qc.columns.values)
+        if is_list_like(item) and not isinstance(item, (DataFrame, Series)):
+            item = np.array(item)
+            if len(item.shape) == 1:
+                if item.shape[0] != len(col_loc):
+                    raise ValueError(
+                        "Must have equal len keys and value when setting with an iterable"
+                    )
+            else:
+                if item.shape != (len(self.qc.index), len(col_loc)):
+                    raise ValueError(
+                        "Must have equal len keys and value when setting with an iterable"
+                    )
+            exist_items = (
+                item[:, common_label_loc]
+                if len(item.shape) > 1
+                else item[common_label_loc]
+            )
+        if not all(common_label_loc):
+            # In this case we have some new cols and some old ones
+            columns = self.qc.columns
+            for i in range(len(common_label_loc)):
+                if not common_label_loc[i]:
+                    columns = columns.insert(len(columns), col_loc[i])
+            self.qc = self.qc.reindex(labels=columns, axis=1, fill_value=0)
+            self.df._update_inplace(new_query_compiler=self.qc)
+        self._set_item_existing_loc(row_loc, np.array(col_loc), exist_items)
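A plain-pandas sketch of the overall effect of this column path, assuming a default RangeIndex of columns: labels missing from the frame are appended (filled with 0, matching fill_value=0 above), and the assignment then goes through the existing-label machinery:

import numpy as np
import pandas

df = pandas.DataFrame([[1, 2, 3], [4, 5, 6]])
col_loc = [1, "b"]                      # column 1 exists, "b" is new
item = np.array([[10, 20], [30, 40]])

missing = [c for c in col_loc if c not in df.columns]
df = df.reindex(columns=df.columns.append(pandas.Index(missing)), fill_value=0)
df.loc[:, col_loc] = item               # column 1 gets [10, 30], column "b" gets [20, 40]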

+    def _set_item_existing_loc(self, row_loc, col_loc, item):
+        """
+        Assign `item` value to dataset located by `row_loc` and `col_loc` with existing rows and columns.
+
+        Parameters
+        ----------
+        row_loc : scalar, slice, list, array or tuple
+            Row locator.
+        col_loc : scalar, slice, list, array or tuple
+            Columns locator.
+        item : modin.pandas.DataFrame, modin.pandas.Series or scalar
+            Value that should be assigned to located dataset.
+        """
+        row_lookup, col_lookup = self._compute_lookup(row_loc, col_loc)
+        self._setitem_positional(
+            row_lookup,
+            col_lookup,
+            item,
+            axis=self._determine_setitem_axis(
+                row_lookup, col_lookup, is_scalar(row_loc), is_scalar(col_loc)
+            ),
+        )
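_compute_lookup reduces the label locators to positional arrays, and the write then happens positionally; roughly, in plain pandas terms:

import pandas

df = pandas.DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "b", "c"])
row_lookup = df.index.get_indexer_for([1])           # array([1])
col_lookup = df.columns.get_indexer_for(["b", "c"])  # array([1, 2])
df.iloc[row_lookup, col_lookup] = [[50, 60]]         # positional write into the located block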

+    def _check_missing_loc(self, row_loc, col_loc):
+        """
+        Help `__setitem__` compute whether an axis needs appending.
+
+        Parameters
+        ----------
+        row_loc : scalar, slice, list, array or tuple
+            Row locator.
+        col_loc : scalar, slice, list, array or tuple
+            Columns locator.
+
+        Returns
+        -------
+        int or None :
+            0 if new row, 1 if new column, None if neither.
+        """
+        if is_scalar(row_loc):
+            return 0 if row_loc not in self.qc.index else None
+        elif isinstance(row_loc, list):
+            missing_labels = self._compute_enlarge_labels(
+                pandas.Index(row_loc), self.qc.index
+            )
+            if len(missing_labels) > 1:
+                # We cast to list to copy pandas' error:
+                # In pandas, we get: KeyError: [a, b,...] not in index
+                # If we don't convert to list we get: KeyError: [a b ...] not in index
+                raise KeyError("{} not in index".format(list(missing_labels)))
+        if (
+            not (is_list_like(row_loc) or isinstance(row_loc, slice))
+            and row_loc not in self.qc.index
+        ):
+            return 0
         if (
             isinstance(col_loc, list)
-            and len(col_loc) == 1
-            and col_loc[0] not in self.qc.columns
+            and len(pandas.Index(col_loc).difference(self.qc.columns)) >= 1
         ):
-            new_col = pandas.Series(index=self.df.index)
-            new_col[row_loc] = item
-            self.df.insert(loc=len(self.df.columns), column=col_loc[0], value=new_col)
-            self.qc = self.df._query_compiler
-        else:
-            row_lookup, col_lookup = self._compute_lookup(row_loc, col_loc)
-            self._setitem_positional(
-                row_lookup,
-                col_lookup,
-                item,
-                axis=self._determine_setitem_axis(
-                    row_lookup, col_lookup, is_scalar(row_loc), is_scalar(col_loc)
-                ),
-            )
+            return 1
+        if is_scalar(col_loc) and col_loc not in self.qc.columns:
+            return 1
+        return None
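A standalone sketch of the same decision on plain pandas objects (classify_loc_key is a hypothetical helper, not Modin API), to make the return convention concrete:

import pandas
from pandas.api.types import is_scalar

def classify_loc_key(df, row_loc, col_loc=None):
    # 0 -> the row label is new, 1 -> a column label is new, None -> nothing to append
    if is_scalar(row_loc) and row_loc not in df.index:
        return 0
    if col_loc is not None:
        col_labels = [col_loc] if is_scalar(col_loc) else list(col_loc)
        if any(c not in df.columns for c in col_labels):
            return 1
    return None

df = pandas.DataFrame([[1, 2, 3], [4, 5, 6]])
assert classify_loc_key(df, 5) == 0        # new row label -> append a row
assert classify_loc_key(df, 0, "a") == 1   # new column label -> append a column
assert classify_loc_key(df, 1, 2) is None  # both labels exist -> plain setitem path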

     def _compute_enlarge_labels(self, locator, base_index):
         """
@@ -810,16 +908,24 @@ def _compute_enlarge_labels(self, locator, base_index):
         # base_index_type can be pd.Index or pd.DatetimeIndex
         # depending on user input and pandas behavior
         # See issue #2264
-        base_index_type = type(base_index)
-        locator_as_index = base_index_type(locator)
+        base_as_index = pandas.Index(list(base_index))
+        locator_as_index = pandas.Index(list(locator))

-        nan_labels = locator_as_index.difference(base_index)
-        common_labels = locator_as_index.intersection(base_index)
+        if locator_as_index.inferred_type == "boolean":
+            if len(locator_as_index) != len(base_as_index):
+                raise ValueError(
+                    f"Item wrong length {len(locator_as_index)} instead of {len(base_as_index)}!"
+                )
+            common_labels = base_as_index[locator_as_index]
+            nan_labels = pandas.Index([])
+        else:
+            common_labels = locator_as_index.intersection(base_as_index)
+            nan_labels = locator_as_index.difference(base_as_index)

         if len(common_labels) == 0:
             raise KeyError(
                 "None of [{labels}] are in the [{base_index_name}]".format(
-                    labels=list(locator_as_index), base_index_name=base_index
+                    labels=list(locator_as_index), base_index_name=base_as_index
                 )
             )
         return nan_labels
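The rewrite distinguishes boolean masks from label locators: a boolean mask must match the base index length and only selects existing labels, so it never contributes enlarging labels, while a label locator may. Roughly:

import numpy as np
import pandas

base = pandas.Index([10, 20, 30])

mask = np.array([True, False, True])   # boolean locator: selects, never enlarges
print(base[mask])                      # Index([10, 30])

locator = pandas.Index([20, 40])       # label locator: may carry new labels
print(locator.intersection(base))      # Index([20])  -> common labels
print(locator.difference(base))        # Index([40])  -> labels that would enlarge the axis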
36 changes: 36 additions & 0 deletions modin/pandas/test/dataframe/test_indexing.py
@@ -633,6 +633,42 @@ def test_loc_assignment(index, columns):
     df_equals(md_df, pd_df)


+@pytest.mark.parametrize("left, right", [(2, 1), (6, 1), (lambda df: 70, 1), (90, 70)])
+def test_loc_insert_row(left, right):
+    # This test case comes from
+    # https://github.com/modin-project/modin/issues/3764
+    pandas_df = pandas.DataFrame([[1, 2, 3], [4, 5, 6]])
+    modin_df = pd.DataFrame([[1, 2, 3], [4, 5, 6]])
+
+    def _test_loc_rows(df):
+        df.loc[left] = df.loc[right]
+        return df
+
+    eval_general(modin_df, pandas_df, _test_loc_rows)
+
+
+@pytest.mark.parametrize(
+    "columns", [10, (100, 102), (2, 6), [10, 11, 12], "a", ["b", "c", "d"]]
+)
+def test_loc_insert_col(columns):
+    # This test case comes from
+    # https://github.com/modin-project/modin/issues/3764
+    pandas_df = pandas.DataFrame([[1, 2, 3], [4, 5, 6]])
+    modin_df = pd.DataFrame([[1, 2, 3], [4, 5, 6]])
+
+    if isinstance(columns, tuple) and len(columns) == 2:
+
+        def _test_loc_cols(df):
+            df.loc[:, columns[0] : columns[1]] = 1
+
+    else:
+
+        def _test_loc_cols(df):
+            df.loc[:, columns] = 1
+
+    eval_general(modin_df, pandas_df, _test_loc_cols)
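The tuple parametrizations above exercise label slices such as df.loc[:, 2:6] = 1. In pandas a label slice never adds new columns; it simply stops at the existing labels, and the test asserts Modin matches that. For instance, with plain pandas:

import pandas

df = pandas.DataFrame([[1, 2, 3], [4, 5, 6]])
df.loc[:, 2:6] = 1            # slice past the last label: only column 2 is written
print(df.columns.tolist())    # [0, 1, 2] -- no columns were added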


 @pytest.fixture
 def loc_iter_dfs():
     columns = ["col1", "col2", "col3"]
