Skip to content

Commit

Permalink
FEAT-modin-project#7340: Add more granular lazy flags to query compiler
Browse files Browse the repository at this point in the history
Signed-off-by: Jonathan Shi <jhshi07@gmail.com>
  • Loading branch information
noloerino committed Jul 22, 2024
1 parent f5f9ae9 commit da5be11
Show file tree
Hide file tree
Showing 7 changed files with 64 additions and 33 deletions.
38 changes: 30 additions & 8 deletions modin/core/storage_formats/base/query_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,10 +120,23 @@ class BaseQueryCompiler(
Attributes
----------
lazy_execution : bool
Whether underlying execution engine is designed to be executed in a lazy mode only.
If True, such QueryCompiler will be handled differently at the front-end in order
to reduce execution triggering as much as possible.
lazy_row_labels : bool, default False
True if the backend defers computations of the row labels (`df.index` for a frame).
Used by the frontend to avoid unnecessary execution or defer error validation.
lazy_row_count : bool, default False
True if the backend defers computations of the number of rows (`len(df.index)`).
Used by the frontend to avoid unnecessary execution or defer error validation.
lazy_column_types : bool, default False
True if the backend defers computations of the column types (`df.dtypes`).
Used by the frontend to avoid unnecessary execution or defer error validation.
lazy_column_labels : bool, default False
True if the backend defers computations of the column labels (`df.columns`).
Used by the frontend to avoid unnecessary execution or defer error validation.
lazy_column_count : bool, default False
True if the backend defers computations of the number of columns (`len(df.columns)`).
Used by the frontend to avoid unnecessary execution or defer error validation.
lazy_shape : bool
True if either lazy_row_count or lazy_column_count is True.
_shape_hint : {"row", "column", None}, default: None
Shape hint for frames known to be a column or a row, otherwise None.
Expand Down Expand Up @@ -197,7 +210,16 @@ def default_to_pandas(self, pandas_op, *args, **kwargs) -> Self:
# some of these abstract methods, but for the sake of generality they are
# treated differently.

lazy_execution = False
lazy_row_labels = False
lazy_row_count = False
lazy_column_types = False
lazy_column_labels = False
lazy_column_count = False

@property
def lazy_shape(self):
return self.lazy_row_count or self.lazy_column_count

_shape_hint = None

# Metadata modification abstract methods
Expand Down Expand Up @@ -4524,7 +4546,7 @@ def has_multiindex(self, axis=0):
@property
def frame_has_materialized_dtypes(self) -> bool:
"""
Check if the undelying dataframe has materialized dtypes.
Check if the underlying dataframe has materialized dtypes.
Returns
-------
Expand All @@ -4535,7 +4557,7 @@ def frame_has_materialized_dtypes(self) -> bool:
@property
def frame_has_materialized_columns(self) -> bool:
"""
Check if the undelying dataframe has materialized columns.
Check if the underlying dataframe has materialized columns.
Returns
-------
Expand All @@ -4546,7 +4568,7 @@ def frame_has_materialized_columns(self) -> bool:
@property
def frame_has_materialized_index(self) -> bool:
"""
Check if the undelying dataframe has materialized index.
Check if the underlying dataframe has materialized index.
Returns
-------
Expand Down
33 changes: 20 additions & 13 deletions modin/core/storage_formats/pandas/query_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,19 +276,24 @@ def __init__(self, modin_frame: PandasDataframe, shape_hint: Optional[str] = Non
self._shape_hint = shape_hint

@property
def lazy_execution(self):
"""
Whether underlying Modin frame should be executed in a lazy mode.
def lazy_row_labels(self):
return not self.frame_has_materialized_index

If True, such QueryCompiler will be handled differently at the front-end in order
to reduce triggering the computation as much as possible.
@property
def lazy_row_count(self):
return not self.frame_has_materialized_index

Returns
-------
bool
"""
frame = self._modin_frame
return not frame.has_materialized_index or not frame.has_materialized_columns
@property
def lazy_column_types(self):
return not self.frame_has_materialized_dtypes

@property
def lazy_column_labels(self):
return not self.frame_has_materialized_columns

@property
def lazy_column_count(self):
return not self.frame_has_materialized_columns

def finalize(self):
self._modin_frame.finalize()
Expand Down Expand Up @@ -607,7 +612,7 @@ def reindex(self, axis, labels, **kwargs):
return self.__constructor__(new_modin_frame)

def reset_index(self, **kwargs) -> PandasQueryCompiler:
if self.lazy_execution:
if self.lazy_row_labels:

def _reset(df, *axis_lengths, partition_idx): # pragma: no cover
df = df.reset_index(**kwargs)
Expand Down Expand Up @@ -1798,7 +1803,9 @@ def get_unique_level_values(index):
new_index = (
get_unique_level_values(index)
if consider_index
else index if isinstance(index, list) else [index]
else index
if isinstance(index, list)
else [index]
)

new_columns = (
Expand Down
14 changes: 8 additions & 6 deletions modin/pandas/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1068,7 +1068,7 @@ def astype(
# will handle errors where dtype dict includes keys that are not
# in columns.
if (
not self._query_compiler.lazy_execution
not self._query_compiler.lazy_column_labels
and not set(dtype.keys()).issubset(set(self._query_compiler.columns))
and errors == "raise"
):
Expand Down Expand Up @@ -1462,7 +1462,9 @@ def drop(
axes[axis] = [axes[axis]]
# In case of lazy execution we should bypass these error checking components
# because they can force the materialization of the row or column labels.
if self._query_compiler.lazy_execution:
if (axis == "index" and self._query_compiler.lazy_row_labels) or (
axis == "columns" and self._query_compiler.lazy_column_labels
):
continue
if errors == "raise":
non_existent = pandas.Index(axes[axis]).difference(
Expand Down Expand Up @@ -2657,7 +2659,7 @@ def reset_index(
# exist.
if (
not drop
and not self._query_compiler.lazy_execution
and not self._query_compiler.lazy_column_labels
and not self._query_compiler.has_multiindex()
and all(n in self.columns for n in ["level_0", "index"])
):
Expand Down Expand Up @@ -3944,7 +3946,7 @@ def __getitem__(self, key) -> Self:
BasePandasDataset
Located dataset.
"""
if not self._query_compiler.lazy_execution and len(self) == 0:
if not self._query_compiler.lazy_row_count and len(self) == 0:
return self._default_to_pandas("__getitem__", key)
# see if we can slice the rows
# This lets us reuse code in pandas to error check
Expand Down Expand Up @@ -4075,7 +4077,7 @@ def _getitem_slice(self, key: slice) -> Self:
if is_full_grab_slice(
key,
# Avoid triggering shape computation for lazy executions
sequence_len=(None if self._query_compiler.lazy_execution else len(self)),
sequence_len=(None if self._query_compiler.lazy_row_count else len(self)),
):
return self.copy()
return self.iloc[key]
Expand Down Expand Up @@ -4301,7 +4303,7 @@ def __getattribute__(self, item) -> Any:
Any
"""
attr = super().__getattribute__(item)
if item not in _DEFAULT_BEHAVIOUR and not self._query_compiler.lazy_execution:
if item not in _DEFAULT_BEHAVIOUR and not self._query_compiler.lazy_shape:
# We default to pandas on empty DataFrames. This avoids a large amount of
# pain in underlying implementation and returns a result immediately rather
# than dealing with the edge cases that empty DataFrames have.
Expand Down
4 changes: 2 additions & 2 deletions modin/pandas/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1084,7 +1084,7 @@ def insert(
+ f"{len(value.columns)} columns instead."
)
value = value.squeeze(axis=1)
if not self._query_compiler.lazy_execution and len(self.index) == 0:
if not self._query_compiler.lazy_row_count and len(self.index) == 0:
if not hasattr(value, "index"):
try:
value = pandas.Series(value)
Expand Down Expand Up @@ -2783,7 +2783,7 @@ def setitem_unhashable_key(df, value):
if not isinstance(value, (Series, Categorical, np.ndarray, list, range)):
value = list(value)

if not self._query_compiler.lazy_execution and len(self.index) == 0:
if not self._query_compiler.lazy_row_count and len(self.index) == 0:
new_self = self.__constructor__({key: value}, columns=self.columns)
self._update_inplace(new_self._query_compiler)
else:
Expand Down
2 changes: 1 addition & 1 deletion modin/pandas/general.py
Original file line number Diff line number Diff line change
Expand Up @@ -491,7 +491,7 @@ def concat(
for obj in list_of_objs
if (
isinstance(obj, (Series, pandas.Series))
or (isinstance(obj, DataFrame) and obj._query_compiler.lazy_execution)
or (isinstance(obj, DataFrame) and obj._query_compiler.lazy_shape)
or sum(obj.shape) > 0
)
]
Expand Down
2 changes: 1 addition & 1 deletion modin/pandas/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@ def __getattr__(self, key):

def __getattribute__(self, item):
attr = super().__getattribute__(item)
if item not in _DEFAULT_BEHAVIOUR and not self._query_compiler.lazy_execution:
if item not in _DEFAULT_BEHAVIOUR and not self._query_compiler.lazy_shape:
# We default to pandas on empty DataFrames. This avoids a large amount of
# pain in underlying implementation and returns a result immediately rather
# than dealing with the edge cases that empty DataFrames have.
Expand Down
4 changes: 2 additions & 2 deletions modin/tests/pandas/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2752,15 +2752,15 @@ def lazy_frame(self):
donor_obj = pd.DataFrame()._query_compiler

self._mock_obj = mock.patch(
f"{donor_obj.__module__}.{donor_obj.__class__.__name__}.lazy_execution",
f"{donor_obj.__module__}.{donor_obj.__class__.__name__}.lazy_shape",
new_callable=mock.PropertyMock,
)
patch_obj = self._mock_obj.__enter__()
patch_obj.return_value = True

df = pd.DataFrame(**self._df_kwargs)
# The frame is lazy until `self.__exit__()` is called
assert df._query_compiler.lazy_execution
assert df._query_compiler.lazy_shape
return df

def __enter__(self):
Expand Down

0 comments on commit da5be11

Please sign in to comment.