-
Notifications
You must be signed in to change notification settings - Fork 651
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
FEAT-#7254: Support right merge/join #7226
Changes from 16 commits
e261c83
649fb0d
e089cd6
b380dfd
774e9e4
3692720
49577eb
48942fd
d60e0cf
eb65291
998d841
d72ed82
2194844
089bcce
6cce47b
4a911d0
9cfd9ed
7d7914b
981c74e
b940a3b
0b30e13
3090e09
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,6 +18,8 @@ | |
queries for the ``PandasDataframe``. | ||
""" | ||
|
||
from __future__ import annotations | ||
|
||
import ast | ||
import hashlib | ||
import re | ||
|
@@ -313,8 +315,8 @@ def from_dataframe(cls, df, data_cls): | |
|
||
# END Dataframe exchange protocol | ||
|
||
index = property(_get_axis(0), _set_axis(0)) | ||
columns = property(_get_axis(1), _set_axis(1)) | ||
index: pandas.Index = property(_get_axis(0), _set_axis(0)) | ||
columns: pandas.Index = property(_get_axis(1), _set_axis(1)) | ||
|
||
@property | ||
def dtypes(self): | ||
|
@@ -524,33 +526,46 @@ def merge(self, right, **kwargs): | |
get_logger().info(message) | ||
return MergeImpl.row_axis_merge(self, right, kwargs) | ||
|
||
def join(self, right, **kwargs): | ||
def join(self, right: PandasQueryCompiler, **kwargs) -> PandasQueryCompiler: | ||
on = kwargs.get("on", None) | ||
how = kwargs.get("how", "left") | ||
sort = kwargs.get("sort", False) | ||
left = self | ||
|
||
if how in ["left", "inner"]: | ||
|
||
def map_func(left, right, kwargs=kwargs): # pragma: no cover | ||
return pandas.DataFrame.join(left, right, **kwargs) | ||
if how in ["left", "inner"] or ( | ||
how == "right" and right._modin_frame._partitions.size != 0 | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What happens if `left` has a size equal to 0? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Empty dataframes are handled at a higher level (they simply default to pandas), and all of the implementation logic relies on the left operand having a non-empty set of partitions. Therefore, to avoid an error, we need to handle this situation as before. |
||
): | ||
reverted = False | ||
if how == "right": | ||
left, right = right, left | ||
reverted = True | ||
|
||
def map_func( | ||
left, right, kwargs=kwargs | ||
) -> pandas.DataFrame: # pragma: no cover | ||
if reverted: | ||
df = pandas.DataFrame.join(right, left, **kwargs) | ||
else: | ||
df = pandas.DataFrame.join(left, right, **kwargs) | ||
return df | ||
|
||
right_to_broadcast = right._modin_frame.combine() | ||
new_self = self.__constructor__( | ||
self._modin_frame.broadcast_apply_full_axis( | ||
left = left.__constructor__( | ||
left._modin_frame.broadcast_apply_full_axis( | ||
axis=1, | ||
func=map_func, | ||
# We're going to explicitly change the shape across the 1-axis, | ||
# so we want for partitioning to adapt as well | ||
keep_partitioning=False, | ||
num_splits=merge_partitioning( | ||
self._modin_frame, right._modin_frame, axis=1 | ||
left._modin_frame, right._modin_frame, axis=1 | ||
), | ||
other=right_to_broadcast, | ||
) | ||
) | ||
return new_self.sort_rows_by_column_values(on) if sort else new_self | ||
return left.sort_rows_by_column_values(on) if sort else left | ||
else: | ||
return self.default_to_pandas(pandas.DataFrame.join, right, **kwargs) | ||
return left.default_to_pandas(pandas.DataFrame.join, right, **kwargs) | ||
|
||
# END Inter-Data operations | ||
|
||
|
@@ -586,7 +601,7 @@ def reindex(self, axis, labels, **kwargs): | |
) | ||
return self.__constructor__(new_modin_frame) | ||
|
||
def reset_index(self, **kwargs): | ||
def reset_index(self, **kwargs) -> PandasQueryCompiler: | ||
if self.lazy_execution: | ||
|
||
def _reset(df, *axis_lengths, partition_idx): # pragma: no cover | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
same
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
#7226 (comment)