Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PERF-#5268: Call get on all partitions at once in to_pandas #4776

Merged
merged 15 commits into from
Nov 27, 2022
Merged
19 changes: 18 additions & 1 deletion modin/core/dataframe/pandas/partitioning/partition_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -641,7 +641,24 @@ def to_pandas(cls, partitions):
pandas.DataFrame
A pandas DataFrame
"""
retrieved_objects = [[obj.to_pandas() for obj in part] for part in partitions]
retrieved_objects = cls.get_objects_from_partitions(partitions.flatten())
if all(
isinstance(obj, (pandas.DataFrame, pandas.Series))
vnlitvinov marked this conversation as resolved.
Show resolved Hide resolved
for obj in retrieved_objects
):
height, width, *_ = tuple(partitions.shape) + (0,)
# restore 2d array
objs = iter(retrieved_objects)
retrieved_objects = [
[next(objs) for _ in range(width)] for __ in range(height)
]
else:
# Partitions do not always contain pandas objects, for example, hdk uses pyarrow tables.
# This implementation comes from the fact that calling `partition.get`
# function is not always equivalent to `partition.to_pandas`.
retrieved_objects = [
[obj.to_pandas() for obj in part] for part in partitions
]
vnlitvinov marked this conversation as resolved.
Show resolved Hide resolved
if all(
isinstance(part, pandas.Series) for row in retrieved_objects for part in row
):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,9 @@ def get_objects_from_partitions(cls, partitions):
list
The objects wrapped by `partitions`.
"""
for idx, part in enumerate(partitions):
if hasattr(part, "force_materialization"):
partitions[idx] = part.force_materialization()
anmyachev marked this conversation as resolved.
Show resolved Hide resolved
assert all(
[len(partition.list_of_blocks) == 1 for partition in partitions]
), "Implementation assumes that each partition contains a signle block."
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,9 @@ def get_objects_from_partitions(cls, partitions):
list
The objects wrapped by `partitions`.
"""
for idx, part in enumerate(partitions):
if hasattr(part, "force_materialization"):
partitions[idx] = part.force_materialization()
assert all(
[len(partition.list_of_blocks) == 1 for partition in partitions]
), "Implementation assumes that each partition contains a signle block."
Expand Down