Skip to content

Commit

Permalink
FEAT-modin-project#7100: Add range-partitioning impl for 'nunique()'
Browse files Browse the repository at this point in the history
Signed-off-by: Dmitry Chigarev <dmitry.chigarev@intel.com>
  • Loading branch information
dchigarev committed Mar 18, 2024
1 parent f059916 commit 6436ce7
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 11 deletions.
3 changes: 2 additions & 1 deletion .github/actions/run-core-tests/group_3/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@ runs:
echo "::endgroup::"
shell: bash -l {0}
- run: |
echo "::group::Running experimental groupby tests (group 3)..."
echo "::group::Running range-partitioning tests (group 3)..."
MODIN_RANGE_PARTITIONING_GROUPBY=1 ${{ inputs.runner }} ${{ inputs.parallel }} modin/pandas/test/test_groupby.py
MODIN_RANGE_PARTITIONING=1 ${{ inputs.runner }} ${{ inputs.parallel }} modin/pandas/test/test_series.py -k "test_nunique"
echo "::endgroup::"
shell: bash -l {0}
29 changes: 19 additions & 10 deletions modin/core/storage_formats/pandas/query_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -956,26 +956,35 @@ def median(self, axis, **kwargs):

def nunique(self, axis=0, dropna=True):
    """
    Count distinct values along the given axis.

    The range-partitioning implementation is used only when it is enabled
    via ``RangePartitioning``, ``axis`` is 0 and the frame has a single
    column. In every other case the computation is delegated to
    ``Reduce.register(pandas.DataFrame.nunique)``.

    Parameters
    ----------
    axis : {0, 1}, default: 0
        Axis to count distinct values along.
    dropna : bool, default: True
        Forwarded to ``nunique`` in both implementations.

    Returns
    -------
    object
        Result of ``Reduce.register(pandas.DataFrame.nunique)`` on the
        fallback path; otherwise ``self.__constructor__`` applied to the
        reduced frame with ``shape_hint="column"``.
    """
    if not RangePartitioning.get():
        return Reduce.register(pandas.DataFrame.nunique)(
            self, axis=axis, dropna=dropna
        )

    # Collect every reason the range-partitioning path can't be taken so that
    # the user sees all of them in a single message.
    unsupported_message = ""
    if axis != 0:
        unsupported_message += (
            "Range-partitioning 'nunique()' is only supported for 'axis=0'.\n"
        )
    if len(self.columns) > 1:
        unsupported_message += (
            "Range-partitioning 'nunique()' is only supported for a "
            + "single-column dataframe.\n"
        )

    if unsupported_message:
        message = (
            "Can't use range-partitioning implementation for 'nunique' "
            + f"because:\n{unsupported_message}"
            + "Falling back to a full-axis implementation."
        )
        # Report the fallback both to the log and as a user-visible warning.
        get_logger().info(message)
        ErrorMessage.warn(message)
        return Reduce.register(pandas.DataFrame.nunique)(
            self, axis=axis, dropna=dropna
        )

    # Compute 'nunique()' separately for each range partition.
    # NOTE(review): summing the per-partition counts below presumably relies on
    # equal key values always landing in the same partition - confirm against
    # '_apply_func_to_range_partitioning'.
    new_modin_frame = self._modin_frame._apply_func_to_range_partitioning(
        key_columns=self.columns.tolist(),
        func=lambda df: df.nunique(dropna=dropna).to_frame(),
    )
    # Sum the per-partition results to get the final value.
    new_modin_frame = new_modin_frame.reduce(axis=0, function=lambda df: df.sum())
    return self.__constructor__(new_modin_frame, shape_hint="column")

def skew(self, axis, **kwargs):
Expand Down
10 changes: 10 additions & 0 deletions modin/pandas/test/dataframe/test_reduce.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,16 @@ def test_count(data, axis):
)


@pytest.mark.parametrize("axis", axis_values, ids=axis_keys)
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize("dropna", [True, False])
def test_nunique(data, axis, dropna):
    """Run ``nunique`` through ``eval_general`` on the frames built from ``data``."""

    def count_unique(df):
        # Operation under test, applied to each frame passed to 'eval_general'.
        return df.nunique(axis=axis, dropna=dropna)

    frames = create_test_dfs(data)
    eval_general(*frames, count_unique)


@pytest.mark.parametrize("numeric_only", [False, True])
def test_count_specific(numeric_only):
eval_general(
Expand Down

0 comments on commit 6436ce7

Please sign in to comment.