Skip to content

Commit

Permalink
fixes: remove redundant tests, and fix comments
Browse files Browse the repository at this point in the history
Signed-off-by: Navin Kumar <navink@nvidia.com>
  • Loading branch information
NVnavkumar committed Mar 10, 2022
1 parent 6d30f75 commit ac2a70d
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 46 deletions.
43 changes: 0 additions & 43 deletions integration_tests/src/main/python/window_function_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -447,28 +447,6 @@ def test_window_running_rank_no_part(data_gen):
validate_execs_in_gpu_plan = ['GpuRunningWindowExec'],
conf = conf)

# Percent rank is a rank aggregation that, like rank and dense rank, cares about
# ordering. Unlike rank and dense rank, however, it does not use the GPU running
# window optimization, so it needs a test separate from rank and dense rank.
@pytest.mark.parametrize('data_gen',
        all_basic_gens_no_nans + [decimal_gen_32bit, decimal_gen_128bit],
        ids=meta_idfn('data:'))
def test_window_running_percent_rank_no_part(data_gen):
    # A small batch size forces the fixup path to be exercised; behavior with
    # exact inputs is already covered by other tests.
    test_conf = {'spark.rapids.sql.batchSizeBytes': 1000}
    select_cols = [
        'a',
        'percent_rank() over (order by a rows between UNBOUNDED PRECEDING AND CURRENT ROW) as rank_val']

    # Use RepeatSeqGen so the order-by column contains many duplicate values
    assert_gpu_and_cpu_are_equal_sql(
        lambda spark: unary_op_df(spark, RepeatSeqGen(data_gen, length=500), length=1024 * 14),
        "window_agg_table",
        'select ' + ', '.join(select_cols) + ' from window_agg_table ',
        conf=test_conf)

# Rank aggregations are running window aggregations but they care about the ordering. In most tests we don't
# allow duplicate ordering, because that makes the results ambiguous. If two rows end up being switched even
# if the order-by column is the same then we can get different results for say a running sum. Here we are going
Expand Down Expand Up @@ -496,25 +474,6 @@ def test_window_running_rank(data_gen):
validate_execs_in_gpu_plan = ['GpuRunningWindowExec'],
conf = conf)

# Percent rank is a running window aggregation that currently does not use the
# batched running-window optimization, so it gets its own partitioned test.
@ignore_order(local=True)
@pytest.mark.parametrize('data_gen', all_basic_gens + [decimal_gen_32bit, decimal_gen_128bit], ids=idfn)
def test_window_running_percent_rank(data_gen):
    # A small batch size forces the fixup path to be exercised; behavior with
    # exact inputs is already covered by other tests.
    test_conf = {'spark.rapids.sql.batchSizeBytes': 1000}
    select_cols = [
        'b',
        'a',
        'percent_rank() over (partition by b order by a rows between UNBOUNDED PRECEDING AND CURRENT ROW) as percent_rank_val']

    # Use RepeatSeqGen for both columns so the ordering contains many duplicates
    assert_gpu_and_cpu_are_equal_sql(
        lambda spark: two_col_df(spark, RepeatSeqGen(data_gen, length=500), RepeatSeqGen(data_gen, length=100), length=1024 * 14),
        "window_agg_table",
        'select ' + ', '.join(select_cols) + ' from window_agg_table ',
        conf=test_conf)

# This is for aggregations that work with a running window optimization. They don't need to be batched
# specially, but it only works if all of the aggregations can support this.
# In a distributed setup the order of the partitions returned might be different, so we must ignore the order
Expand Down Expand Up @@ -925,8 +884,6 @@ def test_running_window_function_exec_for_all_aggs():
(partition by a order by b,c_int) as rank_val,
dense_rank() over
(partition by a order by b,c_int) as dense_rank_val,
percent_rank() over
(partition by a order by b,c_int) as percent_rank_val,
collect_list(c_float) over
(partition by a order by b,c_int rows between UNBOUNDED PRECEDING AND CURRENT ROW) as collect_float,
collect_list(c_decimal_32) over
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1528,9 +1528,11 @@ case class GpuLag(input: Expression, offset: Expression, default: Expression)
}

/**
* Technically, percent_rank() *is* a running window function, but because of
* the current implementation of CUDF, it cannot currently be run with a fixer.
*
* percent_rank() is a running window function in that it only operates on a window of unbounded
* preceding to current row. But, an entire window has to be in the batch because the rank is
* divided by the number of entries in the window to get the percent rank. We cannot know the number
* of entries in the window without the entire window. This is why it is not a
* `GpuBatchedRunningWindowWithFixer`.
*/
case class GpuPercentRank(children: Seq[Expression]) extends GpuRunningWindowFunction {
override def nullable: Boolean = false
Expand Down

0 comments on commit ac2a70d

Please sign in to comment.