-
Notifications
You must be signed in to change notification settings - Fork 594
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat(expr, agg): support `PERCENTILE_CONT`, `PERCENTILE_DISC` and `MODE` aggregation
#10252
Merged
Merged
Changes from 36 commits
Commits
Show all changes
38 commits
Select commit
Hold shift + click to select a range
fcc63e7
merge binder
Honeta 546d2bd
support percentile_cont(buggy)
Honeta e8fa9d3
quick fix
stdrc 2307ce0
general compete
Honeta 921636a
arg must be f64
Honeta 2bba651
merge code
Honeta 6cc6792
fix check
Honeta 2c17be4
fix
Honeta cf1a0c5
make project to f64(buggy)
Honeta c603bc3
merge main
Honeta 64acacc
agg complete
Honeta ed5521e
add slt
Honeta 43583cb
Merge branch 'main' into xinjing/percentile_cont
Honeta d6661e8
fix format
Honeta 82939fb
fix check
Honeta 7f3052e
update planner test
Honeta 801d41e
add check in to_stream
Honeta 7c5cd04
Merge branch 'main' into xinjing/percentile_cont
Honeta a32a080
fix format
Honeta 70a3402
modify fuze test
Honeta d3023ee
fix slt
Honeta 53003d2
Merge branch 'main' into xinjing/percentile_cont
Honeta 27f90c4
Merge branch 'main' into xinjing/percentile_cont
kwannoel 2d00628
Merge branch 'main' into xinjing/percentile_cont
Honeta 7dc17a3
support percentile_disc
Honeta e47f724
Merge branch 'main' into xinjing/percentile_cont
Honeta 6878c34
fix check
Honeta f1f8f11
check fraction in binder
Honeta 459012a
Merge branch 'main' into xinjing/percentile_cont
Honeta 4545a2f
fix disc
Honeta 0c3d26e
create e2e_test/batch/aggregate/ordered_set_agg.slt.part
Honeta 45b7f7c
fix
Honeta a46d53b
Merge branch 'main' into xinjing/percentile_cont
Honeta 9522d35
support mode
Honeta 99c848a
fix format
Honeta 25e32b0
Merge branch 'main' into xinjing/percentile_cont
Honeta 25cbec3
improve AggCall::infer_return_type
Honeta c3d9881
Merge branch 'main' into xinjing/percentile_cont
Honeta File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
statement error | ||
select p, percentile_cont(p) within group (order by x::float8) | ||
from generate_series(1,5) x, | ||
(values (0::float8),(0.1),(0.25),(0.4),(0.5),(0.6),(0.75),(0.9),(1)) v(p) | ||
group by p order by p; | ||
|
||
statement error | ||
select percentile_cont(array[0,1,0.25,0.75,0.5,1,0.3,0.32,0.35,0.38,0.4]) within group (order by x) | ||
from generate_series(1,6) x; | ||
|
||
statement error | ||
select percentile_disc(array[0.25,0.5,0.75]) within group (order by x) | ||
from unnest('{fred,jim,fred,jack,jill,fred,jill,jim,jim,sheila,jim,sheila}'::text[]) u(x); | ||
|
||
statement error | ||
select pg_collation_for(percentile_disc(1) within group (order by x collate "POSIX")) | ||
from (values ('fred'),('jim')) v(x); | ||
|
||
query RR | ||
select | ||
percentile_cont(0.5) within group (order by a), | ||
percentile_disc(0.5) within group (order by a) | ||
from (values(1::float8),(3),(5),(7)) t(a); | ||
---- | ||
4 3 | ||
|
||
query RR | ||
select | ||
percentile_cont(0.25) within group (order by a), | ||
percentile_disc(0.5) within group (order by a) | ||
from (values(1::float8),(3),(5),(7)) t(a); | ||
---- | ||
2.5 3 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,126 @@ | ||
// Copyright 2023 RisingWave Labs | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
use risingwave_common::array::*; | ||
use risingwave_common::estimate_size::EstimateSize; | ||
use risingwave_common::types::*; | ||
use risingwave_expr_macro::build_aggregate; | ||
|
||
use super::Aggregator; | ||
use crate::agg::AggCall; | ||
use crate::Result; | ||
|
||
#[build_aggregate("mode(*) -> *")] | ||
fn build(agg: AggCall) -> Result<Box<dyn Aggregator>> { | ||
Ok(Box::new(Mode::new(agg.return_type))) | ||
} | ||
|
||
/// Computes the mode, the most frequent value of the aggregated argument (arbitrarily choosing the | ||
/// first one if there are multiple equally-frequent values). The aggregated argument must be of a | ||
/// sortable type. | ||
/// | ||
/// ```slt | ||
/// query I | ||
/// select mode() within group (order by unnest) from unnest(array[1]); | ||
/// ---- | ||
/// 1 | ||
/// | ||
/// query I | ||
/// select mode() within group (order by unnest) from unnest(array[1,2,2,3,3,4,4,4]); | ||
/// ---- | ||
/// 4 | ||
/// | ||
/// query R | ||
/// select mode() within group (order by unnest) from unnest(array[0.1,0.2,0.2,0.4,0.4,0.3,0.3,0.4]); | ||
/// ---- | ||
/// 0.4 | ||
/// | ||
/// query R | ||
/// select mode() within group (order by unnest) from unnest(array[1,2,2,3,3,4,4,4,3]); | ||
/// ---- | ||
/// 3 | ||
/// | ||
/// query T | ||
/// select mode() within group (order by unnest) from unnest(array['1','2','2','3','3','4','4','4','3']); | ||
/// ---- | ||
/// 3 | ||
/// | ||
/// query I | ||
/// select mode() within group (order by unnest) from unnest(array[]::int[]); | ||
/// ---- | ||
/// NULL | ||
/// ``` | ||
#[derive(Clone, EstimateSize)] | ||
pub struct Mode { | ||
return_type: DataType, | ||
cur_mode: Datum, | ||
cur_mode_freq: usize, | ||
cur_item: Datum, | ||
cur_item_freq: usize, | ||
} | ||
|
||
impl Mode { | ||
pub fn new(return_type: DataType) -> Self { | ||
Self { | ||
return_type, | ||
cur_mode: None, | ||
cur_mode_freq: 0, | ||
cur_item: None, | ||
cur_item_freq: 0, | ||
} | ||
} | ||
|
||
fn add_datum(&mut self, datum_ref: DatumRef<'_>) { | ||
let datum = datum_ref.to_owned_datum(); | ||
if datum.is_some() && self.cur_item == datum { | ||
self.cur_item_freq += 1; | ||
} else if datum.is_some() { | ||
self.cur_item = datum; | ||
self.cur_item_freq = 1; | ||
} | ||
if self.cur_item_freq > self.cur_mode_freq { | ||
self.cur_mode = self.cur_item.clone(); | ||
self.cur_mode_freq = self.cur_item_freq; | ||
} | ||
} | ||
} | ||
|
||
#[async_trait::async_trait] | ||
impl Aggregator for Mode { | ||
fn return_type(&self) -> DataType { | ||
self.return_type.clone() | ||
} | ||
|
||
async fn update_multi( | ||
&mut self, | ||
input: &DataChunk, | ||
start_row_id: usize, | ||
end_row_id: usize, | ||
) -> Result<()> { | ||
let array = input.column_at(0); | ||
for row_id in start_row_id..end_row_id { | ||
self.add_datum(array.value_at(row_id)); | ||
} | ||
Ok(()) | ||
} | ||
|
||
fn output(&mut self, builder: &mut ArrayBuilderImpl) -> Result<()> { | ||
builder.append(self.cur_mode.clone()); | ||
Ok(()) | ||
} | ||
|
||
fn estimated_size(&self) -> usize { | ||
EstimateSize::estimated_size(self) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,132 @@ | ||
// Copyright 2023 RisingWave Labs | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
use risingwave_common::array::*; | ||
use risingwave_common::estimate_size::EstimateSize; | ||
use risingwave_common::types::*; | ||
use risingwave_expr_macro::build_aggregate; | ||
|
||
use super::Aggregator; | ||
use crate::agg::AggCall; | ||
use crate::Result; | ||
|
||
/// Computes the continuous percentile, a value corresponding to the specified fraction within the | ||
/// ordered set of aggregated argument values. This will interpolate between adjacent input items if | ||
/// needed. | ||
/// | ||
/// ```slt | ||
/// statement ok | ||
/// create table t(x int, y bigint, z real, w double, v varchar); | ||
/// | ||
/// statement ok | ||
/// insert into t values(1,10,100,1000,'10000'),(2,20,200,2000,'20000'),(3,30,300,3000,'30000'); | ||
/// | ||
/// query R | ||
/// select percentile_cont(0.45) within group (order by x desc) from t; | ||
/// ---- | ||
/// 2.1 | ||
/// | ||
/// query R | ||
/// select percentile_cont(0.45) within group (order by y desc) from t; | ||
/// ---- | ||
/// 21 | ||
/// | ||
/// query R | ||
/// select percentile_cont(0.45) within group (order by z desc) from t; | ||
/// ---- | ||
/// 210 | ||
/// | ||
/// query R | ||
/// select percentile_cont(0.45) within group (order by w desc) from t; | ||
/// ---- | ||
/// 2100 | ||
/// | ||
/// query R | ||
/// select percentile_cont(NULL) within group (order by w desc) from t; | ||
/// ---- | ||
/// NULL | ||
/// | ||
/// statement ok | ||
/// drop table t; | ||
/// ``` | ||
#[build_aggregate("percentile_cont(float64) -> float64")] | ||
fn build(agg: AggCall) -> Result<Box<dyn Aggregator>> { | ||
let fraction: Option<f64> = agg.direct_args[0] | ||
.literal() | ||
.map(|x| (*x.as_float64()).into()); | ||
Ok(Box::new(PercentileCont::new(fraction))) | ||
} | ||
|
||
#[derive(Clone, EstimateSize)] | ||
pub struct PercentileCont { | ||
fractions: Option<f64>, | ||
data: Vec<f64>, | ||
} | ||
|
||
impl PercentileCont { | ||
pub fn new(fractions: Option<f64>) -> Self { | ||
Self { | ||
fractions, | ||
data: vec![], | ||
} | ||
} | ||
|
||
fn add_datum(&mut self, datum_ref: DatumRef<'_>) { | ||
if let Some(datum) = datum_ref.to_owned_datum() { | ||
self.data.push((*datum.as_float64()).into()); | ||
} | ||
} | ||
} | ||
|
||
#[async_trait::async_trait] | ||
impl Aggregator for PercentileCont { | ||
fn return_type(&self) -> DataType { | ||
DataType::Float64 | ||
} | ||
|
||
async fn update_multi( | ||
&mut self, | ||
input: &DataChunk, | ||
start_row_id: usize, | ||
end_row_id: usize, | ||
) -> Result<()> { | ||
let array = input.column_at(0); | ||
for row_id in start_row_id..end_row_id { | ||
self.add_datum(array.value_at(row_id)); | ||
} | ||
Ok(()) | ||
} | ||
|
||
fn output(&mut self, builder: &mut ArrayBuilderImpl) -> Result<()> { | ||
if let Some(fractions) = self.fractions && !self.data.is_empty() { | ||
let rn = fractions * (self.data.len() - 1) as f64; | ||
let crn = f64::ceil(rn); | ||
let frn = f64::floor(rn); | ||
let result = if crn == frn { | ||
self.data[crn as usize] | ||
} else { | ||
(crn - rn) * self.data[frn as usize] | ||
+ (rn - frn) * self.data[crn as usize] | ||
}; | ||
builder.append(Some(ScalarImpl::Float64(result.into()))); | ||
} else { | ||
builder.append(Datum::None); | ||
} | ||
Ok(()) | ||
} | ||
|
||
fn estimated_size(&self) -> usize { | ||
EstimateSize::estimated_size(self) | ||
} | ||
} |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The signature should be `mode(*) -> auto`, meaning the output type is the same as the input type. Using two `*` produces the Cartesian product of all combinations. cc @wangrunji0408 Any ideas on preventing such misuses?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
indeed, let me fix it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It is valid and possible to have such a signature to produce a Cartesian product, e.g. `cast(*) -> *`, so we can't forbid this usage. But it's possible to raise a warning in the proc-macro and allow the user to silence it if intended.