Skip to content

Commit

Permalink
feat(binder): support group by output alias or index (#10305)
Browse files Browse the repository at this point in the history
  • Loading branch information
xiangjinwu authored Jun 14, 2023
1 parent 8eb0e43 commit e4aec8b
Show file tree
Hide file tree
Showing 4 changed files with 265 additions and 16 deletions.
82 changes: 82 additions & 0 deletions src/frontend/planner_test/tests/testdata/input/agg.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,88 @@
select v1 from t group by v1 + v2;
expected_outputs:
- planner_error
- name: group by output column ordinal ok
sql: |
select 4 + 5 group by 1;
expected_outputs:
- logical_plan
- name: group by const int expr
sql: |
select 4 + 5 group by 3 - 2; -- not folded
expected_outputs:
- logical_plan
- name: group by output column ordinal of agg
sql: |
select sum(2) group by 1;
expected_outputs:
- planner_error
- name: group by output column ordinal non-integer const
sql: |
select 4 + 5 group by null; -- no implicit cast
expected_outputs:
- binder_error
- name: group by output column ordinal bigint
sql: |
select 4 + 5 group by 2147483648;
expected_outputs:
- binder_error
- name: group by output column ordinal negative
sql: |
select 4 + 5 group by -2147483648;
expected_outputs:
- binder_error
- name: group by output column ordinal zero
sql: |
select 4 + 5 group by 0;
expected_outputs:
- binder_error
- name: group by output column ordinal out of bound (excl extra order by exprs)
sql: |
select 4 + 5 group by 2 order by 7 + 8, 3 + 2;
expected_outputs:
- binder_error
- name: group by output column name ok
sql: |
select 4 + 5 as a group by a;
expected_outputs:
- logical_plan
- name: group by output column name ambiguous 2
sql: |
select 4 + 5 as a, 2 + 3 as a group by a;
expected_outputs:
- binder_error
- name: group by output column name ambiguous 3
sql: |
select 4 + 5 as a, 2 + 3 as a, 3 + 4 as a group by a;
expected_outputs:
- binder_error
- name: group by output column name not ambiguous when unused
sql: |
select 4 + 5 as a, 2 + 3 as a, 3 + 4 as b group by b;
expected_outputs:
- logical_plan
- name: group by output column name not ambiguous when input preferred
sql: |
create table t (a int);
select 4 + 5 as a, 2 + 3 as a from t group by a;
expected_outputs:
- logical_plan
- name: group by output column name expr disallowed
sql: |
select 4 + 5 as a group by a + 1;
expected_outputs:
- binder_error
- name: group by prefers input while order by prefers output
sql: |
create table t (a int);
select 4 + 5 as a from t group by a order by a;
expected_outputs:
- batch_plan
- name: group by column not found
sql: |
select 4 + 5 as a group by b;
expected_outputs:
- binder_error
- sql: |
create table t(v1 int, v2 int);
select count(v1 + v2) as cnt, sum(v1 + v2) as sum from t;
Expand Down
102 changes: 102 additions & 0 deletions src/frontend/planner_test/tests/testdata/output/agg.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,108 @@
create table t(v1 int, v2 int);
select v1 from t group by v1 + v2;
planner_error: 'Invalid input syntax: column must appear in the GROUP BY clause or be used in an aggregate function'
- name: group by output column ordinal ok
sql: |
select 4 + 5 group by 1;
logical_plan: |
LogicalProject { exprs: [$expr1] }
└─LogicalAgg { group_key: [$expr1], aggs: [] }
└─LogicalProject { exprs: [(4:Int32 + 5:Int32) as $expr1] }
└─LogicalValues { rows: [[]], schema: Schema { fields: [] } }
- name: group by const int expr
sql: |
select 4 + 5 group by 3 - 2; -- not folded
logical_plan: |
LogicalProject { exprs: [(4:Int32 + 5:Int32) as $expr2] }
└─LogicalAgg { group_key: [$expr1], aggs: [] }
└─LogicalProject { exprs: [(3:Int32 - 2:Int32) as $expr1] }
└─LogicalValues { rows: [[]], schema: Schema { fields: [] } }
- name: group by output column ordinal of agg
sql: |
select sum(2) group by 1;
planner_error: |-
Feature is not yet implemented: aggregate function inside GROUP BY
No tracking issue yet. Feel free to submit a feature request at https://github.com/risingwavelabs/risingwave/issues/new?labels=type%2Ffeature&template=feature_request.yml
- name: group by output column ordinal non-integer const
sql: |
select 4 + 5 group by null; -- no implicit cast
binder_error: 'Bind error: non-integer constant in GROUP BY'
- name: group by output column ordinal bigint
sql: |
select 4 + 5 group by 2147483648;
binder_error: 'Bind error: non-integer constant in GROUP BY'
- name: group by output column ordinal negative
sql: |
select 4 + 5 group by -2147483648;
binder_error: 'Bind error: GROUP BY position -2147483648 is not in select list'
- name: group by output column ordinal zero
sql: |
select 4 + 5 group by 0;
binder_error: 'Bind error: GROUP BY position 0 is not in select list'
- name: group by output column ordinal out of bound (excl extra order by exprs)
sql: |
select 4 + 5 group by 2 order by 7 + 8, 3 + 2;
binder_error: 'Bind error: GROUP BY position 2 is not in select list'
- name: group by output column name ok
sql: |
select 4 + 5 as a group by a;
logical_plan: |
LogicalProject { exprs: [$expr1] }
└─LogicalAgg { group_key: [$expr1], aggs: [] }
└─LogicalProject { exprs: [(4:Int32 + 5:Int32) as $expr1] }
└─LogicalValues { rows: [[]], schema: Schema { fields: [] } }
- name: group by output column name ambiguous 2
sql: |
select 4 + 5 as a, 2 + 3 as a group by a;
binder_error: 'Bind error: GROUP BY "a" is ambiguous'
- name: group by output column name ambiguous 3
sql: |
select 4 + 5 as a, 2 + 3 as a, 3 + 4 as a group by a;
binder_error: 'Bind error: GROUP BY "a" is ambiguous'
- name: group by output column name not ambiguous when unused
sql: |
select 4 + 5 as a, 2 + 3 as a, 3 + 4 as b group by b;
logical_plan: |
LogicalProject { exprs: [(4:Int32 + 5:Int32) as $expr2, (2:Int32 + 3:Int32) as $expr3, $expr1] }
└─LogicalAgg { group_key: [$expr1], aggs: [] }
└─LogicalProject { exprs: [(3:Int32 + 4:Int32) as $expr1] }
└─LogicalValues { rows: [[]], schema: Schema { fields: [] } }
- name: group by output column name not ambiguous when input preferred
sql: |
create table t (a int);
select 4 + 5 as a, 2 + 3 as a from t group by a;
logical_plan: |
LogicalProject { exprs: [(4:Int32 + 5:Int32) as $expr1, (2:Int32 + 3:Int32) as $expr2] }
└─LogicalAgg { group_key: [t.a], aggs: [] }
└─LogicalProject { exprs: [t.a] }
└─LogicalScan { table: t, columns: [t.a, t._row_id] }
- name: group by output column name expr disallowed
sql: |
select 4 + 5 as a group by a + 1;
binder_error: |-
Bind error: failed to bind expression: a + 1
Caused by:
Item not found: Invalid column: a
- name: group by prefers input while order by prefers output
sql: |
create table t (a int);
select 4 + 5 as a from t group by a order by a;
batch_plan: |
BatchExchange { order: [9:Int32 ASC], dist: Single }
└─BatchSort { order: [9:Int32 ASC] }
└─BatchProject { exprs: [9:Int32] }
└─BatchHashAgg { group_key: [t.a], aggs: [] }
└─BatchExchange { order: [], dist: HashShard(t.a) }
└─BatchScan { table: t, columns: [t.a], distribution: SomeShard }
- name: group by column not found
sql: |
select 4 + 5 as a group by b;
binder_error: |-
Bind error: failed to bind expression: b
Caused by:
Item not found: Invalid column: b
- sql: |
create table t(v1 int, v2 int);
select count(v1 + v2) as cnt, sum(v1 + v2) as sum from t;
Expand Down
30 changes: 16 additions & 14 deletions src/frontend/src/binder/query.rs
Original file line number Diff line number Diff line change
Expand Up @@ -176,20 +176,8 @@ impl Binder {
self.bind_with(with)?;
}
let body = self.bind_set_expr(body)?;
let mut name_to_index = HashMap::new();
body.schema()
.fields()
.iter()
.enumerate()
.for_each(|(index, field)| {
name_to_index
.entry(field.name.clone())
// Ambiguous (duplicate) output names are marked with usize::MAX.
// This is not necessarily an error as long as not actually referenced by order
// by.
.and_modify(|v| *v = usize::MAX)
.or_insert(index);
});
let name_to_index =
Self::build_name_to_index(body.schema().fields().iter().map(|f| f.name.clone()));
let mut extra_order_exprs = vec![];
let visible_output_num = body.schema().len();
let order = order_by
Expand All @@ -213,11 +201,25 @@ impl Binder {
})
}

pub fn build_name_to_index(names: impl Iterator<Item = String>) -> HashMap<String, usize> {
let mut m = HashMap::new();
names.enumerate().for_each(|(index, name)| {
m.entry(name)
// Ambiguous (duplicate) output names are marked with usize::MAX.
// This is not necessarily an error as long as not actually referenced.
.and_modify(|v| *v = usize::MAX)
.or_insert(index);
});
m
}

/// Bind an `ORDER BY` expression in a [`Query`], which can be either:
/// * an output-column name
/// * index of an output column
/// * an arbitrary expression
///
/// Refer to `bind_group_by_expr_in_select` to see their similarities and differences.
///
/// # Arguments
///
/// * `name_to_index` - visible output column name -> index. Ambiguous (duplicate) output names
Expand Down
67 changes: 65 additions & 2 deletions src/frontend/src/binder/select.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,13 @@
// See the License for the specific language governing permissions and
// limitations under the License.

use std::collections::HashMap;
use std::fmt::Debug;

use itertools::Itertools;
use risingwave_common::catalog::{Field, Schema, PG_CATALOG_SCHEMA_NAME};
use risingwave_common::error::{ErrorCode, Result, RwError};
use risingwave_common::types::DataType;
use risingwave_common::types::{DataType, ScalarImpl};
use risingwave_common::util::iter_util::ZipEqFast;
use risingwave_sqlparser::ast::{DataType as AstDataType, Distinct, Expr, Select, SelectItem};

Expand Down Expand Up @@ -177,11 +178,12 @@ impl Binder {
self.context.clause = None;

// Bind GROUP BY clause.
let out_name_to_index = Self::build_name_to_index(aliases.iter().filter_map(Clone::clone));
self.context.clause = Some(Clause::GroupBy);
let group_by = select
.group_by
.into_iter()
.map(|expr| self.bind_expr(expr))
.map(|expr| self.bind_group_by_expr_in_select(expr, &out_name_to_index, &select_items))
.try_collect()?;
self.context.clause = None;

Expand Down Expand Up @@ -297,6 +299,67 @@ impl Binder {
Ok((select_list, aliases))
}

/// Bind an `GROUP BY` expression in a [`Select`], which can be either:
/// * index of an output column
/// * an arbitrary expression on input columns
/// * an output-column name
///
/// Note the differences from `bind_order_by_expr_in_query`:
/// * When a name matches both an input column and an output column, `group by` interprets it as
/// input column while `order by` interprets it as output column.
/// * As the name suggests, `group by` is part of `select` while `order by` is part of `query`.
/// A `query` may consist unions of multiple `select`s (each with their own `group by`) but
/// only one `order by`.
/// * Logically / semantically, `group by` evaluates before `select items`, which evaluates
/// before `order by`. This means, `group by` can evaluate arbitrary expressions itself, or
/// take expressions from `select items` (we `clone` here and `logical_agg` will rewrite those
/// `select items` to `InputRef`). However, `order by` can only refer to `select items`, or
/// append its extra arbitrary expressions as hidden `select items` for evaluation.
///
/// # Arguments
///
/// * `name_to_index` - output column name -> index. Ambiguous (duplicate) output names are
/// marked with `usize::MAX`.
fn bind_group_by_expr_in_select(
&mut self,
expr: Expr,
name_to_index: &HashMap<String, usize>,
select_items: &[ExprImpl],
) -> Result<ExprImpl> {
let name = match &expr {
Expr::Identifier(ident) => Some(ident.real_value()),
_ => None,
};
match self.bind_expr(expr) {
Ok(ExprImpl::Literal(lit)) => match lit.get_data() {
Some(ScalarImpl::Int32(idx)) => idx
.saturating_sub(1)
.try_into()
.ok()
.and_then(|i: usize| select_items.get(i).cloned())
.ok_or_else(|| {
ErrorCode::BindError(format!(
"GROUP BY position {idx} is not in select list"
))
.into()
}),
_ => Err(ErrorCode::BindError("non-integer constant in GROUP BY".into()).into()),
},
Ok(e) => Ok(e),
Err(e) => match name {
None => Err(e),
Some(name) => match name_to_index.get(&name) {
None => Err(e),
Some(&usize::MAX) => Err(ErrorCode::BindError(format!(
"GROUP BY \"{name}\" is ambiguous"
))
.into()),
Some(out_idx) => Ok(select_items[*out_idx].clone()),
},
},
}
}

pub fn bind_returning_list(
&mut self,
returning_items: Vec<SelectItem>,
Expand Down

0 comments on commit e4aec8b

Please sign in to comment.