From 99c7f4ddebfdaa9a2d9de0ec167f6f4eddcb768d Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Thu, 28 Nov 2024 15:24:31 +0100 Subject: [PATCH] fix: Fix invalid len due to projection pushdown selection of scalar (#20049) --- .../projection_pushdown/projection.rs | 73 +++++++++++++++---- py-polars/tests/unit/test_scalar.py | 16 ++++ 2 files changed, 73 insertions(+), 16 deletions(-) diff --git a/crates/polars-plan/src/plans/optimizer/projection_pushdown/projection.rs b/crates/polars-plan/src/plans/optimizer/projection_pushdown/projection.rs index 1bd4b6f1a9ec..6327238c4c21 100644 --- a/crates/polars-plan/src/plans/optimizer/projection_pushdown/projection.rs +++ b/crates/polars-plan/src/plans/optimizer/projection_pushdown/projection.rs @@ -67,25 +67,66 @@ pub(super) fn process_projection( let expr = if input_schema.is_empty() { // If the input schema is empty, we should just project // ourselves - exprs[0].node() + Some(exprs[0].node()) } else { - // simply select the last column - // NOTE: the first can be the inserted index column, so that might not work - let (first_name, _) = input_schema.try_get_at_index(input_schema.len() - 1)?; - let expr = expr_arena.add(AExpr::Column(first_name.clone())); - if !acc_projections.is_empty() { - check_double_projection( - &exprs[0], - expr_arena, - &mut acc_projections, - &mut projected_names, - ); + // Select the last column projection. + let mut name = None; + for (_, plan) in (&*lp_arena).iter(input) { + match plan { + IR::Select { expr: exprs, .. } | IR::HStack { exprs, .. } => { + for e in exprs { + if !e.is_scalar(expr_arena) { + name = Some(e.output_name()); + break; + } + } + }, + IR::Scan { + file_info, + output_schema, + .. + } => { + let schema = output_schema.as_ref().unwrap_or(&file_info.schema); + // NOTE: the first can be the inserted index column, so that might not work + let (last_name, _) = schema.try_get_at_index(schema.len() - 1)?; + name = Some(last_name); + break; + }, + IR::DataFrameScan { + schema, + output_schema, + .. + } => { + // NOTE: the first can be the inserted index column, so that might not work + let schema = output_schema.as_ref().unwrap_or(schema); + let (last_name, _) = schema.try_get_at_index(schema.len() - 1)?; + name = Some(last_name); + break; + }, + _ => {}, + } + } + + if let Some(name) = name { + let expr = expr_arena.add(AExpr::Column(name.clone())); + if !acc_projections.is_empty() { + check_double_projection( + &exprs[0], + expr_arena, + &mut acc_projections, + &mut projected_names, + ); + } + Some(expr) + } else { + None } - expr }; - add_expr_to_accumulated(expr, &mut acc_projections, &mut projected_names, expr_arena); - local_projection.push(exprs.pop().unwrap()); - proj_pd.is_count_star = true; + if let Some(expr) = expr { + add_expr_to_accumulated(expr, &mut acc_projections, &mut projected_names, expr_arena); + local_projection.push(exprs.pop().unwrap()); + proj_pd.is_count_star = true; + } } else { // A projection can consist of a chain of expressions followed by an alias. // We want to do the chain locally because it can have complicated side effects. diff --git a/py-polars/tests/unit/test_scalar.py b/py-polars/tests/unit/test_scalar.py index 4b1e9c135d5c..868cb72e6991 100644 --- a/py-polars/tests/unit/test_scalar.py +++ b/py-polars/tests/unit/test_scalar.py @@ -47,3 +47,19 @@ def test_scalar_19957() -> None: "foo": [1, 1, 1], "bar": [1, 1, 1], } + + +def test_scalar_len_20046() -> None: + df = pl.DataFrame({"a": [1, 2, 3]}) + + assert ( + df.lazy() + .select( + pl.col("a"), + pl.lit(1), + ) + .select(pl.len()) + .collect() + .item() + == 3 + )