mabel-dev · joocer · Oct 4, 2024 · Oct 4, 2024 · Oct 4, 2024
diff --git a/opteryx/__version__.py b/opteryx/__version__.py
@@ -1,4 +1,4 @@
-__build__ = 817
+__build__ = 818
 
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

diff --git a/opteryx/planner/binder/binder_visitor.py b/opteryx/planner/binder/binder_visitor.py
@@ -803,19 +803,6 @@ def visit_project(self, node: Node, context: BindingContext) -> Tuple[Node, Bind
                 message=f"Query result contains multiple instances of the same column(s) - `{'`, `'.join(matches)}`"
             )
 
-        # get any column or field from a realtion referenced
-        # 1984
-        all_identities = set(
-            [
-                item.schema_column.identity
-                for sublist in [
-                    get_all_nodes_of_type(c, (NodeType.IDENTIFIER,)) for c in node.columns
-                ]
-                for item in sublist
-            ]
-            + all_top_level_identities
-        )
-
         # Remove columns not being projected from the schemas, and remove empty schemas
         columns = []
         for relation, schema in list(context.schemas.items()):
@@ -838,8 +825,6 @@ def visit_project(self, node: Node, context: BindingContext) -> Tuple[Node, Bind
                 # update the schema with columns we have references to, removing redundant columns
                 schema.columns = schema_columns
                 for column in node.columns:
-                    # indirect references are when we're keeping a column for a function or sort
-                    # 1984                    column.direct_reference = column.identity in all_top_level_identities
                     if column.schema_column.identity in [i.identity for i in schema_columns]:
                         columns.append(column)
 

diff --git a/opteryx/planner/cost_based_optimizer/strategies/optimization_strategy.py b/opteryx/planner/cost_based_optimizer/strategies/optimization_strategy.py
@@ -24,6 +24,9 @@ def __init__(self, tree: LogicalPlan):
         self.pre_optimized_tree = tree
         self.optimized_plan = LogicalPlan()
 
+        self.seen_projections: int = 0
+        self.seen_unions: int = 0
+
         self.collected_predicates: list = []
         """We collect predicates we should be able to push to reads and joins"""
 

diff --git a/opteryx/planner/cost_based_optimizer/strategies/projection_pushdown.py b/opteryx/planner/cost_based_optimizer/strategies/projection_pushdown.py
@@ -37,14 +37,18 @@ def visit(self, node: LogicalPlanNode, context: OptimizerContext) -> OptimizerCo
         """
         node.pre_update_columns = set(context.collected_identities)
 
-        # If we're at a project, we only keep the columns that are referenced
-        # this is mainly when we have columns in a subquery which aren't used
-        # in the outer query
-        # 1984
-        #        if node.node_type == LogicalPlanStepType.Project:
-        #            node.columns = [
-        #                n for n in node.columns if n.schema_column.identity in context.collected_identities
-        #            ]
+        # If we're at something other than the top project in a plan (e.g. in a subquery), we
+        # may be able to remove some columns (and potentially some evaluations) if the columns
+        # aren't referenced in the outer query.
+        if node.node_type == LogicalPlanStepType.Union:
+            context.seen_unions += 1
+        if node.node_type == LogicalPlanStepType.Project:
+            if context.seen_unions == 0 and context.seen_projections > 0:
+                node.columns = [
+                    n for n in node.columns if n.schema_column.identity in node.pre_update_columns
+                ]
+            if context.seen_unions == 0:
+                context.seen_projections += 1
 
         # Subqueries act like all columns are referenced
         if node.node_type != LogicalPlanStepType.Subquery:

diff --git a/tests/plan_optimization/test_projection_pushdown_parquet.py b/tests/plan_optimization/test_projection_pushdown_parquet.py
@@ -18,6 +18,17 @@
     ("SELECT obliquityToOrbit, meanTemperature, surfacePressure FROM testdata.planets;", 3),
     ("SELECT numberOfMoons, name FROM testdata.planets;", 2),
 
+    # Pushing past subqueries (we're using a view here)
+    ("SELECT DISTINCT Company FROM launches", 1),
+    ("SELECT Company FROM launches", 1),
+    ("SELECT * FROM launches", 3),
+    ("SELECT DISTINCT Company FROM launches ORDER BY Company", 1),
+    ("SELECT DISTINCT Mission FROM launches", 1),
+    ("SELECT LL FROM launches", 1),
+    ("SELECT LOG2(LL) FROM launches", 1),
+    ("SELECT LEN(Company) > LL from launches", 2),
+    ("SELECT LL from launches WHERE LEN(Company) < LL", 2)
+
 ]
 
 @pytest.mark.parametrize("query, expected_columns", STATEMENTS)

diff --git a/tests/sql_battery/test_shapes_and_errors_battery.py b/tests/sql_battery/test_shapes_and_errors_battery.py
@@ -2005,7 +2005,7 @@
         #2042
         ("SELECT DISTINCT Company FROM launches", 62, 1, None),
         ("SELECT Company FROM launches", 4630, 1, None),
-        ("SELECT * FROM launches", 4630, 2, None),
+        ("SELECT * FROM launches", 4630, 3, None),
         ("SELECT DISTINCT Company FROM launches ORDER BY Company", 62, 1, None),
         ("SELECT DISTINCT Mission FROM launches", 4556, 1, None)
 ]

diff --git a/views.json b/views.json
@@ -6,6 +6,6 @@
         "statement": "/* A test case for row-permissions functionality */ SELECT * FROM $astronauts WHERE LIST_CONTAINS_ANY(missions, @@user_memberships)"
     },
     "launches": {
-        "statement": "/* regression test */ SELECT Company, Mission FROM $missions" 
+        "statement": "/* regression test */ SELECT Company, Mission, LEN(Location) AS LL FROM $missions" 
     }
 }