Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

#1984 #2049

Merged
merged 2 commits into from
Oct 4, 2024
Merged

#1984 #2049

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion opteryx/__version__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__build__ = 817
__build__ = 818

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down
15 changes: 0 additions & 15 deletions opteryx/planner/binder/binder_visitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -803,19 +803,6 @@ def visit_project(self, node: Node, context: BindingContext) -> Tuple[Node, Bind
message=f"Query result contains multiple instances of the same column(s) - `{'`, `'.join(matches)}`"
)

# get any column or field from a relation referenced
# 1984
all_identities = set(
[
item.schema_column.identity
for sublist in [
get_all_nodes_of_type(c, (NodeType.IDENTIFIER,)) for c in node.columns
]
for item in sublist
]
+ all_top_level_identities
)

# Remove columns not being projected from the schemas, and remove empty schemas
columns = []
for relation, schema in list(context.schemas.items()):
Expand All @@ -838,8 +825,6 @@ def visit_project(self, node: Node, context: BindingContext) -> Tuple[Node, Bind
# update the schema with columns we have references to, removing redundant columns
schema.columns = schema_columns
for column in node.columns:
# indirect references are when we're keeping a column for a function or sort
# 1984 column.direct_reference = column.identity in all_top_level_identities
if column.schema_column.identity in [i.identity for i in schema_columns]:
columns.append(column)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@ def __init__(self, tree: LogicalPlan):
self.pre_optimized_tree = tree
self.optimized_plan = LogicalPlan()

self.seen_projections: int = 0
self.seen_unions: int = 0

self.collected_predicates: list = []
"""We collect predicates we should be able to push to reads and joins"""

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,14 +37,18 @@ def visit(self, node: LogicalPlanNode, context: OptimizerContext) -> OptimizerCo
"""
node.pre_update_columns = set(context.collected_identities)

# If we're at a project, we only keep the columns that are referenced
# this is mainly when we have columns in a subquery which aren't used
# in the outer query
# 1984
# if node.node_type == LogicalPlanStepType.Project:
# node.columns = [
# n for n in node.columns if n.schema_column.identity in context.collected_identities
# ]
# If we're at something other than the top project (e.g. in a subquery) in a plan we
# may be able to remove some columns (and potentially some evaluations) if the columns
# aren't referenced in the outer query.
if node.node_type == LogicalPlanStepType.Union:
context.seen_unions += 1
if node.node_type == LogicalPlanStepType.Project:
if context.seen_unions == 0 and context.seen_projections > 0:
node.columns = [
n for n in node.columns if n.schema_column.identity in node.pre_update_columns
]
if context.seen_unions == 0:
context.seen_projections += 1

# Subqueries act like all columns are referenced
if node.node_type != LogicalPlanStepType.Subquery:
Expand Down
11 changes: 11 additions & 0 deletions tests/plan_optimization/test_projection_pushdown_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,17 @@
("SELECT obliquityToOrbit, meanTemperature, surfacePressure FROM testdata.planets;", 3),
("SELECT numberOfMoons, name FROM testdata.planets;", 2),

# Pushing past subqueries (we're using a view here)
("SELECT DISTINCT Company FROM launches", 1),
("SELECT Company FROM launches", 1),
("SELECT * FROM launches", 3),
("SELECT DISTINCT Company FROM launches ORDER BY Company", 1),
("SELECT DISTINCT Mission FROM launches", 1),
("SELECT LL FROM launches", 1),
("SELECT LOG2(LL) FROM launches", 1),
("SELECT LEN(Company) > LL from launches", 2),
("SELECT LL from launches WHERE LEN(Company) < LL", 2)

]

@pytest.mark.parametrize("query, expected_columns", STATEMENTS)
Expand Down
2 changes: 1 addition & 1 deletion tests/sql_battery/test_shapes_and_errors_battery.py
Original file line number Diff line number Diff line change
Expand Up @@ -2005,7 +2005,7 @@
#2042
("SELECT DISTINCT Company FROM launches", 62, 1, None),
("SELECT Company FROM launches", 4630, 1, None),
("SELECT * FROM launches", 4630, 2, None),
("SELECT * FROM launches", 4630, 3, None),
("SELECT DISTINCT Company FROM launches ORDER BY Company", 62, 1, None),
("SELECT DISTINCT Mission FROM launches", 4556, 1, None)
]
Expand Down
2 changes: 1 addition & 1 deletion views.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,6 @@
"statement": "/* A test case for row-permissions functionality */ SELECT * FROM $astronauts WHERE LIST_CONTAINS_ANY(missions, @@user_memberships)"
},
"launches": {
"statement": "/* regression test */ SELECT Company, Mission FROM $missions"
"statement": "/* regression test */ SELECT Company, Mission, LEN(Location) AS LL FROM $missions"
}
}
Loading