Skip to content

Commit

Permalink
opt: split a disjunction of equijoin predicates into a union of joins
Browse files Browse the repository at this point in the history
Previously, when the ON clause of an inner, semi or anti join contained
ORed equality predicates, the only available join method was cross join.

This was inadequate because cross join is often the worst performing
join method for joining large tables.

To address this, this patch adds a new cost-based transformation which
evaluates each disjunct in a separate join and unions or intersects the
results together.

Fixes #74302

Example query:

```
SELECT *
FROM   classRequest
       INNER JOIN classes
               ON classRequest.firstChoiceClassid = classes.classid
                  OR classRequest.secondChoiceClassid = classes.classid;
```
Transformation result written in pseudo-SQL:
```
SELECT DISTINCT ON (classes.<rowid_or_primary_key_columns>,
                    classRequest.<rowid_or_primary_key_columns>)
       dt.*
FROM   (
        SELECT     *
        FROM       classRequest
                   INNER JOIN classes
                   ON         classRequest.firstChoiceClassid =
		              classes.classid
                   UNION ALL
        SELECT     *
        FROM       classRequest
                   INNER JOIN classes
                   ON         classRequest.secondChoiceClassid =
		              classes.classid
       ) dt;
```

In addition, ORed ON clause selectivity estimation is enhanced to
estimate the selectivity of each '=' predicate separately and
combine the estimates in an iterative fashion like PostgreSQL does. This
enables the optimizer to cost the rewritten plan more accurately so it
will get picked.

Release note (performance improvement): Performance of inner, semi or
anti join between two tables with ORed equijoin predicates is improved
by enabling the optimizer to select a join plan in which each equijoin
predicate is evaluated by a separate join, with the results of the joins
unioned or intersected together.
  • Loading branch information
msirek committed Dec 29, 2021
1 parent 01c0c73 commit 42ce2b2
Show file tree
Hide file tree
Showing 11 changed files with 1,502 additions and 18 deletions.
95 changes: 95 additions & 0 deletions pkg/sql/logictest/testdata/logic_test/inner-join
Original file line number Diff line number Diff line change
@@ -1,9 +1,41 @@
statement ok
CREATE TABLE abc (a INT, b INT, c INT, PRIMARY KEY (a, b));
ALTER TABLE abc INJECT STATISTICS '[
{
"columns": ["a"],
"created_at": "2018-05-01 1:00:00.00000+00:00",
"row_count": 10000,
"distinct_count": 10000
}
]';
ALTER TABLE abc INJECT STATISTICS '[
{
"columns": ["b"],
"created_at": "2018-05-01 1:00:00.00000+00:00",
"row_count": 10000,
"distinct_count": 10000
}
]';
INSERT INTO abc VALUES (1, 1, 2), (2, 1, 1), (2, 2, NULL)

statement ok
CREATE TABLE def (d INT, e INT, f INT, PRIMARY KEY (d, e));
ALTER TABLE def INJECT STATISTICS '[
{
"columns": ["d"],
"created_at": "2018-05-01 1:00:00.00000+00:00",
"row_count": 1000,
"distinct_count": 1000
}
]';
ALTER TABLE def INJECT STATISTICS '[
{
"columns": ["e"],
"created_at": "2018-05-01 1:00:00.00000+00:00",
"row_count": 1000,
"distinct_count": 1000
}
]';
INSERT INTO def VALUES (1, 1, 2), (2, 1, 0), (1, 2, NULL)

query III rowsort
Expand Down Expand Up @@ -43,6 +75,10 @@ SELECT a, b, c FROM abc WHERE EXISTS (SELECT * FROM def WHERE a=d OR a=e)
----
1 1 1

query III rowsort
SELECT a, b, c FROM abc WHERE NOT EXISTS (SELECT * FROM def WHERE a=d OR a=e)
----

# Given that we know the reason the above query would fail if an InnerJoin
# was used - multiple rows emitted for each matching row in the LHS - we
# might think that adding a DistinctOn over the InnerJoin would help.
Expand All @@ -58,12 +94,51 @@ SELECT a, b, c FROM abc WHERE EXISTS (SELECT * FROM def WHERE a=d OR a=e)
#
# This tests that the InnerJoin commute rule for semi joins behaves sanely in
# these cases.

query III rowsort
SELECT a, b, c FROM abc, def WHERE a=d OR a=e
----
1 1 1
1 1 1

statement ok
CREATE TABLE abc_decimal (a DECIMAL, b DECIMAL, c DECIMAL);
ALTER TABLE abc_decimal INJECT STATISTICS '[
{
"columns": ["a"],
"created_at": "2018-05-01 1:00:00.00000+00:00",
"row_count": 10000,
"distinct_count": 10000
}
]';
ALTER TABLE abc_decimal INJECT STATISTICS '[
{
"columns": ["b"],
"created_at": "2018-05-01 1:00:00.00000+00:00",
"row_count": 10000,
"distinct_count": 10000
}
]';
INSERT INTO abc_decimal VALUES (1, 1, 1), (1, 1, 1), (1.0, 1.0, 1.0), (1.00, 1.00, 1.00)

statement ok
CREATE TABLE def_decimal (d DECIMAL, e DECIMAL, f DECIMAL);
ALTER TABLE def_decimal INJECT STATISTICS '[
{
"columns": ["d"],
"created_at": "2018-05-01 1:00:00.00000+00:00",
"row_count": 1000,
"distinct_count": 1000
}
]';
ALTER TABLE def_decimal INJECT STATISTICS '[
{
"columns": ["e"],
"created_at": "2018-05-01 1:00:00.00000+00:00",
"row_count": 1000,
"distinct_count": 1000
}
]';
INSERT INTO def_decimal VALUES (1, 1, 1), (1.0, 1.0, 1.0), (1.00, 1.00, 1.00)

query RRR rowsort
Expand All @@ -73,3 +148,23 @@ SELECT a, b, c FROM abc_decimal WHERE EXISTS (SELECT * FROM def_decimal WHERE a:
1 1 1
1.0 1.0 1.0
1.00 1.00 1.00

query RRR rowsort
SELECT a, b, c FROM abc_decimal WHERE EXISTS (SELECT * FROM def_decimal WHERE a::string=d::string or a::string=e::string)
----
1 1 1
1 1 1
1.0 1.0 1.0
1.00 1.00 1.00

query RRR rowsort
SELECT a, b, c FROM abc_decimal, def_decimal WHERE a::string=d::string or a::string=e::string
----
1 1 1
1 1 1
1.0 1.0 1.0
1.00 1.00 1.00

query RRR rowsort
SELECT a, b, c FROM abc_decimal WHERE NOT EXISTS (SELECT * FROM def_decimal WHERE a::string=d::string or a::string=e::string)
----
146 changes: 146 additions & 0 deletions pkg/sql/logictest/testdata/logic_test/subquery
Original file line number Diff line number Diff line change
Expand Up @@ -550,3 +550,149 @@ WHERE
t.oid
NOT IN (SELECT (ARRAY[704, 11676, 10005, 3912, 11765, 59410, 11397])[i] FROM generate_series(1, 376) AS i)
----

statement ok
ALTER TABLE abc INJECT STATISTICS '[
{
"columns": ["a"],
"created_at": "2018-05-01 1:00:00.00000+00:00",
"row_count": 10000,
"distinct_count": 10000
}
]'

statement ok
ALTER TABLE abc INJECT STATISTICS '[
{
"columns": ["b"],
"created_at": "2018-05-01 1:00:00.00000+00:00",
"row_count": 10000,
"distinct_count": 10000
}
]'

statement ok
ALTER TABLE xyz INJECT STATISTICS '[
{
"columns": ["x"],
"created_at": "2018-05-01 1:00:00.00000+00:00",
"row_count": 1000,
"distinct_count": 1000
}
]'

statement ok
ALTER TABLE xyz INJECT STATISTICS '[
{
"columns": ["y"],
"created_at": "2018-05-01 1:00:00.00000+00:00",
"row_count": 1000,
"distinct_count": 1000
}
]'

statement ok
INSERT INTO xyz VALUES(5, 4, 7)

statement ok
INSERT INTO abc VALUES(12, 13, 14)

statement ok
CREATE INDEX abc_b ON abc(b)

statement ok
CREATE INDEX xyz_y ON xyz(y)

### Split Disjunctions Tests
query IIIIII rowsort
SELECT * FROM abc INNER JOIN xyz on abc.a = xyz.x or abc.b = xyz.y
----
4 5 6 4 5 6
7 8 9 7 11 12
2 5 6 4 5 6

query IIIIII rowsort
SELECT * FROM abc INNER JOIN xyz on abc.a = xyz.y or abc.b = xyz.x
----
2 5 6 1 2 3
4 5 6 5 4 7
2 5 6 5 4 7
12 13 14 13 11 12

query IIIIII rowsort
SELECT * FROM abc INNER JOIN xyz on abc.a = xyz.x or abc.b = xyz.y where abc.a > 3 and xyz.z > 10
----
7 8 9 7 11 12

query IIIIII rowsort
SELECT * FROM abc INNER JOIN xyz on abc.a = xyz.y or abc.b = xyz.x where abc.a > 3 and xyz.z > 10
----
12 13 14 13 11 12

query III rowsort
SELECT * FROM abc WHERE EXISTS (SELECT * FROM xyz WHERE abc.a = xyz.x or abc.b = xyz.y)
----
4 5 6
7 8 9
2 5 6

query III rowsort
SELECT * FROM abc WHERE EXISTS (SELECT * FROM xyz WHERE abc.a = xyz.y or abc.b = xyz.x)
----
2 5 6
4 5 6
12 13 14

query III rowsort
SELECT * FROM abc WHERE EXISTS (SELECT * FROM xyz WHERE (abc.a = xyz.x or abc.b = xyz.y)and abc.a > 3 and xyz.z > 10)
----
7 8 9

query III rowsort
SELECT * FROM abc WHERE EXISTS (SELECT * FROM xyz WHERE (abc.a = xyz.y or abc.b = xyz.x) and abc.a > 3 and xyz.z > 10)
----
12 13 14

query III rowsort
SELECT * FROM abc WHERE NOT EXISTS (SELECT * FROM xyz WHERE abc.a = xyz.x or abc.b = xyz.y)
----
12 13 14

query III rowsort
SELECT * FROM abc WHERE NOT EXISTS (SELECT * FROM xyz WHERE abc.a = xyz.y or abc.b = xyz.x)
----
7 8 9

query III rowsort
SELECT * FROM abc WHERE NOT EXISTS (SELECT * FROM xyz WHERE (abc.a = xyz.x or abc.b = xyz.y)and abc.a > 3 and xyz.z > 10)
----
2 5 6
4 5 6
12 13 14

query III rowsort
SELECT * FROM abc WHERE NOT EXISTS (SELECT * FROM xyz WHERE (abc.a = xyz.y or abc.b = xyz.x) and abc.a > 3 and xyz.z > 10)
----
2 5 6
4 5 6
7 8 9

query IIIIII rowsort
SELECT * FROM abc inner join xyz on (abc.a = xyz.x or abc.b = xyz.y) and (abc.a = xyz.y or abc.b = xyz.y)
----
4 5 6 4 5 6
2 5 6 4 5 6

query III rowsort
SELECT * FROM abc WHERE EXISTS (SELECT * FROM xyz WHERE (abc.a = xyz.x or abc.b = xyz.y) and (abc.a = xyz.y or abc.b = xyz.y))
----
4 5 6
2 5 6

query III rowsort
SELECT * FROM abc WHERE NOT EXISTS (SELECT * FROM xyz WHERE (abc.a = xyz.x or abc.b = xyz.y) and (abc.a = xyz.y or abc.b = xyz.y))
----
7 8 9
12 13 14

### End Split Disjunctions Tests
94 changes: 94 additions & 0 deletions pkg/sql/opt/exec/execbuilder/testdata/join
Original file line number Diff line number Diff line change
@@ -1,5 +1,13 @@
# LogicTest: local

statement ok
CREATE TABLE classes (classID int, description varchar(50));
INSERT INTO classes SELECT g, 'dummy description' FROM generate_series(1,100) g(g);

statement ok
CREATE TABLE classRequest (studentID int, firstChoiceClassID int, secondChoiceClassID int, thirdChoiceClassID int, primary key(studentID));
INSERT INTO classRequest SELECT g, g, g, g FROM generate_series(1,100) g(g);

statement ok
CREATE TABLE onecolumn (x INT); INSERT INTO onecolumn(x) VALUES (44), (NULL), (42)

Expand Down Expand Up @@ -2274,3 +2282,89 @@ vectorized: true
missing stats
table: customers@customers_pkey
spans: FULL SCAN

# Test selectivity of ORed join predicates

statement ok
set vectorize=off;

query T
EXPLAIN (verbose)
SELECT * FROM classes, classRequest WHERE classRequest.firstChoiceClassID = classes.classID or
classRequest.secondChoiceClassID = classes.classID;
----
distribution: local
vectorized: false
·
• project
│ columns: (classid, description, studentid, firstchoiceclassid, secondchoiceclassid, thirdchoiceclassid)
│ estimated row count: 19,701 (missing stats)
└── • distinct
│ columns: (classid, description, rowid, studentid, firstchoiceclassid, secondchoiceclassid, thirdchoiceclassid)
│ estimated row count: 19,602 (missing stats)
│ distinct on: rowid, studentid
└── • union all
│ columns: (classid, description, rowid, studentid, firstchoiceclassid, secondchoiceclassid, thirdchoiceclassid)
│ estimated row count: 19,602 (missing stats)
├── • hash join (inner)
│ │ columns: (classid, description, rowid, studentid, firstchoiceclassid, secondchoiceclassid, thirdchoiceclassid)
│ │ estimated row count: 9,801 (missing stats)
│ │ equality: (classid) = (firstchoiceclassid)
│ │
│ ├── • scan
│ │ columns: (classid, description, rowid)
│ │ estimated row count: 1,000 (missing stats)
│ │ table: classes@classes_pkey
│ │ spans: FULL SCAN
│ │
│ └── • scan
│ columns: (studentid, firstchoiceclassid, secondchoiceclassid, thirdchoiceclassid)
│ estimated row count: 1,000 (missing stats)
│ table: classrequest@classrequest_pkey
│ spans: FULL SCAN
└── • hash join (inner)
│ columns: (classid, description, rowid, studentid, firstchoiceclassid, secondchoiceclassid, thirdchoiceclassid)
│ estimated row count: 9,801 (missing stats)
│ equality: (classid) = (secondchoiceclassid)
├── • scan
│ columns: (classid, description, rowid)
│ estimated row count: 1,000 (missing stats)
│ table: classes@classes_pkey
│ spans: FULL SCAN
└── • scan
columns: (studentid, firstchoiceclassid, secondchoiceclassid, thirdchoiceclassid)
estimated row count: 1,000 (missing stats)
table: classrequest@classrequest_pkey
spans: FULL SCAN

query T
EXPLAIN (verbose)
SELECT * FROM classes, classRequest WHERE classRequest.firstChoiceClassID = classes.classID or
classRequest.secondChoiceClassID = classes.classID or
classRequest.thirdChoiceClassID = classes.classID;
----
distribution: local
vectorized: false
·
• cross join (inner)
│ columns: (classid, description, studentid, firstchoiceclassid, secondchoiceclassid, thirdchoiceclassid)
│ estimated row count: 29,404 (missing stats)
│ pred: ((firstchoiceclassid = classid) OR (secondchoiceclassid = classid)) OR (thirdchoiceclassid = classid)
├── • scan
│ columns: (classid, description)
│ estimated row count: 1,000 (missing stats)
│ table: classes@classes_pkey
│ spans: FULL SCAN
└── • scan
columns: (studentid, firstchoiceclassid, secondchoiceclassid, thirdchoiceclassid)
estimated row count: 1,000 (missing stats)
table: classrequest@classrequest_pkey
spans: FULL SCAN
Loading

0 comments on commit 42ce2b2

Please sign in to comment.