Skip to content

Commit

Permalink
planner: improve range underestimation (#53860)
Browse files Browse the repository at this point in the history
close #53907
  • Loading branch information
terry1purcell authored Jun 16, 2024
1 parent 6ba9435 commit a8af911
Show file tree
Hide file tree
Showing 2 changed files with 132 additions and 9 deletions.
126 changes: 123 additions & 3 deletions pkg/planner/cardinality/testdata/cardinality_suite_out.json
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
{
"Start": 800,
"End": 900,
"Count": 735.504166655054
"Count": 771.504166655054
},
{
"Start": 900,
Expand Down Expand Up @@ -79,7 +79,7 @@
{
"Start": 800,
"End": 1000,
"Count": 1193.696869573942
"Count": 1229.696869573942
},
{
"Start": 900,
Expand All @@ -104,7 +104,7 @@
{
"Start": 200,
"End": 400,
"Count": 1237.5288209899081
"Count": 1226.2788209899081
},
{
"Start": 200,
Expand Down Expand Up @@ -2535,6 +2535,23 @@
}
]
},
{
"github.com/pingcap/tidb/pkg/statistics.(*Histogram).EqualRowCount": [
{
"Locate value in buckets": {
"BucketIdx": 0,
"Exceed": false,
"InBucket": false,
"MatchLastValue": false,
"Value": "KindMinNotNull <nil>"
}
},
{
"Count": 0,
"Matched": false
}
]
},
{
"github.com/pingcap/tidb/pkg/statistics.(*Histogram).OutOfRangeRowCount": [
{
Expand Down Expand Up @@ -2816,6 +2833,23 @@
}
]
},
{
"github.com/pingcap/tidb/pkg/statistics.(*Histogram).EqualRowCount": [
{
"Locate value in buckets": {
"BucketIdx": 0,
"Exceed": false,
"InBucket": false,
"MatchLastValue": false,
"Value": "KindBytes \\x01"
}
},
{
"Count": 0,
"Matched": false
}
]
},
{
"github.com/pingcap/tidb/pkg/statistics.(*Histogram).OutOfRangeRowCount": [
{
Expand Down Expand Up @@ -3526,6 +3560,23 @@
}
]
},
{
"github.com/pingcap/tidb/pkg/statistics.(*Histogram).EqualRowCount": [
{
"Locate value in buckets": {
"BucketIdx": 0,
"Exceed": false,
"InBucket": false,
"MatchLastValue": false,
"Value": "KindMinNotNull <nil>"
}
},
{
"Count": 0,
"Matched": false
}
]
},
{
"github.com/pingcap/tidb/pkg/statistics.(*TopN).BetweenCount": {
"Result": 0
Expand Down Expand Up @@ -3666,6 +3717,32 @@
}
]
},
{
"github.com/pingcap/tidb/pkg/statistics.(*Histogram).EqualRowCount": [
{
"Locate value in buckets": {
"BucketIdx": 111,
"Exceed": false,
"InBucket": true,
"MatchLastValue": false,
"Value": "KindInt64 400"
}
},
{
"Related Buckets in Histogram": [
{
"Count": 896,
"Index": 111,
"Repeat": 1
}
]
},
{
"Count": 0.99,
"Matched": false
}
]
},
{
"github.com/pingcap/tidb/pkg/statistics.(*TopN).BetweenCount": [
{
Expand Down Expand Up @@ -3813,6 +3890,23 @@
}
]
},
{
"github.com/pingcap/tidb/pkg/statistics.(*Histogram).EqualRowCount": [
{
"Locate value in buckets": {
"BucketIdx": 0,
"Exceed": false,
"InBucket": false,
"MatchLastValue": false,
"Value": "KindBytes \\x01"
}
},
{
"Count": 0,
"Matched": false
}
]
},
{
"github.com/pingcap/tidb/pkg/statistics.(*TopN).BetweenCount": {
"Result": 0
Expand Down Expand Up @@ -3942,6 +4036,32 @@
}
]
},
{
"github.com/pingcap/tidb/pkg/statistics.(*Histogram).EqualRowCount": [
{
"Locate value in buckets": {
"BucketIdx": 111,
"Exceed": false,
"InBucket": true,
"MatchLastValue": false,
"Value": "KindBytes \\x03\\x80\\x00\\x00\\x00\\x00\\x00\\x01\\x91"
}
},
{
"Related Buckets in Histogram": [
{
"Count": 896,
"Index": 111,
"Repeat": 1
}
]
},
{
"Count": 0.99,
"Matched": false
}
]
},
{
"github.com/pingcap/tidb/pkg/statistics.(*TopN).BetweenCount": {
"Result": 0
Expand Down
15 changes: 9 additions & 6 deletions pkg/statistics/histogram.go
Original file line number Diff line number Diff line change
Expand Up @@ -593,14 +593,17 @@ func (hg *Histogram) LessRowCount(sctx context.PlanContext, value types.Datum) f
// BetweenRowCount estimates the row count between a and b as the difference of the LessRowCount estimates for
// b and for a, with an adjustment for the case where that difference is suspiciously small.
func (hg *Histogram) BetweenRowCount(sctx context.PlanContext, a, b types.Datum) float64 {
lessCountA := hg.LessRowCount(sctx, a)
lessCountB := hg.LessRowCount(sctx, b)
// If lessCountA is not less than lessCountB, the two values may fall into the same bucket, in which case the
// fraction cannot be estimated. We then use `totalCount / NDV` to estimate the row count, but the result should
// not be greater than lessCountB or notNullCount-lessCountA.
if lessCountA >= lessCountB && hg.NDV > 0 {
// Raw range estimate; the check below compares it against max(lowEqual, ndvAvg).
rangeEst := lessCountB - lessCountA
// Only the estimated count for a is used here; the second return value is discarded.
lowEqual, _ := hg.EqualRowCount(sctx, a, false)
// Average number of non-null rows per distinct value. This is computed before the hg.NDV > 0 check below;
// with NDV == 0 the floating-point division does not panic but yields +Inf or NaN, and the guard below then
// skips the adjusted branch.
ndvAvg := hg.NotNullCount() / float64(hg.NDV)
// If both values fall in the same bucket, the fractional estimate (rangeEst) may be too low. In that case,
// estimate the low value (a) as an equality lookup (lowEqual) and the high value (b) with the default per-value
// average (ndvAvg), because the input high value may be "larger" than the true high value. The result is
// lowEqual+ndvAvg, bounded above by lessCountB and by NotNullCount()-lessCountA.
if rangeEst < math.Max(lowEqual, ndvAvg) && hg.NDV > 0 {
result := math.Min(lessCountB, hg.NotNullCount()-lessCountA)
return math.Min(result, hg.NotNullCount()/float64(hg.NDV))
return math.Min(result, lowEqual+ndvAvg)
}
return lessCountB - lessCountA
return rangeEst
}

// TotalRowCount returns the total count of this histogram.
Expand Down

0 comments on commit a8af911

Please sign in to comment.