Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

statistics: fix get wrong topn with data -0 and 0 #57464

Open
wants to merge 10 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions pkg/planner/core/tests/analyze/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,15 @@ go_test(
"analyze_test.go",
"main_test.go",
],
data = glob(["testdata/**"]),
flaky = True,
deps = [
"//pkg/executor",
"//pkg/lightning/mydump",
"//pkg/sessionctx",
"//pkg/testkit",
"//pkg/testkit/testsetup",
"@com_github_stretchr_testify//require",
"@org_uber_go_goleak//:goleak",
],
)
28 changes: 28 additions & 0 deletions pkg/planner/core/tests/analyze/analyze_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,15 @@
package analyze

import (
"io"
"os"
"testing"

"github.com/pingcap/tidb/pkg/executor"
"github.com/pingcap/tidb/pkg/lightning/mydump"
"github.com/pingcap/tidb/pkg/sessionctx"
"github.com/pingcap/tidb/pkg/testkit"
"github.com/stretchr/testify/require"
)

func TestAnalyzeVirtualColumns(t *testing.T) {
Expand All @@ -27,3 +33,25 @@ func TestAnalyzeVirtualColumns(t *testing.T) {
tk.MustExec(`CREATE TABLE t1 (id bigint NOT NULL,c1 varchar(50) NOT NULL ,c2 int DEFAULT NULL ,c3 json DEFAULT NULL ,c4 varchar(255) GENERATED ALWAYS AS (json_unquote(json_extract(c3, '$.oppositePlaceId'))) VIRTUAL ,PRIMARY KEY (id),UNIQUE KEY idx_unique (c1,c2)) ;`)
tk.MustExec("analyze table t1 all columns")
}

func TestAnalyzeWithSpecificData(t *testing.T) {
store := testkit.CreateMockStore(t)
tk := testkit.NewTestKit(t, store)
tk.MustExec("use test")
// https://github.com/pingcap/tidb/issues/57448

datapath := "./testdata/test_data.csv"
content, err := os.ReadFile(datapath)
require.NoError(t, err)
var reader io.ReadCloser = mydump.NewStringReader(string(content))
var readerBuilder executor.LoadDataReaderBuilder = func(_ string) (
r io.ReadCloser, err error,
) {
return reader, nil
}
ctx := tk.Session().(sessionctx.Context)
ctx.SetValue(executor.LoadDataReaderBuilderKey, readerBuilder)
tk.MustExec(" CREATE TABLE t1 (COL102 float DEFAULT NULL,COL103 float DEFAULT NULL,COL1 float GENERATED ALWAYS AS (COL102 % 10) STORED,COL2 varchar(20) DEFAULT NULL,COL4 datetime DEFAULT NULL,COL3 bigint DEFAULT NULL,COL5 float DEFAULT NULL, KEY UK_COL1 (COL1) /*!80000 INVISIBLE */) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin")
tk.MustExec("load data local infile '/tmp/nonexistence.csv' INTO TABLE t1 FIELDS TERMINATED BY ',' ENCLOSED BY '\"' LINES (COL102,COL103,COL1,COL2,COL4,COL3,COL5) ")
tk.MustExec("analyze table t1")
}
102 changes: 102 additions & 0 deletions pkg/planner/core/tests/analyze/testdata/test_data.csv

Large diffs are not rendered by default.

25 changes: 14 additions & 11 deletions pkg/statistics/builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,9 @@ func BuildColumnHist(ctx sessionctx.Context, numBuckets, id int64, collector *Sa
}
sc := ctx.GetSessionVars().StmtCtx
samples := collector.Samples
err := sortSampleItems(sc, samples)
err := sortSampleItemsByBinary(samples, func(datum types.Datum) ([]byte, error) {
return getComparedBytesFromColumn(ctx, datum)
})
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -256,6 +258,12 @@ func BuildColumn(ctx sessionctx.Context, numBuckets, id int64, collector *Sample
return BuildColumnHist(ctx, numBuckets, id, collector, tp, collector.Count, collector.FMSketch.NDV(), collector.NullCount)
}

func getComparedBytesFromColumn(ctx sessionctx.Context, datum types.Datum) ([]byte, error) {
encoded, err := codec.EncodeKey(ctx.GetSessionVars().StmtCtx.TimeZone(), nil, datum)
err = ctx.GetSessionVars().StmtCtx.HandleError(err)
return encoded, err
}

// BuildHistAndTopN build a histogram and TopN for a column or an index from samples.
func BuildHistAndTopN(
ctx sessionctx.Context,
Expand All @@ -278,8 +286,7 @@ func BuildHistAndTopN(
var getComparedBytes func(datum types.Datum) ([]byte, error)
if isColumn {
getComparedBytes = func(datum types.Datum) ([]byte, error) {
encoded, err := codec.EncodeKey(ctx.GetSessionVars().StmtCtx.TimeZone(), nil, datum)
err = ctx.GetSessionVars().StmtCtx.HandleError(err)
encoded, err := getComparedBytesFromColumn(ctx, datum)
if memTracker != nil {
// tmp memory usage
deltaSize := int64(cap(encoded))
Expand Down Expand Up @@ -311,7 +318,7 @@ func BuildHistAndTopN(
} else {
samples = collector.Samples
}
err := sortSampleItems(sc, samples)
err := sortSampleItemsByBinary(samples, getComparedBytes)
if err != nil {
return nil, nil, err
}
Expand All @@ -336,7 +343,6 @@ func BuildHistAndTopN(
}
curCnt := float64(0)
var corrXYSum float64

// Iterate through the samples
for i := int64(0); i < sampleNum; i++ {
if isColumn {
Expand Down Expand Up @@ -386,7 +392,6 @@ func BuildHistAndTopN(
}
cur, curCnt = sampleBytes, 1
}

// Calc the correlation of the column between the handle column.
if isColumn {
hg.Correlation = calcCorrelation(sampleNum, corrXYSum)
Expand Down Expand Up @@ -435,6 +440,9 @@ func BuildHistAndTopN(
if bytes.Equal(sampleBytes, topNList[j].Encoded) {
// This should never happen, but we met this panic before, so we add this check here.
// See: https://github.com/pingcap/tidb/issues/35948
//
// it has been fixed at https://github.com/pingcap/tidb/pull/57464
// so it can remove this debug code after 2025-11-18
if foundTwice {
datumString, err := firstTimeSample.ToString()
if err != nil {
Expand All @@ -449,11 +457,6 @@ func BuildHistAndTopN(
zap.Binary("sampleBytes", sampleBytes),
zap.Binary("topNBytes", topNList[j].Encoded),
)
// NOTE: if we don't return here, we may meet panic in the following code.
// The i may decrease to a negative value.
// We haven't fix the issue here, because we don't know how to
// remove the invalid sample data from the samples.
break
}
// First time to find the same value in topN: need to record the sample data for debugging.
firstTimeSample = samples[i].Value
Expand Down
12 changes: 7 additions & 5 deletions pkg/statistics/builder_ext_stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
"github.com/pingcap/tidb/pkg/meta/model"
"github.com/pingcap/tidb/pkg/parser/ast"
"github.com/pingcap/tidb/pkg/sessionctx"
"github.com/pingcap/tidb/pkg/types"
"github.com/pingcap/tidb/pkg/util/logutil"
"go.uber.org/zap"
)
Expand Down Expand Up @@ -97,11 +98,10 @@ func fillExtStatsCorrVals(sctx sessionctx.Context, item *ExtendedStatsItem, cols
item.ScalarVals = 0
return item
}

sc := sctx.GetSessionVars().StmtCtx

var err error
err = sortSampleItems(sc, samplesX)
err = sortSampleItemsByBinary(samplesX, func(datum types.Datum) ([]byte, error) {
return getComparedBytesFromColumn(sctx, datum)
})
if err != nil {
return nil
}
Expand All @@ -116,7 +116,9 @@ func fillExtStatsCorrVals(sctx sessionctx.Context, item *ExtendedStatsItem, cols
}
samplesYInYOrder := make([]*SampleItem, len(samplesYInXOrder))
copy(samplesYInYOrder, samplesYInXOrder)
err = sortSampleItems(sc, samplesYInYOrder)
err = sortSampleItemsByBinary(samplesYInYOrder, func(datum types.Datum) ([]byte, error) {
return getComparedBytesFromColumn(sctx, datum)
})
if err != nil {
return nil
}
Expand Down
13 changes: 9 additions & 4 deletions pkg/statistics/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,11 @@ import (

"github.com/pingcap/tidb/pkg/config"
"github.com/pingcap/tidb/pkg/parser/mysql"
"github.com/pingcap/tidb/pkg/sessionctx/stmtctx"
"github.com/pingcap/tidb/pkg/testkit/testdata"
"github.com/pingcap/tidb/pkg/testkit/testmain"
"github.com/pingcap/tidb/pkg/testkit/testsetup"
"github.com/pingcap/tidb/pkg/types"
"github.com/pingcap/tidb/pkg/util/mock"
"github.com/stretchr/testify/require"
"go.uber.org/goleak"
)
Expand Down Expand Up @@ -104,9 +104,11 @@ func createTestStatisticsSamples(t *testing.T) *testStatisticsSamples {
for i := start; i < len(samples); i += 5 {
samples[i].Value.SetInt64(samples[i].Value.GetInt64() + 2)
}
sc := stmtctx.NewStmtCtx()

err := sortSampleItems(sc, samples)
err := sortSampleItemsByBinary(samples, func(datum types.Datum) ([]byte, error) {
ctx := mock.NewContext()
return getComparedBytesFromColumn(ctx, datum)
})
require.NoError(t, err)
s.samples = samples
rc := &recordSet{
Expand All @@ -128,7 +130,10 @@ func createTestStatisticsSamples(t *testing.T) *testStatisticsSamples {
for i := start; i < rc.count; i += 5 {
rc.data[i].SetInt64(rc.data[i].GetInt64() + 2)
}
require.NoError(t, types.SortDatums(sc.TypeCtx(), rc.data))
require.NoError(t, sortDatumByBinary(rc.data, func(datum types.Datum) ([]byte, error) {
ctx := mock.NewContext()
return getComparedBytesFromColumn(ctx, datum)
}))

s.rc = rc

Expand Down
30 changes: 26 additions & 4 deletions pkg/statistics/sample.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
package statistics

import (
"bytes"
"context"
"slices"
"time"
Expand Down Expand Up @@ -61,15 +62,36 @@ func CopySampleItems(items []*SampleItem) []*SampleItem {
return n
}

func sortSampleItems(sc *stmtctx.StatementContext, items []*SampleItem) error {
func sortDatumByBinary(items []types.Datum, getComparedBytes func(datum types.Datum) ([]byte, error)) error {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

could you comment on why binary format matters rather than simple item value comparison? for later reviewer's quick grabbing

var err error
slices.SortStableFunc(items, func(i, j types.Datum) int {
var ib, jb []byte
ib, err = getComparedBytes(i)
if err != nil {
return 1
}
jb, err = getComparedBytes(j)
if err != nil {
return -1
}
return bytes.Compare(ib, jb)
})
return err
}

func sortSampleItemsByBinary(items []*SampleItem, getComparedBytes func(datum types.Datum) ([]byte, error)) error {
var err error
slices.SortStableFunc(items, func(i, j *SampleItem) int {
var cmp int
cmp, err = i.Value.Compare(sc.TypeCtx(), &j.Value, collate.GetBinaryCollator())
var ib, jb []byte
ib, err = getComparedBytes(i.Value)
if err != nil {
return 1
}
jb, err = getComparedBytes(j.Value)
if err != nil {
return -1
}
return cmp
return bytes.Compare(ib, jb)
})
return err
}
Expand Down