Skip to content

Commit

Permalink
statistics: remove statistics.Column.Count (#43033)
Browse files Browse the repository at this point in the history
ref #42160
  • Loading branch information
xuyifangreeneyes authored Apr 14, 2023
1 parent 15ce809 commit 579f47e
Show file tree
Hide file tree
Showing 13 changed files with 31 additions and 123 deletions.
5 changes: 2 additions & 3 deletions planner/core/casetest/integration_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3205,8 +3205,8 @@ func TestIssue32632(t *testing.T) {
"`S_ACCTBAL` decimal(15,2) NOT NULL," +
"`S_COMMENT` varchar(101) NOT NULL," +
"PRIMARY KEY (`S_SUPPKEY`) /*T![clustered_index] CLUSTERED */)")
tk.MustExec("analyze table partsupp;")
tk.MustExec("analyze table supplier;")
h := dom.StatsHandle()
require.NoError(t, h.HandleDDLEvent(<-h.DDLEventCh()))
tk.MustExec("set @@tidb_enforce_mpp = 1")

tbl1, err := dom.InfoSchema().TableByName(model.CIStr{O: "test", L: "test"}, model.CIStr{O: "partsupp", L: "partsupp"})
Expand All @@ -3217,7 +3217,6 @@ func TestIssue32632(t *testing.T) {
tbl1.Meta().TiFlashReplica = &model.TiFlashReplicaInfo{Count: 1, Available: true}
tbl2.Meta().TiFlashReplica = &model.TiFlashReplicaInfo{Count: 1, Available: true}

h := dom.StatsHandle()
statsTbl1 := h.GetTableStats(tbl1.Meta())
statsTbl1.RealtimeCount = 800000
statsTbl2 := h.GetTableStats(tbl2.Meta())
Expand Down
4 changes: 2 additions & 2 deletions planner/core/casetest/testdata/integration_suite_out.json
Original file line number Diff line number Diff line change
Expand Up @@ -10594,8 +10594,8 @@
" └─HashJoin 12500.00 mpp[tiflash] inner join, equal:[eq(test.supplier.s_suppkey, test.partsupp.ps_suppkey)]",
" ├─ExchangeReceiver(Build) 10000.00 mpp[tiflash] ",
" │ └─ExchangeSender 10000.00 mpp[tiflash] ExchangeType: Broadcast, Compression: FAST",
" │ └─TableFullScan 10000.00 mpp[tiflash] table:supplier keep order:false",
" └─TableFullScan(Probe) 800000.00 mpp[tiflash] table:partsupp keep order:false"
" │ └─TableFullScan 10000.00 mpp[tiflash] table:supplier keep order:false, stats:pseudo",
" └─TableFullScan(Probe) 800000.00 mpp[tiflash] table:partsupp keep order:false, stats:pseudo"
]
}
]
Expand Down
30 changes: 27 additions & 3 deletions planner/core/stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -159,13 +159,37 @@ func (p *baseLogicalPlan) DeriveStats(childStats []*property.StatsInfo, selfSche
return profile, nil
}

// getTotalRowCount reports the total row count associated with colHist.
//
// When colHist itself is fully loaded, its own TotalRowCount is used. Otherwise
// the function falls back to any fully loaded index, and then any fully loaded
// column, of statsTbl whose LastUpdateVersion equals that of colHist. It returns
// 0 when no such statistics object is found.
func getTotalRowCount(statsTbl *statistics.Table, colHist *statistics.Column) int64 {
	if colHist.IsFullLoad() {
		return int64(colHist.TotalRowCount())
	}

	// colHist is not fully loaded: borrow the count from a fully loaded index
	// that carries the same update version.
	for _, index := range statsTbl.Indices {
		if !index.IsFullLoad() || index.LastUpdateVersion != colHist.LastUpdateVersion {
			continue
		}
		return int64(index.TotalRowCount())
	}

	// No usable index was found; try the other columns in the same way.
	for _, column := range statsTbl.Columns {
		if !column.IsFullLoad() || column.LastUpdateVersion != colHist.LastUpdateVersion {
			continue
		}
		return int64(column.TotalRowCount())
	}

	return 0
}

// getColumnNDV computes estimated NDV of specified column using the original
// histogram of `DataSource` which is retrieved from storage(not the derived one).
func (ds *DataSource) getColumnNDV(colID int64) (ndv float64) {
hist, ok := ds.statisticTable.Columns[colID]
if ok && hist.Count > 0 {
factor := float64(ds.statisticTable.RealtimeCount) / float64(hist.Count)
ndv = float64(hist.Histogram.NDV) * factor
if ok && hist.IsStatsInitialized() {
ndv = float64(hist.Histogram.NDV)
// TODO: a better way to get the total row count derived from the last analyze.
analyzeCount := getTotalRowCount(ds.statisticTable, hist)
if analyzeCount > 0 {
factor := float64(ds.statisticTable.RealtimeCount) / float64(analyzeCount)
ndv *= factor
}
} else {
ndv = float64(ds.statisticTable.RealtimeCount) * distinctFactor
}
Expand Down
1 change: 0 additions & 1 deletion statistics/column.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@ type Column struct {
TopN *TopN
FMSketch *FMSketch
PhysicalID int64
Count int64
Info *model.ColumnInfo
IsHandle bool
ErrorRate
Expand Down
37 changes: 0 additions & 37 deletions statistics/handle/bootstrap.go
Original file line number Diff line number Diff line change
Expand Up @@ -138,23 +138,12 @@ func (h *Handle) initStatsHistograms4Chunk(is infoschema.InfoSchema, cache *stat
if colInfo == nil {
continue
}
var topnCount int64
// If this is stats of the Version2, we need to consider the topn's count as well.
// See the comments of Version2 for more details.
if statsVer >= statistics.Version2 {
var err error
topnCount, err = h.initTopNCountSum(tblID, id)
if err != nil {
terror.Log(err)
}
}
hist := statistics.NewHistogram(id, ndv, nullCount, version, &colInfo.FieldType, 0, totColSize)
hist.Correlation = row.GetFloat64(9)
col := &statistics.Column{
Histogram: *hist,
PhysicalID: table.PhysicalID,
Info: colInfo,
Count: nullCount + topnCount,
IsHandle: tbl.Meta().PKIsHandle && mysql.HasPriKeyFlag(colInfo.GetFlag()),
Flag: row.GetInt64(10),
StatsVer: statsVer,
Expand Down Expand Up @@ -306,7 +295,6 @@ func (h *Handle) initStatsBuckets4Chunk(cache *statsCache, iter *chunk.Iterator4
if !ok {
continue
}
column.Count += row.GetInt64(3)
if !mysql.HasPriKeyFlag(column.Info.GetFlag()) {
continue
}
Expand Down Expand Up @@ -334,31 +322,6 @@ func (h *Handle) initStatsBuckets4Chunk(cache *statsCache, iter *chunk.Iterator4
}
}

// initTopNCountSum returns the sum of the `count` column of mysql.stats_top_n
// for the column statistics identified by tableID and colID (is_index = 0).
//
// It returns 0 with a nil error when the query yields no row or when the sum
// is NULL, and returns the underlying error when executing the query, fetching
// the result chunk, or converting the decimal sum to an integer fails.
func (h *Handle) initTopNCountSum(tableID, colID int64) (int64, error) {
	// Before stats ver 2, histogram represents all data in this column.
	// In stats ver 2, histogram + TopN represent all data in this column.
	// So we need to add TopN total count here.
	ctx := kv.WithInternalSourceType(context.Background(), kv.InternalTxnStats)
	selSQL := "select sum(count) from mysql.stats_top_n where table_id = %? and is_index = 0 and hist_id = %?"
	rs, err := h.initStatsCtx.(sqlexec.SQLExecutor).ExecuteInternal(ctx, selSQL, tableID, colID)
	if rs != nil {
		// Close the record set even when ExecuteInternal also reported an error.
		defer terror.Call(rs.Close)
	}
	if err != nil {
		return 0, err
	}
	req := rs.NewChunk(nil)
	iter := chunk.NewIterator4Chunk(req)
	if err = rs.Next(ctx, req); err != nil {
		return 0, err
	}
	if req.NumRows() == 0 {
		return 0, nil
	}
	row := iter.Begin()
	// An aggregate without GROUP BY yields one row whose value is NULL when no
	// TopN entries match, so the sum must only be decoded when it is not NULL.
	if row.IsNull(0) {
		return 0, nil
	}
	return row.GetMyDecimal(0).ToInt()
}

func (h *Handle) initStatsBuckets(cache *statsCache) error {
ctx := kv.WithInternalSourceType(context.Background(), kv.InternalTxnStats)
sql := "select HIGH_PRIORITY table_id, is_index, hist_id, count, repeats, lower_bound, upper_bound, ndv from mysql.stats_buckets order by table_id, is_index, hist_id, bucket_id"
Expand Down
1 change: 0 additions & 1 deletion statistics/handle/dump.go
Original file line number Diff line number Diff line change
Expand Up @@ -486,7 +486,6 @@ func TableStatsFromJSON(tableInfo *model.TableInfo, physicalID int64, jsonTbl *J
StatsVer: statsVer,
StatsLoadedStatus: statistics.NewStatsFullLoadStatus(),
}
col.Count = int64(col.TotalRowCount())
tbl.Columns[col.ID] = col
}
}
Expand Down
1 change: 0 additions & 1 deletion statistics/handle/dump_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@ func requireTableEqual(t *testing.T, a *statistics.Table, b *statistics.Table) {
require.Equal(t, b.ModifyCount, a.ModifyCount)
require.Equal(t, len(b.Columns), len(a.Columns))
for i := range a.Columns {
require.Equal(t, b.Columns[i].Count, a.Columns[i].Count)
require.True(t, statistics.HistogramEqual(&a.Columns[i].Histogram, &b.Columns[i].Histogram, false))
if a.Columns[i].CMSketch == nil {
require.Nil(t, b.Columns[i].CMSketch)
Expand Down
2 changes: 0 additions & 2 deletions statistics/handle/handle.go
Original file line number Diff line number Diff line change
Expand Up @@ -1095,8 +1095,6 @@ func (h *Handle) loadNeededColumnHistograms(reader *statistics.StatsReader, col
IsHandle: c.IsHandle,
StatsVer: statsVer,
}
// Column.Count is calculated by Column.TotalRowCount(). Hence we don't set Column.Count when initializing colHist.
colHist.Count = int64(colHist.TotalRowCount())
if colHist.StatsAvailable() {
colHist.StatsLoadedStatus = statistics.NewStatsFullLoadStatus()
}
Expand Down
2 changes: 0 additions & 2 deletions statistics/handle/handle_hist.go
Original file line number Diff line number Diff line change
Expand Up @@ -395,8 +395,6 @@ func (h *Handle) readStatsForOneItem(item model.TableItemID, w *statsWrapper, re
IsHandle: c.IsHandle,
StatsVer: statsVer,
}
// Column.Count is calculated by Column.TotalRowCount(). Hence, we don't set Column.Count when initializing colHist.
colHist.Count = int64(colHist.TotalRowCount())
if colHist.StatsAvailable() {
colHist.StatsLoadedStatus = statistics.NewStatsFullLoadStatus()
}
Expand Down
26 changes: 0 additions & 26 deletions statistics/handle/handletest/handle_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -407,7 +407,6 @@ func TestLoadHist(t *testing.T) {
hist.TotColSize = temp

require.True(t, hist.CMSketch.Equal(newStatsTbl.Columns[id].CMSketch))
require.Equal(t, newStatsTbl.Columns[id].Count, hist.Count)
require.Equal(t, newStatsTbl.Columns[id].Info, hist.Info)
}
// Add column c3, we only update c3.
Expand Down Expand Up @@ -3142,31 +3141,6 @@ func TestIssues27147(t *testing.T) {
require.Equal(t, nil, err)
}

// TestColumnCountFromStorage analyzes a two-row table with analyze version 2
// and checks that the column statistics loaded from storage report a Count of 2.
func TestColumnCountFromStorage(t *testing.T) {
	store, dom := testkit.CreateMockStoreAndDomain(t)
	tk := testkit.NewTestKit(t, store)
	h := dom.StatsHandle()

	// `Update` will not use load by need strategy when `Lease` is 0, and `InitStats` is only called when
	// `Lease` is not 0, so here we just change it. The previous lease is restored on exit.
	previousLease := h.Lease()
	defer h.SetLease(previousLease)
	h.SetLease(time.Millisecond)

	tk.MustExec("use test")
	tk.MustExec("set tidb_analyze_version = 2")
	tk.MustExec("create table tt (c int)")
	tk.MustExec("insert into tt values(1), (2)")
	tk.MustExec("analyze table tt")

	infoSchema := dom.InfoSchema()
	h = dom.StatsHandle()
	table, err := infoSchema.TableByName(model.NewCIStr("test"), model.NewCIStr("tt"))
	require.NoError(t, err)

	meta := table.Meta()
	h.TableStatsFromStorage(meta, meta.ID, false, 0)
	loaded := h.GetTableStats(meta)
	firstColID := meta.Columns[0].ID
	require.Equal(t, int64(2), loaded.Columns[firstColID].Count)
}

func testIncrementalModifyCountUpdateHelper(analyzeSnapshot bool) func(*testing.T) {
return func(t *testing.T) {
store, dom := testkit.CreateMockStoreAndDomain(t)
Expand Down
1 change: 0 additions & 1 deletion statistics/handle/internal/testutil.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ func AssertTableEqual(t *testing.T, a *statistics.Table, b *statistics.Table) {
require.Equal(t, b.ModifyCount, a.ModifyCount)
require.Len(t, a.Columns, len(b.Columns))
for i := range a.Columns {
require.Equal(t, b.Columns[i].Count, a.Columns[i].Count)
require.True(t, statistics.HistogramEqual(&a.Columns[i].Histogram, &b.Columns[i].Histogram, false))
if a.Columns[i].CMSketch == nil {
require.Nil(t, b.Columns[i].CMSketch)
Expand Down
3 changes: 0 additions & 3 deletions statistics/integration_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -771,7 +771,6 @@ func TestIndexJoinInnerRowCountUpperBound(t *testing.T) {
require.NoError(t, err)
for i := 1; i <= 2; i++ {
mockStatsTbl.Columns[int64(i)] = &statistics.Column{
Count: 500000,
Histogram: *mockStatsHistogram(int64(i), colValues, 1000, types.NewFieldType(mysql.TypeLonglong)),
Info: tblInfo.Columns[i-1],
StatsLoadedStatus: statistics.NewStatsFullLoadStatus(),
Expand Down Expand Up @@ -821,7 +820,6 @@ func TestOrderingIdxSelectivityThreshold(t *testing.T) {
pkColValues, err := generateIntDatum(1, 100000)
require.NoError(t, err)
mockStatsTbl.Columns[1] = &statistics.Column{
Count: 100000,
Histogram: *mockStatsHistogram(1, pkColValues, 1, types.NewFieldType(mysql.TypeLonglong)),
Info: tblInfo.Columns[0],
StatsLoadedStatus: statistics.NewStatsFullLoadStatus(),
Expand All @@ -838,7 +836,6 @@ func TestOrderingIdxSelectivityThreshold(t *testing.T) {

for i := 2; i <= 3; i++ {
mockStatsTbl.Columns[int64(i)] = &statistics.Column{
Count: 100000,
Histogram: *mockStatsHistogram(int64(i), colValues, 10, types.NewFieldType(mysql.TypeLonglong)),
Info: tblInfo.Columns[i-1],
StatsLoadedStatus: statistics.NewStatsFullLoadStatus(),
Expand Down
41 changes: 0 additions & 41 deletions statistics/interact_with_storage.go
Original file line number Diff line number Diff line change
Expand Up @@ -169,40 +169,6 @@ func FMSketchFromStorage(reader *StatsReader, tblID int64, isIndex, histID int64
return DecodeFMSketch(rows[0].GetBytes(0))
}

// columnCountFromStorage reads the row count of a column from storage.
//
// The count is the sum of the bucket counts in mysql.stats_buckets for the
// given table and column (is_index = 0). When statsVer is at least Version2,
// the sum of the TopN counts in mysql.stats_top_n is added on top. A NULL sum
// contributes nothing. Any read or decimal-conversion error is returned traced,
// together with a zero count.
func columnCountFromStorage(reader *StatsReader, tableID, colID, statsVer int64) (int64, error) {
	var total int64

	bucketRows, _, err := reader.Read("select sum(count) from mysql.stats_buckets where table_id = %? and is_index = 0 and hist_id = %?", tableID, colID)
	if err != nil {
		return 0, errors.Trace(err)
	}
	// If there doesn't exist any buckets, the SQL will return NULL. So we only use the result if it's not NULL.
	if bucketSum := bucketRows[0]; !bucketSum.IsNull(0) {
		total, err = bucketSum.GetMyDecimal(0).ToInt()
		if err != nil {
			return 0, errors.Trace(err)
		}
	}

	// Before stats ver 2, histogram represents all data in this column,
	// so the bucket sum is already the answer.
	if statsVer < Version2 {
		return total, nil
	}

	// In stats ver 2, histogram + TopN represent all data in this column.
	// So we need to add TopN total count here.
	topNRows, _, err := reader.Read("select sum(count) from mysql.stats_top_n where table_id = %? and is_index = 0 and hist_id = %?", tableID, colID)
	if err != nil {
		return 0, errors.Trace(err)
	}
	if topNRows[0].IsNull(0) {
		return total, nil
	}
	topNSum, err := topNRows[0].GetMyDecimal(0).ToInt()
	if err != nil {
		return 0, errors.Trace(err)
	}
	return total + topNSum, nil
}

// ExtendedStatsFromStorage reads extended stats from storage.
func ExtendedStatsFromStorage(reader *StatsReader, table *Table, physicalID int64, loadAll bool) (*Table, error) {
failpoint.Inject("injectExtStatsLoadErr", func() {
Expand Down Expand Up @@ -357,15 +323,10 @@ func columnStatsFromStorage(reader *StatsReader, row chunk.Row, table *Table, ta
// Here is
//For one column, if there is no stats for it in the storage(analyze is never)
if notNeedLoad {
count, err := columnCountFromStorage(reader, table.PhysicalID, histID, statsVer)
if err != nil {
return errors.Trace(err)
}
col = &Column{
PhysicalID: table.PhysicalID,
Histogram: *NewHistogram(histID, distinct, nullCount, histVer, &colInfo.FieldType, 0, totColSize),
Info: colInfo,
Count: count + nullCount,
ErrorRate: errorRate,
IsHandle: tableInfo.PKIsHandle && mysql.HasPriKeyFlag(colInfo.GetFlag()),
Flag: flag,
Expand Down Expand Up @@ -408,8 +369,6 @@ func columnStatsFromStorage(reader *StatsReader, row chunk.Row, table *Table, ta
Flag: flag,
StatsVer: statsVer,
}
// Column.Count is calculated by Column.TotalRowCount(). Hence we don't set Column.Count when initializing col.
col.Count = int64(col.TotalRowCount())
if col.StatsAvailable() {
col.StatsLoadedStatus = NewStatsFullLoadStatus()
}
Expand Down

0 comments on commit 579f47e

Please sign in to comment.