*: support using `like` to build ranges for new collation columns (#51164)

close #48181
pingcap · Feb 20, 2024 · 1dae341 · 1dae341
1 parent af141d5
commit 1dae341
Show file tree

Hide file tree

Showing 22 changed files with 1,678 additions and 229 deletions.
diff --git a/pkg/executor/point_get.go b/pkg/executor/point_get.go
@@ -540,11 +540,13 @@ func EncodeUniqueIndexValuesForKey(ctx sessionctx.Context, tblInfo *model.TableI
 		colInfo := tblInfo.Columns[idxInfo.Columns[i].Offset]
 		// table.CastValue will append 0x0 if the string value's length is smaller than the BINARY column's length.
 		// So we don't use CastValue for string value for now.
-		// TODO: merge two if branch.
+		// TODO: The first if branch should have been removed, because the functionality of setting the collation of the datum
+		// has been moved to util/ranger (normal path) and getNameValuePairs/getPointGetValue (fast path). But this change
+		// will be cherry-picked to a hotfix, so we choose to be a bit conservative and keep it for now.
 		if colInfo.GetType() == mysql.TypeString || colInfo.GetType() == mysql.TypeVarString || colInfo.GetType() == mysql.TypeVarchar {
 			var str string
 			str, err = idxVals[i].ToString()
-			idxVals[i].SetString(str, colInfo.FieldType.GetCollate())
+			idxVals[i].SetString(str, idxVals[i].Collation())
 		} else if colInfo.GetType() == mysql.TypeEnum && (idxVals[i].Kind() == types.KindString || idxVals[i].Kind() == types.KindBytes || idxVals[i].Kind() == types.KindBinaryLiteral) {
 			var str string
 			var e types.Enum

diff --git a/pkg/planner/core/casetest/index/index_test.go b/pkg/planner/core/casetest/index/index_test.go
@@ -461,7 +461,7 @@ func TestIndexMergeSingleCaseCouldFeelIndexMergeHint(t *testing.T) {
 	tk.MustQuery("explain format=\"brief\" SELECT /*+ use_index_merge(t, nslc) */ * FROM t WHERE 57260686 member of (fpi) AND \"OC8p1763XTkt.org/s/link\" member of (nslc) LIMIT 1;").Check(
 		testkit.Rows("Limit 1.00 root  offset:0, count:1",
 			"└─IndexMerge 1.00 root  type: union",
-			"  ├─IndexRangeScan(Build) 1.00 cop[tikv] table:t, index:nslc(cast(`nslc` as char(1000) array), point_of_sale_country) range:[0x4F4338703137363358546B742E6F72672F732F6C696E6B,0x4F4338703137363358546B742E6F72672F732F6C696E6B], keep order:false, stats:pseudo",
+			"  ├─IndexRangeScan(Build) 1.00 cop[tikv] table:t, index:nslc(cast(`nslc` as char(1000) array), point_of_sale_country) range:[\"OC8p1763XTkt.org/s/link\",\"OC8p1763XTkt.org/s/link\"], keep order:false, stats:pseudo",
 			"  └─Limit(Probe) 1.00 cop[tikv]  offset:0, count:1",
 			"    └─Selection 1.00 cop[tikv]  json_memberof(cast(57260686, json BINARY), test.t.fpi)",
 			"      └─TableRowIDScan 1.00 cop[tikv] table:t keep order:false, stats:pseudo"))

diff --git a/pkg/planner/core/integration_test.go b/pkg/planner/core/integration_test.go
@@ -2230,14 +2230,14 @@ func TestPlanCacheForIndexJoinRangeFallback(t *testing.T) {
 	tk.MustExec("drop table if exists t1, t2")
 	tk.MustExec("create table t1(a int, b varchar(10), c varchar(10), index idx_a_b(a, b))")
 	tk.MustExec("create table t2(d int)")
-	tk.MustExec("set @@tidb_opt_range_max_size=1275")
-	// 1275 is enough for [? a,? a], [? b,? b], [? c,? c] but is not enough for [? aaaaaa,? aaaaaa], [? bbbbbb,? bbbbbb], [? cccccc,? cccccc].
+	tk.MustExec("set @@tidb_opt_range_max_size=1260")
+	// 1260 is enough for [? a,? a], [? b,? b], [? c,? c] but is not enough for [? aaaaaa,? aaaaaa], [? bbbbbb,? bbbbbb], [? cccccc,? cccccc].
 	rows := tk.MustQuery("explain format='brief' select /*+ inl_join(t1) */ * from  t1 join t2 on t1.a = t2.d where t1.b in ('a', 'b', 'c')").Rows()
 	require.True(t, strings.Contains(rows[6][4].(string), "range: decided by [eq(test.t1.a, test.t2.d) in(test.t1.b, a, b, c)]"))
 	tk.MustQuery("show warnings").Check(testkit.Rows())
 	rows = tk.MustQuery("explain format='brief' select /*+ inl_join(t1) */ * from  t1 join t2 on t1.a = t2.d where t1.b in ('aaaaaa', 'bbbbbb', 'cccccc');").Rows()
-	require.True(t, strings.Contains(rows[6][4].(string), "range: decided by [eq(test.t1.a, test.t2.d)]"))
-	tk.MustQuery("show warnings").Check(testkit.Rows("Warning 1105 Memory capacity of 1275 bytes for 'tidb_opt_range_max_size' exceeded when building ranges. Less accurate ranges such as full range are chosen"))
+	require.Contains(t, rows[6][4].(string), "range: decided by [eq(test.t1.a, test.t2.d)]")
+	tk.MustQuery("show warnings").Check(testkit.Rows("Warning 1105 Memory capacity of 1260 bytes for 'tidb_opt_range_max_size' exceeded when building ranges. Less accurate ranges such as full range are chosen"))
 
 	tk.MustExec("prepare stmt1 from 'select /*+ inl_join(t1) */ * from  t1 join t2 on t1.a = t2.d where t1.b in (?, ?, ?)'")
 	tk.MustExec("set @a='a', @b='b', @c='c'")
@@ -2252,13 +2252,13 @@ func TestPlanCacheForIndexJoinRangeFallback(t *testing.T) {
 	tk.Session().SetSessionManager(&testkit.MockSessionManager{PS: ps})
 	rows = tk.MustQuery(fmt.Sprintf("explain for connection %d", tkProcess.ID)).Rows()
 	// We don't limit range mem usage when rebuilding index join ranges for the cached plan. So [? aaaaaa,? aaaaaa], [? bbbbbb,? bbbbbb], [? cccccc,? cccccc] can be built.
-	require.True(t, strings.Contains(rows[6][4].(string), "range: decided by [eq(test.t1.a, test.t2.d) in(test.t1.b, aaaaaa, bbbbbb, cccccc)]"))
+	require.Contains(t, rows[6][4].(string), "range: decided by [eq(test.t1.a, test.t2.d) in(test.t1.b, aaaaaa, bbbbbb, cccccc)]")
 
 	// Test the plan with range fallback would not be put into cache.
 	tk.MustExec("prepare stmt2 from 'select /*+ inl_join(t1) */ * from  t1 join t2 on t1.a = t2.d where t1.b in (?, ?, ?, ?, ?)'")
 	tk.MustExec("set @a='a', @b='b', @c='c', @d='d', @e='e'")
 	tk.MustExec("execute stmt2 using @a, @b, @c, @d, @e")
-	tk.MustQuery("show warnings").Sort().Check(testkit.Rows("Warning 1105 Memory capacity of 1275 bytes for 'tidb_opt_range_max_size' exceeded when building ranges. Less accurate ranges such as full range are chosen",
+	tk.MustQuery("show warnings").Sort().Check(testkit.Rows("Warning 1105 Memory capacity of 1260 bytes for 'tidb_opt_range_max_size' exceeded when building ranges. Less accurate ranges such as full range are chosen",
 		"Warning 1105 skip prepared plan-cache: in-list is too long"))
 	tk.MustExec("execute stmt2 using @a, @b, @c, @d, @e")
 	tk.MustQuery("select @@last_plan_from_cache").Check(testkit.Rows("0"))

diff --git a/pkg/planner/core/point_get_plan.go b/pkg/planner/core/point_get_plan.go
@@ -1438,7 +1438,15 @@ func getNameValuePairs(ctx sessionctx.Context, tbl *model.TableInfo, tblName mod
 		col := model.FindColumnInfo(tbl.Cols(), colName.Name.Name.L)
 		if col == nil { // Handling the case when the column is _tidb_rowid.
 			return append(nvPairs, nameValuePair{colName: colName.Name.Name.L, colFieldType: types.NewFieldType(mysql.TypeLonglong), value: d, con: con}), false
-		} else if col.GetType() == mysql.TypeString && col.GetCollate() == charset.CollationBin { // This type we needn't to pad `\0` in here.
+		}
+
+		// As in buildFromBinOp in util/ranger, when we build a key from the expression to do a range scan or point get on
+		// a string column, we should set the collation of the string datum to the collation of the column.
+		if col.FieldType.EvalType() == types.ETString && (d.Kind() == types.KindString || d.Kind() == types.KindBinaryLiteral) {
+			d.SetString(d.GetString(), col.FieldType.GetCollate())
+		}
+
+		if col.GetType() == mysql.TypeString && col.GetCollate() == charset.CollationBin { // This type we needn't to pad `\0` in here.
 			return append(nvPairs, nameValuePair{colName: colName.Name.Name.L, colFieldType: &col.FieldType, value: d, con: con}), false
 		}
 		if !checkCanConvertInPointGet(col, d) {
@@ -1468,6 +1476,11 @@ func getPointGetValue(stmtCtx *stmtctx.StatementContext, col *model.ColumnInfo,
 	if !checkCanConvertInPointGet(col, *d) {
 		return nil
 	}
+	// As in buildFromBinOp in util/ranger, when we build a key from the expression to do a range scan or point get on
+	// a string column, we should set the collation of the string datum to the collation of the column.
+	if col.FieldType.EvalType() == types.ETString && (d.Kind() == types.KindString || d.Kind() == types.KindBinaryLiteral) {
+		d.SetString(d.GetString(), col.FieldType.GetCollate())
+	}
 	dVal, err := d.ConvertTo(stmtCtx, &col.FieldType)
 	if err != nil {
 		return nil

diff --git a/pkg/planner/core/testdata/index_merge_suite_out.json b/pkg/planner/core/testdata/index_merge_suite_out.json
@@ -131,8 +131,8 @@
           "IndexMerge 0.00 root  type: intersection",
           "├─IndexRangeScan(Build) 10.00 cop[tikv] table:t5, index:is1(s1) range:[\"Abc\",\"Abc\"], keep order:false, stats:pseudo",
           "├─IndexRangeScan(Build) 3333.33 cop[tikv] table:t5, index:is2(s2) range:(\"zzz\",+inf], keep order:false, stats:pseudo",
-          "├─IndexRangeScan(Build) 3323.33 cop[tikv] table:t5, index:is3(s3) range:[-inf,\"B啊a\"), keep order:false, stats:pseudo",
-          "├─IndexRangeScan(Build) 10.00 cop[tikv] table:t5, index:is4(s4) range:[\"CcC\",\"CcC\"], keep order:false, stats:pseudo",
+          "├─IndexRangeScan(Build) 3323.33 cop[tikv] table:t5, index:is3(s3) range:[-inf,\"\\x0eJ\\xfb@\\xd5J\\x0e3\"), keep order:false, stats:pseudo",
+          "├─IndexRangeScan(Build) 10.00 cop[tikv] table:t5, index:is4(s4) range:[\"CCC\",\"CCC\"], keep order:false, stats:pseudo",
           "└─TableRowIDScan(Probe) 0.00 cop[tikv] table:t5 keep order:false, stats:pseudo"
         ],
         "Result": [
@@ -144,7 +144,7 @@
         "Plan": [
           "IndexMerge 0.03 root  type: intersection",
           "├─IndexRangeScan(Build) 33.33 cop[tikv] table:t6, index:PRIMARY(s1, s2) range:(\"Abc\" \"zzz\",\"Abc\" +inf], keep order:false, stats:pseudo",
-          "├─IndexRangeScan(Build) 10.00 cop[tikv] table:t6, index:is3(s3) range:[\"A啊a\",\"A啊a\"], keep order:false, stats:pseudo",
+          "├─IndexRangeScan(Build) 10.00 cop[tikv] table:t6, index:is3(s3) range:[\"\\x0e3\\xfb@\\xd5J\\x0e3\",\"\\x0e3\\xfb@\\xd5J\\x0e3\"], keep order:false, stats:pseudo",
           "└─Selection(Probe) 0.03 cop[tikv]  gt(test.t6.s2, \"zzz\"), not(like(test.t6.s4, \"Cd_\", 92))",
           "  └─TableRowIDScan 0.03 cop[tikv] table:t6 keep order:false, stats:pseudo"
         ],
@@ -172,13 +172,14 @@
       {
         "SQL": "select /*+ use_index_merge(t8, primary,is2,is3,is4,is5) */ * from t8 where s1 like '啊A%' and s2 > 'abc' and s3 > 'cba' and s4 in ('aA', '??') and s5 = 'test,2'",
         "Plan": [
-          "Selection 1.42 root  eq(test.t8.s5, \"test,2\")",
-          "└─IndexMerge 0.59 root  type: intersection",
-          "  ├─IndexRangeScan(Build) 3333.33 cop[tikv] table:t8, index:is2(s2) range:(0x616263,+inf], keep order:false, stats:pseudo",
-          "  ├─IndexRangeScan(Build) 3333.33 cop[tikv] table:t8, index:is3(s3) range:(0x636261,+inf], keep order:false, stats:pseudo",
+          "Selection 0.04 root  eq(test.t8.s5, \"test,2\")",
+          "└─IndexMerge 0.06 root  type: intersection",
+          "  ├─IndexRangeScan(Build) 250.00 cop[tikv] table:t8, index:PRIMARY(s1) range:[\"UJ\\x00A\",\"UJ\\x00B\"), keep order:false, stats:pseudo",
+          "  ├─IndexRangeScan(Build) 3333.33 cop[tikv] table:t8, index:is2(s2) range:(\"abc\",+inf], keep order:false, stats:pseudo",
+          "  ├─IndexRangeScan(Build) 3333.33 cop[tikv] table:t8, index:is3(s3) range:(\"cba\",+inf], keep order:false, stats:pseudo",
           "  ├─IndexRangeScan(Build) 20.00 cop[tikv] table:t8, index:is4(s4) range:[\"aA\",\"aA\"], [\"??\",\"??\"], keep order:false, stats:pseudo",
-          "  └─Selection(Probe) 0.59 cop[tikv]  gt(test.t8.s3, \"cba\"), like(test.t8.s1, \"啊A%\", 92)",
-          "    └─TableRowIDScan 2.22 cop[tikv] table:t8 keep order:false, stats:pseudo"
+          "  └─Selection(Probe) 0.06 cop[tikv]  gt(test.t8.s3, \"cba\"), like(test.t8.s1, \"啊A%\", 92)",
+          "    └─TableRowIDScan 0.06 cop[tikv] table:t8 keep order:false, stats:pseudo"
         ],
         "Result": [
           "啊aabbccdd abcc cccc aA tEsT,2"

diff --git a/pkg/util/ranger/BUILD.bazel b/pkg/util/ranger/BUILD.bazel
@@ -30,6 +30,7 @@ go_library(
         "//pkg/util/codec",
         "//pkg/util/collate",
         "//pkg/util/dbterror",
+        "//pkg/util/hack",
         "//pkg/util/mathutil",
         "@com_github_pingcap_errors//:errors",
     ],

diff --git a/pkg/util/ranger/checker.go b/pkg/util/ranger/checker.go
@@ -18,12 +18,14 @@ import (
 	"github.com/pingcap/tidb/pkg/expression"
 	"github.com/pingcap/tidb/pkg/parser/ast"
 	"github.com/pingcap/tidb/pkg/parser/mysql"
+	"github.com/pingcap/tidb/pkg/sessionctx"
 	"github.com/pingcap/tidb/pkg/types"
 	"github.com/pingcap/tidb/pkg/util/collate"
 )
 
 // conditionChecker checks if this condition can be pushed to index planner.
 type conditionChecker struct {
+	ctx                      sessionctx.Context
 	checkerCol               *expression.Column
 	length                   int
 	optPrefixIndexSingleScan bool
@@ -139,16 +141,6 @@ func (c *conditionChecker) checkScalarFunction(scalar *expression.ScalarFunction
 
 func (c *conditionChecker) checkLikeFunc(scalar *expression.ScalarFunction) (isAccessCond, shouldReserve bool) {
 	_, collation := scalar.CharsetAndCollation()
-	if collate.NewCollationEnabled() && !collate.IsBinCollation(collation) {
-		// The algorithm constructs the range in byte-level: for example, ab% is mapped to [ab, ac] by adding 1 to the last byte.
-		// However, this is incorrect for non-binary collation strings because the sort key order is not the same as byte order.
-		// For example, "`%" is mapped to the range [`, a](where ` is 0x60 and a is 0x61).
-		// Because the collation utf8_general_ci is case-insensitive, a and A have the same sort key.
-		// Finally, the range comes to be [`, A], which is actually an empty range.
-		// See https://github.com/pingcap/tidb/issues/31174 for more details.
-		// In short, when the column type is non-binary collation string, we cannot use `like` expressions to generate the range.
-		return false, true
-	}
 	if !collate.CompatibleCollate(scalar.GetArgs()[0].GetType().GetCollate(), collation) {
 		return false, true
 	}