From 6c40feda19b4a7b9a146b055a248419e23379e56 Mon Sep 17 00:00:00 2001 From: winkyao Date: Tue, 18 Dec 2018 19:28:56 +0800 Subject: [PATCH 1/5] table: check non-BMP characters and return error when the charset is utf8 and sql mode is strict mode --- executor/statement_context_test.go | 15 +++++++++++++++ table/column.go | 30 +++++++++++++++++++++++------- 2 files changed, 38 insertions(+), 7 deletions(-) diff --git a/executor/statement_context_test.go b/executor/statement_context_test.go index df7d86d4115a8..8a65d13aeb151 100644 --- a/executor/statement_context_test.go +++ b/executor/statement_context_test.go @@ -81,4 +81,19 @@ func (s *testSuite) TestStatementContext(c *C) { tk.MustExec("set @@tidb_skip_utf8_check = '0'") runeErrStr := string(utf8.RuneError) tk.MustExec(fmt.Sprintf("insert sc2 values ('%s')", runeErrStr)) + + // Test non-BMP characters. + tk.MustExec(nonStrictModeSQL) + tk.MustExec("truncate table sc2") + tk.MustExec("insert sc2 values (unhex('f09f8c80'))") + c.Assert(tk.Se.GetSessionVars().StmtCtx.WarningCount(), Greater, uint16(0)) + tk.MustQuery("select * from sc2").Check(testkit.Rows("")) + tk.MustExec("insert sc2 values (unhex('4040f09f8c80'))") + c.Assert(tk.Se.GetSessionVars().StmtCtx.WarningCount(), Greater, uint16(0)) + tk.MustQuery("select * from sc2").Check(testkit.Rows("", "@@")) + tk.MustQuery("select length(a) from sc2").Check(testkit.Rows("0", "2")) + tk.MustExec(strictModeSQL) + tk.MustExec("insert sc2 values (unhex('f09f8c80'))") + c.Assert(err, NotNil) + c.Assert(terror.ErrorEqual(err, table.ErrTruncateWrongValue), IsTrue, Commentf("err %v", err)) } diff --git a/table/column.go b/table/column.go index 00ed5b5aa9192..0f833de299c77 100644 --- a/table/column.go +++ b/table/column.go @@ -145,6 +145,16 @@ func CastValues(ctx sessionctx.Context, rec []types.Datum, cols []*Column) (err return nil } +func handleWrongUtf8Value(ctx sessionctx.Context, col *model.ColumnInfo, casted *types.Datum, str string, i int) (types.Datum, error) { + sc := ctx.GetSessionVars().StmtCtx + err := ErrTruncateWrongValue.FastGen("incorrect utf8 value %x(%s) for column %s", casted.GetBytes(), str, col.Name) + log.Errorf("con:%d %v", ctx.GetSessionVars().ConnectionID, err) + // Truncate to valid utf8 string. + truncateVal := types.NewStringDatum(str[:i]) + err = sc.HandleTruncate(err) + return truncateVal, err +} + // CastValue casts a value based on column type. func CastValue(ctx sessionctx.Context, val types.Datum, col *model.ColumnInfo) (casted types.Datum, err error) { sc := ctx.GetSessionVars().StmtCtx @@ -166,18 +176,24 @@ func CastValue(ctx sessionctx.Context, val types.Datum, col *model.ColumnInfo) ( return casted, nil } str := casted.GetString() - for i, r := range str { - if r == utf8.RuneError { + + for i, w := 0, 0; i < len(str); i += w { + runeValue, width := utf8.DecodeRuneInString(str[i:]) + if runeValue == utf8.RuneError { if strings.HasPrefix(str[i:], string(utf8.RuneError)) { + w = width continue } - err = ErrTruncateWrongValue.FastGen("incorrect utf8 value %x(%s) for column %s", casted.GetBytes(), str, col.Name) - log.Errorf("con:%d %v", ctx.GetSessionVars().ConnectionID, err) - // Truncate to valid utf8 string. - casted = types.NewStringDatum(str[:i]) - err = sc.HandleTruncate(err) + casted, err = handleWrongUtf8Value(ctx, col, &casted, str, i) + break + } + + if width > 3 && col.Charset == mysql.UTF8Charset { + // Handle non-BMP characters. + casted, err = handleWrongUtf8Value(ctx, col, &casted, str, i) break } + w = width } return casted, errors.Trace(err) From a994183c36696efacae130a56497a9b938dbe626 Mon Sep 17 00:00:00 2001 From: winkyao Date: Tue, 18 Dec 2018 19:32:08 +0800 Subject: [PATCH 2/5] add test --- executor/statement_context_test.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/executor/statement_context_test.go b/executor/statement_context_test.go index 8a65d13aeb151..b081ac2f0ef61 100644 --- a/executor/statement_context_test.go +++ b/executor/statement_context_test.go @@ -96,4 +96,7 @@ func (s *testSuite) TestStatementContext(c *C) { tk.MustExec("insert sc2 values (unhex('f09f8c80'))") c.Assert(err, NotNil) c.Assert(terror.ErrorEqual(err, table.ErrTruncateWrongValue), IsTrue, Commentf("err %v", err)) + tk.MustExec("insert sc2 values (unhex('F0A48BAE'))") + c.Assert(err, NotNil) + c.Assert(terror.ErrorEqual(err, table.ErrTruncateWrongValue), IsTrue, Commentf("err %v", err)) } From 7b2f0b5f238188e61b242044097dc7a0299395fa Mon Sep 17 00:00:00 2001 From: winkyao Date: Tue, 18 Dec 2018 19:39:16 +0800 Subject: [PATCH 3/5] fix --- table/column.go | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/table/column.go b/table/column.go index 0f833de299c77..90f2b2648c39f 100644 --- a/table/column.go +++ b/table/column.go @@ -186,9 +186,7 @@ func CastValue(ctx sessionctx.Context, val types.Datum, col *model.ColumnInfo) ( } casted, err = handleWrongUtf8Value(ctx, col, &casted, str, i) break - } - - if width > 3 && col.Charset == mysql.UTF8Charset { + } else if width > 3 && col.Charset == mysql.UTF8Charset { // Handle non-BMP characters. casted, err = handleWrongUtf8Value(ctx, col, &casted, str, i) break From f7d2036777f804cd69d48fe921e24aa8fad52063 Mon Sep 17 00:00:00 2001 From: winkyao Date: Tue, 18 Dec 2018 19:53:59 +0800 Subject: [PATCH 4/5] fix ci --- executor/statement_context_test.go | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/executor/statement_context_test.go b/executor/statement_context_test.go index b081ac2f0ef61..895e7a81e2df6 100644 --- a/executor/statement_context_test.go +++ b/executor/statement_context_test.go @@ -84,19 +84,21 @@ func (s *testSuite) TestStatementContext(c *C) { // Test non-BMP characters. tk.MustExec(nonStrictModeSQL) - tk.MustExec("truncate table sc2") - tk.MustExec("insert sc2 values (unhex('f09f8c80'))") + tk.MustExec("drop table if exists t1") + tk.MustExec("create table t1(a varchar(100) charset utf8);") + defer tk.MustExec("drop table if exists t1") + tk.MustExec("insert t1 values (unhex('f09f8c80'))") c.Assert(tk.Se.GetSessionVars().StmtCtx.WarningCount(), Greater, uint16(0)) - tk.MustQuery("select * from sc2").Check(testkit.Rows("")) - tk.MustExec("insert sc2 values (unhex('4040f09f8c80'))") + tk.MustQuery("select * from t1").Check(testkit.Rows("")) + tk.MustExec("insert t1 values (unhex('4040f09f8c80'))") c.Assert(tk.Se.GetSessionVars().StmtCtx.WarningCount(), Greater, uint16(0)) - tk.MustQuery("select * from sc2").Check(testkit.Rows("", "@@")) - tk.MustQuery("select length(a) from sc2").Check(testkit.Rows("0", "2")) + tk.MustQuery("select * from t1").Check(testkit.Rows("", "@@")) + tk.MustQuery("select length(a) from t1").Check(testkit.Rows("0", "2")) tk.MustExec(strictModeSQL) - tk.MustExec("insert sc2 values (unhex('f09f8c80'))") + _, err = tk.Exec("insert t1 values (unhex('f09f8c80'))") c.Assert(err, NotNil) c.Assert(terror.ErrorEqual(err, table.ErrTruncateWrongValue), IsTrue, Commentf("err %v", err)) - tk.MustExec("insert sc2 values (unhex('F0A48BAE'))") + _, err = tk.Exec("insert t1 values (unhex('F0A48BAE'))") c.Assert(err, NotNil) c.Assert(terror.ErrorEqual(err, table.ErrTruncateWrongValue), IsTrue, Commentf("err %v", err)) } From 0de7c0a9e86160e91b67ec0efa6d162cdab7a62a Mon Sep 17 00:00:00 2001 From: winkyao Date: Wed, 19 Dec 2018 15:05:51 +0800 Subject: [PATCH 5/5] refine check --- table/column.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/table/column.go b/table/column.go index 90f2b2648c39f..d6e2d6affd832 100644 --- a/table/column.go +++ b/table/column.go @@ -176,7 +176,7 @@ func CastValue(ctx sessionctx.Context, val types.Datum, col *model.ColumnInfo) ( return casted, nil } str := casted.GetString() - + utf8Charset := col.Charset == mysql.UTF8Charset for i, w := 0, 0; i < len(str); i += w { runeValue, width := utf8.DecodeRuneInString(str[i:]) if runeValue == utf8.RuneError { @@ -186,7 +186,7 @@ func CastValue(ctx sessionctx.Context, val types.Datum, col *model.ColumnInfo) ( } casted, err = handleWrongUtf8Value(ctx, col, &casted, str, i) break - } else if width > 3 && col.Charset == mysql.UTF8Charset { + } else if width > 3 && utf8Charset { // Handle non-BMP characters. casted, err = handleWrongUtf8Value(ctx, col, &casted, str, i) break