-
Notifications
You must be signed in to change notification settings - Fork 5.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
table: improve error message for incorrect utf8 value #25087
Conversation
We could change this to something like: diff --git a/table/column.go b/table/column.go
index 6bdcceaa1..583740000 100644
--- a/table/column.go
+++ b/table/column.go
@@ -139,19 +139,10 @@ func truncateTrailingSpaces(v *types.Datum) {
v.SetString(str, v.Collation())
}
-func handleWrongASCIIValue(ctx sessionctx.Context, col *model.ColumnInfo, casted *types.Datum, str string, i int) (types.Datum, error) {
+func handleWrongCharsetValue(ctx sessionctx.Context, col *model.ColumnInfo, casted *types.Datum, str string, i int) (types.Datum, error) {
sc := ctx.GetSessionVars().StmtCtx
- err := ErrTruncatedWrongValueForField.FastGen("incorrect ascii value hex=%x string=%s for column %s", casted.GetBytes(), str, col.Name)
- logutil.BgLogger().Error("incorrect ASCII value", zap.Uint64("conn", ctx.GetSessionVars().ConnectionID), zap.Error(err))
- truncateVal := types.NewStringDatum(str[:i])
- err = sc.HandleTruncate(err)
- return truncateVal, err
-}
-
-func handleWrongUtf8Value(ctx sessionctx.Context, col *model.ColumnInfo, casted *types.Datum, str string, i int) (types.Datum, error) {
- sc := ctx.GetSessionVars().StmtCtx
- err := ErrTruncatedWrongValueForField.FastGen("incorrect utf8 value hex=%x string=%s for column %s", casted.GetBytes(), str, col.Name)
- logutil.BgLogger().Error("incorrect UTF-8 value", zap.Uint64("conn", ctx.GetSessionVars().ConnectionID), zap.Error(err))
+ err := ErrTruncatedWrongValueForField.FastGen("incorrect %s/%s value hex=%x string=%s for column %s", col.Charset, col.Collate, casted.GetBytes(), str, col.Name)
+ logutil.BgLogger().Error("incorrect value for charset", zap.Uint64("conn", ctx.GetSessionVars().ConnectionID), zap.Error(err))
// Truncate to valid utf8 string.
truncateVal := types.NewStringDatum(str[:i])
err = sc.HandleTruncate(err)
@@ -280,7 +271,7 @@ func CastValue(ctx sessionctx.Context, val types.Datum, col *model.ColumnInfo, r
str := casted.GetString()
for i := 0; i < len(str); i++ {
if str[i] > unicode.MaxASCII {
- casted, err = handleWrongASCIIValue(ctx, col, &casted, str, i)
+ casted, err = handleWrongCharsetValue(ctx, col, &casted, str, i)
break
}
}
@@ -307,11 +298,11 @@ func CastValue(ctx sessionctx.Context, val types.Datum, col *model.ColumnInfo, r
w = width
continue
}
- casted, err = handleWrongUtf8Value(ctx, col, &casted, str, i)
+ casted, err = handleWrongCharsetValue(ctx, col, &casted, str, i)
break
} else if width > 3 && doMB4CharCheck {
// Handle non-BMP characters.
- casted, err = handleWrongUtf8Value(ctx, col, &casted, str, i)
+ casted, err = handleWrongCharsetValue(ctx, col, &casted, str, i)
break
}
w = width But maybe the simple change as proposed in this PR is sufficient. |
What MySQL does: mysql 8.0.22 > create table t1(id int auto_increment primary key, c1 varchar(255) character set ascii);
Query OK, 0 rows affected (0.17 sec)
mysql 8.0.22 > insert into t1(c1) values (0x24412430303524031A69251C34295C4B35167C7F1E5A7B63091349503974624D34504B5A424679354856336868686F52485A736E4A733368786E427575516C73446469496537);
Query OK, 1 row affected (0.02 sec)
mysql 8.0.22 > select * from t1;
+----+------------------------------------------------------------------------+
| id | c1 |
+----+------------------------------------------------------------------------+
| 1 | $A$005$�i%4)\K5|Z{c IP9tbM4PKZBFy5HV3hhhoRHZsnJs3hxnBuuQlsDdiIe7 |
+----+------------------------------------------------------------------------+
1 row in set (0.00 sec)
mysql 8.0.22 > show create table t1\G
*************************** 1. row ***************************
Table: t1
Create Table: CREATE TABLE `t1` (
`id` int NOT NULL AUTO_INCREMENT,
`c1` varchar(255) CHARACTER SET ascii COLLATE ascii_general_ci DEFAULT NULL,
PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=2 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci
1 row in set (0.01 sec)
mysql 8.0.22 > insert into t1(c1) values ('$A$005$�i%4)\K5|Z{cIP9tbM4PKZBFy5HV3hhhoRHZsnJs3hxnBuuQlsDdiIe7');
ERROR 1366 (HY000): Incorrect string value: '\xEF\xBF\xBDi%4...' for column 'c1' at row 1 What TiDB does (with my patch): 5.7.25-TiDB-v5.1.0-alpha-68-g5dfb4888d-dirty 127.0.0.1:4000 test SQL create table t1(id int auto_increment primary key, c1 varchar(255) character set ascii);
Query OK, 0 rows affected (0.0116 sec)
5.7.25-TiDB-v5.1.0-alpha-68-g5dfb4888d-dirty 127.0.0.1:4000 test SQL insert into t1(c1) values (0x24412430303524031A69251C34295C4B35167C7F1E5A7B63091349503974624D34504B5A424679354856336868686F52485A736E4A733368786E427575516C73446469496537);
Query OK, 1 row affected (0.0060 sec)
5.7.25-TiDB-v5.1.0-alpha-68-g5dfb4888d-dirty 127.0.0.1:4000 test SQL select * from t1;
+----+------------------------------------------------------------------------+
| id | c1 |
+----+------------------------------------------------------------------------+
| 1 | $A$005$�i%4)\K5|Z{c IP9tbM4PKZBFy5HV3hhhoRHZsnJs3hxnBuuQlsDdiIe7 |
+----+------------------------------------------------------------------------+
1 row in set (0.0007 sec)
5.7.25-TiDB-v5.1.0-alpha-68-g5dfb4888d-dirty 127.0.0.1:4000 test SQL show create table t1\G
*************************** 1. row ***************************
Table: t1
Create Table: CREATE TABLE `t1` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`c1` varchar(255) CHARACTER SET ascii COLLATE ascii_bin DEFAULT NULL,
PRIMARY KEY (`id`) /*T![clustered_index] CLUSTERED */
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin AUTO_INCREMENT=39122
1 row in set (0.0012 sec)
5.7.25-TiDB-v5.1.0-alpha-68-g5dfb4888d-dirty 127.0.0.1:4000 test SQL insert into t1(c1) values ('$A$005$�i%4)\K5|Z{cIP9tbM4PKZBFy5HV3hhhoRHZsnJs3hxnBuuQlsDdiIe7');
ERROR: 1366 (HY000): incorrect ascii/ascii_bin value hex=24412430303524efbfbd692534294b357c5a7b6349503974624d34504b5a424679354856336868686f52485a736e4a733368786e427575516c73446469496537 string=$A$005$�i%4)K5|Z{cIP9tbM4PKZBFy5HV3hhhoRHZsnJs3hxnBuuQlsDdiIe7 for column c1 I don't think we should try to mimic the exact error that MySQL is giving as our message seems to be more informative. |
|
With the 5.7.25-TiDB-v5.1.0-alpha-68-g5dfb4888d-dirty 127.0.0.1:4000 test SQL insert into t1(c1) values ('$A$005$�i%4)\K5|Z{cIP9tbM4PKZBFy5HV3hhhoRHZsnJs3hxnBuuQlsDdiIe7');
ERROR: 1366 (HY000): incorrect ascii/ascii_bin value hex=24412430303524efbfbd692534294b357c5a7b6349503974624d34504b5a424679354856336868686f52485a736e4a733368786e427575516c73446469496537 string=$A$005$�i%4)K5|Z{cIP9tbM4PKZBFy5HV3hhhoRHZsnJs3hxnBuuQlsDdiIe7 at character 7 for column c1 |
Would you like to do it in this PR, or in the next PR? |
I could include it in this PR if others think this is useful. If that isn't the case we can just merge this PR as-is. |
@morgo what do you think about this? minimal change as proposed, or should we add more details? Or should this be identical to MySQL? |
I do prefer identical to MySQL, but it is not a strong preference. |
I prefer the MySQL message because the row number is reported (and useful); however, it is not easy to achieve since the row number is lost in the call stack. So I think we can keep the current patch and put the
IMO it's a good idea!
I think we can include the charset but not the collation (it is not related to the charset encoding). And if the explicit charset name is given, 'changing utf8 to UTF-8' is not needed. |
5dfb488
to
01ae0c4
Compare
I've tried to make this more similar to what MySQL does, but without the row number. |
01ae0c4
to
6dc9cb3
Compare
[dvaneeden@dve-carbon tidb]$ mysqlsh --quiet-start=2 mysql://root@127.0.0.1:4000/test 5.7.25-TiDB-v5.2.0-alpha-8-g8f3f2158f-dirty 127.0.0.1:4000 test SQL insert into t1(c1) values ('$A$005$�i%4)\K5|Z{cIP9tbM4PKZBFy5HV3hhhoRHZsnJs3hxnBuuQlsDdiIe7'); ERROR: 1366 (HY000): Incorrect string value '\xEF\xBF\xBDi%4...' for column 'c1' |
@dveeden It looks good to me, but can you write an integration test ( |
6dc9cb3
to
459aad5
Compare
done |
[REVIEW NOTIFICATION] This pull request has been approved by:
To complete the pull request process, please ask the reviewers in the list to review by filling The full list of commands accepted by this bot can be found here. Reviewer can indicate their review by submitting an approval review. |
17496b3
to
428808b
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
/merge |
/run-check_dev_2 |
ac7b5c5
to
8cfba98
Compare
@dveeden: Your PR was out of date, I have automatically updated it for you. At the same time I will also trigger all tests for you: /run-all-tests If the CI test fails, you just re-trigger the test that failed and the bot will merge the PR for you after the CI passes. Instructions for interacting with me using PR comments are available here. If you have questions or suggestions related to my behavior, please file an issue against the ti-community-infra/tichi repository. |
/merge cancel |
It seems the test
|
This now more closely matches the output from MySQL 8.0
e75184c
to
321842f
Compare
/merge |
This pull request has been accepted and is ready to merge. Commit hash: 321842f
|
What problem does this PR solve?
While working on #24991 I hit an
incorrect utf8 value
error. As the HEX and string representations of the string are concatenated, it looked like a single value while it was
not. This more clearly splits and labels the two.
Check List
Tests
Release note
Some thoughts/questions related to this:
incorrect utf8 value
even if the column is utf8mb4
instead of utf8
. Should we use col.Charset
here? Or change utf8
to UTF-8
? I think the latter would be the best option.