From e2a29eaa86d29777cb37422ee70e5be6680078be Mon Sep 17 00:00:00 2001 From: tangenta Date: Wed, 22 Dec 2021 15:59:53 +0800 Subject: [PATCH 1/2] parser: add IsValid to Encoding to speed up string validation for utf-8 --- parser/charset/encoding.go | 2 ++ parser/charset/encoding_ascii.go | 13 ++++++++++++- parser/charset/encoding_base.go | 9 +++++++++ parser/charset/encoding_bin.go | 5 +++++ parser/charset/encoding_latin1.go | 5 +++++ parser/charset/encoding_utf8.go | 13 +++++++++++++ table/column.go | 2 +- 7 files changed, 47 insertions(+), 2 deletions(-) diff --git a/parser/charset/encoding.go b/parser/charset/encoding.go index 25257c44e440b..92b1433549860 100644 --- a/parser/charset/encoding.go +++ b/parser/charset/encoding.go @@ -57,6 +57,8 @@ type Encoding interface { Tp() EncodingTp // Peek returns the next char. Peek(src []byte) []byte + // IsValid checks whether a utf-8 string is valid in the current encoding. + IsValid(src []byte) bool // Foreach iterates the characters in in current encoding. Foreach(src []byte, op Op, fn func(from, to []byte, ok bool) bool) // Transform map the bytes in src to dest according to Op. diff --git a/parser/charset/encoding_ascii.go b/parser/charset/encoding_ascii.go index df5fed9c3bce2..34432d5b42e3c 100644 --- a/parser/charset/encoding_ascii.go +++ b/parser/charset/encoding_ascii.go @@ -49,8 +49,19 @@ func (e *encodingASCII) Peek(src []byte) []byte { return src[:1] } +// IsValid implements Encoding interface. 
+func (e *encodingASCII) IsValid(src []byte) bool { + srcLen := len(src) + for i := 0; i < srcLen; i++ { + if src[i] > go_unicode.MaxASCII { + return false + } + } + return true +} + func (e *encodingASCII) Transform(dest, src []byte, op Op) ([]byte, error) { - if IsValid(e, src) { + if e.IsValid(src) { return src, nil } return e.encodingBase.Transform(dest, src, op) diff --git a/parser/charset/encoding_base.go b/parser/charset/encoding_base.go index 275db24c5a3d6..213596c6aec55 100644 --- a/parser/charset/encoding_base.go +++ b/parser/charset/encoding_base.go @@ -42,6 +42,15 @@ func (b encodingBase) ToLower(src string) string { return strings.ToLower(src) } +func (b encodingBase) IsValid(src []byte) bool { + isValid := true + b.self.Foreach(src, opFromUTF8, func(from, to []byte, ok bool) bool { + isValid = ok + return ok + }) + return isValid +} + func (b encodingBase) Transform(dest, src []byte, op Op) (result []byte, err error) { if dest == nil { dest = make([]byte, len(src)) diff --git a/parser/charset/encoding_bin.go b/parser/charset/encoding_bin.go index 30fd87644c571..30b35ceb1d856 100644 --- a/parser/charset/encoding_bin.go +++ b/parser/charset/encoding_bin.go @@ -47,6 +47,11 @@ func (e *encodingBin) Peek(src []byte) []byte { return src[:1] } +// IsValid implements Encoding interface. +func (e *encodingBin) IsValid(src []byte) bool { + return true +} + // Foreach implements Encoding interface. func (e *encodingBin) Foreach(src []byte, op Op, fn func(from, to []byte, ok bool) bool) { for i := 0; i < len(src); i++ { diff --git a/parser/charset/encoding_latin1.go b/parser/charset/encoding_latin1.go index 1d2992b87642d..d627ed63ec419 100644 --- a/parser/charset/encoding_latin1.go +++ b/parser/charset/encoding_latin1.go @@ -41,6 +41,11 @@ func (e *encodingLatin1) Peek(src []byte) []byte { return src[:1] } +// IsValid implements Encoding interface. +func (e *encodingLatin1) IsValid(src []byte) bool { + return true +} + // Tp implements Encoding interface. 
func (e *encodingLatin1) Tp() EncodingTp { return EncodingTpLatin1 diff --git a/parser/charset/encoding_utf8.go b/parser/charset/encoding_utf8.go index 871a5e5ec33c1..6c7d79941a31d 100644 --- a/parser/charset/encoding_utf8.go +++ b/parser/charset/encoding_utf8.go @@ -67,6 +67,14 @@ func (e *encodingUTF8) Peek(src []byte) []byte { return src[:nextLen] } +// IsValid implements Encoding interface. +func (e *encodingUTF8) IsValid(src []byte) bool { + if utf8.Valid(src) { + return true + } + return e.encodingBase.IsValid(src) +} + // Transform implements Encoding interface. func (e *encodingUTF8) Transform(dest, src []byte, op Op) ([]byte, error) { if IsValid(e, src) { @@ -93,6 +101,11 @@ type encodingUTF8MB3Strict struct { encodingUTF8 } +// IsValid implements Encoding interface. +func (e *encodingUTF8MB3Strict) IsValid(src []byte) bool { + return e.encodingBase.IsValid(src) +} + // Foreach implements Encoding interface. func (e *encodingUTF8MB3Strict) Foreach(src []byte, op Op, fn func(srcCh, dstCh []byte, ok bool) bool) { for i, w := 0, 0; i < len(src); i += w { diff --git a/table/column.go b/table/column.go index d7e9a9ec5dadb..27efcaaa7e373 100644 --- a/table/column.go +++ b/table/column.go @@ -372,7 +372,7 @@ func validateStringDatum(ctx sessionctx.Context, origin, casted *types.Datum, co } // Check if the string is valid in the given column charset. 
 str := casted.GetBytes() - if !charset.IsValid(enc, str) { + if !enc.IsValid(str) { replace, _ := enc.Transform(nil, str, charset.OpReplace) casted.SetBytesAsString(replace, charset.CollationUTF8MB4, 0) nSrc := charset.CountValidBytes(enc, str) From 0b331622bc9c10437f92423e1ea471ba978f3d92 Mon Sep 17 00:00:00 2001 From: tangenta Date: Wed, 22 Dec 2021 16:10:57 +0800 Subject: [PATCH 2/2] expression, parser: update all the usages of IsValid --- expression/builtin_string.go | 2 +- expression/builtin_string_vec.go | 2 +- expression/collation.go | 3 ++- parser/charset/encoding.go | 17 +---------------- parser/charset/encoding_test.go | 3 +-- parser/charset/encoding_utf8.go | 4 ++-- 6 files changed, 8 insertions(+), 23 deletions(-) diff --git a/expression/builtin_string.go b/expression/builtin_string.go index c494d9fcb5c10..acac019139708 100644 --- a/expression/builtin_string.go +++ b/expression/builtin_string.go @@ -1150,7 +1150,7 @@ func (b *builtinConvertSig) evalString(row chunk.Row) (string, bool, error) { return string(ret), false, err } enc := charset.FindEncoding(resultTp.Charset) - if !charset.IsValidString(enc, expr) { + if !enc.IsValid(hack.Slice(expr)) { replace, _ := enc.Transform(nil, hack.Slice(expr), charset.OpReplace) return string(replace), false, nil } diff --git a/expression/builtin_string_vec.go b/expression/builtin_string_vec.go index 3da555f9319ed..202a3d74ed3f1 100644 --- a/expression/builtin_string_vec.go +++ b/expression/builtin_string_vec.go @@ -689,7 +689,7 @@ func (b *builtinConvertSig) vecEvalString(input *chunk.Chunk, result *chunk.Colu continue } exprI := expr.GetBytes(i) - if !charset.IsValid(enc, exprI) { + if !enc.IsValid(exprI) { encBuf, _ = enc.Transform(encBuf, exprI, charset.OpReplace) result.AppendBytes(encBuf) } else { diff --git a/expression/collation.go b/expression/collation.go index 8dc5df02e55e0..813560775e2b4 100644 --- a/expression/collation.go +++ b/expression/collation.go @@ -22,6 +22,7 @@ import ( 
 "github.com/pingcap/tidb/types" "github.com/pingcap/tidb/util/chunk" "github.com/pingcap/tidb/util/collate" + "github.com/pingcap/tidb/util/hack" "github.com/pingcap/tidb/util/logutil" ) @@ -315,7 +316,7 @@ func safeConvert(ctx sessionctx.Context, ec *ExprCollation, args ...Expression) if isNull { continue } - if !charset.IsValidString(enc, str) { + if !enc.IsValid(hack.Slice(str)) { return false } } else { diff --git a/parser/charset/encoding.go b/parser/charset/encoding.go index 92b1433549860..bf3d6b8ff269c 100644 --- a/parser/charset/encoding.go +++ b/parser/charset/encoding.go @@ -57,7 +57,7 @@ type Encoding interface { Tp() EncodingTp // Peek returns the next char. Peek(src []byte) []byte - // IsValid checks whether a utf-8 string is valid in the current encoding. + // IsValid checks whether the utf-8 bytes can be converted to a valid string in the current encoding. IsValid(src []byte) bool // Foreach iterates the characters in in current encoding. Foreach(src []byte, op Op, fn func(from, to []byte, ok bool) bool) @@ -103,21 +103,6 @@ const ( OpDecodeReplace = opToUTF8 | opTruncateReplace | opCollectTo ) -// IsValid checks whether the bytes is valid in current encoding. -func IsValid(e Encoding, src []byte) bool { - isValid := true - e.Foreach(src, opFromUTF8, func(from, to []byte, ok bool) bool { - isValid = ok - return ok - }) - return isValid -} - -// IsValidString is a string version of IsValid. -func IsValidString(e Encoding, str string) bool { - return IsValid(e, Slice(str)) -} - // CountValidBytes counts the first valid bytes in src that // can be encode to the current encoding. 
func CountValidBytes(e Encoding, src []byte) int { diff --git a/parser/charset/encoding_test.go b/parser/charset/encoding_test.go index a78aa640d8be5..27d41dbf5ebd2 100644 --- a/parser/charset/encoding_test.go +++ b/parser/charset/encoding_test.go @@ -133,8 +133,7 @@ func TestEncodingValidate(t *testing.T) { enc = charset.EncodingUTF8MB3StrictImpl } strBytes := []byte(tc.str) - ok := charset.IsValid(enc, strBytes) - require.Equal(t, tc.ok, ok, msg) + require.Equal(t, tc.ok, enc.IsValid(strBytes), msg) replace, _ := enc.Transform(nil, strBytes, charset.OpReplace) require.Equal(t, tc.expected, string(replace), msg) } diff --git a/parser/charset/encoding_utf8.go b/parser/charset/encoding_utf8.go index 6c7d79941a31d..499ce5ea50de7 100644 --- a/parser/charset/encoding_utf8.go +++ b/parser/charset/encoding_utf8.go @@ -77,7 +77,7 @@ func (e *encodingUTF8) IsValid(src []byte) bool { // Transform implements Encoding interface. func (e *encodingUTF8) Transform(dest, src []byte, op Op) ([]byte, error) { - if IsValid(e, src) { + if e.IsValid(src) { return src, nil } return e.encodingBase.Transform(dest, src, op) @@ -120,7 +120,7 @@ func (e *encodingUTF8MB3Strict) Foreach(src []byte, op Op, fn func(srcCh, dstCh // Transform implements Encoding interface. func (e *encodingUTF8MB3Strict) Transform(dest, src []byte, op Op) ([]byte, error) { - if IsValid(e, src) { + if e.IsValid(src) { return src, nil } return e.encodingBase.Transform(dest, src, op)