From 30fc64c323d262e0faad10150d31025d06efe517 Mon Sep 17 00:00:00 2001 From: Zhigao Tong Date: Tue, 2 Aug 2022 17:02:06 +0800 Subject: [PATCH] Optimize expression `LIKE() ESCAPE()` for bin collator (#5489) ref pingcap/tiflash#5294 --- .../Functions/CollationOperatorOptimized.h | 14 +- .../CollationStringSearchOptimized.h | 472 ++++++++++++++++++ dbms/src/Functions/FunctionsStringSearch.cpp | 58 +-- dbms/src/Storages/Transaction/Collator.cpp | 2 - .../Transaction/tests/gtest_tidb_collator.cpp | 141 +++++- 5 files changed, 637 insertions(+), 50 deletions(-) create mode 100644 dbms/src/Functions/CollationStringSearchOptimized.h diff --git a/dbms/src/Functions/CollationOperatorOptimized.h b/dbms/src/Functions/CollationOperatorOptimized.h index 8276a41fa17..e1bf36a537f 100644 --- a/dbms/src/Functions/CollationOperatorOptimized.h +++ b/dbms/src/Functions/CollationOperatorOptimized.h @@ -60,8 +60,6 @@ __attribute__((flatten, always_inline)) inline void LoopTwoColumns( { ColumnString::Offset a_prev_offset = 0; ColumnString::Offset b_prev_offset = 0; - const auto * a_ptr = reinterpret_cast(a_data.data()); - const auto * b_ptr = reinterpret_cast(b_data.data()); for (size_t i = 0; i < size; ++i) { @@ -69,10 +67,9 @@ __attribute__((flatten, always_inline)) inline void LoopTwoColumns( auto b_size = b_offsets[i] - b_prev_offset; // Remove last zero byte. - func({a_ptr, a_size - 1}, {b_ptr, b_size - 1}, i); - - a_ptr += a_size; - b_ptr += b_size; + func({reinterpret_cast(&a_data[a_prev_offset]), a_size - 1}, + {reinterpret_cast(&b_data[b_prev_offset]), b_size - 1}, + i); a_prev_offset = a_offsets[i]; b_prev_offset = b_offsets[i]; @@ -89,16 +86,13 @@ __attribute__((flatten, always_inline)) inline void LoopOneColumn( F && func) { ColumnString::Offset a_prev_offset = 0; - const auto * a_ptr = reinterpret_cast(a_data.data()); for (size_t i = 0; i < size; ++i) { auto a_size = a_offsets[i] - a_prev_offset; // Remove last zero byte. - func({a_ptr, a_size - 1}, i); - - a_ptr += a_size; + func({reinterpret_cast(&a_data[a_prev_offset]), a_size - 1}, i); a_prev_offset = a_offsets[i]; } } diff --git a/dbms/src/Functions/CollationStringSearchOptimized.h b/dbms/src/Functions/CollationStringSearchOptimized.h new file mode 100644 index 00000000000..499b95ce36e --- /dev/null +++ b/dbms/src/Functions/CollationStringSearchOptimized.h @@ -0,0 +1,472 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace TiDB +{ + +static constexpr char ANY = '%'; +static constexpr char ONE = '_'; + +/* + Unicode Code UTF-8 Code + 0000~007F 0xxxxxxx + 0080~07FF 110xxxxx 10xxxxxx + 0800~FFFF 1110xxxx 10xxxxxx 10xxxxxx + 10000~10FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx +*/ +template +inline size_t BinCharSizeFromHead(const uint8_t b0) +{ + if constexpr (!utf8) + { + return 1; + } + return DB::UTF8::seqLength(b0); +} + +template +inline size_t BinCharSizeFromEnd(const char * b_, const char * begin_) +{ + if constexpr (!utf8) + { + return 1; + } + + const auto * b = reinterpret_cast(b_); + if (*b < 0x80) + { + return 1; + } + const auto * ori = b; + + const auto * begin = reinterpret_cast(begin_); + + // check range in case that bin str is invalid + while (begin < b && *b < 0xC0) + { + --b; + } + return ori - b + 1; +} + +template +struct BinStrPattern +{ + void compile(std::string_view pattern, char escape_) + { + { + match_sub_str.clear(); + match_sub_str.reserve(8); + match_types.clear(); + match_types.reserve(8); + } + escape = escape_; + + auto last_match_start = std::string_view::npos; + + const auto & fn_try_add_last_match_str = [&](size_t end_offset) { + if (last_match_start != std::string_view::npos) + { + match_sub_str.emplace_back(&pattern[last_match_start], end_offset - last_match_start); + match_types.emplace_back(MatchType::Match); + // reset + last_match_start = std::string_view::npos; + } + }; + + for (size_t offset = 0; offset < pattern.size();) + { + auto c = pattern[offset]; + auto cur_offset = offset; + auto size = BinCharSizeFromHead(pattern[offset]); + offset += size; // move next + + if (size == 1) + { + if (c == escape) + { + fn_try_add_last_match_str(cur_offset); + + if (offset < pattern.size()) + { + // start from current offset + last_match_start = offset; + + // use next to match + auto new_size = BinCharSizeFromHead(pattern[offset]); + offset += new_size; // move next + } + else + { + // use `escape` to match + match_sub_str.emplace_back(&escape, sizeof(escape)); + match_types.emplace_back(MatchType::Match); + } + } + else if (c == ANY) + { + fn_try_add_last_match_str(cur_offset); + match_types.emplace_back(MatchType::Any); + } + else if (c == ONE) + { + fn_try_add_last_match_str(cur_offset); + match_types.emplace_back(MatchType::One); + } + else + { + // if last match start offset is none, start from current offset. + last_match_start = last_match_start == std::string_view::npos ? cur_offset : last_match_start; + } + } + else + { + // if last match start offset is none, start from current offset. + last_match_start = last_match_start == std::string_view::npos ? cur_offset : last_match_start; + } + } + fn_try_add_last_match_str(pattern.size()); + } + struct MatchDesc + { + ssize_t pattern_index_start{}, pattern_index_end{}; + ssize_t match_str_index_start{}, match_str_index_end{}; + ssize_t src_index_start{}, src_index_end{}; + + bool isSrcValid() const + { + return !isSrcEmpty(); + } + bool isSrcEmpty() const + { + return src_index_start >= src_index_end; + } + size_t srcSize() const + { + return src_index_end - src_index_start; + } + std::string_view getSrcStrView(const char * src_data, size_t size) const + { + return std::string_view{src_data + src_index_start, size}; + } + void srcMoveByOffset(size_t size) + { + src_index_start += size; + } + void srcSkipChar(const char * src_data) + { + auto size = BinCharSizeFromHead(src_data[src_index_start]); + srcMoveByOffset(size); + } + bool patternEmpty() const + { + return pattern_index_start >= pattern_index_end; + } + void makeSrcInvalid() + { + src_index_start = src_index_end; + } + }; + + // check str equality + // - make src invalid if remain size if smaller than required + bool matchStrEqual(const std::string_view & src, MatchDesc & desc) const + { + const auto & match_str = match_sub_str[desc.match_str_index_start]; + if (desc.srcSize() < match_str.size()) + { + desc.makeSrcInvalid(); + return false; + } + if (DB::RawStrEqualCompare(desc.getSrcStrView(src.data(), match_str.size()), match_str)) + { + return false; + } + desc.match_str_index_start++; + desc.srcMoveByOffset(match_str.size()); + return true; + } + + // match from start exactly + // - return true if meet % + // - return false if failed to match else true + bool matchExactly(const std::string_view & src, MatchDesc & cur_match_desc) const + { + // match from start + for (; !cur_match_desc.patternEmpty(); cur_match_desc.pattern_index_start++) + { + const auto & type = match_types[cur_match_desc.pattern_index_start]; + if (type == MatchType::Any) + { + // break from loop + break; + } + + if (type == MatchType::Match) + { + if (!matchStrEqual(src, cur_match_desc)) + return false; + } + else + { + // src must be not empty + if (!cur_match_desc.isSrcValid()) + return false; + cur_match_desc.srcSkipChar(src.data()); + } + } + return true; + }; + + // match from end exactly + // - return true if meet % + // - return false if failed to match else true + bool matchExactlyReverse(const std::string_view & src, MatchDesc & cur_match_desc) const + { + for (; !cur_match_desc.patternEmpty(); --cur_match_desc.pattern_index_end) + { + const auto & type = match_types[cur_match_desc.pattern_index_end - 1]; + if (type == MatchType::Any) + { + break; + } + + if (type == MatchType::Match) + { + const auto & match_str = match_sub_str[cur_match_desc.match_str_index_end - 1]; + if (cur_match_desc.srcSize() < match_str.size()) + { + return false; + } + + if (DB::RawStrEqualCompare({src.data() + cur_match_desc.src_index_end - match_str.size(), match_str.size()}, match_str)) + { + return false; + } + cur_match_desc.match_str_index_end--; + cur_match_desc.src_index_end -= match_str.size(); + } + else + { + // src must be not empty + if (!cur_match_desc.isSrcValid()) + return false; + + auto size = BinCharSizeFromEnd(&src[cur_match_desc.src_index_end - 1], &src[cur_match_desc.src_index_start]); + cur_match_desc.src_index_end -= size; // remove from end + } + } + return true; + }; + + // search by pattern `...%..%` + // - return true if meet % + // - return false if failed to search + bool searchByPattern(const std::string_view & src, MatchDesc & desc) const + { + assert(match_types[desc.pattern_index_end - 1] == MatchType::Any); + assert(!desc.patternEmpty()); + + // leading `MatchType::One` can be removed first + for (; match_types[desc.pattern_index_start] == MatchType::One; desc.pattern_index_start++) + { + // src must be not empty + if (!desc.isSrcValid()) + return false; + desc.srcSkipChar(src.data()); + } + + if (match_types[desc.pattern_index_start] == MatchType::Any) + { + return true; + } + + // current type is MatchType::Match + // loop: + // - search next position of match sub str + // - if position found, start to match exactly + // - if match fail, fallback to loop + // - if match success, return match end pos + // - if position not found, return with fail. + for (;;) + { + const auto & match_str = match_sub_str[desc.match_str_index_start]; + auto src_view = desc.getSrcStrView(src.data(), desc.srcSize()); + auto pos = std::string_view::npos; + + // search sub str + // - seachers like `ASCIICaseSensitiveStringSearcher` or `Volnitsky` are too heavy for small str + // - TODO: optimize strstr search by simd + { + pos = src_view.find(match_str); + // pos = sse2_strstr(src_view, match_str); + } + + if (pos == std::string_view::npos) + { + return false; + } + else + { + // move to sub str position + desc.src_index_start = pos + src_view.data() - src.data(); + + MatchDesc new_desc = desc; + new_desc.srcMoveByOffset(match_str.size()); // start to check rest + new_desc.match_str_index_start++; + new_desc.pattern_index_start++; + + if (!matchExactly(src, new_desc)) + { + if (!new_desc.isSrcValid()) + return false; + // skip one char and restart to search + desc.srcSkipChar(src.data()); + } + else + { + desc = new_desc; + return true; + } + } + } + }; + + bool match(std::string_view src) const + { + MatchDesc cur_match_desc; + { + cur_match_desc.pattern_index_end = match_types.size(); + cur_match_desc.match_str_index_end = match_sub_str.size(); + cur_match_desc.src_index_end = src.size(); + } + + // if pattern starts or ends with `MatchType::Match` or `MatchType::One`, match exactly + { + // match from start + if (!matchExactly(src, cur_match_desc)) + { + return false; + } + // match from end + if (!matchExactlyReverse(src, cur_match_desc)) + { + return false; + } + } + + // if remain pattern is empty, src must be empty + if (cur_match_desc.patternEmpty()) + { + return cur_match_desc.isSrcEmpty(); + } + + assert(match_types[cur_match_desc.pattern_index_end - 1] == MatchType::Any); + + // remain pattern should be %..%...% + // search sub str one by one based on greedy rule + for (;;) + { + assert(match_types[cur_match_desc.pattern_index_start] == MatchType::Any); + + // move to next match type + cur_match_desc.pattern_index_start++; + + if (cur_match_desc.patternEmpty()) // if % is the last one + break; + + if (!searchByPattern(src, cur_match_desc)) + return false; + } + return true; + } + + enum class MatchType + { + Match, + One, + Any, + }; + + std::vector match_types; + std::vector match_sub_str; + char escape{}; +}; +} // namespace TiDB + +namespace DB +{ +template +ALWAYS_INLINE inline void BinStringPatternMatch( + const ColumnString::Chars_t & a_data, + const ColumnString::Offsets & a_offsets, + const std::string_view & pattern_str, + uint8_t escape_char, + Result & c) +{ + TiDB::BinStrPattern matcher; + matcher.compile(pattern_str, escape_char); + LoopOneColumn(a_data, a_offsets, a_offsets.size(), [&](const std::string_view & view, size_t i) { + c[i] = revert ^ matcher.match(view); + }); +} + +template +ALWAYS_INLINE inline bool StringPatternMatch( + const ColumnString::Chars_t & a_data, + const ColumnString::Offsets & a_offsets, + const std::string_view & pattern_str, + uint8_t escape_char, + const TiDB::TiDBCollatorPtr & collator, + Result & c) +{ + bool use_optimized_path = false; + + switch (collator->getCollatorId()) + { + case TiDB::ITiDBCollator::UTF8MB4_BIN: + case TiDB::ITiDBCollator::UTF8_BIN: + { + BinStringPatternMatch(a_data, a_offsets, pattern_str, escape_char, c); + use_optimized_path = true; + break; + } + case TiDB::ITiDBCollator::BINARY: + case TiDB::ITiDBCollator::ASCII_BIN: + case TiDB::ITiDBCollator::LATIN1_BIN: + { + BinStringPatternMatch(a_data, a_offsets, pattern_str, escape_char, c); + use_optimized_path = true; + break; + } + + default: + break; + } + return use_optimized_path; +} +} // namespace DB \ No newline at end of file diff --git a/dbms/src/Functions/FunctionsStringSearch.cpp b/dbms/src/Functions/FunctionsStringSearch.cpp index f0c6cd6f303..5da3ee55e60 100644 --- a/dbms/src/Functions/FunctionsStringSearch.cpp +++ b/dbms/src/Functions/FunctionsStringSearch.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -504,14 +505,14 @@ struct MatchImpl /// fully supported.(Only case sensitive/insensitive is supported) if (like && collator != nullptr) { - auto matcher = collator->pattern(); - matcher->compile(orig_pattern, escape_char); - size_t size = offsets.size(); - size_t prev_offset = 0; - for (size_t i = 0; i < size; ++i) + bool use_optimized_path = StringPatternMatch(data, offsets, orig_pattern, escape_char, collator, res); + if (!use_optimized_path) { - res[i] = revert ^ matcher->match(reinterpret_cast(&data[prev_offset]), offsets[i] - prev_offset - 1); - prev_offset = offsets[i]; + auto matcher = collator->pattern(); + matcher->compile(orig_pattern, escape_char); + LoopOneColumn(data, offsets, offsets.size(), [&](const std::string_view & view, size_t i) { + res[i] = revert ^ matcher->match(view.data(), view.size()); + }); } return; } @@ -1930,18 +1931,18 @@ class FunctionStringReplace : public IFunction const String & match_type, ColumnWithTypeAndName & column_result) const { - const ColumnConst * c1_const = typeid_cast(column_needle.get()); - const ColumnConst * c2_const = typeid_cast(column_replacement.get()); - String needle = c1_const->getValue(); - String replacement = c2_const->getValue(); + const auto * c1_const = typeid_cast(column_needle.get()); + const auto * c2_const = typeid_cast(column_replacement.get()); + auto needle = c1_const->getValue(); + auto replacement = c2_const->getValue(); - if (const ColumnString * col = checkAndGetColumn(column_src.get())) + if (const auto * col = checkAndGetColumn(column_src.get())) { auto col_res = ColumnString::create(); Impl::vector(col->getChars(), col->getOffsets(), needle, replacement, pos, occ, match_type, collator, col_res->getChars(), col_res->getOffsets()); column_result.column = std::move(col_res); } - else if (const ColumnFixedString * col = checkAndGetColumn(column_src.get())) + else if (const auto * col = checkAndGetColumn(column_src.get())) { auto col_res = ColumnString::create(); Impl::vectorFixed(col->getChars(), col->getN(), needle, replacement, pos, occ, match_type, collator, col_res->getChars(), col_res->getOffsets()); @@ -1964,17 +1965,17 @@ class FunctionStringReplace : public IFunction { if constexpr (Impl::support_non_const_needle) { - const ColumnString * col_needle = typeid_cast(column_needle.get()); - const ColumnConst * col_replacement_const = typeid_cast(column_replacement.get()); - String replacement = col_replacement_const->getValue(); + const auto * col_needle = typeid_cast(column_needle.get()); + const auto * col_replacement_const = typeid_cast(column_replacement.get()); + auto replacement = col_replacement_const->getValue(); - if (const ColumnString * col = checkAndGetColumn(column_src.get())) + if (const auto * col = checkAndGetColumn(column_src.get())) { auto col_res = ColumnString::create(); Impl::vectorNonConstNeedle(col->getChars(), col->getOffsets(), col_needle->getChars(), col_needle->getOffsets(), replacement, pos, occ, match_type, collator, col_res->getChars(), col_res->getOffsets()); column_result.column = std::move(col_res); } - else if (const ColumnFixedString * col = checkAndGetColumn(column_src.get())) + else if (const auto * col = checkAndGetColumn(column_src.get())) { auto col_res = ColumnString::create(); Impl::vectorFixedNonConstNeedle(col->getChars(), col->getN(), col_needle->getChars(), col_needle->getOffsets(), replacement, pos, occ, match_type, collator, col_res->getChars(), col_res->getOffsets()); @@ -2002,17 +2003,17 @@ class FunctionStringReplace : public IFunction { if constexpr (Impl::support_non_const_replacement) { - const ColumnConst * col_needle_const = typeid_cast(column_needle.get()); - String needle = col_needle_const->getValue(); - const ColumnString * col_replacement = typeid_cast(column_replacement.get()); + const auto * col_needle_const = typeid_cast(column_needle.get()); + auto needle = col_needle_const->getValue(); + const auto * col_replacement = typeid_cast(column_replacement.get()); - if (const ColumnString * col = checkAndGetColumn(column_src.get())) + if (const auto * col = checkAndGetColumn(column_src.get())) { auto col_res = ColumnString::create(); Impl::vectorNonConstReplacement(col->getChars(), col->getOffsets(), needle, col_replacement->getChars(), col_replacement->getOffsets(), pos, occ, match_type, collator, col_res->getChars(), col_res->getOffsets()); column_result.column = std::move(col_res); } - else if (const ColumnFixedString * col = checkAndGetColumn(column_src.get())) + else if (const auto * col = checkAndGetColumn(column_src.get())) { auto col_res = ColumnString::create(); Impl::vectorFixedNonConstReplacement(col->getChars(), col->getN(), needle, col_replacement->getChars(), col_replacement->getOffsets(), pos, occ, match_type, collator, col_res->getChars(), col_res->getOffsets()); @@ -2040,16 +2041,16 @@ class FunctionStringReplace : public IFunction { if constexpr (Impl::support_non_const_needle && Impl::support_non_const_replacement) { - const ColumnString * col_needle = typeid_cast(column_needle.get()); - const ColumnString * col_replacement = typeid_cast(column_replacement.get()); + const auto * col_needle = typeid_cast(column_needle.get()); + const auto * col_replacement = typeid_cast(column_replacement.get()); - if (const ColumnString * col = checkAndGetColumn(column_src.get())) + if (const auto * col = checkAndGetColumn(column_src.get())) { auto col_res = ColumnString::create(); Impl::vectorNonConstNeedleReplacement(col->getChars(), col->getOffsets(), col_needle->getChars(), col_needle->getOffsets(), col_replacement->getChars(), col_replacement->getOffsets(), pos, occ, match_type, collator, col_res->getChars(), col_res->getOffsets()); column_result.column = std::move(col_res); } - else if (const ColumnFixedString * col = checkAndGetColumn(column_src.get())) + else if (const auto * col = checkAndGetColumn(column_src.get())) { auto col_res = ColumnString::create(); Impl::vectorFixedNonConstNeedleReplacement(col->getChars(), col->getN(), col_needle->getChars(), col_needle->getOffsets(), col_replacement->getChars(), col_replacement->getOffsets(), pos, occ, match_type, collator, col_res->getChars(), col_res->getOffsets()); @@ -2065,7 +2066,8 @@ class FunctionStringReplace : public IFunction throw Exception("Argument at index 2 and 3 for function replace must be constant", ErrorCodes::ILLEGAL_COLUMN); } } - TiDB::TiDBCollatorPtr collator; + + TiDB::TiDBCollatorPtr collator{}; }; struct NamePosition diff --git a/dbms/src/Storages/Transaction/Collator.cpp b/dbms/src/Storages/Transaction/Collator.cpp index fc40701e3c5..a8434cd7eb7 100644 --- a/dbms/src/Storages/Transaction/Collator.cpp +++ b/dbms/src/Storages/Transaction/Collator.cpp @@ -17,8 +17,6 @@ #include #include -#include - namespace DB::ErrorCodes { extern const int LOGICAL_ERROR; diff --git a/dbms/src/Storages/Transaction/tests/gtest_tidb_collator.cpp b/dbms/src/Storages/Transaction/tests/gtest_tidb_collator.cpp index 13c51dba2db..904cefb26ef 100644 --- a/dbms/src/Storages/Transaction/tests/gtest_tidb_collator.cpp +++ b/dbms/src/Storages/Transaction/tests/gtest_tidb_collator.cpp @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include #include #include #include @@ -108,8 +109,82 @@ const typename CollatorCases::PatternCase CollatorCases::pattern_cases[] = { {{"ÀÀ", {false, false, false, false, false}}, {"aÀÀ", {false, false, true, false, true}}, {"ÀÀÀa", {true, true, true, true, true}}, {"aÀÀÀ", {false, false, true, false, true}}}}, {"___a", {{"中a", {true, true, false, false, false}}, {"中文字a", {false, false, true, true, true}}}}, {"𐐭", {{"𐐨", {false, false, true, false, false}}}}, + { + "%pending%deposits%", + { + {"riously after the carefully pending foxes. deposits are careful", {true, true, true, true, true}}, + {"pendingdeposits", {true, true, true, true, true}}, + {"pendingdeposits", {true, true, true, true, true}}, + }, + }, + { + "1234567\\", // `ESCAPE` at last + { + {"1234567\\", {true, true, true, true, true}}, + {"1234567", {false, false, false, false, false}}, + {"1234567\\1", {false, false, false, false, false}}, + }, + }, + { + "1234567\\910", // `ESCAPE` at middle + { + {"1234567\\910", {false, false, false, false, false}}, + {"1234567910", {true, true, true, true, true}}, + }, + }, + { + "%__", // test match from end + { + {"1", {false, false, false, false, false}}, // 1 bytes + {"À", {true, true, false, false, false}}, // 2 bytes + {"12", {true, true, true, true, true}}, // 2 bytes + {"中", {true, true, false, false, false}}, // 3 bytes + {"À1", {true, true, true, true, true}}, // 3 bytes + {"ÀÀ", {true, true, true, true, true}}, // 4 bytes + {"𒀈", {true, true, false, false, false}}, // 4 bytes 1 char + {"À中", {true, true, true, true, true}}, // 5 bytes + {"中中", {true, true, true, true, true}}, // 6 bytes + }, + }, + { + "%__%", // test + { + {"1", {false, false, false, false, false}}, // 1 bytes + {"À", {true, true, false, false, false}}, // 2 bytes + {"12", {true, true, true, true, true}}, // 2 bytes + {"中", {true, true, false, false, false}}, // 3 bytes + {"À1", {true, true, true, true, true}}, // 3 bytes + {"ÀÀ", {true, true, true, true, true}}, // 4 bytes + {"𒀈", {true, true, false, false, false}}, // 4 bytes 1 char + }, + }, + { + "%一_二", // test match from end + { + {"xx一a二", {true, true, true, true, true}}, + {"xx一À二", {false, false, true, true, true}}, + }, + }, + { + "%一_三%四五六%七", + { + {"一二三四五七", {false, false, false, false, false}}, + {"0一二三四五六.七", {false, false, true, true, true}}, + {"一二四五六七", {false, false, false, false, false}}, + {"一2三.四五六...七", {true, true, true, true, true}}, + }, + }, + { + "%一_三%", + { + {"000一二3", {false, false, false, false, false}}, + {"000一", {false, false, false, false, false}}, + }, + }, }; +static constexpr char ESCAPE = '\\'; + template void testCollator() { @@ -130,18 +205,64 @@ void testCollator() std::string buf; ASSERT_EQ(collator->sortKey(s.data(), s.length(), buf).toString(), ans); } - auto pattern = collator->pattern(); - for (const auto & c : CollatorCases::pattern_cases) { - const std::string & p = c.first; - pattern->compile(p, '\\'); - const auto & inner_cases = c.second; - for (const auto & inner_c : inner_cases) + TiDB::BinStrPattern matcher; + matcher.compile("%%%", '%'); + ASSERT_TRUE(matcher.match("%%")); + matcher.compile("%%", '.'); + ASSERT_TRUE(matcher.match("")); + + auto pattern = collator->pattern(); + pattern->compile("%%%", '%'); + ASSERT_TRUE(pattern->match("%%", 2)); + } + { + auto pattern = collator->pattern(); + for (const auto & c : CollatorCases::pattern_cases) { - const std::string & s = inner_c.first; - bool ans = std::get(inner_c.second); - std::cout << "Pattern case (" << p << ", " << s << ", " << ans << ")" << std::endl; - ASSERT_EQ(pattern->match(s.data(), s.length()), ans); + const std::string & p = c.first; + const auto & inner_cases = c.second; + + ColumnString::Chars_t strs; + ColumnString::Offsets offsets; + std::vector res; + { // init data + ColumnString::Offset current_new_offset = 0; + for (const auto & inner_c : inner_cases) + { + const auto s = inner_c.first + char(0); + { + current_new_offset += s.size(); + offsets.push_back(current_new_offset); + } + { + strs.resize(strs.size() + s.size()); + memcpySmallAllowReadWriteOverflow15( + &strs[strs.size() - s.size()], + s.data(), + s.size()); + } + res.emplace_back(0); + } + } + if (!StringPatternMatch(strs, offsets, p, ESCAPE, collator, res)) + { + pattern->compile(p, ESCAPE); + for (size_t idx = 0; idx < std::size(inner_cases); ++idx) + { + const auto & inner_c = inner_cases[idx]; + const std::string & s = inner_c.first; + res[idx] = pattern->match(s.data(), s.length()); + } + } + + for (size_t idx = 0; idx < std::size(inner_cases); ++idx) + { + const auto & inner_c = inner_cases[idx]; + bool ans = std::get(inner_c.second); + std::cout << "Pattern case (" << p << ", " << inner_c.first << ", " << ans << ")" << std::endl; + ASSERT_EQ(res[idx], ans); + } } } }