diff --git a/dbms/src/Common/UTF8Helpers.h b/dbms/src/Common/UTF8Helpers.h index e6ef2577f32..10fb798e55e 100644 --- a/dbms/src/Common/UTF8Helpers.h +++ b/dbms/src/Common/UTF8Helpers.h @@ -52,7 +52,7 @@ inline void syncForward(const UInt8 *& s, const UInt8 * const end) /// returns UTF-8 code point sequence length judging by it's first octet inline size_t seqLength(const UInt8 first_octet) { - if (first_octet < 0x80u) + if (first_octet < 0x80 || first_octet >= 0xF8) /// The specs of UTF-8. return 1; const size_t bits = 8; diff --git a/dbms/src/Functions/CharUtil.h b/dbms/src/Functions/CharUtil.h index 59de5b897ab..0e0f1fdde5c 100644 --- a/dbms/src/Functions/CharUtil.h +++ b/dbms/src/Functions/CharUtil.h @@ -230,6 +230,141 @@ const CaseRange caseRange[]{ {0x1F00, 0x1F07, {8, 0, 8}}, {0x1F08, 0x1F0F, {0, -8, 0}}, {0x1F10, 0x1F15, {8, 0, 8}}, + {0x1F18, 0x1F1D, {0, -8, 0}}, + {0x1F20, 0x1F27, {8, 0, 8}}, + {0x1F28, 0x1F2F, {0, -8, 0}}, + {0x1F30, 0x1F37, {8, 0, 8}}, + {0x1F38, 0x1F3F, {0, -8, 0}}, + {0x1F40, 0x1F45, {8, 0, 8}}, + {0x1F48, 0x1F4D, {0, -8, 0}}, + {0x1F51, 0x1F51, {8, 0, 8}}, + {0x1F53, 0x1F53, {8, 0, 8}}, + {0x1F55, 0x1F55, {8, 0, 8}}, + {0x1F57, 0x1F57, {8, 0, 8}}, + {0x1F59, 0x1F59, {0, -8, 0}}, + {0x1F5B, 0x1F5B, {0, -8, 0}}, + {0x1F5D, 0x1F5D, {0, -8, 0}}, + {0x1F5F, 0x1F5F, {0, -8, 0}}, + {0x1F60, 0x1F67, {8, 0, 8}}, + {0x1F68, 0x1F6F, {0, -8, 0}}, + {0x1F70, 0x1F71, {74, 0, 74}}, + {0x1F72, 0x1F75, {86, 0, 86}}, + {0x1F76, 0x1F77, {100, 0, 100}}, + {0x1F78, 0x1F79, {128, 0, 128}}, + {0x1F7A, 0x1F7B, {112, 0, 112}}, + {0x1F7C, 0x1F7D, {126, 0, 126}}, + {0x1F80, 0x1F87, {8, 0, 8}}, + {0x1F88, 0x1F8F, {0, -8, 0}}, + {0x1F90, 0x1F97, {8, 0, 8}}, + {0x1F98, 0x1F9F, {0, -8, 0}}, + {0x1FA0, 0x1FA7, {8, 0, 8}}, + {0x1FA8, 0x1FAF, {0, -8, 0}}, + {0x1FB0, 0x1FB1, {8, 0, 8}}, + {0x1FB3, 0x1FB3, {9, 0, 9}}, + {0x1FB8, 0x1FB9, {0, -8, 0}}, + {0x1FBA, 0x1FBB, {0, -74, 0}}, + {0x1FBC, 0x1FBC, {0, -9, 0}}, + {0x1FBE, 0x1FBE, {-7205, 0, -7205}}, + {0x1FC3, 0x1FC3, {9, 0, 9}}, + {0x1FC8, 0x1FCB, {0, -86, 0}}, + {0x1FCC, 0x1FCC, {0, -9, 0}}, + {0x1FD0, 0x1FD1, {8, 0, 8}}, + {0x1FD8, 0x1FD9, {0, -8, 0}}, + {0x1FDA, 0x1FDB, {0, -100, 0}}, + {0x1FE0, 0x1FE1, {8, 0, 8}}, + {0x1FE5, 0x1FE5, {7, 0, 7}}, + {0x1FE8, 0x1FE9, {0, -8, 0}}, + {0x1FEA, 0x1FEB, {0, -112, 0}}, + {0x1FEC, 0x1FEC, {0, -7, 0}}, + {0x1FF3, 0x1FF3, {9, 0, 9}}, + {0x1FF8, 0x1FF9, {0, -128, 0}}, + {0x1FFA, 0x1FFB, {0, -126, 0}}, + {0x1FFC, 0x1FFC, {0, -9, 0}}, + {0x2126, 0x2126, {0, -7517, 0}}, + {0x212A, 0x212A, {0, -8383, 0}}, + {0x212B, 0x212B, {0, -8262, 0}}, + {0x2132, 0x2132, {0, 28, 0}}, + {0x214E, 0x214E, {-28, 0, -28}}, + {0x2160, 0x216F, {0, 16, 0}}, + {0x2170, 0x217F, {-16, 0, -16}}, + {0x2183, 0x2184, {UpperLower, UpperLower, UpperLower}}, + {0x24B6, 0x24CF, {0, 26, 0}}, + {0x24D0, 0x24E9, {-26, 0, -26}}, + {0x2C00, 0x2C2F, {0, 48, 0}}, + {0x2C30, 0x2C5F, {-48, 0, -48}}, + {0x2C60, 0x2C61, {UpperLower, UpperLower, UpperLower}}, + {0x2C62, 0x2C62, {0, -10743, 0}}, + {0x2C63, 0x2C63, {0, -3814, 0}}, + {0x2C64, 0x2C64, {0, -10727, 0}}, + {0x2C65, 0x2C65, {-10795, 0, -10795}}, + {0x2C66, 0x2C66, {-10792, 0, -10792}}, + {0x2C67, 0x2C6C, {UpperLower, UpperLower, UpperLower}}, + {0x2C6D, 0x2C6D, {0, -10780, 0}}, + {0x2C6E, 0x2C6E, {0, -10749, 0}}, + {0x2C6F, 0x2C6F, {0, -10783, 0}}, + {0x2C70, 0x2C70, {0, -10782, 0}}, + {0x2C72, 0x2C73, {UpperLower, UpperLower, UpperLower}}, + {0x2C75, 0x2C76, {UpperLower, UpperLower, UpperLower}}, + {0x2C7E, 0x2C7F, {0, -10815, 0}}, + {0x2C80, 0x2CE3, {UpperLower, UpperLower, UpperLower}}, + {0x2CEB, 0x2CEE, {UpperLower, UpperLower, UpperLower}}, + {0x2CF2, 0x2CF3, {UpperLower, UpperLower, UpperLower}}, + {0x2D00, 0x2D25, {-7264, 0, -7264}}, + {0x2D27, 0x2D27, {-7264, 0, -7264}}, + {0x2D2D, 0x2D2D, {-7264, 0, -7264}}, + {0xA640, 0xA66D, {UpperLower, UpperLower, UpperLower}}, + {0xA680, 0xA69B, {UpperLower, UpperLower, UpperLower}}, + {0xA722, 0xA72F, {UpperLower, UpperLower, UpperLower}}, + {0xA732, 0xA76F, {UpperLower, UpperLower, UpperLower}}, + {0xA779, 0xA77C, {UpperLower, UpperLower, UpperLower}}, + {0xA77D, 0xA77D, {0, -35332, 0}}, + {0xA77E, 0xA787, {UpperLower, UpperLower, UpperLower}}, + {0xA78B, 0xA78C, {UpperLower, UpperLower, UpperLower}}, + {0xA78D, 0xA78D, {0, -42280, 0}}, + {0xA790, 0xA793, {UpperLower, UpperLower, UpperLower}}, + {0xA794, 0xA794, {48, 0, 48}}, + {0xA796, 0xA7A9, {UpperLower, UpperLower, UpperLower}}, + {0xA7AA, 0xA7AA, {0, -42308, 0}}, + {0xA7AB, 0xA7AB, {0, -42319, 0}}, + {0xA7AC, 0xA7AC, {0, -42315, 0}}, + {0xA7AD, 0xA7AD, {0, -42305, 0}}, + {0xA7AE, 0xA7AE, {0, -42308, 0}}, + {0xA7B0, 0xA7B0, {0, -42258, 0}}, + {0xA7B1, 0xA7B1, {0, -42282, 0}}, + {0xA7B2, 0xA7B2, {0, -42261, 0}}, + {0xA7B3, 0xA7B3, {0, 928, 0}}, + {0xA7B4, 0xA7C3, {UpperLower, UpperLower, UpperLower}}, + {0xA7C4, 0xA7C4, {0, -48, 0}}, + {0xA7C5, 0xA7C5, {0, -42307, 0}}, + {0xA7C6, 0xA7C6, {0, -35384, 0}}, + {0xA7C7, 0xA7CA, {UpperLower, UpperLower, UpperLower}}, + {0xA7D0, 0xA7D1, {UpperLower, UpperLower, UpperLower}}, + {0xA7D6, 0xA7D9, {UpperLower, UpperLower, UpperLower}}, + {0xA7F5, 0xA7F6, {UpperLower, UpperLower, UpperLower}}, + {0xAB53, 0xAB53, {-928, 0, -928}}, + {0xAB70, 0xABBF, {-38864, 0, -38864}}, + {0xFF21, 0xFF3A, {0, 32, 0}}, + {0xFF41, 0xFF5A, {-32, 0, -32}}, + {0x10400, 0x10427, {0, 40, 0}}, + {0x10428, 0x1044F, {-40, 0, -40}}, + {0x104B0, 0x104D3, {0, 40, 0}}, + {0x104D8, 0x104FB, {-40, 0, -40}}, + {0x10570, 0x1057A, {0, 39, 0}}, + {0x1057C, 0x1058A, {0, 39, 0}}, + {0x1058C, 0x10592, {0, 39, 0}}, + {0x10594, 0x10595, {0, 39, 0}}, + {0x10597, 0x105A1, {-39, 0, -39}}, + {0x105A3, 0x105B1, {-39, 0, -39}}, + {0x105B3, 0x105B9, {-39, 0, -39}}, + {0x105BB, 0x105BC, {-39, 0, -39}}, + {0x10C80, 0x10CB2, {0, 64, 0}}, + {0x10CC0, 0x10CF2, {-64, 0, -64}}, + {0x118A0, 0x118BF, {0, 32, 0}}, + {0x118C0, 0x118DF, {-32, 0, -32}}, + {0x16E40, 0x16E5F, {0, 32, 0}}, + {0x16E60, 0x16E7F, {-32, 0, -32}}, + {0x1E900, 0x1E921, {0, 34, 0}}, + {0x1E922, 0x1E943, {-34, 0, -34}}, }; inline int toCase(int _case, int ch) diff --git a/dbms/src/Functions/FunctionsString.cpp b/dbms/src/Functions/FunctionsString.cpp index b9fc749f6ae..c2e04e504a2 100644 --- a/dbms/src/Functions/FunctionsString.cpp +++ b/dbms/src/Functions/FunctionsString.cpp @@ -426,25 +426,62 @@ template __attribute__((always_inline)) inline void toCaseImplTiDB( - ConstPtr & src, - const ConstPtr src_end, - Ptr & dst) + const UInt8 *& src, + const UInt8 * src_end, + size_t offsets_pos, + ColumnString::Chars_t & dst_data, + IColumn::Offsets & dst_offsets, + bool & is_diff_offsets) { - if (src[0] <= ascii_upper_bound) + if (*src <= ascii_upper_bound) { + size_t dst_size = dst_data.size(); + dst_data.resize(dst_size + 1); if (*src >= not_case_lower_bound && *src <= not_case_upper_bound) - *dst++ = *src++ ^ flip_case_mask; + dst_data[dst_size] = *src++ ^ flip_case_mask; else - *dst++ = *src++; + dst_data[dst_size] = *src++; } else { static const Poco::UTF8Encoding utf8; - if (const auto chars = utf8.convert(to_case(utf8.convert(src)), dst, src_end - src)) - src += chars, dst += chars; - else - ++src, ++dst; + int src_sequence_length = utf8.sequenceLength(src, 1); + assert(src_sequence_length > 0); + if unlikely (src + src_sequence_length > src_end) + { + /// If this row has invalid utf-8 characters, just copy it to dst string and do not influence others + size_t dst_size = dst_data.size(); + dst_data.resize(src_end - src + dst_size); + memcpy(&dst_data[dst_size], src, src_end - src); + src = src_end; + return; + } + + int src_ch = utf8.convert(src); + if unlikely (src_ch == -1) + { + /// If this row has invalid utf-8 characters, just copy it to dst string and do not influence others + size_t dst_size = dst_data.size(); + dst_data.resize(dst_size + src_sequence_length); + memcpy(&dst_data[dst_size], src, src_sequence_length); + src += src_sequence_length; + return; + } + int dst_ch = to_case(src_ch); + int dst_sequence_length = utf8.convert(dst_ch, nullptr, 0); + size_t dst_size = dst_data.size(); + dst_data.resize(dst_size + dst_sequence_length); + utf8.convert(dst_ch, &dst_data[dst_size], dst_sequence_length); + + if (dst_sequence_length != src_sequence_length) + { + assert((Int64)dst_offsets[offsets_pos] + dst_sequence_length - src_sequence_length >= 0); + dst_offsets[offsets_pos] += dst_sequence_length - src_sequence_length; + is_diff_offsets = true; + } + + src += src_sequence_length; } } @@ -547,12 +584,19 @@ TIFLASH_DECLARE_MULTITARGET_FUNCTION_TP( to_case), void, lowerUpperUTF8ArrayImplTiDB, - (src, src_end, dst), - (ConstPtr & src, - const ConstPtr src_end, - Ptr & dst), - { + (src_data, src_offsets, dst_data, dst_offsets), + (const ColumnString::Chars_t & src_data, + const IColumn::Offsets & src_offsets, + ColumnString::Chars_t & dst_data, + IColumn::Offsets & dst_offsets), + { + dst_data.reserve(src_data.size()); + dst_offsets.assign(src_offsets); static const auto flip_mask = SimdWord::template fromSingle(flip_case_mask); + const UInt8 *src = src_data.data(), *src_end = src_data.data() + src_data.size(); + auto * begin = src; + bool is_diff_offsets = false; + size_t offsets_pos = 0; while (src + WORD_SIZE < src_end) { auto word = SimdWord::fromUnaligned(src); @@ -567,31 +611,71 @@ TIFLASH_DECLARE_MULTITARGET_FUNCTION_TP( range_check.as_int8 = (word.as_int8 >= lower_bounds.as_int8) & (word.as_int8 <= upper_bounds.as_int8); selected.as_int8 = range_check.as_int8 & flip_mask.as_int8; word.as_int8 ^= selected.as_int8; - word.toUnaligned(dst); + size_t dst_size = dst_data.size(); + dst_data.resize(dst_size + WORD_SIZE); + word.toUnaligned(&dst_data[dst_size]); src += WORD_SIZE; - dst += WORD_SIZE; } else { + size_t offset_from_begin = src - begin; + while (offset_from_begin >= src_offsets[offsets_pos]) + ++offsets_pos; auto expected_end = src + WORD_SIZE; - while (src < expected_end) + while (true) + { + const UInt8 * row_end = begin + src_offsets[offsets_pos]; + assert(row_end >= src); + auto end = std::min(expected_end, row_end); + while (src < end) + { + toCaseImplTiDB< + not_case_lower_bound, + not_case_upper_bound, + ascii_upper_bound, + flip_case_mask, + to_case>(src, row_end, offsets_pos, dst_data, dst_offsets, is_diff_offsets); + } + if (src >= expected_end) + break; + ++offsets_pos; + } + } + } + + if (src < src_end) + { + size_t offset_from_begin = src - begin; + while (offset_from_begin >= src_offsets[offsets_pos]) + ++offsets_pos; + + while (src < src_end) + { + const UInt8 * row_end = begin + src_offsets[offsets_pos]; + assert(row_end >= src); + while (src < row_end) { toCaseImplTiDB< not_case_lower_bound, not_case_upper_bound, ascii_upper_bound, flip_case_mask, - to_case>(src, src_end, dst); + to_case>(src, row_end, offsets_pos, dst_data, dst_offsets, is_diff_offsets); } + ++offsets_pos; + } + } + + if unlikely (is_diff_offsets) + { + Int64 diff = 0; + for (size_t i = 0; i < dst_offsets.size(); ++i) + { + /// diff is the cumulative offset difference from 0 to the i position + diff += (Int64)dst_offsets[i] - (Int64)src_offsets[i]; + dst_offsets[i] = src_offsets[i] + diff; } } - while (src < src_end) - toCaseImplTiDB< - not_case_lower_bound, - not_case_upper_bound, - ascii_upper_bound, - flip_case_mask, - to_case>(src, src_end, dst); }) } // namespace @@ -622,66 +706,22 @@ void TiDBLowerUpperUTF8Impl ColumnString::Chars_t & res_data, IColumn::Offsets & res_offsets) { - res_data.resize(data.size()); - res_offsets.assign(offsets); - array(data.data(), data.data() + data.size(), res_data.data()); + lowerUpperUTF8ArrayImplTiDB( + data, + offsets, + res_data, + res_offsets); } template void TiDBLowerUpperUTF8Impl::vectorFixed( - const ColumnString::Chars_t & data, + const ColumnString::Chars_t & /*data*/, size_t /*n*/, - ColumnString::Chars_t & res_data) -{ - res_data.resize(data.size()); - array(data.data(), data.data() + data.size(), res_data.data()); -} - -template -void TiDBLowerUpperUTF8Impl::constant( - const std::string & data, - std::string & res_data) + ColumnString::Chars_t & /*res_data*/) { - res_data.resize(data.size()); - array(reinterpret_cast(data.data()), - reinterpret_cast(data.data() + data.size()), - reinterpret_cast(&res_data[0])); -} - -template -void TiDBLowerUpperUTF8Impl::toCase( - const UInt8 *& src, - const UInt8 * src_end, - UInt8 *& dst) -{ - toCaseImplTiDB< - not_case_lower_bound, - not_case_upper_bound, - ascii_upper_bound, - flip_case_mask, - to_case>(src, src_end, dst); -} - -template -void TiDBLowerUpperUTF8Impl::array( - const UInt8 * src, - const UInt8 * src_end, - UInt8 * dst) -{ - lowerUpperUTF8ArrayImplTiDB< - not_case_lower_bound, - not_case_upper_bound, - ascii_upper_bound, - flip_case_mask, - to_case>(src, src_end, dst); + throw Exception("Cannot apply function TiDBLowerUpperUTF8 to fixed string.", ErrorCodes::ILLEGAL_COLUMN); } /** If the string is encoded in UTF-8, then it selects a substring of code points in it. diff --git a/dbms/src/Functions/FunctionsString.h b/dbms/src/Functions/FunctionsString.h index df35822eb93..c174420eafe 100644 --- a/dbms/src/Functions/FunctionsString.h +++ b/dbms/src/Functions/FunctionsString.h @@ -189,13 +189,13 @@ class FunctionStringToString : public IFunction void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) const override { const ColumnPtr column = block.getByPosition(arguments[0]).column; - if (const ColumnString * col = checkAndGetColumn(column.get())) + if (const auto * col = checkAndGetColumn(column.get())) { auto col_res = ColumnString::create(); Impl::vector(col->getChars(), col->getOffsets(), col_res->getChars(), col_res->getOffsets()); block.getByPosition(result).column = std::move(col_res); } - else if (const ColumnFixedString * col = checkAndGetColumn(column.get())) + else if (const auto * col = checkAndGetColumn(column.get())) { auto col_res = ColumnFixedString::create(col->getN()); Impl::vectorFixed(col->getChars(), col->getN(), col_res->getChars()); @@ -220,17 +220,9 @@ struct TiDBLowerUpperUTF8Impl static void vectorFixed(const ColumnString::Chars_t & data, size_t n, ColumnString::Chars_t & res_data); - static void constant(const std::string & data, std::string & res_data); - - /** Converts a single code point starting at `src` to desired case, storing result starting at `dst`. - * `src` and `dst` are incremented by corresponding sequence lengths. */ - static void toCase(const UInt8 *& src, const UInt8 * src_end, UInt8 *& dst); - private: static constexpr auto ascii_upper_bound = '\x7f'; static constexpr auto flip_case_mask = 'A' ^ 'a'; - - static void array(const UInt8 * src, const UInt8 * src_end, UInt8 * dst); }; struct TiDBLowerUpperBinaryImpl diff --git a/dbms/src/Functions/tests/gtest_strings_lower.cpp b/dbms/src/Functions/tests/gtest_strings_lower.cpp index 6d384831564..e53ea07f3cd 100644 --- a/dbms/src/Functions/tests/gtest_strings_lower.cpp +++ b/dbms/src/Functions/tests/gtest_strings_lower.cpp @@ -63,9 +63,35 @@ class StringLower : public DB::tests::FunctionTest TEST_F(StringLower, lowerAll) { - std::vector> candidate_strings = {"one WEEK’S time TEST", "abc测试def", "ABCテストabc", "ЀЁЂѓЄЅІїЈЉЊЋЌѝЎЏ", "+Ѐ-ё*Ђ/ѓ!Є@Ѕ#І$@Ї%Ј……љ&Њ(Ћ)Ќ¥Ѝ#Ў@Џ!^", "ΑΒΓΔΕΖΗΘικΛΜΝΞΟΠΡΣτΥΦΧΨωΣ", "▲Α▼Βγ➨ΔΕ☎ΖΗ✂ΘΙ€ΚΛ♫ΜΝ✓ΞΟ✚ΠΡ℉ΣΤ♥ΥΦ♖ΧΨ♘Ω★Σ✕", "թՓՁՋՐՉՃԺԾՔՈԵՌՏԸՒԻՕՊԱՍԴՖԳՀՅԿԼԽԶՂՑՎԲՆմՇ"}; - std::vector> lower_case_strings = {"one week’s time test", "abc测试def", "abcテストabc", "ѐёђѓєѕіїјљњћќѝўџ", "+ѐ-ё*ђ/ѓ!є@ѕ#і$@ї%ј……љ&њ(ћ)ќ¥ѝ#ў@џ!^", "αβγδεζηθικλμνξοπρστυφχψωσ", "▲α▼βγ➨δε☎ζη✂θι€κλ♫μν✓ξο✚πρ℉στ♥υφ♖χψ♘ω★σ✕", "թփձջրչճժծքոեռտըւիօպասդֆգհյկլխզղցվբնմշ"}; - + std::vector> candidate_strings + = {"one WEEK'S time TEST", + "abc测试def", + "ABCテストabc", + "ЀЁЂѓЄЅІїЈЉЊЋЌѝЎЏ", + "+Ѐ-ё*Ђ/ѓ!Є@Ѕ#І$@Ї%Ј……љ&Њ(Ћ)Ќ¥Ѝ#Ў@Џ!^", + "İaSdİİİİdDS", + "ΑΒΓΔΕΖΗΘικΛΜΝΞΟΠΡΣτΥΦΧΨωΣ", + "ȺDȺİȺaȺȾOİȺ", + "TEST_WRONG_UTF8_1\x80\xe0\x21", + "▲Α▼Βγ➨ΔΕ☎ΖΗ✂ΘΙ€ΚΛ♫ΜΝ✓ΞΟ✚ΠΡ℉ΣΤ♥ΥΦ♖ΧΨ♘Ω★Σ✕", + "ⱮⱭȺΩABCDEFGHIJKLMNOPꞍaȾ", + "TEST_WRONG_UTF8_2\xf1\x22", + "թՓՁՋՐՉՃԺԾՔՈԵՌՏԸՒԻՕՊԱՍԴՖԳՀՅԿԼԽԶՂՑՎԲՆմՇ"}; + + std::vector> lower_case_strings + = {"one week's time test", + "abc测试def", + "abcテストabc", + "ѐёђѓєѕіїјљњћќѝўџ", + "+ѐ-ё*ђ/ѓ!є@ѕ#і$@ї%ј……љ&њ(ћ)ќ¥ѝ#ў@џ!^", + "iasdiiiidds", + "αβγδεζηθικλμνξοπρστυφχψωσ", + "ⱥdⱥiⱥaⱥⱦoiⱥ", + "test_wrong_utf8_1\x80\xe0\x21", + "▲α▼βγ➨δε☎ζη✂θι€κλ♫μν✓ξο✚πρ℉στ♥υφ♖χψ♘ω★σ✕", + "ɱɑⱥωabcdefghijklmnopɥaⱦ", + "test_wrong_utf8_2\xf1\x22", + "թփձջրչճժծքոեռտըւիօպասդֆգհյկլխզղցվբնմշ"}; ASSERT_COLUMN_EQ( toNullableVec(lower_case_strings), diff --git a/dbms/src/Functions/tests/gtest_strings_upper.cpp b/dbms/src/Functions/tests/gtest_strings_upper.cpp index 83fd5c88a8e..acf80258ffc 100644 --- a/dbms/src/Functions/tests/gtest_strings_upper.cpp +++ b/dbms/src/Functions/tests/gtest_strings_upper.cpp @@ -43,9 +43,16 @@ class StringUpper : public DB::tests::FunctionTest return createColumn>(v); } - static ColumnWithTypeAndName toVec(const std::vector & v) + static ColumnWithTypeAndName toVec(const std::vector> & v) { - return createColumn(v); + std::vector strings; + strings.reserve(v.size()); + for (std::optional s : v) + { + strings.push_back(s.value()); + } + + return createColumn(strings); } static ColumnWithTypeAndName toConst(const String & s) @@ -56,29 +63,43 @@ class StringUpper : public DB::tests::FunctionTest TEST_F(StringUpper, upperAll) { - ASSERT_COLUMN_EQ( - toNullableVec({"ONE WEEK’S TIME TEST", "ABC测试DEF", "ABCテストABC", "ЀЁЂЃЄЅІЇЈЉЊЋЌЍЎЏ", "+Ѐ-Ё*Ђ/Ѓ!Є@Ѕ#І$@Ї%Ј……Љ&Њ(Ћ)Ќ¥Ѝ#Ў@Џ!^", "ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩΣ", "▲Α▼ΒΓ➨ΔΕ☎ΖΗ✂ΘΙ€ΚΛ♫ΜΝ✓ΞΟ✚ΠΡ℉ΣΤ♥ΥΦ♖ΧΨ♘Ω★Σ✕", "ԹՓՁՋՐՉՃԺԾՔՈԵՌՏԸՒԻՕՊԱՍԴՖԳՀՅԿԼԽԶՂՑՎԲՆՄՇ"}), - executeFunction( - "upperUTF8", - toNullableVec({"one week’s time TEST", "abc测试DeF", "AbCテストAbC", "ѐёђѓєѕіїјЉЊЋЌЍЎЏ", "+ѐ-ё*ђ/ѓ!є@ѕ#і$@ї%ј……Љ&Њ(Ћ)Ќ¥Ѝ#Ў@Џ!^", "αβγδεζηθικλμνξοπρστυφχψως", "▲α▼βγ➨δε☎ζη✂θι€κλ♫μν✓ξο✚πρ℉στ♥υφ♖χψ♘ω★ς✕", "թփձջրչճժծքոեռտըւիօպասդֆգհյկլխզղցվբնմշ"}))); - - ASSERT_COLUMN_EQ( - toVec({"ONE WEEK’S TIME TEST", "ABC测试DEF", "ABCテストABC", "ЀЁЂЃЄЅІЇЈЉЊЋЌЍЎЏ", "+Ѐ-Ё*Ђ/Ѓ!Є@Ѕ#І$@Ї%Ј……Љ&Њ(Ћ)Ќ¥Ѝ#Ў@Џ!^", "ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩΣ", "▲Α▼ΒΓ➨ΔΕ☎ΖΗ✂ΘΙ€ΚΛ♫ΜΝ✓ΞΟ✚ΠΡ℉ΣΤ♥ΥΦ♖ΧΨ♘Ω★Σ✕", "ԹՓՁՋՐՉՃԺԾՔՈԵՌՏԸՒԻՕՊԱՍԴՖԳՀՅԿԼԽԶՂՑՎԲՆՄՇ"}), - executeFunction( - "upperUTF8", - toVec({"one week’s time TEST", "abc测试DeF", "AbCテストAbC", "ѐёђѓєѕіїјЉЊЋЌЍЎЏ", "+ѐ-ё*ђ/ѓ!є@ѕ#і$@ї%ј……Љ&Њ(Ћ)Ќ¥Ѝ#Ў@Џ!^", "αβγδεζηθικλμνξοπρστυφχψως", "▲α▼βγ➨δε☎ζη✂θι€κλ♫μν✓ξο✚πρ℉στ♥υφ♖χψ♘ω★ς✕", "թփձջրչճժծքոեռտըւիօպասդֆգհյկլխզղցվբնմշ"}))); + std::vector> candidate_strings + = {"one week's time TEST", + "abc测试DeF", + "AbCテストAbC", + "ѐёђѓєѕіїјЉЊЋЌЍЎЏ", + "+ѐ-ё*ђ/ѓ!є@ѕ#і$@ї%ј……Љ&Њ(Ћ)Ќ¥Ѝ#Ў@Џ!^", + "ſⱥⱦⱥaſfɫoomɑɱɒ", + "αβγδεζηθικλμνξοπρστυφχψως", + "test_wrong_utf8_1\x80\xe0\x21", + "▲α▼βγ➨δε☎ζη✂θι€κλ♫μν✓ξο✚πρ℉στ♥υφ♖χψ♘ω★ς✕", + "ȿɀabcdefghijklmnopɥı", + "test_wrong_utf8_2\xf1\x22", + "թփձջրչճժծքոեռտըւիօպասդֆգհյկլխզղցվբնմշ"}; + + std::vector> upper_case_strings + = {"ONE WEEK'S TIME TEST", + "ABC测试DEF", + "ABCテストABC", + "ЀЁЂЃЄЅІЇЈЉЊЋЌЍЎЏ", + "+Ѐ-Ё*Ђ/Ѓ!Є@Ѕ#І$@Ї%Ј……Љ&Њ(Ћ)Ќ¥Ѝ#Ў@Џ!^", + "SȺȾȺASFⱢOOMⱭⱮⱰ", + "ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩΣ", + "TEST_WRONG_UTF8_1\x80\xe0\x21", + "▲Α▼ΒΓ➨ΔΕ☎ΖΗ✂ΘΙ€ΚΛ♫ΜΝ✓ΞΟ✚ΠΡ℉ΣΤ♥ΥΦ♖ΧΨ♘Ω★Σ✕", + "ⱾⱿABCDEFGHIJKLMNOPꞍI", + "TEST_WRONG_UTF8_2\xf1\x22", + "ԹՓՁՋՐՉՃԺԾՔՈԵՌՏԸՒԻՕՊԱՍԴՖԳՀՅԿԼԽԶՂՑՎԲՆՄՇ"}; + + ASSERT_COLUMN_EQ(toNullableVec(upper_case_strings), executeFunction("upperUTF8", toNullableVec(candidate_strings))); + + ASSERT_COLUMN_EQ(toVec(upper_case_strings), executeFunction("upperUTF8", toVec(candidate_strings))); ASSERT_COLUMN_EQ( - toNullableVec({"one week’s time TEST", "abc测试DeF", "AbCテストAbC", "ѐёђѓєѕіїјЉЊЋЌЍЎЏ", "αβγδεζηθικλμνξοπρστυφχψως", "թփձջրչճժծքոեռտըւիօպասդֆգհյկլխզղցվբնմշ"}), - executeFunction( - "upperBinary", - toNullableVec({"one week’s time TEST", "abc测试DeF", "AbCテストAbC", "ѐёђѓєѕіїјЉЊЋЌЍЎЏ", "αβγδεζηθικλμνξοπρστυφχψως", "թփձջրչճժծքոեռտըւիօպասդֆգհյկլխզղցվբնմշ"}))); + toNullableVec(candidate_strings), + executeFunction("upperBinary", toNullableVec(candidate_strings))); - ASSERT_COLUMN_EQ( - toVec({"one week’s time TEST", "abc测试DeF", "AbCテストAbC", "ѐёђѓєѕіїјЉЊЋЌЍЎЏ", "αβγδεζηθικλμνξοπρστυφχψως", "թփձջրչճժծքոեռտըւիօպասդֆգհյկլխզղցվբնմշ"}), - executeFunction( - "upperBinary", - toVec({"one week’s time TEST", "abc测试DeF", "AbCテストAbC", "ѐёђѓєѕіїјЉЊЋЌЍЎЏ", "αβγδεζηθικλμνξοπρστυφχψως", "թփձջրչճժծքոեռտըւիօպասդֆգհյկլխզղցվբնմշ"}))); + ASSERT_COLUMN_EQ(toVec(candidate_strings), executeFunction("upperBinary", toVec(candidate_strings))); ASSERT_COLUMN_EQ( toConst("ONE WEEK’S TIME TEST"),