lowerUTF8/upperUTF8: allow lowercase and uppercase characters to occupy a different number of bytes (#8622) (#8669)

close #8484
pingcap · Jan 19, 2024 · 0618c04 · 0618c04
1 parent 7bb0d4e
commit 0618c04
Show file tree

Hide file tree

Showing 5 changed files with 179 additions and 155 deletions.
diff --git a/dbms/src/Common/UTF8Helpers.h b/dbms/src/Common/UTF8Helpers.h
@@ -66,7 +66,7 @@ inline void syncForward(const UInt8 *& s, const UInt8 * const end)
 /// returns UTF-8 code point sequence length judging by its first octet
 inline size_t seqLength(const UInt8 first_octet)
 {
-    if (first_octet < 0x80u)
+    if (first_octet < 0x80 || first_octet >= 0xF8) /// The specs of UTF-8.
         return 1;
 
     const size_t bits = 8;

diff --git a/dbms/src/Functions/FunctionsString.cpp b/dbms/src/Functions/FunctionsString.cpp
@@ -437,25 +437,62 @@ template <
     char flip_case_mask,
     int to_case(int)>
 __attribute__((always_inline)) inline void toCaseImplTiDB(
-    ConstPtr<UInt8> & src,
-    const ConstPtr<UInt8> src_end,
-    Ptr<UInt8> & dst)
+    const UInt8 *& src,
+    const UInt8 * src_end,
+    size_t offsets_pos,
+    ColumnString::Chars_t & dst_data,
+    IColumn::Offsets & dst_offsets,
+    bool & is_diff_offsets)
 {
-    if (src[0] <= ascii_upper_bound)
+    if (*src <= ascii_upper_bound)
     {
+        size_t dst_size = dst_data.size();
+        dst_data.resize(dst_size + 1);
         if (*src >= not_case_lower_bound && *src <= not_case_upper_bound)
-            *dst++ = *src++ ^ flip_case_mask;
+            dst_data[dst_size] = *src++ ^ flip_case_mask;
         else
-            *dst++ = *src++;
+            dst_data[dst_size] = *src++;
     }
     else
     {
         static const Poco::UTF8Encoding utf8;
 
-        if (const auto chars = utf8.convert(to_case(utf8.convert(src)), dst, src_end - src))
-            src += chars, dst += chars;
-        else
-            ++src, ++dst;
+        int src_sequence_length = utf8.sequenceLength(src, 1);
+        assert(src_sequence_length > 0);
+        if unlikely (src + src_sequence_length > src_end)
+        {
+            /// The sequence would run past the end of this row (invalid UTF-8): copy the rest of the row to dst unchanged so other rows are not affected
+            size_t dst_size = dst_data.size();
+            dst_data.resize(src_end - src + dst_size);
+            memcpy(&dst_data[dst_size], src, src_end - src);
+            src = src_end;
+            return;
+        }
+
+        int src_ch = utf8.convert(src);
+        if unlikely (src_ch == -1)
+        {
+            /// utf8.convert() reported failure (-1), i.e. invalid UTF-8: copy this sequence to dst unchanged and continue after it, so other rows are not affected
+            size_t dst_size = dst_data.size();
+            dst_data.resize(dst_size + src_sequence_length);
+            memcpy(&dst_data[dst_size], src, src_sequence_length);
+            src += src_sequence_length;
+            return;
+        }
+        int dst_ch = to_case(src_ch);
+        int dst_sequence_length = utf8.convert(dst_ch, nullptr, 0);
+        size_t dst_size = dst_data.size();
+        dst_data.resize(dst_size + dst_sequence_length);
+        utf8.convert(dst_ch, &dst_data[dst_size], dst_sequence_length);
+
+        if (dst_sequence_length != src_sequence_length)
+        {
+            assert((Int64)dst_offsets[offsets_pos] + dst_sequence_length - src_sequence_length >= 0);
+            dst_offsets[offsets_pos] += dst_sequence_length - src_sequence_length;
+            is_diff_offsets = true;
+        }
+
+        src += src_sequence_length;
     }
 }
 
@@ -548,10 +585,19 @@ TIFLASH_DECLARE_MULTITARGET_FUNCTION_TP(
     (not_case_lower_bound, not_case_upper_bound, ascii_upper_bound, flip_case_mask, to_case),
     void,
     lowerUpperUTF8ArrayImplTiDB,
-    (src, src_end, dst),
-    (ConstPtr<UInt8> & src, const ConstPtr<UInt8> src_end, Ptr<UInt8> & dst),
-    {
+    (src_data, src_offsets, dst_data, dst_offsets),
+    (const ColumnString::Chars_t & src_data,
+     const IColumn::Offsets & src_offsets,
+     ColumnString::Chars_t & dst_data,
+     IColumn::Offsets & dst_offsets),
+    {
+        dst_data.reserve(src_data.size());
+        dst_offsets.assign(src_offsets);
         static const auto flip_mask = SimdWord::template fromSingle<int8_t>(flip_case_mask);
+        const UInt8 *src = src_data.data(), *src_end = src_data.data() + src_data.size();
+        auto * begin = src;
+        bool is_diff_offsets = false;
+        size_t offsets_pos = 0;
         while (src + WORD_SIZE < src_end)
         {
             auto word = SimdWord::fromUnaligned(src);
@@ -566,29 +612,71 @@ TIFLASH_DECLARE_MULTITARGET_FUNCTION_TP(
                 range_check.as_int8 = (word.as_int8 >= lower_bounds.as_int8) & (word.as_int8 <= upper_bounds.as_int8);
                 selected.as_int8 = range_check.as_int8 & flip_mask.as_int8;
                 word.as_int8 ^= selected.as_int8;
-                word.toUnaligned(dst);
+                size_t dst_size = dst_data.size();
+                dst_data.resize(dst_size + WORD_SIZE);
+                word.toUnaligned(&dst_data[dst_size]);
                 src += WORD_SIZE;
-                dst += WORD_SIZE;
             }
             else
             {
+                size_t offset_from_begin = src - begin;
+                while (offset_from_begin >= src_offsets[offsets_pos])
+                    ++offsets_pos;
                 auto expected_end = src + WORD_SIZE;
-                while (src < expected_end)
+                while (true)
+                {
+                    const UInt8 * row_end = begin + src_offsets[offsets_pos];
+                    assert(row_end >= src);
+                    auto end = std::min(expected_end, row_end);
+                    while (src < end)
+                    {
+                        toCaseImplTiDB<
+                            not_case_lower_bound,
+                            not_case_upper_bound,
+                            ascii_upper_bound,
+                            flip_case_mask,
+                            to_case>(src, row_end, offsets_pos, dst_data, dst_offsets, is_diff_offsets);
+                    }
+                    if (src >= expected_end)
+                        break;
+                    ++offsets_pos;
+                }
+            }
+        }
+
+        if (src < src_end)
+        {
+            size_t offset_from_begin = src - begin;
+            while (offset_from_begin >= src_offsets[offsets_pos])
+                ++offsets_pos;
+
+            while (src < src_end)
+            {
+                const UInt8 * row_end = begin + src_offsets[offsets_pos];
+                assert(row_end >= src);
+                while (src < row_end)
                 {
                     toCaseImplTiDB<
                         not_case_lower_bound,
                         not_case_upper_bound,
                         ascii_upper_bound,
                         flip_case_mask,
-                        to_case>(src, src_end, dst);
+                        to_case>(src, row_end, offsets_pos, dst_data, dst_offsets, is_diff_offsets);
                 }
+                ++offsets_pos;
+            }
+        }
+
+        if unlikely (is_diff_offsets)
+        {
+            Int64 diff = 0;
+            for (size_t i = 0; i < dst_offsets.size(); ++i)
+            {
+                /// diff accumulates the per-row offset adjustments of rows 0 through i
+                diff += (Int64)dst_offsets[i] - (Int64)src_offsets[i];
+                dst_offsets[i] = src_offsets[i] + diff;
             }
         }
-        while (src < src_end)
-            toCaseImplTiDB<not_case_lower_bound, not_case_upper_bound, ascii_upper_bound, flip_case_mask, to_case>(
-                src,
-                src_end,
-                dst);
     })
 } // namespace
 
@@ -618,55 +706,20 @@ void TiDBLowerUpperUTF8Impl<not_case_lower_bound, not_case_upper_bound, to_case>
     ColumnString::Chars_t & res_data,
     IColumn::Offsets & res_offsets)
 {
-    res_data.resize(data.size());
-    res_offsets.assign(offsets);
-    array(data.data(), data.data() + data.size(), res_data.data());
+    lowerUpperUTF8ArrayImplTiDB<not_case_lower_bound, not_case_upper_bound, ascii_upper_bound, flip_case_mask, to_case>(
+        data,
+        offsets,
+        res_data,
+        res_offsets);
 }
 
 template <char not_case_lower_bound, char not_case_upper_bound, int to_case(int)>
 void TiDBLowerUpperUTF8Impl<not_case_lower_bound, not_case_upper_bound, to_case>::vectorFixed(
-    const ColumnString::Chars_t & data,
+    const ColumnString::Chars_t & /*data*/,
     size_t /*n*/,
-    ColumnString::Chars_t & res_data)
-{
-    res_data.resize(data.size());
-    array(data.data(), data.data() + data.size(), res_data.data());
-}
-
-template <char not_case_lower_bound, char not_case_upper_bound, int to_case(int)>
-void TiDBLowerUpperUTF8Impl<not_case_lower_bound, not_case_upper_bound, to_case>::constant(
-    const std::string & data,
-    std::string & res_data)
+    ColumnString::Chars_t & /*res_data*/)
 {
-    res_data.resize(data.size());
-    array(
-        reinterpret_cast<const UInt8 *>(data.data()),
-        reinterpret_cast<const UInt8 *>(data.data() + data.size()),
-        reinterpret_cast<UInt8 *>(&res_data[0]));
-}
-
-template <char not_case_lower_bound, char not_case_upper_bound, int to_case(int)>
-void TiDBLowerUpperUTF8Impl<not_case_lower_bound, not_case_upper_bound, to_case>::toCase(
-    const UInt8 *& src,
-    const UInt8 * src_end,
-    UInt8 *& dst)
-{
-    toCaseImplTiDB<not_case_lower_bound, not_case_upper_bound, ascii_upper_bound, flip_case_mask, to_case>(
-        src,
-        src_end,
-        dst);
-}
-
-template <char not_case_lower_bound, char not_case_upper_bound, int to_case(int)>
-void TiDBLowerUpperUTF8Impl<not_case_lower_bound, not_case_upper_bound, to_case>::array(
-    const UInt8 * src,
-    const UInt8 * src_end,
-    UInt8 * dst)
-{
-    lowerUpperUTF8ArrayImplTiDB<not_case_lower_bound, not_case_upper_bound, ascii_upper_bound, flip_case_mask, to_case>(
-        src,
-        src_end,
-        dst);
+    throw Exception("Cannot apply function TiDBLowerUpperUTF8 to fixed string.", ErrorCodes::ILLEGAL_COLUMN);
 }
 
 /** If the string is encoded in UTF-8, then it selects a substring of code points in it.

diff --git a/dbms/src/Functions/FunctionsString.h b/dbms/src/Functions/FunctionsString.h
@@ -179,13 +179,13 @@ class FunctionStringToString : public IFunction
     void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) const override
     {
         const ColumnPtr column = block.getByPosition(arguments[0]).column;
-        if (const ColumnString * col = checkAndGetColumn<ColumnString>(column.get()))
+        if (const auto * col = checkAndGetColumn<ColumnString>(column.get()))
         {
             auto col_res = ColumnString::create();
             Impl::vector(col->getChars(), col->getOffsets(), col_res->getChars(), col_res->getOffsets());
             block.getByPosition(result).column = std::move(col_res);
         }
-        else if (const ColumnFixedString * col = checkAndGetColumn<ColumnFixedString>(column.get()))
+        else if (const auto * col = checkAndGetColumn<ColumnFixedString>(column.get()))
         {
             auto col_res = ColumnFixedString::create(col->getN());
             Impl::vectorFixed(col->getChars(), col->getN(), col_res->getChars());
@@ -212,17 +212,9 @@ struct TiDBLowerUpperUTF8Impl
 
     static void vectorFixed(const ColumnString::Chars_t & data, size_t n, ColumnString::Chars_t & res_data);
 
-    static void constant(const std::string & data, std::string & res_data);
-
-    /** Converts a single code point starting at `src` to desired case, storing result starting at `dst`.
-     *    `src` and `dst` are incremented by corresponding sequence lengths. */
-    static void toCase(const UInt8 *& src, const UInt8 * src_end, UInt8 *& dst);
-
 private:
     static constexpr auto ascii_upper_bound = '\x7f';
     static constexpr auto flip_case_mask = 'A' ^ 'a';
-
-    static void array(const UInt8 * src, const UInt8 * src_end, UInt8 * dst);
 };
 
 struct TiDBLowerUpperBinaryImpl

diff --git a/dbms/src/Functions/tests/gtest_strings_lower.cpp b/dbms/src/Functions/tests/gtest_strings_lower.cpp
@@ -60,22 +60,33 @@ class StringLower : public DB::tests::FunctionTest
 TEST_F(StringLower, lowerAll)
 {
     std::vector<std::optional<String>> candidate_strings
-        = {"one WEEK’S time TEST",
+        = {"one WEEK'S time TEST",
            "abc测试def",
            "ABCテストabc",
            "ЀЁЂѓЄЅІїЈЉЊЋЌѝЎЏ",
            "+Ѐ-ё*Ђ/ѓ!Є@Ѕ#І$@Ї%Ј……љ&Њ（Ћ）Ќ￥Ѝ#Ў@Џ！^",
+           "İaSdİİİİdDS",
            "ΑΒΓΔΕΖΗΘικΛΜΝΞΟΠΡΣτΥΦΧΨωΣ",
+           "ȺDȺİȺaȺȾOİȺ",
+           "TEST_WRONG_UTF8_1\x80\xe0\x21",
            "▲Α▼Βγ➨ΔΕ☎ΖΗ✂ΘΙ€ΚΛ♫ΜΝ✓ΞΟ✚ΠΡ℉ΣΤ♥ΥΦ♖ΧΨ♘Ω★Σ✕",
+           "ⱮⱭȺΩABCDEFGHIJKLMNOPꞍaȾ",
+           "TEST_WRONG_UTF8_2\xf1\x22",
            "թՓՁՋՐՉՃԺԾՔՈԵՌՏԸՒԻՕՊԱՍԴՖԳՀՅԿԼԽԶՂՑՎԲՆմՇ"};
+
     std::vector<std::optional<String>> lower_case_strings
-        = {"one week’s time test",
+        = {"one week's time test",
            "abc测试def",
            "abcテストabc",
            "ѐёђѓєѕіїјљњћќѝўџ",
            "+ѐ-ё*ђ/ѓ!є@ѕ#і$@ї%ј……љ&њ（ћ）ќ￥ѝ#ў@џ！^",
+           "iasdiiiidds",
            "αβγδεζηθικλμνξοπρστυφχψωσ",
+           "ⱥdⱥiⱥaⱥⱦoiⱥ",
+           "test_wrong_utf8_1\x80\xe0\x21",
            "▲α▼βγ➨δε☎ζη✂θι€κλ♫μν✓ξο✚πρ℉στ♥υφ♖χψ♘ω★σ✕",
+           "ɱɑⱥωabcdefghijklmnopɥaⱦ",
+           "test_wrong_utf8_2\xf1\x22",
            "թփձջրչճժծքոեռտըւիօպասդֆգհյկլխզղցվբնմշ"};