Skip to content

Commit

Permalink
lowerUTF8/upperUTF8: allow lower/uppercase characters to occupy a different…
Browse files Browse the repository at this point in the history
… number of bytes (#8622) (#8669)

close #8484
  • Loading branch information
ti-chi-bot authored Jan 19, 2024
1 parent 7bb0d4e commit 0618c04
Show file tree
Hide file tree
Showing 5 changed files with 179 additions and 155 deletions.
2 changes: 1 addition & 1 deletion dbms/src/Common/UTF8Helpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ inline void syncForward(const UInt8 *& s, const UInt8 * const end)
/// returns UTF-8 code point sequence length judging by its first octet
inline size_t seqLength(const UInt8 first_octet)
{
if (first_octet < 0x80u)
if (first_octet < 0x80 || first_octet >= 0xF8) /// The specs of UTF-8.
return 1;

const size_t bits = 8;
Expand Down
183 changes: 118 additions & 65 deletions dbms/src/Functions/FunctionsString.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -437,25 +437,62 @@ template <
char flip_case_mask,
int to_case(int)>
__attribute__((always_inline)) inline void toCaseImplTiDB(
ConstPtr<UInt8> & src,
const ConstPtr<UInt8> src_end,
Ptr<UInt8> & dst)
const UInt8 *& src,
const UInt8 * src_end,
size_t offsets_pos,
ColumnString::Chars_t & dst_data,
IColumn::Offsets & dst_offsets,
bool & is_diff_offsets)
{
if (src[0] <= ascii_upper_bound)
if (*src <= ascii_upper_bound)
{
size_t dst_size = dst_data.size();
dst_data.resize(dst_size + 1);
if (*src >= not_case_lower_bound && *src <= not_case_upper_bound)
*dst++ = *src++ ^ flip_case_mask;
dst_data[dst_size] = *src++ ^ flip_case_mask;
else
*dst++ = *src++;
dst_data[dst_size] = *src++;
}
else
{
static const Poco::UTF8Encoding utf8;

if (const auto chars = utf8.convert(to_case(utf8.convert(src)), dst, src_end - src))
src += chars, dst += chars;
else
++src, ++dst;
int src_sequence_length = utf8.sequenceLength(src, 1);
assert(src_sequence_length > 0);
if unlikely (src + src_sequence_length > src_end)
{
/// If this row has invalid utf-8 characters, just copy it to dst string and do not influence others
size_t dst_size = dst_data.size();
dst_data.resize(src_end - src + dst_size);
memcpy(&dst_data[dst_size], src, src_end - src);
src = src_end;
return;
}

int src_ch = utf8.convert(src);
if unlikely (src_ch == -1)
{
/// If this row has invalid utf-8 characters, just copy it to dst string and do not influence others
size_t dst_size = dst_data.size();
dst_data.resize(dst_size + src_sequence_length);
memcpy(&dst_data[dst_size], src, src_sequence_length);
src += src_sequence_length;
return;
}
int dst_ch = to_case(src_ch);
int dst_sequence_length = utf8.convert(dst_ch, nullptr, 0);
size_t dst_size = dst_data.size();
dst_data.resize(dst_size + dst_sequence_length);
utf8.convert(dst_ch, &dst_data[dst_size], dst_sequence_length);

if (dst_sequence_length != src_sequence_length)
{
assert((Int64)dst_offsets[offsets_pos] + dst_sequence_length - src_sequence_length >= 0);
dst_offsets[offsets_pos] += dst_sequence_length - src_sequence_length;
is_diff_offsets = true;
}

src += src_sequence_length;
}
}

Expand Down Expand Up @@ -548,10 +585,19 @@ TIFLASH_DECLARE_MULTITARGET_FUNCTION_TP(
(not_case_lower_bound, not_case_upper_bound, ascii_upper_bound, flip_case_mask, to_case),
void,
lowerUpperUTF8ArrayImplTiDB,
(src, src_end, dst),
(ConstPtr<UInt8> & src, const ConstPtr<UInt8> src_end, Ptr<UInt8> & dst),
{
(src_data, src_offsets, dst_data, dst_offsets),
(const ColumnString::Chars_t & src_data,
const IColumn::Offsets & src_offsets,
ColumnString::Chars_t & dst_data,
IColumn::Offsets & dst_offsets),
{
dst_data.reserve(src_data.size());
dst_offsets.assign(src_offsets);
static const auto flip_mask = SimdWord::template fromSingle<int8_t>(flip_case_mask);
const UInt8 *src = src_data.data(), *src_end = src_data.data() + src_data.size();
auto * begin = src;
bool is_diff_offsets = false;
size_t offsets_pos = 0;
while (src + WORD_SIZE < src_end)
{
auto word = SimdWord::fromUnaligned(src);
Expand All @@ -566,29 +612,71 @@ TIFLASH_DECLARE_MULTITARGET_FUNCTION_TP(
range_check.as_int8 = (word.as_int8 >= lower_bounds.as_int8) & (word.as_int8 <= upper_bounds.as_int8);
selected.as_int8 = range_check.as_int8 & flip_mask.as_int8;
word.as_int8 ^= selected.as_int8;
word.toUnaligned(dst);
size_t dst_size = dst_data.size();
dst_data.resize(dst_size + WORD_SIZE);
word.toUnaligned(&dst_data[dst_size]);
src += WORD_SIZE;
dst += WORD_SIZE;
}
else
{
size_t offset_from_begin = src - begin;
while (offset_from_begin >= src_offsets[offsets_pos])
++offsets_pos;
auto expected_end = src + WORD_SIZE;
while (src < expected_end)
while (true)
{
const UInt8 * row_end = begin + src_offsets[offsets_pos];
assert(row_end >= src);
auto end = std::min(expected_end, row_end);
while (src < end)
{
toCaseImplTiDB<
not_case_lower_bound,
not_case_upper_bound,
ascii_upper_bound,
flip_case_mask,
to_case>(src, row_end, offsets_pos, dst_data, dst_offsets, is_diff_offsets);
}
if (src >= expected_end)
break;
++offsets_pos;
}
}
}

if (src < src_end)
{
size_t offset_from_begin = src - begin;
while (offset_from_begin >= src_offsets[offsets_pos])
++offsets_pos;

while (src < src_end)
{
const UInt8 * row_end = begin + src_offsets[offsets_pos];
assert(row_end >= src);
while (src < row_end)
{
toCaseImplTiDB<
not_case_lower_bound,
not_case_upper_bound,
ascii_upper_bound,
flip_case_mask,
to_case>(src, src_end, dst);
to_case>(src, row_end, offsets_pos, dst_data, dst_offsets, is_diff_offsets);
}
++offsets_pos;
}
}

if unlikely (is_diff_offsets)
{
Int64 diff = 0;
for (size_t i = 0; i < dst_offsets.size(); ++i)
{
/// diff is the cumulative offset difference from 0 to the i position
diff += (Int64)dst_offsets[i] - (Int64)src_offsets[i];
dst_offsets[i] = src_offsets[i] + diff;
}
}
while (src < src_end)
toCaseImplTiDB<not_case_lower_bound, not_case_upper_bound, ascii_upper_bound, flip_case_mask, to_case>(
src,
src_end,
dst);
})
} // namespace

Expand Down Expand Up @@ -618,55 +706,20 @@ void TiDBLowerUpperUTF8Impl<not_case_lower_bound, not_case_upper_bound, to_case>
ColumnString::Chars_t & res_data,
IColumn::Offsets & res_offsets)
{
res_data.resize(data.size());
res_offsets.assign(offsets);
array(data.data(), data.data() + data.size(), res_data.data());
lowerUpperUTF8ArrayImplTiDB<not_case_lower_bound, not_case_upper_bound, ascii_upper_bound, flip_case_mask, to_case>(
data,
offsets,
res_data,
res_offsets);
}

template <char not_case_lower_bound, char not_case_upper_bound, int to_case(int)>
void TiDBLowerUpperUTF8Impl<not_case_lower_bound, not_case_upper_bound, to_case>::vectorFixed(
const ColumnString::Chars_t & data,
const ColumnString::Chars_t & /*data*/,
size_t /*n*/,
ColumnString::Chars_t & res_data)
{
res_data.resize(data.size());
array(data.data(), data.data() + data.size(), res_data.data());
}

template <char not_case_lower_bound, char not_case_upper_bound, int to_case(int)>
void TiDBLowerUpperUTF8Impl<not_case_lower_bound, not_case_upper_bound, to_case>::constant(
const std::string & data,
std::string & res_data)
ColumnString::Chars_t & /*res_data*/)
{
res_data.resize(data.size());
array(
reinterpret_cast<const UInt8 *>(data.data()),
reinterpret_cast<const UInt8 *>(data.data() + data.size()),
reinterpret_cast<UInt8 *>(&res_data[0]));
}

template <char not_case_lower_bound, char not_case_upper_bound, int to_case(int)>
void TiDBLowerUpperUTF8Impl<not_case_lower_bound, not_case_upper_bound, to_case>::toCase(
const UInt8 *& src,
const UInt8 * src_end,
UInt8 *& dst)
{
toCaseImplTiDB<not_case_lower_bound, not_case_upper_bound, ascii_upper_bound, flip_case_mask, to_case>(
src,
src_end,
dst);
}

template <char not_case_lower_bound, char not_case_upper_bound, int to_case(int)>
void TiDBLowerUpperUTF8Impl<not_case_lower_bound, not_case_upper_bound, to_case>::array(
const UInt8 * src,
const UInt8 * src_end,
UInt8 * dst)
{
lowerUpperUTF8ArrayImplTiDB<not_case_lower_bound, not_case_upper_bound, ascii_upper_bound, flip_case_mask, to_case>(
src,
src_end,
dst);
throw Exception("Cannot apply function TiDBLowerUpperUTF8 to fixed string.", ErrorCodes::ILLEGAL_COLUMN);
}

/** If the string is encoded in UTF-8, then it selects a substring of code points in it.
Expand Down
12 changes: 2 additions & 10 deletions dbms/src/Functions/FunctionsString.h
Original file line number Diff line number Diff line change
Expand Up @@ -179,13 +179,13 @@ class FunctionStringToString : public IFunction
void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) const override
{
const ColumnPtr column = block.getByPosition(arguments[0]).column;
if (const ColumnString * col = checkAndGetColumn<ColumnString>(column.get()))
if (const auto * col = checkAndGetColumn<ColumnString>(column.get()))
{
auto col_res = ColumnString::create();
Impl::vector(col->getChars(), col->getOffsets(), col_res->getChars(), col_res->getOffsets());
block.getByPosition(result).column = std::move(col_res);
}
else if (const ColumnFixedString * col = checkAndGetColumn<ColumnFixedString>(column.get()))
else if (const auto * col = checkAndGetColumn<ColumnFixedString>(column.get()))
{
auto col_res = ColumnFixedString::create(col->getN());
Impl::vectorFixed(col->getChars(), col->getN(), col_res->getChars());
Expand All @@ -212,17 +212,9 @@ struct TiDBLowerUpperUTF8Impl

static void vectorFixed(const ColumnString::Chars_t & data, size_t n, ColumnString::Chars_t & res_data);

static void constant(const std::string & data, std::string & res_data);

/** Converts a single code point starting at `src` to desired case, storing result starting at `dst`.
* `src` and `dst` are incremented by corresponding sequence lengths. */
static void toCase(const UInt8 *& src, const UInt8 * src_end, UInt8 *& dst);

private:
static constexpr auto ascii_upper_bound = '\x7f';
static constexpr auto flip_case_mask = 'A' ^ 'a';

static void array(const UInt8 * src, const UInt8 * src_end, UInt8 * dst);
};

struct TiDBLowerUpperBinaryImpl
Expand Down
15 changes: 13 additions & 2 deletions dbms/src/Functions/tests/gtest_strings_lower.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,22 +60,33 @@ class StringLower : public DB::tests::FunctionTest
TEST_F(StringLower, lowerAll)
{
std::vector<std::optional<String>> candidate_strings
= {"one WEEKS time TEST",
= {"one WEEK'S time TEST",
"abc测试def",
"ABCテストabc",
"ЀЁЂѓЄЅІїЈЉЊЋЌѝЎЏ",
"+Ѐ-ё*Ђ/ѓ!Є@Ѕ#І$@Ї%Ј……љ&Њ(Ћ)Ќ¥Ѝ#Ў@Џ!^",
"İaSdİİİİdDS",
"ΑΒΓΔΕΖΗΘικΛΜΝΞΟΠΡΣτΥΦΧΨωΣ",
"ȺDȺİȺaȺȾOİȺ",
"TEST_WRONG_UTF8_1\x80\xe0\x21",
"▲Α▼Βγ➨ΔΕ☎ΖΗ✂ΘΙ€ΚΛ♫ΜΝ✓ΞΟ✚ΠΡ℉ΣΤ♥ΥΦ♖ΧΨ♘Ω★Σ✕",
"ⱮⱭȺΩABCDEFGHIJKLMNOPꞍaȾ",
"TEST_WRONG_UTF8_2\xf1\x22",
"թՓՁՋՐՉՃԺԾՔՈԵՌՏԸՒԻՕՊԱՍԴՖԳՀՅԿԼԽԶՂՑՎԲՆմՇ"};

std::vector<std::optional<String>> lower_case_strings
= {"one weeks time test",
= {"one week's time test",
"abc测试def",
"abcテストabc",
"ѐёђѓєѕіїјљњћќѝўџ",
"+ѐ-ё*ђ/ѓ!є@ѕ#і$@ї%ј……љ&њ(ћ)ќ¥ѝ#ў@џ!^",
"iasdiiiidds",
"αβγδεζηθικλμνξοπρστυφχψωσ",
"ⱥdⱥiⱥaⱥⱦoiⱥ",
"test_wrong_utf8_1\x80\xe0\x21",
"▲α▼βγ➨δε☎ζη✂θι€κλ♫μν✓ξο✚πρ℉στ♥υφ♖χψ♘ω★σ✕",
"ɱɑⱥωabcdefghijklmnopɥaⱦ",
"test_wrong_utf8_2\xf1\x22",
"թփձջրչճժծքոեռտըւիօպասդֆգհյկլխզղցվբնմշ"};


Expand Down
Loading

0 comments on commit 0618c04

Please sign in to comment.