Skip to content

Commit

Permalink
Wrap & Optimize rest bin collator (#5545)
Browse files Browse the repository at this point in the history
ref #5294
  • Loading branch information
solotzg authored Aug 11, 2022
1 parent 016d60a commit 9e3f5d0
Show file tree
Hide file tree
Showing 6 changed files with 263 additions and 93 deletions.
72 changes: 60 additions & 12 deletions dbms/src/Columns/ColumnString.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -355,11 +355,12 @@ struct ColumnString::LessWithCollation
}
};

struct Utf8MB4BinCmp
template <bool padding>
struct CompareBinCollator
{
static FLATTEN_INLINE_PURE inline int compare(const char * s1, size_t length1, const char * s2, size_t length2)
{
return DB::BinCollatorCompare<true>(s1, length1, s2, length2);
return DB::BinCollatorCompare<padding>(s1, length1, s2, length2);
}
};

Expand Down Expand Up @@ -400,18 +401,28 @@ void ColumnString::getPermutationWithCollationImpl(const ICollator & collator, b
{
using PermutationWithCollationUtils = ColumnString::LessWithCollation<false, void>;

// optimize path for default collator `UTF8MB4_BIN`
if (TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::UTF8MB4_BIN) == &collator)
switch (TiDB::GetTiDBCollatorType(&collator))
{
Utf8MB4BinCmp cmp_impl;
case TiDB::ITiDBCollator::CollatorType::UTF8MB4_BIN:
case TiDB::ITiDBCollator::CollatorType::UTF8_BIN:
case TiDB::ITiDBCollator::CollatorType::LATIN1_BIN:
case TiDB::ITiDBCollator::CollatorType::ASCII_BIN:
{
CompareBinCollator<true> cmp_impl;
PermutationWithCollationUtils::getPermutationWithCollationImpl(*this, cmp_impl, reverse, limit, res);
///
return;
break;
}

case TiDB::ITiDBCollator::CollatorType::BINARY:
{
CompareBinCollator<false> cmp_impl;
PermutationWithCollationUtils::getPermutationWithCollationImpl(*this, cmp_impl, reverse, limit, res);
break;
}
default:
{
PermutationWithCollationUtils::getPermutationWithCollationImpl(*this, collator, reverse, limit, res);
}
}
}

void ColumnString::updateWeakHash32(WeakHash32 & hash, const TiDB::TiDBCollatorPtr & collator, String & sort_key_container) const
Expand All @@ -425,23 +436,41 @@ void ColumnString::updateWeakHash32(WeakHash32 & hash, const TiDB::TiDBCollatorP

if (collator != nullptr)
{
if (collator->getCollatorId() == TiDB::ITiDBCollator::UTF8MB4_BIN)
switch (collator->getCollatorType())
{
case TiDB::ITiDBCollator::CollatorType::UTF8MB4_BIN:
case TiDB::ITiDBCollator::CollatorType::LATIN1_BIN:
case TiDB::ITiDBCollator::CollatorType::ASCII_BIN:
case TiDB::ITiDBCollator::CollatorType::UTF8_BIN:
{
// Skip last zero byte.
LoopOneColumn(chars, offsets, offsets.size(), [&](const std::string_view & view, size_t) {
auto sort_key = BinCollatorSortKey<true>(view.data(), view.size());
*hash_data = ::updateWeakHash32(reinterpret_cast<const UInt8 *>(sort_key.data), sort_key.size, *hash_data);
++hash_data;
});
break;
}
else
case TiDB::ITiDBCollator::CollatorType::BINARY:
{
// Skip last zero byte.
LoopOneColumn(chars, offsets, offsets.size(), [&](const std::string_view & view, size_t) {
auto sort_key = BinCollatorSortKey<false>(view.data(), view.size());
*hash_data = ::updateWeakHash32(reinterpret_cast<const UInt8 *>(sort_key.data), sort_key.size, *hash_data);
++hash_data;
});
break;
}
default:
{
// Skip last zero byte.
LoopOneColumn(chars, offsets, offsets.size(), [&](const std::string_view & view, size_t) {
auto sort_key = collator->sortKey(view.data(), view.size(), sort_key_container);
*hash_data = ::updateWeakHash32(reinterpret_cast<const UInt8 *>(sort_key.data), sort_key.size, *hash_data);
++hash_data;
});
break;
}
}
}
else
Expand All @@ -458,7 +487,12 @@ void ColumnString::updateHashWithValues(IColumn::HashValues & hash_values, const
{
if (collator != nullptr)
{
if (collator->getCollatorId() == TiDB::ITiDBCollator::UTF8MB4_BIN)
switch (collator->getCollatorType())
{
case TiDB::ITiDBCollator::CollatorType::UTF8MB4_BIN:
case TiDB::ITiDBCollator::CollatorType::LATIN1_BIN:
case TiDB::ITiDBCollator::CollatorType::ASCII_BIN:
case TiDB::ITiDBCollator::CollatorType::UTF8_BIN:
{
// Skip last zero byte.
LoopOneColumn(chars, offsets, offsets.size(), [&hash_values](const std::string_view & view, size_t i) {
Expand All @@ -467,8 +501,20 @@ void ColumnString::updateHashWithValues(IColumn::HashValues & hash_values, const
hash_values[i].update(reinterpret_cast<const char *>(&string_size), sizeof(string_size));
hash_values[i].update(sort_key.data, sort_key.size);
});
break;
}
else
case TiDB::ITiDBCollator::CollatorType::BINARY:
{
// Skip last zero byte.
LoopOneColumn(chars, offsets, offsets.size(), [&hash_values](const std::string_view & view, size_t i) {
auto sort_key = BinCollatorSortKey<false>(view.data(), view.size());
size_t string_size = sort_key.size;
hash_values[i].update(reinterpret_cast<const char *>(&string_size), sizeof(string_size));
hash_values[i].update(sort_key.data, sort_key.size);
});
break;
}
default:
{
// Skip last zero byte.
LoopOneColumn(chars, offsets, offsets.size(), [&](const std::string_view & view, size_t i) {
Expand All @@ -477,6 +523,8 @@ void ColumnString::updateHashWithValues(IColumn::HashValues & hash_values, const
hash_values[i].update(reinterpret_cast<const char *>(&string_size), sizeof(string_size));
hash_values[i].update(sort_key.data, sort_key.size);
});
break;
}
}
}
else
Expand Down
61 changes: 51 additions & 10 deletions dbms/src/Functions/CollationOperatorOptimized.h
Original file line number Diff line number Diff line change
Expand Up @@ -98,11 +98,11 @@ __attribute__((flatten, always_inline)) inline void LoopOneColumn(
}

// Handle str-column compare str-column.
// - Optimize UTF8_BIN and UTF8MB4_BIN
// - Optimize bin collator
// - Check if columns do NOT contain tail space
// - If Op is `EqualsOp` or `NotEqualsOp`, use a faster equality-only comparison instead of a full ordering comparison
template <typename Op, typename Result>
ALWAYS_INLINE inline bool StringVectorStringVector(
ALWAYS_INLINE inline bool CompareStringVectorStringVector(
const ColumnString::Chars_t & a_data,
const ColumnString::Offsets & a_offsets,
const ColumnString::Chars_t & b_data,
Expand All @@ -112,10 +112,12 @@ ALWAYS_INLINE inline bool StringVectorStringVector(
{
bool use_optimized_path = false;

switch (collator->getCollatorId())
switch (collator->getCollatorType())
{
case TiDB::ITiDBCollator::UTF8MB4_BIN:
case TiDB::ITiDBCollator::UTF8_BIN:
case TiDB::ITiDBCollator::CollatorType::UTF8MB4_BIN:
case TiDB::ITiDBCollator::CollatorType::UTF8_BIN:
case TiDB::ITiDBCollator::CollatorType::LATIN1_BIN:
case TiDB::ITiDBCollator::CollatorType::ASCII_BIN:
{
size_t size = a_offsets.size();

Expand All @@ -134,19 +136,39 @@ ALWAYS_INLINE inline bool StringVectorStringVector(

break;
}
case TiDB::ITiDBCollator::CollatorType::BINARY:
{
size_t size = a_offsets.size();

LoopTwoColumns(a_data, a_offsets, b_data, b_offsets, size, [&c](const std::string_view & va, const std::string_view & vb, size_t i) {
if constexpr (IsEqualRelated<Op>::value)
{
c[i] = Op::apply(RawStrEqualCompare((va), (vb)), 0);
}
else
{
c[i] = Op::apply(RawStrCompare(va, vb), 0);
}
});

use_optimized_path = true;

break;
}

default:
break;
}
return use_optimized_path;
}

// Handle str-column compare const-str.
// - Optimize UTF8_BIN and UTF8MB4_BIN
// - Optimize bin collator
// - Right trim const-str first
// - Check if column does NOT contain tail space
// - If Op is `EqualsOp` or `NotEqualsOp`, use a faster equality-only comparison instead of a full ordering comparison
template <typename Op, typename Result>
ALWAYS_INLINE inline bool StringVectorConstant(
ALWAYS_INLINE inline bool CompareStringVectorConstant(
const ColumnString::Chars_t & a_data,
const ColumnString::Offsets & a_offsets,
const std::string_view & b,
Expand All @@ -155,10 +177,12 @@ ALWAYS_INLINE inline bool StringVectorConstant(
{
bool use_optimized_path = false;

switch (collator->getCollatorId())
switch (collator->getCollatorType())
{
case TiDB::ITiDBCollator::UTF8MB4_BIN:
case TiDB::ITiDBCollator::UTF8_BIN:
case TiDB::ITiDBCollator::CollatorType::UTF8MB4_BIN:
case TiDB::ITiDBCollator::CollatorType::UTF8_BIN:
case TiDB::ITiDBCollator::CollatorType::LATIN1_BIN:
case TiDB::ITiDBCollator::CollatorType::ASCII_BIN:
{
size_t size = a_offsets.size();

Expand All @@ -178,6 +202,23 @@ ALWAYS_INLINE inline bool StringVectorConstant(
use_optimized_path = true;
break;
}
case TiDB::ITiDBCollator::CollatorType::BINARY:
{
size_t size = a_offsets.size();
LoopOneColumn(a_data, a_offsets, size, [&c, &b](const std::string_view & view, size_t i) {
if constexpr (IsEqualRelated<Op>::value)
{
c[i] = Op::apply(RawStrEqualCompare((view), b), 0);
}
else
{
c[i] = Op::apply(RawStrCompare((view), b), 0);
}
});

use_optimized_path = true;
break;
}
default:
break;
}
Expand Down
12 changes: 6 additions & 6 deletions dbms/src/Functions/CollationStringSearchOptimized.h
Original file line number Diff line number Diff line change
Expand Up @@ -446,18 +446,18 @@ ALWAYS_INLINE inline bool StringPatternMatch(
{
bool use_optimized_path = false;

switch (collator->getCollatorId())
switch (collator->getCollatorType())
{
case TiDB::ITiDBCollator::UTF8MB4_BIN:
case TiDB::ITiDBCollator::UTF8_BIN:
case TiDB::ITiDBCollator::CollatorType::UTF8MB4_BIN:
case TiDB::ITiDBCollator::CollatorType::UTF8_BIN:
{
BinStringPatternMatch<Result, revert, true>(a_data, a_offsets, pattern_str, escape_char, c);
use_optimized_path = true;
break;
}
case TiDB::ITiDBCollator::BINARY:
case TiDB::ITiDBCollator::ASCII_BIN:
case TiDB::ITiDBCollator::LATIN1_BIN:
case TiDB::ITiDBCollator::CollatorType::BINARY:
case TiDB::ITiDBCollator::CollatorType::ASCII_BIN:
case TiDB::ITiDBCollator::CollatorType::LATIN1_BIN:
{
BinStringPatternMatch<Result, revert, false>(a_data, a_offsets, pattern_str, escape_char, c);
use_optimized_path = true;
Expand Down
4 changes: 2 additions & 2 deletions dbms/src/Functions/FunctionsComparison.h
Original file line number Diff line number Diff line change
Expand Up @@ -302,7 +302,7 @@ struct StringComparisonWithCollatorImpl
const TiDB::TiDBCollatorPtr & collator,
PaddedPODArray<ResultType> & c)
{
bool optimized_path = StringVectorStringVector<Op>(a_data, a_offsets, b_data, b_offsets, collator, c);
bool optimized_path = CompareStringVectorStringVector<Op>(a_data, a_offsets, b_data, b_offsets, collator, c);
if (optimized_path)
{
return;
Expand All @@ -328,7 +328,7 @@ struct StringComparisonWithCollatorImpl
const TiDB::TiDBCollatorPtr & collator,
PaddedPODArray<ResultType> & c)
{
bool optimized_path = StringVectorConstant<Op>(a_data, a_offsets, b, collator, c);
bool optimized_path = CompareStringVectorConstant<Op>(a_data, a_offsets, b, collator, c);

if (optimized_path)
{
Expand Down
Loading

0 comments on commit 9e3f5d0

Please sign in to comment.