Skip to content

Commit

Permalink
Revert "MinMax Index Supports Nullable DataType" (#4895)
Browse files Browse the repository at this point in the history
ref #4787
  • Loading branch information
hehechen authored May 16, 2022
1 parent e9227b1 commit 07f39f9
Show file tree
Hide file tree
Showing 5 changed files with 18 additions and 264 deletions.
1 change: 1 addition & 0 deletions dbms/src/DataTypes/IDataType.h
Original file line number Diff line number Diff line change
Expand Up @@ -471,6 +471,7 @@ class IDataType : private boost::noncopyable
virtual bool isEnum() const { return false; };

virtual bool isNullable() const { return false; }

/** Can this type represent only the NULL value? (This also implies isNullable.)
*/
virtual bool onlyNull() const { return false; }
Expand Down
5 changes: 3 additions & 2 deletions dbms/src/Storages/DeltaMerge/File/DMFileWriter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,9 +72,10 @@ DMFileWriter::DMFileWriter(const DMFilePtr & dmfile_,
for (auto & cd : write_columns)
{
// TODO: currently we only generate index for Integers, Date, DateTime types, and this should be configurable by user.
// TODO: If column type is nullable, we won't generate index for it
/// for handle column always generate index
auto type = removeNullable(cd.type);
bool do_index = cd.id == EXTRA_HANDLE_COLUMN_ID || type->isInteger() || type->isDateOrDateTime();
bool do_index = cd.id == EXTRA_HANDLE_COLUMN_ID || cd.type->isInteger() || cd.type->isDateOrDateTime();

if (options.flags.isSingleFile())
{
if (do_index)
Expand Down
191 changes: 6 additions & 185 deletions dbms/src/Storages/DeltaMerge/Index/MinMaxIndex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ inline std::pair<size_t, size_t> minmax(const IColumn & column, const ColumnVect

void MinMaxIndex::addPack(const IColumn & column, const ColumnVector<UInt8> * del_mark)
{
const IColumn * column_ptr = &column;
auto size = column.size();
bool has_null = false;
if (column.isColumnNullable())
Expand All @@ -69,6 +70,7 @@ void MinMaxIndex::addPack(const IColumn & column, const ColumnVector<UInt8> * de

const auto & nullable_column = static_cast<const ColumnNullable &>(column);
const auto & null_mark_data = nullable_column.getNullMapColumn().getData();
column_ptr = &nullable_column.getNestedColumn();

for (size_t i = 0; i < size; ++i)
{
Expand All @@ -80,13 +82,14 @@ void MinMaxIndex::addPack(const IColumn & column, const ColumnVector<UInt8> * de
}
}

auto [min_index, max_index] = details::minmax(column, del_mark, 0, column.size());
const IColumn & updated_column = *column_ptr;
auto [min_index, max_index] = details::minmax(updated_column, del_mark, 0, updated_column.size());
if (min_index != NONE_EXIST)
{
has_null_marks->push_back(has_null);
has_value_marks->push_back(1);
minmaxes->insertFrom(column, min_index);
minmaxes->insertFrom(column, max_index);
minmaxes->insertFrom(updated_column, min_index);
minmaxes->insertFrom(updated_column, max_index);
}
else
{
Expand Down Expand Up @@ -155,64 +158,6 @@ std::pair<UInt64, UInt64> MinMaxIndex::getUInt64MinMax(size_t pack_index)
return {minmaxes->get64(pack_index * 2), minmaxes->get64(pack_index * 2 + 1)};
}

/// Rough equality check of `value` against the min/max recorded for pack `pack_index`,
/// for the case where the index column is Nullable.
///
/// `minmaxes` is treated as a ColumnNullable; the min and max entries are read from its nested
/// column at positions `pack_index * 2` and `pack_index * 2 + 1`.
/// `type` is the data type used for dispatch (the caller passes the type with Nullable removed).
/// Returns the result of RoughCheck::checkEqual for the handled types, RSResult::Some otherwise.
RSResult MinMaxIndex::checkNullableEqual(size_t pack_index, const Field & value, const DataTypePtr & type)
{
    const ColumnNullable & column_nullable = static_cast<const ColumnNullable &>(*minmaxes);

    const auto * raw_type = type.get();

    // If minmaxes_data has a null value, the value of minmaxes_data[i] is meaningless and may be just some random value.
    // But in checkEqual, we have checked the has_null_marks and ensured that there is no null value in MinMax Indexes.
#define DISPATCH(TYPE)                                                                         \
    if (typeid_cast<const DataType##TYPE *>(raw_type))                                         \
    {                                                                                          \
        auto & minmaxes_data = toColumnVectorData<TYPE>(column_nullable.getNestedColumnPtr()); \
        auto min = minmaxes_data[pack_index * 2];                                              \
        auto max = minmaxes_data[pack_index * 2 + 1];                                          \
        return RoughCheck::checkEqual<TYPE>(value, type, min, max);                            \
    }
    FOR_NUMERIC_TYPES(DISPATCH)
#undef DISPATCH
    if (typeid_cast<const DataTypeDate *>(raw_type))
    {
        const auto & minmaxes_data = toColumnVectorData<DataTypeDate::FieldType>(column_nullable.getNestedColumnPtr());
        auto min = minmaxes_data[pack_index * 2];
        auto max = minmaxes_data[pack_index * 2 + 1];
        return RoughCheck::checkEqual<DataTypeDate::FieldType>(value, type, min, max);
    }
    if (typeid_cast<const DataTypeDateTime *>(raw_type))
    {
        const auto & minmaxes_data = toColumnVectorData<DataTypeDateTime::FieldType>(column_nullable.getNestedColumnPtr());
        auto min = minmaxes_data[pack_index * 2];
        auto max = minmaxes_data[pack_index * 2 + 1];
        return RoughCheck::checkEqual<DataTypeDateTime::FieldType>(value, type, min, max);
    }
    if (typeid_cast<const DataTypeMyDateTime *>(raw_type) || typeid_cast<const DataTypeMyDate *>(raw_type))
    {
        // For DataTypeMyDateTime / DataTypeMyDate, simply compare them as comparing UInt64 is OK.
        // Check `struct MyTimeBase` for more details.
        const auto & minmaxes_data = toColumnVectorData<DataTypeMyTimeBase::FieldType>(column_nullable.getNestedColumnPtr());
        auto min = minmaxes_data[pack_index * 2];
        auto max = minmaxes_data[pack_index * 2 + 1];
        return RoughCheck::checkEqual<DataTypeMyTimeBase::FieldType>(value, type, min, max);
    }
    if (typeid_cast<const DataTypeString *>(raw_type))
    {
        const auto * string_column = checkAndGetColumn<ColumnString>(column_nullable.getNestedColumnPtr().get());
        const auto & chars = string_column->getChars();
        const auto & offsets = string_column->getOffsets();
        size_t pos = pack_index * 2;
        size_t prev_offset = pos == 0 ? 0 : offsets[pos - 1];
        // todo use StringRef instead of String
        // Build the string from a pointer into the character buffer plus a length.
        // Passing the element `chars[prev_offset]` itself would select the (count, ch) fill
        // constructor and produce a string unrelated to the stored bytes.
        // NOTE(review): the `- 1` presumably drops a trailing terminator byte stored per string -- confirm against ColumnString.
        auto min = String(reinterpret_cast<const char *>(&chars[prev_offset]), offsets[pos] - prev_offset - 1);
        pos = pack_index * 2 + 1;
        // pos >= 1 here, so offsets[pos - 1] is always a valid index.
        prev_offset = offsets[pos - 1];
        auto max = String(reinterpret_cast<const char *>(&chars[prev_offset]), offsets[pos] - prev_offset - 1);
        return RoughCheck::checkEqual<String>(value, type, min, max);
    }
    // Unhandled type: no conclusion can be drawn from the index.
    return RSResult::Some;
}

RSResult MinMaxIndex::checkEqual(size_t pack_index, const Field & value, const DataTypePtr & type)
{
if ((*has_null_marks)[pack_index] || value.isNull())
Expand All @@ -221,10 +166,6 @@ RSResult MinMaxIndex::checkEqual(size_t pack_index, const Field & value, const D
return RSResult::None;

const auto * raw_type = type.get();
if (typeid_cast<const DataTypeNullable *>(raw_type))
{
return checkNullableEqual(pack_index, value, removeNullable(type));
}
#define DISPATCH(TYPE) \
if (typeid_cast<const DataType##TYPE *>(raw_type)) \
{ \
Expand Down Expand Up @@ -274,62 +215,6 @@ RSResult MinMaxIndex::checkEqual(size_t pack_index, const Field & value, const D
}
return RSResult::Some;
}

/// Rough "greater than" check of `value` against the min/max recorded for pack `pack_index`,
/// for the case where the index column is Nullable.
///
/// `minmaxes` is treated as a ColumnNullable; the min and max entries are read from its nested
/// column at positions `pack_index * 2` and `pack_index * 2 + 1`.
/// `type` is the data type used for dispatch (the caller passes the type with Nullable removed).
/// Returns the result of RoughCheck::checkGreater for the handled types, RSResult::Some otherwise.
RSResult MinMaxIndex::checkNullableGreater(size_t pack_index, const Field & value, const DataTypePtr & type)
{
    const ColumnNullable & column_nullable = static_cast<const ColumnNullable &>(*minmaxes);
    const auto * raw_type = type.get();

#define DISPATCH(TYPE)                                                                         \
    if (typeid_cast<const DataType##TYPE *>(raw_type))                                         \
    {                                                                                          \
        auto & minmaxes_data = toColumnVectorData<TYPE>(column_nullable.getNestedColumnPtr()); \
        auto min = minmaxes_data[pack_index * 2];                                              \
        auto max = minmaxes_data[pack_index * 2 + 1];                                          \
        return RoughCheck::checkGreater<TYPE>(value, type, min, max);                          \
    }
    FOR_NUMERIC_TYPES(DISPATCH)
#undef DISPATCH
    if (typeid_cast<const DataTypeDate *>(raw_type))
    {
        const auto & minmaxes_data = toColumnVectorData<DataTypeDate::FieldType>(column_nullable.getNestedColumnPtr());
        auto min = minmaxes_data[pack_index * 2];
        auto max = minmaxes_data[pack_index * 2 + 1];
        return RoughCheck::checkGreater<DataTypeDate::FieldType>(value, type, min, max);
    }
    if (typeid_cast<const DataTypeDateTime *>(raw_type))
    {
        const auto & minmaxes_data = toColumnVectorData<DataTypeDateTime::FieldType>(column_nullable.getNestedColumnPtr());
        auto min = minmaxes_data[pack_index * 2];
        auto max = minmaxes_data[pack_index * 2 + 1];
        return RoughCheck::checkGreater<DataTypeDateTime::FieldType>(value, type, min, max);
    }
    if (typeid_cast<const DataTypeMyDateTime *>(raw_type) || typeid_cast<const DataTypeMyDate *>(raw_type))
    {
        // For DataTypeMyDateTime / DataTypeMyDate, simply compare them as comparing UInt64 is OK.
        // Check `struct MyTimeBase` for more details.
        const auto & minmaxes_data = toColumnVectorData<DataTypeMyTimeBase::FieldType>(column_nullable.getNestedColumnPtr());
        auto min = minmaxes_data[pack_index * 2];
        auto max = minmaxes_data[pack_index * 2 + 1];
        return RoughCheck::checkGreater<DataTypeMyTimeBase::FieldType>(value, type, min, max);
    }
    if (typeid_cast<const DataTypeString *>(raw_type))
    {
        const auto * string_column = checkAndGetColumn<ColumnString>(column_nullable.getNestedColumnPtr().get());
        const auto & chars = string_column->getChars();
        const auto & offsets = string_column->getOffsets();
        size_t pos = pack_index * 2;
        size_t prev_offset = pos == 0 ? 0 : offsets[pos - 1];
        // todo use StringRef instead of String
        // Build the string from a pointer into the character buffer plus a length.
        // Passing the element `chars[prev_offset]` itself would select the (count, ch) fill
        // constructor and produce a string unrelated to the stored bytes.
        // NOTE(review): the `- 1` presumably drops a trailing terminator byte stored per string -- confirm against ColumnString.
        auto min = String(reinterpret_cast<const char *>(&chars[prev_offset]), offsets[pos] - prev_offset - 1);
        pos = pack_index * 2 + 1;
        // pos >= 1 here, so offsets[pos - 1] is always a valid index.
        prev_offset = offsets[pos - 1];
        auto max = String(reinterpret_cast<const char *>(&chars[prev_offset]), offsets[pos] - prev_offset - 1);
        return RoughCheck::checkGreater<String>(value, type, min, max);
    }
    // Unhandled type: no conclusion can be drawn from the index.
    return RSResult::Some;
}

RSResult MinMaxIndex::checkGreater(size_t pack_index, const Field & value, const DataTypePtr & type, int /*nan_direction_hint*/)
{
if ((*has_null_marks)[pack_index] || value.isNull())
Expand All @@ -338,10 +223,6 @@ RSResult MinMaxIndex::checkGreater(size_t pack_index, const Field & value, const
return RSResult::None;

const auto * raw_type = type.get();
if (typeid_cast<const DataTypeNullable *>(raw_type))
{
return checkNullableGreater(pack_index, value, removeNullable(type));
}
#define DISPATCH(TYPE) \
if (typeid_cast<const DataType##TYPE *>(raw_type)) \
{ \
Expand Down Expand Up @@ -391,62 +272,6 @@ RSResult MinMaxIndex::checkGreater(size_t pack_index, const Field & value, const
}
return RSResult::Some;
}

/// Rough "greater than or equal" check of `value` against the min/max recorded for pack
/// `pack_index`, for the case where the index column is Nullable.
///
/// The min/max pair of a pack lives in the nested column of `minmaxes` at positions
/// `pack_index * 2` (min) and `pack_index * 2 + 1` (max). Dispatches on `type` and forwards to
/// RoughCheck::checkGreaterEqual; types that are not handled yield RSResult::Some.
RSResult MinMaxIndex::checkNullableGreaterEqual(size_t pack_index, const Field & value, const DataTypePtr & type)
{
    // Positions of this pack's min and max entries inside the nested column.
    const size_t min_pos = pack_index * 2;
    const size_t max_pos = pack_index * 2 + 1;

    const auto & nullable_minmaxes = static_cast<const ColumnNullable &>(*minmaxes);
    const auto * type_ptr = type.get();

#define DISPATCH(TYPE)                                                                  \
    if (typeid_cast<const DataType##TYPE *>(type_ptr))                                  \
    {                                                                                   \
        auto & data = toColumnVectorData<TYPE>(nullable_minmaxes.getNestedColumnPtr()); \
        auto lower = data[min_pos];                                                     \
        auto upper = data[max_pos];                                                     \
        return RoughCheck::checkGreaterEqual<TYPE>(value, type, lower, upper);          \
    }
    FOR_NUMERIC_TYPES(DISPATCH)
#undef DISPATCH

    if (typeid_cast<const DataTypeDate *>(type_ptr))
    {
        using ValueType = DataTypeDate::FieldType;
        const auto & data = toColumnVectorData<ValueType>(nullable_minmaxes.getNestedColumnPtr());
        auto lower = data[min_pos];
        auto upper = data[max_pos];
        return RoughCheck::checkGreaterEqual<ValueType>(value, type, lower, upper);
    }

    if (typeid_cast<const DataTypeDateTime *>(type_ptr))
    {
        using ValueType = DataTypeDateTime::FieldType;
        const auto & data = toColumnVectorData<ValueType>(nullable_minmaxes.getNestedColumnPtr());
        auto lower = data[min_pos];
        auto upper = data[max_pos];
        return RoughCheck::checkGreaterEqual<ValueType>(value, type, lower, upper);
    }

    const bool is_my_time = typeid_cast<const DataTypeMyDateTime *>(type_ptr) || typeid_cast<const DataTypeMyDate *>(type_ptr);
    if (is_my_time)
    {
        // DataTypeMyDateTime / DataTypeMyDate values can simply be compared as UInt64.
        // See `struct MyTimeBase` for more details.
        using ValueType = DataTypeMyTimeBase::FieldType;
        const auto & data = toColumnVectorData<ValueType>(nullable_minmaxes.getNestedColumnPtr());
        auto lower = data[min_pos];
        auto upper = data[max_pos];
        return RoughCheck::checkGreaterEqual<ValueType>(value, type, lower, upper);
    }

    if (typeid_cast<const DataTypeString *>(type_ptr))
    {
        const auto * strings = checkAndGetColumn<ColumnString>(nullable_minmaxes.getNestedColumnPtr().get());
        const auto & chars = strings->getChars();
        const auto & offsets = strings->getOffsets();

        // todo use StringRef instead of String
        // Copies the string stored at `pos`: it starts at the previous entry's end offset
        // (or 0 for the first entry) and its length excludes the last stored byte.
        const auto string_at = [&](size_t pos) {
            const size_t begin = pos == 0 ? 0 : offsets[pos - 1];
            return String(reinterpret_cast<const char *>(&chars[begin]), offsets[pos] - begin - 1);
        };

        auto lower = string_at(min_pos);
        auto upper = string_at(max_pos);
        return RoughCheck::checkGreaterEqual<String>(value, type, lower, upper);
    }

    // Unhandled type: no conclusion can be drawn from the index.
    return RSResult::Some;
}

RSResult MinMaxIndex::checkGreaterEqual(size_t pack_index, const Field & value, const DataTypePtr & type, int /*nan_direction_hint*/)
{
if ((*has_null_marks)[pack_index] || value.isNull())
Expand All @@ -455,10 +280,6 @@ RSResult MinMaxIndex::checkGreaterEqual(size_t pack_index, const Field & value,
return RSResult::None;

const auto * raw_type = type.get();
if (typeid_cast<const DataTypeNullable *>(raw_type))
{
return checkNullableGreaterEqual(pack_index, value, removeNullable(type));
}
#define DISPATCH(TYPE) \
if (typeid_cast<const DataType##TYPE *>(raw_type)) \
{ \
Expand Down
3 changes: 0 additions & 3 deletions dbms/src/Storages/DeltaMerge/Index/MinMaxIndex.h
Original file line number Diff line number Diff line change
Expand Up @@ -81,9 +81,6 @@ class MinMaxIndex
RSResult checkGreaterEqual(size_t pack_index, const Field & value, const DataTypePtr & type, int nan_direction);

static String toString();
RSResult checkNullableEqual(size_t pack_index, const Field & value, const DataTypePtr & type);
RSResult checkNullableGreater(size_t pack_index, const Field & value, const DataTypePtr & type);
RSResult checkNullableGreaterEqual(size_t pack_index, const Field & value, const DataTypePtr & type);
};


Expand Down
Loading

0 comments on commit 07f39f9

Please sign in to comment.