Skip to content

Commit

Permalink
Add dictionary encoded array
Browse files Browse the repository at this point in the history
  • Loading branch information
Alex-PLACET committed Oct 7, 2024
1 parent 1145453 commit 7bfb98f
Show file tree
Hide file tree
Showing 24 changed files with 1,803 additions and 133 deletions.
25 changes: 16 additions & 9 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -106,17 +106,18 @@ set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY_RELEASE "${BINARY_BUILD_DIR}")

set(SPARROW_HEADERS
# arrow_interface
${SPARROW_INCLUDE_DIR}/sparrow/arrow_interface/arrow_array_schema_factory.hpp
${SPARROW_INCLUDE_DIR}/sparrow/arrow_interface/arrow_array_schema_info_utils.hpp
${SPARROW_INCLUDE_DIR}/sparrow/arrow_interface/arrow_array_schema_utils.hpp
${SPARROW_INCLUDE_DIR}/sparrow/arrow_interface/arrow_array.hpp
${SPARROW_INCLUDE_DIR}/sparrow/arrow_interface/arrow_array/deleter.hpp
${SPARROW_INCLUDE_DIR}/sparrow/arrow_interface/arrow_array/private_data.hpp
${SPARROW_INCLUDE_DIR}/sparrow/arrow_interface/arrow_array/smart_pointers.hpp
${SPARROW_INCLUDE_DIR}/sparrow/arrow_interface/arrow_flag_utils.hpp
${SPARROW_INCLUDE_DIR}/sparrow/arrow_interface/arrow_schema.hpp
${SPARROW_INCLUDE_DIR}/sparrow/arrow_interface/arrow_schema/deleter.hpp
${SPARROW_INCLUDE_DIR}/sparrow/arrow_interface/arrow_schema/private_data.hpp
${SPARROW_INCLUDE_DIR}/sparrow/arrow_interface/arrow_schema/smart_pointers.hpp
${SPARROW_INCLUDE_DIR}/sparrow/arrow_interface/arrow_array.hpp
${SPARROW_INCLUDE_DIR}/sparrow/arrow_interface/arrow_array_schema_info_utils.hpp
${SPARROW_INCLUDE_DIR}/sparrow/arrow_interface/arrow_array_schema_utils.hpp
${SPARROW_INCLUDE_DIR}/sparrow/arrow_interface/arrow_flag_utils.hpp
${SPARROW_INCLUDE_DIR}/sparrow/arrow_interface/arrow_schema.hpp
# buffer
${SPARROW_INCLUDE_DIR}/sparrow/buffer/allocator.hpp
${SPARROW_INCLUDE_DIR}/sparrow/buffer/buffer.hpp
Expand All @@ -129,30 +130,36 @@ set(SPARROW_HEADERS
# detail
${SPARROW_INCLUDE_DIR}/sparrow/details/3rdparty/float16_t.hpp
# layout
${SPARROW_INCLUDE_DIR}/sparrow/layout/list_layout/list_array.hpp
${SPARROW_INCLUDE_DIR}/sparrow/layout/list_layout/list_value.hpp
${SPARROW_INCLUDE_DIR}/sparrow/layout/struct_layout/struct_array.hpp
${SPARROW_INCLUDE_DIR}/sparrow/layout/struct_layout/struct_value.hpp
${SPARROW_INCLUDE_DIR}/sparrow/layout/array_base.hpp
${SPARROW_INCLUDE_DIR}/sparrow/layout/dictionary_encoded_array.hpp
${SPARROW_INCLUDE_DIR}/sparrow/layout/dictionary_encoded_array/dictionary_encoded_array_bitmap_iterator.hpp
${SPARROW_INCLUDE_DIR}/sparrow/layout/dictionary_encoded_array/dictionary_encoded_array_bitmap.hpp
${SPARROW_INCLUDE_DIR}/sparrow/layout/dictionary_encoded_array/dictionary_encoded_array_iterator.hpp
${SPARROW_INCLUDE_DIR}/sparrow/layout/dispatch.hpp
${SPARROW_INCLUDE_DIR}/sparrow/layout/layout_iterator.hpp
${SPARROW_INCLUDE_DIR}/sparrow/layout/layout_utils.hpp
${SPARROW_INCLUDE_DIR}/sparrow/layout/list_layout/list_array.hpp
${SPARROW_INCLUDE_DIR}/sparrow/layout/list_layout/list_value.hpp
${SPARROW_INCLUDE_DIR}/sparrow/layout/nested_value_types.hpp
${SPARROW_INCLUDE_DIR}/sparrow/layout/null_array.hpp
${SPARROW_INCLUDE_DIR}/sparrow/layout/primitive_array.hpp
${SPARROW_INCLUDE_DIR}/sparrow/layout/struct_layout/struct_array.hpp
${SPARROW_INCLUDE_DIR}/sparrow/layout/struct_layout/struct_value.hpp
${SPARROW_INCLUDE_DIR}/sparrow/layout/variable_size_binary_array.hpp
# types
${SPARROW_INCLUDE_DIR}/sparrow/types/data_traits.hpp
${SPARROW_INCLUDE_DIR}/sparrow/types/data_type.hpp
# Utils
${SPARROW_INCLUDE_DIR}/sparrow/utils/algorithm.hpp
${SPARROW_INCLUDE_DIR}/sparrow/utils/bit.hpp
${SPARROW_INCLUDE_DIR}/sparrow/utils/buffers.hpp
${SPARROW_INCLUDE_DIR}/sparrow/utils/contracts.hpp
${SPARROW_INCLUDE_DIR}/sparrow/utils/functor_index_iterator.hpp
${SPARROW_INCLUDE_DIR}/sparrow/utils/iterator.hpp
${SPARROW_INCLUDE_DIR}/sparrow/utils/memory.hpp
${SPARROW_INCLUDE_DIR}/sparrow/utils/mp_utils.hpp
${SPARROW_INCLUDE_DIR}/sparrow/utils/nullable.hpp
${SPARROW_INCLUDE_DIR}/sparrow/utils/offsets.hpp
${SPARROW_INCLUDE_DIR}/sparrow/utils/reference_wrapper_utils.hpp
${SPARROW_INCLUDE_DIR}/sparrow/utils/variant_visitor.hpp
# ../
Expand Down
12 changes: 6 additions & 6 deletions include/sparrow/arrow_array_schema_proxy.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,12 @@ namespace sparrow
*/
[[nodiscard]] arrow_proxy view();

[[nodiscard]] ArrowArray& array();
[[nodiscard]] const ArrowArray& array() const;

[[nodiscard]] ArrowSchema& schema();
[[nodiscard]] const ArrowSchema& schema() const;

private:

std::variant<ArrowArray*, ArrowArray> m_array;
Expand Down Expand Up @@ -261,12 +267,6 @@ namespace sparrow

void validate_array_and_schema() const;

[[nodiscard]] ArrowArray& array();
[[nodiscard]] const ArrowArray& array() const;

[[nodiscard]] ArrowSchema& schema();
[[nodiscard]] const ArrowSchema& schema() const;

arrow_schema_private_data* get_schema_private_data();
arrow_array_private_data* get_array_private_data();

Expand Down
126 changes: 126 additions & 0 deletions include/sparrow/arrow_interface/arrow_array_schema_factory.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
// Copyright 2024 Man Group Operations Limited
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <algorithm>
#include <cstdint>
#include <memory>
#include <utility>

#include "sparrow/arrow_interface/arrow_array.hpp"
#include "sparrow/arrow_interface/arrow_schema.hpp"
#include "sparrow/buffer/buffer.hpp"
#include "sparrow/buffer/dynamic_bitset.hpp"
#include "sparrow/c_interface.hpp"
#include "sparrow/types/data_type.hpp"
#include "sparrow/utils/buffers.hpp"
#include "sparrow/utils/offsets.hpp"

namespace sparrow
{
/**
 * Builds the ArrowSchema of a dictionary-encoded array.
 *
 * A schema named "dictionary values" is allocated on the heap and filled
 * from the format of `value_data_type`. It is then passed as the last
 * argument of make_arrow_schema when building the schema named
 * "dictionary keys" from the format of `keys_data_type`.
 *
 * NOTE(review): the last argument of make_arrow_schema is presumed to be
 * the dictionary slot, and the returned schema is presumed to take
 * ownership of the pointer - confirm against make_arrow_schema.
 *
 * @param value_data_type Data type used for the format of the values schema.
 * @param keys_data_type Data type used for the format of the keys schema.
 * @return The keys schema built by make_arrow_schema.
 */
inline ArrowSchema make_dictionary_encoded_arrow_schema(data_type value_data_type, data_type keys_data_type)
{
    // Keep the values schema in a unique_ptr while it is being filled, so the
    // allocation is reclaimed if fill_arrow_schema throws. make_unique also
    // value-initialises the struct instead of leaving its members indeterminate.
    auto values_schema = std::make_unique<ArrowSchema>();
    fill_arrow_schema(
        *values_schema,
        data_type_to_format(value_data_type),
        "dictionary values",
        std::nullopt,
        std::nullopt,
        0,
        nullptr,
        nullptr
    );

    // From here on the raw pointer is handed to make_arrow_schema, exactly as
    // the previous implementation did.
    return make_arrow_schema(
        data_type_to_format(keys_data_type),
        "dictionary keys",
        std::nullopt,
        std::nullopt,
        0,
        nullptr,
        values_schema.release()
    );
}

/**
 * Builds the byte buffer of a bitmap holding `count` bits.
 *
 * The bitmap is created with `count` bits initialised with `true`; the bit
 * at every index listed in `nulls` is then set to `false`.
 *
 * @tparam R Range of integral indices.
 * @param count Number of bits in the bitmap.
 * @param nulls Indices of the bits to clear. Every index must be
 *              non-negative and strictly less than `count`.
 * @return A copy of the bitmap's underlying buffer.
 */
template <std::ranges::range R>
    requires(std::integral<std::ranges::range_value_t<R>>)
buffer<uint8_t> make_bitmap_buffer(size_t count, R&& nulls)
{
    // Validate every index rather than only the largest one: with a signed
    // index type a negative value is never the maximum, yet it converts to a
    // huge unsigned position when handed to the bitmap. The cast below makes
    // such a value fail the bound check. An empty range passes trivially.
    SPARROW_ASSERT_TRUE(std::ranges::all_of(
        nulls,
        [count](const auto index)
        {
            return static_cast<size_t>(index) < count;
        }
    ));

    dynamic_bitset<uint8_t> bitmap(count, true);
    for (const auto index : nulls)
    {
        bitmap.set(static_cast<size_t>(index), false);
    }
    return bitmap.buffer();
}

/**
 * Builds an ArrowArray with three buffers from a range of values: a bitmap
 * buffer, an offset buffer and a buffer produced by strings_to_buffer.
 *
 * @tparam offset_type Offset type forwarded to make_offset_buffer.
 * @param range Values to store. Taken by forwarding reference: an lvalue
 *              argument is passed on as an lvalue, an rvalue as an rvalue.
 * @param nulls Indices (relative to `range`) of the bits cleared in the bitmap.
 * @param offset Offset given to make_arrow_array; the array length is the
 *               size of `range` minus `offset`.
 * @return The ArrowArray built by make_arrow_array.
 *
 * NOTE(review): null_count is the size of `nulls`; this presumably expects
 * the indices to be unique and not to lie before `offset` - confirm.
 */
template <layout_offset offset_type, std::ranges::sized_range Values, std::ranges::sized_range Nulls>
    requires std::integral<std::ranges::range_value_t<Nulls>>
constexpr ArrowArray make_variable_size_binary_arrow_array(Values&& range, Nulls&& nulls, int64_t offset)
{
    const auto element_count = std::ranges::size(range);
    const auto length = static_cast<int64_t>(element_count) - offset;
    const auto null_count = static_cast<int64_t>(std::ranges::size(nulls));

    // Braced initialisation evaluates its elements left to right, so the
    // offsets are computed from `range` before `range` is forwarded away.
    // std::forward (instead of std::move) keeps lvalue arguments intact.
    std::vector<buffer<uint8_t>> value_buffers{
        make_bitmap_buffer(element_count, std::forward<Nulls>(nulls)),
        range_to_buffer(make_offset_buffer<offset_type>(range)),
        strings_to_buffer(std::forward<Values>(range))
    };
    return make_arrow_array(length, null_count, offset, std::move(value_buffers), 0, nullptr, nullptr);
}

/**
 * Builds an ArrowArray with two buffers from a range of arithmetic values:
 * a bitmap buffer and a buffer produced by range_to_buffer.
 *
 * @param range Values to store. Taken by forwarding reference: an lvalue
 *              argument is passed on as an lvalue, an rvalue as an rvalue.
 * @param nulls Indices (relative to `range`) of the bits cleared in the bitmap.
 * @param offset Offset given to make_arrow_array; the array length is the
 *               size of `range` minus `offset`.
 * @return The ArrowArray built by make_arrow_array.
 *
 * NOTE(review): null_count is the size of `nulls`; this presumably expects
 * the indices to be unique and not to lie before `offset` - confirm.
 */
template <std::ranges::sized_range Values, std::ranges::sized_range Nulls>
    requires std::is_arithmetic_v<std::ranges::range_value_t<Values>>
             && std::integral<std::ranges::range_value_t<Nulls>>
constexpr ArrowArray make_primitive_arrow_array(Values&& range, Nulls&& nulls, int64_t offset)
{
    const auto element_count = std::ranges::size(range);
    const int64_t length = static_cast<int64_t>(element_count) - offset;
    const auto null_count = static_cast<int64_t>(std::ranges::size(nulls));

    // std::forward (instead of std::move) so that a caller passing lvalues
    // does not have them silently turned into rvalues.
    std::vector<buffer<uint8_t>> value_buffers{
        make_bitmap_buffer(element_count, std::forward<Nulls>(nulls)),
        range_to_buffer(std::forward<Values>(range))
    };
    return make_arrow_array(length, null_count, offset, std::move(value_buffers), 0, nullptr, nullptr);
}

/**
 * Builds the ArrowArray of a dictionary-encoded array.
 *
 * The keys are stored through make_primitive_arrow_array; the values are
 * stored through make_variable_size_binary_arrow_array with int32_t offsets
 * in a heap-allocated ArrowArray assigned to the `dictionary` member of the
 * keys array.
 *
 * @param keys Integral keys.
 * @param keys_nulls Indices of the null keys.
 * @param keys_offset Offset of the keys array.
 * @param values Dictionary values.
 * @param values_nulls Indices of the null values.
 * @param values_offset Offset of the values array.
 * @return The keys ArrowArray with its `dictionary` member set.
 *
 * NOTE(review): the returned array is presumed to release and free the
 * dictionary through its release callback - confirm against make_arrow_array.
 */
template <
    std::ranges::sized_range Keys,
    std::ranges::sized_range KeyNulls,
    std::ranges::sized_range Values,
    std::ranges::sized_range ValuesNulls>
    requires std::integral<std::ranges::range_value_t<Keys>>
             && std::integral<std::ranges::range_value_t<KeyNulls>>
ArrowArray make_dictionary_encoded_arrow_array(
    Keys&& keys,
    KeyNulls&& keys_nulls,
    int64_t keys_offset,
    Values&& values,
    ValuesNulls&& values_nulls,
    int64_t values_offset
)
{
    // Forward every range with its original value category: rvalues may be
    // consumed, lvalues are left for the caller.
    ArrowArray keys_arrow_array = make_primitive_arrow_array(
        std::forward<Keys>(keys),
        std::forward<KeyNulls>(keys_nulls),
        keys_offset
    );

    // Initialise the heap object directly from the factory result: if the
    // factory throws, the new-expression frees the allocation instead of
    // leaking a half-built ArrowArray.
    keys_arrow_array.dictionary = new ArrowArray(make_variable_size_binary_arrow_array<int32_t>(
        std::forward<Values>(values),
        std::forward<ValuesNulls>(values_nulls),
        values_offset
    ));
    return keys_arrow_array;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -188,11 +188,11 @@ namespace sparrow
buffer_type previous_buffer_type
)
{
constexpr double bit_per_byte = 8.;
constexpr size_t bit_per_byte = 8;
switch (bt)
{
case buffer_type::VALIDITY:
return static_cast<std::size_t>(std::ceil(static_cast<double>(length + offset) / bit_per_byte));
return (length + offset + bit_per_byte - 1) / bit_per_byte;
case buffer_type::DATA:
if (bt == buffer_type::DATA && (dt == data_type::STRING || dt == data_type::BINARY))
{
Expand Down
2 changes: 2 additions & 0 deletions include/sparrow/buffer/buffer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,7 @@ namespace sparrow
constexpr buffer(It first, It last, const A& a = A());

template <std::ranges::input_range Range, allocator A = allocator_type>
requires std::same_as<std::ranges::range_value_t<Range>, T>
constexpr buffer(const Range& range, const A& a = A());

~buffer();
Expand Down Expand Up @@ -444,6 +445,7 @@ namespace sparrow

template <class T>
template <std::ranges::input_range Range, allocator A>
requires std::same_as<std::ranges::range_value_t<Range>, T>
constexpr buffer<T>::buffer(const Range& range, const A& a)
: base_type(check_init_length(static_cast<size_type>(std::ranges::size(range)), a), a)
{
Expand Down
27 changes: 5 additions & 22 deletions include/sparrow/layout/array_base.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,12 @@
#pragma once

#include <ranges>
#include <string>
#include <string_view>

#include "sparrow/arrow_array_schema_proxy.hpp"
#include "sparrow/buffer/dynamic_bitset.hpp"
#include "sparrow/layout/layout_iterator.hpp"
#include "sparrow/utils/nullable.hpp"
#include "sparrow/utils/iterator.hpp"
#include "sparrow/utils/mp_utils.hpp"

namespace sparrow
{
Expand Down Expand Up @@ -92,6 +89,7 @@ namespace sparrow
using bitmap_reference = bitmap_type::reference;
using bitmap_const_reference = bitmap_type::const_reference;
using bitmap_iterator = bitmap_type::iterator;
using bitmap_range = std::ranges::subrange<bitmap_iterator>;
using const_bitmap_iterator = bitmap_type::const_iterator;
using const_bitmap_range = std::ranges::subrange<const_bitmap_iterator>;

Expand Down Expand Up @@ -147,15 +145,10 @@ namespace sparrow

private:

static constexpr std::size_t m_bitmap_buffer_index = 0;

bitmap_type make_bitmap();

derived_type& derived_cast();
const derived_type& derived_cast() const;

arrow_proxy m_proxy;
bitmap_type m_bitmap;

// friend classes
friend class layout_iterator<self_type, false>;
Expand Down Expand Up @@ -260,14 +253,12 @@ namespace sparrow
template <class D>
array_crtp_base<D>::array_crtp_base(arrow_proxy proxy)
: m_proxy(std::move(proxy))
, m_bitmap(make_bitmap())
{
}

template <class D>
array_crtp_base<D>::array_crtp_base(const array_crtp_base& rhs)
: m_proxy(rhs.m_proxy)
, m_bitmap(make_bitmap())
{
}

Expand All @@ -287,20 +278,20 @@ namespace sparrow
auto array_crtp_base<D>::has_value(size_type i) -> bitmap_reference
{
SPARROW_ASSERT_TRUE(i < size());
return m_bitmap[i + static_cast<size_type>(storage().offset())];
return derived_cast().get_bitmap()[static_cast<difference_type>(i)];
}

template <class D>
auto array_crtp_base<D>::has_value(size_type i) const -> bitmap_const_reference
{
SPARROW_ASSERT_TRUE(i < size());
return m_bitmap[i + static_cast<size_type>(storage().offset())];
return derived_cast().get_bitmap()[static_cast<difference_type>(i)];
}

template <class D>
auto array_crtp_base<D>::bitmap_begin() -> bitmap_iterator
{
return sparrow::next(m_bitmap.begin(), storage().offset());
return derived_cast().get_bitmap().begin();
}

template <class D>
Expand All @@ -312,7 +303,7 @@ namespace sparrow
template <class D>
auto array_crtp_base<D>::bitmap_begin() const -> const_bitmap_iterator
{
return sparrow::next(m_bitmap.cbegin(), storage().offset());
return derived_cast().get_bitmap().begin();
}

template <class D>
Expand All @@ -321,14 +312,6 @@ namespace sparrow
return sparrow::next(bitmap_begin(), size());
}

template <class D>
auto array_crtp_base<D>::make_bitmap() -> bitmap_type
{
SPARROW_ASSERT_TRUE(storage().buffers().size() > m_bitmap_buffer_index);
const auto bitmap_size = static_cast<std::size_t>(storage().length() + storage().offset());
return bitmap_type(storage().buffers()[m_bitmap_buffer_index].data(), bitmap_size);
}

template <class D>
auto array_crtp_base<D>::derived_cast() -> derived_type&
{
Expand Down
Loading

0 comments on commit 7bfb98f

Please sign in to comment.