Skip to content

Commit

Permalink
apacheGH-34561: [C++] Implement RunEndEncodedBuilder::AppendEmptyValu…
Browse files Browse the repository at this point in the history
…es() (apache#34562)

* Closes: apache#34561

Authored-by: Felipe Oliveira Carvalho <felipekde@gmail.com>
Signed-off-by: Matt Topol <zotthewizard@gmail.com>
  • Loading branch information
felipecrv authored and rtpsw committed Mar 27, 2023
1 parent 77ab504 commit ce1d3ee
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 16 deletions.
28 changes: 23 additions & 5 deletions cpp/src/arrow/array/array_run_end_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -301,22 +301,40 @@ TEST_P(TestRunEndEncodedArray, Builder) {
R"(["unique", null, "common", "common", "appended", "common", "common", "appended"])"));
continue;
}
if (step == 10) {
ASSERT_EQ(builder->length(), 505);
// Append empty values
ASSERT_OK(builder->AppendEmptyValues(10));
if (step == 11) {
ASSERT_EQ(builder->length(), 515);
ASSERT_OK(BuilderEquals(
*builder, 515, "[1, 3, 105, 165, 205, 305, 405, 505, 515]",
R"(["unique", null, "common", "common", "appended", "common", "common", "appended", ""])"));
continue;
}
// Append NULL after empty
ASSERT_OK(builder->AppendNull());
if (step == 12) {
ASSERT_EQ(builder->length(), 516);
ASSERT_OK(BuilderEquals(
*builder, 516, "[1, 3, 105, 165, 205, 305, 405, 505, 515, 516]",
R"(["unique", null, "common", "common", "appended", "common", "common", "appended", "", null])"));
continue;
}
if (step == 13) {
ASSERT_EQ(builder->length(), 516);
ASSERT_EQ(*builder->type(), *run_end_encoded(run_end_type, utf8()));

auto expected_run_ends =
ArrayFromJSON(run_end_type, "[1, 3, 105, 165, 205, 305, 405, 505]");
ArrayFromJSON(run_end_type, "[1, 3, 105, 165, 205, 305, 405, 505, 515, 516]");
auto expected_values = ArrayFromJSON(
value_type,
R"(["unique", null, "common", "common", "appended", "common", "common", "appended"])");
R"(["unique", null, "common", "common", "appended", "common", "common", "appended", "", null])");

ASSERT_OK_AND_ASSIGN(auto array, builder->Finish());
auto ree_array = std::dynamic_pointer_cast<RunEndEncodedArray>(array);
ASSERT_NE(ree_array, NULLPTR);
ASSERT_ARRAYS_EQUAL(*expected_run_ends, *ree_array->run_ends());
ASSERT_ARRAYS_EQUAL(*expected_values, *ree_array->values());
ASSERT_EQ(array->length(), 505);
ASSERT_EQ(array->length(), 516);
ASSERT_EQ(array->offset(), 0);
break;
}
Expand Down
24 changes: 20 additions & 4 deletions cpp/src/arrow/array/builder_run_end.cc
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,21 @@ Status RunCompressorBuilder::AppendNulls(int64_t length) {
}

/// \brief Append `length` empty (placeholder) values as a run of their own.
///
/// A zero `length` is a no-op. Otherwise the current run is finished, the
/// WillCloseRunOfEmptyValues() hook is invoked with `length`, and a single
/// empty value is appended to the inner builder to stand for the whole run.
///
/// \param length number of logical empty values to append
/// \return the first non-OK status produced by FinishCurrentRun(), the
///         WillCloseRunOfEmptyValues() hook or the inner builder; otherwise
///         Status::OK()
Status RunCompressorBuilder::AppendEmptyValues(int64_t length) {
  if (ARROW_PREDICT_FALSE(length == 0)) {
    return Status::OK();
  }
  // Empty values are usually appended as placeholders for future values, so
  // we make no attempt at making the empty values appended now part of the
  // current run. Each AppendEmptyValues() creates its own run of the given length.
  ARROW_RETURN_NOT_OK(FinishCurrentRun());
  {
    // Notify subclasses first, then materialize the run as exactly one
    // physical empty value in the inner builder.
    ARROW_RETURN_NOT_OK(WillCloseRunOfEmptyValues(length));
    ARROW_RETURN_NOT_OK(inner_builder_->AppendEmptyValue());
    UpdateDimensions();
  }
  // Current run remains cleared after FinishCurrentRun() as we don't want to
  // extend it with empty values potentially coming in the future.
  return Status::OK();
}

Status RunCompressorBuilder::AppendScalar(const Scalar& scalar, int64_t n_repeats) {
Expand Down Expand Up @@ -183,7 +197,10 @@ Status RunEndEncodedBuilder::AppendNulls(int64_t length) {
}

/// \brief Append `length` empty (placeholder) values to the run-end encoded array.
///
/// The work is delegated to the value run builder, which appends the empty
/// values as a run of their own.
///
/// \param length number of logical empty values to append
/// \return the error reported by the value run builder, otherwise Status::OK()
Status RunEndEncodedBuilder::AppendEmptyValues(int64_t length) {
  RETURN_NOT_OK(value_run_builder_->AppendEmptyValues(length));
  // The value run builder never leaves a run open after appending empty values.
  DCHECK_EQ(value_run_builder_->open_run_length(), 0);
  // NOTE(review): the second argument is presumably the open-run length
  // (0 here, matching the DCHECK above) -- confirm against UpdateDimensions().
  UpdateDimensions(committed_logical_length_, 0);
  return Status::OK();
}

Status RunEndEncodedBuilder::AppendScalar(const Scalar& scalar, int64_t n_repeats) {
Expand Down Expand Up @@ -313,8 +330,7 @@ Status RunEndEncodedBuilder::AppendRunEnd(int64_t run_end) {
return Status::OK();
}

Status RunEndEncodedBuilder::CloseRun(const std::shared_ptr<const Scalar>& value,
int64_t run_length) {
Status RunEndEncodedBuilder::CloseRun(int64_t run_length) {
// TODO(felipecrv): gracefully fragment runs bigger than INT32_MAX
if (ARROW_PREDICT_FALSE(run_length > std::numeric_limits<int32_t>::max())) {
return Status::Invalid(
Expand Down
23 changes: 16 additions & 7 deletions cpp/src/arrow/array/builder_run_end.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,15 @@ class RunCompressorBuilder : public ArrayBuilder {
return Status::OK();
}

/// \brief Called right before a run of empty values is being closed
///
/// Subclasses can override this function to perform an additional action when
/// a run of empty values is appended (i.e. run-length is known and a single
/// empty value is appended to the inner builder).
///
/// The default implementation does nothing and returns Status::OK().
///
/// \param length the length of the run of empty values being closed; callers
///               are expected to pass a value greater than 0
/// \return Status::OK() by default; overrides may return an error status
virtual Status WillCloseRunOfEmptyValues(int64_t length) { return Status::OK(); }

/// \brief Allocate enough memory for a given number of array elements.
///
/// NOTE: Conservatively resizing a run-length compressed array for a given
Expand All @@ -103,8 +112,6 @@ class RunCompressorBuilder : public ArrayBuilder {
// Appending a single null is the length-1 case of AppendNulls().
Status AppendNull() final { return AppendNulls(1); }
Status AppendNulls(int64_t length) override;

// Empty values are not merged into the current run: each AppendEmptyValues()
// call with a non-zero length finishes the current run and appends a run of
// its own. AppendEmptyValue() is the length-1 case.
Status AppendEmptyValue() final { return AppendEmptyValues(1); }
Status AppendEmptyValues(int64_t length) override;

Expand Down Expand Up @@ -179,9 +186,12 @@ class ARROW_EXPORT RunEndEncodedBuilder : public ArrayBuilder {

~ValueRunBuilder() override = default;

// Forwards the closing of a run to the owning RunEndEncodedBuilder. The run's
// value is intentionally unnamed and unused: CloseRun() takes only the run
// length.
Status WillCloseRun(const std::shared_ptr<const Scalar>&, int64_t length) override {
  return ree_builder_.CloseRun(length);
}

// Runs of empty values are closed exactly like runs of regular values: only
// the run length is forwarded to the owning builder's CloseRun().
Status WillCloseRunOfEmptyValues(int64_t length) override {
return ree_builder_.CloseRun(length);
}

private:
Expand Down Expand Up @@ -276,8 +286,7 @@ class ARROW_EXPORT RunEndEncodedBuilder : public ArrayBuilder {
/// length_ to reflect the new run.
///
/// Pre-condition: run_length > 0.
[[nodiscard]] Status CloseRun(const std::shared_ptr<const Scalar>& value,
int64_t run_length);
[[nodiscard]] Status CloseRun(int64_t run_length);

ArrayBuilder& run_end_builder();
ArrayBuilder& value_builder();
Expand Down

0 comments on commit ce1d3ee

Please sign in to comment.