forked from redpanda-data/redpanda
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request redpanda-data#23872 from andrwng/datalake-partitioning-writer
datalake: add partitioning_writer
- Loading branch information
Showing
17 changed files
with
370 additions
and
20 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,98 @@ | ||
/* | ||
* Copyright 2024 Redpanda Data, Inc. | ||
* | ||
* Licensed as a Redpanda Enterprise file under the Redpanda Community | ||
* License (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* https://github.com/redpanda-data/redpanda/blob/master/licenses/rcl.md | ||
*/ | ||
#include "datalake/partitioning_writer.h" | ||
|
||
#include "base/vlog.h" | ||
#include "datalake/data_writer_interface.h" | ||
#include "datalake/logger.h" | ||
#include "datalake/table_definition.h" | ||
#include "iceberg/struct_accessor.h" | ||
|
||
#include <exception> | ||
|
||
namespace datalake { | ||
|
||
namespace { | ||
const auto hourly_spec = hour_partition_spec(); | ||
const auto default_schema = schemaless_struct_type(); | ||
const auto default_accessors = iceberg::struct_accessor::from_struct_type( | ||
default_schema); | ||
} // namespace | ||
|
||
// Routes `val` to the data writer that owns its partition, creating that
// writer through the factory the first time a partition key is seen.
//
// Returns:
//  - parquet_conversion_error if computing the partition key throws,
//  - the factory's error if a new writer could not be created,
//  - otherwise whatever the underlying writer's add_data_struct() returns.
ss::future<data_writer_error>
partitioning_writer::add_data(iceberg::struct_value val, int64_t approx_size) {
    iceberg::partition_key key;
    try {
        key = iceberg::partition_key::create(
          val, default_accessors, hourly_spec);
    } catch (...) {
        vlog(
          datalake_log.error,
          "Error {} while partitioning value: {}",
          std::current_exception(),
          val);
        co_return data_writer_error::parquet_conversion_error;
    }

    auto it = writers_.find(key);
    if (it == writers_.end()) {
        // No writer for this partition yet: ask the factory for one.
        auto created = co_await writer_factory_.create_writer(type_);
        if (created.has_error()) {
            vlog(
              datalake_log.error,
              "Failed to create new writer: {}",
              created.error());
            co_return created.error();
        }
        // The map keeps its own copy of the key alongside the new writer.
        it = writers_.emplace(key.copy(), std::move(created.value())).first;
    }

    auto status = co_await it->second->add_data_struct(
      std::move(val), approx_size);
    if (status != data_writer_error::ok) {
        vlog(datalake_log.error, "Failed to add data: {}", status);
    }
    // Success and failure alike are reported through the same status value.
    co_return status;
}
|
||
// Finishes every per-partition writer and gathers the files they produced,
// stamping each file's `hour` with the value read from the first field of its
// partition key.
//
// Every writer is finished even when some of them fail. If any writer failed,
// the first error encountered is returned instead of the file list.
ss::future<result<chunked_vector<local_file_metadata>, data_writer_error>>
partitioning_writer::finish() && {
    auto first_error = data_writer_error::ok;
    chunked_vector<local_file_metadata> finished_files;
    // TODO: parallelize me!
    for (auto& entry : writers_) {
        // Read the hour out of the partition key before finishing the writer.
        const auto& key_field = std::get<iceberg::primitive_value>(
          *entry.first.val->fields[0]);
        const int hour = std::get<iceberg::int_value>(key_field).val;

        auto finish_res = co_await entry.second->finish();
        if (!finish_res.has_error()) {
            auto& file = finish_res.value();
            file.hour = hour;
            finished_files.emplace_back(std::move(file));
            continue;
        }

        vlog(
          datalake_log.error,
          "Failed to finish writer: {}",
          finish_res.error());
        // Remember only the first failure, but keep iterating so that the
        // remaining writers still get closed.
        if (first_error == data_writer_error::ok) {
            first_error = finish_res.error();
        }
    }

    if (first_error == data_writer_error::ok) {
        co_return finished_files;
    }
    co_return first_error;
}
|
||
} // namespace datalake |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
/* | ||
* Copyright 2024 Redpanda Data, Inc. | ||
* | ||
* Licensed as a Redpanda Enterprise file under the Redpanda Community | ||
* License (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* https://github.com/redpanda-data/redpanda/blob/master/licenses/rcl.md | ||
*/ | ||
#pragma once | ||
|
||
#include "container/chunked_hash_map.h" | ||
#include "datalake/data_writer_interface.h" | ||
#include "iceberg/datatypes.h" | ||
#include "iceberg/partition_key.h" | ||
#include "iceberg/values.h" | ||
|
||
namespace iceberg { | ||
struct struct_type; | ||
} // namespace iceberg | ||
|
||
namespace datalake { | ||
|
||
// A wrapper around multiple data writers that all share the same schema and | ||
// are partitioned by an Iceberg partition key. | ||
// | ||
// Uses the default partition spec to partition. As such, this class expects | ||
// that schemas and values given as inputs are constructed with the default | ||
// ("schemaless") schema and fields at the front. | ||
class partitioning_writer { | ||
public: | ||
explicit partitioning_writer( | ||
data_writer_factory& factory, iceberg::struct_type type) | ||
: writer_factory_(factory) | ||
, type_(std::move(type)) {} | ||
|
||
// Adds the given value to the writer corresponding to the value's | ||
// partition key. | ||
// | ||
// Expects that the input value abides by the schema denoted by `type_`. | ||
ss::future<data_writer_error> | ||
add_data(iceberg::struct_value, int64_t approx_size); | ||
|
||
// Finishes and returns the list of local files written by the underlying | ||
// writers, with the appropriate partitioning metadata filled in. | ||
ss::future<result<chunked_vector<local_file_metadata>, data_writer_error>> | ||
finish() &&; | ||
|
||
private: | ||
// Factory for data writers. | ||
data_writer_factory& writer_factory_; | ||
|
||
// The Iceberg message type for the underlying writer. Expected to include | ||
// Redpanda-specific fields, e.g. a timestamp field for partitioning. | ||
const iceberg::struct_type type_; | ||
|
||
// Map of partition keys to their corresponding data file writers. | ||
chunked_hash_map<iceberg::partition_key, std::unique_ptr<data_writer>> | ||
writers_; | ||
}; | ||
|
||
} // namespace datalake |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.