Implement async update method. Improve the performance of update by parallelising reads. #2087
base: master
Changes from all commits: f583136, 1242013, 05b489f, 6c0d5c2, bc55784, 6443c08, 646cc1d, 229d597, 7043683, d87193e, 3a9cec2, 27cca12, 083f4be, c459c73, 05e7965
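The headline claim is that `update` gets faster by parallelising reads: issue every read as a future up front, then wait once for the whole batch instead of blocking on each read in turn. A minimal sketch of that pattern, using generic folly code with hypothetical `Segment`/`read_async` names rather than ArcticDB's actual Store API:

```cpp
#include <folly/executors/CPUThreadPoolExecutor.h>
#include <folly/futures/Future.h>
#include <cstdint>
#include <vector>

// Hypothetical stand-in for a stored segment (illustration only).
struct Segment { std::int64_t key; };

// Simulated asynchronous read: the work runs on an executor, so many reads
// can be in flight at once instead of completing one .get() at a time.
folly::Future<Segment> read_async(folly::CPUThreadPoolExecutor& ex, std::int64_t key) {
    return folly::via(&ex, [key] { return Segment{key}; });
}

int main() {
    folly::CPUThreadPoolExecutor ex{4};
    std::vector<folly::Future<Segment>> futs;
    futs.reserve(8);
    for (std::int64_t k = 0; k < 8; ++k)
        futs.push_back(read_async(ex, k));  // enqueue every read before waiting
    // Block once for the whole batch rather than once per read.
    auto segments = folly::collect(std::move(futs)).get();
    return segments.size() == 8 ? 0 : 1;
}
```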
```diff
@@ -61,8 +61,7 @@ void foreach_active_bit(const util::BitSet &bs, C &&visitor) {
     }
 }
 
-template<typename ContainerType>
-std::vector<SliceAndKey> filter_index(const ContainerType &container, std::optional<CombinedQuery<ContainerType>> &&query) {
+inline std::vector<SliceAndKey> filter_index(const index::IndexSegmentReader& container, std::optional<CombinedQuery<index::IndexSegmentReader>> &&query) {
     ARCTICDB_SAMPLE_DEFAULT(FilterIndex)
     std::vector<SliceAndKey> output{};
     if (container.size() > 0) {
```

Review comment on the new `filter_index` signature: "I assume the template parameter always resolved to `index::IndexSegmentReader`?"
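A side note on why `inline` appears here: a function template defined in a header is exempt from the one-definition rule, but its concrete, de-templated replacement is not, so it must be marked `inline` to remain header-defined. A minimal illustration with hypothetical names (the rationale is assumed, not stated in the PR):

```cpp
// sketch.hpp -- why the de-templated function gains `inline`.
#pragma once
#include <cstddef>
#include <vector>

// A function template may be defined in every translation unit that includes
// this header; the linker folds the duplicate instantiations.
template <typename Container>
std::size_t element_count(const Container& c) { return c.size(); }

// A plain function defined in a header breaks the one-definition rule as soon
// as two .cpp files include it -- unless it is declared inline.
inline std::size_t element_count(const std::vector<int>& c) { return c.size(); }
```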
@@ -16,20 +16,21 @@ (resulting state shown; the hunk introduces `<ranges>` and the `ranges` namespace alias used below)

```cpp
#include <arcticdb/stream/aggregator.hpp>
#include <arcticdb/entity/protobufs.hpp>
#include <arcticdb/util/variant.hpp>
#include <arcticdb/python/python_types.hpp>
#include <arcticdb/pipeline/frame_utils.hpp>
#include <arcticdb/pipeline/write_frame.hpp>
#include <arcticdb/stream/append_map.hpp>
#include <arcticdb/async/task_scheduler.hpp>
#include <arcticdb/util/format_date.hpp>
#include <vector>
#include <array>
#include <ranges>

namespace arcticdb::pipelines {

using namespace arcticdb::entity;
using namespace arcticdb::stream;
namespace ranges = std::ranges;

WriteToSegmentTask::WriteToSegmentTask(
    std::shared_ptr<InputTensorFrame> frame,
```
```diff
@@ -252,40 +253,41 @@ static RowRange partial_rewrite_row_range(
     }
 }
 
-std::optional<SliceAndKey> rewrite_partial_segment(
+folly::Future<std::optional<SliceAndKey>> async_rewrite_partial_segment(
     const SliceAndKey& existing,
     const IndexRange& index_range,
     VersionId version_id,
     AffectedSegmentPart affected_part,
     const std::shared_ptr<Store>& store) {
-    const auto& key = existing.key();
-    auto kv = store->read(key).get();
-    const SegmentInMemory& segment = kv.second;
-    const RowRange affected_row_range = partial_rewrite_row_range(segment, index_range, affected_part);
-    const int64_t num_rows = affected_row_range.end() - affected_row_range.start();
-    if (num_rows <= 0) {
-        return std::nullopt;
-    }
-    SegmentInMemory output = segment.truncate(affected_row_range.start(), affected_row_range.end(), true);
-    const IndexValue start_ts = TimeseriesIndex::start_value_for_segment(output);
-    // +1 as in the key we store one nanosecond greater than the last index value in the segment
-    const IndexValue end_ts = std::get<NumericIndex>(TimeseriesIndex::end_value_for_segment(output)) + 1;
-    FrameSlice new_slice{
-        std::make_shared<StreamDescriptor>(output.descriptor()),
-        existing.slice_.col_range,
-        RowRange{0, num_rows},
-        existing.slice_.hash_bucket(),
-        existing.slice_.num_buckets()};
-
-    auto fut_key = store->write(
-        key.type(),
-        version_id,
-        key.id(),
-        start_ts,
-        end_ts,
-        std::move(output)
-    );
-    return SliceAndKey{std::move(new_slice), std::get<AtomKey>(std::move(fut_key).get())};
+    return store->read(existing.key()).thenValue([=](std::pair<VariantKey, SegmentInMemory>&& key_segment) {
+        const auto& key = existing.key();
+        const SegmentInMemory& segment = key_segment.second;
+        const RowRange affected_row_range = partial_rewrite_row_range(segment, index_range, affected_part);
+        const int64_t num_rows = affected_row_range.end() - affected_row_range.start();
+        if (num_rows <= 0) {
+            return folly::Future<std::optional<SliceAndKey>>{std::nullopt};
+        }
+        SegmentInMemory output = segment.truncate(affected_row_range.start(), affected_row_range.end(), true);
+        const IndexValue start_ts = TimeseriesIndex::start_value_for_segment(output);
+        // +1 as in the key we store one nanosecond greater than the last index value in the segment
+        const IndexValue end_ts = std::get<NumericIndex>(TimeseriesIndex::end_value_for_segment(output)) + 1;
+        FrameSlice new_slice{
+            std::make_shared<StreamDescriptor>(output.descriptor()),
+            existing.slice_.col_range,
+            RowRange{0, num_rows},
+            existing.slice_.hash_bucket(),
+            existing.slice_.num_buckets()};
+        return store->write(
+            key.type(),
+            version_id,
+            key.id(),
+            start_ts,
+            end_ts,
+            std::move(output)
+        ).thenValueInline([new_slice=std::move(new_slice)](VariantKey&& k) {
+            return std::make_optional<SliceAndKey>(std::move(new_slice), std::get<AtomKey>(std::move(k)));
+        });
+    });
 }
 
 std::vector<SliceAndKey> flatten_and_fix_rows(const std::array<std::vector<SliceAndKey>, 5>& groups, size_t& global_count) {
```

Review thread on the `[=]` capture in the `thenValue` continuation:

- Reviewer: "Capture by copy?"
- Reviewer: "Could also be a …"
- Author: "Same as #2087 (comment): the implementation is in the lambda and it uses all the variables, so I need to pass them to the future. I can't capture by reference, as the captures would be dead by the time this is taken off the queue, after the function has returned."

Review comment on `return folly::Future<std::optional<SliceAndKey>>{std::nullopt};`: "I don't think this needs …"
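The capture question above is a lifetime question: `async_rewrite_partial_segment` returns as soon as the continuation is chained, so anything captured by reference would dangle by the time the scheduled task runs. A minimal sketch of the hazard and the fix, using generic folly code with a hypothetical `label` variable rather than the PR's actual captures:

```cpp
#include <folly/executors/CPUThreadPoolExecutor.h>
#include <folly/futures/Future.h>
#include <string>

// The enclosing function returns immediately after chaining the continuation,
// so reference captures would outlive the local they refer to.
folly::Future<std::size_t> label_length(folly::CPUThreadPoolExecutor& ex) {
    std::string label = "segment-42";  // local: destroyed when this function returns

    // Unsafe variant (do not do this): [&label] may run after `label` is gone.
    // return folly::via(&ex, [] {}).thenValue([&label](auto) { return label.size(); });

    // Safe: move (or copy) the state into the lambda, so it lives exactly as
    // long as the continuation itself.
    return folly::via(&ex, [] {}).thenValue(
        [label = std::move(label)](auto) { return label.size(); });
}

int main() {
    folly::CPUThreadPoolExecutor ex{1};
    return label_length(ex).get() == 10 ? 0 : 1;  // "segment-42" has 10 chars
}
```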
```diff
@@ -301,10 +303,9 @@ std::vector<SliceAndKey> flatten_and_fix_rows(const std::array<std::vector<SliceAndKey>, 5>& groups, size_t& global_count) {
         return std::max(a, sk.slice_.row_range.second);
     });
 
-    std::transform(std::begin(group), std::end(group), std::back_inserter(output), [&](SliceAndKey sk) {
+    ranges::transform(group, std::back_inserter(output), [&](SliceAndKey sk) {
         auto range_start = global_count + (sk.slice_.row_range.first - group_start);
-        auto new_range = RowRange{range_start, range_start + (sk.slice_.row_range.diff())};
-        sk.slice_.row_range = new_range;
+        sk.slice_.row_range = RowRange{range_start, range_start + (sk.slice_.row_range.diff())};
         return sk;
     });
 
```
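For reference, the range overload is a behavior-preserving rewrite: it takes the container directly instead of an iterator pair. A standalone equivalence check:

```cpp
#include <algorithm>
#include <iterator>
#include <ranges>
#include <vector>

int main() {
    const std::vector<int> in{1, 2, 3};
    std::vector<int> a, b;
    auto twice = [](int x) { return x * 2; };
    // Iterator-pair form, as the old code wrote it:
    std::transform(std::begin(in), std::end(in), std::back_inserter(a), twice);
    // Range form, as the PR now writes it:
    std::ranges::transform(in, std::back_inserter(b), twice);
    return a == b ? 0 : 1;  // both produce {2, 4, 6}
}
```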
Review comment: "I think we should leave this implementation using `read_sync`, so that the scheduling overhead can be avoided if necessary."