Skip to content

Commit

Permalink
Merge remote-tracking branch 'ad-freiburg/master' into join-refactoring
Browse files Browse the repository at this point in the history
  • Loading branch information
RobinTF committed Oct 18, 2024
2 parents 1eee97f + 2b4e6b3 commit 6a575bf
Show file tree
Hide file tree
Showing 22 changed files with 1,862 additions and 1,328 deletions.
2 changes: 1 addition & 1 deletion src/engine/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,5 @@ add_library(engine
VariableToColumnMap.cpp ExportQueryExecutionTrees.cpp
CartesianProductJoin.cpp TextIndexScanForWord.cpp TextIndexScanForEntity.cpp
TextLimit.cpp LazyGroupBy.cpp GroupByHashMapOptimization.cpp SpatialJoin.cpp
CountConnectedSubgraphs.cpp)
CountConnectedSubgraphs.cpp SpatialJoinAlgorithms.cpp)
qlever_target_link_libraries(engine util index parser sparqlExpressions http SortPerformanceEstimator Boost::iostreams s2)
248 changes: 29 additions & 219 deletions src/engine/SpatialJoin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include <tuple>

#include "engine/ExportQueryExecutionTrees.h"
#include "engine/SpatialJoinAlgorithms.h"
#include "engine/VariableToColumnMap.h"
#include "engine/idTable/IdTable.h"
#include "global/Constants.h"
Expand Down Expand Up @@ -230,7 +231,19 @@ size_t SpatialJoin::getCostEstimate() {
// ____________________________________________________________________________
uint64_t SpatialJoin::getSizeEstimateBeforeLimit() {
if (childLeft_ && childRight_) {
return childLeft_->getSizeEstimate() * childRight_->getSizeEstimate();
// If we limit the number of results to k, even in the largest scenario, the
// result can be at most `|childLeft| * k`
auto maxResults = getMaxResults();
if (maxResults.has_value()) {
return childLeft_->getSizeEstimate() * maxResults.value();
}

// If we don't limit the number of results, we cannot draw conclusions about
// the size, other than the worst case `|childLeft| * |childRight|`. However
// to improve query planning for the average case, we apply a constant
// factor (the asymptotic behavior remains unchanged).
return (childLeft_->getSizeEstimate() * childRight_->getSizeEstimate()) /
SPATIAL_JOIN_MAX_DIST_SIZE_ESTIMATE;
}
return 1; // dummy return if not both children are added
}
Expand Down Expand Up @@ -262,7 +275,9 @@ float SpatialJoin::getMultiplicity(size_t col) {
column -= childLeft_->getResultWidth();
}
auto distinctnessChild = getDistinctness(child, column);
return static_cast<float>(getSizeEstimate()) / distinctnessChild;
return static_cast<float>(childLeft_->getSizeEstimate() *
childRight_->getSizeEstimate()) /
distinctnessChild;
} else {
return 1;
}
Expand All @@ -287,70 +302,7 @@ vector<ColumnIndex> SpatialJoin::resultSortedOn() const {
}

// ____________________________________________________________________________
std::optional<GeoPoint> SpatialJoin::getPoint(const IdTable* restable,
size_t row,
ColumnIndex col) const {
auto id = restable->at(row, col);
return id.getDatatype() == Datatype::GeoPoint
? std::optional{id.getGeoPoint()}
: std::nullopt;
};

// ____________________________________________________________________________
Id SpatialJoin::computeDist(const IdTable* idTableLeft,
const IdTable* idTableRight, size_t rowLeft,
size_t rowRight, ColumnIndex leftPointCol,
ColumnIndex rightPointCol) const {
auto point1 = getPoint(idTableLeft, rowLeft, leftPointCol);
auto point2 = getPoint(idTableRight, rowRight, rightPointCol);
if (!point1.has_value() || !point2.has_value()) {
return Id::makeUndefined();
}
return Id::makeFromInt(
ad_utility::detail::wktDistImpl(point1.value(), point2.value()) * 1000);
}

// ____________________________________________________________________________
void SpatialJoin::addResultTableEntry(IdTable* result,
const IdTable* idTableLeft,
const IdTable* idTableRight,
size_t rowLeft, size_t rowRight,
Id distance) const {
// this lambda function copies elements from copyFrom
// into the table res. It copies them into the row
// rowIndRes and column column colIndRes. It returns the column number
// until which elements were copied
auto addColumns = [](IdTable* res, const IdTable* copyFrom, size_t rowIndRes,
size_t colIndRes, size_t rowIndCopy) {
size_t col = 0;
while (col < copyFrom->numColumns()) {
res->at(rowIndRes, colIndRes) = (*copyFrom).at(rowIndCopy, col);
colIndRes += 1;
col += 1;
}
return colIndRes;
};

auto resrow = result->numRows();
result->emplace_back();
// add columns to result table
size_t rescol = 0;
rescol = addColumns(result, idTableLeft, resrow, rescol, rowLeft);
rescol = addColumns(result, idTableRight, resrow, rescol, rowRight);

if (addDistToResult_) {
result->at(resrow, rescol) = distance;
// rescol isn't used after that in this function, but future updates,
// which add additional columns, would need to remember to increase
// rescol at this place otherwise. If they forget to do this, the
// distance column will be overwritten, the variableToColumnMap will
// not work and so on
// rescol += 1;
}
}

// ____________________________________________________________________________
SpatialJoin::PreparedJoinParams SpatialJoin::prepareJoin() const {
PreparedSpatialJoinParams SpatialJoin::prepareJoin() const {
auto getIdTable = [](std::shared_ptr<QueryExecutionTree> child) {
std::shared_ptr<const Result> resTable = child->getResult();
auto idTablePtr = &resTable->idTable();
Expand All @@ -365,172 +317,30 @@ SpatialJoin::PreparedJoinParams SpatialJoin::prepareJoin() const {
};

// Input tables
const auto [idTableLeft, resultLeft] = getIdTable(childLeft_);
const auto [idTableRight, resultRight] = getIdTable(childRight_);
auto [idTableLeft, resultLeft] = getIdTable(childLeft_);
auto [idTableRight, resultRight] = getIdTable(childRight_);

// Input table columns for the join
ColumnIndex leftJoinCol = getJoinCol(childLeft_, leftChildVariable_);
ColumnIndex rightJoinCol = getJoinCol(childRight_, rightChildVariable_);

// Size of output table
size_t numColumns = getResultWidth();
return PreparedJoinParams{
idTableLeft, std::move(resultLeft), idTableRight, std::move(resultRight),
leftJoinCol, rightJoinCol, numColumns};
}

// ____________________________________________________________________________
Result SpatialJoin::baselineAlgorithm() {
const auto [idTableLeft, resultLeft, idTableRight, resultRight, leftJoinCol,
rightJoinCol, numColumns] = prepareJoin();
IdTable result{numColumns, _executionContext->getAllocator()};
auto maxDist = getMaxDist();
auto maxResults = getMaxResults();

// cartesian product between the two tables, pairs are restricted according to
// `maxDistance_` and `maxResults_`
for (size_t rowLeft = 0; rowLeft < idTableLeft->size(); rowLeft++) {
// This priority queue stores the intermediate best results if `maxResults_`
// is used. Each intermediate result is stored as a pair of its `rowRight`
// and distance. Since the queue will hold at most `maxResults_ + 1` items,
// it is not a memory concern.
auto compare = [](std::pair<size_t, int64_t> a,
std::pair<size_t, int64_t> b) {
return a.second < b.second;
};
std::priority_queue<std::pair<size_t, int64_t>,
std::vector<std::pair<size_t, int64_t>>,
decltype(compare)>
intermediate(compare);

// Inner loop of cartesian product
for (size_t rowRight = 0; rowRight < idTableRight->size(); rowRight++) {
Id dist = computeDist(idTableLeft, idTableRight, rowLeft, rowRight,
leftJoinCol, rightJoinCol);

// Ensure `maxDist_` constraint
if (dist.getDatatype() != Datatype::Int ||
(maxDist.has_value() &&
dist.getInt() > static_cast<int64_t>(maxDist.value()))) {
continue;
}

// If there is no `maxResults_` we can add the result row immediately
if (!maxResults.has_value()) {
addResultTableEntry(&result, idTableLeft, idTableRight, rowLeft,
rowRight, dist);
continue;
}

// Ensure `maxResults_` constraint using priority queue
intermediate.push(std::pair{rowRight, dist.getInt()});
// Too many results? Drop the worst one
if (intermediate.size() > maxResults.value()) {
intermediate.pop();
}
}

// If we are using the priority queue, we didn't add the results in the
// inner loop, so we do it now.
if (maxResults.has_value()) {
size_t numResults = intermediate.size();
for (size_t item = 0; item < numResults; item++) {
// Get and remove largest item from priority queue
auto [rowRight, dist] = intermediate.top();
intermediate.pop();

addResultTableEntry(&result, idTableLeft, idTableRight, rowLeft,
rowRight, Id::makeFromInt(dist));
}
}
}
return Result(std::move(result), std::vector<ColumnIndex>{},
Result::getMergedLocalVocab(*resultLeft, *resultRight));
}

// ____________________________________________________________________________
Result SpatialJoin::s2geometryAlgorithm() {
const auto [idTableLeft, resultLeft, idTableRight, resultRight, leftJoinCol,
rightJoinCol, numColumns] = prepareJoin();
IdTable result{numColumns, _executionContext->getAllocator()};
auto maxDist = getMaxDist();
auto maxResults = getMaxResults();

// Helper function to convert `GeoPoint` to `S2Point`
auto constexpr toS2Point = [](const GeoPoint& p) {
auto lat = p.getLat();
auto lng = p.getLng();
auto latlng = S2LatLng::FromDegrees(lat, lng);
return S2Point{latlng};
};

S2PointIndex<size_t> s2index;

// Optimization: If we only search by maximum distance, the operation is
// symmetric, so the larger table can be used for the index
bool indexOfRight =
(maxResults.has_value() || (idTableLeft->size() > idTableRight->size()));
auto indexTable = indexOfRight ? idTableRight : idTableLeft;
auto indexJoinCol = indexOfRight ? rightJoinCol : leftJoinCol;

// Populate the index
for (size_t row = 0; row < indexTable->size(); row++) {
auto p = getPoint(indexTable, row, indexJoinCol);
if (p.has_value()) {
s2index.Add(toS2Point(p.value()), row);
}
}

// Performs a nearest neighbor search on the index and returns the closest
// points that satisfy the criteria given by `maxDist_` and `maxResults_`.

// Construct a query object with the given constraints
auto s2query = S2ClosestPointQuery<size_t>{&s2index};

if (maxResults.has_value()) {
s2query.mutable_options()->set_max_results(
static_cast<int>(maxResults.value()));
}
if (maxDist.has_value()) {
s2query.mutable_options()->set_inclusive_max_distance(S2Earth::ToAngle(
util::units::Meters(static_cast<float>(maxDist.value()))));
}

auto searchTable = indexOfRight ? idTableLeft : idTableRight;
auto searchJoinCol = indexOfRight ? leftJoinCol : rightJoinCol;

// Use the index to lookup the points of the other table
for (size_t searchRow = 0; searchRow < searchTable->size(); searchRow++) {
auto p = getPoint(searchTable, searchRow, searchJoinCol);
if (!p.has_value()) {
continue;
}
auto s2target =
S2ClosestPointQuery<size_t>::PointTarget{toS2Point(p.value())};

for (const auto& neighbor : s2query.FindClosestPoints(&s2target)) {
// In this loop we only receive points that already satisfy the given
// criteria
auto indexRow = neighbor.data();
auto dist = static_cast<int64_t>(S2Earth::ToMeters(neighbor.distance()));

auto rowLeft = indexOfRight ? searchRow : indexRow;
auto rowRight = indexOfRight ? indexRow : searchRow;
addResultTableEntry(&result, idTableLeft, idTableRight, rowLeft, rowRight,
Id::makeFromInt(dist));
}
}

return Result(std::move(result), std::vector<ColumnIndex>{},
Result::getMergedLocalVocab(*resultLeft, *resultRight));
return PreparedSpatialJoinParams{idTableLeft, std::move(resultLeft),
idTableRight, std::move(resultRight),
leftJoinCol, rightJoinCol,
numColumns, getMaxDist(),
getMaxResults()};
}

// ____________________________________________________________________________
Result SpatialJoin::computeResult([[maybe_unused]] bool requestLaziness) {
SpatialJoinAlgorithms algorithms{_executionContext, prepareJoin(),
addDistToResult_, config_};
if (useBaselineAlgorithm_) {
return baselineAlgorithm();
return algorithms.BaselineAlgorithm();
} else {
return s2geometryAlgorithm();
return algorithms.S2geometryAlgorithm();
}
}

Expand Down
51 changes: 14 additions & 37 deletions src/engine/SpatialJoin.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,19 @@ struct MaxDistanceConfig {
size_t maxDist_;
};

// helper struct to improve readability in prepareJoin()
struct PreparedSpatialJoinParams {
const IdTable* const idTableLeft_;
std::shared_ptr<const Result> resultLeft_;
const IdTable* const idTableRight_;
std::shared_ptr<const Result> resultRight_;
ColumnIndex leftJoinCol_;
ColumnIndex rightJoinCol_;
size_t numColumns_;
std::optional<size_t> maxDist_;
std::optional<size_t> maxResults_;
};

// This class is implementing a SpatialJoin operation. This operations joins
// two tables, using their positional column. It supports nearest neighbor
// search as well as search of all points within a given range.
Expand Down Expand Up @@ -107,44 +120,8 @@ class SpatialJoin : public Operation {
// helper function, which parses a triple and populates config_
void parseConfigFromTriple();

// helper function which returns a GeoPoint if the element of the given table
// represents a GeoPoint
std::optional<GeoPoint> getPoint(const IdTable* restable, size_t row,
ColumnIndex col) const;

// helper function, which computes the distance of two points, where each
// point comes from a different result table
Id computeDist(const IdTable* resLeft, const IdTable* resRight,
size_t rowLeft, size_t rowRight, ColumnIndex leftPointCol,
ColumnIndex rightPointCol) const;

// Helper function, which adds a row, which belongs to the result to the
// result table. As inputs it uses a row of the left and a row of the right
// child result table.
void addResultTableEntry(IdTable* result, const IdTable* resultLeft,
const IdTable* resultRight, size_t rowLeft,
size_t rowRight, Id distance) const;

// helper struct to improve readability in prepareJoin()
struct PreparedJoinParams {
const IdTable* const idTableLeft_;
std::shared_ptr<const Result> resultLeft_;
const IdTable* const idTableRight_;
std::shared_ptr<const Result> resultRight_;
ColumnIndex leftJoinCol_;
ColumnIndex rightJoinCol_;
size_t numColumns_;
};

// helper function, to initialize various required objects for both algorithms
PreparedJoinParams prepareJoin() const;

// the baseline algorithm, which just checks every combination
Result baselineAlgorithm();

// the improved algorithm, which constructs a S2PointIndex, which is usually
// much faster, however requires that the right relation fits into memory
Result s2geometryAlgorithm();
PreparedSpatialJoinParams prepareJoin() const;

SparqlTriple triple_;
Variable leftChildVariable_;
Expand Down
Loading

0 comments on commit 6a575bf

Please sign in to comment.