Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Separate the implementation of the SpatialJoin operation into a separate class #1552

Merged
merged 9 commits into from
Oct 17, 2024
2 changes: 1 addition & 1 deletion src/engine/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,5 @@ add_library(engine
VariableToColumnMap.cpp ExportQueryExecutionTrees.cpp
CartesianProductJoin.cpp TextIndexScanForWord.cpp TextIndexScanForEntity.cpp
TextLimit.cpp LazyGroupBy.cpp GroupByHashMapOptimization.cpp SpatialJoin.cpp
CountConnectedSubgraphs.cpp)
CountConnectedSubgraphs.cpp SpatialJoinAlgorithms.cpp)
qlever_target_link_libraries(engine util index parser sparqlExpressions http SortPerformanceEstimator Boost::iostreams s2)
230 changes: 13 additions & 217 deletions src/engine/SpatialJoin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include <tuple>

#include "engine/ExportQueryExecutionTrees.h"
#include "engine/SpatialJoinAlgorithms.h"
#include "engine/VariableToColumnMap.h"
#include "engine/idTable/IdTable.h"
#include "global/Constants.h"
Expand Down Expand Up @@ -287,70 +288,7 @@ vector<ColumnIndex> SpatialJoin::resultSortedOn() const {
}

// ____________________________________________________________________________
std::optional<GeoPoint> SpatialJoin::getPoint(const IdTable* restable,
size_t row,
ColumnIndex col) const {
auto id = restable->at(row, col);
return id.getDatatype() == Datatype::GeoPoint
? std::optional{id.getGeoPoint()}
: std::nullopt;
};

// ____________________________________________________________________________
Id SpatialJoin::computeDist(const IdTable* idTableLeft,
const IdTable* idTableRight, size_t rowLeft,
size_t rowRight, ColumnIndex leftPointCol,
ColumnIndex rightPointCol) const {
auto point1 = getPoint(idTableLeft, rowLeft, leftPointCol);
auto point2 = getPoint(idTableRight, rowRight, rightPointCol);
if (!point1.has_value() || !point2.has_value()) {
return Id::makeUndefined();
}
return Id::makeFromInt(
ad_utility::detail::wktDistImpl(point1.value(), point2.value()) * 1000);
}

// ____________________________________________________________________________
void SpatialJoin::addResultTableEntry(IdTable* result,
const IdTable* idTableLeft,
const IdTable* idTableRight,
size_t rowLeft, size_t rowRight,
Id distance) const {
// this lambda function copies elements from copyFrom
// into the table res. It copies them into the row
// rowIndRes and column column colIndRes. It returns the column number
// until which elements were copied
auto addColumns = [](IdTable* res, const IdTable* copyFrom, size_t rowIndRes,
size_t colIndRes, size_t rowIndCopy) {
size_t col = 0;
while (col < copyFrom->numColumns()) {
res->at(rowIndRes, colIndRes) = (*copyFrom).at(rowIndCopy, col);
colIndRes += 1;
col += 1;
}
return colIndRes;
};

auto resrow = result->numRows();
result->emplace_back();
// add columns to result table
size_t rescol = 0;
rescol = addColumns(result, idTableLeft, resrow, rescol, rowLeft);
rescol = addColumns(result, idTableRight, resrow, rescol, rowRight);

if (addDistToResult_) {
result->at(resrow, rescol) = distance;
// rescol isn't used after that in this function, but future updates,
// which add additional columns, would need to remember to increase
// rescol at this place otherwise. If they forget to do this, the
// distance column will be overwritten, the variableToColumnMap will
// not work and so on
// rescol += 1;
}
}

// ____________________________________________________________________________
SpatialJoin::PreparedJoinParams SpatialJoin::prepareJoin() const {
PreparedSpatialJoinParams SpatialJoin::prepareJoin() const {
auto getIdTable = [](std::shared_ptr<QueryExecutionTree> child) {
std::shared_ptr<const Result> resTable = child->getResult();
auto idTablePtr = &resTable->idTable();
Expand All @@ -365,172 +303,30 @@ SpatialJoin::PreparedJoinParams SpatialJoin::prepareJoin() const {
};

// Input tables
const auto [idTableLeft, resultLeft] = getIdTable(childLeft_);
const auto [idTableRight, resultRight] = getIdTable(childRight_);
auto [idTableLeft, resultLeft] = getIdTable(childLeft_);
auto [idTableRight, resultRight] = getIdTable(childRight_);

// Input table columns for the join
ColumnIndex leftJoinCol = getJoinCol(childLeft_, leftChildVariable_);
ColumnIndex rightJoinCol = getJoinCol(childRight_, rightChildVariable_);

// Size of output table
size_t numColumns = getResultWidth();
return PreparedJoinParams{
idTableLeft, std::move(resultLeft), idTableRight, std::move(resultRight),
leftJoinCol, rightJoinCol, numColumns};
}

// ____________________________________________________________________________
Result SpatialJoin::baselineAlgorithm() {
const auto [idTableLeft, resultLeft, idTableRight, resultRight, leftJoinCol,
rightJoinCol, numColumns] = prepareJoin();
IdTable result{numColumns, _executionContext->getAllocator()};
auto maxDist = getMaxDist();
auto maxResults = getMaxResults();

// cartesian product between the two tables, pairs are restricted according to
// `maxDistance_` and `maxResults_`
for (size_t rowLeft = 0; rowLeft < idTableLeft->size(); rowLeft++) {
// This priority queue stores the intermediate best results if `maxResults_`
// is used. Each intermediate result is stored as a pair of its `rowRight`
// and distance. Since the queue will hold at most `maxResults_ + 1` items,
// it is not a memory concern.
auto compare = [](std::pair<size_t, int64_t> a,
std::pair<size_t, int64_t> b) {
return a.second < b.second;
};
std::priority_queue<std::pair<size_t, int64_t>,
std::vector<std::pair<size_t, int64_t>>,
decltype(compare)>
intermediate(compare);

// Inner loop of cartesian product
for (size_t rowRight = 0; rowRight < idTableRight->size(); rowRight++) {
Id dist = computeDist(idTableLeft, idTableRight, rowLeft, rowRight,
leftJoinCol, rightJoinCol);

// Ensure `maxDist_` constraint
if (dist.getDatatype() != Datatype::Int ||
(maxDist.has_value() &&
dist.getInt() > static_cast<int64_t>(maxDist.value()))) {
continue;
}

// If there is no `maxResults_` we can add the result row immediately
if (!maxResults.has_value()) {
addResultTableEntry(&result, idTableLeft, idTableRight, rowLeft,
rowRight, dist);
continue;
}

// Ensure `maxResults_` constraint using priority queue
intermediate.push(std::pair{rowRight, dist.getInt()});
// Too many results? Drop the worst one
if (intermediate.size() > maxResults.value()) {
intermediate.pop();
}
}

// If we are using the priority queue, we didn't add the results in the
// inner loop, so we do it now.
if (maxResults.has_value()) {
size_t numResults = intermediate.size();
for (size_t item = 0; item < numResults; item++) {
// Get and remove largest item from priority queue
auto [rowRight, dist] = intermediate.top();
intermediate.pop();

addResultTableEntry(&result, idTableLeft, idTableRight, rowLeft,
rowRight, Id::makeFromInt(dist));
}
}
}
return Result(std::move(result), std::vector<ColumnIndex>{},
Result::getMergedLocalVocab(*resultLeft, *resultRight));
}

// ____________________________________________________________________________
Result SpatialJoin::s2geometryAlgorithm() {
const auto [idTableLeft, resultLeft, idTableRight, resultRight, leftJoinCol,
rightJoinCol, numColumns] = prepareJoin();
IdTable result{numColumns, _executionContext->getAllocator()};
auto maxDist = getMaxDist();
auto maxResults = getMaxResults();

// Helper function to convert `GeoPoint` to `S2Point`
auto constexpr toS2Point = [](const GeoPoint& p) {
auto lat = p.getLat();
auto lng = p.getLng();
auto latlng = S2LatLng::FromDegrees(lat, lng);
return S2Point{latlng};
};

S2PointIndex<size_t> s2index;

// Optimization: If we only search by maximum distance, the operation is
// symmetric, so the larger table can be used for the index
bool indexOfRight =
(maxResults.has_value() || (idTableLeft->size() > idTableRight->size()));
auto indexTable = indexOfRight ? idTableRight : idTableLeft;
auto indexJoinCol = indexOfRight ? rightJoinCol : leftJoinCol;

// Populate the index
for (size_t row = 0; row < indexTable->size(); row++) {
auto p = getPoint(indexTable, row, indexJoinCol);
if (p.has_value()) {
s2index.Add(toS2Point(p.value()), row);
}
}

// Performs a nearest neighbor search on the index and returns the closest
// points that satisfy the criteria given by `maxDist_` and `maxResults_`.

// Construct a query object with the given constraints
auto s2query = S2ClosestPointQuery<size_t>{&s2index};

if (maxResults.has_value()) {
s2query.mutable_options()->set_max_results(
static_cast<int>(maxResults.value()));
}
if (maxDist.has_value()) {
s2query.mutable_options()->set_inclusive_max_distance(S2Earth::ToAngle(
util::units::Meters(static_cast<float>(maxDist.value()))));
}

auto searchTable = indexOfRight ? idTableLeft : idTableRight;
auto searchJoinCol = indexOfRight ? leftJoinCol : rightJoinCol;

// Use the index to lookup the points of the other table
for (size_t searchRow = 0; searchRow < searchTable->size(); searchRow++) {
auto p = getPoint(searchTable, searchRow, searchJoinCol);
if (!p.has_value()) {
continue;
}
auto s2target =
S2ClosestPointQuery<size_t>::PointTarget{toS2Point(p.value())};

for (const auto& neighbor : s2query.FindClosestPoints(&s2target)) {
// In this loop we only receive points that already satisfy the given
// criteria
auto indexRow = neighbor.data();
auto dist = static_cast<int64_t>(S2Earth::ToMeters(neighbor.distance()));

auto rowLeft = indexOfRight ? searchRow : indexRow;
auto rowRight = indexOfRight ? indexRow : searchRow;
addResultTableEntry(&result, idTableLeft, idTableRight, rowLeft, rowRight,
Id::makeFromInt(dist));
}
}

return Result(std::move(result), std::vector<ColumnIndex>{},
Result::getMergedLocalVocab(*resultLeft, *resultRight));
return PreparedSpatialJoinParams{idTableLeft, std::move(resultLeft),
idTableRight, std::move(resultRight),
leftJoinCol, rightJoinCol,
numColumns, getMaxDist(),
getMaxResults()};
}

// ____________________________________________________________________________
Result SpatialJoin::computeResult([[maybe_unused]] bool requestLaziness) {
SpatialJoinAlgorithms algorithms{_executionContext, prepareJoin(),
addDistToResult_, config_};
if (useBaselineAlgorithm_) {
return baselineAlgorithm();
return algorithms.BaselineAlgorithm();
} else {
return s2geometryAlgorithm();
return algorithms.S2geometryAlgorithm();
}
}

Expand Down
51 changes: 14 additions & 37 deletions src/engine/SpatialJoin.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,19 @@ struct MaxDistanceConfig {
size_t maxDist_;
};

// helper struct to improve readability in prepareJoin()
struct PreparedSpatialJoinParams {
const IdTable* const idTableLeft_;
std::shared_ptr<const Result> resultLeft_;
const IdTable* const idTableRight_;
std::shared_ptr<const Result> resultRight_;
ColumnIndex leftJoinCol_;
ColumnIndex rightJoinCol_;
size_t numColumns_;
std::optional<size_t> maxDist_;
std::optional<size_t> maxResults_;
};

// This class is implementing a SpatialJoin operation. This operations joins
// two tables, using their positional column. It supports nearest neighbor
// search as well as search of all points within a given range.
Expand Down Expand Up @@ -107,44 +120,8 @@ class SpatialJoin : public Operation {
// helper function, which parses a triple and populates config_
void parseConfigFromTriple();

// helper function which returns a GeoPoint if the element of the given table
// represents a GeoPoint
std::optional<GeoPoint> getPoint(const IdTable* restable, size_t row,
ColumnIndex col) const;

// helper function, which computes the distance of two points, where each
// point comes from a different result table
Id computeDist(const IdTable* resLeft, const IdTable* resRight,
size_t rowLeft, size_t rowRight, ColumnIndex leftPointCol,
ColumnIndex rightPointCol) const;

// Helper function, which adds a row, which belongs to the result to the
// result table. As inputs it uses a row of the left and a row of the right
// child result table.
void addResultTableEntry(IdTable* result, const IdTable* resultLeft,
const IdTable* resultRight, size_t rowLeft,
size_t rowRight, Id distance) const;

// helper struct to improve readability in prepareJoin()
struct PreparedJoinParams {
const IdTable* const idTableLeft_;
std::shared_ptr<const Result> resultLeft_;
const IdTable* const idTableRight_;
std::shared_ptr<const Result> resultRight_;
ColumnIndex leftJoinCol_;
ColumnIndex rightJoinCol_;
size_t numColumns_;
};

// helper function, to initialize various required objects for both algorithms
PreparedJoinParams prepareJoin() const;

// the baseline algorithm, which just checks every combination
Result baselineAlgorithm();

// the improved algorithm, which constructs a S2PointIndex, which is usually
// much faster, however requires that the right relation fits into memory
Result s2geometryAlgorithm();
PreparedSpatialJoinParams prepareJoin() const;

SparqlTriple triple_;
Variable leftChildVariable_;
Expand Down
Loading
Loading