Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/master' into fix-result-calculation
Browse files Browse the repository at this point in the history
  • Loading branch information
Hannah Bast committed Oct 13, 2024
2 parents 57854c7 + 309f6b7 commit f14cf01
Show file tree
Hide file tree
Showing 148 changed files with 9,837 additions and 2,459 deletions.
34 changes: 25 additions & 9 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.16)
cmake_minimum_required(VERSION 3.27)
project(QLever C CXX)

# C/C++ Versions
Expand Down Expand Up @@ -244,6 +244,18 @@ FetchContent_Declare(
GIT_TAG 93ac3a4f9ee7792af399cebd873ee99ce15aed08 # 2024-05-16
)

################################
# S2 Geometry
################################
# The cache entry `BUILD_TESTS` is switched off before S2 is fetched
# (presumably S2's own build reads it to decide whether to build its tests).
set(BUILD_TESTS OFF CACHE BOOL "no tests for s2")
# Fetch S2 pinned to a fixed commit. The keyword must be spelled `GIT_TAG`.
FetchContent_Declare(
        s2
        GIT_REPOSITORY https://github.com/google/s2geometry.git
        GIT_TAG 5b5eccd54a08ae03b4467e79ffbb076d0b5f221e # version 0.11.1
        SYSTEM
)


if (USE_PARALLEL)
include(FindOpenMP)
if (OPENMP_FOUND)
Expand Down Expand Up @@ -301,8 +313,13 @@ FetchContent_Declare(
################################
FetchContent_Declare(
fsst
GIT_REPOSITORY https://github.com/cwida/fsst.git
GIT_TAG c8719ef0aa3740da9685ad2738bb9c8ecc327944
#GIT_REPOSITORY https://github.com/cwida/fsst.git
#GIT_TAG c8719ef0aa3740da9685ad2738bb9c8ecc327944
# FSST currently has a struct named `Encoder` that leads to ODR violations with the `Encoder` class of S2.
# I (joka921) have filed bug reports for both of them and a PR for FSST, but for now we use our own fork
# of FSST.
GIT_REPOSITORY https://github.com/joka921/fsst.git
GIT_TAG 43fb2d1756f5a5d0e85e765c7b51f5e3be8cc83f
)


Expand All @@ -321,8 +338,9 @@ FetchContent_Declare(
################################
# Apply FetchContent
################################
FetchContent_MakeAvailable(googletest ctre abseil re2 stxxl fsst)
FetchContent_MakeAvailable(googletest ctre abseil re2 stxxl fsst s2)
# Disable some warnings in RE2, STXXL, and GTEST
# Disable warnings for the S2 target. Each flag is listed exactly once
# (`-Wno-class-memaccess` was previously given twice).
target_compile_options(s2 PRIVATE
        -Wno-sign-compare
        -Wno-unused-parameter
        -Wno-class-memaccess
        -Wno-comment
        -Wno-redundant-move
        -Wno-unknown-warning-option
        -Wno-maybe-uninitialized
)
target_compile_options(re2 PRIVATE -Wno-unused-parameter)
target_compile_options(stxxl PRIVATE -Wno-deprecated-declarations)
if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
Expand Down Expand Up @@ -409,19 +427,17 @@ add_subdirectory(test)
# Add the library with the constants declared in `CompilationInfo.h` and defined
# in `CompilationInfo.cpp` created by `CompilationInfo.cmake`.
add_library(compilationInfo ${CMAKE_CURRENT_BINARY_DIR}/CompilationInfo.cpp)
qlever_target_link_libraries(compilationInfo)

add_executable(IndexBuilderMain src/index/IndexBuilderMain.cpp)
qlever_target_link_libraries(IndexBuilderMain index ${CMAKE_THREAD_LIBS_INIT} Boost::program_options)
qlever_target_link_libraries(IndexBuilderMain index ${CMAKE_THREAD_LIBS_INIT} Boost::program_options compilationInfo)

add_executable(ServerMain src/ServerMain.cpp)
qlever_target_link_libraries(ServerMain engine ${CMAKE_THREAD_LIBS_INIT} Boost::program_options)
qlever_target_link_libraries(ServerMain engine ${CMAKE_THREAD_LIBS_INIT} Boost::program_options compilationInfo)
target_precompile_headers(ServerMain REUSE_FROM engine)

add_executable(VocabularyMergerMain src/VocabularyMergerMain.cpp)
qlever_target_link_libraries(VocabularyMergerMain index ${CMAKE_THREAD_LIBS_INIT})

add_executable(PermutationExporterMain src/index/PermutationExporterMain.cpp)
qlever_target_link_libraries(PermutationExporterMain index ${CMAKE_THREAD_LIBS_INIT})

add_executable(PrintIndexVersionMain src/PrintIndexVersionMain.cpp)
qlever_target_link_libraries(PrintIndexVersionMain util)
5 changes: 5 additions & 0 deletions CompilationInfo.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,11 @@ namespace qlever::version {
constexpr std::string_view GitHash = ${GIT_HASH};
constexpr std::string_view GitShortHash = GitHash.substr(0, 6);
constexpr std::string_view DatetimeOfCompilation = ${DATETIME_OF_COMPILATION};
void copyVersionInfo() {
*gitShortHashWithoutLinking.wlock() = GitShortHash;
*datetimeOfCompilationWithoutLinking.wlock() = DatetimeOfCompilation;
}
}")

# For some reason `CMAKE_CURRENT_SOURCE_DIR` inside this script is
Expand Down
3 changes: 2 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@ ENV LANG C.UTF-8
ENV LC_ALL C.UTF-8
ENV LC_CTYPE C.UTF-8
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update && apt-get install -y software-properties-common && add-apt-repository -y ppa:mhier/libboost-latest
RUN apt-get update && apt-get install -y software-properties-common wget && add-apt-repository -y ppa:mhier/libboost-latest
RUN wget https://apt.kitware.com/kitware-archive.sh && chmod +x kitware-archive.sh &&./kitware-archive.sh

FROM base as builder
RUN apt-get update && apt-get install -y build-essential cmake libicu-dev tzdata pkg-config uuid-runtime uuid-dev git libjemalloc-dev ninja-build libzstd-dev libssl-dev libboost1.81-dev libboost-program-options1.81-dev libboost-iostreams1.81-dev libboost-url1.81-dev
Expand Down
24 changes: 23 additions & 1 deletion src/CompilationInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,32 @@
// File `CompilationInfo.cpp` which is created and linked by CMake.

#pragma once
#include <atomic>
#include <string_view>

#include "util/Synchronized.h"
namespace qlever::version {
// Short version of the hash of the commit that was used to QLever.

// The following two constants require linking against the `compilationInfo`
// library which is recreated on every compilation.

// Short version of the hash of the commit that was used to compile QLever.
extern const std::string_view GitShortHash;
// The date and time at which QLever was compiled.
extern const std::string_view DatetimeOfCompilation;

// The following two versions of the above constants do NOT require linking
// against the `compilationInfo` library, but only the inclusion of this header.
// They only have meaningful values once the `copyVersionInfo` function (below)
// was called. This is currently done in the `main` functions of
// `IndexBuilderMain.cpp` and `ServerMain.cpp`.
inline ad_utility::Synchronized<std::string_view> gitShortHashWithoutLinking{
std::string_view{"git short hash not set"}};
// Placeholder for the datetime of compilation. It is overwritten with the real
// value by `copyVersionInfo`. The placeholder text names the datetime (not the
// git hash), so that an unset value can be told apart from an unset git hash.
inline ad_utility::Synchronized<std::string_view>
    datetimeOfCompilationWithoutLinking{
        std::string_view{"datetime of compilation not set"}};

// Copy the values from the constants that require linking to the `inline`
// variables that don't require linking. For details see above.
void copyVersionInfo();
} // namespace qlever::version
3 changes: 3 additions & 0 deletions src/ServerMain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ namespace po = boost::program_options;

// Main function.
int main(int argc, char** argv) {
// Copy the git hash and datetime of compilation (which require relinking)
// to make them accessible to other parts of the code
qlever::version::copyVersionInfo();
setlocale(LC_CTYPE, "");

std::locale loc;
Expand Down
5 changes: 3 additions & 2 deletions src/engine/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,6 @@ add_library(engine
Values.cpp Bind.cpp Minus.cpp RuntimeInformation.cpp CheckUsePatternTrick.cpp
VariableToColumnMap.cpp ExportQueryExecutionTrees.cpp
CartesianProductJoin.cpp TextIndexScanForWord.cpp TextIndexScanForEntity.cpp
TextLimit.cpp LocalVocabEntry.cpp)
qlever_target_link_libraries(engine util index parser sparqlExpressions http SortPerformanceEstimator Boost::iostreams)
TextLimit.cpp LazyGroupBy.cpp GroupByHashMapOptimization.cpp SpatialJoin.cpp
CountConnectedSubgraphs.cpp)
qlever_target_link_libraries(engine util index parser sparqlExpressions http SortPerformanceEstimator Boost::iostreams s2)
9 changes: 4 additions & 5 deletions src/engine/CountAvailablePredicates.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -122,12 +122,12 @@ ProtoResult CountAvailablePredicates::computeResult(
if (!indexScan) {
return false;
}
if (!indexScan->getSubject().isVariable() ||
!indexScan->getObject().isVariable()) {
if (!indexScan->subject().isVariable() ||
!indexScan->object().isVariable()) {
return false;
}

return indexScan->getPredicate() == HAS_PATTERN_PREDICATE;
return indexScan->predicate() == HAS_PATTERN_PREDICATE;
}();

if (isPatternTrickForAllEntities) {
Expand Down Expand Up @@ -164,8 +164,7 @@ void CountAvailablePredicates::computePatternTrickAllEntities(
ScanSpecificationAsTripleComponent{
TripleComponent::Iri::fromIriref(HAS_PATTERN_PREDICATE), std::nullopt,
std::nullopt}
.toScanSpecification(index)
.value();
.toScanSpecification(index);
auto fullHasPattern =
index.getPermutation(Permutation::Enum::PSO)
.lazyScan(scanSpec, std::nullopt, {}, cancellationHandle_);
Expand Down
116 changes: 116 additions & 0 deletions src/engine/CountConnectedSubgraphs.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
// Copyright 2024, University of Freiburg,
// Chair of Algorithms and Data Structures.
// Author: Johannes Kalmbach <kalmbach@cs.uni-freiburg.de>

#include "engine/CountConnectedSubgraphs.h"

#include "util/BitUtils.h"

namespace countConnectedSubgraphs {

// _____________________________________________________________________________
size_t countSubgraphs(const Graph& graph, size_t budget) {
  size_t numSubgraphs = 0;
  const size_t numNodes = graph.size();
  // Use every node once as the start node. Subgraphs that contain a node with
  // a smaller index than the start node were already counted in an earlier
  // iteration, so those smaller nodes are passed on as `ignored`.
  for (size_t start = 0; start < numNodes; ++start) {
    // The subgraph that consists only of the start node.
    if (++numSubgraphs > budget) {
      return budget + 1;
    }
    // Example for `start == 3`: the start set is `...0000 1000` (only bit 3 is
    // set) and the smaller nodes are `...0000 0111` (bits 0, 1, and 2 are set).
    const uint64_t startSet = 1ULL << start;
    const uint64_t smallerNodes = ad_utility::bitMaskForLowerBits(start);
    numSubgraphs = countSubgraphsRecursively(graph, startSet, smallerNodes,
                                             numSubgraphs, budget);
  }
  return numSubgraphs;
}

// Compute the union of the neighbor sets of all nodes in `nodes` (looked up in
// `graph`), and then remove all nodes that are `ignored`. The nodes from
// `nodes` themselves are not removed, so they may be part of the result. The
// result is a bitmap with the same encoding as `nodes` and `ignored`.
static uint64_t computeNeighbors(const Graph& graph, uint64_t nodes,
                                 uint64_t ignored) {
  uint64_t adjacent = 0;
  for (size_t idx = 0; idx < 64; ++idx) {
    // Only nodes whose bit is set in `nodes` contribute their neighbors.
    if ((nodes >> idx) & 1ULL) {
      adjacent |= graph[idx].neighbors_;
    }
  }
  return adjacent & ~ignored;
}

// For a number `i` from 0 .. 2^`neighbors.size()` - 1, return the `i`th
// subset of the elements of `neighbors`: the element `neighbors[k]` is part of
// the subset iff bit `k` of `i` is set. All elements in `neighbors` have to be
// from 0..63 so that the final result can be expressed as a bitmap.
static uint64_t subsetIndexToBitmap(size_t i,
                                    const std::vector<uint8_t>& neighbors) {
  // Note: This can probably be done more efficiently using bit fiddling, but it
  // is efficient enough for now.
  uint64_t subset = 0;
  for (size_t k = 0; k < neighbors.size(); ++k) {
    // Test bit `k` by shifting `i` itself. Shifting the `int` literal `1`
    // (`1 << k`) would overflow as soon as `k >= 31`.
    if ((i >> k) & 1) {
      subset |= (1ULL << neighbors[k]);
    }
  }
  return subset;
}

// Convert a bitset to a vector of the indices of the bits that are set, in
// ascending order. For example, `13` (`1101` as bits) will be converted to
// `[0, 2, 3]`.
static std::vector<uint8_t> bitsetToVector(uint64_t bitset) {
  std::vector<uint8_t> result;
  // Consume the bitset from the lowest bit upwards and stop as soon as no set
  // bits remain. `i` is always the original index of the current lowest bit.
  for (uint8_t i = 0; bitset != 0; ++i, bitset >>= 1) {
    if (bitset & 1ULL) {
      result.push_back(i);
    }
  }
  return result;
}

// _____________________________________________________________________________
std::string toBitsetString(uint64_t x) {
  // All 64 bits as characters, most significant bit first.
  const std::string allBits = std::bitset<64>{x}.to_string();
  // Everything before the first `1` is a leading zero and is dropped. If there
  // is no `1` at all, then `x` is zero.
  const auto firstOne = allBits.find('1');
  if (firstOne == std::string::npos) {
    return "0";
  }
  return allBits.substr(firstOne);
}

// _____________________________________________________________________________
size_t countSubgraphsRecursively(const Graph& graph, uint64_t nodes,
                                 uint64_t ignored, size_t count,
                                 size_t budget) {
  // The candidates for extending `nodes`: all direct neighbors of `nodes` that
  // are not ignored.
  const uint64_t frontier = computeNeighbors(graph, nodes, ignored);
  const std::vector<uint8_t> frontierIndices = bitsetToVector(frontier);

  // This recursion level handles all the subsets of the neighbors, and the
  // levels above deal with `nodes`, so both have to be excluded further down.
  const uint64_t ignoredBelow = nodes | frontier | ignored;

  // Enumerate all non-empty subsets of the neighbors via their index.
  const size_t numSubsets = 1ULL << frontierIndices.size();
  size_t subsetIndex = 1;
  while (subsetIndex < numSubsets) {
    if (++count > budget) {
      return budget + 1;
    }
    const uint64_t extension =
        subsetIndexToBitmap(subsetIndex, frontierIndices);
    count = countSubgraphsRecursively(graph, extension | nodes, ignoredBelow,
                                      count, budget);
    ++subsetIndex;
  }
  return count;
}
} // namespace countConnectedSubgraphs
41 changes: 41 additions & 0 deletions src/engine/CountConnectedSubgraphs.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
// Copyright 2024, University of Freiburg,
// Chair of Algorithms and Data Structures.
// Author: Johannes Kalmbach <kalmbach@cs.uni-freiburg.de>

#pragma once

#include <cstdint>

// This module implements the efficient counting of the number of connected
// subgraphs in a given graph. This routine can be used to analyze the
// complexity of query graphs and to choose an appropriate query planner (see
// `QueryPlanner.cpp`). The algorithm is taken from
// Neumann and Radke, Adaptive Optimization of Very Large Join Queries, see
// https://dl.acm.org/doi/pdf/10.1145/3183713.3183733
namespace countConnectedSubgraphs {

// A single node of an undirected graph with at most 64 nodes. The neighbors of
// the node are stored as a 64-bit bitmap, where the i-th bit is 1 iff the node
// with index `i` is a neighbor of this node.
struct Node {
uint64_t neighbors_{};
};
// A graph is a vector of nodes. The node with index `i` is `graph[i]`.
using Graph = std::vector<Node>;

// Compute the number of connected subgraphs in the `graph`. If the number of
// such subgraphs is `> budget`, return `budget + 1`.
size_t countSubgraphs(const Graph& graph, size_t budget);

// Recursive implementation of `countSubgraphs`. Compute the number of connected
// subgraphs in `graph` that contain all the nodes in `nodes`, but none of the
// nodes in `ignored`. Assume that `count` subgraphs have been previously found
// and therefore count towards the `budget`. The result includes `count`; as
// soon as it would exceed `budget`, `budget + 1` is returned. The `nodes` and
// `ignored` are bitsets in which the i-th bit stands for the node with index
// `i` (the same encoding as `Node::neighbors_`).
size_t countSubgraphsRecursively(const Graph& graph, uint64_t nodes,
uint64_t ignored, size_t count, size_t budget);

// Convert `x` to a string of bits, with the leading zeros removed, e.g.,
// `3` will become "11" and `0` will become "0". This is useful for debugging
// the functions above.
std::string toBitsetString(uint64_t x);

} // namespace countConnectedSubgraphs
8 changes: 7 additions & 1 deletion src/engine/Engine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ void Engine::sort(IdTable& idTable, const std::vector<ColumnIndex>& sortCols) {
}

// ___________________________________________________________________________
size_t Engine::countDistinct(const IdTable& input,
size_t Engine::countDistinct(IdTableView<0> input,
const std::function<void()>& checkCancellation) {
AD_EXPENSIVE_CHECK(
std::ranges::is_sorted(input, std::ranges::lexicographical_compare),
Expand All @@ -79,3 +79,9 @@ size_t Engine::countDistinct(const IdTable& input,
auto numDuplicates = std::accumulate(counter.begin(), counter.end(), 0ULL);
return input.numRows() - numDuplicates;
}

// ___________________________________________________________________________
// Overload for a complete `IdTable`: obtain an `IdTableView<0>` of `input` via
// `asStaticView<0>()` and delegate to the overload that works on views.
size_t Engine::countDistinct(const IdTable& input,
const std::function<void()>& checkCancellation) {
return countDistinct(input.asStaticView<0>(), checkCancellation);
}
2 changes: 2 additions & 0 deletions src/engine/Engine.h
Original file line number Diff line number Diff line change
Expand Up @@ -177,4 +177,6 @@ class Engine {
// be used to implement a cancellation mechanism that throws on cancellation.
static size_t countDistinct(const IdTable& input,
const std::function<void()>& checkCancellation);
// Overload of `countDistinct` that takes an `IdTableView<0>` instead of a
// `const IdTable&`. The `checkCancellation` argument has the same type as in
// the `IdTable` overload.
static size_t countDistinct(IdTableView<0> input,
const std::function<void()>& checkCancellation);
};
6 changes: 5 additions & 1 deletion src/engine/ExportQueryExecutionTrees.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,8 @@ ExportQueryExecutionTrees::idToStringAndTypeForEncodedValue(Id id) {
return std::pair{std::to_string(id.getInt()), XSD_INT_TYPE};
case Date:
return id.getDate().toStringAndType();
case GeoPoint:
return id.getGeoPoint().toStringAndType();
case BlankNodeIndex:
return std::pair{absl::StrCat("_:bn", id.getBlankNodeIndex().get()),
nullptr};
Expand Down Expand Up @@ -624,13 +626,15 @@ ad_utility::streams::stream_generator ExportQueryExecutionTrees::
return binding.dump();
};

bool isFirstRow = true;
for (const auto& [idTable, range] : getRowIndices(limitAndOffset, *result)) {
for (uint64_t i : range) {
if (i != 0) [[likely]] {
if (!isFirstRow) [[likely]] {
co_yield ",";
}
co_yield getBinding(idTable, i);
cancellationHandle->throwIfCancelled();
isFirstRow = false;
}
}

Expand Down
Loading

0 comments on commit f14cf01

Please sign in to comment.