Skip to content

Commit

Permalink
Multi-file import (#1537)
Browse files Browse the repository at this point in the history
So far, QLever read its input from a single file or from standard input. This made it hard to associate graph information per file. It also caused problems when parallel parsing was activated and a Turtle file did not have all its prefix declarations at the beginning. With this change, QLever can read its input from multiple input streams (files or pipes), and the streams are parsed concurrently. It can be specified separately for each stream which default graph to use for that stream and whether to use the parallel parser or not. Specifying a value for `"parallel-parsing"` in the `.settings.json` file is now deprecated.

There will be a corresponding change in https://github.com/ad-freiburg/qlever-control next that enables the convenient control of this new functionality from a `Qleverfile`.
  • Loading branch information
joka921 authored Oct 16, 2024
1 parent a9a9ae4 commit 4acbca3
Show file tree
Hide file tree
Showing 12 changed files with 554 additions and 110 deletions.
10 changes: 5 additions & 5 deletions src/index/Index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,6 @@ Index::Index(Index&&) noexcept = default;
// https://stackoverflow.com/questions/13414652/forward-declaration-with-unique-ptr
Index::~Index() = default;

// ____________________________________________________________________________
// Forwards the call with unchanged arguments to the implementation object
// behind `pimpl_`.
void Index::createFromFile(const std::string& filename, Filetype type) {
pimpl_->createFromFile(filename, type);
}

// ____________________________________________________________________________
void Index::createFromOnDiskIndex(const std::string& onDiskBase) {
pimpl_->createFromOnDiskIndex(onDiskBase);
Expand Down Expand Up @@ -283,3 +278,8 @@ size_t Index::getResultSizeOfScan(const ScanSpecification& scanSpecification,
const Permutation::Enum& permutation) const {
return pimpl_->getResultSizeOfScan(scanSpecification, permutation);
}

// ____________________________________________________________________________
// Thin forwarding wrapper: hand the input file specifications `files` to the
// implementation object behind `pimpl_`.
void Index::createFromFiles(const std::vector<InputFileSpecification>& files) {
  pimpl_->createFromFiles(files);
}
8 changes: 5 additions & 3 deletions src/index/Index.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

#include "global/Id.h"
#include "index/CompressedString.h"
#include "index/InputFileSpecification.h"
#include "index/Permutation.h"
#include "index/StringSortComparator.h"
#include "index/Vocabulary.h"
Expand Down Expand Up @@ -63,6 +64,9 @@ class Index {
vector<Score> scores_;
};

using Filetype = qlever::Filetype;
using InputFileSpecification = qlever::InputFileSpecification;

/// Forbid copy and assignment.
Index& operator=(const Index&) = delete;
Index(const Index&) = delete;
Expand All @@ -79,9 +83,7 @@ class Index {
// Create an index from one or more input files. Will write vocabulary and on-disk index data.
// NOTE: The index can not directly be used after this call, but has to be
// setup by `createFromOnDiskIndex` after this call.
enum class Filetype { Turtle, NQuad };
void createFromFile(const std::string& filename,
Filetype filetype = Filetype::Turtle);
void createFromFiles(const std::vector<InputFileSpecification>& files);

// Create an index object from an on-disk index that has previously been
// constructed using the `createFromFiles` method which is typically called via
Expand Down
193 changes: 139 additions & 54 deletions src/index/IndexBuilderMain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,94 @@ void writeStxxlConfigFile(const string& location, const string& tail) {
<< STXXL_DISK_SIZE_INDEX_BUILDER << ",syscall\n";
}

// Validate the number of values that were given for a command-line parameter
// that can be specified per input file (used for the file types and default
// graphs). Accepted are:
// - exactly one value (it is then used for all input files),
// - exactly `numFiles` values (one per input file),
// - no value at all, but only if `allowEmpty` is true.
// In all other cases, a `std::runtime_error` is thrown whose message names the
// parameter via `parameterName`.
static void checkNumParameterValues(const auto& values, size_t numFiles,
                                    bool allowEmpty,
                                    std::string_view parameterName) {
  if (allowEmpty && values.empty()) {
    return;
  }
  if (values.size() == 1 || values.size() == numFiles) {
    return;
  }
  auto error = absl::StrCat(
      "The parameter \"", parameterName,
      "\" must be specified either exactly once (in which case it is "
      "used for all input files) or exactly as many times as there are "
      "input files, in which case each input file has its own value.");
  if (allowEmpty) {
    // Adjacent string literals are concatenated verbatim, so only one of them
    // may carry the separating space.
    absl::StrAppend(&error,
                    " The parameter can also be omitted entirely, in which "
                    "case a default value is used for all input files.");
  }
  throw std::runtime_error{error};
}

// Determine the `qlever::Filetype` of an input file. If `filetype` is given,
// it must be "ttl" or "nt" (both are mapped to `Turtle`) or "nq" (mapped to
// `NQuad`); any other value leads to a `std::runtime_error`. If no `filetype`
// is given, the type is deduced from the part of `filename` after the last
// '.', and a `std::runtime_error` is thrown if that is not possible.
qlever::Filetype getFiletype(std::optional<std::string_view> filetype,
                             std::string_view filename) {
  // Map a format name (without a leading dot) to the corresponding file type,
  // or to `std::nullopt` if the name is not known.
  auto parseFormat =
      [](std::string_view format) -> std::optional<qlever::Filetype> {
    if (format == "nq") {
      return qlever::Filetype::NQuad;
    }
    if (format == "ttl" || format == "nt") {
      return qlever::Filetype::Turtle;
    }
    return std::nullopt;
  };

  // An explicitly specified format takes precedence over the filename.
  if (filetype.has_value()) {
    if (auto explicitType = parseFormat(*filetype)) {
      return *explicitType;
    }
    throw std::runtime_error{
        absl::StrCat("The value of --file-format or -F must be one of "
                     "`ttl`, `nt`, or `nq`, but is `",
                     *filetype, "`")};
  }

  // No explicit format, so fall back to the filename suffix.
  auto throwNotDeducable = [&filename]() {
    throw std::runtime_error{absl::StrCat(
        "Could not deduce the file format from the filename \"", filename,
        "\". Either use files with names that end on `.ttl`, `.nt`, or `.nq`, "
        "or explicitly set the format of the file via --file-format or -F")};
  };
  const auto posOfDot = filename.rfind('.');
  if (posOfDot == std::string_view::npos) {
    throwNotDeducable();
  }
  if (auto deducedType = parseFormat(filename.substr(posOfDot + 1))) {
    return *deducedType;
  }
  throwNotDeducable();
  // The call above always throws, but Clang and GCC currently can't deduce
  // that, and there is currently no way to mark the `throwNotDeducable` lambda
  // as `[[noreturn]]`, hence the explicit failure here.
  AD_FAIL();
}

// Look up the value of a per-file parameter for the input file with index
// `idx`. An empty `values` yields `defaultValue`, a `values` with exactly one
// element yields that element for every `idx`, and otherwise the element at
// position `idx` is returned (bounds-checked via `at`).
template <typename T>
T getParameterValue(size_t idx, const auto& values, const T& defaultValue) {
  switch (values.size()) {
    case 0:
      return defaultValue;
    case 1:
      return values.at(0);
    default:
      return values.at(idx);
  }
}

// Main function.
int main(int argc, char** argv) {
// Copy the git hash and datetime of compilation (which require relinking)
Expand All @@ -67,8 +155,10 @@ int main(int argc, char** argv) {
string textIndexName;
string kbIndexName;
string settingsFile;
string filetype;
string inputFile;
std::vector<string> filetype;
std::vector<string> inputFile;
std::vector<string> defaultGraphs;
std::vector<bool> parseParallel;
bool noPatterns = false;
bool onlyAddTextIndex = false;
bool keepTemporaryFiles = false;
Expand All @@ -92,8 +182,19 @@ int main(int argc, char** argv) {
"will read from stdin.");
add("file-format,F", po::value(&filetype),
"The format of the input file with the knowledge graph data. Must be one "
"of [nt|ttl|nq]. If not set, QLever will try to deduce it from the "
"filename suffix.");
"of [nt|ttl|nq]. Can be specified once (then all files use that format), "
"or once per file, or not at all (in that case, the format is deduced "
"from the filename suffix if possible).");
add("default-graph,g", po::value(&defaultGraphs),
"The graph IRI without angle brackets. Write `-` for the default graph. "
"Can be omitted (then all files use the default graph), specified once "
"(then all files use that graph), or once per file.");
add("parse-parallel,p", po::value(&parseParallel),
"Enable or disable the parallel parser for all files (if specified once) "
"or once per input file. Parallel parsing works for all input files "
"using the N-Triples or N-Quads format, as well as for well-behaved "
"Turtle files, where all the prefix declarations come in one block at "
"the beginning and there are no multiline literals");
add("kg-index-name,K", po::value(&kbIndexName),
"The name of the knowledge graph index (default: basename of "
"`kg-input-file`).");
Expand Down Expand Up @@ -157,8 +258,8 @@ int main(int argc, char** argv) {

// If no index name was specified, use a fixed placeholder name.
if (kbIndexName.empty() && !inputFile.empty()) {
kbIndexName = ad_utility::getLastPartOfString(inputFile, '/');
if (kbIndexName.empty()) {
kbIndexName = "no index name specified";
}

LOG(INFO) << EMPH_ON << "QLever IndexBuilder, compiled on "
Expand All @@ -181,59 +282,43 @@ int main(int argc, char** argv) {
index.setKeepTempFiles(keepTemporaryFiles);
index.setSettingsFile(settingsFile);
index.loadAllPermutations() = !onlyPsoAndPos;
// NOTE: If `onlyAddTextIndex` is true, we do not want to construct an
// index, but we assume that it already exists. In particular, we then need
// the vocabulary from the KB index for building the text index.
if (!onlyAddTextIndex) {
if (inputFile.empty() || inputFile == "-") {
inputFile = "/dev/stdin";
}

if (!filetype.empty()) {
LOG(INFO) << "You specified the input format: "
<< ad_utility::getUppercase(filetype) << std::endl;
} else {
bool filetypeDeduced = false;
if (inputFile.ends_with(".nt")) {
filetype = "nt";
filetypeDeduced = true;
} else if (inputFile.ends_with(".ttl")) {
filetype = "ttl";
filetypeDeduced = true;
} else if (inputFile.ends_with(".nq")) {
filetype = "nq";
filetypeDeduced = true;
} else {
LOG(INFO) << "Unknown or missing extension of input file, assuming: "
"TTL"
<< std::endl;
// Convert the parameters for the filenames, file types, and default graphs
// into a `vector<InputFileSpecification>`.
auto getFileSpecifications = [&]() {
// Each per-file parameter may be omitted, given once (then it applies to all
// input files), or given once per input file. Every count therefore has to be
// validated against the number of input files; comparing a vector's size
// with itself would make the check vacuous.
checkNumParameterValues(filetype, inputFile.size(), true,
                        "--file-format, -F");
checkNumParameterValues(defaultGraphs, inputFile.size(), true,
                        "--default-graph, -g");
checkNumParameterValues(parseParallel, inputFile.size(), true,
                        "--parse-parallel, -p");

std::vector<qlever::InputFileSpecification> fileSpecs;
for (size_t i = 0; i < inputFile.size(); ++i) {
auto type = getParameterValue<std::optional<std::string_view>>(
i, filetype, std::nullopt);

auto defaultGraph = getParameterValue<std::optional<string>>(
i, defaultGraphs, std::nullopt);
if (defaultGraph == "-") {
defaultGraph = std::nullopt;
}
if (filetypeDeduced) {
LOG(INFO) << "Format of input file deduced from extension: "
<< ad_utility::getUppercase(filetype) << std::endl;

bool parseInParallel = getParameterValue(i, parseParallel, false);
auto& filename = inputFile.at(i);
if (filename == "-") {
filename = "/dev/stdin";
}
LOG(INFO) << "If this is not correct, start again using the option "
"--file-format (-F)"
<< std::endl;
fileSpecs.emplace_back(filename, getFiletype(type, filename),
std::move(defaultGraph), parseInParallel);
}
return fileSpecs;
};

if (filetype == "ttl") {
LOG(DEBUG) << "Parsing uncompressed TTL from: " << inputFile
<< std::endl;
index.createFromFile(inputFile, Index::Filetype::Turtle);
} else if (filetype == "nt") {
LOG(DEBUG) << "Parsing uncompressed N-Triples from: " << inputFile
<< " (using the Turtle parser)" << std::endl;
index.createFromFile(inputFile, Index::Filetype::Turtle);
} else if (filetype == "nq") {
LOG(DEBUG) << "Parsing uncompressed N-Quads from: " << inputFile
<< std::endl;
index.createFromFile(inputFile, Index::Filetype::NQuad);
} else {
LOG(ERROR) << "File format must be one of: nt ttl nq" << std::endl;
std::cerr << boostOptions << std::endl;
exit(1);
}
if (!onlyAddTextIndex) {
auto fileSpecifications = getFileSpecifications();
AD_CONTRACT_CHECK(!fileSpecifications.empty());
index.createFromFiles(fileSpecifications);
}

if (!wordsfile.empty() || addWordsFromLiterals) {
Expand Down
55 changes: 36 additions & 19 deletions src/index/IndexImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,28 +68,18 @@ IndexBuilderDataAsFirstPermutationSorter IndexImpl::createIdTriplesAndVocab(

// _____________________________________________________________________________
std::unique_ptr<RdfParserBase> IndexImpl::makeRdfParser(
const std::string& filename, Index::Filetype type) const {
const std::vector<Index::InputFileSpecification>& files) const {
auto makeRdfParserImpl =
[&filename]<int useParallel, int isTurtleInput, int useCtre>()
-> std::unique_ptr<RdfParserBase> {
[&files]<int useCtre>() -> std::unique_ptr<RdfParserBase> {
using TokenizerT =
std::conditional_t<useCtre == 1, TokenizerCtre, Tokenizer>;
using InnerParser =
std::conditional_t<isTurtleInput == 1, TurtleParser<TokenizerT>,
NQuadParser<TokenizerT>>;
using Parser =
std::conditional_t<useParallel == 1, RdfParallelParser<InnerParser>,
RdfStreamParser<InnerParser>>;
return std::make_unique<Parser>(filename);
return std::make_unique<RdfMultifileParser<TokenizerT>>(files);
};

// `callFixedSize` lifts runtime integers to compile time integers. We use it
// here to create the correct combinations of template arguments.
return ad_utility::callFixedSize(
std::array{useParallelParser_ ? 1 : 0,
type == Index::Filetype::Turtle ? 1 : 0,
onlyAsciiTurtlePrefixes_ ? 1 : 0},
makeRdfParserImpl);
return ad_utility::callFixedSize(std::array{onlyAsciiTurtlePrefixes_ ? 1 : 0},
makeRdfParserImpl);
}

// Several helper functions for joining the OSP permutation with the patterns.
Expand Down Expand Up @@ -297,18 +287,45 @@ std::pair<size_t, size_t> IndexImpl::createInternalPSOandPOS(
}

// _____________________________________________________________________________
void IndexImpl::createFromFile(const string& filename, Index::Filetype type) {
// Log which input is about to be processed and apply the deprecated
// parallel-parsing setting from the `.settings.json` file to `spec`.
// - `spec`: the input file specifications; may be modified (see below).
// - `parallelParsingSpecifiedViaJson`: whether parallel parsing was requested
//   via the `.settings.json` file.
// If the flag is set and there is exactly one input stream, a deprecation
// warning is logged and `parseInParallel_` is set to `true` for that stream.
// If the flag is set and there is not exactly one input stream, a
// `std::runtime_error` is thrown.
void IndexImpl::updateInputFileSpecificationsAndLog(
    std::vector<Index::InputFileSpecification>& spec,
    bool parallelParsingSpecifiedViaJson) {
  if (spec.size() == 1) {
    LOG(INFO) << "Processing triples from " << spec.at(0).filename_ << " ..."
              << std::endl;
  } else {
    LOG(INFO) << "Processing triples from " << spec.size()
              << " input streams ..." << std::endl;
  }
  if (!parallelParsingSpecifiedViaJson) {
    return;
  }
  if (spec.size() != 1) {
    // Adjacent string literals are concatenated verbatim, so only one of them
    // may carry the separating space.
    throw std::runtime_error{
        "For more than one input file, the parallel parsing must not be "
        "specified via the `.settings.json` file, but has to be specified "
        "via the command-line option --parse-parallel or -p"};
  }
  LOG(WARN) << "Parallel parsing set to `true` in the `.settings.json` "
               "file; this is deprecated, please use the command-line "
               "option --parse-parallel or -p instead"
            << std::endl;
  spec.at(0).parseInParallel_ = true;
}

// _____________________________________________________________________________
void IndexImpl::createFromFiles(
std::vector<Index::InputFileSpecification> files) {
if (!loadAllPermutations_ && usePatterns_) {
throw std::runtime_error{
"The patterns can only be built when all 6 permutations are created"};
}
LOG(INFO) << "Processing input triples from " << filename << " ..."
<< std::endl;

readIndexBuilderSettingsFromFile();

updateInputFileSpecificationsAndLog(files, useParallelParser_);
IndexBuilderDataAsFirstPermutationSorter indexBuilderData =
createIdTriplesAndVocab(makeRdfParser(filename, type));
createIdTriplesAndVocab(makeRdfParser(files));

// Write the configuration already at this point, so we have it available in
// case any of the permutations fail.
Expand Down
Loading

0 comments on commit 4acbca3

Please sign in to comment.