-
Notifications
You must be signed in to change notification settings - Fork 12.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[lld][InstrProf] Profile guided function order #96268
Conversation
@llvm/pr-subscribers-pgo @llvm/pr-subscribers-lld-macho Author: Ellis Hoag (ellishg) ChangesAdd the lld flags We use Balanced Partitioning to determine the best section order using traces from IRPGO profiles (see https://discourse.llvm.org/t/rfc-temporal-profiling-extension-for-irpgo/68068 for details) to improve startup time and using hashes of section contents to improve compressed size. In our recent LLVM talk, we showed that this can reduce page faults during startup by 40% on a large iOS app and we can reduce compressed size by 0.8-3%. More details can be found in https://dl.acm.org/doi/10.1145/3660635 Patch is 27.46 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/96268.diff 9 Files Affected:
diff --git a/lld/MachO/BPSectionOrderer.cpp b/lld/MachO/BPSectionOrderer.cpp
new file mode 100644
index 0000000000000..c2259aefecdf0
--- /dev/null
+++ b/lld/MachO/BPSectionOrderer.cpp
@@ -0,0 +1,420 @@
+//===- BPSectionOrderer.cpp--------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "BPSectionOrderer.h"
+#include "InputSection.h"
+#include "lld/Common/ErrorHandler.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ProfileData/InstrProfReader.h"
+#include "llvm/Support/BalancedPartitioning.h"
+#include "llvm/Support/TimeProfiler.h"
+#include "llvm/Support/VirtualFileSystem.h"
+#include "llvm/Support/xxhash.h"
+
+#define DEBUG_TYPE "bp-section-orderer"
+using namespace llvm;
+using namespace lld::macho;
+
+// TODO: Move to StringRef.h
+static bool isNumber(StringRef S) {
+ return !S.empty() && S.find_first_not_of("0123456789") == StringRef::npos;
+}
+
+/// Symbols can be appended with "(.__uniq.xxxx)?.llvm.yyyy" where "xxxx" and
+/// "yyyy" are numbers that could change between builds. We need to use the root
+/// symbol name before this suffix so these symbols can be matched with profiles
+/// which may have different suffixes.
+static StringRef getRootSymbol(StringRef Name) {
+ auto [P0, S0] = Name.rsplit(".llvm.");
+ if (isNumber(S0))
+ Name = P0;
+ auto [P1, S1] = Name.rsplit(".__uniq.");
+ if (isNumber(S1))
+ return P1;
+ return Name;
+}
+
+static uint64_t getRelocHash(StringRef kind, uint64_t sectionIdx,
+ uint64_t offset, uint64_t addend) {
+ return xxHash64((kind + ": " + Twine::utohexstr(sectionIdx) + " + " +
+ Twine::utohexstr(offset) + " + " + Twine::utohexstr(addend))
+ .str());
+}
+
+static uint64_t
+getRelocHash(const Reloc &reloc,
+             const DenseMap<const InputSection *, uint64_t> &sectionToIdx) {
+ auto *isec = reloc.getReferentInputSection();
+ std::optional<uint64_t> sectionIdx;
+ auto sectionIdxIt = sectionToIdx.find(isec);
+ if (sectionIdxIt != sectionToIdx.end())
+ sectionIdx = sectionIdxIt->getSecond();
+ std::string kind;
+ if (isec)
+ kind = ("Section " + Twine(isec->kind())).str();
+ if (auto *sym = reloc.referent.dyn_cast<Symbol *>()) {
+ kind += (" Symbol " + Twine(sym->kind())).str();
+ if (auto *d = dyn_cast<Defined>(sym)) {
+ if (isa_and_nonnull<CStringInputSection>(isec))
+ return getRelocHash(kind, 0, isec->getOffset(d->value), reloc.addend);
+ return getRelocHash(kind, sectionIdx.value_or(0), d->value, reloc.addend);
+ }
+ }
+ return getRelocHash(kind, sectionIdx.value_or(0), 0, reloc.addend);
+}
+
+static void constructNodesForCompression(
+    const SmallVector<const InputSection *> &sections,
+    const DenseMap<const InputSection *, uint64_t> &sectionToIdx,
+    const SmallVector<unsigned> &sectionIdxs,
+ std::vector<BPFunctionNode> &nodes,
+ DenseMap<unsigned, SmallVector<unsigned>> &duplicateSectionIdxs,
+ BPFunctionNode::UtilityNodeT &maxUN) {
+
+ SmallVector<std::pair<unsigned, SmallVector<uint64_t>>> sectionHashes;
+ sectionHashes.reserve(sectionIdxs.size());
+ SmallVector<uint64_t> hashes;
+ for (unsigned sectionIdx : sectionIdxs) {
+ const auto *isec = sections[sectionIdx];
+ constexpr unsigned windowSize = 4;
+
+ for (size_t i = 0; i < isec->data.size(); i++) {
+ auto window = isec->data.drop_front(i).take_front(windowSize);
+ hashes.push_back(xxHash64(window));
+ }
+ for (const auto &r : isec->relocs) {
+ if (r.length == 0 || r.referent.isNull() || r.offset >= isec->data.size())
+ continue;
+ uint64_t relocHash = getRelocHash(r, sectionToIdx);
+ uint32_t start = (r.offset < windowSize) ? 0 : r.offset - windowSize + 1;
+ for (uint32_t i = start; i < r.offset + r.length; i++) {
+ auto window = isec->data.drop_front(i).take_front(windowSize);
+ hashes.push_back(xxHash64(window) + relocHash);
+ }
+ }
+
+ llvm::sort(hashes);
+ hashes.erase(std::unique(hashes.begin(), hashes.end()), hashes.end());
+
+ sectionHashes.emplace_back(sectionIdx, hashes);
+ hashes.clear();
+ }
+
+ DenseMap<uint64_t, unsigned> hashFrequency;
+ for (auto &[sectionIdx, hashes] : sectionHashes)
+ for (auto hash : hashes)
+ ++hashFrequency[hash];
+
+ // Merge section that are nearly identical
+ SmallVector<std::pair<unsigned, SmallVector<uint64_t>>> newSectionHashes;
+ DenseMap<uint64_t, unsigned> wholeHashToSectionIdx;
+ for (auto &[sectionIdx, hashes] : sectionHashes) {
+ uint64_t wholeHash = 0;
+ for (auto hash : hashes)
+ if (hashFrequency[hash] > 5)
+ wholeHash ^= hash;
+ auto [it, wasInserted] =
+ wholeHashToSectionIdx.insert(std::make_pair(wholeHash, sectionIdx));
+ if (wasInserted) {
+ newSectionHashes.emplace_back(sectionIdx, hashes);
+ } else {
+ duplicateSectionIdxs[it->getSecond()].push_back(sectionIdx);
+ }
+ }
+ sectionHashes = newSectionHashes;
+
+ // Recompute hash frequencies
+ hashFrequency.clear();
+ for (auto &[sectionIdx, hashes] : sectionHashes)
+ for (auto hash : hashes)
+ ++hashFrequency[hash];
+
+ // Filter rare and common hashes and assign each a unique utility node that
+ // doesn't conflict with the trace utility nodes
+ DenseMap<uint64_t, BPFunctionNode::UtilityNodeT> hashToUN;
+ for (auto &[hash, frequency] : hashFrequency) {
+ if (frequency <= 1 || frequency * 2 > wholeHashToSectionIdx.size())
+ continue;
+ hashToUN[hash] = ++maxUN;
+ }
+
+ std::vector<BPFunctionNode::UtilityNodeT> uns;
+ for (auto &[sectionIdx, hashes] : sectionHashes) {
+ for (auto &hash : hashes) {
+ auto it = hashToUN.find(hash);
+ if (it != hashToUN.end())
+ uns.push_back(it->second);
+ }
+ nodes.emplace_back(sectionIdx, uns);
+ uns.clear();
+ }
+}
+
+DenseMap<const InputSection *, size_t> lld::macho::runBalancedPartitioning(
+ size_t &highestAvailablePriority, StringRef profilePath,
+ bool forFunctionCompression, bool forDataCompression) {
+
+ SmallVector<const InputSection *> sections;
+ DenseMap<const InputSection *, uint64_t> sectionToIdx;
+ StringMap<DenseSet<unsigned>> symbolToSectionIdxs;
+ for (const auto *file : inputFiles) {
+ for (auto *sec : file->sections) {
+ for (auto &subsec : sec->subsections) {
+ auto *isec = subsec.isec;
+ if (!isec || isec->data.empty() || !isec->data.data())
+ continue;
+ unsigned sectionIdx = sections.size();
+ sectionToIdx.try_emplace(isec, sectionIdx);
+ sections.push_back(isec);
+ for (Symbol *sym : isec->symbols)
+ if (auto *d = dyn_cast_or_null<Defined>(sym))
+ symbolToSectionIdxs[d->getName()].insert(sectionIdx);
+ }
+ }
+ }
+
+ StringMap<DenseSet<unsigned>> rootSymbolToSectionIdxs;
+ for (auto &[name, sectionIdxs] : symbolToSectionIdxs) {
+ name = getRootSymbol(name);
+ rootSymbolToSectionIdxs[name].insert(sectionIdxs.begin(),
+ sectionIdxs.end());
+ // Linkage names can be prefixed with "_" or "l_" on Mach-O. See
+ // Mangler::getNameWithPrefix() for details.
+ if (name.consume_front("_") || name.consume_front("l_"))
+ rootSymbolToSectionIdxs[name].insert(sectionIdxs.begin(),
+ sectionIdxs.end());
+ }
+
+ std::vector<BPFunctionNode> nodesForStartup;
+ BPFunctionNode::UtilityNodeT maxUN = 0;
+ DenseMap<unsigned, SmallVector<BPFunctionNode::UtilityNodeT>>
+ startupSectionIdxUNs;
+ std::unique_ptr<InstrProfReader> reader;
+ if (!profilePath.empty()) {
+ auto fs = vfs::getRealFileSystem();
+ auto readerOrErr = InstrProfReader::create(profilePath, *fs);
+ lld::checkError(readerOrErr.takeError());
+
+ reader = std::move(readerOrErr.get());
+ for (auto &entry : *reader) {
+ // Read all entries
+ (void)entry;
+ }
+ auto &traces = reader->getTemporalProfTraces();
+
+ // Used to define the initial order for startup functions.
+ DenseMap<unsigned, size_t> sectionIdxToTimestamp;
+ DenseMap<unsigned, BPFunctionNode::UtilityNodeT> sectionIdxToFirstUN;
+ for (size_t traceIdx = 0; traceIdx < traces.size(); traceIdx++) {
+ uint64_t currentSize = 0, cutoffSize = 1;
+ size_t cutoffTimestamp = 1;
+ auto &trace = traces[traceIdx].FunctionNameRefs;
+ for (size_t timestamp = 0; timestamp < trace.size(); timestamp++) {
+ auto [Filename, ParsedFuncName] = getParsedIRPGOName(
+ reader->getSymtab().getFuncOrVarName(trace[timestamp]));
+ ParsedFuncName = getRootSymbol(ParsedFuncName);
+
+ auto sectionIdxsIt = rootSymbolToSectionIdxs.find(ParsedFuncName);
+ if (sectionIdxsIt == rootSymbolToSectionIdxs.end())
+ continue;
+        auto &sectionIdxs = sectionIdxsIt->getValue();
+ // If the same symbol is found in multiple sections, they might be
+ // identical, so we arbitrarily use the size from the first section.
+ currentSize += sections[*sectionIdxs.begin()]->getSize();
+
+ // Since BalancedPartitioning is sensitive to the initial order, we need
+ // to explicitly define it to be ordered by earliest timestamp.
+ for (unsigned sectionIdx : sectionIdxs) {
+ auto [it, wasInserted] =
+ sectionIdxToTimestamp.try_emplace(sectionIdx, timestamp);
+ if (!wasInserted)
+ it->getSecond() = std::min<size_t>(it->getSecond(), timestamp);
+ }
+
+ if (timestamp >= cutoffTimestamp || currentSize >= cutoffSize) {
+ ++maxUN;
+ cutoffSize = 2 * currentSize;
+ cutoffTimestamp = 2 * cutoffTimestamp;
+ }
+ for (unsigned sectionIdx : sectionIdxs)
+ sectionIdxToFirstUN.try_emplace(sectionIdx, maxUN);
+ }
+ for (auto &[sectionIdx, firstUN] : sectionIdxToFirstUN)
+ for (auto un = firstUN; un <= maxUN; ++un)
+ startupSectionIdxUNs[sectionIdx].push_back(un);
+ ++maxUN;
+ sectionIdxToFirstUN.clear();
+ }
+
+ // These uns should already be sorted without duplicates.
+ for (auto &[sectionIdx, uns] : startupSectionIdxUNs)
+ nodesForStartup.emplace_back(sectionIdx, uns);
+
+    llvm::sort(nodesForStartup, [&sectionIdxToTimestamp](auto &L, auto &R) {
+ return std::make_pair(sectionIdxToTimestamp[L.Id], L.Id) <
+ std::make_pair(sectionIdxToTimestamp[R.Id], R.Id);
+ });
+ }
+
+ SmallVector<unsigned> sectionIdxsForFunctionCompression,
+ sectionIdxsForDataCompression;
+ for (unsigned sectionIdx = 0; sectionIdx < sections.size(); sectionIdx++) {
+ if (startupSectionIdxUNs.count(sectionIdx))
+ continue;
+ const auto *isec = sections[sectionIdx];
+ if (isCodeSection(isec)) {
+ sectionIdxsForFunctionCompression.push_back(sectionIdx);
+ } else {
+ sectionIdxsForDataCompression.push_back(sectionIdx);
+ }
+ }
+
+ std::vector<BPFunctionNode> nodesForFunctionCompression,
+ nodesForDataCompression;
+ // Map a section index (to be ordered for compression) to a list of duplicate
+ // section indices (not ordered for compression).
+ DenseMap<unsigned, SmallVector<unsigned>> duplicateFunctionSectionIdxs,
+ duplicateDataSectionIdxs;
+ if (forFunctionCompression) {
+ TimeTraceScope timeScope("Build nodes for function compression");
+ constructNodesForCompression(
+ sections, sectionToIdx, sectionIdxsForFunctionCompression,
+ nodesForFunctionCompression, duplicateFunctionSectionIdxs, maxUN);
+ }
+ if (forDataCompression) {
+ TimeTraceScope timeScope("Build nodes for data compression");
+ constructNodesForCompression(
+ sections, sectionToIdx, sectionIdxsForDataCompression,
+ nodesForDataCompression, duplicateDataSectionIdxs, maxUN);
+ }
+
+ // Sort nodes by their Id (which is the section index) because the input
+ // linker order tends to be not bad
+ llvm::sort(nodesForFunctionCompression,
+ [](auto &L, auto &R) { return L.Id < R.Id; });
+ llvm::sort(nodesForDataCompression,
+ [](auto &L, auto &R) { return L.Id < R.Id; });
+
+ {
+ TimeTraceScope timeScope("Balanced Partitioning");
+ BalancedPartitioningConfig config;
+ BalancedPartitioning bp(config);
+ bp.run(nodesForStartup);
+ bp.run(nodesForFunctionCompression);
+ bp.run(nodesForDataCompression);
+ }
+
+ unsigned numStartupSections = 0;
+ unsigned numCodeCompressionSections = 0;
+ unsigned numDuplicateCodeSections = 0;
+ unsigned numDataCompressionSections = 0;
+ unsigned numDuplicateDataSections = 0;
+ SetVector<const InputSection *> orderedSections;
+ // Order startup functions,
+ for (auto &node : nodesForStartup) {
+ const auto *isec = sections[node.Id];
+ if (orderedSections.insert(isec))
+ ++numStartupSections;
+ }
+ // then functions for compression,
+ for (auto &node : nodesForFunctionCompression) {
+ const auto *isec = sections[node.Id];
+ if (orderedSections.insert(isec))
+ ++numCodeCompressionSections;
+
+ auto It = duplicateFunctionSectionIdxs.find(node.Id);
+ if (It == duplicateFunctionSectionIdxs.end())
+ continue;
+ for (auto dupSecIdx : It->getSecond()) {
+ const auto *dupIsec = sections[dupSecIdx];
+ if (orderedSections.insert(dupIsec))
+ ++numDuplicateCodeSections;
+ }
+ }
+ // then data for compression.
+ for (auto &node : nodesForDataCompression) {
+ const auto *isec = sections[node.Id];
+ if (orderedSections.insert(isec))
+ ++numDataCompressionSections;
+ auto It = duplicateDataSectionIdxs.find(node.Id);
+ if (It == duplicateDataSectionIdxs.end())
+ continue;
+ for (auto dupSecIdx : It->getSecond()) {
+ const auto *dupIsec = sections[dupSecIdx];
+ if (orderedSections.insert(dupIsec))
+ ++numDuplicateDataSections;
+ }
+ }
+
+#ifndef NDEBUG
+ unsigned numTotalOrderedSections =
+ numStartupSections + numCodeCompressionSections +
+ numDuplicateCodeSections + numDataCompressionSections +
+ numDuplicateDataSections;
+ dbgs() << "Ordered " << numTotalOrderedSections
+ << " sections using balanced partitioning:\n Functions for startup: "
+ << numStartupSections
+ << "\n Functions for compression: " << numCodeCompressionSections
+ << "\n Duplicate functions: " << numDuplicateCodeSections
+ << "\n Data for compression: " << numDataCompressionSections
+ << "\n Duplicate data: " << numDuplicateDataSections << "\n";
+
+ if (!profilePath.empty()) {
+ // Evaluate this function order for startup
+ StringMap<std::pair<uint64_t, uint64_t>> symbolToPageNumbers;
+ const uint64_t pageSize = (1 << 14);
+ uint64_t currentAddress = 0;
+ for (const auto *isec : orderedSections) {
+ for (Symbol *sym : isec->symbols) {
+ if (auto *d = dyn_cast_or_null<Defined>(sym)) {
+ uint64_t startAddress = currentAddress + d->value;
+ uint64_t endAddress = startAddress + d->size;
+ uint64_t firstPage = startAddress / pageSize;
+ // I think the kernel might pull in a few pages when one it touched,
+ // so it might be more accurate to force lastPage to be aligned by 4?
+ uint64_t lastPage = endAddress / pageSize;
+ StringRef rootSymbol = d->getName();
+ rootSymbol = getRootSymbol(rootSymbol);
+ symbolToPageNumbers.try_emplace(rootSymbol, firstPage, lastPage);
+ if (rootSymbol.consume_front("_") || rootSymbol.consume_front("l_"))
+ symbolToPageNumbers.try_emplace(rootSymbol, firstPage, lastPage);
+ }
+ }
+
+ currentAddress += isec->getSize();
+ }
+
+ // The area under the curve F where F(t) is the total number of page faults
+ // at step t.
+ unsigned area = 0;
+ for (auto &trace : reader->getTemporalProfTraces()) {
+ SmallSet<uint64_t, 0> touchedPages;
+ for (unsigned step = 0; step < trace.FunctionNameRefs.size(); step++) {
+ auto traceId = trace.FunctionNameRefs[step];
+ auto [Filename, ParsedFuncName] =
+ getParsedIRPGOName(reader->getSymtab().getFuncOrVarName(traceId));
+ ParsedFuncName = getRootSymbol(ParsedFuncName);
+ auto it = symbolToPageNumbers.find(ParsedFuncName);
+ if (it != symbolToPageNumbers.end()) {
+ auto &[firstPage, lastPage] = it->getValue();
+ for (uint64_t i = firstPage; i <= lastPage; i++)
+ touchedPages.insert(i);
+ }
+ area += touchedPages.size();
+ }
+ }
+ dbgs() << "Total area under the page fault curve: " << (float)area << "\n";
+ }
+#endif
+
+ DenseMap<const InputSection *, size_t> sectionPriorities;
+ for (const auto *isec : orderedSections)
+ sectionPriorities[isec] = --highestAvailablePriority;
+ return sectionPriorities;
+}
diff --git a/lld/MachO/BPSectionOrderer.h b/lld/MachO/BPSectionOrderer.h
new file mode 100644
index 0000000000000..9d8302b441a7c
--- /dev/null
+++ b/lld/MachO/BPSectionOrderer.h
@@ -0,0 +1,35 @@
+//===- BPSectionOrderer.h ---------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// This file uses Balanced Partitioning to order sections to improve startup
+/// time and compressed size.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLD_MACHO_BPSECTION_ORDERER_H
+#define LLD_MACHO_BPSECTION_ORDERER_H
+
+#include "llvm/ADT/DenseMap.h"
+
+namespace lld::macho {
+
+class InputSection;
+
+/// Run Balanced Partitioning to find the optimial function and data order to
+/// improve startup time and compressed size.
+///
+/// It is important that .subsections_via_symbols is used to ensure functions
+/// and data are in their own sections and thus can be reordered.
+llvm::DenseMap<const lld::macho::InputSection *, size_t>
+runBalancedPartitioning(size_t &highestAvailablePriority,
+ llvm::StringRef profilePath,
+ bool forFunctionCompression, bool forDataCompression);
+
+} // namespace lld::macho
+
+#endif
diff --git a/lld/MachO/CMakeLists.txt b/lld/MachO/CMakeLists.txt
index 0b92488b00bea..8b7183e4ec496 100644
--- a/lld/MachO/CMakeLists.txt
+++ b/lld/MachO/CMakeLists.txt
@@ -25,6 +25,7 @@ add_lld_library(lldMachO
OutputSection.cpp
OutputSegment.cpp
Relocations.cpp
+ BPSectionOrderer.cpp
SectionPriorities.cpp
SymbolTable.cpp
Symbols.cpp
@@ -47,6 +48,7 @@ add_lld_library(lldMachO
Object
Option
Passes
+ ProfileData
Support
TargetParser
TextAPI
diff --git a/lld/MachO/Config.h b/lld/MachO/Config.h
index 96253e15f7eea..780fea9d38669 100644
--- a/lld/MachO/Config.h
+++ b/lld/MachO/Config.h
@@ -211,6 +211,9 @@ struct Configuration {
bool csProfileGenerate = false;
llvm::StringRef csProfilePath;
bool pgoWarnMismatch;
+ llvm::StringRef profileGuidedFunctionOrderPath;
+ bool functionOrderForCompression;
+ bool dataOrderForCompression;
bool callGraphProfileSort = false;
llvm::StringRef printSymbolOrder;
diff --git a/lld/MachO/Driver.cpp b/lld/MachO/Driver.cpp
index 9dddabcf3680c..5109ed18b37ac 100644
--- a/lld/MachO/Driver.cpp
+++ b/lld/MachO/Driver.cpp
@@ -1676,6 +1676,11 @@ bool link(ArrayRef<const char *> argsArr, llvm::raw_ostream &stdoutOS,
config->pgoWarnMismatch =
args.hasFlag(OPT_pgo_warn_mismatch, OPT_no_pgo_warn_mismatch, true);
config->generateUuid = !args.hasArg(OPT_no_uuid);
+ config->profileGuidedFunctionOrderPath =
+ args.getLastArgValue(OPT_profile_guided_function_order);
+ config->functionOrderForCompression =
+ args.hasArg(OPT_function_order_for_compression);
+ config->dataOrderForCompression = args.hasArg(OPT_data_order_for_compression);
for (const Arg *arg : args.filtered(OPT_alias)) {
config->aliasedSymbols.push_back(
diff --git a/lld/MachO...
[truncated]
|
lld/MachO/BPSectionOrderer.cpp
Outdated
using namespace llvm; | ||
using namespace lld::macho; | ||
|
||
// TODO: Move to StringRef.h |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Was this meant to be a part of this PR or in a separate PR?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I was planning on it being a separate PR since it is a library change. I'm sure other folks would have opinions on this, and I don't want that conversation to be here.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This reminds me of what I'd do with these symbols by utilizing more characters to make the symbol names shorter...
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100437
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If this will be changed in the future, maybe I should just drop this check and accept any suffix after .llvm.
.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, .llvm.
should be sufficient. The 0123456789
check also introduced some overhead.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'll remove the isNumber()
checks.
|
||
SmallVector<unsigned> sectionIdxsForFunctionCompression, | ||
sectionIdxsForDataCompression; | ||
for (unsigned sectionIdx = 0; sectionIdx < sections.size(); sectionIdx++) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looks like we pay the cost of this loop even if we don't enable compression for function or data. Should this be guarded under the condition if either flag is present?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I moved this check for forFunctionCompression
earlier. I'm hoping the optimizer is smart enough to skip the whole loop if both forFunctionCompression
and forDataCompression
are false, but I haven't verified this yet.
|
||
std::vector<BPFunctionNode> nodesForStartup; | ||
BPFunctionNode::UtilityNodeT maxUN = 0; | ||
DenseMap<unsigned, SmallVector<BPFunctionNode::UtilityNodeT>> |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
SmallVector<X>
tries to make the class 64 bytes (kPreferredSmallVectorSizeof
). With a DenseMap, you usually want to make the value type as small as possible. Would an inline element count of 0 or 1 be better?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Since the element type is uint32_t
, I guess it can hold 16 UNs before it allocates memory. I would expect each function to have at least as many UNs as the number of traces (raw profiles), and many of them would have a few times more than that. So I think 16 UNs is a good initial value in this case. Would it be better for the map to hold pointers to the vectors?
lld/MachO/BPSectionOrderer.cpp
Outdated
} | ||
} | ||
|
||
#ifndef NDEBUG |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Perhaps add a hidden option to support dumping internal states, instead of relying on !NDEBUG
. Then the test would not need asserts
.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I've added --verbose-bp-section-orderer
, but I'm open to other names.
#ifndef LLD_MACHO_BPSECTION_ORDERER_H | ||
#define LLD_MACHO_BPSECTION_ORDERER_H | ||
|
||
#include "llvm/ADT/DenseMap.h" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For IWYU also include headers for StringRef
and InputSection
. Forward declarations can be used as well.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I already have a forward declaration for InputSection
, but I've added StringRef.h
.
Add the lld flags `--profile-guided-function-order=<profile>`, `--function-order-for-compression`, and `--data-order-for-compression` to order functions to improve startup time, and functions and data to improve compressed size, respectively. We use Balanced Partitioning to determine the best section order using traces from IRPGO profiles (see https://discourse.llvm.org/t/rfc-temporal-profiling-extension-for-irpgo/68068 for details) to improve startup time and hashes of section contents to improve compressed size. In my recent LLVM talk, I showed that we can reduce page faults during startup by 40% on a large iOS app and we can reduce compressed size by 0.8-3%. More details can be found in https://dl.acm.org/doi/10.1145/3660635
ea74ff5
to
503fb68
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
lgtm
Add the lld flags `--irpgo-profile-sort=<profile>` and `--compression-sort={function,data,both}` to order functions to improve startup time, and functions or data to improve compressed size, respectively. We use Balanced Partitioning to determine the best section order using traces from IRPGO profiles (see https://discourse.llvm.org/t/rfc-temporal-profiling-extension-for-irpgo/68068 for details) to improve startup time and using hashes of section contents to improve compressed size. In our recent LLVM talk (https://www.youtube.com/watch?v=yd4pbSTjwuA), we showed that this can reduce page faults during startup by 40% on a large iOS app and we can reduce compressed size by 0.8-3%. More details can be found in https://dl.acm.org/doi/10.1145/3660635 --------- Co-authored-by: Vincent Lee <thevinster@users.noreply.github.com>
kind = ("Section " + Twine(isec->kind())).str(); | ||
if (auto *sym = reloc.referent.dyn_cast<Symbol *>()) { | ||
kind += (" Symbol " + Twine(sym->kind())).str(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These two lines are producing warnings/errors when compiled at least with clang-9:
llvm-project/lld/MachO/BPSectionOrderer.cpp:51:26: error: ambiguous conversion for functional-style cast from 'lld::macho::InputSection::Kind' to 'llvm::Twine'
kind = ("Section " + Twine(isec->kind())).str();
^~~~~~~~~~~~~~~~~~
/work/third_party/llvm-project/llvm/include/llvm/ADT/Twine.h:344:14: note: candidate constructor
explicit Twine(unsigned char Val) : LHSKind(CharKind) {
^
/work/third_party/llvm-project/llvm/include/llvm/ADT/Twine.h:354:14: note: candidate constructor
explicit Twine(int Val) : LHSKind(DecIKind) {
^
/work/third_party/llvm-project/llvm/include/llvm/ADT/Twine.h:334:14: note: candidate constructor
explicit Twine(char Val) : LHSKind(CharKind) {
^
/work/third_party/llvm-project/llvm/include/llvm/ADT/Twine.h:339:14: note: candidate constructor
explicit Twine(signed char Val) : LHSKind(CharKind) {
^
/work/third_party/llvm-project/llvm/include/llvm/ADT/Twine.h:349:14: note: candidate constructor
explicit Twine(unsigned Val) : LHSKind(DecUIKind) {
^
/work/third_party/llvm-project/llvm/include/llvm/ADT/Twine.h:359:14: note: candidate constructor
explicit Twine(const unsigned long &Val) : LHSKind(DecULKind) {
^
/work/third_party/llvm-project/llvm/include/llvm/ADT/Twine.h:364:14: note: candidate constructor
explicit Twine(const long &Val) : LHSKind(DecLKind) {
^
/work/third_party/llvm-project/llvm/include/llvm/ADT/Twine.h:369:14: note: candidate constructor
explicit Twine(const unsigned long long &Val) : LHSKind(DecULLKind) {
^
/work/third_party/llvm-project/llvm/include/llvm/ADT/Twine.h:374:14: note: candidate constructor
explicit Twine(const long long &Val) : LHSKind(DecLLKind) {
^
1 error generated.
(downstream CI logs: https://github.com/iree-org/iree/actions/runs/10097500625/job/27922509801?pr=18004#step:4:6442)
The enum
llvm-project/lld/MachO/InputSection.h
Lines 32 to 38 in 88fb56e
enum Kind : uint8_t { | |
ConcatKind, | |
CStringLiteralKind, | |
WordLiteralKind, | |
}; | |
Kind kind() const { return sectionKind; } |
needs an explicit conversion, e.g.
llvm-project$ git diff
diff --git a/lld/MachO/BPSectionOrderer.cpp b/lld/MachO/BPSectionOrderer.cpp
index 26d4e0cb3987..f6a974370836 100644
--- a/lld/MachO/BPSectionOrderer.cpp
+++ b/lld/MachO/BPSectionOrderer.cpp
@@ -48,9 +48,9 @@ getRelocHash(const Reloc &reloc,
sectionIdx = sectionIdxIt->getSecond();
std::string kind;
if (isec)
- kind = ("Section " + Twine(isec->kind())).str();
+ kind = ("Section " + Twine((uint8_t)isec->kind())).str();
if (auto *sym = reloc.referent.dyn_cast<Symbol *>()) {
- kind += (" Symbol " + Twine(sym->kind())).str();
+ kind += (" Symbol " + Twine((uint8_t)sym->kind())).str();
if (auto *d = dyn_cast<Defined>(sym)) {
if (isa_and_nonnull<CStringInputSection>(isec))
return getRelocHash(kind, 0, isec->getOffset(d->value), reloc.addend);
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fix: #100627
This fixes `error: ambiguous conversion for functional-style cast from 'lld::macho::InputSection::Kind' to 'llvm::Twine'`, observed when building with clang-9 and reported here: #96268 (comment).
Summary: Add the lld flags `--irpgo-profile-sort=<profile>` and `--compression-sort={function,data,both}` to order functions to improve startup time, and functions or data to improve compressed size, respectively. We use Balanced Partitioning to determine the best section order using traces from IRPGO profiles (see https://discourse.llvm.org/t/rfc-temporal-profiling-extension-for-irpgo/68068 for details) to improve startup time and using hashes of section contents to improve compressed size. In our recent LLVM talk (https://www.youtube.com/watch?v=yd4pbSTjwuA), we showed that this can reduce page faults during startup by 40% on a large iOS app and we can reduce compressed size by 0.8-3%. More details can be found in https://dl.acm.org/doi/10.1145/3660635 --------- Co-authored-by: Vincent Lee <thevinster@users.noreply.github.com> Test Plan: Reviewers: Subscribers: Tasks: Tags: Differential Revision: https://phabricator.intern.facebook.com/D60251039
auto window = isec->data.drop_front(i).take_front(windowSize); | ||
hashes.push_back(xxHash64(window)); | ||
} | ||
for (const auto &r : isec->relocs) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hi.
Can you provide a detailed explanation about how these hash calculations are used and why choose to append the relocation-related hash into the end of original hashes?
In my understanding, this code is used to pre-calculate relocation to represent the instruction of final linked binary. In that case, I think it's better to use relocation to get the original instruction hash (from the previous hashes vector), and re-calculate the hash using hash_combine to update it. Instead of appending new hashes to the end?
And, I can not get the idea of the next line's sorting and unique operation, doesn't this may break the original input order of instruction hash, can the bpc algorithm still handle the thing correctly?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The BP algorithm uses a set of hashes, not an ordered list. This is why we sort and remove duplicates on line 94-95. This may be confusing because two functions can have the same set of hashes even if their instructions have different orders. But in fact, this is what we want. Compression algorithms will compress well if two functions close together have shared instructions (k-mers), not necessarily the exact instruction order.
This code collects two "types" of hashes.
- A sliding window of the section data. You can think of this as having a single hash for each instruction, although this is not technically correct.
- A hash for each relocation. I try to hash the target of the relocation rather than the bits themselves.
In an earlier version, I actually did compute 1 and 2 at the same time and combined the relocations into the hash of the data hash, but I found the code to be more complicated and the results were worse.
I suspect that separating 1 and 2 improves compression because some functions are similar, but their call, load, or store targets are different. These functions should be ordered close together, even if their instructions aren't identical.
You are welcome to experiment with this. The nice thing about BP is that using a different set of hashes can never break anything. At worst, it can regress startup performance or compression. Or it can improve things. Let me know if you have ideas and I can also try on our apps.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I've tested the append
vs update
the relocation hash two policy. And it seems that it shows no huge differences on compressed as well as binary size. So it's OK to explain why we chose the easy one.
Tested project (compression ratio):
- lld: 0.693896256 (update old hash) / 0.69398724 (append reloc hash)
- Video App: 0.569880917 (update old hash) / 0.567276219 (append reloc hash)
Rename options related to profile guided function order (#96268) to prepare for the addition to the ELF port.
Rename options related to profile guided function order (llvm#96268) to prepare for the addition to the ELF port.
Add the lld flags
--irpgo-profile-sort=<profile>
and--compression-sort={function,data,both}
to order functions to improve startup time, and functions or data to improve compressed size, respectively.We use Balanced Partitioning to determine the best section order using traces from IRPGO profiles (see https://discourse.llvm.org/t/rfc-temporal-profiling-extension-for-irpgo/68068 for details) to improve startup time and using hashes of section contents to improve compressed size.
In our recent LLVM talk (https://www.youtube.com/watch?v=yd4pbSTjwuA), we showed that this can reduce page faults during startup by 40% on a large iOS app and we can reduce compressed size by 0.8-3%.
More details can be found in https://dl.acm.org/doi/10.1145/3660635