Skip to content

Commit

Permalink
[lld][InstrProf] Profile guided function order (#96268)
Browse files Browse the repository at this point in the history
Add two lld flags: `--irpgo-profile-sort=<profile>`, which orders functions
to improve startup time, and `--compression-sort={function,data,both}`,
which orders functions, data, or both to improve compressed size.

We use Balanced Partitioning to determine the best section order using
traces from IRPGO profiles (see
https://discourse.llvm.org/t/rfc-temporal-profiling-extension-for-irpgo/68068
for details) to improve startup time and using hashes of section
contents to improve compressed size.

In our recent LLVM talk (https://www.youtube.com/watch?v=yd4pbSTjwuA),
we showed that this can reduce page faults during startup by 40% on a
large iOS app and reduce compressed size by 0.8-3%.

More details can be found in https://dl.acm.org/doi/10.1145/3660635

---------

Co-authored-by: Vincent Lee <thevinster@users.noreply.github.com>
  • Loading branch information
ellishg and thevinster authored Jul 23, 2024
1 parent b42fe67 commit e3b30bc
Show file tree
Hide file tree
Showing 10 changed files with 740 additions and 1 deletion.
413 changes: 413 additions & 0 deletions lld/MachO/BPSectionOrderer.cpp

Large diffs are not rendered by default.

37 changes: 37 additions & 0 deletions lld/MachO/BPSectionOrderer.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
//===- BPSectionOrderer.h ---------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// This file uses Balanced Partitioning to order sections to improve startup
/// time and compressed size.
///
//===----------------------------------------------------------------------===//

#ifndef LLD_MACHO_BPSECTION_ORDERER_H
#define LLD_MACHO_BPSECTION_ORDERER_H

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/StringRef.h"

namespace lld::macho {

class InputSection;

/// Run Balanced Partitioning to find a function and data order that improves
/// startup time and compressed size.
///
/// It is important that .subsections_via_symbols is used to ensure functions
/// and data are in their own sections and thus can be reordered.
///
/// \param highestAvailablePriority Priority counter shared with the caller.
///        NOTE(review): taken by non-const reference, so it is presumably
///        updated as priorities are handed out -- confirm against
///        BPSectionOrderer.cpp.
/// \param profilePath Path of the IRPGO profile used for startup ordering.
///        NOTE(review): callers may pass an empty path when only compression
///        ordering is requested -- confirm it is treated as "no profile".
/// \param forFunctionCompression Order function sections to improve
///        compressed size.
/// \param forDataCompression Order data sections to improve compressed size.
/// \param verbose Print information about the ordering that was computed.
/// \returns A map from input section to the priority assigned to it.
llvm::DenseMap<const lld::macho::InputSection *, size_t>
runBalancedPartitioning(size_t &highestAvailablePriority,
llvm::StringRef profilePath,
bool forFunctionCompression, bool forDataCompression,
bool verbose);

} // namespace lld::macho

#endif
2 changes: 2 additions & 0 deletions lld/MachO/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ add_lld_library(lldMachO
OutputSection.cpp
OutputSegment.cpp
Relocations.cpp
BPSectionOrderer.cpp
SectionPriorities.cpp
Sections.cpp
SymbolTable.cpp
Expand All @@ -48,6 +49,7 @@ add_lld_library(lldMachO
Object
Option
Passes
ProfileData
Support
TargetParser
TextAPI
Expand Down
5 changes: 5 additions & 0 deletions lld/MachO/Config.h
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,11 @@ struct Configuration {
bool callGraphProfileSort = false;
llvm::StringRef printSymbolOrder;

// Options for the balanced partitioning section orderer.
// Value of --irpgo-profile-sort=<profile>; stays empty when the flag is absent.
llvm::StringRef irpgoProfileSortProfilePath;
// Enabled by --compression-sort=function and --compression-sort=both.
bool functionOrderForCompression = false;
// Enabled by --compression-sort=data and --compression-sort=both.
bool dataOrderForCompression = false;
// Enabled by --verbose-bp-section-orderer.
bool verboseBpSectionOrderer = false;

SectionRenameMap sectionRenameMap;
SegmentRenameMap segmentRenameMap;

Expand Down
28 changes: 28 additions & 0 deletions lld/MachO/Driver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1750,6 +1750,34 @@ bool link(ArrayRef<const char *> argsArr, llvm::raw_ostream &stdoutOS,
OPT_no_warn_thin_archive_missing_members, true);
config->generateUuid = !args.hasArg(OPT_no_uuid);

auto IncompatWithCGSort = [&](StringRef firstArgStr) {
// Throw an error only if --call-graph-profile-sort is explicitly specified
if (config->callGraphProfileSort)
if (const Arg *arg = args.getLastArgNoClaim(OPT_call_graph_profile_sort))
error(firstArgStr + " is incompatible with " + arg->getSpelling());
};
if (const Arg *arg = args.getLastArg(OPT_irpgo_profile_sort)) {
config->irpgoProfileSortProfilePath = arg->getValue();
IncompatWithCGSort(arg->getSpelling());
}
if (const Arg *arg = args.getLastArg(OPT_compression_sort)) {
StringRef compressionSortStr = arg->getValue();
if (compressionSortStr == "function") {
config->functionOrderForCompression = true;
} else if (compressionSortStr == "data") {
config->dataOrderForCompression = true;
} else if (compressionSortStr == "both") {
config->functionOrderForCompression = true;
config->dataOrderForCompression = true;
} else if (compressionSortStr != "none") {
error("unknown value `" + compressionSortStr + "` for " +
arg->getSpelling());
}
if (compressionSortStr != "none")
IncompatWithCGSort(arg->getSpelling());
}
config->verboseBpSectionOrderer = args.hasArg(OPT_verbose_bp_section_orderer);

for (const Arg *arg : args.filtered(OPT_alias)) {
config->aliasedSymbols.push_back(
std::make_pair(arg->getValue(0), arg->getValue(1)));
Expand Down
10 changes: 10 additions & 0 deletions lld/MachO/Options.td
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,16 @@ def no_call_graph_profile_sort : Flag<["--"], "no-call-graph-profile-sort">,
def print_symbol_order_eq: Joined<["--"], "print-symbol-order=">,
HelpText<"Print a symbol order specified by --call-graph-profile-sort into the specified file">,
Group<grp_lld>;
// Options controlling the balanced partitioning section orderer.

// Takes the path of an IRPGO profile as its joined value.
def irpgo_profile_sort: Joined<["--"], "irpgo-profile-sort=">,
MetaVarName<"<profile>">,
HelpText<"Read the IRPGO profile at <profile> to order sections to improve startup time">,
Group<grp_lld>;
// The driver accepts none, function, data and both; any other value is
// reported as an error.
def compression_sort: Joined<["--"], "compression-sort=">,
MetaVarName<"[none,function,data,both]">,
HelpText<"Order sections to improve compressed size">, Group<grp_lld>;
// Plain flag with no value.
def verbose_bp_section_orderer: Flag<["--"], "verbose-bp-section-orderer">,
HelpText<"Print information on how many sections were ordered by balanced partitioning and a measure of the expected number of page faults">,
Group<grp_lld>;
def ignore_auto_link_option : Separate<["--"], "ignore-auto-link-option">,
Group<grp_lld>;
def ignore_auto_link_option_eq : Joined<["--"], "ignore-auto-link-option=">,
Expand Down
10 changes: 9 additions & 1 deletion lld/MachO/SectionPriorities.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//

#include "SectionPriorities.h"
#include "BPSectionOrderer.h"
#include "Config.h"
#include "InputFiles.h"
#include "Symbols.h"
Expand Down Expand Up @@ -352,7 +353,14 @@ void macho::PriorityBuilder::parseOrderFile(StringRef path) {
DenseMap<const InputSection *, size_t>
macho::PriorityBuilder::buildInputSectionPriorities() {
DenseMap<const InputSection *, size_t> sectionPriorities;
if (config->callGraphProfileSort) {
if (!config->irpgoProfileSortProfilePath.empty() ||
config->functionOrderForCompression || config->dataOrderForCompression) {
TimeTraceScope timeScope("Balanced Partitioning Section Orderer");
sectionPriorities = runBalancedPartitioning(
highestAvailablePriority, config->irpgoProfileSortProfilePath,
config->functionOrderForCompression, config->dataOrderForCompression,
config->verboseBpSectionOrderer);
} else if (config->callGraphProfileSort) {
// Sort sections by the profile data provided by __LLVM,__cg_profile
// sections.
//
Expand Down
8 changes: 8 additions & 0 deletions lld/test/MachO/bp-section-orderer-errs.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# RUN: not %lld -o /dev/null --irpgo-profile-sort=%s --call-graph-profile-sort 2>&1 | FileCheck %s --check-prefix=IRPGO-ERR
# IRPGO-ERR: --irpgo-profile-sort= is incompatible with --call-graph-profile-sort

# RUN: not %lld -o /dev/null --compression-sort=function --call-graph-profile-sort %s 2>&1 | FileCheck %s --check-prefix=COMPRESSION-ERR
# COMPRESSION-ERR: --compression-sort= is incompatible with --call-graph-profile-sort

# RUN: not %lld -o /dev/null --compression-sort=malformed 2>&1 | FileCheck %s --check-prefix=COMPRESSION-MALFORM
# COMPRESSION-MALFORM: unknown value `malformed` for --compression-sort=
105 changes: 105 additions & 0 deletions lld/test/MachO/bp-section-orderer-stress.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
# REQUIRES: aarch64

# Generate a large test case and check that the output is deterministic.

# RUN: %python %s %t.s %t.proftext

# RUN: llvm-mc -filetype=obj -triple=arm64-apple-darwin %t.s -o %t.o
# RUN: llvm-profdata merge %t.proftext -o %t.profdata

# RUN: %lld -arch arm64 -lSystem -e _main --icf=all -o - %t.o --irpgo-profile-sort=%t.profdata --compression-sort=both | llvm-nm --numeric-sort --format=just-symbols - > %t.order1.txt
# RUN: %lld -arch arm64 -lSystem -e _main --icf=all -o - %t.o --irpgo-profile-sort=%t.profdata --compression-sort=both | llvm-nm --numeric-sort --format=just-symbols - > %t.order2.txt
# RUN: diff %t.order1.txt %t.order2.txt

import random
import sys

# Output locations come from the command line: the assembly file first, then
# the text profile.
assembly_filepath = sys.argv[1]
proftext_filepath = sys.argv[2]

# A fixed seed keeps the generated traces identical from one invocation to
# the next.
random.seed(1234)

num_functions = 1000
num_data = 100
num_traces = 10

function_names = ["f" + str(n) for n in range(num_functions)]
data_names = ["d" + str(n) for n in range(num_data)]
# Only the first half of the functions appear in the profile.
profiled_functions = function_names[: num_functions // 2]


def render_function(i, name):
    """Return the assembly text for one small function."""
    return f"""
{name}:
add w0, w0, #{i % 4096}
add w1, w1, #{i % 10}
add w2, w0, #{i % 20}
adrp x3, {name}@PAGE
ret
"""


def render_data(i, name):
    """Return the assembly text for one data symbol."""
    return f"""
{name}:
.ascii "s{i % 2}-{i % 3}-{i % 5}"
.xword {name}
"""


def render_trace():
    """Return one weight-1 trace listing every profiled function in random order."""
    shuffled = random.sample(profiled_functions, len(profiled_functions))
    return f"""
# Weight
1
{", ".join(shuffled)}
"""


def render_profile_record(i, name):
    """Return the text profile record for one profiled function."""
    return f"""
{name}
# Func Hash:
{i}
# Num Counters:
1
# Counter Values:
1
"""


# Build every piece of text up front; the traces are the only consumers of the
# random number generator.
function_contents = "".join(
    render_function(i, name) for i, name in enumerate(function_names)
)
data_contents = "".join(render_data(i, name) for i, name in enumerate(data_names))
trace_contents = "".join(render_trace() for _ in range(num_traces))
profile_contents = "".join(
    render_profile_record(i, name) for i, name in enumerate(profiled_functions)
)

with open(assembly_filepath, "w") as asm_file:
    asm_file.write(
        f"""
.text
.globl _main

_main:
ret

{function_contents}

.data
{data_contents}

.subsections_via_symbols
"""
    )

with open(proftext_filepath, "w") as prof_file:
    prof_file.write(
        f"""
:ir
:temporal_prof_traces

# Num Traces
{num_traces}
# Trace Stream Size:
{num_traces}

{trace_contents}

{profile_contents}
"""
    )
123 changes: 123 additions & 0 deletions lld/test/MachO/bp-section-orderer.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
# REQUIRES: aarch64

# RUN: rm -rf %t && split-file %s %t
# RUN: llvm-mc -filetype=obj -triple=arm64-apple-darwin %t/a.s -o %t/a.o
# RUN: llvm-profdata merge %t/a.proftext -o %t/a.profdata

# RUN: %lld -arch arm64 -lSystem -e _main -o %t/a.out %t/a.o --irpgo-profile-sort=%t/a.profdata --verbose-bp-section-orderer 2>&1 | FileCheck %s --check-prefix=STARTUP
# RUN: %lld -arch arm64 -lSystem -e _main -o %t/a.out %t/a.o --irpgo-profile-sort=%t/a.profdata --verbose-bp-section-orderer --icf=all --compression-sort=none 2>&1 | FileCheck %s --check-prefix=STARTUP

# STARTUP: Ordered 3 sections using balanced partitioning

# RUN: %lld -arch arm64 -lSystem -e _main -o - %t/a.o --irpgo-profile-sort=%t/a.profdata -order_file %t/a.orderfile | llvm-nm --numeric-sort --format=just-symbols - | FileCheck %s --check-prefix=ORDERFILE

# ORDERFILE: A
# ORDERFILE: F
# ORDERFILE: E
# ORDERFILE: D
# ORDERFILE-DAG: _main
# ORDERFILE-DAG: _B
# ORDERFILE-DAG: l_C
# ORDERFILE-DAG: s1
# ORDERFILE-DAG: s2
# ORDERFILE-DAG: r1
# ORDERFILE-DAG: r2

# RUN: %lld -arch arm64 -lSystem -e _main -o %t/a.out %t/a.o --verbose-bp-section-orderer --compression-sort=function 2>&1 | FileCheck %s --check-prefix=COMPRESSION-FUNC
# RUN: %lld -arch arm64 -lSystem -e _main -o %t/a.out %t/a.o --verbose-bp-section-orderer --compression-sort=data 2>&1 | FileCheck %s --check-prefix=COMPRESSION-DATA
# RUN: %lld -arch arm64 -lSystem -e _main -o %t/a.out %t/a.o --verbose-bp-section-orderer --compression-sort=both 2>&1 | FileCheck %s --check-prefix=COMPRESSION-BOTH
# RUN: %lld -arch arm64 -lSystem -e _main -o %t/a.out %t/a.o --verbose-bp-section-orderer --compression-sort=both --irpgo-profile-sort=%t/a.profdata 2>&1 | FileCheck %s --check-prefix=COMPRESSION-BOTH

# COMPRESSION-FUNC: Ordered 7 sections using balanced partitioning
# COMPRESSION-DATA: Ordered 4 sections using balanced partitioning
# COMPRESSION-BOTH: Ordered 11 sections using balanced partitioning

#--- a.s
.text
.globl _main, A, _B, l_C.__uniq.111111111111111111111111111111111111111.llvm.2222222222222222222

_main:
ret
A:
ret
_B:
add w0, w0, #1
bl A
ret
l_C.__uniq.111111111111111111111111111111111111111.llvm.2222222222222222222:
add w0, w0, #2
bl A
ret
D:
add w0, w0, #2
bl _B
ret
E:
add w0, w0, #2
bl l_C.__uniq.111111111111111111111111111111111111111.llvm.2222222222222222222
ret
F:
add w0, w0, #3
bl l_C.__uniq.111111111111111111111111111111111111111.llvm.2222222222222222222
ret

.data
s1:
.ascii "hello world"
s2:
.ascii "i am a string"
r1:
.quad s1
r2:
.quad r1

.subsections_via_symbols

#--- a.proftext
:ir
:temporal_prof_traces
# Num Traces
1
# Trace Stream Size:
1
# Weight
1
A, B, C.__uniq.555555555555555555555555555555555555555.llvm.6666666666666666666

A
# Func Hash:
1111
# Num Counters:
1
# Counter Values:
1

B
# Func Hash:
2222
# Num Counters:
1
# Counter Values:
1

C.__uniq.555555555555555555555555555555555555555.llvm.6666666666666666666
# Func Hash:
3333
# Num Counters:
1
# Counter Values:
1

D
# Func Hash:
4444
# Num Counters:
1
# Counter Values:
1

#--- a.orderfile
A
F
E
D

0 comments on commit e3b30bc

Please sign in to comment.