Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Tracy time with instrumentation #9170

Merged
merged 36 commits into from
Oct 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
b03db36
start
codygunton Oct 2, 2024
5593c88
use new macro, things build
codygunton Oct 2, 2024
e2338a1
This produces a reasonable profile
codygunton Oct 2, 2024
32215c0
start new presets
codygunton Oct 2, 2024
8970c29
Cut out old macro interplay
codygunton Oct 2, 2024
9cd94c6
Don't use overloaded memory ops in general
codygunton Oct 2, 2024
16e8ce8
Remove more unused macros
codygunton Oct 2, 2024
ea7907f
WIP: try to chain to run on benching instance
codygunton Oct 2, 2024
2e963a9
Default preset and quieter cmake and taskset
codygunton Oct 2, 2024
0a7ae29
profile parts of pippenger
codygunton Oct 3, 2024
2e22bc0
change default for convenience
codygunton Oct 3, 2024
a98a68c
multithread random polynomial generation
codygunton Oct 3, 2024
124a6a4
some more
codygunton Oct 4, 2024
ca44733
Set up local benchmarking
codygunton Oct 4, 2024
df005d7
Time profile for wasm on airplane
codygunton Oct 5, 2024
6584a1a
Revert "Time profile for wasm on airplane"
codygunton Oct 5, 2024
fea8b01
Use new instrumentation macro
codygunton Oct 6, 2024
11b1cb4
Merge remote-tracking branch 'origin/master' into cg/instrument
codygunton Oct 11, 2024
140f70e
Conditionally run with sudo
codygunton Oct 11, 2024
d12217d
Reset wasi sdk prefix
codygunton Oct 11, 2024
6228bfe
Nix script attempt
codygunton Oct 11, 2024
afb1d27
Reset default bench
codygunton Oct 11, 2024
a78e9b6
Rename script
codygunton Oct 11, 2024
bf3275f
Revert bench args change
codygunton Oct 11, 2024
6f5f1ec
Revert commit bench
codygunton Oct 11, 2024
cc69f25
Set up samply profiling script
codygunton Oct 11, 2024
04ddc03
Rename scripts for consistency
codygunton Oct 11, 2024
9120f26
Base instrumented on clang16
codygunton Oct 11, 2024
002a1fc
Put code running back
codygunton Oct 11, 2024
0182c60
bring back ENABLE_TRACY block
codygunton Oct 11, 2024
0efdf9d
Try to fix libs linker error
codygunton Oct 11, 2024
83b79e8
Clean up cmake stuff a bit
codygunton Oct 11, 2024
b5c1b8e
Merge remote-tracking branch 'origin/master' into cg/instrument
codygunton Oct 11, 2024
4ffb5ba
or ~> OR
codygunton Oct 11, 2024
aa9ddfd
Satisfy formatter
codygunton Oct 11, 2024
4b6bd73
Avoid formatter issue
codygunton Oct 11, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion barretenberg/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -477,4 +477,4 @@ See Tracy manual linked here <https://github.com/wolfpld/tracy> for in-depth Tra
The basic use of Tracy is to run a benchmark with the `cmake --preset tracy` build type, create a capture file, then
transfer it to a local machine for interactive UI introspection.

All the steps to do this effectively are included in cpp/scripts/benchmark_tracy.sh
All the steps to do this effectively are included in various scripts in cpp/scripts/.
13 changes: 7 additions & 6 deletions barretenberg/cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -54,21 +54,22 @@ if(CHECK_CIRCUIT_STACKTRACES)
add_compile_options(-DCHECK_CIRCUIT_STACKTRACES)
endif()

if(ENABLE_TRACY)
if(ENABLE_TRACY OR ENABLE_TRACY_TIME_INSTRUMENTED)
add_compile_options(-DTRACY_ENABLE)
SET(TRACY_LIBS Tracy::TracyClient)
else()
SET(TRACY_LIBS)
endif()

if(TRACY_PROFILE_MEMORY)
add_compile_options(-DTRACY_MEMORY)
endif()

if(TRACY_PROFILE_TIME)
add_compile_options(-DTRACY_TIME)
if(ENABLE_TRACY_TIME_INSTRUMENTED)
add_compile_options(-DTRACY_INSTRUMENTED)
endif()

if(TRACY_PROFILE_MEMORY)
add_compile_options(-DTRACY_MEMORY)
add_compile_options(-DTRACY_INSTRUMENTED)
endif()

if(ENABLE_ASAN)
add_compile_options(-fsanitize=address)
Expand Down
34 changes: 24 additions & 10 deletions barretenberg/cpp/CMakePresets.json
Original file line number Diff line number Diff line change
Expand Up @@ -112,8 +112,8 @@
},
{
"name": "tracy-memory",
"displayName": "Release build with tracy, optimized for memory tracking",
"description": "Release build with tracy, optimized for memory tracking",
"displayName": "Release build with tracy with memory tracking",
"description": "Release build with tracy with memory tracking",
"inherits": "clang16",
"binaryDir": "build-tracy-memory",
"cacheVariables": {
Expand All @@ -122,20 +122,29 @@
}
},
{
"name": "tracy-time",
"displayName": "Build for tracy time profiling",
"description": "Build for tracy time profiling",
"binaryDir": "build-tracy-time",
"name": "tracy-time-instrumented",
"displayName": "Build for tracy time profiling via instrumentation",
"description": "Build for tracy time profiling via instrumentation",
"binaryDir": "build-tracy-time-instrumented",
"inherits": "clang16",
"cacheVariables": {
"ENABLE_TRACY_TIME_INSTRUMENTED": "ON"
}
},
{
"name": "tracy-time-sampled",
"displayName": "Build for tracy time profiling via sampling",
"description": "Build for tracy time profiling via sampling",
"binaryDir": "build-tracy-time-sampled",
"inherits": "default",
"environment": {
"CMAKE_BUILD_TYPE": "RelWithDebInfo",
"CFLAGS": "-g -fno-omit-frame-pointer",
"CXXFLAGS": "-g -fno-omit-frame-pointer",
"LDFLAGS": "-g -fno-omit-frame-pointer -rdynamic"
},
"cacheVariables": {
"ENABLE_TRACY": "ON",
"TRACY_PROFILE_TIME": "ON"
"ENABLE_TRACY": "ON"
}
},
{
Expand Down Expand Up @@ -497,9 +506,14 @@
"configurePreset": "tracy-memory"
},
{
"name": "tracy-time",
"name": "tracy-time-instrumented",
"inherits": "default",
"configurePreset": "tracy-time-instrumented"
},
{
"name": "tracy-time-sampled",
"inherits": "default",
"configurePreset": "tracy-time"
"configurePreset": "tracy-time-sampled"
},
{
"name": "clang16-pic",
Expand Down
4 changes: 2 additions & 2 deletions barretenberg/cpp/scripts/benchmark_wasm.sh
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
#!/usr/bin/env bash
set -eu

BENCHMARK=${1:-goblin_bench}
COMMAND=${2:-./bin/$BENCHMARK}
BENCHMARK=${1:-client_ivc_bench}
COMMAND=${2:-./bin/$BENCHMARK --benchmark_filter=ClientIVCBench/Full/6}
HARDWARE_CONCURRENCY=${HARDWARE_CONCURRENCY:-16}

# Move above script dir.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,30 +13,31 @@ USER=${1:-$USER}
BOX=$USER-box
BENCHMARK=${2:-client_ivc_bench}
COMMAND=${3:-./bin/$BENCHMARK --benchmark_filter=ClientIVCBench/Full/6"\$"}

HARDWARE_CONCURRENCY=${HARDWARE_CONCURRENCY:-16}
# Can also set PRESET=tracy-gates env variable
PRESET=${PRESET:-tracy-memory}

ssh $BOX "
set -eux ;
cd ~/aztec-packages/barretenberg/cpp/ ;
cmake --preset $PRESET && cmake --build --preset $PRESET --target $BENCHMARK ;
! [ -d ~/tracy ] && git clone https://github.com/wolfpld/tracy ~/tracy ;
cd ~/tracy/capture ;
git checkout 075395620a504c0cdcaf9bab3d196db16a043de7 ;
sudo apt-get install -y libdbus-1-dev libdbus-glib-1-dev ;
mkdir -p build && cd build && cmake .. && make -j ;
sudo apt-get install -y libdbus-1-dev libdbus-glib-1-dev libtbb-dev libfreetype-dev ;
mkdir -p build && cd build && cmake -DCMAKE_MESSAGE_LOG_LEVEL=Warning .. && make -j ;
cd ~/aztec-packages/barretenberg/cpp/ ;
cmake -DCMAKE_MESSAGE_LOG_LEVEL=Warning --preset $PRESET && cmake --build --preset $PRESET --target $BENCHMARK ;
./tracy-capture -a 127.0.0.1 -f -o trace-$BENCHMARK & ;
sleep 0.1 ;
cd ~/aztec-packages/barretenberg/cpp/build-$PRESET ;
ninja $BENCHMARK ;
$COMMAND ;
HARDWARE_CONCURRENCY=$HARDWARE_CONCURRENCY $COMMAND ;
" &

wait # TODO(AD) hack - not sure why needed
! [ -d ~/tracy ] && git clone https://github.com/wolfpld/tracy ~/tracy
cd ~/tracy
git checkout 075395620a504c0cdcaf9bab3d196db16a043de7 # release 0.11.0
cmake -B profiler/build -S profiler -DCMAKE_BUILD_TYPE=Release
cmake -DCMAKE_MESSAGE_LOG_LEVEL=Warning -B profiler/build -S profiler -DCMAKE_BUILD_TYPE=Release
cmake --build profiler/build --parallel
scp $BOX:/mnt/user-data/$USER/tracy/capture/build/trace-$BENCHMARK .
~/tracy/profiler/build/tracy-profiler trace-$BENCHMARK
36 changes: 36 additions & 0 deletions barretenberg/cpp/scripts/profile_tracy_local.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#!/usr/bin/env bash
# Collect a profile completely locally, i.e., without using any remote machine for building or capturing.
#
# Usage: profile_tracy_local.sh [USER] [BENCHMARK] [COMMAND]
#   USER       optional; overrides $USER for this script (kept for argument compatibility).
#   BENCHMARK  benchmark target to build and run (default: client_ivc_bench).
#   COMMAND    command line run from the preset's build directory
#              (default: ./bin/$BENCHMARK --benchmark_filter=ClientIVCBench/Full/6).
# Environment:
#   HARDWARE_CONCURRENCY  passed through to the benchmark (default: 16).
#   PRESET                cmake preset to configure and build (default: tracy-time-sampled).

set -eux
USER=${1:-$USER}
BENCHMARK=${2:-client_ivc_bench}
COMMAND=${3:-./bin/$BENCHMARK --benchmark_filter=ClientIVCBench/Full/6}
HARDWARE_CONCURRENCY=${HARDWARE_CONCURRENCY:-16}
PRESET=${PRESET:-tracy-time-sampled}

# Fetch Tracy once and pin it to a known commit so the profiler, the capture tool and the
# client library all come from the same checkout.
[ -d ~/tracy ] || git clone https://github.com/wolfpld/tracy ~/tracy
cd ~/tracy
git checkout 075395620a504c0cdcaf9bab3d196db16a043de7 # release 0.11.0

# Build the interactive profiler UI.
cmake -B profiler/build -S profiler -DCMAKE_BUILD_TYPE=Release
cmake --build profiler/build --parallel

# Configure and build the benchmark with the selected preset.
cd ~/aztec-packages/barretenberg/cpp/
cmake --preset "$PRESET" -DCMAKE_MESSAGE_LOG_LEVEL=Warning && cmake --build --preset "$PRESET" --target "$BENCHMARK"

# Build the headless capture tool (same checkout as above, so no second clone/checkout is needed).
cd ~/tracy/capture
mkdir -p build && cd build && cmake .. -DCMAKE_MESSAGE_LOG_LEVEL=Warning && make -j

# Start capturing in the background and remember its PID so we can wait for the trace to be written.
./tracy-capture -a 127.0.0.1 -f -o "../trace-$BENCHMARK" &
CAPTURE_PID=$!
sleep 0.1
cd ~/aztec-packages/barretenberg/cpp/build-"$PRESET"/

# Run the COMMAND with sudo if PRESET is 'tracy-time-sampled'.
# $COMMAND is intentionally unquoted: it holds a program plus its arguments.
if [ "$PRESET" = "tracy-time-sampled" ]; then
    sudo HARDWARE_CONCURRENCY="$HARDWARE_CONCURRENCY" $COMMAND
else
    HARDWARE_CONCURRENCY="$HARDWARE_CONCURRENCY" $COMMAND
fi

# The trace file is only complete once tracy-capture has exited; opening it earlier races
# with the capture process still flushing data to disk.
wait "$CAPTURE_PID"

~/tracy/profiler/build/tracy-profiler ~/tracy/capture/trace-"$BENCHMARK"
19 changes: 19 additions & 0 deletions barretenberg/cpp/scripts/profile_wasm_samply.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/usr/bin/env bash
# This is to be run locally not in a container, so the user must handle samply installation.
#
# Usage: profile_wasm_samply.sh [BENCHMARK] [COMMAND]
#   BENCHMARK  wasm benchmark target to build (default: client_ivc_bench).
#   COMMAND    command handed to `wasmtime run`
#              (default: ./bin/$BENCHMARK --benchmark_filter=ClientIVCBench/Full/6).
# Environment:
#   HARDWARE_CONCURRENCY  forwarded into the wasm guest via --env (default: 16).
set -eu

BENCHMARK=${1:-client_ivc_bench}
COMMAND=${2:-./bin/$BENCHMARK --benchmark_filter=ClientIVCBench/Full/6}
HARDWARE_CONCURRENCY=${HARDWARE_CONCURRENCY:-16}

# Move above script dir (quoted so a checkout path containing spaces still works).
cd "$(dirname "$0")/.."

# Configure and build.
cmake --preset wasm-threads -DCMAKE_MESSAGE_LOG_LEVEL=Warning
cmake --build --preset wasm-threads --target "$BENCHMARK"

cd build-wasm-threads
# Consistency with _wasm.sh targets / shorter $COMMAND.
cp "./bin/$BENCHMARK" .
# $COMMAND is intentionally unquoted: it holds the wasm module path plus its arguments.
samply record wasmtime run --profile=perfmap --env HARDWARE_CONCURRENCY="$HARDWARE_CONCURRENCY" -Wthreads=y -Sthreads=y --dir=.. $COMMAND
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,8 @@ Prover get_prover(void (*test_circuit_function)(typename Prover::Flavor::Circuit
Composer composer;
return composer.create_prover(builder);
} else {
#ifdef TRACY_MEMORY
ZoneScopedN("creating prover");
#endif
PROFILE_THIS_NAME("creating prover");

return Prover(builder);
}
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ void perform_ivc_accumulation_rounds(size_t NUM_CIRCUITS,
for (size_t circuit_idx = 0; circuit_idx < NUM_CIRCUITS; ++circuit_idx) {
MegaCircuitBuilder circuit;
{
BB_OP_COUNT_TIME_NAME("construct_circuits");
PROFILE_THIS_NAME("construct_circuits");
circuit = circuit_producer.create_next_circuit(ivc);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ template <class Curve> class CommitmentKey {
*/
Commitment commit(PolynomialSpan<const Fr> polynomial)
{
BB_OP_COUNT_TIME();
PROFILE_THIS();
// We must have a power-of-2 SRS points *after* subtracting by start_index.
const size_t consumed_srs = numeric::round_up_power_2(polynomial.size()) + polynomial.start_index;
auto srs = srs::get_crs_factory<Curve>()->get_prover_crs(consumed_srs);
Expand Down Expand Up @@ -120,7 +120,7 @@ template <class Curve> class CommitmentKey {
*/
Commitment commit_sparse(PolynomialSpan<const Fr> polynomial)
{
BB_OP_COUNT_TIME();
PROFILE_THIS();
const size_t poly_size = polynomial.size();
ASSERT(polynomial.end_index() <= srs->get_monomial_size());

Expand Down
2 changes: 1 addition & 1 deletion barretenberg/cpp/src/barretenberg/common/mem.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#include "barretenberg/common/mem.hpp"

#ifdef TRACY_ENABLE
#ifdef TRACY_MEMORY
void* operator new(std::size_t count)
{
// NOLINTBEGIN(cppcoreguidelines-no-malloc)
Expand Down
23 changes: 14 additions & 9 deletions barretenberg/cpp/src/barretenberg/common/op_count.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,18 @@

#include <memory>
#include <tracy/Tracy.hpp>

// Scope-profiling entry points. PROFILE_THIS_NAME(name) is bound to exactly one backend:
//   - BB_USE_OP_COUNT_TIME_ONLY defined: forwards to BB_OP_COUNT_TIME_NAME(name);
//   - otherwise, TRACY_INSTRUMENTED defined: forwards to Tracy's ZoneScopedN(name);
//   - otherwise: expands to a no-op statement.
// PROFILE_THIS() is the same thing labelled with the enclosing function's name (__func__).
#if defined(BB_USE_OP_COUNT_TIME_ONLY)
#define PROFILE_THIS_NAME(name) BB_OP_COUNT_TIME_NAME(name)
#elif defined(TRACY_INSTRUMENTED)
#define PROFILE_THIS_NAME(name) ZoneScopedN(name)
#else
// (void)0 keeps the trailing semicolon at the call site a valid statement.
#define PROFILE_THIS_NAME(name) (void)0
#endif
#define PROFILE_THIS() PROFILE_THIS_NAME(__func__)

#ifndef BB_USE_OP_COUNT
// require a semicolon to appease formatters
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
Expand All @@ -12,18 +24,11 @@
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define BB_OP_COUNT_CYCLES_NAME(name) (void)0
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define BB_OP_COUNT_CYCLES() (void)0
#ifndef TRACY_TIME
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define BB_OP_COUNT_TIME_NAME(name) (void)0
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define BB_OP_COUNT_TIME() (void)0
#else
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define BB_OP_COUNT_TIME_NAME(name) ZoneScopedN(name)
#define BB_OP_COUNT_CYCLES() (void)0
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define BB_OP_COUNT_TIME() BB_OP_COUNT_TIME_NAME(__func__)
#endif
#define BB_OP_COUNT_TIME() (void)0
#else
/**
* Provides an abstraction that counts operations based on function names.
Expand Down
9 changes: 6 additions & 3 deletions barretenberg/cpp/src/barretenberg/common/slab_allocator.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
#include "slab_allocator.hpp"
#include <barretenberg/common/assert.hpp>
#include <barretenberg/common/log.hpp>
#include <barretenberg/common/mem.hpp>
#include "barretenberg/common/assert.hpp"
#include "barretenberg/common/log.hpp"
#include "barretenberg/common/mem.hpp"
#include "barretenberg/common/op_count.hpp"
#include <cstddef>
#include <numeric>
#include <unordered_map>
Expand Down Expand Up @@ -211,6 +212,8 @@ void init_slab_allocator(size_t circuit_subgroup_size)

std::shared_ptr<void> get_mem_slab(size_t size)
{
PROFILE_THIS();

return allocator.get(size);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -792,7 +792,7 @@ template <class Fq, class Fr, class T>
std::vector<affine_element<Fq, Fr, T>> element<Fq, Fr, T>::batch_mul_with_endomorphism(
const std::span<const affine_element<Fq, Fr, T>>& points, const Fr& scalar) noexcept
{
BB_OP_COUNT_TIME();
PROFILE_THIS();
typedef affine_element<Fq, Fr, T> affine_element;
const size_t num_points = points.size();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ pippenger_runtime_state<Curve>::pippenger_runtime_state(const size_t num_initial
, bucket_empty_status(reinterpret_cast<bool*>(aligned_alloc(64, num_threads * num_buckets * sizeof(bool))))
, round_counts(reinterpret_cast<uint64_t*>(aligned_alloc(32, MAX_NUM_ROUNDS * sizeof(uint64_t))))
{
PROFILE_THIS();

using Fq = typename Curve::BaseField;
using AffineElement = typename Curve::AffineElement;

Expand All @@ -51,6 +53,7 @@ pippenger_runtime_state<Curve>::pippenger_runtime_state(const size_t num_initial

const size_t points_per_thread = static_cast<size_t>(num_points) / num_threads;
parallel_for(num_threads, [&](size_t i) {
PROFILE_THIS_NAME("memset in Pippenger runtime state creation");
const size_t thread_offset = i * points_per_thread;
memset(reinterpret_cast<void*>(point_pairs_1 + thread_offset + (i * 16)),
0,
Expand Down Expand Up @@ -96,6 +99,8 @@ pippenger_runtime_state<Curve>::pippenger_runtime_state(pippenger_runtime_state&
, round_counts(other.round_counts)

{
PROFILE_THIS();

other.point_schedule = nullptr;
other.skew_table = nullptr;
other.point_pairs_1 = nullptr;
Expand All @@ -111,6 +116,8 @@ template <typename Curve>
pippenger_runtime_state<Curve>& pippenger_runtime_state<Curve>::operator=(
pippenger_runtime_state<Curve>&& other) noexcept
{
PROFILE_THIS();

if (skew_table != nullptr) {
aligned_free(skew_table);
}
Expand Down Expand Up @@ -164,6 +171,8 @@ template <typename Curve>
affine_product_runtime_state<Curve> pippenger_runtime_state<Curve>::get_affine_product_runtime_state(
const size_t num_threads, const size_t thread_index)
{
PROFILE_THIS();

const auto points_per_thread = static_cast<size_t>(num_points / num_threads);
const auto num_buckets =
static_cast<size_t>(1U << scalar_multiplication::get_optimal_bucket_width(static_cast<size_t>(num_points) / 2));
Expand All @@ -181,6 +190,8 @@ affine_product_runtime_state<Curve> pippenger_runtime_state<Curve>::get_affine_p

template <typename Curve> pippenger_runtime_state<Curve>::~pippenger_runtime_state() noexcept
{
PROFILE_THIS();

if (skew_table != nullptr) {
aligned_free(skew_table);
}
Expand Down
Loading
Loading